Skip to content

Commit

Permalink
gutf8: Drop ifunc code and always call strlen() when validating UTF-8
Browse files Browse the repository at this point in the history
This fixes a heap buffer overflow read in `g_utf8_validate()` and
`g_str_is_ascii()`, at the cost of always calling `strlen()` on the
input string if its length isn’t known already.

The overflow read was not a security vulnerability, but getting valgrind
and asan to understand that, across all platforms and build
configurations, doesn’t seem to be possible with the resources available
to us. In particular, the `ifunc` approach doesn’t work on muslc, and
doesn’t work when statically linked.

The UTF-8 validation code should still be faster than the old approach
(GLib 2.82 and older), as `strlen()` is SIMD-accelerated in glibc, and
UTF-8 validation is SIMD accelerated in GLib. The combination of the two
should still be faster than the bytewise read loop we used to have.

Unfortunately, correctness and testability have to be prioritised over
absolute performance.

Signed-off-by: Philip Withnall <pwithnall@gnome.org>

Fixes: #3493
Fixes: #3511
Fixes: #3526
  • Loading branch information
pwithnall committed Nov 19, 2024
1 parent dfe2524 commit 96205fc
Showing 1 changed file with 8 additions and 131 deletions.
139 changes: 8 additions & 131 deletions glib/gutf8.c
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,6 @@
#include "glibintl.h"
#include "gvalgrind.h"

#if g_macro__has_attribute(ifunc) && !defined(G_OS_WIN32)
#define HAVE_WORKING_IFUNC_ATTRIBUTE 1
#endif

#define UTF8_COMPUTE(Char, Mask, Len) \
if (Char < 128) \
{ \
Expand Down Expand Up @@ -1607,12 +1603,6 @@ load_u8 (gconstpointer memory,
# define _attribute_aligned(n)
#endif

/* See the HAVE_WORKING_IFUNC_ATTRIBUTE definition for an explanation of the
* no_sanitize_address annotation.
*/
#if g_macro__has_attribute(no_sanitize_address)
__attribute__((no_sanitize_address))
#endif
static inline gsize
load_word (gconstpointer memory,
gsize offset)
Expand Down Expand Up @@ -1648,7 +1638,7 @@ utf8_verify_ascii (const char **strp,
gsize *lenp)
{
const char *str = *strp;
gsize len = lenp ? *lenp : (gsize)-1;
gsize len = lenp ? *lenp : strlen (str);

while (len > 0 && load_u8 (str, 0) < 128)
{
Expand Down Expand Up @@ -1697,7 +1687,7 @@ utf8_verify (const char **strp,
gsize *lenp)
{
const char *str = *strp;
gsize len = lenp ? *lenp : (gsize)-1;
gsize len = lenp ? *lenp : strlen (str);

/* See Unicode 10.0.0, Chapter 3, Section D92 */

Expand Down Expand Up @@ -1835,77 +1825,6 @@ utf8_verify (const char **strp,
*lenp = len;
}

static gboolean
g_utf8_validate_native (const char *str,
gssize max_len,
const char **end)
{
if (max_len >= 0)
return g_utf8_validate_len (str, max_len, end);

utf8_verify (&str, NULL);

if (end != NULL)
*end = str;

return *str == 0;
}

#ifdef HAVE_WORKING_IFUNC_ATTRIBUTE
/* The fast implementation of UTF-8 validation in `utf8_verify()` technically
* uses undefined behaviour when the string length is not provided (i.e. when
* it’s looking for a trailing nul terminator): when doing word-sized reads of
* the string, it can read up to the word size (minus one byte) beyond the end
* of the string in order to find the nul terminator.
*
* While this is guaranteed to not cause a page fault (at worst, the nul
* terminator could be in the final word of the page, and the code won’t read
* any further than that), it is still technically undefined behaviour in C,
* because we’re reading off the end of an array.
*
* We don’t *think* this can cause any bugs due to compiler optimisations,
* because glibc does exactly the same thing in its string handling code, and
* that code has been extensively tested. For example:
* https://github.com/bminor/glibc/blob/2c1903cbbac0022153a67776f474c221250ad6ed/string/strchrnul.c
*
* However, both valgrind and asan warn about the read beyond the end of the
* array (a ‘heap buffer overflow read’). They’re right to do this (they can’t
* know the read is bounded to the word size minus one, and guaranteed to not
* cross a page boundary), but it’s annoying for any application which calls
* `g_utf8_validate()`.
*
* Use an [indirect function (`ifunc`)](https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-ifunc-function-attribute)
* to use a fallback implementation of `g_utf8_validate()` when running under
* valgrind. This is resolved at load time using `resolve_g_utf8_validate()`.
*
* Similarly, mark the real implementation so that it’s not instrumented by asan
* using `no_sanitize_address`.
*/
static gboolean
g_utf8_validate_valgrind (const char *str,
gssize max_len,
const char **end)
{
if (max_len < 0)
max_len = strlen (str);

return g_utf8_validate_len (str, max_len, end);
}

typedef gboolean (*GUtf8ValidateFunc) (const char *str,
gssize max_len,
const char **end);

static GUtf8ValidateFunc
resolve_g_utf8_validate (void)
{
if (RUNNING_ON_VALGRIND)
return g_utf8_validate_valgrind;
else
return g_utf8_validate_native;
}
#endif /* HAVE_WORKING_IFUNC_ATTRIBUTE */

/**
* g_utf8_validate:
* @str: (array length=max_len) (element-type guint8): a pointer to character data
Expand All @@ -1932,20 +1851,15 @@ resolve_g_utf8_validate (void)
*
* Returns: `TRUE` if the text was valid UTF-8
*/
#if g_macro__has_attribute(no_sanitize_address)
__attribute__((no_sanitize_address))
#endif
gboolean
g_utf8_validate (const char *str,
gssize max_len,
const gchar **end)
#ifdef HAVE_WORKING_IFUNC_ATTRIBUTE
__attribute__((ifunc ("resolve_g_utf8_validate")));
#else
{
return g_utf8_validate_native (str, max_len, end);
size_t max_len_unsigned = (max_len >= 0) ? (size_t) max_len : strlen (str);

return g_utf8_validate_len (str, max_len_unsigned, end);
}
#endif

/**
* g_utf8_validate_len:
Expand Down Expand Up @@ -1975,38 +1889,6 @@ g_utf8_validate_len (const char *str,
return max_len == 0;
}

static gboolean
g_str_is_ascii_native (const char *str)
{
utf8_verify_ascii (&str, NULL);

return *str == 0;
}

#ifdef HAVE_WORKING_IFUNC_ATTRIBUTE
/* See above comment about `ifunc` use for g_utf8_validate(). */
static gboolean
g_str_is_ascii_valgrind (const char *str)
{
size_t len = strlen (str);

utf8_verify_ascii (&str, &len);

return *str == 0;
}

typedef gboolean (*GStrIsAsciiFunc) (const char *str);

static GStrIsAsciiFunc
resolve_g_str_is_ascii (void)
{
if (RUNNING_ON_VALGRIND)
return g_str_is_ascii_valgrind;
else
return g_str_is_ascii_native;
}
#endif /* HAVE_WORKING_IFUNC_ATTRIBUTE */

/**
* g_str_is_ascii:
* @str: a string
Expand All @@ -2018,18 +1900,13 @@ resolve_g_str_is_ascii (void)
*
* Since: 2.40
*/
#if g_macro__has_attribute(no_sanitize_address)
__attribute__((no_sanitize_address))
#endif
gboolean
g_str_is_ascii (const gchar *str)
#ifdef HAVE_WORKING_IFUNC_ATTRIBUTE
__attribute__((ifunc ("resolve_g_str_is_ascii")));
#else
{
return g_str_is_ascii_native (str);
utf8_verify_ascii (&str, NULL);

return *str == 0;
}
#endif

/**
* g_unichar_validate:
Expand Down

0 comments on commit 96205fc

Please sign in to comment.