Skip to content

Commit

Permalink
Merge branch 'wip/pwithnall/3493-unicode-with-strlen' into 'main'
Browse files Browse the repository at this point in the history
gutf8: Drop ifunc code and always call strlen() when validating UTF-8

Closes #3493, #3511, and #3526

See merge request GNOME/glib!4404
  • Loading branch information
mcatanzaro committed Nov 19, 2024
2 parents ccee4c2 + 96205fc commit 3e87611
Showing 1 changed file with 8 additions and 131 deletions.
139 changes: 8 additions & 131 deletions glib/gutf8.c
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,6 @@
#include "glibintl.h"
#include "gvalgrind.h"

#if g_macro__has_attribute(ifunc) && !defined(G_OS_WIN32)
#define HAVE_WORKING_IFUNC_ATTRIBUTE 1
#endif

#define UTF8_COMPUTE(Char, Mask, Len) \
if (Char < 128) \
{ \
Expand Down Expand Up @@ -1607,12 +1603,6 @@ load_u8 (gconstpointer memory,
# define _attribute_aligned(n)
#endif

/* See the HAVE_WORKING_IFUNC_ATTRIBUTE definition for an explanation of the
* no_sanitize_address annotation.
*/
#if g_macro__has_attribute(no_sanitize_address)
__attribute__((no_sanitize_address))
#endif
static inline gsize
load_word (gconstpointer memory,
gsize offset)
Expand Down Expand Up @@ -1648,7 +1638,7 @@ utf8_verify_ascii (const char **strp,
gsize *lenp)
{
const char *str = *strp;
gsize len = lenp ? *lenp : (gsize)-1;
gsize len = lenp ? *lenp : strlen (str);

while (len > 0 && load_u8 (str, 0) < 128)
{
Expand Down Expand Up @@ -1697,7 +1687,7 @@ utf8_verify (const char **strp,
gsize *lenp)
{
const char *str = *strp;
gsize len = lenp ? *lenp : (gsize)-1;
gsize len = lenp ? *lenp : strlen (str);

/* See Unicode 10.0.0, Chapter 3, Section D92 */

Expand Down Expand Up @@ -1835,77 +1825,6 @@ utf8_verify (const char **strp,
*lenp = len;
}

static gboolean
g_utf8_validate_native (const char *str,
gssize max_len,
const char **end)
{
if (max_len >= 0)
return g_utf8_validate_len (str, max_len, end);

utf8_verify (&str, NULL);

if (end != NULL)
*end = str;

return *str == 0;
}

#ifdef HAVE_WORKING_IFUNC_ATTRIBUTE
/* The fast implementation of UTF-8 validation in `utf8_verify()` technically
* uses undefined behaviour when the string length is not provided (i.e. when
* it’s looking for a trailing nul terminator): when doing word-sized reads of
* the string, it can read up to the word size (minus one byte) beyond the end
* of the string in order to find the nul terminator.
*
* While this is guaranteed to not cause a page fault (at worst, the nul
* terminator could be in the final word of the page, and the code won’t read
* any further than that), it is still technically undefined behaviour in C,
* because we’re reading off the end of an array.
*
* We don’t *think* this can cause any bugs due to compiler optimisations,
* because glibc does exactly the same thing in its string handling code, and
* that code has been extensively tested. For example:
* https://github.com/bminor/glibc/blob/2c1903cbbac0022153a67776f474c221250ad6ed/string/strchrnul.c
*
* However, both valgrind and asan warn about the read beyond the end of the
* array (a ‘heap buffer overflow read’). They’re right to do this (they can’t
* know the read is bounded to the word size minus one, and guaranteed to not
* cross a page boundary), but it’s annoying for any application which calls
* `g_utf8_validate()`.
*
* Use an [indirect function (`ifunc`)](https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-ifunc-function-attribute)
* to use a fallback implementation of `g_utf8_validate()` when running under
* valgrind. This is resolved at load time using `resolve_g_utf8_validate()`.
*
* Similarly, mark the real implementation so that it’s not instrumented by asan
* using `no_sanitize_address`.
*/
static gboolean
g_utf8_validate_valgrind (const char *str,
gssize max_len,
const char **end)
{
if (max_len < 0)
max_len = strlen (str);

return g_utf8_validate_len (str, max_len, end);
}

typedef gboolean (*GUtf8ValidateFunc) (const char *str,
gssize max_len,
const char **end);

static GUtf8ValidateFunc
resolve_g_utf8_validate (void)
{
if (RUNNING_ON_VALGRIND)
return g_utf8_validate_valgrind;
else
return g_utf8_validate_native;
}
#endif /* HAVE_WORKING_IFUNC_ATTRIBUTE */

/**
* g_utf8_validate:
* @str: (array length=max_len) (element-type guint8): a pointer to character data
Expand All @@ -1932,20 +1851,15 @@ resolve_g_utf8_validate (void)
*
* Returns: `TRUE` if the text was valid UTF-8
*/
#if g_macro__has_attribute(no_sanitize_address)
__attribute__((no_sanitize_address))
#endif
gboolean
g_utf8_validate (const char *str,
gssize max_len,
const gchar **end)
#ifdef HAVE_WORKING_IFUNC_ATTRIBUTE
__attribute__((ifunc ("resolve_g_utf8_validate")));
#else
{
return g_utf8_validate_native (str, max_len, end);
size_t max_len_unsigned = (max_len >= 0) ? (size_t) max_len : strlen (str);

return g_utf8_validate_len (str, max_len_unsigned, end);
}
#endif

/**
* g_utf8_validate_len:
Expand Down Expand Up @@ -1975,38 +1889,6 @@ g_utf8_validate_len (const char *str,
return max_len == 0;
}

static gboolean
g_str_is_ascii_native (const char *str)
{
utf8_verify_ascii (&str, NULL);

return *str == 0;
}

#ifdef HAVE_WORKING_IFUNC_ATTRIBUTE
/* See above comment about `ifunc` use for g_utf8_validate(). */
static gboolean
g_str_is_ascii_valgrind (const char *str)
{
size_t len = strlen (str);

utf8_verify_ascii (&str, &len);

return *str == 0;
}

typedef gboolean (*GStrIsAsciiFunc) (const char *str);

static GStrIsAsciiFunc
resolve_g_str_is_ascii (void)
{
if (RUNNING_ON_VALGRIND)
return g_str_is_ascii_valgrind;
else
return g_str_is_ascii_native;
}
#endif /* HAVE_WORKING_IFUNC_ATTRIBUTE */

/**
* g_str_is_ascii:
* @str: a string
Expand All @@ -2018,18 +1900,13 @@ resolve_g_str_is_ascii (void)
*
* Since: 2.40
*/
#if g_macro__has_attribute(no_sanitize_address)
__attribute__((no_sanitize_address))
#endif
gboolean
g_str_is_ascii (const gchar *str)
#ifdef HAVE_WORKING_IFUNC_ATTRIBUTE
__attribute__((ifunc ("resolve_g_str_is_ascii")));
#else
{
return g_str_is_ascii_native (str);
utf8_verify_ascii (&str, NULL);

return *str == 0;
}
#endif

/**
* g_unichar_validate:
Expand Down

0 comments on commit 3e87611

Please sign in to comment.