Merge branch 'ticket32845_squashed'

This commit is contained in:
Nick Mathewson 2020-01-07 10:16:15 -05:00
commit e231cd5b61
3 changed files with 57 additions and 3 deletions

4
changes/ticket32845 Normal file
View File

@ -0,0 +1,4 @@
o Testing:
- Add more test cases for tor's UTF-8 validation function. Also, make the
function check its arguments for consistency: a NULL string is only
accepted when its length is zero.
Closes ticket 32845.

View File

@ -506,6 +506,23 @@ validate_char(const uint8_t *c, uint8_t len)
int
string_is_utf8(const char *str, size_t len)
{
// If str is NULL, don't try to read it
if (!str) {
// We could test for this case, but the low-level logs would produce
// confusing test output.
// LCOV_EXCL_START
if (len) {
// Use the low-level logging function, so that the log module can
// validate UTF-8 (if needed in future code)
tor_log_err_sigsafe(
"BUG: string_is_utf8() called with NULL str but non-zero len.");
// Since it's a bug, we should probably reject this string
return false;
}
// LCOV_EXCL_STOP
return true;
}
for (size_t i = 0; i < len;) {
uint8_t num_bytes = bytes_in_char(str[i]);
if (num_bytes == 0) // Invalid leading byte found.
@ -530,8 +547,8 @@ string_is_utf8(const char *str, size_t len)
int
string_is_utf8_no_bom(const char *str, size_t len)
{
if (len >= 3 && (!strcmpstart(str, "\uFEFF") ||
!strcmpstart(str, "\uFFFE"))) {
if (str && len >= 3 && (!strcmpstart(str, "\uFEFF") ||
!strcmpstart(str, "\uFFFE"))) {
return false;
}
return string_is_utf8(str, len);

View File

@ -4104,10 +4104,43 @@ test_util_string_is_utf8(void *ptr)
tt_int_op(0, OP_EQ, string_is_utf8("\xed\xbf\xbf", 3));
tt_int_op(1, OP_EQ, string_is_utf8("\xee\x80\x80", 3));
// The maximum legal codepoint, 10FFFF.
// The minimum legal codepoint, 0x00.
tt_int_op(1, OP_EQ, string_is_utf8("\0", 1));
// The maximum legal codepoint, 0x10FFFF.
tt_int_op(1, OP_EQ, string_is_utf8("\xf4\x8f\xbf\xbf", 4));
tt_int_op(0, OP_EQ, string_is_utf8("\xf4\x90\x80\x80", 4));
/* Test cases that vary between programming languages /
* UTF-8 implementations.
* Source: POC||GTFO 19, page 43
* https://www.alchemistowl.org/pocorgtfo/
*/
// Invalid (in most implementations)
// surrogate
tt_int_op(0, OP_EQ, string_is_utf8("\xed\xa0\x81", 3));
// nullsurrog
tt_int_op(0, OP_EQ, string_is_utf8("\x30\x00\xed\xa0\x81", 5));
// threehigh
tt_int_op(0, OP_EQ, string_is_utf8("\xed\xbf\xbf", 3));
// fourhigh
tt_int_op(0, OP_EQ, string_is_utf8("\xf4\x90\xbf\xbf", 4));
// fivebyte
tt_int_op(0, OP_EQ, string_is_utf8("\xfb\x80\x80\x80\x80", 5));
// sixbyte
tt_int_op(0, OP_EQ, string_is_utf8("\xfd\x80\x80\x80\x80", 5));
// sixhigh
tt_int_op(0, OP_EQ, string_is_utf8("\xfd\xbf\xbf\xbf\xbf", 5));
// Valid (in most implementations)
// fourbyte
tt_int_op(1, OP_EQ, string_is_utf8("\xf0\x90\x8d\x88", 4));
// fourbyte2
tt_int_op(1, OP_EQ, string_is_utf8("\xf0\xbf\xbf\xbf", 4));
// nullbyte
tt_int_op(1, OP_EQ, string_is_utf8("\x30\x31\x32\x00\x33", 5));
done:
;
}