Exposed utf8_isvalid as a public function to reduce duplication. Allow a UTF-8 string of length 0 to be valid.

This commit is contained in:
skarg
2012-05-11 15:03:35 +00:00
parent 35e953ba8b
commit 54c4ce342c
3 changed files with 21 additions and 117 deletions
+3
View File
@@ -145,6 +145,9 @@ extern "C" {
BACNET_CHARACTER_STRING * char_string); BACNET_CHARACTER_STRING * char_string);
bool characterstring_valid( bool characterstring_valid(
BACNET_CHARACTER_STRING * char_string); BACNET_CHARACTER_STRING * char_string);
bool utf8_isvalid(
const char *str,
size_t length);
/* returns false if the string exceeds capacity /* returns false if the string exceeds capacity
initialize by using length=0 */ initialize by using length=0 */
+2 -105
View File
@@ -35,107 +35,6 @@
#include "device.h" #include "device.h"
#include "bname.h" #include "bname.h"
/* Basic UTF-8 manipulation routines
by Jeff Bezanson
placed in the public domain Fall 2005 */
/* Number of continuation bytes expected after a lead byte, indexed by the
   value of the lead byte:
   0x00-0xBF -> 0, 0xC0-0xDF -> 1, 0xE0-0xEF -> 2,
   0xF0-0xF7 -> 3, 0xF8-0xFB -> 4, 0xFC-0xFF -> 5 */
static const char trailingBytesForUTF8[256] = {
    /* 0x00-0x0F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x10-0x1F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20-0x2F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x80-0x8F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90-0x9F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xA0-0xAF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xB0-0xBF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xCF */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xD0-0xDF */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xE0-0xEF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xF0-0xFF */ 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
};

/* Checks whether a byte buffer holds a well-formed UTF-8 sequence.
   Based on the valid_utf8 routine from the PCRE library by Philip Hazel.

   str    - bytes to check; need not be null-terminated
   length - number of bytes in str; it is a byte count, since without
            knowing whether the string is valid it's hard to know how
            many characters there are

   Returns 1 when every byte up to length belongs to a complete,
   non-overlong sequence and no byte is zero; returns 0 otherwise.
   An empty buffer (length 0) is valid. A negative length, or a null
   str with a non-zero length, is invalid.

   Note: lead bytes 0xF8-0xFD (5- and 6-byte forms) are still accepted,
   as dictated by trailingBytesForUTF8. */
static int utf8_isvalid(
    const char *str,
    int length)
{
    const unsigned char *p;
    const unsigned char *pend;
    unsigned char c;
    int ab;

    /* empty string is valid */
    if (length == 0) {
        return 1;
    }
    /* nothing sensible can be read from these */
    if (!str || (length < 0)) {
        return 0;
    }
    pend = (const unsigned char *) str + length;
    for (p = (const unsigned char *) str; p < pend; p++) {
        c = *p;
        /* null in middle of string */
        if (c == 0) {
            return 0;
        }
        /* ASCII character */
        if (c < 128) {
            continue;
        }
        /* a lead byte must start with the bits 11 */
        if ((c & 0xc0) != 0xc0) {
            return 0;
        }
        ab = trailingBytesForUTF8[c];
        /* All continuation bytes must lie inside the buffer. The check
           is made against the current position, so a sequence truncated
           by length is rejected before any byte past pend is read. */
        if ((pend - p - 1) < ab) {
            return 0;
        }
        p++;
        /* Check top bits in the second byte */
        if ((*p & 0xc0) != 0x80) {
            return 0;
        }
        /* Check for overlong sequences for each different length */
        switch (ab) {
            /* Check for xx00 000x */
            case 1:
                if ((c & 0x3e) == 0) {
                    return 0;
                }
                /* We know there aren't any more bytes to check */
                continue;
            /* Check for 1110 0000, xx0x xxxx */
            case 2:
                if ((c == 0xe0) && ((*p & 0x20) == 0)) {
                    return 0;
                }
                break;
            /* Check for 1111 0000, xx00 xxxx */
            case 3:
                if ((c == 0xf0) && ((*p & 0x30) == 0)) {
                    return 0;
                }
                break;
            /* Check for 1111 1000, xx00 0xxx */
            case 4:
                if ((c == 0xf8) && ((*p & 0x38) == 0)) {
                    return 0;
                }
                break;
            /* Check for leading 0xfe or 0xff,
               and then for 1111 1100, xx00 00xx */
            case 5:
                if ((c == 0xfe) || (c == 0xff) ||
                    ((c == 0xfc) && ((*p & 0x3c) == 0))) {
                    return 0;
                }
                break;
            default:
                break;
        }
        /* Check for valid bytes after the 2nd, if any; all must start 10 */
        while (--ab > 0) {
            if ((*(++p) & 0xc0) != 0x80) {
                return 0;
            }
        }
    }

    return 1;
}
static bool bacnet_name_isvalid( static bool bacnet_name_isvalid(
uint8_t encoding, uint8_t encoding,
uint8_t length, uint8_t length,
@@ -145,10 +44,8 @@ static bool bacnet_name_isvalid(
if ((encoding < MAX_CHARACTER_STRING_ENCODING) && if ((encoding < MAX_CHARACTER_STRING_ENCODING) &&
(length <= NV_EEPROM_NAME_SIZE)) { (length <= NV_EEPROM_NAME_SIZE)) {
if (encoding == CHARACTER_ANSI_X34) { if (encoding == CHARACTER_UTF8) {
if (utf8_isvalid(str, length)) { valid = utf8_isvalid(str, length);
valid = true;
}
} else { } else {
valid = true; valid = true;
} }
+16 -12
View File
@@ -521,7 +521,7 @@ static const char trailingBytesForUTF8[256] = {
/* based on the valid_utf8 routine from the PCRE library by Philip Hazel /* based on the valid_utf8 routine from the PCRE library by Philip Hazel
length is in bytes, since without knowing whether the string is valid length is in bytes, since without knowing whether the string is valid
it's hard to know how many characters there are! */ it's hard to know how many characters there are! */
static int utf8_isvalid( bool utf8_isvalid(
const char *str, const char *str,
size_t length) size_t length)
{ {
@@ -529,72 +529,76 @@ static int utf8_isvalid(
unsigned char c; unsigned char c;
size_t ab; size_t ab;
/* empty string is valid */
if (length == 0) {
return true;
}
for (p = (unsigned char *) str; p < pend; p++) { for (p = (unsigned char *) str; p < pend; p++) {
c = *p; c = *p;
/* null in middle of string */ /* null in middle of string */
if (c == 0) { if (c == 0) {
return 0; return false;
} }
/* ASCII character */ /* ASCII character */
if (c < 128) { if (c < 128) {
continue; continue;
} }
if ((c & 0xc0) != 0xc0) { if ((c & 0xc0) != 0xc0) {
return 0; return false;
} }
ab = (size_t)trailingBytesForUTF8[c]; ab = (size_t)trailingBytesForUTF8[c];
if (length < ab) { if (length < ab) {
return 0; return false;
} }
length -= ab; length -= ab;
p++; p++;
/* Check top bits in the second byte */ /* Check top bits in the second byte */
if ((*p & 0xc0) != 0x80) { if ((*p & 0xc0) != 0x80) {
return 0; return false;
} }
/* Check for overlong sequences for each different length */ /* Check for overlong sequences for each different length */
switch (ab) { switch (ab) {
/* Check for xx00 000x */ /* Check for xx00 000x */
case 1: case 1:
if ((c & 0x3e) == 0) if ((c & 0x3e) == 0)
return 0; return false;
continue; /* We know there aren't any more bytes to check */ continue; /* We know there aren't any more bytes to check */
/* Check for 1110 0000, xx0x xxxx */ /* Check for 1110 0000, xx0x xxxx */
case 2: case 2:
if (c == 0xe0 && (*p & 0x20) == 0) if (c == 0xe0 && (*p & 0x20) == 0)
return 0; return false;
break; break;
/* Check for 1111 0000, xx00 xxxx */ /* Check for 1111 0000, xx00 xxxx */
case 3: case 3:
if (c == 0xf0 && (*p & 0x30) == 0) if (c == 0xf0 && (*p & 0x30) == 0)
return 0; return false;
break; break;
/* Check for 1111 1000, xx00 0xxx */ /* Check for 1111 1000, xx00 0xxx */
case 4: case 4:
if (c == 0xf8 && (*p & 0x38) == 0) if (c == 0xf8 && (*p & 0x38) == 0)
return 0; return false;
break; break;
/* Check for leading 0xfe or 0xff, /* Check for leading 0xfe or 0xff,
and then for 1111 1100, xx00 00xx */ and then for 1111 1100, xx00 00xx */
case 5: case 5:
if (c == 0xfe || c == 0xff || (c == 0xfc && (*p & 0x3c) == 0)) if (c == 0xfe || c == 0xff || (c == 0xfc && (*p & 0x3c) == 0))
return 0; return false;
break; break;
} }
/* Check for valid bytes after the 2nd, if any; all must start 10 */ /* Check for valid bytes after the 2nd, if any; all must start 10 */
while (--ab > 0) { while (--ab > 0) {
if ((*(++p) & 0xc0) != 0x80) if ((*(++p) & 0xc0) != 0x80)
return 0; return false;
} }
} }
return 1; return true;
} }
bool characterstring_valid( bool characterstring_valid(