utf-8.c 5.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240
  1. /*******************************************************************************
  2. * Copyright (c) 2009, 2018 IBM Corp.
  3. *
  4. * All rights reserved. This program and the accompanying materials
  5. * are made available under the terms of the Eclipse Public License v2.0
  6. * and Eclipse Distribution License v1.0 which accompany this distribution.
  7. *
  8. * The Eclipse Public License is available at
  9. * https://www.eclipse.org/legal/epl-2.0/
  10. * and the Eclipse Distribution License is available at
  11. * http://www.eclipse.org/org/documents/edl-v10.php.
  12. *
  13. * Contributors:
  14. * Ian Craggs - initial API and implementation and/or initial documentation
  15. *******************************************************************************/
  16. /**
  17. * @file
  18. * \brief Functions for checking that strings contain UTF-8 characters only
  19. *
  20. * See page 104 of the Unicode Standard 5.0 for the list of well formed
  21. * UTF-8 byte sequences.
  22. *
  23. */
  24. #include "utf-8.h"
  25. #include <stdlib.h>
  26. #include <string.h>
  27. #include "StackTrace.h"
  /**
   * Macro to determine the number of elements in a single-dimension array.
   *
   * The argument must be a true array object: a pointer (including an array
   * function parameter, which has decayed to a pointer) gives a meaningless
   * result. The argument is fully parenthesized inside the expansion so that
   * any array-valued expression can be passed safely.
   */
  #if !defined(ARRAY_SIZE)
  #define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))
  #endif
  /**
   * Table of the well-formed UTF-8 byte sequences.
   *
   * Each row describes one family of encodings: how many bytes it uses and, for
   * each byte position, the inclusive range of values allowed there. Byte
   * positions beyond "len" are left zero-initialized and are never consulted.
   * Limits above 0x7F are held in plain char, so their stored value depends on
   * whether char is signed on the target.
   */
  struct
  {
      int len; /**< number of bytes in the sequence (1 to 4) */
      struct
      {
          char lower; /**< smallest value allowed at this position */
          char upper; /**< largest value allowed at this position */
      } bytes[4]; /**< one range per byte; at most 4 bytes per character */
  }
  valid_ranges[] =
  {
      /* U+0000..U+007F */
      { .len = 1, .bytes = { {0x00, 0x7F} } },
      /* U+0080..U+07FF */
      { .len = 2, .bytes = { {0xC2, 0xDF}, {0x80, 0xBF} } },
      /* U+0800..U+0FFF */
      { .len = 3, .bytes = { {0xE0, 0xE0}, {0xA0, 0xBF}, {0x80, 0xBF} } },
      /* U+1000..U+CFFF */
      { .len = 3, .bytes = { {0xE1, 0xEC}, {0x80, 0xBF}, {0x80, 0xBF} } },
      /* U+D000..U+D7FF (second byte capped to keep surrogates out) */
      { .len = 3, .bytes = { {0xED, 0xED}, {0x80, 0x9F}, {0x80, 0xBF} } },
      /* U+E000..U+FFFF */
      { .len = 3, .bytes = { {0xEE, 0xEF}, {0x80, 0xBF}, {0x80, 0xBF} } },
      /* U+10000..U+3FFFF */
      { .len = 4, .bytes = { {0xF0, 0xF0}, {0x90, 0xBF}, {0x80, 0xBF}, {0x80, 0xBF} } },
      /* U+40000..U+FFFFF */
      { .len = 4, .bytes = { {0xF1, 0xF3}, {0x80, 0xBF}, {0x80, 0xBF}, {0x80, 0xBF} } },
      /* U+100000..U+10FFFF */
      { .len = 4, .bytes = { {0xF4, 0xF4}, {0x80, 0x8F}, {0x80, 0xBF}, {0x80, 0xBF} } },
  };
  58. static const char* UTF8_char_validate(int len, const char* data);
  59. /**
  60. * Validate a single UTF-8 character
  61. * @param len the length of the string in "data"
  62. * @param data the bytes to check for a valid UTF-8 char
  63. * @return pointer to the start of the next UTF-8 character in "data"
  64. */
  65. static const char* UTF8_char_validate(int len, const char* data)
  66. {
  67. int good = 0;
  68. int charlen = 2;
  69. int i, j;
  70. const char *rc = NULL;
  71. if (data == NULL)
  72. goto exit; /* don't have data, can't continue */
  73. /* first work out how many bytes this char is encoded in */
  74. if ((data[0] & 128) == 0)
  75. charlen = 1;
  76. else if ((data[0] & 0xF0) == 0xF0)
  77. charlen = 4;
  78. else if ((data[0] & 0xE0) == 0xE0)
  79. charlen = 3;
  80. if (charlen > len)
  81. goto exit; /* not enough characters in the string we were given */
  82. for (i = 0; i < ARRAY_SIZE(valid_ranges); ++i)
  83. { /* just has to match one of these rows */
  84. if (valid_ranges[i].len == charlen)
  85. {
  86. good = 1;
  87. for (j = 0; j < charlen; ++j)
  88. {
  89. if (data[j] < valid_ranges[i].bytes[j].lower ||
  90. data[j] > valid_ranges[i].bytes[j].upper)
  91. {
  92. good = 0; /* failed the check */
  93. break;
  94. }
  95. }
  96. if (good)
  97. break;
  98. }
  99. }
  100. if (good)
  101. rc = data + charlen;
  102. exit:
  103. return rc;
  104. }
  105. /**
  106. * Validate a length-delimited string has only UTF-8 characters
  107. * @param len the length of the string in "data"
  108. * @param data the bytes to check for valid UTF-8 characters
  109. * @return 1 (true) if the string has only UTF-8 characters, 0 (false) otherwise
  110. */
  111. int UTF8_validate(int len, const char* data)
  112. {
  113. const char* curdata = NULL;
  114. int rc = 0;
  115. FUNC_ENTRY;
  116. if (len == 0 || data == NULL)
  117. {
  118. rc = 1;
  119. goto exit;
  120. }
  121. curdata = UTF8_char_validate(len, data);
  122. while (curdata && (curdata < data + len))
  123. curdata = UTF8_char_validate((int)(data + len - curdata), curdata);
  124. rc = curdata != NULL;
  125. exit:
  126. FUNC_EXIT_RC(rc);
  127. return rc;
  128. }
  129. /**
  130. * Validate a null-terminated string has only UTF-8 characters
  131. * @param string the string to check for valid UTF-8 characters
  132. * @return 1 (true) if the string has only UTF-8 characters, 0 (false) otherwise
  133. */
  134. int UTF8_validateString(const char* string)
  135. {
  136. int rc = 0;
  137. FUNC_ENTRY;
  138. if (string != NULL)
  139. {
  140. rc = UTF8_validate((int)strlen(string), string);
  141. }
  142. FUNC_EXIT_RC(rc);
  143. return rc;
  144. }
  145. #if defined(UNIT_TESTS)
  146. #include <stdio.h>
  /**
   * One unit-test case: a byte buffer and the number of bytes in it to validate
   */
  typedef struct
  {
      int len;       /**< number of bytes of "data" handed to the validator */
      char data[20]; /**< the bytes under test; unused tail is zero-filled */
  } tests;
  152. tests valid_strings[] =
  153. {
  154. {3, "hjk" },
  155. {7, {0x41, 0xE2, 0x89, 0xA2, 0xCE, 0x91, 0x2E} },
  156. {3, {'f', 0xC9, 0xB1 } },
  157. {9, {0xED, 0x95, 0x9C, 0xEA, 0xB5, 0xAD, 0xEC, 0x96, 0xB4} },
  158. {9, {0xE6, 0x97, 0xA5, 0xE6, 0x9C, 0xAC, 0xE8, 0xAA, 0x9E} },
  159. {4, {0x2F, 0x2E, 0x2E, 0x2F} },
  160. {7, {0xEF, 0xBB, 0xBF, 0xF0, 0xA3, 0x8E, 0xB4} },
  161. };
  162. tests invalid_strings[] =
  163. {
  164. {2, {0xC0, 0x80} },
  165. {5, {0x2F, 0xC0, 0xAE, 0x2E, 0x2F} },
  166. {6, {0xED, 0xA1, 0x8C, 0xED, 0xBE, 0xB4} },
  167. {1, {0xF4} },
  168. };
  169. int main (int argc, char *argv[])
  170. {
  171. int i, failed = 0;
  172. for (i = 0; i < ARRAY_SIZE(valid_strings); ++i)
  173. {
  174. if (!UTF8_validate(valid_strings[i].len, valid_strings[i].data))
  175. {
  176. printf("valid test %d failed\n", i);
  177. failed = 1;
  178. }
  179. else
  180. printf("valid test %d passed\n", i);
  181. }
  182. for (i = 0; i < ARRAY_SIZE(invalid_strings); ++i)
  183. {
  184. if (UTF8_validate(invalid_strings[i].len, invalid_strings[i].data))
  185. {
  186. printf("invalid test %d failed\n", i);
  187. failed = 1;
  188. }
  189. else
  190. printf("invalid test %d passed\n", i);
  191. }
  192. if (failed)
  193. printf("Failed\n");
  194. else
  195. printf("Passed\n");
  196. //Don't crash on null data
  197. UTF8_validateString(NULL);
  198. UTF8_validate(1, NULL);
  199. UTF8_char_validate(1, NULL);
  200. return 0;
  201. } /* End of main function*/
  202. #endif