utf.c 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649
  1. #include "utf.h"
  2. #include "ByteReader.h"
  3. #include "ByteWriter.h"
  4. #include "foundation/error.h"
  5. #include <string.h>
  // Lead-byte classification tables for UTF-8 sequences of 1..6 bytes.
  // A byte b starts a sequence of (i + 1) bytes when
  // (b & mask_tab[i]) == val_tab[i].
  static const uint8_t mask_tab[6] =
  {
      [0] = 0x80, // 0xxxxxxx
      [1] = 0xE0, // 110xxxxx
      [2] = 0xF0, // 1110xxxx
      [3] = 0xF8, // 11110xxx
      [4] = 0xFC, // 111110xx
      [5] = 0xFE  // 1111110x
  };
  static const uint8_t val_tab[6] =
  {
      [0] = 0x00,
      [1] = 0xC0,
      [2] = 0xE0,
      [3] = 0xF0,
      [4] = 0xF8,
      [5] = 0xFC
  };
  // Number of UTF-16 code units needed to store codepoint:
  // 1 below U+10000, 2 (a surrogate pair) up to U+10FFFF,
  // and 0 for anything at or above 0x110000.
  static size_t ucs4_to_utf16_count(uint32_t codepoint)
  {
      if (codepoint < 0x10000)
          return 1;
      return (codepoint < 0x110000) ? 2 : 0;
  }
  17. static int utf16LE_to_ucs4_character(bytereader_t const byte_reader, uint32_t *codepoint)
  18. {
  19. uint16_t lead;
  20. lead = bytereader_read_u16_le(byte_reader);
  21. if (lead < 0xD800 || lead >= 0xE000)
  22. {
  23. *codepoint = lead;
  24. return NErr_Success;
  25. }
  26. if (lead < 0xDC00)
  27. {
  28. if (bytereader_size(byte_reader) >= 2)
  29. {
  30. uint16_t trail = bytereader_read_u16_le(byte_reader);
  31. if (trail >= 0xDC00 && trail < 0xE000)
  32. {
  33. *codepoint = 0x10000 + ((lead - 0xD800) << 10) + (trail - 0xDC00);
  34. return NErr_Success;
  35. }
  36. }
  37. }
  38. return NErr_Error; // invalid
  39. }
  40. static int utf16BE_to_ucs4_character(bytereader_t const byte_reader, uint32_t *codepoint)
  41. {
  42. uint16_t lead;
  43. lead = bytereader_read_u16_be(byte_reader);
  44. if (lead < 0xD800 || lead >= 0xE000)
  45. {
  46. *codepoint = lead;
  47. return NErr_Success;
  48. }
  49. if (lead < 0xDC00)
  50. {
  51. if (bytereader_size(byte_reader) >= 2)
  52. {
  53. uint16_t trail = bytereader_read_u16_be(byte_reader);
  54. if (trail >= 0xDC00 && trail < 0xE000)
  55. {
  56. *codepoint = 0x10000 + ((lead - 0xD800) << 10) + (trail - 0xDC00);
  57. return NErr_Success;
  58. }
  59. }
  60. }
  61. return NErr_Error; // invalid
  62. }
  63. static size_t utf8_to_ucs4_character(const char *utf8, size_t len, uint32_t *codepoint)
  64. {
  65. uint32_t res=0;
  66. size_t n;
  67. size_t cnt=0;
  68. while(1)
  69. {
  70. if ((*utf8&mask_tab[cnt])==val_tab[cnt]) break;
  71. if (++cnt==6) return 0;
  72. }
  73. cnt++;
  74. if (cnt==2 && !(*utf8&0x1E))
  75. return 0;
  76. if (cnt==1)
  77. res=*utf8;
  78. else
  79. res=(0xFF>>(cnt+1))&*utf8;
  80. if (cnt > len)
  81. return 0;
  82. for (n=1;n<cnt;n++)
  83. {
  84. if ((utf8[n]&0xC0) != 0x80)
  85. return 0;
  86. if (!res && n==2 && !((utf8[n]&0x7F) >> (7 - cnt)))
  87. return 0;
  88. res=(res<<6)|(utf8[n]&0x3F);
  89. }
  90. if (codepoint)
  91. *codepoint=res;
  92. return cnt;
  93. }
  // Number of UTF-8 bytes needed to store codepoint (1..6), or 0 when the
  // value is above 0x7FFFFFFF and cannot be encoded.
  static size_t ucs4_to_utf8_count(uint32_t codepoint)
  {
      // exclusive upper bound of the values that fit in (index + 1) bytes
      static const uint32_t limit[6] =
      {
          0x80, 0x800, 0x10000, 0x200000, 0x4000000, 0x80000000u
      };
      size_t bytes;

      for (bytes = 0; bytes < 6; bytes++)
      {
          if (codepoint < limit[bytes])
              return bytes + 1;
      }
      return 0;
  }
  // Encodes codepoint as UTF-8 into target.
  //
  // target    - destination, or 0 to only query the length
  // codepoint - value to encode
  // max       - space available at target, in bytes
  //
  // Returns the number of bytes the encoding occupies, or 0 when the value
  // is not encodable or needs more than max bytes (this limit applies even
  // when target is 0).
  static size_t ucs4_to_utf8_character(char *target, uint32_t codepoint, size_t max)
  {
      // marker bits of the lead byte, indexed by total sequence length
      static const uint8_t lead_bits[7] = {0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC};
      size_t length = ucs4_to_utf8_count(codepoint);
      size_t i;

      if (length == 0 || length > max)
          return 0;
      if (!target)
          return length;

      // continuation bytes are filled back to front, 6 payload bits each
      for (i = length - 1; i > 0; i--)
      {
          target[i] = 0x80 | (codepoint & 0x3F);
          codepoint >>= 6;
      }

      // whatever is left fits below the marker bits of the lead byte
      target[0] = lead_bits[length] | codepoint;
      return length;
  }
  147. static size_t ucs4_to_utf16LE_character(bytewriter_t byte_writer, uint32_t codepoint)
  148. {
  149. if (codepoint >= 0x110000)
  150. return 0;
  151. if (codepoint >= 0x10000)
  152. {
  153. if (bytewriter_size(byte_writer) < 4)
  154. return 0;
  155. bytewriter_write_u16_le(byte_writer, ((codepoint - 0x10000) >> 10) + 0xD800); // high surrogate
  156. bytewriter_write_u16_le(byte_writer, ((codepoint - 0x10000) & 0x3FF) + 0xDC00); // low surrogate
  157. return 2;
  158. }
  159. else
  160. {
  161. bytewriter_write_u16_le(byte_writer, codepoint);
  162. return 1;
  163. }
  164. }
  165. static size_t ucs4_to_utf16BE_character(bytewriter_t byte_writer, uint32_t codepoint)
  166. {
  167. if (codepoint >= 0x110000)
  168. return 0;
  169. if (codepoint >= 0x10000)
  170. {
  171. if (bytewriter_size(byte_writer) < 4)
  172. return 0;
  173. bytewriter_write_u16_be(byte_writer, ((codepoint - 0x10000) >> 10) + 0xD800); // high surrogate
  174. bytewriter_write_u16_be(byte_writer, ((codepoint - 0x10000) & 0x3FF) + 0xDC00); // low surrogate
  175. return 2;
  176. }
  177. else
  178. {
  179. bytewriter_write_u16_be(byte_writer, codepoint);
  180. return 1;
  181. }
  182. }
  183. size_t utf16LE_to_utf8(const uint16_t *src, size_t source_len, char *dst, size_t out_len)
  184. {
  185. uint32_t codepoint;
  186. size_t position=0;
  187. size_t characters_processed;
  188. bytereader_s byte_reader;
  189. bytereader_init(&byte_reader, src, source_len*2);
  190. if (!dst) // they just want the size
  191. {
  192. while (bytereader_size(&byte_reader))
  193. {
  194. if (utf16LE_to_ucs4_character(&byte_reader, &codepoint) != NErr_Success)
  195. break;
  196. characters_processed = ucs4_to_utf8_count(codepoint);
  197. if (!characters_processed)
  198. break;
  199. position+=characters_processed;
  200. }
  201. return position;
  202. }
  203. while(bytereader_size(&byte_reader) && position<out_len)
  204. {
  205. if (utf16LE_to_ucs4_character(&byte_reader, &codepoint) != NErr_Success)
  206. break;
  207. characters_processed=ucs4_to_utf8_character(&dst[position], codepoint, out_len-position);
  208. if (!characters_processed)
  209. break;
  210. position+=characters_processed;
  211. }
  212. if (position<out_len)
  213. dst[position]=0;
  214. return position;
  215. }
  216. size_t utf16BE_to_utf8(const uint16_t *src, size_t source_len, char *dst, size_t out_len)
  217. {
  218. uint32_t codepoint;
  219. size_t position=0;
  220. size_t characters_processed;
  221. bytereader_s byte_reader;
  222. bytereader_init(&byte_reader, src, source_len*2);
  223. if (!dst) // they just want the size
  224. {
  225. while (bytereader_size(&byte_reader))
  226. {
  227. if (utf16BE_to_ucs4_character(&byte_reader, &codepoint) != NErr_Success)
  228. break;
  229. characters_processed = ucs4_to_utf8_count(codepoint);
  230. if (!characters_processed)
  231. break;
  232. position+=characters_processed;
  233. }
  234. return position;
  235. }
  236. while(bytereader_size(&byte_reader) && position<out_len)
  237. {
  238. if (utf16BE_to_ucs4_character(&byte_reader, &codepoint) != NErr_Success)
  239. break;
  240. characters_processed=ucs4_to_utf8_character(&dst[position], codepoint, out_len-position);
  241. if (!characters_processed)
  242. break;
  243. position+=characters_processed;
  244. }
  245. if (position<out_len)
  246. dst[position]=0;
  247. return position;
  248. }
  249. size_t ucs4_to_utf8(const uint32_t *src, size_t source_len, char *dst, size_t out_len)
  250. {
  251. uint32_t codepoint;
  252. size_t position=0;
  253. size_t characters_processed;
  254. bytereader_s byte_reader;
  255. bytereader_init(&byte_reader, src, source_len*4);
  256. if (!dst) // they just want the size
  257. {
  258. while (bytereader_size(&byte_reader) > 3)
  259. {
  260. codepoint = bytereader_read_u32_le(&byte_reader);
  261. characters_processed = ucs4_to_utf8_count(codepoint);
  262. if (!characters_processed)
  263. break;
  264. position+=characters_processed;
  265. }
  266. return position;
  267. }
  268. while(bytereader_size(&byte_reader) > 3 && position<out_len)
  269. {
  270. codepoint = bytereader_read_u32_le(&byte_reader);
  271. characters_processed=ucs4_to_utf8_character(&dst[position], codepoint, out_len-position);
  272. if (!characters_processed)
  273. break;
  274. position+=characters_processed;
  275. }
  276. if (position<out_len)
  277. dst[position]=0;
  278. return position;
  279. }
  280. size_t utf8_to_utf16LE(const char *src, size_t source_len, uint16_t *dst, size_t out_len)
  281. {
  282. uint32_t codepoint;
  283. size_t characters_processed;
  284. bytewriter_s byte_writer;
  285. if (!dst) // they just want the size
  286. {
  287. size_t position=0;
  288. while (source_len)
  289. {
  290. characters_processed = utf8_to_ucs4_character(src, source_len, &codepoint);
  291. if (codepoint == 0xFFFD)
  292. break;
  293. source_len -= characters_processed;
  294. src += characters_processed;
  295. characters_processed = ucs4_to_utf16_count(codepoint);
  296. if (!characters_processed)
  297. break;
  298. position+=characters_processed;
  299. }
  300. return position;
  301. }
  302. bytewriter_init(&byte_writer, dst, out_len*2);
  303. while(source_len && bytewriter_size(&byte_writer))
  304. {
  305. characters_processed = utf8_to_ucs4_character(src, source_len, &codepoint);
  306. if (codepoint == 0xFFFD)
  307. break;
  308. source_len -= characters_processed;
  309. src += characters_processed;
  310. characters_processed=ucs4_to_utf16LE_character(&byte_writer, codepoint);
  311. if (!characters_processed)
  312. break;
  313. }
  314. if (bytewriter_size(&byte_writer))
  315. bytewriter_write_u16_le(&byte_writer, 0);
  316. return out_len - bytewriter_size(&byte_writer)/2;
  317. }
  318. size_t utf8_to_utf16BE(const char *src, size_t source_len, uint16_t *dst, size_t out_len)
  319. {
  320. uint32_t codepoint;
  321. size_t characters_processed;
  322. bytewriter_s byte_writer;
  323. if (!dst) // they just want the size
  324. {
  325. size_t position=0;
  326. while (source_len)
  327. {
  328. characters_processed = utf8_to_ucs4_character(src, source_len, &codepoint);
  329. if (codepoint == 0xFFFD)
  330. break;
  331. source_len -= characters_processed;
  332. src += characters_processed;
  333. characters_processed = ucs4_to_utf16_count(codepoint);
  334. if (!characters_processed)
  335. break;
  336. position+=characters_processed;
  337. }
  338. return position;
  339. }
  340. bytewriter_init(&byte_writer, dst, out_len*2);
  341. while(source_len && bytewriter_size(&byte_writer))
  342. {
  343. characters_processed = utf8_to_ucs4_character(src, source_len, &codepoint);
  344. if (codepoint == 0xFFFD)
  345. break;
  346. source_len -= characters_processed;
  347. src += characters_processed;
  348. characters_processed=ucs4_to_utf16BE_character(&byte_writer, codepoint);
  349. if (!characters_processed)
  350. break;
  351. }
  352. if (bytewriter_size(&byte_writer))
  353. bytewriter_write_u16_be(&byte_writer, 0);
  354. return out_len - bytewriter_size(&byte_writer)/2;
  355. }
  // Converts source_len bytes of UTF-8 at src to ISO-8859-1.
  //
  // dst == 0: nothing is written; returns the number of output characters
  //           the convertible prefix of src produces.
  // dst != 0: at most out_len characters are written; a terminating 0 is
  //           appended only when there is room left for it.
  //
  // Code points of 256 and above are replaced with '?'. Conversion stops at
  // the first malformed or truncated UTF-8 sequence or at U+FFFD. Returns
  // the number of characters produced, not counting the terminator.
  size_t utf8_to_ISO_8859_1(const char *src, size_t source_len, char *dst, size_t out_len)
  {
      uint32_t codepoint = 0;
      size_t position = 0;
      size_t characters_processed;

      if (!dst) // they just want the size
      {
          while (source_len)
          {
              characters_processed = utf8_to_ucs4_character(src, source_len, &codepoint);
              // 0 means nothing was decoded; going on would never advance src
              if (!characters_processed || codepoint == 0xFFFD)
                  break;
              source_len -= characters_processed;
              src += characters_processed;
              position++;
          }
          return position;
      }

      while (source_len && position < out_len)
      {
          characters_processed = utf8_to_ucs4_character(src, source_len, &codepoint);
          // 0 means nothing was decoded; do not emit a stale code point
          if (!characters_processed || codepoint == 0xFFFD)
              break;
          source_len -= characters_processed;
          src += characters_processed;

          if (codepoint < 256)
              dst[position++] = codepoint;
          else
              dst[position++] = '?';
      }

      if (position < out_len)
          dst[position] = 0;
      return position;
  }
  // Converts source_len ISO-8859-1 bytes at src to UTF-8.
  //
  // dst == 0: nothing is written; returns the number of UTF-8 bytes needed.
  // dst != 0: at most out_len bytes are written; a terminating 0 is appended
  //           only when there is room left for it.
  //
  // Conversion stops at the first character that does not fit. Returns the
  // number of UTF-8 bytes produced, not counting the terminator.
  size_t ISO_8859_1_to_utf8(const char *src, size_t source_len, char *dst, size_t out_len)
  {
      uint32_t codepoint;
      size_t position = 0;
      size_t characters_processed;

      if (!dst) // they just want the size
      {
          while (source_len)
          {
              // Read through unsigned char: where plain char is signed, bytes
              // 0x80..0xFF would otherwise sign-extend to 0xFFFFFFxx and be
              // rejected as unencodable.
              codepoint = (unsigned char)*src++;
              source_len--;
              characters_processed = ucs4_to_utf8_count(codepoint);
              if (!characters_processed)
                  break;
              position += characters_processed;
          }
          return position;
      }

      while (source_len && position < out_len)
      {
          // same unsigned read as above: each byte maps to U+0000..U+00FF
          codepoint = (unsigned char)*src++;
          source_len--;
          characters_processed = ucs4_to_utf8_character(&dst[position], codepoint, out_len - position);
          if (!characters_processed)
              break;
          position += characters_processed;
      }

      if (position < out_len)
          dst[position] = 0;
      return position;
  }
  421. size_t utf8_to_ucs4(const char *src, size_t source_len, uint32_t *dst, size_t out_len)
  422. {
  423. uint32_t codepoint;
  424. size_t characters_processed;
  425. bytewriter_s byte_writer;
  426. if (!dst) // they just want the size
  427. {
  428. size_t position=0;
  429. while (source_len)
  430. {
  431. characters_processed = utf8_to_ucs4_character(src, source_len, &codepoint);
  432. if (codepoint == 0xFFFD)
  433. break;
  434. source_len -= characters_processed;
  435. src += characters_processed;
  436. characters_processed = 1;
  437. position+=characters_processed;
  438. }
  439. return position;
  440. }
  441. bytewriter_init(&byte_writer, dst, out_len*4);
  442. while(source_len && bytewriter_size(&byte_writer))
  443. {
  444. characters_processed = utf8_to_ucs4_character(src, source_len, &codepoint);
  445. if (codepoint == 0xFFFD)
  446. break;
  447. source_len -= characters_processed;
  448. src += characters_processed;
  449. bytewriter_write_u32_le(&byte_writer, codepoint);
  450. }
  451. if (bytewriter_size(&byte_writer))
  452. bytewriter_write_u32_le(&byte_writer, 0);
  453. return out_len - bytewriter_size(&byte_writer)/4;
  454. }
  // Converts source_len bytes at src to UTF-8, one input byte per code point.
  //
  // dst == 0: nothing is written; returns the number of UTF-8 bytes needed.
  // dst != 0: at most out_len bytes are written; a terminating 0 is appended
  //           only when there is room left for it.
  //
  // Conversion stops at the first byte that cannot be encoded or does not
  // fit. Returns the number of UTF-8 bytes produced, not counting the
  // terminator.
  //
  // NOTE(review): each byte is widened from plain char, so the treatment of
  // bytes >= 0x80 depends on whether char is signed on the target platform
  // (a negative value widens to a code point the encoder rejects) -- confirm
  // which behaviour is intended.
  size_t ASCII_to_utf8(const char *src, size_t source_len, char *dst, size_t out_len)
  {
      size_t written = 0;

      for (;;)
      {
          uint32_t cp;
          size_t step;

          if (!source_len)
              break;
          if (dst && written >= out_len)
              break;

          cp = *src++;
          source_len--;

          if (dst)
              step = ucs4_to_utf8_character(&dst[written], cp, out_len - written);
          else
              step = ucs4_to_utf8_count(cp); // they just want the size
          if (!step)
              break;
          written += step;
      }

      if (dst && written < out_len)
          dst[written] = 0;
      return written;
  }
  // Converts source_len bytes of UTF-8 at src to 7-bit ASCII.
  //
  // dst == 0: nothing is written; returns the number of output characters
  //           the convertible prefix of src produces.
  // dst != 0: at most out_len characters are written; a terminating 0 is
  //           appended only when there is room left for it.
  //
  // Code points of 128 and above are replaced with '?'. Conversion stops at
  // the first malformed or truncated UTF-8 sequence or at U+FFFD. Returns
  // the number of characters produced, not counting the terminator.
  size_t utf8_to_ASCII(const char *src, size_t source_len, char *dst, size_t out_len)
  {
      uint32_t codepoint = 0;
      size_t position = 0;
      size_t characters_processed;

      if (!dst) // they just want the size
      {
          while (source_len)
          {
              characters_processed = utf8_to_ucs4_character(src, source_len, &codepoint);
              // 0 means nothing was decoded; going on would never advance src
              if (!characters_processed || codepoint == 0xFFFD)
                  break;
              source_len -= characters_processed;
              src += characters_processed;
              position++;
          }
          return position;
      }

      while (source_len && position < out_len)
      {
          characters_processed = utf8_to_ucs4_character(src, source_len, &codepoint);
          // 0 means nothing was decoded; do not emit a stale code point
          if (!characters_processed || codepoint == 0xFFFD)
              break;
          source_len -= characters_processed;
          src += characters_processed;

          if (codepoint < 128)
              dst[position++] = codepoint;
          else
              dst[position++] = '?';
      }

      if (position < out_len)
          dst[position] = 0;
      return position;
  }
  // Returns the number of bytes occupied by the first `codepoints` code
  // points of the UTF-8 string at src.
  //
  // Counting stops early at a 0 byte, after source_len bytes, at a malformed
  // or truncated sequence, or at U+FFFD.
  size_t utf8_strnlen(const char *src, size_t source_len, size_t codepoints)
  {
      uint32_t codepoint = 0;
      size_t position = 0;
      size_t i;

      // source_len is tested first so *src is never read past the input
      for (i = 0; i < codepoints && source_len && *src; i++)
      {
          size_t characters_processed = utf8_to_ucs4_character(src, source_len, &codepoint);

          // 0 means nothing was decoded; without this the loop would spin
          // for the remaining iterations without making progress
          if (!characters_processed || codepoint == 0xFFFD)
              break;
          source_len -= characters_processed;
          src += characters_processed;
          position += characters_processed;
      }
      return position;
  }
  535. }