123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649 |
- #include "utf.h"
- #include "ByteReader.h"
- #include "ByteWriter.h"
- #include "foundation/error.h"
- #include <string.h>
// Lead-byte classification tables for UTF-8 sequences of 1..6 bytes.
// A byte b starts a sequence of (i + 1) bytes when
// (b & mask_tab[i]) == val_tab[i].
static const uint8_t mask_tab[6] = {
	0x80, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE
};
static const uint8_t val_tab[6] = {
	0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
};
// Number of UTF-16 code units needed to store a code point:
// 1 for values below 0x10000, 2 for 0x10000..0x10FFFF,
// and 0 when the value lies beyond 0x10FFFF.
static size_t ucs4_to_utf16_count(uint32_t codepoint)
{
	if (codepoint < 0x10000)
		return 1;
	return (codepoint < 0x110000) ? 2 : 0;
}
- static int utf16LE_to_ucs4_character(bytereader_t const byte_reader, uint32_t *codepoint)
- {
- uint16_t lead;
- lead = bytereader_read_u16_le(byte_reader);
- if (lead < 0xD800 || lead >= 0xE000)
- {
- *codepoint = lead;
- return NErr_Success;
- }
- if (lead < 0xDC00)
- {
- if (bytereader_size(byte_reader) >= 2)
- {
- uint16_t trail = bytereader_read_u16_le(byte_reader);
- if (trail >= 0xDC00 && trail < 0xE000)
- {
- *codepoint = 0x10000 + ((lead - 0xD800) << 10) + (trail - 0xDC00);
- return NErr_Success;
- }
- }
- }
- return NErr_Error; // invalid
- }
- static int utf16BE_to_ucs4_character(bytereader_t const byte_reader, uint32_t *codepoint)
- {
- uint16_t lead;
- lead = bytereader_read_u16_be(byte_reader);
- if (lead < 0xD800 || lead >= 0xE000)
- {
- *codepoint = lead;
- return NErr_Success;
- }
- if (lead < 0xDC00)
- {
- if (bytereader_size(byte_reader) >= 2)
- {
- uint16_t trail = bytereader_read_u16_be(byte_reader);
- if (trail >= 0xDC00 && trail < 0xE000)
- {
- *codepoint = 0x10000 + ((lead - 0xD800) << 10) + (trail - 0xDC00);
- return NErr_Success;
- }
- }
- }
- return NErr_Error; // invalid
- }
- static size_t utf8_to_ucs4_character(const char *utf8, size_t len, uint32_t *codepoint)
- {
- uint32_t res=0;
- size_t n;
- size_t cnt=0;
- while(1)
- {
- if ((*utf8&mask_tab[cnt])==val_tab[cnt]) break;
- if (++cnt==6) return 0;
- }
- cnt++;
- if (cnt==2 && !(*utf8&0x1E))
- return 0;
- if (cnt==1)
- res=*utf8;
- else
- res=(0xFF>>(cnt+1))&*utf8;
- if (cnt > len)
- return 0;
- for (n=1;n<cnt;n++)
- {
- if ((utf8[n]&0xC0) != 0x80)
- return 0;
- if (!res && n==2 && !((utf8[n]&0x7F) >> (7 - cnt)))
- return 0;
- res=(res<<6)|(utf8[n]&0x3F);
- }
- if (codepoint)
- *codepoint=res;
- return cnt;
- }
// Number of bytes the (up to six byte) UTF-8 encoding of a code point
// occupies, or 0 when the value is above 0x7FFFFFFF and cannot be encoded.
static size_t ucs4_to_utf8_count(uint32_t codepoint)
{
	// limits[i] is the first value that no longer fits in (i + 1) bytes
	static const uint32_t limits[6] = {
		0x80, 0x800, 0x10000, 0x200000, 0x4000000, 0x80000000
	};
	size_t i;

	for (i = 0; i < 6; i++)
	{
		if (codepoint < limits[i])
			return i + 1;
	}
	return 0;
}
// Encodes codepoint as UTF-8 (1..6 bytes) into target.
//   target    - destination, or NULL to only compute the length
//   codepoint - value to encode
//   max       - number of bytes available at target
// Returns the number of bytes the encoding takes, or 0 when the code point
// cannot be encoded or needs more than max bytes.  No terminator is written.
static size_t ucs4_to_utf8_character(char *target, uint32_t codepoint, size_t max)
{
	// lead_prefix[n] holds the marker bits of the first byte of an n-byte sequence
	static const uint8_t lead_prefix[7] = {
		0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
	};
	const size_t length = ucs4_to_utf8_count(codepoint);
	size_t i;

	if (length == 0 || length > max)
		return 0;
	if (!target)
		return length;

	// Fill the continuation bytes back to front, six payload bits each.
	for (i = length - 1; i > 0; i--)
	{
		target[i] = (char)(0x80 | (codepoint & 0x3F));
		codepoint >>= 6;
	}
	// Whatever is left belongs in the lead byte, below its marker bits.
	target[0] = (char)(lead_prefix[length] | codepoint);
	return length;
}
- static size_t ucs4_to_utf16LE_character(bytewriter_t byte_writer, uint32_t codepoint)
- {
- if (codepoint >= 0x110000)
- return 0;
- if (codepoint >= 0x10000)
- {
- if (bytewriter_size(byte_writer) < 4)
- return 0;
- bytewriter_write_u16_le(byte_writer, ((codepoint - 0x10000) >> 10) + 0xD800); // high surrogate
- bytewriter_write_u16_le(byte_writer, ((codepoint - 0x10000) & 0x3FF) + 0xDC00); // low surrogate
- return 2;
- }
- else
- {
- bytewriter_write_u16_le(byte_writer, codepoint);
- return 1;
- }
- }
- static size_t ucs4_to_utf16BE_character(bytewriter_t byte_writer, uint32_t codepoint)
- {
- if (codepoint >= 0x110000)
- return 0;
- if (codepoint >= 0x10000)
- {
- if (bytewriter_size(byte_writer) < 4)
- return 0;
- bytewriter_write_u16_be(byte_writer, ((codepoint - 0x10000) >> 10) + 0xD800); // high surrogate
- bytewriter_write_u16_be(byte_writer, ((codepoint - 0x10000) & 0x3FF) + 0xDC00); // low surrogate
- return 2;
- }
- else
- {
- bytewriter_write_u16_be(byte_writer, codepoint);
- return 1;
- }
- }
- size_t utf16LE_to_utf8(const uint16_t *src, size_t source_len, char *dst, size_t out_len)
- {
- uint32_t codepoint;
- size_t position=0;
- size_t characters_processed;
- bytereader_s byte_reader;
- bytereader_init(&byte_reader, src, source_len*2);
- if (!dst) // they just want the size
- {
- while (bytereader_size(&byte_reader))
- {
- if (utf16LE_to_ucs4_character(&byte_reader, &codepoint) != NErr_Success)
- break;
- characters_processed = ucs4_to_utf8_count(codepoint);
- if (!characters_processed)
- break;
- position+=characters_processed;
- }
- return position;
- }
- while(bytereader_size(&byte_reader) && position<out_len)
- {
- if (utf16LE_to_ucs4_character(&byte_reader, &codepoint) != NErr_Success)
- break;
- characters_processed=ucs4_to_utf8_character(&dst[position], codepoint, out_len-position);
- if (!characters_processed)
- break;
- position+=characters_processed;
- }
- if (position<out_len)
- dst[position]=0;
- return position;
- }
- size_t utf16BE_to_utf8(const uint16_t *src, size_t source_len, char *dst, size_t out_len)
- {
- uint32_t codepoint;
- size_t position=0;
- size_t characters_processed;
- bytereader_s byte_reader;
- bytereader_init(&byte_reader, src, source_len*2);
- if (!dst) // they just want the size
- {
- while (bytereader_size(&byte_reader))
- {
- if (utf16BE_to_ucs4_character(&byte_reader, &codepoint) != NErr_Success)
- break;
- characters_processed = ucs4_to_utf8_count(codepoint);
- if (!characters_processed)
- break;
- position+=characters_processed;
- }
- return position;
- }
- while(bytereader_size(&byte_reader) && position<out_len)
- {
- if (utf16BE_to_ucs4_character(&byte_reader, &codepoint) != NErr_Success)
- break;
- characters_processed=ucs4_to_utf8_character(&dst[position], codepoint, out_len-position);
- if (!characters_processed)
- break;
- position+=characters_processed;
- }
- if (position<out_len)
- dst[position]=0;
- return position;
- }
- size_t ucs4_to_utf8(const uint32_t *src, size_t source_len, char *dst, size_t out_len)
- {
- uint32_t codepoint;
- size_t position=0;
- size_t characters_processed;
- bytereader_s byte_reader;
- bytereader_init(&byte_reader, src, source_len*4);
- if (!dst) // they just want the size
- {
- while (bytereader_size(&byte_reader) > 3)
- {
- codepoint = bytereader_read_u32_le(&byte_reader);
-
- characters_processed = ucs4_to_utf8_count(codepoint);
- if (!characters_processed)
- break;
- position+=characters_processed;
- }
- return position;
- }
- while(bytereader_size(&byte_reader) > 3 && position<out_len)
- {
- codepoint = bytereader_read_u32_le(&byte_reader);
- characters_processed=ucs4_to_utf8_character(&dst[position], codepoint, out_len-position);
- if (!characters_processed)
- break;
- position+=characters_processed;
- }
- if (position<out_len)
- dst[position]=0;
- return position;
- }
- size_t utf8_to_utf16LE(const char *src, size_t source_len, uint16_t *dst, size_t out_len)
- {
- uint32_t codepoint;
- size_t characters_processed;
- bytewriter_s byte_writer;
- if (!dst) // they just want the size
- {
- size_t position=0;
- while (source_len)
- {
- characters_processed = utf8_to_ucs4_character(src, source_len, &codepoint);
- if (codepoint == 0xFFFD)
- break;
- source_len -= characters_processed;
- src += characters_processed;
- characters_processed = ucs4_to_utf16_count(codepoint);
- if (!characters_processed)
- break;
- position+=characters_processed;
- }
- return position;
- }
- bytewriter_init(&byte_writer, dst, out_len*2);
- while(source_len && bytewriter_size(&byte_writer))
- {
- characters_processed = utf8_to_ucs4_character(src, source_len, &codepoint);
- if (codepoint == 0xFFFD)
- break;
- source_len -= characters_processed;
- src += characters_processed;
- characters_processed=ucs4_to_utf16LE_character(&byte_writer, codepoint);
- if (!characters_processed)
- break;
- }
- if (bytewriter_size(&byte_writer))
- bytewriter_write_u16_le(&byte_writer, 0);
- return out_len - bytewriter_size(&byte_writer)/2;
- }
- size_t utf8_to_utf16BE(const char *src, size_t source_len, uint16_t *dst, size_t out_len)
- {
- uint32_t codepoint;
- size_t characters_processed;
- bytewriter_s byte_writer;
- if (!dst) // they just want the size
- {
- size_t position=0;
- while (source_len)
- {
- characters_processed = utf8_to_ucs4_character(src, source_len, &codepoint);
- if (codepoint == 0xFFFD)
- break;
- source_len -= characters_processed;
- src += characters_processed;
- characters_processed = ucs4_to_utf16_count(codepoint);
- if (!characters_processed)
- break;
- position+=characters_processed;
- }
- return position;
- }
- bytewriter_init(&byte_writer, dst, out_len*2);
- while(source_len && bytewriter_size(&byte_writer))
- {
- characters_processed = utf8_to_ucs4_character(src, source_len, &codepoint);
- if (codepoint == 0xFFFD)
- break;
- source_len -= characters_processed;
- src += characters_processed;
- characters_processed=ucs4_to_utf16BE_character(&byte_writer, codepoint);
- if (!characters_processed)
- break;
- }
- if (bytewriter_size(&byte_writer))
- bytewriter_write_u16_be(&byte_writer, 0);
-
- return out_len - bytewriter_size(&byte_writer)/2;
- }
// Converts source_len bytes of UTF-8 at src to ISO-8859-1.
// Code points of 256 and above are replaced with '?'.
//
// dst == NULL: nothing is written; returns the number of characters in the
// decodable prefix of src (one output byte each, terminator not counted).
// dst != NULL: writes at most out_len bytes to dst and returns the number
// written.  A zero byte is appended, and not counted, only if room remains.
//
// Conversion stops at the first malformed or truncated sequence or at a
// decoded U+FFFD.
size_t utf8_to_ISO_8859_1(const char *src, size_t source_len, char *dst, size_t out_len)
{
	uint32_t codepoint = 0;
	size_t position = 0;
	size_t characters_processed;

	if (!dst) // they just want the size
	{
		while (source_len)
		{
			characters_processed = utf8_to_ucs4_character(src, source_len, &codepoint);
			// A return of 0 means nothing was consumed; stopping here
			// keeps the loop from spinning forever on bad input.
			if (!characters_processed || codepoint == 0xFFFD)
				break;
			source_len -= characters_processed;
			src += characters_processed;
			position++;
		}
		return position;
	}

	while (source_len && position < out_len)
	{
		characters_processed = utf8_to_ucs4_character(src, source_len, &codepoint);
		// Same guard as above: never emit a stale code point.
		if (!characters_processed || codepoint == 0xFFFD)
			break;
		source_len -= characters_processed;
		src += characters_processed;

		if (codepoint < 256)
			dst[position++] = (char)codepoint;
		else
			dst[position++] = '?';
	}

	if (position < out_len)
		dst[position] = 0;
	return position;
}
// Converts source_len bytes of ISO-8859-1 at src to UTF-8.
// Bytes below 0x80 are copied; bytes 0x80..0xFF become two-byte sequences.
// Input bytes are read as unsigned so that the result does not depend on
// whether plain char is signed.
//
// dst == NULL: nothing is written; returns the number of UTF-8 bytes needed
// (terminator not counted).
// dst != NULL: writes at most out_len bytes to dst and returns the number
// written.  Conversion stops at the first character that no longer fits.
// A zero byte is appended, and not counted, only if room remains.
size_t ISO_8859_1_to_utf8(const char *src, size_t source_len, char *dst, size_t out_len)
{
	size_t position = 0;

	if (!dst) // they just want the size
	{
		while (source_len)
		{
			const unsigned char byte = (unsigned char)*src++;

			source_len--;
			position += (byte < 0x80) ? 1 : 2;
		}
		return position;
	}

	while (source_len && position < out_len)
	{
		const unsigned char byte = (unsigned char)*src++;

		source_len--;
		if (byte < 0x80)
		{
			dst[position++] = (char)byte;
		}
		else
		{
			if (out_len - position < 2)
				break; // no room for a two-byte sequence
			dst[position++] = (char)(0xC0 | (byte >> 6));
			dst[position++] = (char)(0x80 | (byte & 0x3F));
		}
	}

	if (position < out_len)
		dst[position] = 0;
	return position;
}
- size_t utf8_to_ucs4(const char *src, size_t source_len, uint32_t *dst, size_t out_len)
- {
- uint32_t codepoint;
- size_t characters_processed;
- bytewriter_s byte_writer;
- if (!dst) // they just want the size
- {
- size_t position=0;
- while (source_len)
- {
- characters_processed = utf8_to_ucs4_character(src, source_len, &codepoint);
- if (codepoint == 0xFFFD)
- break;
- source_len -= characters_processed;
- src += characters_processed;
- characters_processed = 1;
- position+=characters_processed;
- }
- return position;
- }
- bytewriter_init(&byte_writer, dst, out_len*4);
- while(source_len && bytewriter_size(&byte_writer))
- {
- characters_processed = utf8_to_ucs4_character(src, source_len, &codepoint);
- if (codepoint == 0xFFFD)
- break;
- source_len -= characters_processed;
- src += characters_processed;
- bytewriter_write_u32_le(&byte_writer, codepoint);
- }
- if (bytewriter_size(&byte_writer))
- bytewriter_write_u32_le(&byte_writer, 0);
- return out_len - bytewriter_size(&byte_writer)/4;
- }
// Converts source_len bytes of ASCII at src to UTF-8 (a plain copy, since
// every ASCII byte is its own UTF-8 encoding).
// Conversion stops at the first byte of 0x80 or above.  Bytes are examined
// as unsigned so that this does not depend on whether plain char is signed.
//
// dst == NULL: nothing is written; returns the length of the leading ASCII
// run of src (terminator not counted).
// dst != NULL: writes at most out_len bytes to dst and returns the number
// written.  A zero byte is appended, and not counted, only if room remains.
size_t ASCII_to_utf8(const char *src, size_t source_len, char *dst, size_t out_len)
{
	size_t position = 0;

	if (!dst) // they just want the size
	{
		while (position < source_len && (unsigned char)src[position] < 0x80)
			position++;
		return position;
	}

	while (position < source_len && position < out_len)
	{
		const unsigned char byte = (unsigned char)src[position];

		if (byte >= 0x80)
			break; // not ASCII
		dst[position++] = (char)byte;
	}

	if (position < out_len)
		dst[position] = 0;
	return position;
}
// Converts source_len bytes of UTF-8 at src to ASCII.
// Code points of 128 and above are replaced with '?'.
//
// dst == NULL: nothing is written; returns the number of characters in the
// decodable prefix of src (one output byte each, terminator not counted).
// dst != NULL: writes at most out_len bytes to dst and returns the number
// written.  A zero byte is appended, and not counted, only if room remains.
//
// Conversion stops at the first malformed or truncated sequence or at a
// decoded U+FFFD.
size_t utf8_to_ASCII(const char *src, size_t source_len, char *dst, size_t out_len)
{
	uint32_t codepoint = 0;
	size_t position = 0;
	size_t characters_processed;

	if (!dst) // they just want the size
	{
		while (source_len)
		{
			characters_processed = utf8_to_ucs4_character(src, source_len, &codepoint);
			// A return of 0 means nothing was consumed; stopping here
			// keeps the loop from spinning forever on bad input.
			if (!characters_processed || codepoint == 0xFFFD)
				break;
			source_len -= characters_processed;
			src += characters_processed;
			position++;
		}
		return position;
	}

	while (source_len && position < out_len)
	{
		characters_processed = utf8_to_ucs4_character(src, source_len, &codepoint);
		// Same guard as above: never emit a stale code point.
		if (!characters_processed || codepoint == 0xFFFD)
			break;
		source_len -= characters_processed;
		src += characters_processed;

		if (codepoint < 128)
			dst[position++] = (char)codepoint;
		else
			dst[position++] = '?';
	}

	if (position < out_len)
		dst[position] = 0;
	return position;
}
// Returns the number of bytes occupied by the first `codepoints` code
// points of the UTF-8 string at src.
//   src        - UTF-8 text
//   source_len - number of bytes available at src
//   codepoints - maximum number of code points to measure
// Measuring stops early at a zero byte, when source_len bytes have been
// consumed, at a malformed or truncated sequence, or at a decoded U+FFFD;
// the bytes of the sequence that caused the stop are not counted.
size_t utf8_strnlen(const char *src, size_t source_len, size_t codepoints)
{
	uint32_t codepoint = 0;
	size_t position = 0;
	size_t i;

	// source_len is tested before *src so the buffer is never read past its end
	for (i = 0; i < codepoints && source_len && *src; i++)
	{
		size_t characters_processed = utf8_to_ucs4_character(src, source_len, &codepoint);

		// A return of 0 means nothing was consumed; without this break the
		// loop would spin, making no progress, until i reaches codepoints.
		if (!characters_processed || codepoint == 0xFFFD)
			break;
		source_len -= characters_processed;
		src += characters_processed;
		position += characters_processed;
	}
	return position;
}
|