diff options
Diffstat (limited to 'Src/replicant/nu/utf.c')
-rw-r--r-- | Src/replicant/nu/utf.c | 649 |
1 files changed, 649 insertions, 0 deletions
diff --git a/Src/replicant/nu/utf.c b/Src/replicant/nu/utf.c new file mode 100644 index 00000000..04495aa5 --- /dev/null +++ b/Src/replicant/nu/utf.c @@ -0,0 +1,649 @@ +#include "utf.h" + +#include "ByteReader.h" +#include "ByteWriter.h" +#include "foundation/error.h" +#include <string.h> + +static const uint8_t mask_tab[6]={0x80,0xE0,0xF0,0xF8,0xFC,0xFE}; + +static const uint8_t val_tab[6]={0,0xC0,0xE0,0xF0,0xF8,0xFC}; + +// returns the number of utf-16 words required to store a given codepoint +static size_t ucs4_to_utf16_count(uint32_t codepoint) +{ + if (codepoint >= 0x110000) + return 0; // out of bounds + + if (codepoint >= 0x10000) + return 2; + + return 1; +} + +static int utf16LE_to_ucs4_character(bytereader_t const byte_reader, uint32_t *codepoint) +{ + uint16_t lead; + + lead = bytereader_read_u16_le(byte_reader); + if (lead < 0xD800 || lead >= 0xE000) + { + *codepoint = lead; + return NErr_Success; + } + + if (lead < 0xDC00) + { + if (bytereader_size(byte_reader) >= 2) + { + uint16_t trail = bytereader_read_u16_le(byte_reader); + if (trail >= 0xDC00 && trail < 0xE000) + { + *codepoint = 0x10000 + ((lead - 0xD800) << 10) + (trail - 0xDC00); + return NErr_Success; + } + } + } + + return NErr_Error; // invalid +} + +static int utf16BE_to_ucs4_character(bytereader_t const byte_reader, uint32_t *codepoint) +{ + uint16_t lead; + + lead = bytereader_read_u16_be(byte_reader); + if (lead < 0xD800 || lead >= 0xE000) + { + *codepoint = lead; + return NErr_Success; + } + + if (lead < 0xDC00) + { + if (bytereader_size(byte_reader) >= 2) + { + uint16_t trail = bytereader_read_u16_be(byte_reader); + if (trail >= 0xDC00 && trail < 0xE000) + { + *codepoint = 0x10000 + ((lead - 0xD800) << 10) + (trail - 0xDC00); + return NErr_Success; + } + } + } + + return NErr_Error; // invalid +} + +static size_t utf8_to_ucs4_character(const char *utf8, size_t len, uint32_t *codepoint) +{ + uint32_t res=0; + size_t n; + size_t cnt=0; + while(1) + { + if ((*utf8&mask_tab[cnt])==val_tab[cnt]) break; + if (++cnt==6) return 0; + } + cnt++; + + + if (cnt==2 && !(*utf8&0x1E)) + return 0; + + if (cnt==1) + res=*utf8; + else + res=(0xFF>>(cnt+1))&*utf8; + + if (cnt > len) + return 0; + + for (n=1;n<cnt;n++) + { + if ((utf8[n]&0xC0) != 0x80) + return 0; + if (!res && n==2 && !((utf8[n]&0x7F) >> (7 - cnt))) + return 0; + + res=(res<<6)|(utf8[n]&0x3F); + } + + if (codepoint) + *codepoint=res; + + return cnt; +} + +// returns the number of utf-8 bytes required to store a given codepoint +static size_t ucs4_to_utf8_count(uint32_t codepoint) +{ + if (codepoint < 0x80) + return 1; + else if (codepoint < 0x800) + return 2; + else if (codepoint < 0x10000) + return 3; + else if (codepoint < 0x200000) + return 4; + else if (codepoint < 0x4000000) + return 5; + else if (codepoint <= 0x7FFFFFFF) + return 6; + else + return 0; +} + +static size_t ucs4_to_utf8_character(char *target, uint32_t codepoint, size_t max) +{ + size_t count = ucs4_to_utf8_count(codepoint); + + if (!count) + return 0; + + if (count>max) return 0; + + if (target == 0) + return count; + + switch (count) + { + case 6: + target[5] = 0x80 | (codepoint & 0x3F); + codepoint = codepoint >> 6; + codepoint |= 0x4000000; + case 5: + target[4] = 0x80 | (codepoint & 0x3F); + codepoint = codepoint >> 6; + codepoint |= 0x200000; + case 4: + target[3] = 0x80 | (codepoint & 0x3F); + codepoint = codepoint >> 6; + codepoint |= 0x10000; + case 3: + target[2] = 0x80 | (codepoint & 0x3F); + codepoint = codepoint >> 6; + codepoint |= 0x800; + case 2: + target[1] = 0x80 | (codepoint & 0x3F); + codepoint = codepoint >> 6; + codepoint |= 0xC0; + case 1: + target[0] = codepoint; + } + + return count; +} + +static size_t ucs4_to_utf16LE_character(bytewriter_t byte_writer, uint32_t codepoint) +{ + if (codepoint >= 0x110000) + return 0; + + if (codepoint >= 0x10000) + { + if (bytewriter_size(byte_writer) < 4) + return 0; + + bytewriter_write_u16_le(byte_writer, ((codepoint - 0x10000) >> 10) + 0xD800); // high surrogate + bytewriter_write_u16_le(byte_writer, ((codepoint - 0x10000) & 0x3FF) + 0xDC00); // low surrogate + return 2; + } + else + { + bytewriter_write_u16_le(byte_writer, codepoint); + return 1; + } +} + +static size_t ucs4_to_utf16BE_character(bytewriter_t byte_writer, uint32_t codepoint) +{ + if (codepoint >= 0x110000) + return 0; + + if (codepoint >= 0x10000) + { + if (bytewriter_size(byte_writer) < 4) + return 0; + + bytewriter_write_u16_be(byte_writer, ((codepoint - 0x10000) >> 10) + 0xD800); // high surrogate + bytewriter_write_u16_be(byte_writer, ((codepoint - 0x10000) & 0x3FF) + 0xDC00); // low surrogate + return 2; + } + else + { + bytewriter_write_u16_be(byte_writer, codepoint); + return 1; + } +} + +size_t utf16LE_to_utf8(const uint16_t *src, size_t source_len, char *dst, size_t out_len) +{ + uint32_t codepoint; + size_t position=0; + size_t characters_processed; + bytereader_s byte_reader; + bytereader_init(&byte_reader, src, source_len*2); + + if (!dst) // they just want the size + { + while (bytereader_size(&byte_reader)) + { + if (utf16LE_to_ucs4_character(&byte_reader, &codepoint) != NErr_Success) + break; + + characters_processed = ucs4_to_utf8_count(codepoint); + if (!characters_processed) + break; + + position+=characters_processed; + } + return position; + } + + while(bytereader_size(&byte_reader) && position<out_len) + { + if (utf16LE_to_ucs4_character(&byte_reader, &codepoint) != NErr_Success) + break; + + characters_processed=ucs4_to_utf8_character(&dst[position], codepoint, out_len-position); + if (!characters_processed) + break; + position+=characters_processed; + } + if (position<out_len) + dst[position]=0; + return position; +} + +size_t utf16BE_to_utf8(const uint16_t *src, size_t source_len, char *dst, size_t out_len) +{ + uint32_t codepoint; + size_t position=0; + size_t characters_processed; + bytereader_s byte_reader; + bytereader_init(&byte_reader, src, source_len*2); + + if (!dst) // they just want the size + { + while (bytereader_size(&byte_reader)) + { + if (utf16BE_to_ucs4_character(&byte_reader, &codepoint) != NErr_Success) + break; + + characters_processed = ucs4_to_utf8_count(codepoint); + if (!characters_processed) + break; + + position+=characters_processed; + } + return position; + } + + while(bytereader_size(&byte_reader) && position<out_len) + { + if (utf16BE_to_ucs4_character(&byte_reader, &codepoint) != NErr_Success) + break; + + characters_processed=ucs4_to_utf8_character(&dst[position], codepoint, out_len-position); + if (!characters_processed) + break; + position+=characters_processed; + } + if (position<out_len) + dst[position]=0; + return position; +} + + +size_t ucs4_to_utf8(const uint32_t *src, size_t source_len, char *dst, size_t out_len) +{ + uint32_t codepoint; + size_t position=0; + size_t characters_processed; + bytereader_s byte_reader; + bytereader_init(&byte_reader, src, source_len*4); + + if (!dst) // they just want the size + { + while (bytereader_size(&byte_reader) > 3) + { + codepoint = bytereader_read_u32_le(&byte_reader); + + characters_processed = ucs4_to_utf8_count(codepoint); + if (!characters_processed) + break; + + position+=characters_processed; + } + return position; + } + + while(bytereader_size(&byte_reader) > 3 && position<out_len) + { + codepoint = bytereader_read_u32_le(&byte_reader); + + characters_processed=ucs4_to_utf8_character(&dst[position], codepoint, out_len-position); + if (!characters_processed) + break; + position+=characters_processed; + } + if (position<out_len) + dst[position]=0; + return position; +} + +size_t utf8_to_utf16LE(const char *src, size_t source_len, uint16_t *dst, size_t out_len) +{ + uint32_t codepoint; + size_t characters_processed; + bytewriter_s byte_writer; + + if (!dst) // they just want the size + { + size_t position=0; + while (source_len) + { + characters_processed = utf8_to_ucs4_character(src, source_len, &codepoint); + if (codepoint == 0xFFFD) + break; + + source_len -= characters_processed; + src += characters_processed; + + characters_processed = ucs4_to_utf16_count(codepoint); + if (!characters_processed) + break; + + position+=characters_processed; + } + return position; + } + + + bytewriter_init(&byte_writer, dst, out_len*2); + while(source_len && bytewriter_size(&byte_writer)) + { + characters_processed = utf8_to_ucs4_character(src, source_len, &codepoint); + if (codepoint == 0xFFFD) + break; + + source_len -= characters_processed; + src += characters_processed; + + characters_processed=ucs4_to_utf16LE_character(&byte_writer, codepoint); + if (!characters_processed) + break; + } + if (bytewriter_size(&byte_writer)) + bytewriter_write_u16_le(&byte_writer, 0); + return out_len - bytewriter_size(&byte_writer)/2; +} + +size_t utf8_to_utf16BE(const char *src, size_t source_len, uint16_t *dst, size_t out_len) +{ + uint32_t codepoint; + size_t characters_processed; + bytewriter_s byte_writer; + + if (!dst) // they just want the size + { + size_t position=0; + while (source_len) + { + characters_processed = utf8_to_ucs4_character(src, source_len, &codepoint); + if (codepoint == 0xFFFD) + break; + + source_len -= characters_processed; + src += characters_processed; + + characters_processed = ucs4_to_utf16_count(codepoint); + if (!characters_processed) + break; + + position+=characters_processed; + } + return position; + } + bytewriter_init(&byte_writer, dst, out_len*2); + while(source_len && bytewriter_size(&byte_writer)) + { + characters_processed = utf8_to_ucs4_character(src, source_len, &codepoint); + if (codepoint == 0xFFFD) + break; + + source_len -= characters_processed; + src += characters_processed; + + characters_processed=ucs4_to_utf16BE_character(&byte_writer, codepoint); + if (!characters_processed) + break; + + } + if (bytewriter_size(&byte_writer)) + bytewriter_write_u16_be(&byte_writer, 0); + + return out_len - bytewriter_size(&byte_writer)/2; + +} + +size_t utf8_to_ISO_8859_1(const char *src, size_t source_len, char *dst, size_t out_len) +{ + uint32_t codepoint; + size_t position=0; + size_t characters_processed; + + if (!dst) // they just want the size + { + while (source_len) + { + characters_processed = utf8_to_ucs4_character(src, source_len, &codepoint); + if (codepoint == 0xFFFD) + break; + + source_len -= characters_processed; + src += characters_processed; + position++; + } + return position; + } + + while(source_len && position<out_len) + { + characters_processed = utf8_to_ucs4_character(src, source_len, &codepoint); + if (codepoint == 0xFFFD) + break; + + source_len -= characters_processed; + src += characters_processed; + + if (codepoint < 256) + dst[position++] = codepoint; + else + dst[position++] = '?'; + } + if (position<out_len) + dst[position]=0; + return position; +} + +size_t ISO_8859_1_to_utf8(const char *src, size_t source_len, char *dst, size_t out_len) +{ + uint32_t codepoint; + size_t position=0; + size_t characters_processed; + + if (!dst) // they just want the size + { + while (source_len) + { + codepoint = *src++; + source_len--; + + characters_processed = ucs4_to_utf8_count(codepoint); + if (!characters_processed) + break; + + position+=characters_processed; + } + return position; + } + + while(source_len && position<out_len) + { + codepoint = *src++; + + source_len--; + + characters_processed=ucs4_to_utf8_character(&dst[position], codepoint, out_len-position); + if (!characters_processed) + break; + + position+=characters_processed; + } + if (position<out_len) + dst[position]=0; + return position; +} + +size_t utf8_to_ucs4(const char *src, size_t source_len, uint32_t *dst, size_t out_len) +{ + uint32_t codepoint; + size_t characters_processed; + bytewriter_s byte_writer; + + if (!dst) // they just want the size + { + size_t position=0; + while (source_len) + { + characters_processed = utf8_to_ucs4_character(src, source_len, &codepoint); + if (codepoint == 0xFFFD) + break; + + source_len -= characters_processed; + src += characters_processed; + + characters_processed = 1; + + position+=characters_processed; + } + return position; + } + + bytewriter_init(&byte_writer, dst, out_len*4); + while(source_len && bytewriter_size(&byte_writer)) + { + characters_processed = utf8_to_ucs4_character(src, source_len, &codepoint); + if (codepoint == 0xFFFD) + break; + + source_len -= characters_processed; + src += characters_processed; + + bytewriter_write_u32_le(&byte_writer, codepoint); + } + if (bytewriter_size(&byte_writer)) + bytewriter_write_u32_le(&byte_writer, 0); + return out_len - bytewriter_size(&byte_writer)/4; +} + +size_t ASCII_to_utf8(const char *src, size_t source_len, char *dst, size_t out_len) +{ + uint32_t codepoint; + size_t position=0; + size_t characters_processed; + + if (!dst) // they just want the size + { + while (source_len) + { + codepoint = *src++; + source_len--; + + characters_processed = ucs4_to_utf8_count(codepoint); + if (!characters_processed) + break; + + position+=characters_processed; + } + return position; + } + + while(source_len && position<out_len) + { + codepoint = *src++; + + source_len--; + + characters_processed=ucs4_to_utf8_character(&dst[position], codepoint, out_len-position); + if (!characters_processed) + break; + + position+=characters_processed; + } + if (position<out_len) + dst[position]=0; + return position; +} + +size_t utf8_to_ASCII(const char *src, size_t source_len, char *dst, size_t out_len) +{ + uint32_t codepoint; + size_t position=0; + size_t characters_processed; + + if (!dst) // they just want the size + { + while (source_len) + { + characters_processed = utf8_to_ucs4_character(src, source_len, &codepoint); + if (codepoint == 0xFFFD) + break; + + source_len -= characters_processed; + src += characters_processed; + position++; + } + return position; + } + + while(source_len && position<out_len) + { + characters_processed = utf8_to_ucs4_character(src, source_len, &codepoint); + if (codepoint == 0xFFFD) + break; + + source_len -= characters_processed; + src += characters_processed; + + if (codepoint < 128) + dst[position++] = codepoint; + else + dst[position++] = '?'; + } + if (position<out_len) + dst[position]=0; + return position; +} + +size_t utf8_strnlen(const char *src, size_t source_len, size_t codepoints) +{ + uint32_t codepoint = 0; + size_t position=0; + size_t i=0; + + for (i=0;i<codepoints && *src;i++) + { + size_t characters_processed = utf8_to_ucs4_character(src, source_len, &codepoint); + if (codepoint == 0xFFFD) + break; + + source_len -= characters_processed; + src += characters_processed; + position+=characters_processed; + } + return position; + +} |