From 20d28e80a5c861a9d5f449ea911ab75b4f37ad0d Mon Sep 17 00:00:00 2001 From: Jef Date: Tue, 24 Sep 2024 14:54:57 +0200 Subject: Initial community commit --- Src/replicant/nu/utf.c | 649 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 649 insertions(+) create mode 100644 Src/replicant/nu/utf.c (limited to 'Src/replicant/nu/utf.c') diff --git a/Src/replicant/nu/utf.c b/Src/replicant/nu/utf.c new file mode 100644 index 00000000..04495aa5 --- /dev/null +++ b/Src/replicant/nu/utf.c @@ -0,0 +1,649 @@ +#include "utf.h" + +#include "ByteReader.h" +#include "ByteWriter.h" +#include "foundation/error.h" +#include + +static const uint8_t mask_tab[6]={0x80,0xE0,0xF0,0xF8,0xFC,0xFE}; + +static const uint8_t val_tab[6]={0,0xC0,0xE0,0xF0,0xF8,0xFC}; + +// returns the number of utf-16 words required to store a given codepoint +static size_t ucs4_to_utf16_count(uint32_t codepoint) +{ + if (codepoint >= 0x110000) + return 0; // out of bounds + + if (codepoint >= 0x10000) + return 2; + + return 1; +} + +static int utf16LE_to_ucs4_character(bytereader_t const byte_reader, uint32_t *codepoint) +{ + uint16_t lead; + + lead = bytereader_read_u16_le(byte_reader); + if (lead < 0xD800 || lead >= 0xE000) + { + *codepoint = lead; + return NErr_Success; + } + + if (lead < 0xDC00) + { + if (bytereader_size(byte_reader) >= 2) + { + uint16_t trail = bytereader_read_u16_le(byte_reader); + if (trail >= 0xDC00 && trail < 0xE000) + { + *codepoint = 0x10000 + ((lead - 0xD800) << 10) + (trail - 0xDC00); + return NErr_Success; + } + } + } + + return NErr_Error; // invalid +} + +static int utf16BE_to_ucs4_character(bytereader_t const byte_reader, uint32_t *codepoint) +{ + uint16_t lead; + + lead = bytereader_read_u16_be(byte_reader); + if (lead < 0xD800 || lead >= 0xE000) + { + *codepoint = lead; + return NErr_Success; + } + + if (lead < 0xDC00) + { + if (bytereader_size(byte_reader) >= 2) + { + uint16_t trail = bytereader_read_u16_be(byte_reader); + if (trail >= 0xDC00 && trail < 0xE000) + { + *codepoint = 0x10000 + ((lead - 0xD800) << 10) + (trail - 0xDC00); + return NErr_Success; + } + } + } + + return NErr_Error; // invalid +} + +static size_t utf8_to_ucs4_character(const char *utf8, size_t len, uint32_t *codepoint) +{ + uint32_t res=0; + size_t n; + size_t cnt=0; + while(1) + { + if ((*utf8&mask_tab[cnt])==val_tab[cnt]) break; + if (++cnt==6) return 0; + } + cnt++; + + + if (cnt==2 && !(*utf8&0x1E)) + return 0; + + if (cnt==1) + res=*utf8; + else + res=(0xFF>>(cnt+1))&*utf8; + + if (cnt > len) + return 0; + + for (n=1;n> (7 - cnt))) + return 0; + + res=(res<<6)|(utf8[n]&0x3F); + } + + if (codepoint) + *codepoint=res; + + return cnt; +} + +// returns the number of utf-8 bytes required to store a given codepoint +static size_t ucs4_to_utf8_count(uint32_t codepoint) +{ + if (codepoint < 0x80) + return 1; + else if (codepoint < 0x800) + return 2; + else if (codepoint < 0x10000) + return 3; + else if (codepoint < 0x200000) + return 4; + else if (codepoint < 0x4000000) + return 5; + else if (codepoint <= 0x7FFFFFFF) + return 6; + else + return 0; +} + +static size_t ucs4_to_utf8_character(char *target, uint32_t codepoint, size_t max) +{ + size_t count = ucs4_to_utf8_count(codepoint); + + if (!count) + return 0; + + if (count>max) return 0; + + if (target == 0) + return count; + + switch (count) + { + case 6: + target[5] = 0x80 | (codepoint & 0x3F); + codepoint = codepoint >> 6; + codepoint |= 0x4000000; + case 5: + target[4] = 0x80 | (codepoint & 0x3F); + codepoint = codepoint >> 6; + codepoint |= 0x200000; + case 4: + target[3] = 0x80 | (codepoint & 0x3F); + codepoint = codepoint >> 6; + codepoint |= 0x10000; + case 3: + target[2] = 0x80 | (codepoint & 0x3F); + codepoint = codepoint >> 6; + codepoint |= 0x800; + case 2: + target[1] = 0x80 | (codepoint & 0x3F); + codepoint = codepoint >> 6; + codepoint |= 0xC0; + case 1: + target[0] = codepoint; + } + + return count; +} + +static size_t ucs4_to_utf16LE_character(bytewriter_t byte_writer, uint32_t codepoint) +{ + if (codepoint >= 0x110000) + return 0; + + if (codepoint >= 0x10000) + { + if (bytewriter_size(byte_writer) < 4) + return 0; + + bytewriter_write_u16_le(byte_writer, ((codepoint - 0x10000) >> 10) + 0xD800); // high surrogate + bytewriter_write_u16_le(byte_writer, ((codepoint - 0x10000) & 0x3FF) + 0xDC00); // low surrogate + return 2; + } + else + { + bytewriter_write_u16_le(byte_writer, codepoint); + return 1; + } +} + +static size_t ucs4_to_utf16BE_character(bytewriter_t byte_writer, uint32_t codepoint) +{ + if (codepoint >= 0x110000) + return 0; + + if (codepoint >= 0x10000) + { + if (bytewriter_size(byte_writer) < 4) + return 0; + + bytewriter_write_u16_be(byte_writer, ((codepoint - 0x10000) >> 10) + 0xD800); // high surrogate + bytewriter_write_u16_be(byte_writer, ((codepoint - 0x10000) & 0x3FF) + 0xDC00); // low surrogate + return 2; + } + else + { + bytewriter_write_u16_be(byte_writer, codepoint); + return 1; + } +} + +size_t utf16LE_to_utf8(const uint16_t *src, size_t source_len, char *dst, size_t out_len) +{ + uint32_t codepoint; + size_t position=0; + size_t characters_processed; + bytereader_s byte_reader; + bytereader_init(&byte_reader, src, source_len*2); + + if (!dst) // they just want the size + { + while (bytereader_size(&byte_reader)) + { + if (utf16LE_to_ucs4_character(&byte_reader, &codepoint) != NErr_Success) + break; + + characters_processed = ucs4_to_utf8_count(codepoint); + if (!characters_processed) + break; + + position+=characters_processed; + } + return position; + } + + while(bytereader_size(&byte_reader) && position 3) + { + codepoint = bytereader_read_u32_le(&byte_reader); + + characters_processed = ucs4_to_utf8_count(codepoint); + if (!characters_processed) + break; + + position+=characters_processed; + } + return position; + } + + while(bytereader_size(&byte_reader) > 3 && position