Created
July 27, 2010 21:09
-
-
Save mietek/492864 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdbool.h> | |
#include <stdint.h> | |
#include "convert_utf8.h" | |
// A byte that encodes a character on its own has the form 0xxxxxxx:
// the control mask selects bit 7, which must be clear.
#define UTF8_SINGLE_BYTE_CONTROL_MASK 0x80
#define UTF8_SINGLE_BYTE_CONTROL_VALUE 0x00

// Tells whether `byte` stands alone in UTF8, i.e. lies in 0x00..0x7F.
static inline bool is_utf8_single_byte(uint8_t byte) {
    const uint8_t control_bits = byte & UTF8_SINGLE_BYTE_CONTROL_MASK;
    return control_bits == UTF8_SINGLE_BYTE_CONTROL_VALUE;
}
// A sequence (continuation) byte has the form 10xxxxxx:
// the top two bits are control bits, the low six bits carry data.
#define UTF8_SEQUENCE_BYTE_CONTROL_MASK 0xC0
#define UTF8_SEQUENCE_BYTE_CONTROL_VALUE 0x80
#define UTF8_SEQUENCE_BYTE_DATA_MASK 0x3F
// Number of data bits contributed by each sequence byte.
#define UTF8_SEQUENCE_WORD_DATA_SHIFT 6

// Tells whether `byte` is a non-leading part of a UTF8 sequence (0x80..0xBF).
static inline bool is_utf8_sequence_byte(uint8_t byte) {
    const uint8_t control_bits = byte & UTF8_SEQUENCE_BYTE_CONTROL_MASK;
    return control_bits == UTF8_SEQUENCE_BYTE_CONTROL_VALUE;
}

// Extracts the six data bits of a UTF8 sequence byte.
static inline uint32_t decode_utf8_sequence_byte(uint8_t byte) {
    const uint32_t data_bits = byte & UTF8_SEQUENCE_BYTE_DATA_MASK;
    return data_bits;
}

// Builds a UTF8 sequence byte from the low six bits of a UTF32 word;
// all higher bits of `word` are ignored.
static inline uint8_t encode_utf8_sequence_byte(uint32_t word) {
    const uint32_t data_bits = word & UTF8_SEQUENCE_BYTE_DATA_MASK;
    return (uint8_t)(UTF8_SEQUENCE_BYTE_CONTROL_VALUE | data_bits);
}
// The start byte of a two-byte sequence has the form 110xxxxx:
// three control bits followed by five data bits.
#define UTF8_TWO_BYTE_SEQUENCE_START_BYTE_CONTROL_MASK 0xE0
#define UTF8_TWO_BYTE_SEQUENCE_START_BYTE_CONTROL_VALUE 0xC0
#define UTF8_TWO_BYTE_SEQUENCE_START_BYTE_DATA_MASK 0x1F
// Bits 7..10 of a decoded word; if all are zero the word is below 0x80
// and the two-byte form is overlong.
#define UTF8_TWO_BYTE_SEQUENCE_WORD_NON_ZERO_MASK 0x00000780

// Tells whether `byte` opens a UTF8 two-byte sequence (0xC0..0xDF).
static inline bool is_utf8_two_byte_sequence_start_byte(uint8_t byte) {
    const uint8_t control_bits = byte & UTF8_TWO_BYTE_SEQUENCE_START_BYTE_CONTROL_MASK;
    return control_bits == UTF8_TWO_BYTE_SEQUENCE_START_BYTE_CONTROL_VALUE;
}

// Extracts the five data bits of a UTF8 two-byte sequence start byte.
static inline uint32_t decode_utf8_two_byte_sequence_start_byte(uint8_t byte) {
    const uint32_t data_bits = byte & UTF8_TWO_BYTE_SEQUENCE_START_BYTE_DATA_MASK;
    return data_bits;
}

// Tells whether `word`, as decoded from a two-byte sequence, is non-overlong,
// i.e. has at least one of bits 7..10 set.
static inline bool is_non_overlong_utf8_two_byte_sequence_word(uint32_t word) {
    return (word & UTF8_TWO_BYTE_SEQUENCE_WORD_NON_ZERO_MASK) != 0;
}

// Builds a UTF8 two-byte sequence start byte from the low five bits of `word`.
static inline uint8_t encode_utf8_two_byte_sequence_start_byte(uint32_t word) {
    const uint32_t data_bits = word & UTF8_TWO_BYTE_SEQUENCE_START_BYTE_DATA_MASK;
    return (uint8_t)(UTF8_TWO_BYTE_SEQUENCE_START_BYTE_CONTROL_VALUE | data_bits);
}
// The start byte of a three-byte sequence has the form 1110xxxx:
// four control bits followed by four data bits.
#define UTF8_THREE_BYTE_SEQUENCE_START_BYTE_CONTROL_MASK 0xF0
#define UTF8_THREE_BYTE_SEQUENCE_START_BYTE_CONTROL_VALUE 0xE0
#define UTF8_THREE_BYTE_SEQUENCE_START_BYTE_DATA_MASK 0x0F
// Bits 11..15 of a decoded word; if all are zero the word is below 0x800
// and the three-byte form is overlong.
#define UTF8_THREE_BYTE_SEQUENCE_WORD_NON_ZERO_MASK 0x0000F800

// Tells whether `byte` opens a UTF8 three-byte sequence (0xE0..0xEF).
static inline bool is_utf8_three_byte_sequence_start_byte(uint8_t byte) {
    const uint8_t control_bits = byte & UTF8_THREE_BYTE_SEQUENCE_START_BYTE_CONTROL_MASK;
    return control_bits == UTF8_THREE_BYTE_SEQUENCE_START_BYTE_CONTROL_VALUE;
}

// Extracts the four data bits of a UTF8 three-byte sequence start byte.
static inline uint32_t decode_utf8_three_byte_sequence_start_byte(uint8_t byte) {
    const uint32_t data_bits = byte & UTF8_THREE_BYTE_SEQUENCE_START_BYTE_DATA_MASK;
    return data_bits;
}

// Tells whether `word`, as decoded from a three-byte sequence, is non-overlong,
// i.e. has at least one of bits 11..15 set.
static inline bool is_non_overlong_utf8_three_byte_sequence_word(uint32_t word) {
    return (word & UTF8_THREE_BYTE_SEQUENCE_WORD_NON_ZERO_MASK) != 0;
}

// Builds a UTF8 three-byte sequence start byte from the low four bits of `word`.
static inline uint8_t encode_utf8_three_byte_sequence_start_byte(uint32_t word) {
    const uint32_t data_bits = word & UTF8_THREE_BYTE_SEQUENCE_START_BYTE_DATA_MASK;
    return (uint8_t)(UTF8_THREE_BYTE_SEQUENCE_START_BYTE_CONTROL_VALUE | data_bits);
}
// The start byte of a four-byte sequence has the form 11110xxx:
// five control bits followed by three data bits.
#define UTF8_FOUR_BYTE_SEQUENCE_START_BYTE_CONTROL_MASK 0xF8
#define UTF8_FOUR_BYTE_SEQUENCE_START_BYTE_CONTROL_VALUE 0xF0
#define UTF8_FOUR_BYTE_SEQUENCE_START_BYTE_DATA_MASK 0x07
// Bits 16..20 of a decoded word; if all are zero the word is below 0x10000
// and the four-byte form is overlong.
#define UTF8_FOUR_BYTE_SEQUENCE_WORD_NON_ZERO_MASK 0x001F0000

// Tells whether `byte` opens a UTF8 four-byte sequence (0xF0..0xF7).
static inline bool is_utf8_four_byte_sequence_start_byte(uint8_t byte) {
    const uint8_t control_bits = byte & UTF8_FOUR_BYTE_SEQUENCE_START_BYTE_CONTROL_MASK;
    return control_bits == UTF8_FOUR_BYTE_SEQUENCE_START_BYTE_CONTROL_VALUE;
}

// Extracts the three data bits of a UTF8 four-byte sequence start byte.
static inline uint32_t decode_utf8_four_byte_sequence_start_byte(uint8_t byte) {
    const uint32_t data_bits = byte & UTF8_FOUR_BYTE_SEQUENCE_START_BYTE_DATA_MASK;
    return data_bits;
}

// Tells whether `word`, as decoded from a four-byte sequence, is non-overlong,
// i.e. has at least one of bits 16..20 set.
static inline bool is_non_overlong_utf8_four_byte_sequence_word(uint32_t word) {
    return (word & UTF8_FOUR_BYTE_SEQUENCE_WORD_NON_ZERO_MASK) != 0;
}

// Builds a UTF8 four-byte sequence start byte from the low three bits of `word`.
static inline uint8_t encode_utf8_four_byte_sequence_start_byte(uint32_t word) {
    const uint32_t data_bits = word & UTF8_FOUR_BYTE_SEQUENCE_START_BYTE_DATA_MASK;
    return (uint8_t)(UTF8_FOUR_BYTE_SEQUENCE_START_BYTE_CONTROL_VALUE | data_bits);
}
// Decodes a UTF32 word from a UTF8 two-byte sequence, or '?' if unsuccessful. | |
// Returns true if successful. | |
inline static bool decode_utf8_two_byte_sequence(uint32_t *dst, const uint8_t *src) { | |
uint32_t word; | |
if (is_utf8_sequence_byte(src[1])) { | |
word = decode_utf8_two_byte_sequence_start_byte(src[0]); | |
word <<= UTF8_SEQUENCE_WORD_DATA_SHIFT; | |
word |= decode_utf8_sequence_byte(src[1]); | |
if (is_non_overlong_utf8_two_byte_sequence_word(word)) { | |
*dst = word; | |
return true; | |
} | |
} | |
*dst = '?'; | |
return false; | |
} | |
// Decodes a UTF32 word from a UTF8 three-byte sequence, or '?' if unsuccessful. | |
// Returns true if successful. | |
inline static bool decode_utf8_three_byte_sequence(uint32_t *dst, const uint8_t *src) { | |
uint32_t word; | |
if (is_utf8_sequence_byte(src[1]) && | |
is_utf8_sequence_byte(src[2])) { | |
word = decode_utf8_three_byte_sequence_start_byte(src[0]); | |
word <<= UTF8_SEQUENCE_WORD_DATA_SHIFT; | |
word |= decode_utf8_sequence_byte(src[1]); | |
word <<= UTF8_SEQUENCE_WORD_DATA_SHIFT; | |
word |= decode_utf8_sequence_byte(src[2]); | |
if (is_non_overlong_utf8_three_byte_sequence_word(word)) { | |
*dst = word; | |
return true; | |
} | |
} | |
*dst = '?'; | |
return false; | |
} | |
// Decodes a UTF32 word from a UTF8 four-byte sequence, or '?' if unsuccessful. | |
// Returns true if successful. | |
inline static bool decode_utf8_four_byte_sequence(uint32_t *dst, const uint8_t *src) { | |
uint32_t word; | |
if (is_utf8_sequence_byte(src[1]) && | |
is_utf8_sequence_byte(src[2]) && | |
is_utf8_sequence_byte(src[3])) { | |
word = decode_utf8_four_byte_sequence_start_byte(src[0]); | |
word <<= UTF8_SEQUENCE_WORD_DATA_SHIFT; | |
word |= decode_utf8_sequence_byte(src[1]); | |
word <<= UTF8_SEQUENCE_WORD_DATA_SHIFT; | |
word |= decode_utf8_sequence_byte(src[2]); | |
word <<= UTF8_SEQUENCE_WORD_DATA_SHIFT; | |
word |= decode_utf8_sequence_byte(src[3]); | |
if (is_non_overlong_utf8_four_byte_sequence_word(word)) { | |
*dst = word; | |
return true; | |
} | |
} | |
*dst = '?'; | |
return false; | |
} | |
// Converts text from UTF8 to UTF32, outputting at most `dst_size - 1` words, 0-terminated.
// Returns the number of words actually output (the terminator is not counted).
// Converts malformed bytes to '?', and leaves truncated sequences unprocessed:
// when a start byte is found but `src` ends before the sequence is complete,
// conversion stops and those bytes are reported through `src_unused`.
// A start byte whose sequence fails to decode yields a single '?' and only that
// byte is consumed; the bytes after it are then examined on their own.
// If `dst_size` is 0 nothing is written to `dst` (not even the terminator),
// 0 is returned, and all of `src` is reported as unused.
// `src_unused` may be NULL; otherwise it receives the number of bytes of `src`
// that were not consumed.
uint32_t convert_from_utf8_to_utf32(uint32_t *dst, uint32_t dst_size, const uint8_t *src, uint32_t src_size, uint32_t *src_unused) {
    uint32_t i = 0;
    uint32_t j = 0;

    // Without room for the terminator there is no valid index in `dst` to write.
    if (dst_size == 0) {
        if (src_unused) {
            *src_unused = src_size;
        }
        return 0;
    }

    // `dst_size - 1` keeps the last slot free for the terminator.
    for (; i < src_size && j < dst_size - 1; j++) {
        const uint8_t lead = src[i];
        // Computed by subtraction so the truncation checks cannot wrap around.
        const uint32_t available = src_size - i;
        // A malformed byte is replaced on its own, so one byte is the default.
        uint32_t consumed = 1;

        if (is_utf8_single_byte(lead)) {
            dst[j] = lead;
        } else if (is_utf8_two_byte_sequence_start_byte(lead)) {
            if (available < 2) {
                break;
            }
            if (decode_utf8_two_byte_sequence(&dst[j], &src[i])) {
                consumed = 2;
            }
        } else if (is_utf8_three_byte_sequence_start_byte(lead)) {
            if (available < 3) {
                break;
            }
            if (decode_utf8_three_byte_sequence(&dst[j], &src[i])) {
                consumed = 3;
            }
        } else if (is_utf8_four_byte_sequence_start_byte(lead)) {
            if (available < 4) {
                break;
            }
            if (decode_utf8_four_byte_sequence(&dst[j], &src[i])) {
                consumed = 4;
            }
        } else {
            // Stray sequence byte or a byte that cannot start any sequence.
            dst[j] = '?';
        }
        i += consumed;
    }

    if (src_unused) {
        *src_unused = src_size - i;
    }
    dst[j] = 0;
    return j;
}
// Encodes a UTF32 word to a UTF8 two-byte sequence. | |
inline static void encode_utf8_two_byte_sequence(uint8_t *dst, uint32_t word) { | |
dst[1] = encode_utf8_sequence_byte(word); | |
word >>= UTF8_SEQUENCE_WORD_DATA_SHIFT; | |
dst[0] = encode_utf8_two_byte_sequence_start_byte(word); | |
} | |
// Encodes a UTF32 word to a UTF8 three-byte sequence. | |
inline static void encode_utf8_three_byte_sequence(uint8_t *dst, uint32_t word) { | |
dst[2] = encode_utf8_sequence_byte(word); | |
word >>= UTF8_SEQUENCE_WORD_DATA_SHIFT; | |
dst[1] = encode_utf8_sequence_byte(word); | |
word >>= UTF8_SEQUENCE_WORD_DATA_SHIFT; | |
dst[0] = encode_utf8_three_byte_sequence_start_byte(word); | |
} | |
// Encodes a UTF32 word to a UTF8 four-byte sequence. | |
inline static void encode_utf8_four_byte_sequence(uint8_t *dst, uint32_t word) { | |
dst[3] = encode_utf8_sequence_byte(word); | |
word >>= UTF8_SEQUENCE_WORD_DATA_SHIFT; | |
dst[2] = encode_utf8_sequence_byte(word); | |
word >>= UTF8_SEQUENCE_WORD_DATA_SHIFT; | |
dst[1] = encode_utf8_sequence_byte(word); | |
word >>= UTF8_SEQUENCE_WORD_DATA_SHIFT; | |
dst[0] = encode_utf8_four_byte_sequence_start_byte(word); | |
} | |
// Largest UTF32 word that each UTF8 encoding length can represent.
#define MAX_UTF8_SINGLE_BYTE_WORD 0x0000007F
#define MAX_UTF8_TWO_BYTE_SEQUENCE_WORD 0x000007FF
#define MAX_UTF8_THREE_BYTE_SEQUENCE_WORD 0x0000FFFF
#define MAX_UTF8_FOUR_BYTE_SEQUENCE_WORD 0x0010FFFF
// UTF-16 surrogates are not Unicode scalar values and must not be encoded.
#define MIN_UTF32_SURROGATE_WORD 0x0000D800
#define MAX_UTF32_SURROGATE_WORD 0x0000DFFF

// Converts text from UTF32 to UTF8, outputting at most `dst_size - 1` bytes, 0-terminated.
// Returns the number of bytes actually output (the terminator is not counted).
// Converts malformed words (surrogates 0xD800..0xDFFF and anything above
// 0x10FFFF) to '?'.
// Conversion stops before a word whose whole encoding does not fit in the
// remaining space; a sequence is never written partially.
// If `dst_size` is 0 nothing is written to `dst` (not even the terminator),
// 0 is returned, and all of `src` is reported as unused.
// `src_unused` may be NULL; otherwise it receives the number of words of `src`
// that were not consumed.
uint32_t convert_from_utf32_to_utf8(uint8_t *dst, uint32_t dst_size, const uint32_t *src, uint32_t src_size, uint32_t *src_unused) {
    uint32_t i = 0;
    uint32_t j = 0;

    // Without room for the terminator there is no valid index in `dst` to write.
    if (dst_size == 0) {
        if (src_unused) {
            *src_unused = src_size;
        }
        return 0;
    }

    // `dst_size - 1` keeps the last slot free for the terminator.
    for (; i < src_size && j < dst_size - 1; i++) {
        const uint32_t word = src[i];
        // Bytes still free ahead of the terminator slot; at least 1 here.
        const uint32_t room = dst_size - 1 - j;

        if (word <= MAX_UTF8_SINGLE_BYTE_WORD) {
            dst[j] = (uint8_t)word;
            j += 1;
        } else if (word <= MAX_UTF8_TWO_BYTE_SEQUENCE_WORD) {
            if (room < 2) {
                break;
            }
            encode_utf8_two_byte_sequence(&dst[j], word);
            j += 2;
        } else if (word >= MIN_UTF32_SURROGATE_WORD && word <= MAX_UTF32_SURROGATE_WORD) {
            // Checked before the three-byte range, which would otherwise cover it.
            dst[j] = '?';
            j += 1;
        } else if (word <= MAX_UTF8_THREE_BYTE_SEQUENCE_WORD) {
            if (room < 3) {
                break;
            }
            encode_utf8_three_byte_sequence(&dst[j], word);
            j += 3;
        } else if (word <= MAX_UTF8_FOUR_BYTE_SEQUENCE_WORD) {
            if (room < 4) {
                break;
            }
            encode_utf8_four_byte_sequence(&dst[j], word);
            j += 4;
        } else {
            dst[j] = '?';
            j += 1;
        }
    }

    if (src_unused) {
        *src_unused = src_size - i;
    }
    dst[j] = 0;
    return j;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Converts `src_size` bytes of UTF8 text at `src` to UTF32 words stored in `dst`.
// Outputs at most `dst_size - 1` words followed by a 0 terminator, so `dst_size`
// should be at least 1 for the terminator to fit.
// Returns the number of words actually output, not counting the terminator.
// Converts malformed bytes to '?', and leaves truncated sequences unprocessed.
// `src_unused` may be NULL; otherwise it receives the number of trailing bytes
// of `src` that were not consumed.
uint32_t convert_from_utf8_to_utf32(uint32_t *dst, uint32_t dst_size, const uint8_t *src, uint32_t src_size, uint32_t *src_unused); | |
// Converts `src_size` UTF32 words at `src` to UTF8 bytes stored in `dst`.
// Outputs at most `dst_size - 1` bytes followed by a 0 terminator, so `dst_size`
// should be at least 1 for the terminator to fit.
// Returns the number of bytes actually output, not counting the terminator.
// Converts malformed words to '?'; words above 0x10FFFF are treated as malformed.
// Stops before a word whose encoding does not fit in the remaining space.
// `src_unused` may be NULL; otherwise it receives the number of trailing words
// of `src` that were not consumed.
uint32_t convert_from_utf32_to_utf8(uint8_t *dst, uint32_t dst_size, const uint32_t *src, uint32_t src_size, uint32_t *src_unused); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment