1 // SPDX-License-Identifier: GPL-2.0+
3 * charset conversion utils
5 * Copyright (c) 2017 Rob Clark
11 s32 utf8_get(const char **src)
26 * We do not expect a continuation byte (0x80 - 0xbf).
27 * 0x80 is coded as 0xc2 0x80, so we cannot have less than 0xc2
29 * The highest code point is 0x10ffff which is coded as
30 * 0xf4 0x8f 0xbf 0xbf. So we cannot have a byte above 0xf4.
32 if (c < 0xc2 || code > 0xf4)
43 if (c < 0x80 || c > 0xbf)
51 if ((code >= 0xD800 && code <= 0xDFFF) ||
58 if (c < 0x80 || c > 0xbf)
61 /* 0xc0 - 0xdf or continuation byte (0x80 - 0xbf) */
65 if (c < 0x80 || c > 0xbf)
74 int utf8_put(s32 code, char **dst)
78 if ((code >= 0xD800 && code <= 0xDFFF) || code >= 0x110000)
84 **dst = code >> 6 | 0xC0;
87 **dst = code >> 12 | 0xE0;
89 **dst = code >> 18 | 0xF0;
91 **dst = (code >> 12 & 0x3F) | 0x80;
94 **dst = (code >> 6 & 0x3F) | 0x80;
97 **dst = (code & 0x3F) | 0x80;
103 size_t utf8_utf16_strnlen(const char *src, size_t count)
107 for (; *src && count; --count) {
108 s32 code = utf8_get(&src);
113 /* Reserve space for a replacement character */
115 } else if (code < 0x10000) {
124 int utf8_utf16_strncpy(u16 **dst, const char *src, size_t count)
126 if (!src || !dst || !*dst)
129 for (; count && *src; --count) {
130 s32 code = utf8_get(&src);
134 utf16_put(code, dst);
140 s32 utf16_get(const u16 **src)
150 if (code >= 0xDC00 && code <= 0xDFFF)
152 if (code >= 0xD800 && code <= 0xDBFF) {
160 if (code2 <= 0xDC00 || code2 >= 0xDFFF)
168 int utf16_put(s32 code, u16 **dst)
172 if ((code >= 0xD800 && code <= 0xDFFF) || code >= 0x110000)
174 if (code < 0x10000) {
178 **dst = code >> 10 | 0xD800;
180 **dst = (code & 0x3ff) | 0xDC00;
186 size_t utf16_strnlen(const u16 *src, size_t count)
190 for (; *src && count; --count) {
191 s32 code = utf16_get(&src);
196 * In case of an illegal sequence still reserve space for a
197 * replacement character.
204 size_t utf16_utf8_strnlen(const u16 *src, size_t count)
208 for (; *src && count; --count) {
209 s32 code = utf16_get(&src);
214 /* Reserve space for a replacement character */
216 else if (code < 0x80)
218 else if (code < 0x800)
220 else if (code < 0x10000)
228 int utf16_utf8_strncpy(char **dst, const u16 *src, size_t count)
230 if (!src || !dst || !*dst)
233 for (; count && *src; --count) {
234 s32 code = utf16_get(&src);
245 size_t u16_strlen(const u16 *in)
248 for (i = 0; in[i]; i++);
252 size_t u16_strnlen(const u16 *in, size_t count)
255 for (i = 0; count-- && in[i]; i++);
259 uint16_t *utf16_strcpy(uint16_t *dest, const uint16_t *src)
261 uint16_t *tmp = dest;
263 while ((*dest++ = *src++) != '\0')
269 uint16_t *utf16_strdup(const uint16_t *s)
275 new = malloc((u16_strlen(s) + 1) * 2);
278 utf16_strcpy(new, s);
282 /* Convert UTF-16 to UTF-8. */
283 uint8_t *utf16_to_utf8(uint8_t *dest, const uint16_t *src, size_t size)
285 uint32_t code_high = 0;
288 uint32_t code = *src++;
291 if (code >= 0xDC00 && code <= 0xDFFF) {
292 /* Surrogate pair. */
293 code = ((code_high - 0xD800) << 10) + (code - 0xDC00) + 0x10000;
295 *dest++ = (code >> 18) | 0xF0;
296 *dest++ = ((code >> 12) & 0x3F) | 0x80;
297 *dest++ = ((code >> 6) & 0x3F) | 0x80;
298 *dest++ = (code & 0x3F) | 0x80;
302 /* *src may be valid. Don't eat it. */
308 if (code <= 0x007F) {
310 } else if (code <= 0x07FF) {
311 *dest++ = (code >> 6) | 0xC0;
312 *dest++ = (code & 0x3F) | 0x80;
313 } else if (code >= 0xD800 && code <= 0xDBFF) {
316 } else if (code >= 0xDC00 && code <= 0xDFFF) {
319 } else if (code < 0x10000) {
320 *dest++ = (code >> 12) | 0xE0;
321 *dest++ = ((code >> 6) & 0x3F) | 0x80;
322 *dest++ = (code & 0x3F) | 0x80;
324 *dest++ = (code >> 18) | 0xF0;
325 *dest++ = ((code >> 12) & 0x3F) | 0x80;
326 *dest++ = ((code >> 6) & 0x3F) | 0x80;
327 *dest++ = (code & 0x3F) | 0x80;
335 uint16_t *utf8_to_utf16(uint16_t *dest, const uint8_t *src, size_t size)
344 /* Exit on zero byte */
347 } else if (*src <= 0xbf) {
350 } else if (*src <= 0xdf) {
351 code = *src++ & 0x1f;
353 } else if (*src <= 0xef) {
354 code = *src++ & 0x0f;
356 } else if (*src <= 0xf7) {
357 code = *src++ & 0x07;
364 for (; extension_bytes && size; --size, --extension_bytes) {
365 if ((*src & 0xc0) == 0x80) {
367 code |= *src++ & 0x3f;
377 if (code < 0x10000) {
381 * Simplified expression for
382 * (((code - 0x10000) >> 10) & 0x3ff) | 0xd800
384 *dest++ = (code >> 10) + 0xd7c0;
385 *dest++ = (code & 0x3ff) | 0xdc00;