1 // SPDX-License-Identifier: GPL-2.0+
3 * charset conversion utils
5 * Copyright (c) 2017 Rob Clark
9 #include <capitalization.h>
11 #include <efi_loader.h>
16 * codepage_437 - Unicode to codepage 437 translation table
18 const u16 codepage_437[160] = CP437; /* 160 (0xA0) entries, the number of entries utf_to_cp() scans */
20 static struct capitalization_table capitalization_table[] =
21 #ifdef CONFIG_EFI_UNICODE_CAPITALIZATION
22 UNICODE_CAPITALIZATION_TABLE;
23 #elif CONFIG_FAT_DEFAULT_CODEPAGE == 1250
24 CP1250_CAPITALIZATION_TABLE;
26 CP437_CAPITALIZATION_TABLE;
30 * get_code() - read Unicode code point from UTF-8 stream
32 * @read_u8: - stream reader
33 * @data: - string buffer passed to stream reader, optional
34 * Return: - Unicode code point, or -1
36 static int get_code(u8 (*read_u8)(void *data), void *data)
43 if (ch >= 0xc2 && ch <= 0xf4) {
52 if (ch < 0x80 || ch > 0xbf)
60 if ((code >= 0xD800 && code <= 0xDFFF) ||
64 if (ch < 0x80 || ch > 0xbf)
67 /* 0xc0 - 0xdf or continuation byte (0x80 - 0xbf) */
71 if (ch < 0x80 || ch > 0xbf)
75 } else if (ch >= 0x80) {
84 * read_string() - read byte from character string
86 * @data: - pointer to string
89 * The string pointer is incremented if it does not point to '\0'.
91 static u8 read_string(void *data)
94 const char **src = (const char **)data;
97 if (!src || !*src || !**src)
105 * read_console() - read byte from console
107 * @data: - not used, needed to match interface
108 * Return: - byte read or 0 on error
110 static u8 read_console(void *data)
120 int console_read_unicode(s32 *code)
126 /* No input available */
130 /* Read Unicode code */
131 c = get_code(read_console, NULL);
139 s32 utf8_get(const char **src)
141 return get_code(read_string, src);
144 int utf8_put(s32 code, char **dst)
148 if ((code >= 0xD800 && code <= 0xDFFF) || code >= 0x110000)
150 if (code <= 0x007F) {
153 if (code <= 0x07FF) {
154 **dst = code >> 6 | 0xC0;
156 if (code < 0x10000) {
157 **dst = code >> 12 | 0xE0;
159 **dst = code >> 18 | 0xF0;
161 **dst = (code >> 12 & 0x3F) | 0x80;
164 **dst = (code >> 6 & 0x3F) | 0x80;
167 **dst = (code & 0x3F) | 0x80;
173 size_t utf8_utf16_strnlen(const char *src, size_t count)
177 for (; *src && count; --count) {
178 s32 code = utf8_get(&src);
183 /* Reserve space for a replacement character */
185 } else if (code < 0x10000) {
194 int utf8_utf16_strncpy(u16 **dst, const char *src, size_t count)
196 if (!src || !dst || !*dst)
199 for (; count && *src; --count) {
200 s32 code = utf8_get(&src);
204 utf16_put(code, dst);
210 s32 utf16_get(const u16 **src)
220 if (code >= 0xDC00 && code <= 0xDFFF)
222 if (code >= 0xD800 && code <= 0xDBFF) {
230 if (code2 <= 0xDC00 || code2 >= 0xDFFF)
238 int utf16_put(s32 code, u16 **dst)
242 if ((code >= 0xD800 && code <= 0xDFFF) || code >= 0x110000)
244 if (code < 0x10000) {
248 **dst = code >> 10 | 0xD800;
250 **dst = (code & 0x3ff) | 0xDC00;
256 size_t utf16_strnlen(const u16 *src, size_t count)
260 for (; *src && count; --count) {
261 s32 code = utf16_get(&src);
266 * In case of an illegal sequence still reserve space for a
267 * replacement character.
274 size_t utf16_utf8_strnlen(const u16 *src, size_t count)
278 for (; *src && count; --count) {
279 s32 code = utf16_get(&src);
284 /* Reserve space for a replacement character */
286 else if (code < 0x80)
288 else if (code < 0x800)
290 else if (code < 0x10000)
298 int utf16_utf8_strncpy(char **dst, const u16 *src, size_t count)
300 if (!src || !dst || !*dst)
303 for (; count && *src; --count) {
304 s32 code = utf16_get(&src);
314 s32 utf_to_lower(const s32 code)
316 struct capitalization_table *pos = capitalization_table;
320 if (code >= 'A' && code <= 'Z')
324 for (; pos->upper; ++pos) {
325 if (pos->upper == code) {
333 s32 utf_to_upper(const s32 code)
335 struct capitalization_table *pos = capitalization_table;
339 if (code >= 'a' && code <= 'z')
343 for (; pos->lower; ++pos) {
344 if (pos->lower == code) {
353 * u16_strcasecmp() - compare two u16 strings case insensitively
355 * @s1: first string to compare
356 * @s2: second string to compare
358 * Return: 0 if s1 and s2 are the same when compared case insensitively
359 * < 0 if the first different code point in s1 is less than the
360 * corresponding code point in s2
361 * > 0 if the first different code point in s1 is greater than the corresponding code point in s2
363 int u16_strcasecmp(const u16 *s1, const u16 *s2)
369 c1 = utf_to_upper(utf16_get(&s1));
370 c2 = utf_to_upper(utf16_get(&s2));
372 if (ret || !c1 || c1 == -1 || c2 == -1)
379 * u16_strncmp() - compare two u16 strings
381 * @s1: first string to compare
382 * @s2: second string to compare
383 * @n: maximum number of u16 to compare
384 * Return: 0 if the first n u16 are the same in s1 and s2
385 * < 0 if the first different u16 in s1 is less than the
386 * corresponding u16 in s2
387 * > 0 if the first different u16 in s1 is greater than the
388 * corresponding u16 in s2
390 int __efi_runtime u16_strncmp(const u16 *s1, const u16 *s2, size_t n)
394 for (; n; --n, ++s1, ++s2) {
403 size_t __efi_runtime u16_strnlen(const u16 *in, size_t count)
406 for (i = 0; count-- && in[i]; i++);
410 size_t u16_strsize(const void *in)
412 return (u16_strlen(in) + 1) * sizeof(u16);
415 u16 *u16_strcpy(u16 *dest, const u16 *src)
419 for (;; dest++, src++) {
428 u16 *u16_strdup(const void *src)
435 len = u16_strsize(src);
439 memcpy(new, src, len);
444 size_t u16_strlcat(u16 *dest, const u16 *src, size_t count)
446 size_t destlen = u16_strnlen(dest, count);
447 size_t srclen = u16_strlen(src);
448 size_t ret = destlen + srclen;
450 if (destlen >= count)
453 srclen -= (ret - count + 1);
454 memcpy(&dest[destlen], src, 2 * srclen);
455 dest[destlen + srclen] = 0x0000;
460 /* Convert UTF-16 to UTF-8. */
461 uint8_t *utf16_to_utf8(uint8_t *dest, const uint16_t *src, size_t size)
463 uint32_t code_high = 0;
466 uint32_t code = *src++;
469 if (code >= 0xDC00 && code <= 0xDFFF) {
470 /* Surrogate pair. */
471 code = ((code_high - 0xD800) << 10) + (code - 0xDC00) + 0x10000;
473 *dest++ = (code >> 18) | 0xF0;
474 *dest++ = ((code >> 12) & 0x3F) | 0x80;
475 *dest++ = ((code >> 6) & 0x3F) | 0x80;
476 *dest++ = (code & 0x3F) | 0x80;
480 /* *src may be valid. Don't eat it. */
486 if (code <= 0x007F) {
488 } else if (code <= 0x07FF) {
489 *dest++ = (code >> 6) | 0xC0;
490 *dest++ = (code & 0x3F) | 0x80;
491 } else if (code >= 0xD800 && code <= 0xDBFF) {
494 } else if (code >= 0xDC00 && code <= 0xDFFF) {
497 } else if (code < 0x10000) {
498 *dest++ = (code >> 12) | 0xE0;
499 *dest++ = ((code >> 6) & 0x3F) | 0x80;
500 *dest++ = (code & 0x3F) | 0x80;
502 *dest++ = (code >> 18) | 0xF0;
503 *dest++ = ((code >> 12) & 0x3F) | 0x80;
504 *dest++ = ((code >> 6) & 0x3F) | 0x80;
505 *dest++ = (code & 0x3F) | 0x80;
513 int utf_to_cp(s32 *c, const u16 *codepage)
518 /* Look up codepage translation */
519 for (j = 0; j < 0xA0; ++j) {
520 if (*c == codepage[j]) {
534 int utf8_to_cp437_stream(u8 c, char *buffer)
543 end = buffer + strlen(buffer);
549 ret = utf_to_cp(&s, codepage_437);
558 int utf8_to_utf32_stream(u8 c, char *buffer)
566 end = buffer + strlen(buffer);
577 * Appending the byte led to an invalid UTF-8 byte sequence.
578 * Consider it as the start of a new code sequence.