]>
Commit | Line | Data |
---|---|---|
f739fcd8 | 1 | // SPDX-License-Identifier: GPL-2.0+ |
78178bb0 RC |
2 | /* |
3 | * charset conversion utils | |
4 | * | |
5 | * Copyright (c) 2017 Rob Clark | |
78178bb0 RC |
6 | */ |
7 | ||
78178bb0 | 8 | #include <charset.h> |
b5130a81 | 9 | #include <capitalization.h> |
70616a1e | 10 | #include <cp437.h> |
6974a4a3 | 11 | #include <efi_loader.h> |
73bb90ca | 12 | #include <errno.h> |
78178bb0 RC |
13 | #include <malloc.h> |
14 | ||
70616a1e HS |
15 | /** |
16 | * codepage_437 - Unicode to codepage 437 translation table | |
17 | */ | |
ac72d17f | 18 | const u16 codepage_437[160] = CP437; |
70616a1e | 19 | |
b5130a81 HS |
20 | static struct capitalization_table capitalization_table[] = |
21 | #ifdef CONFIG_EFI_UNICODE_CAPITALIZATION | |
22 | UNICODE_CAPITALIZATION_TABLE; | |
23 | #elif CONFIG_FAT_DEFAULT_CODEPAGE == 1250 | |
24 | CP1250_CAPITALIZATION_TABLE; | |
25 | #else | |
26 | CP437_CAPITALIZATION_TABLE; | |
27 | #endif | |
28 | ||
35cbb796 HS |
29 | /** |
30 | * get_code() - read Unicode code point from UTF-8 stream | |
31 | * | |
32 | * @read_u8: - stream reader | |
33 | * @src: - string buffer passed to stream reader, optional | |
ddbaff53 | 34 | * Return: - Unicode code point, or -1 |
35cbb796 HS |
35 | */ |
36 | static int get_code(u8 (*read_u8)(void *data), void *data) | |
d8c28232 | 37 | { |
35cbb796 | 38 | s32 ch = 0; |
d8c28232 | 39 | |
35cbb796 HS |
40 | ch = read_u8(data); |
41 | if (!ch) | |
d8c28232 | 42 | return 0; |
35cbb796 HS |
43 | if (ch >= 0xc2 && ch <= 0xf4) { |
44 | int code = 0; | |
45 | ||
46 | if (ch >= 0xe0) { | |
47 | if (ch >= 0xf0) { | |
d8c28232 | 48 | /* 0xf0 - 0xf4 */ |
35cbb796 HS |
49 | ch &= 0x07; |
50 | code = ch << 18; | |
51 | ch = read_u8(data); | |
52 | if (ch < 0x80 || ch > 0xbf) | |
53 | goto error; | |
54 | ch &= 0x3f; | |
d8c28232 HS |
55 | } else { |
56 | /* 0xe0 - 0xef */ | |
35cbb796 | 57 | ch &= 0x0f; |
d8c28232 | 58 | } |
35cbb796 | 59 | code += ch << 12; |
d8c28232 HS |
60 | if ((code >= 0xD800 && code <= 0xDFFF) || |
61 | code >= 0x110000) | |
35cbb796 HS |
62 | goto error; |
63 | ch = read_u8(data); | |
64 | if (ch < 0x80 || ch > 0xbf) | |
65 | goto error; | |
d8c28232 HS |
66 | } |
67 | /* 0xc0 - 0xdf or continuation byte (0x80 - 0xbf) */ | |
35cbb796 HS |
68 | ch &= 0x3f; |
69 | code += ch << 6; | |
70 | ch = read_u8(data); | |
71 | if (ch < 0x80 || ch > 0xbf) | |
72 | goto error; | |
73 | ch &= 0x3f; | |
74 | ch += code; | |
75 | } else if (ch >= 0x80) { | |
76 | goto error; | |
d8c28232 | 77 | } |
35cbb796 HS |
78 | return ch; |
79 | error: | |
ddbaff53 | 80 | return -1; |
35cbb796 HS |
81 | } |
82 | ||
83 | /** | |
84 | * read_string() - read byte from character string | |
85 | * | |
86 | * @data: - pointer to string | |
87 | * Return: - byte read | |
88 | * | |
89 | * The string pointer is incremented if it does not point to '\0'. | |
90 | */ | |
91 | static u8 read_string(void *data) | |
92 | ||
93 | { | |
94 | const char **src = (const char **)data; | |
95 | u8 c; | |
96 | ||
97 | if (!src || !*src || !**src) | |
98 | return 0; | |
99 | c = **src; | |
d8c28232 | 100 | ++*src; |
35cbb796 HS |
101 | return c; |
102 | } | |
103 | ||
104 | /** | |
105 | * read_console() - read byte from console | |
106 | * | |
60d79876 HS |
107 | * @data - not used, needed to match interface |
108 | * Return: - byte read or 0 on error | |
35cbb796 HS |
109 | */ |
110 | static u8 read_console(void *data) | |
111 | { | |
60d79876 HS |
112 | int ch; |
113 | ||
c670aeee | 114 | ch = getchar(); |
60d79876 HS |
115 | if (ch < 0) |
116 | ch = 0; | |
117 | return ch; | |
35cbb796 HS |
118 | } |
119 | ||
120 | int console_read_unicode(s32 *code) | |
121 | { | |
ddbaff53 HS |
122 | for (;;) { |
123 | s32 c; | |
124 | ||
125 | if (!tstc()) { | |
126 | /* No input available */ | |
127 | return 1; | |
128 | } | |
35cbb796 | 129 | |
ddbaff53 HS |
130 | /* Read Unicode code */ |
131 | c = get_code(read_console, NULL); | |
132 | if (c > 0) { | |
133 | *code = c; | |
134 | return 0; | |
135 | } | |
136 | } | |
35cbb796 HS |
137 | } |
138 | ||
139 | s32 utf8_get(const char **src) | |
140 | { | |
141 | return get_code(read_string, src); | |
d8c28232 HS |
142 | } |
143 | ||
/**
 * utf8_put() - write a code point as UTF-8 byte sequence
 *
 * @code:	Unicode code point
 * @dst:	address of the output pointer; on success it is moved past the
 *		bytes written (1 to 4)
 * Return:	0 on success, -1 if @dst is unusable or @code is not a valid
 *		code point (negative, surrogate, or above 0x10ffff)
 *
 * No terminating zero is written. Nothing is written on failure.
 */
int utf8_put(s32 code, char **dst)
{
	char *out;

	if (!dst || !*dst)
		return -1;
	/* Negative values would otherwise be emitted as a raw byte */
	if (code < 0 || (code >= 0xD800 && code <= 0xDFFF) || code >= 0x110000)
		return -1;

	out = *dst;
	if (code <= 0x007F) {
		*out++ = code;
	} else if (code <= 0x07FF) {
		*out++ = code >> 6 | 0xC0;
		*out++ = (code & 0x3F) | 0x80;
	} else if (code < 0x10000) {
		*out++ = code >> 12 | 0xE0;
		*out++ = (code >> 6 & 0x3F) | 0x80;
		*out++ = (code & 0x3F) | 0x80;
	} else {
		*out++ = code >> 18 | 0xF0;
		*out++ = (code >> 12 & 0x3F) | 0x80;
		*out++ = (code >> 6 & 0x3F) | 0x80;
		*out++ = (code & 0x3F) | 0x80;
	}
	*dst = out;
	return 0;
}
172 | ||
173 | size_t utf8_utf16_strnlen(const char *src, size_t count) | |
174 | { | |
175 | size_t len = 0; | |
176 | ||
177 | for (; *src && count; --count) { | |
178 | s32 code = utf8_get(&src); | |
179 | ||
180 | if (!code) | |
181 | break; | |
182 | if (code < 0) { | |
183 | /* Reserve space for a replacement character */ | |
184 | len += 1; | |
185 | } else if (code < 0x10000) { | |
186 | len += 1; | |
187 | } else { | |
188 | len += 2; | |
189 | } | |
190 | } | |
191 | return len; | |
192 | } | |
193 | ||
194 | int utf8_utf16_strncpy(u16 **dst, const char *src, size_t count) | |
195 | { | |
196 | if (!src || !dst || !*dst) | |
197 | return -1; | |
198 | ||
199 | for (; count && *src; --count) { | |
200 | s32 code = utf8_get(&src); | |
201 | ||
202 | if (code < 0) | |
203 | code = '?'; | |
204 | utf16_put(code, dst); | |
205 | } | |
206 | **dst = 0; | |
207 | return 0; | |
208 | } | |
209 | ||
/**
 * utf16_get() - decode the next code point of a UTF-16 string
 *
 * @src:	address of the string pointer; the pointer is moved past the
 *		u16 units that were consumed
 * Return:	code point, 0 at the end of the string (pointer unchanged),
 *		-1 if @src is unusable or the sequence is illegal
 *
 * A sequence is illegal if it starts with a low surrogate or if a high
 * surrogate is not followed by a low surrogate (0xdc00 - 0xdfff, both
 * bounds included).
 */
s32 utf16_get(const u16 **src)
{
	s32 code, code2;

	if (!src || !*src)
		return -1;
	if (!**src)
		return 0;
	code = **src;
	++*src;
	/* A low surrogate must not come first */
	if (code >= 0xDC00 && code <= 0xDFFF)
		return -1;
	if (code >= 0xD800 && code <= 0xDBFF) {
		/* High surrogate: the string must not end here */
		if (!**src)
			return -1;
		code2 = **src;
		++*src;
		if (code2 < 0xDC00 || code2 > 0xDFFF)
			return -1;
		code = 0x10000 + ((code & 0x3ff) << 10) + (code2 & 0x3ff);
	}
	return code;
}
237 | ||
/**
 * utf16_put() - write a code point as UTF-16
 *
 * @code:	Unicode code point
 * @dst:	address of the output pointer; on success it is moved past the
 *		u16 units written (1, or 2 for a surrogate pair)
 * Return:	0 on success, -1 if @dst is unusable or @code is not a valid
 *		code point (negative, surrogate, or above 0x10ffff)
 *
 * No terminating zero is written. Nothing is written on failure.
 */
int utf16_put(s32 code, u16 **dst)
{
	u16 *out;

	if (!dst || !*dst)
		return -1;
	/* Negative values would otherwise be stored truncated to 16 bit */
	if (code < 0 || (code >= 0xD800 && code <= 0xDFFF) || code >= 0x110000)
		return -1;

	out = *dst;
	if (code < 0x10000) {
		*out++ = code;
	} else {
		/* Split the 20 bit offset into high and low surrogate */
		code -= 0x10000;
		*out++ = code >> 10 | 0xD800;
		*out++ = (code & 0x3ff) | 0xDC00;
	}
	*dst = out;
	return 0;
}
255 | ||
256 | size_t utf16_strnlen(const u16 *src, size_t count) | |
257 | { | |
258 | size_t len = 0; | |
259 | ||
260 | for (; *src && count; --count) { | |
261 | s32 code = utf16_get(&src); | |
262 | ||
263 | if (!code) | |
264 | break; | |
265 | /* | |
266 | * In case of an illegal sequence still reserve space for a | |
267 | * replacement character. | |
268 | */ | |
269 | ++len; | |
270 | } | |
271 | return len; | |
272 | } | |
273 | ||
274 | size_t utf16_utf8_strnlen(const u16 *src, size_t count) | |
275 | { | |
276 | size_t len = 0; | |
277 | ||
278 | for (; *src && count; --count) { | |
279 | s32 code = utf16_get(&src); | |
280 | ||
281 | if (!code) | |
282 | break; | |
283 | if (code < 0) | |
284 | /* Reserve space for a replacement character */ | |
285 | len += 1; | |
286 | else if (code < 0x80) | |
287 | len += 1; | |
288 | else if (code < 0x800) | |
289 | len += 2; | |
290 | else if (code < 0x10000) | |
291 | len += 3; | |
292 | else | |
293 | len += 4; | |
294 | } | |
295 | return len; | |
296 | } | |
297 | ||
298 | int utf16_utf8_strncpy(char **dst, const u16 *src, size_t count) | |
299 | { | |
300 | if (!src || !dst || !*dst) | |
301 | return -1; | |
302 | ||
303 | for (; count && *src; --count) { | |
304 | s32 code = utf16_get(&src); | |
305 | ||
306 | if (code < 0) | |
307 | code = '?'; | |
308 | utf8_put(code, dst); | |
309 | } | |
310 | **dst = 0; | |
311 | return 0; | |
312 | } | |
313 | ||
b5130a81 HS |
314 | s32 utf_to_lower(const s32 code) |
315 | { | |
316 | struct capitalization_table *pos = capitalization_table; | |
317 | s32 ret = code; | |
318 | ||
319 | if (code <= 0x7f) { | |
320 | if (code >= 'A' && code <= 'Z') | |
321 | ret += 0x20; | |
322 | return ret; | |
323 | } | |
324 | for (; pos->upper; ++pos) { | |
325 | if (pos->upper == code) { | |
326 | ret = pos->lower; | |
327 | break; | |
328 | } | |
329 | } | |
330 | return ret; | |
331 | } | |
332 | ||
333 | s32 utf_to_upper(const s32 code) | |
334 | { | |
335 | struct capitalization_table *pos = capitalization_table; | |
336 | s32 ret = code; | |
337 | ||
338 | if (code <= 0x7f) { | |
339 | if (code >= 'a' && code <= 'z') | |
340 | ret -= 0x20; | |
341 | return ret; | |
342 | } | |
343 | for (; pos->lower; ++pos) { | |
344 | if (pos->lower == code) { | |
345 | ret = pos->upper; | |
346 | break; | |
347 | } | |
348 | } | |
349 | return ret; | |
350 | } | |
78178bb0 | 351 | |
7a9b366c HS |
352 | /* |
353 | * u16_strcasecmp() - compare two u16 strings case insensitively | |
354 | * | |
355 | * @s1: first string to compare | |
356 | * @s2: second string to compare | |
357 | * @n: maximum number of u16 to compare | |
358 | * Return: 0 if the first n u16 are the same in s1 and s2 | |
359 | * < 0 if the first different u16 in s1 is less than the | |
360 | * corresponding u16 in s2 | |
361 | * > 0 if the first different u16 in s1 is greater than the | |
362 | */ | |
363 | int u16_strcasecmp(const u16 *s1, const u16 *s2) | |
364 | { | |
365 | int ret = 0; | |
366 | s32 c1, c2; | |
367 | ||
368 | for (;;) { | |
369 | c1 = utf_to_upper(utf16_get(&s1)); | |
370 | c2 = utf_to_upper(utf16_get(&s2)); | |
371 | ret = c1 - c2; | |
372 | if (ret || !c1 || c1 == -1 || c2 == -1) | |
373 | break; | |
374 | } | |
375 | return ret; | |
376 | } | |
377 | ||
f8062c96 AT |
378 | /* |
379 | * u16_strncmp() - compare two u16 string | |
380 | * | |
381 | * @s1: first string to compare | |
382 | * @s2: second string to compare | |
383 | * @n: maximum number of u16 to compare | |
384 | * Return: 0 if the first n u16 are the same in s1 and s2 | |
385 | * < 0 if the first different u16 in s1 is less than the | |
386 | * corresponding u16 in s2 | |
387 | * > 0 if the first different u16 in s1 is greater than the | |
388 | * corresponding u16 in s2 | |
389 | */ | |
00da8d65 | 390 | int __efi_runtime u16_strncmp(const u16 *s1, const u16 *s2, size_t n) |
f8062c96 AT |
391 | { |
392 | int ret = 0; | |
393 | ||
394 | for (; n; --n, ++s1, ++s2) { | |
395 | ret = *s1 - *s2; | |
396 | if (ret || !*s1) | |
397 | break; | |
398 | } | |
399 | ||
400 | return ret; | |
401 | } | |
402 | ||
6974a4a3 | 403 | size_t __efi_runtime u16_strnlen(const u16 *in, size_t count) |
78178bb0 RC |
404 | { |
405 | size_t i; | |
406 | for (i = 0; count-- && in[i]; i++); | |
407 | return i; | |
408 | } | |
409 | ||
4835d35a SG |
410 | size_t u16_strsize(const void *in) |
411 | { | |
412 | return (u16_strlen(in) + 1) * sizeof(u16); | |
413 | } | |
414 | ||
2a3537ae AT |
415 | u16 *u16_strcpy(u16 *dest, const u16 *src) |
416 | { | |
417 | u16 *tmp = dest; | |
418 | ||
419 | for (;; dest++, src++) { | |
420 | *dest = *src; | |
421 | if (!*src) | |
422 | break; | |
423 | } | |
424 | ||
425 | return tmp; | |
426 | } | |
427 | ||
317068b8 | 428 | u16 *u16_strdup(const void *src) |
2a3537ae AT |
429 | { |
430 | u16 *new; | |
317068b8 | 431 | size_t len; |
2a3537ae AT |
432 | |
433 | if (!src) | |
434 | return NULL; | |
967407de | 435 | len = u16_strsize(src); |
317068b8 | 436 | new = malloc(len); |
2a3537ae AT |
437 | if (!new) |
438 | return NULL; | |
317068b8 | 439 | memcpy(new, src, len); |
2a3537ae AT |
440 | |
441 | return new; | |
442 | } | |
443 | ||
eca08ce9 MK |
444 | size_t u16_strlcat(u16 *dest, const u16 *src, size_t count) |
445 | { | |
7c00b80d | 446 | size_t destlen = u16_strnlen(dest, count); |
eca08ce9 | 447 | size_t srclen = u16_strlen(src); |
7c00b80d | 448 | size_t ret = destlen + srclen; |
eca08ce9 MK |
449 | |
450 | if (destlen >= count) | |
451 | return ret; | |
7c00b80d MS |
452 | if (ret >= count) |
453 | srclen -= (ret - count + 1); | |
eca08ce9 MK |
454 | memcpy(&dest[destlen], src, 2 * srclen); |
455 | dest[destlen + srclen] = 0x0000; | |
456 | ||
457 | return ret; | |
458 | } | |
459 | ||
78178bb0 RC |
/*
 * utf16_to_utf8() - convert UTF-16 to UTF-8
 *
 * Converts exactly size units; a zero unit does not end the conversion and
 * no terminating zero is appended. Every unpaired surrogate, including a
 * high surrogate at the very end of the input, is replaced by '?'.
 * No more than three bytes are written per input unit.
 *
 * @dest:	output buffer
 * @src:	UTF-16 input
 * @size:	number of u16 units to convert
 * Return:	pointer to the byte following the last byte written
 */
uint8_t *utf16_to_utf8(uint8_t *dest, const uint16_t *src, size_t size)
{
	uint32_t code_high = 0;

	while (size--) {
		uint32_t code = *src++;

		if (code_high) {
			if (code >= 0xDC00 && code <= 0xDFFF) {
				/* Surrogate pair. */
				code = ((code_high - 0xD800) << 10) + (code - 0xDC00) + 0x10000;

				*dest++ = (code >> 18) | 0xF0;
				*dest++ = ((code >> 12) & 0x3F) | 0x80;
				*dest++ = ((code >> 6) & 0x3F) | 0x80;
				*dest++ = (code & 0x3F) | 0x80;
			} else {
				/* Error: high surrogate without low surrogate */
				*dest++ = '?';
				/*
				 * The current unit may be valid on its own.
				 * Put it back; the count has to be restored
				 * too, else the last input unit is lost.
				 */
				src--;
				size++;
			}

			code_high = 0;
		} else if (code <= 0x007F) {
			*dest++ = code;
		} else if (code <= 0x07FF) {
			*dest++ = (code >> 6) | 0xC0;
			*dest++ = (code & 0x3F) | 0x80;
		} else if (code >= 0xD800 && code <= 0xDBFF) {
			/* Remember the high surrogate, wait for its partner */
			code_high = code;
		} else if (code >= 0xDC00 && code <= 0xDFFF) {
			/* Error: low surrogate without high surrogate */
			*dest++ = '?';
		} else {
			/* Remaining 16 bit values take three bytes */
			*dest++ = (code >> 12) | 0xE0;
			*dest++ = ((code >> 6) & 0x3F) | 0x80;
			*dest++ = (code & 0x3F) | 0x80;
		}
	}

	/* Error: the input ended after a high surrogate */
	if (code_high)
		*dest++ = '?';

	return dest;
}
73bb90ca | 512 | |
73bb90ca HS |
513 | int utf_to_cp(s32 *c, const u16 *codepage) |
514 | { | |
515 | if (*c >= 0x80) { | |
516 | int j; | |
517 | ||
518 | /* Look up codepage translation */ | |
ac72d17f | 519 | for (j = 0; j < 0xA0; ++j) { |
73bb90ca | 520 | if (*c == codepage[j]) { |
ac72d17f JG |
521 | if (j < 0x20) |
522 | *c = j; | |
523 | else | |
524 | *c = j + 0x60; | |
73bb90ca HS |
525 | return 0; |
526 | } | |
527 | } | |
528 | *c = '?'; | |
529 | return -ENOENT; | |
530 | } | |
531 | return 0; | |
532 | } | |
e91789e2 HS |
533 | |
534 | int utf8_to_cp437_stream(u8 c, char *buffer) | |
535 | { | |
536 | char *end; | |
537 | const char *pos; | |
538 | s32 s; | |
539 | int ret; | |
540 | ||
541 | for (;;) { | |
542 | pos = buffer; | |
543 | end = buffer + strlen(buffer); | |
544 | *end++ = c; | |
545 | *end = 0; | |
546 | s = utf8_get(&pos); | |
547 | if (s > 0) { | |
548 | *buffer = 0; | |
549 | ret = utf_to_cp(&s, codepage_437); | |
550 | return s; | |
551 | } | |
552 | if (pos == end) | |
553 | return 0; | |
554 | *buffer = 0; | |
555 | } | |
556 | } | |
557 | ||
558 | int utf8_to_utf32_stream(u8 c, char *buffer) | |
559 | { | |
560 | char *end; | |
561 | const char *pos; | |
562 | s32 s; | |
563 | ||
564 | for (;;) { | |
565 | pos = buffer; | |
566 | end = buffer + strlen(buffer); | |
567 | *end++ = c; | |
568 | *end = 0; | |
569 | s = utf8_get(&pos); | |
570 | if (s > 0) { | |
571 | *buffer = 0; | |
572 | return s; | |
573 | } | |
574 | if (pos == end) | |
575 | return 0; | |
3150da34 HS |
576 | /* |
577 | * Appending the byte lead to an invalid UTF-8 byte sequence. | |
578 | * Consider it as the start of a new code sequence. | |
579 | */ | |
e91789e2 HS |
580 | *buffer = 0; |
581 | } | |
582 | } |