]>
Commit | Line | Data |
---|---|---|
f739fcd8 | 1 | // SPDX-License-Identifier: GPL-2.0+ |
78178bb0 RC |
2 | /* |
3 | * charset conversion utils | |
4 | * | |
5 | * Copyright (c) 2017 Rob Clark | |
78178bb0 RC |
6 | */ |
7 | ||
35cbb796 | 8 | #include <common.h> |
78178bb0 | 9 | #include <charset.h> |
b5130a81 | 10 | #include <capitalization.h> |
70616a1e | 11 | #include <cp437.h> |
6974a4a3 | 12 | #include <efi_loader.h> |
73bb90ca | 13 | #include <errno.h> |
78178bb0 RC |
14 | #include <malloc.h> |
15 | ||
70616a1e HS |
16 | /** |
17 | * codepage_437 - Unicode to codepage 437 translation table | |
18 | */ | |
19 | const u16 codepage_437[128] = CP437; | |
20 | ||
b5130a81 HS |
21 | static struct capitalization_table capitalization_table[] = |
22 | #ifdef CONFIG_EFI_UNICODE_CAPITALIZATION | |
23 | UNICODE_CAPITALIZATION_TABLE; | |
24 | #elif CONFIG_FAT_DEFAULT_CODEPAGE == 1250 | |
25 | CP1250_CAPITALIZATION_TABLE; | |
26 | #else | |
27 | CP437_CAPITALIZATION_TABLE; | |
28 | #endif | |
29 | ||
35cbb796 HS |
30 | /** |
31 | * get_code() - read Unicode code point from UTF-8 stream | |
32 | * | |
33 | * @read_u8: - stream reader | |
34 | * @src: - string buffer passed to stream reader, optional | |
ddbaff53 | 35 | * Return: - Unicode code point, or -1 |
35cbb796 HS |
36 | */ |
37 | static int get_code(u8 (*read_u8)(void *data), void *data) | |
d8c28232 | 38 | { |
35cbb796 | 39 | s32 ch = 0; |
d8c28232 | 40 | |
35cbb796 HS |
41 | ch = read_u8(data); |
42 | if (!ch) | |
d8c28232 | 43 | return 0; |
35cbb796 HS |
44 | if (ch >= 0xc2 && ch <= 0xf4) { |
45 | int code = 0; | |
46 | ||
47 | if (ch >= 0xe0) { | |
48 | if (ch >= 0xf0) { | |
d8c28232 | 49 | /* 0xf0 - 0xf4 */ |
35cbb796 HS |
50 | ch &= 0x07; |
51 | code = ch << 18; | |
52 | ch = read_u8(data); | |
53 | if (ch < 0x80 || ch > 0xbf) | |
54 | goto error; | |
55 | ch &= 0x3f; | |
d8c28232 HS |
56 | } else { |
57 | /* 0xe0 - 0xef */ | |
35cbb796 | 58 | ch &= 0x0f; |
d8c28232 | 59 | } |
35cbb796 | 60 | code += ch << 12; |
d8c28232 HS |
61 | if ((code >= 0xD800 && code <= 0xDFFF) || |
62 | code >= 0x110000) | |
35cbb796 HS |
63 | goto error; |
64 | ch = read_u8(data); | |
65 | if (ch < 0x80 || ch > 0xbf) | |
66 | goto error; | |
d8c28232 HS |
67 | } |
68 | /* 0xc0 - 0xdf or continuation byte (0x80 - 0xbf) */ | |
35cbb796 HS |
69 | ch &= 0x3f; |
70 | code += ch << 6; | |
71 | ch = read_u8(data); | |
72 | if (ch < 0x80 || ch > 0xbf) | |
73 | goto error; | |
74 | ch &= 0x3f; | |
75 | ch += code; | |
76 | } else if (ch >= 0x80) { | |
77 | goto error; | |
d8c28232 | 78 | } |
35cbb796 HS |
79 | return ch; |
80 | error: | |
ddbaff53 | 81 | return -1; |
35cbb796 HS |
82 | } |
83 | ||
84 | /** | |
85 | * read_string() - read byte from character string | |
86 | * | |
87 | * @data: - pointer to string | |
88 | * Return: - byte read | |
89 | * | |
90 | * The string pointer is incremented if it does not point to '\0'. | |
91 | */ | |
92 | static u8 read_string(void *data) | |
93 | ||
94 | { | |
95 | const char **src = (const char **)data; | |
96 | u8 c; | |
97 | ||
98 | if (!src || !*src || !**src) | |
99 | return 0; | |
100 | c = **src; | |
d8c28232 | 101 | ++*src; |
35cbb796 HS |
102 | return c; |
103 | } | |
104 | ||
105 | /** | |
106 | * read_console() - read byte from console | |
107 | * | |
60d79876 HS |
108 | * @data - not used, needed to match interface |
109 | * Return: - byte read or 0 on error | |
35cbb796 HS |
110 | */ |
111 | static u8 read_console(void *data) | |
112 | { | |
60d79876 HS |
113 | int ch; |
114 | ||
c670aeee | 115 | ch = getchar(); |
60d79876 HS |
116 | if (ch < 0) |
117 | ch = 0; | |
118 | return ch; | |
35cbb796 HS |
119 | } |
120 | ||
121 | int console_read_unicode(s32 *code) | |
122 | { | |
ddbaff53 HS |
123 | for (;;) { |
124 | s32 c; | |
125 | ||
126 | if (!tstc()) { | |
127 | /* No input available */ | |
128 | return 1; | |
129 | } | |
35cbb796 | 130 | |
ddbaff53 HS |
131 | /* Read Unicode code */ |
132 | c = get_code(read_console, NULL); | |
133 | if (c > 0) { | |
134 | *code = c; | |
135 | return 0; | |
136 | } | |
137 | } | |
35cbb796 HS |
138 | } |
139 | ||
140 | s32 utf8_get(const char **src) | |
141 | { | |
142 | return get_code(read_string, src); | |
d8c28232 HS |
143 | } |
144 | ||
145 | int utf8_put(s32 code, char **dst) | |
146 | { | |
147 | if (!dst || !*dst) | |
148 | return -1; | |
149 | if ((code >= 0xD800 && code <= 0xDFFF) || code >= 0x110000) | |
150 | return -1; | |
151 | if (code <= 0x007F) { | |
152 | **dst = code; | |
153 | } else { | |
154 | if (code <= 0x07FF) { | |
155 | **dst = code >> 6 | 0xC0; | |
156 | } else { | |
157 | if (code < 0x10000) { | |
158 | **dst = code >> 12 | 0xE0; | |
159 | } else { | |
160 | **dst = code >> 18 | 0xF0; | |
161 | ++*dst; | |
162 | **dst = (code >> 12 & 0x3F) | 0x80; | |
163 | } | |
164 | ++*dst; | |
165 | **dst = (code >> 6 & 0x3F) | 0x80; | |
166 | } | |
167 | ++*dst; | |
168 | **dst = (code & 0x3F) | 0x80; | |
169 | } | |
170 | ++*dst; | |
171 | return 0; | |
172 | } | |
173 | ||
174 | size_t utf8_utf16_strnlen(const char *src, size_t count) | |
175 | { | |
176 | size_t len = 0; | |
177 | ||
178 | for (; *src && count; --count) { | |
179 | s32 code = utf8_get(&src); | |
180 | ||
181 | if (!code) | |
182 | break; | |
183 | if (code < 0) { | |
184 | /* Reserve space for a replacement character */ | |
185 | len += 1; | |
186 | } else if (code < 0x10000) { | |
187 | len += 1; | |
188 | } else { | |
189 | len += 2; | |
190 | } | |
191 | } | |
192 | return len; | |
193 | } | |
194 | ||
195 | int utf8_utf16_strncpy(u16 **dst, const char *src, size_t count) | |
196 | { | |
197 | if (!src || !dst || !*dst) | |
198 | return -1; | |
199 | ||
200 | for (; count && *src; --count) { | |
201 | s32 code = utf8_get(&src); | |
202 | ||
203 | if (code < 0) | |
204 | code = '?'; | |
205 | utf16_put(code, dst); | |
206 | } | |
207 | **dst = 0; | |
208 | return 0; | |
209 | } | |
210 | ||
211 | s32 utf16_get(const u16 **src) | |
212 | { | |
213 | s32 code, code2; | |
214 | ||
215 | if (!src || !*src) | |
216 | return -1; | |
217 | if (!**src) | |
218 | return 0; | |
219 | code = **src; | |
220 | ++*src; | |
221 | if (code >= 0xDC00 && code <= 0xDFFF) | |
222 | return -1; | |
223 | if (code >= 0xD800 && code <= 0xDBFF) { | |
224 | if (!**src) | |
225 | return -1; | |
226 | code &= 0x3ff; | |
227 | code <<= 10; | |
228 | code += 0x10000; | |
229 | code2 = **src; | |
230 | ++*src; | |
231 | if (code2 <= 0xDC00 || code2 >= 0xDFFF) | |
232 | return -1; | |
233 | code2 &= 0x3ff; | |
234 | code += code2; | |
235 | } | |
236 | return code; | |
237 | } | |
238 | ||
239 | int utf16_put(s32 code, u16 **dst) | |
240 | { | |
241 | if (!dst || !*dst) | |
242 | return -1; | |
243 | if ((code >= 0xD800 && code <= 0xDFFF) || code >= 0x110000) | |
244 | return -1; | |
245 | if (code < 0x10000) { | |
246 | **dst = code; | |
247 | } else { | |
248 | code -= 0x10000; | |
249 | **dst = code >> 10 | 0xD800; | |
250 | ++*dst; | |
251 | **dst = (code & 0x3ff) | 0xDC00; | |
252 | } | |
253 | ++*dst; | |
254 | return 0; | |
255 | } | |
256 | ||
257 | size_t utf16_strnlen(const u16 *src, size_t count) | |
258 | { | |
259 | size_t len = 0; | |
260 | ||
261 | for (; *src && count; --count) { | |
262 | s32 code = utf16_get(&src); | |
263 | ||
264 | if (!code) | |
265 | break; | |
266 | /* | |
267 | * In case of an illegal sequence still reserve space for a | |
268 | * replacement character. | |
269 | */ | |
270 | ++len; | |
271 | } | |
272 | return len; | |
273 | } | |
274 | ||
275 | size_t utf16_utf8_strnlen(const u16 *src, size_t count) | |
276 | { | |
277 | size_t len = 0; | |
278 | ||
279 | for (; *src && count; --count) { | |
280 | s32 code = utf16_get(&src); | |
281 | ||
282 | if (!code) | |
283 | break; | |
284 | if (code < 0) | |
285 | /* Reserve space for a replacement character */ | |
286 | len += 1; | |
287 | else if (code < 0x80) | |
288 | len += 1; | |
289 | else if (code < 0x800) | |
290 | len += 2; | |
291 | else if (code < 0x10000) | |
292 | len += 3; | |
293 | else | |
294 | len += 4; | |
295 | } | |
296 | return len; | |
297 | } | |
298 | ||
299 | int utf16_utf8_strncpy(char **dst, const u16 *src, size_t count) | |
300 | { | |
301 | if (!src || !dst || !*dst) | |
302 | return -1; | |
303 | ||
304 | for (; count && *src; --count) { | |
305 | s32 code = utf16_get(&src); | |
306 | ||
307 | if (code < 0) | |
308 | code = '?'; | |
309 | utf8_put(code, dst); | |
310 | } | |
311 | **dst = 0; | |
312 | return 0; | |
313 | } | |
314 | ||
b5130a81 HS |
315 | s32 utf_to_lower(const s32 code) |
316 | { | |
317 | struct capitalization_table *pos = capitalization_table; | |
318 | s32 ret = code; | |
319 | ||
320 | if (code <= 0x7f) { | |
321 | if (code >= 'A' && code <= 'Z') | |
322 | ret += 0x20; | |
323 | return ret; | |
324 | } | |
325 | for (; pos->upper; ++pos) { | |
326 | if (pos->upper == code) { | |
327 | ret = pos->lower; | |
328 | break; | |
329 | } | |
330 | } | |
331 | return ret; | |
332 | } | |
333 | ||
334 | s32 utf_to_upper(const s32 code) | |
335 | { | |
336 | struct capitalization_table *pos = capitalization_table; | |
337 | s32 ret = code; | |
338 | ||
339 | if (code <= 0x7f) { | |
340 | if (code >= 'a' && code <= 'z') | |
341 | ret -= 0x20; | |
342 | return ret; | |
343 | } | |
344 | for (; pos->lower; ++pos) { | |
345 | if (pos->lower == code) { | |
346 | ret = pos->upper; | |
347 | break; | |
348 | } | |
349 | } | |
350 | return ret; | |
351 | } | |
78178bb0 | 352 | |
f8062c96 AT |
353 | /* |
354 | * u16_strncmp() - compare two u16 string | |
355 | * | |
356 | * @s1: first string to compare | |
357 | * @s2: second string to compare | |
358 | * @n: maximum number of u16 to compare | |
359 | * Return: 0 if the first n u16 are the same in s1 and s2 | |
360 | * < 0 if the first different u16 in s1 is less than the | |
361 | * corresponding u16 in s2 | |
362 | * > 0 if the first different u16 in s1 is greater than the | |
363 | * corresponding u16 in s2 | |
364 | */ | |
365 | int u16_strncmp(const u16 *s1, const u16 *s2, size_t n) | |
366 | { | |
367 | int ret = 0; | |
368 | ||
369 | for (; n; --n, ++s1, ++s2) { | |
370 | ret = *s1 - *s2; | |
371 | if (ret || !*s1) | |
372 | break; | |
373 | } | |
374 | ||
375 | return ret; | |
376 | } | |
377 | ||
/**
 * u16_strlen() - count the u16 units of a zero terminated string
 *
 * The string is read byte by byte, so @in is never dereferenced as u16.
 *
 * @in:		string of 16 bit units ending with a unit whose bytes are
 *		both zero
 * Return:	number of units before the terminator
 */
size_t u16_strlen(const void *in)
{
	const char *bytes = in;
	size_t count = 0;

	/* A unit terminates the string only if both of its bytes are zero */
	while (bytes[2 * count] || bytes[2 * count + 1])
		++count;
	return count;
}
389 | ||
6974a4a3 | 390 | size_t __efi_runtime u16_strnlen(const u16 *in, size_t count) |
78178bb0 RC |
391 | { |
392 | size_t i; | |
393 | for (i = 0; count-- && in[i]; i++); | |
394 | return i; | |
395 | } | |
396 | ||
4835d35a SG |
397 | size_t u16_strsize(const void *in) |
398 | { | |
399 | return (u16_strlen(in) + 1) * sizeof(u16); | |
400 | } | |
401 | ||
2a3537ae AT |
402 | u16 *u16_strcpy(u16 *dest, const u16 *src) |
403 | { | |
404 | u16 *tmp = dest; | |
405 | ||
406 | for (;; dest++, src++) { | |
407 | *dest = *src; | |
408 | if (!*src) | |
409 | break; | |
410 | } | |
411 | ||
412 | return tmp; | |
413 | } | |
414 | ||
317068b8 | 415 | u16 *u16_strdup(const void *src) |
2a3537ae AT |
416 | { |
417 | u16 *new; | |
317068b8 | 418 | size_t len; |
2a3537ae AT |
419 | |
420 | if (!src) | |
421 | return NULL; | |
317068b8 HS |
422 | len = (u16_strlen(src) + 1) * sizeof(u16); |
423 | new = malloc(len); | |
2a3537ae AT |
424 | if (!new) |
425 | return NULL; | |
317068b8 | 426 | memcpy(new, src, len); |
2a3537ae AT |
427 | |
428 | return new; | |
429 | } | |
430 | ||
/**
 * utf16_to_utf8() - convert UTF-16 to UTF-8
 *
 * Exactly @size units are read from @src; a zero unit does not stop the
 * conversion. An unpaired surrogate is replaced by '?', except for a high
 * surrogate in the very last position, which produces no output. No
 * terminating NUL is written.
 *
 * @dest:	output buffer; each input unit yields at most three bytes
 * @src:	UTF-16 input
 * @size:	number of u16 units to convert
 * Return:	pointer to the byte following the last byte written
 */
uint8_t *utf16_to_utf8(uint8_t *dest, const uint16_t *src, size_t size)
{
	uint32_t code_high = 0;

	while (size--) {
		uint32_t code = *src++;

		if (code_high) {
			if (code >= 0xDC00 && code <= 0xDFFF) {
				/* Surrogate pair. */
				code = ((code_high - 0xD800) << 10) +
				       (code - 0xDC00) + 0x10000;

				*dest++ = (code >> 18) | 0xF0;
				*dest++ = ((code >> 12) & 0x3F) | 0x80;
				*dest++ = ((code >> 6) & 0x3F) | 0x80;
				*dest++ = (code & 0x3F) | 0x80;
			} else {
				/* Error: unpaired high surrogate */
				*dest++ = '?';
				/*
				 * The current unit may be valid on its own.
				 * Step back and restore the count so that it
				 * is converted in the next iteration instead
				 * of shortening the input by one unit.
				 */
				src--;
				size++;
			}

			code_high = 0;
		} else {
			if (code <= 0x007F) {
				*dest++ = code;
			} else if (code <= 0x07FF) {
				*dest++ = (code >> 6) | 0xC0;
				*dest++ = (code & 0x3F) | 0x80;
			} else if (code >= 0xD800 && code <= 0xDBFF) {
				/* Remember it and wait for the low surrogate */
				code_high = code;
				continue;
			} else if (code >= 0xDC00 && code <= 0xDFFF) {
				/* Error: low surrogate without high one */
				*dest++ = '?';
			} else {
				/* Remaining 16 bit values need three bytes */
				*dest++ = (code >> 12) | 0xE0;
				*dest++ = ((code >> 6) & 0x3F) | 0x80;
				*dest++ = (code & 0x3F) | 0x80;
			}
		}
	}

	return dest;
}
73bb90ca | 483 | |
73bb90ca HS |
484 | int utf_to_cp(s32 *c, const u16 *codepage) |
485 | { | |
486 | if (*c >= 0x80) { | |
487 | int j; | |
488 | ||
489 | /* Look up codepage translation */ | |
490 | for (j = 0; j < 0x80; ++j) { | |
491 | if (*c == codepage[j]) { | |
492 | *c = j + 0x80; | |
493 | return 0; | |
494 | } | |
495 | } | |
496 | *c = '?'; | |
497 | return -ENOENT; | |
498 | } | |
499 | return 0; | |
500 | } | |
e91789e2 HS |
501 | |
502 | int utf8_to_cp437_stream(u8 c, char *buffer) | |
503 | { | |
504 | char *end; | |
505 | const char *pos; | |
506 | s32 s; | |
507 | int ret; | |
508 | ||
509 | for (;;) { | |
510 | pos = buffer; | |
511 | end = buffer + strlen(buffer); | |
512 | *end++ = c; | |
513 | *end = 0; | |
514 | s = utf8_get(&pos); | |
515 | if (s > 0) { | |
516 | *buffer = 0; | |
517 | ret = utf_to_cp(&s, codepage_437); | |
518 | return s; | |
519 | } | |
520 | if (pos == end) | |
521 | return 0; | |
522 | *buffer = 0; | |
523 | } | |
524 | } | |
525 | ||
526 | int utf8_to_utf32_stream(u8 c, char *buffer) | |
527 | { | |
528 | char *end; | |
529 | const char *pos; | |
530 | s32 s; | |
531 | ||
532 | for (;;) { | |
533 | pos = buffer; | |
534 | end = buffer + strlen(buffer); | |
535 | *end++ = c; | |
536 | *end = 0; | |
537 | s = utf8_get(&pos); | |
538 | if (s > 0) { | |
539 | *buffer = 0; | |
540 | return s; | |
541 | } | |
542 | if (pos == end) | |
543 | return 0; | |
544 | *buffer = 0; | |
545 | } | |
546 | } |