]>
Commit | Line | Data |
---|---|---|
f739fcd8 | 1 | // SPDX-License-Identifier: GPL-2.0+ |
78178bb0 RC |
2 | /* |
3 | * charset conversion utils | |
4 | * | |
5 | * Copyright (c) 2017 Rob Clark | |
78178bb0 RC |
6 | */ |
7 | ||
35cbb796 | 8 | #include <common.h> |
78178bb0 | 9 | #include <charset.h> |
b5130a81 | 10 | #include <capitalization.h> |
70616a1e | 11 | #include <cp437.h> |
6974a4a3 | 12 | #include <efi_loader.h> |
78178bb0 RC |
13 | #include <malloc.h> |
14 | ||
70616a1e HS |
/**
 * codepage_437 - Unicode to codepage 437 translation table
 *
 * The 128 entries are provided by the CP437 initializer macro
 * (presumably defined in <cp437.h> - TODO confirm).
 */
const u16 codepage_437[128] = CP437;
19 | ||
b5130a81 HS |
/*
 * Pairs of upper and lower case code points searched by utf_to_lower() and
 * utf_to_upper() for code points above 0x7f.
 *
 * The initializer is selected at build time: the Unicode table if
 * CONFIG_EFI_UNICODE_CAPITALIZATION is defined, else the code page 1250 table
 * if CONFIG_FAT_DEFAULT_CODEPAGE equals 1250, else the code page 437 table.
 */
static struct capitalization_table capitalization_table[] =
#ifdef CONFIG_EFI_UNICODE_CAPITALIZATION
	UNICODE_CAPITALIZATION_TABLE;
#elif CONFIG_FAT_DEFAULT_CODEPAGE == 1250
	CP1250_CAPITALIZATION_TABLE;
#else
	CP437_CAPITALIZATION_TABLE;
#endif
28 | ||
35cbb796 HS |
/**
 * get_code() - read Unicode code point from UTF-8 stream
 *
 * Bytes are pulled one at a time from the stream reader. A sequence is
 * rejected if a continuation byte is missing, if the lead byte is not a valid
 * UTF-8 lead byte, if the value exceeds 0x10ffff, if it encodes a surrogate
 * (0xD800 - 0xDFFF), or if it is an overlong encoding (a value that fits into
 * a shorter sequence). Range and overlong checks that need the complete value
 * are done after all bytes of the sequence have been read.
 *
 * @read_u8:	stream reader
 * @data:	context passed to the stream reader, optional
 * Return:	Unicode code point, 0 at the end of the stream,
 *		'?' for an invalid sequence
 */
static int get_code(u8 (*read_u8)(void *data), void *data)
{
	s32 ch = 0;

	ch = read_u8(data);
	if (!ch)
		return 0;
	if (ch >= 0xc2 && ch <= 0xf4) {
		int code = 0;
		/* Smallest code point that requires this sequence length */
		int min = 0x80;

		if (ch >= 0xe0) {
			if (ch >= 0xf0) {
				/* 0xf0 - 0xf4: four byte sequence */
				min = 0x10000;
				ch &= 0x07;
				code = ch << 18;
				ch = read_u8(data);
				if (ch < 0x80 || ch > 0xbf)
					goto error;
				ch &= 0x3f;
			} else {
				/* 0xe0 - 0xef: three byte sequence */
				min = 0x800;
				ch &= 0x0f;
			}
			code += ch << 12;
			/* Beyond the Unicode range, no need to read further */
			if (code >= 0x110000)
				goto error;
			ch = read_u8(data);
			if (ch < 0x80 || ch > 0xbf)
				goto error;
		}
		/* 0xc2 - 0xdf or continuation byte (0x80 - 0xbf) */
		ch &= 0x3f;
		code += ch << 6;
		ch = read_u8(data);
		if (ch < 0x80 || ch > 0xbf)
			goto error;
		ch &= 0x3f;
		ch += code;
		/*
		 * Only the complete value tells whether the sequence is
		 * overlong or encodes a surrogate.
		 */
		if (ch < min || (ch >= 0xD800 && ch <= 0xDFFF))
			goto error;
	} else if (ch >= 0x80) {
		/* Stray continuation byte or invalid lead byte */
		goto error;
	}
	return ch;
error:
	return '?';
}

/**
 * read_string() - read byte from character string
 *
 * @data:	pointer to string pointer
 * Return:	byte read, 0 if a pointer is NULL or the end of the string
 *		is reached
 *
 * The string pointer is incremented if it does not point to '\0'.
 */
static u8 read_string(void *data)
{
	const char **src = (const char **)data;
	u8 c;

	if (!src || !*src || !**src)
		return 0;
	c = **src;
	++*src;
	return c;
}
103 | ||
/**
 * read_console() - read byte from console
 *
 * @data:	not used, needed to match the stream reader interface
 * Return:	byte read or 0 if getchar() reports a negative value
 */
static u8 read_console(void *data)
{
	int ch = getchar();

	return ch < 0 ? 0 : ch;
}
119 | ||
120 | int console_read_unicode(s32 *code) | |
121 | { | |
122 | if (!tstc()) { | |
123 | /* No input available */ | |
124 | return 1; | |
125 | } | |
126 | ||
127 | /* Read Unicode code */ | |
128 | *code = get_code(read_console, NULL); | |
129 | return 0; | |
130 | } | |
131 | ||
132 | s32 utf8_get(const char **src) | |
133 | { | |
134 | return get_code(read_string, src); | |
d8c28232 HS |
135 | } |
136 | ||
/**
 * utf8_put() - write a code point to a buffer as UTF-8
 *
 * One to four bytes are written and the buffer pointer is advanced past
 * them. No terminating '\0' is written. Nothing is written if -1 is returned.
 *
 * @code:	Unicode code point to write
 * @dst:	pointer to the buffer pointer
 * Return:	0 on success, -1 if @dst or *@dst is NULL or @code is not a
 *		valid code point (negative, surrogate, or above 0x10ffff)
 */
int utf8_put(s32 code, char **dst)
{
	char *out;

	if (!dst || !*dst)
		return -1;
	/* A negative value would otherwise be stored as a truncated byte */
	if (code < 0 || (code >= 0xD800 && code <= 0xDFFF) ||
	    code >= 0x110000)
		return -1;

	out = *dst;
	if (code <= 0x007F) {
		*out++ = code;
	} else if (code <= 0x07FF) {
		*out++ = code >> 6 | 0xC0;
		*out++ = (code & 0x3F) | 0x80;
	} else if (code < 0x10000) {
		*out++ = code >> 12 | 0xE0;
		*out++ = (code >> 6 & 0x3F) | 0x80;
		*out++ = (code & 0x3F) | 0x80;
	} else {
		*out++ = code >> 18 | 0xF0;
		*out++ = (code >> 12 & 0x3F) | 0x80;
		*out++ = (code >> 6 & 0x3F) | 0x80;
		*out++ = (code & 0x3F) | 0x80;
	}
	*dst = out;
	return 0;
}
165 | ||
/**
 * utf8_utf16_strnlen() - length of a UTF-8 string after conversion to UTF-16
 *
 * @src:	UTF-8 string
 * @count:	maximum number of code points to evaluate
 * Return:	number of u16 units needed, not counting a terminating 0
 */
size_t utf8_utf16_strnlen(const char *src, size_t count)
{
	size_t units = 0;

	while (*src && count) {
		s32 cp = utf8_get(&src);

		if (!cp)
			break;
		/*
		 * Only code points of 0x10000 and above take two units; every
		 * other value, including a negative error code, takes one.
		 */
		units += (cp >= 0x10000) ? 2 : 1;
		--count;
	}
	return units;
}
186 | ||
/**
 * utf8_utf16_strncpy() - copy UTF-8 string to UTF-16 string
 *
 * The destination pointer is advanced by utf16_put() and a terminating 0 is
 * stored at its final position.
 *
 * @dst:	pointer to the destination buffer pointer
 * @src:	UTF-8 source string
 * @count:	maximum number of code points to copy
 * Return:	0 on success, -1 if @src, @dst or *@dst is NULL
 */
int utf8_utf16_strncpy(u16 **dst, const char *src, size_t count)
{
	if (!src || !dst || !*dst)
		return -1;

	while (count && *src) {
		s32 cp = utf8_get(&src);

		/* Substitute '?' for an error code */
		utf16_put(cp < 0 ? '?' : cp, dst);
		--count;
	}
	**dst = 0;
	return 0;
}
202 | ||
/**
 * utf16_get() - get next UTF-16 code point from string
 *
 * The string pointer is advanced past the units read. If the string pointer
 * points to 0 it is not advanced. When a high surrogate is followed by a
 * non-zero unit that is not a low surrogate, both units are consumed.
 *
 * @src:	pointer to the string pointer
 * Return:	code point, 0 at the end of the string, -1 if a pointer is
 *		NULL or the string holds an unpaired surrogate
 */
s32 utf16_get(const u16 **src)
{
	s32 code, code2;

	if (!src || !*src)
		return -1;
	if (!**src)
		return 0;
	code = **src;
	++*src;
	/* A low surrogate must not come first */
	if (code >= 0xDC00 && code <= 0xDFFF)
		return -1;
	if (code >= 0xD800 && code <= 0xDBFF) {
		/* High surrogate at the end of the string */
		if (!**src)
			return -1;
		code &= 0x3ff;
		code <<= 10;
		code += 0x10000;
		code2 = **src;
		++*src;
		/* Low surrogates are 0xDC00 - 0xDFFF, both ends inclusive */
		if (code2 < 0xDC00 || code2 > 0xDFFF)
			return -1;
		code2 &= 0x3ff;
		code += code2;
	}
	return code;
}
230 | ||
/**
 * utf16_put() - write a code point to a buffer as UTF-16
 *
 * One unit is written for code points below 0x10000, otherwise a surrogate
 * pair. The buffer pointer is advanced past the units written. No
 * terminating 0 is written. Nothing is written if -1 is returned.
 *
 * @code:	Unicode code point to write
 * @dst:	pointer to the buffer pointer
 * Return:	0 on success, -1 if @dst or *@dst is NULL or @code is not a
 *		valid code point (negative, surrogate, or above 0x10ffff)
 */
int utf16_put(s32 code, u16 **dst)
{
	u16 *out;

	if (!dst || !*dst)
		return -1;
	/* A negative value would otherwise be stored as a truncated unit */
	if (code < 0 || (code >= 0xD800 && code <= 0xDFFF) ||
	    code >= 0x110000)
		return -1;

	out = *dst;
	if (code < 0x10000) {
		*out++ = code;
	} else {
		code -= 0x10000;
		*out++ = code >> 10 | 0xD800;
		*out++ = (code & 0x3ff) | 0xDC00;
	}
	*dst = out;
	return 0;
}
248 | ||
/**
 * utf16_strnlen() - count code points in a UTF-16 string
 *
 * An illegal sequence reported by utf16_get() is counted as one code point so
 * that space for a replacement character is reserved.
 *
 * @src:	UTF-16 string
 * @count:	maximum number of code points to count
 * Return:	number of code points
 */
size_t utf16_strnlen(const u16 *src, size_t count)
{
	size_t points = 0;

	while (*src && count) {
		if (!utf16_get(&src))
			break;
		++points;
		--count;
	}
	return points;
}
266 | ||
/**
 * utf16_utf8_strnlen() - length of a UTF-16 string after conversion to UTF-8
 *
 * @src:	UTF-16 string
 * @count:	maximum number of code points to evaluate
 * Return:	number of bytes needed, not counting a terminating '\0'
 */
size_t utf16_utf8_strnlen(const u16 *src, size_t count)
{
	size_t bytes = 0;

	while (*src && count) {
		s32 cp = utf16_get(&src);

		if (!cp)
			break;
		if (cp < 0x80) {
			/*
			 * One byte for ASCII; a negative error code also
			 * lands here, reserving room for a replacement
			 * character.
			 */
			bytes += 1;
		} else if (cp < 0x800) {
			bytes += 2;
		} else if (cp < 0x10000) {
			bytes += 3;
		} else {
			bytes += 4;
		}
		--count;
	}
	return bytes;
}
290 | ||
/**
 * utf16_utf8_strncpy() - copy UTF-16 string to UTF-8 string
 *
 * The destination pointer is advanced by utf8_put() and a terminating '\0' is
 * stored at its final position.
 *
 * @dst:	pointer to the destination buffer pointer
 * @src:	UTF-16 source string
 * @count:	maximum number of code points to copy
 * Return:	0 on success, -1 if @src, @dst or *@dst is NULL
 */
int utf16_utf8_strncpy(char **dst, const u16 *src, size_t count)
{
	if (!src || !dst || !*dst)
		return -1;

	while (count && *src) {
		s32 cp = utf16_get(&src);

		/* Substitute '?' for an illegal sequence */
		utf8_put(cp < 0 ? '?' : cp, dst);
		--count;
	}
	**dst = 0;
	return 0;
}
306 | ||
b5130a81 HS |
307 | s32 utf_to_lower(const s32 code) |
308 | { | |
309 | struct capitalization_table *pos = capitalization_table; | |
310 | s32 ret = code; | |
311 | ||
312 | if (code <= 0x7f) { | |
313 | if (code >= 'A' && code <= 'Z') | |
314 | ret += 0x20; | |
315 | return ret; | |
316 | } | |
317 | for (; pos->upper; ++pos) { | |
318 | if (pos->upper == code) { | |
319 | ret = pos->lower; | |
320 | break; | |
321 | } | |
322 | } | |
323 | return ret; | |
324 | } | |
325 | ||
326 | s32 utf_to_upper(const s32 code) | |
327 | { | |
328 | struct capitalization_table *pos = capitalization_table; | |
329 | s32 ret = code; | |
330 | ||
331 | if (code <= 0x7f) { | |
332 | if (code >= 'a' && code <= 'z') | |
333 | ret -= 0x20; | |
334 | return ret; | |
335 | } | |
336 | for (; pos->lower; ++pos) { | |
337 | if (pos->lower == code) { | |
338 | ret = pos->upper; | |
339 | break; | |
340 | } | |
341 | } | |
342 | return ret; | |
343 | } | |
78178bb0 | 344 | |
f8062c96 AT |
/**
 * u16_strncmp() - compare two u16 strings
 *
 * Comparison stops at the first differing u16, at the end of @s1, or after
 * @n units, whichever comes first.
 *
 * @s1:		first string to compare
 * @s2:		second string to compare
 * @n:		maximum number of u16 to compare
 * Return:	0  if the first n u16 are the same in s1 and s2
 *		< 0 if the first different u16 in s1 is less than the
 *		corresponding u16 in s2
 *		> 0 if the first different u16 in s1 is greater than the
 *		corresponding u16 in s2
 */
int u16_strncmp(const u16 *s1, const u16 *s2, size_t n)
{
	size_t i;

	for (i = 0; i < n; ++i) {
		int diff = s1[i] - s2[i];

		if (diff || !s1[i])
			return diff;
	}

	/* All n units matched (or n was 0) */
	return 0;
}
369 | ||
/**
 * u16_strlen() - count u16 units in a 0-terminated u16 string
 *
 * The string is read byte by byte, so no u16-sized access is made to @in.
 *
 * @in:		0-terminated u16 string
 * Return:	number of u16 units before the terminating 0
 */
size_t u16_strlen(const void *in)
{
	const char *bytes = in;
	size_t units = 0;

	/* A unit terminates the string only if both of its bytes are zero */
	while (bytes[2 * units] || bytes[2 * units + 1])
		++units;
	return units;
}
381 | ||
/**
 * u16_strnlen() - count u16 units in a string, up to a limit
 *
 * @in:		u16 string
 * @count:	maximum number of u16 units to examine
 * Return:	number of units before the first 0, at most @count
 */
size_t __efi_runtime u16_strnlen(const u16 *in, size_t count)
{
	size_t len = 0;

	/* The limit is tested first so in[count] is never read */
	while (len < count && in[len])
		++len;
	return len;
}
388 | ||
4835d35a SG |
/**
 * u16_strsize() - size in bytes of a u16 string including its terminator
 *
 * @in:		0-terminated u16 string
 * Return:	(u16_strlen(@in) + 1) * sizeof(u16)
 */
size_t u16_strsize(const void *in)
{
	/* One extra unit for the terminating 0 */
	size_t units = u16_strlen(in) + 1;

	return units * sizeof(u16);
}
393 | ||
2a3537ae AT |
/**
 * u16_strcpy() - copy a 0-terminated u16 string
 *
 * Units are copied up to and including the terminating 0. No bound is
 * applied to the destination.
 *
 * @dest:	destination buffer
 * @src:	0-terminated source string
 * Return:	@dest
 */
u16 *u16_strcpy(u16 *dest, const u16 *src)
{
	u16 *out = dest;

	for (;;) {
		u16 unit = *src++;

		*out++ = unit;
		if (!unit)
			break;
	}

	return dest;
}
406 | ||
/**
 * u16_strdup() - duplicate a 0-terminated u16 string
 *
 * The copy is obtained with malloc() and includes the terminating 0.
 *
 * @src:	0-terminated u16 string, may be NULL
 * Return:	newly allocated copy, NULL if @src is NULL or malloc() fails
 */
u16 *u16_strdup(const void *src)
{
	size_t bytes;
	u16 *copy;

	if (!src)
		return NULL;

	bytes = (u16_strlen(src) + 1) * sizeof(u16);
	copy = malloc(bytes);
	if (copy)
		memcpy(copy, src, bytes);

	return copy;
}
422 | ||
78178bb0 RC |
/**
 * utf16_to_utf8() - convert UTF-16 to UTF-8
 *
 * Exactly @size units are read from @src; a 0 unit does not stop the
 * conversion. No terminating '\0' is written. Every unpaired surrogate
 * (a low surrogate without a preceding high surrogate, a high surrogate not
 * followed by a low surrogate, or a high surrogate in the last unit) is
 * replaced by a single '?'. At most three bytes are written per input unit.
 *
 * @dest:	destination buffer
 * @src:	UTF-16 source units
 * @size:	number of u16 units to convert
 * Return:	pointer to the byte after the last byte written
 */
uint8_t *utf16_to_utf8(uint8_t *dest, const uint16_t *src, size_t size)
{
	uint32_t code_high = 0;

	while (size) {
		uint32_t code = *src;

		if (code_high) {
			if (code >= 0xDC00 && code <= 0xDFFF) {
				/* Surrogate pair. */
				code = ((code_high - 0xD800) << 10) +
				       (code - 0xDC00) + 0x10000;

				*dest++ = (code >> 18) | 0xF0;
				*dest++ = ((code >> 12) & 0x3F) | 0x80;
				*dest++ = ((code >> 6) & 0x3F) | 0x80;
				*dest++ = (code & 0x3F) | 0x80;
				src++;
				size--;
			} else {
				/*
				 * Unpaired high surrogate. The current unit
				 * may be valid on its own, so neither src nor
				 * size is advanced; it is converted in the
				 * next iteration.
				 */
				*dest++ = '?';
			}
			code_high = 0;
			continue;
		}

		src++;
		size--;
		if (code <= 0x007F) {
			*dest++ = code;
		} else if (code <= 0x07FF) {
			*dest++ = (code >> 6) | 0xC0;
			*dest++ = (code & 0x3F) | 0x80;
		} else if (code >= 0xD800 && code <= 0xDBFF) {
			/* Hold back until the next unit has been seen */
			code_high = code;
		} else if (code >= 0xDC00 && code <= 0xDFFF) {
			/* Low surrogate without a preceding high surrogate */
			*dest++ = '?';
		} else {
			/* A single u16 unit is always below 0x10000 */
			*dest++ = (code >> 12) | 0xE0;
			*dest++ = ((code >> 6) & 0x3F) | 0x80;
			*dest++ = (code & 0x3F) | 0x80;
		}
	}

	/* Input ended in the middle of a surrogate pair */
	if (code_high)
		*dest++ = '?';

	return dest;
}