]>
Commit | Line | Data |
---|---|---|
f739fcd8 | 1 | // SPDX-License-Identifier: GPL-2.0+ |
78178bb0 RC |
2 | /* |
3 | * charset conversion utils | |
4 | * | |
5 | * Copyright (c) 2017 Rob Clark | |
78178bb0 RC |
6 | */ |
7 | ||
35cbb796 | 8 | #include <common.h> |
78178bb0 | 9 | #include <charset.h> |
b5130a81 | 10 | #include <capitalization.h> |
78178bb0 RC |
11 | #include <malloc.h> |
12 | ||
b5130a81 HS |
13 | static struct capitalization_table capitalization_table[] = |
14 | #ifdef CONFIG_EFI_UNICODE_CAPITALIZATION | |
15 | UNICODE_CAPITALIZATION_TABLE; | |
16 | #elif CONFIG_FAT_DEFAULT_CODEPAGE == 1250 | |
17 | CP1250_CAPITALIZATION_TABLE; | |
18 | #else | |
19 | CP437_CAPITALIZATION_TABLE; | |
20 | #endif | |
21 | ||
35cbb796 HS |
22 | /** |
23 | * get_code() - read Unicode code point from UTF-8 stream | |
24 | * | |
25 | * @read_u8: - stream reader | |
26 | * @src: - string buffer passed to stream reader, optional | |
27 | * Return: - Unicode code point | |
28 | */ | |
29 | static int get_code(u8 (*read_u8)(void *data), void *data) | |
d8c28232 | 30 | { |
35cbb796 | 31 | s32 ch = 0; |
d8c28232 | 32 | |
35cbb796 HS |
33 | ch = read_u8(data); |
34 | if (!ch) | |
d8c28232 | 35 | return 0; |
35cbb796 HS |
36 | if (ch >= 0xc2 && ch <= 0xf4) { |
37 | int code = 0; | |
38 | ||
39 | if (ch >= 0xe0) { | |
40 | if (ch >= 0xf0) { | |
d8c28232 | 41 | /* 0xf0 - 0xf4 */ |
35cbb796 HS |
42 | ch &= 0x07; |
43 | code = ch << 18; | |
44 | ch = read_u8(data); | |
45 | if (ch < 0x80 || ch > 0xbf) | |
46 | goto error; | |
47 | ch &= 0x3f; | |
d8c28232 HS |
48 | } else { |
49 | /* 0xe0 - 0xef */ | |
35cbb796 | 50 | ch &= 0x0f; |
d8c28232 | 51 | } |
35cbb796 | 52 | code += ch << 12; |
d8c28232 HS |
53 | if ((code >= 0xD800 && code <= 0xDFFF) || |
54 | code >= 0x110000) | |
35cbb796 HS |
55 | goto error; |
56 | ch = read_u8(data); | |
57 | if (ch < 0x80 || ch > 0xbf) | |
58 | goto error; | |
d8c28232 HS |
59 | } |
60 | /* 0xc0 - 0xdf or continuation byte (0x80 - 0xbf) */ | |
35cbb796 HS |
61 | ch &= 0x3f; |
62 | code += ch << 6; | |
63 | ch = read_u8(data); | |
64 | if (ch < 0x80 || ch > 0xbf) | |
65 | goto error; | |
66 | ch &= 0x3f; | |
67 | ch += code; | |
68 | } else if (ch >= 0x80) { | |
69 | goto error; | |
d8c28232 | 70 | } |
35cbb796 HS |
71 | return ch; |
72 | error: | |
73 | return '?'; | |
74 | } | |
75 | ||
76 | /** | |
77 | * read_string() - read byte from character string | |
78 | * | |
79 | * @data: - pointer to string | |
80 | * Return: - byte read | |
81 | * | |
82 | * The string pointer is incremented if it does not point to '\0'. | |
83 | */ | |
84 | static u8 read_string(void *data) | |
85 | ||
86 | { | |
87 | const char **src = (const char **)data; | |
88 | u8 c; | |
89 | ||
90 | if (!src || !*src || !**src) | |
91 | return 0; | |
92 | c = **src; | |
d8c28232 | 93 | ++*src; |
35cbb796 HS |
94 | return c; |
95 | } | |
96 | ||
97 | /** | |
98 | * read_console() - read byte from console | |
99 | * | |
60d79876 HS |
100 | * @data - not used, needed to match interface |
101 | * Return: - byte read or 0 on error | |
35cbb796 HS |
102 | */ |
103 | static u8 read_console(void *data) | |
104 | { | |
60d79876 HS |
105 | int ch; |
106 | ||
107 | ch = getc(); | |
108 | if (ch < 0) | |
109 | ch = 0; | |
110 | return ch; | |
35cbb796 HS |
111 | } |
112 | ||
113 | int console_read_unicode(s32 *code) | |
114 | { | |
115 | if (!tstc()) { | |
116 | /* No input available */ | |
117 | return 1; | |
118 | } | |
119 | ||
120 | /* Read Unicode code */ | |
121 | *code = get_code(read_console, NULL); | |
122 | return 0; | |
123 | } | |
124 | ||
125 | s32 utf8_get(const char **src) | |
126 | { | |
127 | return get_code(read_string, src); | |
d8c28232 HS |
128 | } |
129 | ||
130 | int utf8_put(s32 code, char **dst) | |
131 | { | |
132 | if (!dst || !*dst) | |
133 | return -1; | |
134 | if ((code >= 0xD800 && code <= 0xDFFF) || code >= 0x110000) | |
135 | return -1; | |
136 | if (code <= 0x007F) { | |
137 | **dst = code; | |
138 | } else { | |
139 | if (code <= 0x07FF) { | |
140 | **dst = code >> 6 | 0xC0; | |
141 | } else { | |
142 | if (code < 0x10000) { | |
143 | **dst = code >> 12 | 0xE0; | |
144 | } else { | |
145 | **dst = code >> 18 | 0xF0; | |
146 | ++*dst; | |
147 | **dst = (code >> 12 & 0x3F) | 0x80; | |
148 | } | |
149 | ++*dst; | |
150 | **dst = (code >> 6 & 0x3F) | 0x80; | |
151 | } | |
152 | ++*dst; | |
153 | **dst = (code & 0x3F) | 0x80; | |
154 | } | |
155 | ++*dst; | |
156 | return 0; | |
157 | } | |
158 | ||
159 | size_t utf8_utf16_strnlen(const char *src, size_t count) | |
160 | { | |
161 | size_t len = 0; | |
162 | ||
163 | for (; *src && count; --count) { | |
164 | s32 code = utf8_get(&src); | |
165 | ||
166 | if (!code) | |
167 | break; | |
168 | if (code < 0) { | |
169 | /* Reserve space for a replacement character */ | |
170 | len += 1; | |
171 | } else if (code < 0x10000) { | |
172 | len += 1; | |
173 | } else { | |
174 | len += 2; | |
175 | } | |
176 | } | |
177 | return len; | |
178 | } | |
179 | ||
180 | int utf8_utf16_strncpy(u16 **dst, const char *src, size_t count) | |
181 | { | |
182 | if (!src || !dst || !*dst) | |
183 | return -1; | |
184 | ||
185 | for (; count && *src; --count) { | |
186 | s32 code = utf8_get(&src); | |
187 | ||
188 | if (code < 0) | |
189 | code = '?'; | |
190 | utf16_put(code, dst); | |
191 | } | |
192 | **dst = 0; | |
193 | return 0; | |
194 | } | |
195 | ||
196 | s32 utf16_get(const u16 **src) | |
197 | { | |
198 | s32 code, code2; | |
199 | ||
200 | if (!src || !*src) | |
201 | return -1; | |
202 | if (!**src) | |
203 | return 0; | |
204 | code = **src; | |
205 | ++*src; | |
206 | if (code >= 0xDC00 && code <= 0xDFFF) | |
207 | return -1; | |
208 | if (code >= 0xD800 && code <= 0xDBFF) { | |
209 | if (!**src) | |
210 | return -1; | |
211 | code &= 0x3ff; | |
212 | code <<= 10; | |
213 | code += 0x10000; | |
214 | code2 = **src; | |
215 | ++*src; | |
216 | if (code2 <= 0xDC00 || code2 >= 0xDFFF) | |
217 | return -1; | |
218 | code2 &= 0x3ff; | |
219 | code += code2; | |
220 | } | |
221 | return code; | |
222 | } | |
223 | ||
224 | int utf16_put(s32 code, u16 **dst) | |
225 | { | |
226 | if (!dst || !*dst) | |
227 | return -1; | |
228 | if ((code >= 0xD800 && code <= 0xDFFF) || code >= 0x110000) | |
229 | return -1; | |
230 | if (code < 0x10000) { | |
231 | **dst = code; | |
232 | } else { | |
233 | code -= 0x10000; | |
234 | **dst = code >> 10 | 0xD800; | |
235 | ++*dst; | |
236 | **dst = (code & 0x3ff) | 0xDC00; | |
237 | } | |
238 | ++*dst; | |
239 | return 0; | |
240 | } | |
241 | ||
242 | size_t utf16_strnlen(const u16 *src, size_t count) | |
243 | { | |
244 | size_t len = 0; | |
245 | ||
246 | for (; *src && count; --count) { | |
247 | s32 code = utf16_get(&src); | |
248 | ||
249 | if (!code) | |
250 | break; | |
251 | /* | |
252 | * In case of an illegal sequence still reserve space for a | |
253 | * replacement character. | |
254 | */ | |
255 | ++len; | |
256 | } | |
257 | return len; | |
258 | } | |
259 | ||
260 | size_t utf16_utf8_strnlen(const u16 *src, size_t count) | |
261 | { | |
262 | size_t len = 0; | |
263 | ||
264 | for (; *src && count; --count) { | |
265 | s32 code = utf16_get(&src); | |
266 | ||
267 | if (!code) | |
268 | break; | |
269 | if (code < 0) | |
270 | /* Reserve space for a replacement character */ | |
271 | len += 1; | |
272 | else if (code < 0x80) | |
273 | len += 1; | |
274 | else if (code < 0x800) | |
275 | len += 2; | |
276 | else if (code < 0x10000) | |
277 | len += 3; | |
278 | else | |
279 | len += 4; | |
280 | } | |
281 | return len; | |
282 | } | |
283 | ||
284 | int utf16_utf8_strncpy(char **dst, const u16 *src, size_t count) | |
285 | { | |
286 | if (!src || !dst || !*dst) | |
287 | return -1; | |
288 | ||
289 | for (; count && *src; --count) { | |
290 | s32 code = utf16_get(&src); | |
291 | ||
292 | if (code < 0) | |
293 | code = '?'; | |
294 | utf8_put(code, dst); | |
295 | } | |
296 | **dst = 0; | |
297 | return 0; | |
298 | } | |
299 | ||
b5130a81 HS |
300 | s32 utf_to_lower(const s32 code) |
301 | { | |
302 | struct capitalization_table *pos = capitalization_table; | |
303 | s32 ret = code; | |
304 | ||
305 | if (code <= 0x7f) { | |
306 | if (code >= 'A' && code <= 'Z') | |
307 | ret += 0x20; | |
308 | return ret; | |
309 | } | |
310 | for (; pos->upper; ++pos) { | |
311 | if (pos->upper == code) { | |
312 | ret = pos->lower; | |
313 | break; | |
314 | } | |
315 | } | |
316 | return ret; | |
317 | } | |
318 | ||
319 | s32 utf_to_upper(const s32 code) | |
320 | { | |
321 | struct capitalization_table *pos = capitalization_table; | |
322 | s32 ret = code; | |
323 | ||
324 | if (code <= 0x7f) { | |
325 | if (code >= 'a' && code <= 'z') | |
326 | ret -= 0x20; | |
327 | return ret; | |
328 | } | |
329 | for (; pos->lower; ++pos) { | |
330 | if (pos->lower == code) { | |
331 | ret = pos->upper; | |
332 | break; | |
333 | } | |
334 | } | |
335 | return ret; | |
336 | } | |
78178bb0 | 337 | |
f8062c96 AT |
338 | /* |
339 | * u16_strncmp() - compare two u16 string | |
340 | * | |
341 | * @s1: first string to compare | |
342 | * @s2: second string to compare | |
343 | * @n: maximum number of u16 to compare | |
344 | * Return: 0 if the first n u16 are the same in s1 and s2 | |
345 | * < 0 if the first different u16 in s1 is less than the | |
346 | * corresponding u16 in s2 | |
347 | * > 0 if the first different u16 in s1 is greater than the | |
348 | * corresponding u16 in s2 | |
349 | */ | |
350 | int u16_strncmp(const u16 *s1, const u16 *s2, size_t n) | |
351 | { | |
352 | int ret = 0; | |
353 | ||
354 | for (; n; --n, ++s1, ++s2) { | |
355 | ret = *s1 - *s2; | |
356 | if (ret || !*s1) | |
357 | break; | |
358 | } | |
359 | ||
360 | return ret; | |
361 | } | |
362 | ||
/**
 * u16_strlen() - count the u16 units of a string
 *
 * @in:		string of 16 bit units ending with a unit whose two bytes
 *		are both 0
 * Return:	number of units before the terminator
 *
 * The string is scanned byte-wise through a char pointer rather than through
 * a u16 pointer.
 */
size_t u16_strlen(const void *in)
{
	const char *bytes = in;
	size_t units = 0;

	/* A unit terminates the string only if both of its bytes are 0 */
	while (bytes[2 * units] || bytes[2 * units + 1])
		units++;
	return units;
}
374 | ||
1dde0d57 | 375 | size_t u16_strnlen(const u16 *in, size_t count) |
78178bb0 RC |
376 | { |
377 | size_t i; | |
378 | for (i = 0; count-- && in[i]; i++); | |
379 | return i; | |
380 | } | |
381 | ||
4835d35a SG |
382 | size_t u16_strsize(const void *in) |
383 | { | |
384 | return (u16_strlen(in) + 1) * sizeof(u16); | |
385 | } | |
386 | ||
2a3537ae AT |
387 | u16 *u16_strcpy(u16 *dest, const u16 *src) |
388 | { | |
389 | u16 *tmp = dest; | |
390 | ||
391 | for (;; dest++, src++) { | |
392 | *dest = *src; | |
393 | if (!*src) | |
394 | break; | |
395 | } | |
396 | ||
397 | return tmp; | |
398 | } | |
399 | ||
317068b8 | 400 | u16 *u16_strdup(const void *src) |
2a3537ae AT |
401 | { |
402 | u16 *new; | |
317068b8 | 403 | size_t len; |
2a3537ae AT |
404 | |
405 | if (!src) | |
406 | return NULL; | |
317068b8 HS |
407 | len = (u16_strlen(src) + 1) * sizeof(u16); |
408 | new = malloc(len); | |
2a3537ae AT |
409 | if (!new) |
410 | return NULL; | |
317068b8 | 411 | memcpy(new, src, len); |
2a3537ae AT |
412 | |
413 | return new; | |
414 | } | |
415 | ||
78178bb0 RC |
/**
 * utf16_to_utf8() - convert UTF-16 code units to UTF-8
 *
 * @dest:	output buffer; three bytes per input unit are always enough
 * @src:	UTF-16 input
 * @size:	number of 16 bit units to convert
 * Return:	pointer just past the last byte written
 *
 * No terminating '\0' is written. A low surrogate without a preceding high
 * surrogate is replaced by '?'. A high surrogate that is not followed by a
 * low surrogate is replaced by '?' and the following unit is then converted
 * on its own. A high surrogate in the very last unit produces no output.
 */
uint8_t *utf16_to_utf8(uint8_t *dest, const uint16_t *src, size_t size)
{
	uint32_t code_high = 0;

	while (size--) {
		uint32_t code = *src++;

		if (code_high) {
			if (code >= 0xDC00 && code <= 0xDFFF) {
				/* Surrogate pair. */
				code = ((code_high - 0xD800) << 10) +
				       (code - 0xDC00) + 0x10000;

				*dest++ = (code >> 18) | 0xF0;
				*dest++ = ((code >> 12) & 0x3F) | 0x80;
				*dest++ = ((code >> 6) & 0x3F) | 0x80;
				*dest++ = (code & 0x3F) | 0x80;
			} else {
				/* Unpaired high surrogate */
				*dest++ = '?';
				/*
				 * The current unit may be valid: push it back.
				 * The remaining count has to be restored too,
				 * otherwise the last unit of the input would
				 * never be converted.
				 */
				src--;
				size++;
			}
			code_high = 0;
			continue;
		}

		if (code <= 0x007F) {
			*dest++ = code;
		} else if (code <= 0x07FF) {
			*dest++ = (code >> 6) | 0xC0;
			*dest++ = (code & 0x3F) | 0x80;
		} else if (code >= 0xD800 && code <= 0xDBFF) {
			/* Remember the high surrogate, wait for its partner */
			code_high = code;
		} else if (code >= 0xDC00 && code <= 0xDFFF) {
			/* Unpaired low surrogate */
			*dest++ = '?';
		} else {
			/* Remaining 16 bit values take three bytes */
			*dest++ = (code >> 12) | 0xE0;
			*dest++ = ((code >> 6) & 0x3F) | 0x80;
			*dest++ = (code & 0x3F) | 0x80;
		}
	}

	return dest;
}