]> Git Repo - J-u-boot.git/blob - lib/charset.c
lib: charset: utility functions for Unicode
[J-u-boot.git] / lib / charset.c
1 // SPDX-License-Identifier: GPL-2.0+
2 /*
3  *  charset conversion utils
4  *
5  *  Copyright (c) 2017 Rob Clark
6  */
7
8 #include <charset.h>
9 #include <malloc.h>
10
/**
 * utf8_get() - read one Unicode code point from a UTF-8 string
 *
 * @src:	pointer to the read position; advanced past the consumed bytes
 * Return:	the code point, 0 at the end of the string (the position is
 *		not advanced in this case), or -1 if @src or *@src is NULL or
 *		if the byte sequence is not well-formed UTF-8
 *
 * On an illegal lead byte exactly that byte is consumed. A byte that is not
 * a valid continuation byte (including the terminating NUL) is never
 * consumed, so the read position cannot move past the end of the string and
 * the caller can resynchronize on the offending byte.
 */
s32 utf8_get(const char **src)
{
	const unsigned char *pos;
	unsigned char c;
	s32 code;
	s32 min;
	int trail;

	if (!src || !*src)
		return -1;
	pos = (const unsigned char *)*src;
	c = *pos;
	if (!c)
		return 0;
	++pos;
	if (c < 0x80) {
		*src = (const char *)pos;
		return c;
	}
	/*
	 * A continuation byte (0x80 - 0xbf) cannot start a sequence.
	 * 0x80 is coded as 0xc2 0x80, so a lead byte less than 0xc2 would be
	 * an overlong encoding.
	 * The highest code point is 0x10ffff which is coded as
	 * 0xf4 0x8f 0xbf 0xbf, so a lead byte above 0xf4 is illegal.
	 */
	if (c < 0xc2 || c > 0xf4) {
		*src = (const char *)pos;
		return -1;
	}
	if (c >= 0xf0) {
		/* 0xf0 - 0xf4: four byte sequence */
		code = c & 0x07;
		trail = 3;
		min = 0x10000;
	} else if (c >= 0xe0) {
		/* 0xe0 - 0xef: three byte sequence */
		code = c & 0x0f;
		trail = 2;
		min = 0x800;
	} else {
		/* 0xc2 - 0xdf: two byte sequence */
		code = c & 0x1f;
		trail = 1;
		min = 0x80;
	}
	for (; trail; --trail) {
		c = *pos;
		if (c < 0x80 || c > 0xbf) {
			/* Leave the offending byte for the next call */
			*src = (const char *)pos;
			return -1;
		}
		++pos;
		code = (code << 6) | (c & 0x3f);
	}
	*src = (const char *)pos;
	/*
	 * Validate the completely decoded value: reject overlong encodings,
	 * surrogates, and values beyond the Unicode range.
	 */
	if (code < min || (code >= 0xD800 && code <= 0xDFFF) ||
	    code > 0x10FFFF)
		return -1;
	return code;
}
73
/**
 * utf8_put() - write one Unicode code point as UTF-8
 *
 * @code:	code point to encode
 * @dst:	pointer to the write position; advanced past the bytes written
 * Return:	0 on success, -1 if @dst or *@dst is NULL or if @code is not
 *		a valid Unicode scalar value (negative, a surrogate
 *		0xD800 - 0xDFFF, or above 0x10FFFF)
 *
 * Between one and four bytes are written. No terminating NUL is appended
 * and nothing is written on error.
 */
int utf8_put(s32 code, char **dst)
{
	char *out;

	if (!dst || !*dst)
		return -1;
	/* Negative values are error codes, not code points */
	if (code < 0 || code > 0x10FFFF ||
	    (code >= 0xD800 && code <= 0xDFFF))
		return -1;

	out = *dst;
	if (code <= 0x007F) {
		*out++ = (char)code;
	} else if (code <= 0x07FF) {
		*out++ = (char)(code >> 6 | 0xC0);
		*out++ = (char)((code & 0x3F) | 0x80);
	} else if (code < 0x10000) {
		*out++ = (char)(code >> 12 | 0xE0);
		*out++ = (char)((code >> 6 & 0x3F) | 0x80);
		*out++ = (char)((code & 0x3F) | 0x80);
	} else {
		*out++ = (char)(code >> 18 | 0xF0);
		*out++ = (char)((code >> 12 & 0x3F) | 0x80);
		*out++ = (char)((code >> 6 & 0x3F) | 0x80);
		*out++ = (char)((code & 0x3F) | 0x80);
	}
	*dst = out;
	return 0;
}
102
/**
 * utf8_utf16_strnlen() - UTF-16 length of a UTF-8 string
 *
 * @src:	NUL terminated UTF-8 string
 * @count:	maximum number of code points to evaluate
 * Return:	number of u16 code units needed to store the evaluated part of
 *		@src as UTF-16, not counting a terminator
 *
 * An illegal sequence is counted as one code unit so that a replacement
 * character can be stored in its place.
 */
size_t utf8_utf16_strnlen(const char *src, size_t count)
{
	size_t units = 0;

	while (*src && count) {
		s32 cp = utf8_get(&src);

		if (cp == 0)
			break;
		/*
		 * Only code points outside the basic multilingual plane need
		 * a surrogate pair; everything else, including the -1 error
		 * value, takes a single code unit.
		 */
		units += (cp >= 0x10000) ? 2 : 1;
		--count;
	}
	return units;
}
123
/**
 * utf8_utf16_strncpy() - convert a UTF-8 string to UTF-16
 *
 * @dst:	pointer to the write position; advanced to the terminating
 *		zero code unit that is written at the end
 * @src:	NUL terminated UTF-8 string
 * @count:	maximum number of code points to convert
 * Return:	0 on success, -1 if @src, @dst or *@dst is NULL
 *
 * Illegal input sequences are replaced by '?'. The size of the destination
 * buffer is not checked here.
 */
int utf8_utf16_strncpy(u16 **dst, const char *src, size_t count)
{
	if (!dst || !*dst || !src)
		return -1;

	while (count && *src) {
		s32 cp = utf8_get(&src);

		/* Substitute a replacement character for decoding errors */
		utf16_put(cp < 0 ? '?' : cp, dst);
		--count;
	}
	**dst = 0;
	return 0;
}
139
/**
 * utf16_get() - read one Unicode code point from a UTF-16 string
 *
 * @src:	pointer to the read position; advanced past the consumed code
 *		units
 * Return:	the code point, 0 at the end of the string (the position is
 *		not advanced in this case), or -1 if @src or *@src is NULL or
 *		if a surrogate is not part of a valid pair
 *
 * A high surrogate (0xD800 - 0xDBFF) must be followed by a low surrogate
 * (0xDC00 - 0xDFFF, both bounds included). A high surrogate directly followed
 * by the terminator is an error and the terminator is not consumed. Any other
 * unit following a high surrogate is consumed together with it.
 */
s32 utf16_get(const u16 **src)
{
	s32 code, code2;

	if (!src || !*src)
		return -1;
	if (!**src)
		return 0;
	code = **src;
	++*src;
	/* A low surrogate must not come first */
	if (code >= 0xDC00 && code <= 0xDFFF)
		return -1;
	if (code >= 0xD800 && code <= 0xDBFF) {
		if (!**src)
			return -1;
		code &= 0x3ff;
		code <<= 10;
		code += 0x10000;
		code2 = **src;
		++*src;
		/* 0xDC00 and 0xDFFF themselves are valid low surrogates */
		if (code2 < 0xDC00 || code2 > 0xDFFF)
			return -1;
		code2 &= 0x3ff;
		code += code2;
	}
	return code;
}
167
/**
 * utf16_put() - write one Unicode code point as UTF-16
 *
 * @code:	code point to encode
 * @dst:	pointer to the write position; advanced past the code units
 *		written
 * Return:	0 on success, -1 if @dst or *@dst is NULL or if @code is not
 *		a valid Unicode scalar value (negative, a surrogate
 *		0xD800 - 0xDFFF, or above 0x10FFFF)
 *
 * One code unit is written for code points below 0x10000, a surrogate pair
 * otherwise. No terminator is appended and nothing is written on error.
 */
int utf16_put(s32 code, u16 **dst)
{
	u16 *out;

	if (!dst || !*dst)
		return -1;
	/* Negative values are error codes, not code points */
	if (code < 0 || code > 0x10FFFF ||
	    (code >= 0xD800 && code <= 0xDFFF))
		return -1;

	out = *dst;
	if (code < 0x10000) {
		*out++ = (u16)code;
	} else {
		code -= 0x10000;
		*out++ = (u16)(code >> 10 | 0xD800);
		*out++ = (u16)((code & 0x3ff) | 0xDC00);
	}
	*dst = out;
	return 0;
}
185
/**
 * utf16_strnlen() - count the code points of a UTF-16 string
 *
 * @src:	zero terminated UTF-16 string
 * @count:	maximum number of code points to evaluate
 * Return:	number of utf16_get() results obtained before the terminator
 *		or the limit was reached
 *
 * An illegal sequence is counted as one code point so that room for a
 * replacement character is reserved.
 */
size_t utf16_strnlen(const u16 *src, size_t count)
{
	size_t points = 0;

	while (*src && count) {
		if (!utf16_get(&src))
			break;
		/* Valid code points and decoding errors both count once */
		++points;
		--count;
	}
	return points;
}
203
/**
 * utf16_utf8_strnlen() - UTF-8 length of a UTF-16 string
 *
 * @src:	zero terminated UTF-16 string
 * @count:	maximum number of code points to evaluate
 * Return:	number of bytes needed to store the evaluated part of @src as
 *		UTF-8, not counting a terminating NUL
 *
 * An illegal sequence is counted as one byte so that a replacement
 * character can be stored in its place.
 */
size_t utf16_utf8_strnlen(const u16 *src, size_t count)
{
	size_t bytes = 0;

	while (*src && count) {
		s32 cp = utf16_get(&src);

		if (cp == 0)
			break;
		if (cp < 0x80) {
			/* ASCII, or the -1 error value (replacement char) */
			bytes += 1;
		} else if (cp < 0x800) {
			bytes += 2;
		} else if (cp < 0x10000) {
			bytes += 3;
		} else {
			bytes += 4;
		}
		--count;
	}
	return bytes;
}
227
/**
 * utf16_utf8_strncpy() - convert a UTF-16 string to UTF-8
 *
 * @dst:	pointer to the write position; advanced to the terminating NUL
 *		byte that is written at the end
 * @src:	zero terminated UTF-16 string
 * @count:	maximum number of code points to convert
 * Return:	0 on success, -1 if @src, @dst or *@dst is NULL
 *
 * Illegal input sequences are replaced by '?'. The size of the destination
 * buffer is not checked here.
 */
int utf16_utf8_strncpy(char **dst, const u16 *src, size_t count)
{
	if (!dst || !*dst || !src)
		return -1;

	while (count && *src) {
		s32 cp = utf16_get(&src);

		/* Substitute a replacement character for decoding errors */
		utf8_put(cp < 0 ? '?' : cp, dst);
		--count;
	}
	**dst = 0;
	return 0;
}
243
244
/**
 * u16_strlen() - count the code units of a zero terminated u16 string
 *
 * @in:		zero terminated string of u16 code units
 * Return:	number of code units before the terminator; surrogate pairs
 *		are not interpreted, each u16 counts once
 */
size_t u16_strlen(const u16 *in)
{
	const u16 *end = in;

	while (*end)
		++end;
	return (size_t)(end - in);
}
251
/**
 * u16_strnlen() - count the code units of a u16 string up to a limit
 *
 * @in:		string of u16 code units
 * @count:	maximum number of code units to look at
 * Return:	number of code units before the terminator, or @count if no
 *		terminator is found within the first @count code units
 */
size_t u16_strnlen(const u16 *in, size_t count)
{
	size_t n = 0;

	/* Check the limit first so that in[count] is never read */
	while (n < count && in[n])
		++n;
	return n;
}
258
/**
 * utf16_strcpy() - copy a zero terminated UTF-16 string
 *
 * @dest:	destination buffer; must be able to hold the string including
 *		its terminator, no size check is done here
 * @src:	zero terminated source string
 * Return:	@dest
 */
uint16_t *utf16_strcpy(uint16_t *dest, const uint16_t *src)
{
	uint16_t *out = dest;

	for (;;) {
		uint16_t unit = *src++;

		*out++ = unit;
		/* The terminator is copied as well before stopping */
		if (unit == '\0')
			break;
	}
	return dest;
}
268
/**
 * utf16_strdup() - duplicate a zero terminated UTF-16 string
 *
 * @s:		string to duplicate, may be NULL
 * Return:	copy of @s in a buffer obtained from malloc(), or NULL if @s
 *		is NULL or the allocation fails
 */
uint16_t *utf16_strdup(const uint16_t *s)
{
	uint16_t *copy;
	size_t units;

	if (!s)
		return NULL;

	/* One extra code unit for the terminator, two bytes per unit */
	units = u16_strlen(s) + 1;
	copy = malloc(units * 2);
	if (copy)
		utf16_strcpy(copy, s);
	return copy;
}
281
/**
 * utf16_to_utf8() - convert a buffer of UTF-16 code units to UTF-8
 *
 * @dest:	destination buffer; up to three bytes per input code unit may
 *		be written, no size check is done here
 * @src:	source code units
 * @size:	number of code units to read from @src
 * Return:	pointer to the byte following the last byte written
 *
 * Exactly @size code units are converted; a zero code unit is copied as a
 * NUL byte and does not stop the conversion. No terminator is appended.
 * A surrogate that is not part of a valid pair is replaced by '?'.
 */
uint8_t *utf16_to_utf8(uint8_t *dest, const uint16_t *src, size_t size)
{
	uint32_t code_high = 0;

	while (size--) {
		uint32_t code = *src++;

		if (code_high) {
			if (code >= 0xDC00 && code <= 0xDFFF) {
				/* Surrogate pair.  */
				code = ((code_high - 0xD800) << 10) +
				       (code - 0xDC00) + 0x10000;

				*dest++ = (code >> 18) | 0xF0;
				*dest++ = ((code >> 12) & 0x3F) | 0x80;
				*dest++ = ((code >> 6) & 0x3F) | 0x80;
				*dest++ = (code & 0x3F) | 0x80;
			} else {
				/* Unpaired high surrogate */
				*dest++ = '?';
				/*
				 * The current unit may be valid on its own.
				 * Give back both the unit and its share of
				 * the count so that it is converted by the
				 * next iteration.
				 */
				src--;
				size++;
			}
			code_high = 0;
		} else if (code <= 0x007F) {
			*dest++ = code;
		} else if (code <= 0x07FF) {
			*dest++ = (code >> 6) | 0xC0;
			*dest++ = (code & 0x3F) | 0x80;
		} else if (code >= 0xD800 && code <= 0xDBFF) {
			/* Remember the high surrogate, emit with its partner */
			code_high = code;
		} else if (code >= 0xDC00 && code <= 0xDFFF) {
			/* Low surrogate without preceding high surrogate */
			*dest++ = '?';
		} else {
			*dest++ = (code >> 12) | 0xE0;
			*dest++ = ((code >> 6) & 0x3F) | 0x80;
			*dest++ = (code & 0x3F) | 0x80;
		}
	}

	/* The input ended with a high surrogate that never got a partner */
	if (code_high)
		*dest++ = '?';

	return dest;
}
334
/**
 * utf8_to_utf16() - convert a buffer of UTF-8 bytes to UTF-16
 *
 * @dest:	destination buffer; at most one code unit per input byte is
 *		written, no size check is done here
 * @src:	source bytes
 * @size:	maximum number of bytes to read from @src
 * Return:	pointer to the code unit following the last one written
 *
 * Conversion stops after @size bytes or after a NUL byte, whichever comes
 * first. A NUL byte is copied to the output as a zero code unit.
 *
 * Each of the following is replaced by a single '?':
 * - a byte that cannot start a sequence (0x80 - 0xbf, 0xf8 - 0xff)
 * - a sequence cut short by the end of the input or by a byte that is not a
 *   continuation byte
 * - an encoded surrogate (0xd800 - 0xdfff)
 * - a value above 0x10ffff
 *
 * The byte that ends a malformed sequence is not consumed; it is decoded on
 * its own by the next iteration.
 */
uint16_t *utf8_to_utf16(uint16_t *dest, const uint8_t *src, size_t size)
{
	while (size--) {
		int extension_bytes = 0;
		/* Always consume the lead byte so that every pass advances */
		uint32_t code = *src++;

		if (code <= 0x7f) {
			/* Exit on zero byte */
			if (!code)
				size = 0;
		} else if (code <= 0xbf) {
			/* Stray continuation byte */
			code = '?';
		} else if (code <= 0xdf) {
			code &= 0x1f;
			extension_bytes = 1;
		} else if (code <= 0xef) {
			code &= 0x0f;
			extension_bytes = 2;
		} else if (code <= 0xf7) {
			code &= 0x07;
			extension_bytes = 3;
		} else {
			/* Illegal lead byte */
			code = '?';
		}

		for (; extension_bytes && size; --size, --extension_bytes) {
			/* Stop at, but do not swallow, a non-continuation byte */
			if ((*src & 0xc0) != 0x80)
				break;
			code <<= 6;
			code |= *src++ & 0x3f;
		}

		/*
		 * Incomplete sequences and values that cannot be represented
		 * in well-formed UTF-16 become a replacement character.
		 */
		if (extension_bytes || code > 0x10ffff ||
		    (code >= 0xd800 && code <= 0xdfff))
			code = '?';

		if (code < 0x10000) {
			*dest++ = code;
		} else {
			/*
			 * Simplified expression for
			 * (((code - 0x10000) >> 10) & 0x3ff) | 0xd800
			 */
			*dest++ = (code >> 10) + 0xd7c0;
			*dest++ = (code & 0x3ff) | 0xdc00;
		}
	}
	return dest;
}
This page took 0.050899 seconds and 4 git commands to generate.