]> Git Repo - linux.git/blame - fs/unicode/utf8-norm.c
unicode: Add utf8-data module
[linux.git] / fs / unicode / utf8-norm.c
CommitLineData
9f806850 1// SPDX-License-Identifier: GPL-2.0-only
44594c2f
OW
2/*
3 * Copyright (c) 2014 SGI.
4 * All rights reserved.
44594c2f
OW
5 */
6
7#include "utf8n.h"
8
2b3d0478 9int utf8version_is_supported(const struct unicode_map *um, unsigned int version)
44594c2f 10{
2b3d0478 11 int i = um->tables->utf8agetab_size - 1;
44594c2f 12
2b3d0478
CH
13 while (i >= 0 && um->tables->utf8agetab[i] != 0) {
14 if (version == um->tables->utf8agetab[i])
44594c2f
OW
15 return 1;
16 i--;
17 }
18 return 0;
19}
20EXPORT_SYMBOL(utf8version_is_supported);
21
22/*
23 * UTF-8 valid ranges.
24 *
25 * The UTF-8 encoding spreads the bits of a 32bit word over several
26 * bytes. This table gives the ranges that can be held and how they'd
27 * be represented.
28 *
29 * 0x00000000 0x0000007F: 0xxxxxxx
30 * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
31 * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
32 * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
33 * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
34 * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
35 *
36 * There is an additional requirement on UTF-8, in that only the
37 * shortest representation of a 32bit value is to be used. A decoder
38 * must not decode sequences that do not satisfy this requirement.
39 * Thus the allowed ranges have a lower bound.
40 *
41 * 0x00000000 0x0000007F: 0xxxxxxx
42 * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
43 * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
44 * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
45 * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
46 * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
47 *
48 * Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
49 * 17 planes of 65536 values. This limits the sequences actually seen
50 * even more, to just the following.
51 *
52 * 0 - 0x7F: 0 - 0x7F
53 * 0x80 - 0x7FF: 0xC2 0x80 - 0xDF 0xBF
54 * 0x800 - 0xFFFF: 0xE0 0xA0 0x80 - 0xEF 0xBF 0xBF
55 * 0x10000 - 0x10FFFF: 0xF0 0x90 0x80 0x80 - 0xF4 0x8F 0xBF 0xBF
56 *
57 * Within those ranges the surrogates 0xD800 - 0xDFFF are not allowed.
58 *
59 * Note that the longest sequence seen with valid usage is 4 bytes,
60 * the same a single UTF-32 character. This makes the UTF-8
61 * representation of Unicode strictly smaller than UTF-32.
62 *
63 * The shortest sequence requirement was introduced by:
64 * Corrigendum #1: UTF-8 Shortest Form
65 * It can be found here:
66 * http://www.unicode.org/versions/corrigendum1.html
67 *
68 */
69
70/*
71 * Return the number of bytes used by the current UTF-8 sequence.
72 * Assumes the input points to the first byte of a valid UTF-8
73 * sequence.
74 */
75static inline int utf8clen(const char *s)
76{
77 unsigned char c = *s;
78
79 return 1 + (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0);
80}
81
a8384c68
OW
82/*
83 * Decode a 3-byte UTF-8 sequence.
84 */
85static unsigned int
86utf8decode3(const char *str)
87{
88 unsigned int uc;
89
90 uc = *str++ & 0x0F;
91 uc <<= 6;
92 uc |= *str++ & 0x3F;
93 uc <<= 6;
94 uc |= *str++ & 0x3F;
95
96 return uc;
97}
98
99/*
100 * Encode a 3-byte UTF-8 sequence.
101 */
102static int
103utf8encode3(char *str, unsigned int val)
104{
105 str[2] = (val & 0x3F) | 0x80;
106 val >>= 6;
107 str[1] = (val & 0x3F) | 0x80;
108 val >>= 6;
109 str[0] = val | 0xE0;
110
111 return 3;
112}
113
44594c2f
OW
114/*
115 * utf8trie_t
116 *
117 * A compact binary tree, used to decode UTF-8 characters.
118 *
119 * Internal nodes are one byte for the node itself, and up to three
120 * bytes for an offset into the tree. The first byte contains the
121 * following information:
122 * NEXTBYTE - flag - advance to next byte if set
123 * BITNUM - 3 bit field - the bit number to tested
124 * OFFLEN - 2 bit field - number of bytes in the offset
125 * if offlen == 0 (non-branching node)
126 * RIGHTPATH - 1 bit field - set if the following node is for the
127 * right-hand path (tested bit is set)
128 * TRIENODE - 1 bit field - set if the following node is an internal
129 * node, otherwise it is a leaf node
130 * if offlen != 0 (branching node)
131 * LEFTNODE - 1 bit field - set if the left-hand node is internal
132 * RIGHTNODE - 1 bit field - set if the right-hand node is internal
133 *
134 * Due to the way utf8 works, there cannot be branching nodes with
135 * NEXTBYTE set, and moreover those nodes always have a righthand
136 * descendant.
137 */
138typedef const unsigned char utf8trie_t;
139#define BITNUM 0x07
140#define NEXTBYTE 0x08
141#define OFFLEN 0x30
142#define OFFLEN_SHIFT 4
143#define RIGHTPATH 0x40
144#define TRIENODE 0x80
145#define RIGHTNODE 0x40
146#define LEFTNODE 0x80
147
148/*
149 * utf8leaf_t
150 *
151 * The leaves of the trie are embedded in the trie, and so the same
152 * underlying datatype: unsigned char.
153 *
154 * leaf[0]: The unicode version, stored as a generation number that is
2b3d0478 155 * an index into ->utf8agetab[]. With this we can filter code
44594c2f
OW
156 * points based on the unicode version in which they were
157 * defined. The CCC of a non-defined code point is 0.
158 * leaf[1]: Canonical Combining Class. During normalization, we need
159 * to do a stable sort into ascending order of all characters
160 * with a non-zero CCC that occur between two characters with
161 * a CCC of 0, or at the begin or end of a string.
162 * The unicode standard guarantees that all CCC values are
163 * between 0 and 254 inclusive, which leaves 255 available as
164 * a special value.
165 * Code points with CCC 0 are known as stoppers.
166 * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the
167 * start of a NUL-terminated string that is the decomposition
168 * of the character.
169 * The CCC of a decomposable character is the same as the CCC
170 * of the first character of its decomposition.
171 * Some characters decompose as the empty string: these are
172 * characters with the Default_Ignorable_Code_Point property.
173 * These do affect normalization, as they all have CCC 0.
174 *
a8384c68
OW
175 * The decompositions in the trie have been fully expanded, with the
176 * exception of Hangul syllables, which are decomposed algorithmically.
44594c2f
OW
177 *
178 * Casefolding, if applicable, is also done using decompositions.
179 *
180 * The trie is constructed in such a way that leaves exist for all
181 * UTF-8 sequences that match the criteria from the "UTF-8 valid
182 * ranges" comment above, and only for those sequences. Therefore a
183 * lookup in the trie can be used to validate the UTF-8 input.
184 */
185typedef const unsigned char utf8leaf_t;
186
187#define LEAF_GEN(LEAF) ((LEAF)[0])
188#define LEAF_CCC(LEAF) ((LEAF)[1])
189#define LEAF_STR(LEAF) ((const char *)((LEAF) + 2))
190
191#define MINCCC (0)
192#define MAXCCC (254)
193#define STOPPER (0)
194#define DECOMPOSE (255)
195
a8384c68
OW
196/* Marker for hangul syllable decomposition. */
197#define HANGUL ((char)(255))
198/* Size of the synthesized leaf used for Hangul syllable decomposition. */
199#define UTF8HANGULLEAF (12)
200
201/*
202 * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
203 *
204 * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
205 * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
206 *
207 * SBase = 0xAC00
208 * LBase = 0x1100
209 * VBase = 0x1161
210 * TBase = 0x11A7
211 * LCount = 19
212 * VCount = 21
213 * TCount = 28
214 * NCount = 588 (VCount * TCount)
215 * SCount = 11172 (LCount * NCount)
216 *
217 * Decomposition:
218 * SIndex = s - SBase
219 *
220 * LV (Canonical/Full)
221 * LIndex = SIndex / NCount
222 * VIndex = (Sindex % NCount) / TCount
223 * LPart = LBase + LIndex
224 * VPart = VBase + VIndex
225 *
226 * LVT (Canonical)
227 * LVIndex = (SIndex / TCount) * TCount
228 * TIndex = (Sindex % TCount)
229 * LVPart = SBase + LVIndex
230 * TPart = TBase + TIndex
231 *
232 * LVT (Full)
233 * LIndex = SIndex / NCount
234 * VIndex = (Sindex % NCount) / TCount
235 * TIndex = (Sindex % TCount)
236 * LPart = LBase + LIndex
237 * VPart = VBase + VIndex
238 * if (TIndex == 0) {
239 * d = <LPart, VPart>
240 * } else {
241 * TPart = TBase + TIndex
242 * d = <LPart, TPart, VPart>
243 * }
244 */
245
246/* Constants */
247#define SB (0xAC00)
248#define LB (0x1100)
249#define VB (0x1161)
250#define TB (0x11A7)
251#define LC (19)
252#define VC (21)
253#define TC (28)
254#define NC (VC * TC)
255#define SC (LC * NC)
256
257/* Algorithmic decomposition of hangul syllable. */
258static utf8leaf_t *
259utf8hangul(const char *str, unsigned char *hangul)
260{
261 unsigned int si;
262 unsigned int li;
263 unsigned int vi;
264 unsigned int ti;
265 unsigned char *h;
266
267 /* Calculate the SI, LI, VI, and TI values. */
268 si = utf8decode3(str) - SB;
269 li = si / NC;
270 vi = (si % NC) / TC;
271 ti = si % TC;
272
273 /* Fill in base of leaf. */
274 h = hangul;
275 LEAF_GEN(h) = 2;
276 LEAF_CCC(h) = DECOMPOSE;
277 h += 2;
278
279 /* Add LPart, a 3-byte UTF-8 sequence. */
280 h += utf8encode3((char *)h, li + LB);
281
282 /* Add VPart, a 3-byte UTF-8 sequence. */
283 h += utf8encode3((char *)h, vi + VB);
284
285 /* Add TPart if required, also a 3-byte UTF-8 sequence. */
286 if (ti)
287 h += utf8encode3((char *)h, ti + TB);
288
289 /* Terminate string. */
290 h[0] = '\0';
291
292 return hangul;
293}
294
44594c2f
OW
295/*
296 * Use trie to scan s, touching at most len bytes.
297 * Returns the leaf if one exists, NULL otherwise.
298 *
299 * A non-NULL return guarantees that the UTF-8 sequence starting at s
300 * is well-formed and corresponds to a known unicode code point. The
301 * shorthand for this will be "is valid UTF-8 unicode".
302 */
6ca99ce7
CH
303static utf8leaf_t *utf8nlookup(const struct unicode_map *um,
304 enum utf8_normalization n, unsigned char *hangul, const char *s,
305 size_t len)
44594c2f 306{
2b3d0478 307 utf8trie_t *trie = um->tables->utf8data + um->ntab[n]->offset;
44594c2f
OW
308 int offlen;
309 int offset;
310 int mask;
311 int node;
312
44594c2f
OW
313 if (len == 0)
314 return NULL;
315
44594c2f
OW
316 node = 1;
317 while (node) {
318 offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT;
319 if (*trie & NEXTBYTE) {
320 if (--len == 0)
321 return NULL;
322 s++;
323 }
324 mask = 1 << (*trie & BITNUM);
325 if (*s & mask) {
326 /* Right leg */
327 if (offlen) {
328 /* Right node at offset of trie */
329 node = (*trie & RIGHTNODE);
330 offset = trie[offlen];
331 while (--offlen) {
332 offset <<= 8;
333 offset |= trie[offlen];
334 }
335 trie += offset;
336 } else if (*trie & RIGHTPATH) {
337 /* Right node after this node */
338 node = (*trie & TRIENODE);
339 trie++;
340 } else {
341 /* No right node. */
a8384c68 342 return NULL;
44594c2f
OW
343 }
344 } else {
345 /* Left leg */
346 if (offlen) {
347 /* Left node after this node. */
348 node = (*trie & LEFTNODE);
349 trie += offlen + 1;
350 } else if (*trie & RIGHTPATH) {
351 /* No left node. */
a8384c68 352 return NULL;
44594c2f
OW
353 } else {
354 /* Left node after this node */
355 node = (*trie & TRIENODE);
356 trie++;
357 }
358 }
359 }
a8384c68
OW
360 /*
361 * Hangul decomposition is done algorithmically. These are the
362 * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is
363 * always 3 bytes long, so s has been advanced twice, and the
364 * start of the sequence is at s-2.
365 */
366 if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL)
367 trie = utf8hangul(s - 2, hangul);
44594c2f
OW
368 return trie;
369}
370
371/*
372 * Use trie to scan s.
373 * Returns the leaf if one exists, NULL otherwise.
374 *
375 * Forwards to utf8nlookup().
376 */
6ca99ce7
CH
377static utf8leaf_t *utf8lookup(const struct unicode_map *um,
378 enum utf8_normalization n, unsigned char *hangul, const char *s)
44594c2f 379{
6ca99ce7 380 return utf8nlookup(um, n, hangul, s, (size_t)-1);
44594c2f
OW
381}
382
44594c2f
OW
383/*
384 * Length of the normalization of s, touch at most len bytes.
385 * Return -1 if s is not valid UTF-8 unicode.
386 */
6ca99ce7
CH
387ssize_t utf8nlen(const struct unicode_map *um, enum utf8_normalization n,
388 const char *s, size_t len)
44594c2f
OW
389{
390 utf8leaf_t *leaf;
391 size_t ret = 0;
a8384c68 392 unsigned char hangul[UTF8HANGULLEAF];
44594c2f 393
44594c2f 394 while (len && *s) {
6ca99ce7 395 leaf = utf8nlookup(um, n, hangul, s, len);
44594c2f
OW
396 if (!leaf)
397 return -1;
2b3d0478
CH
398 if (um->tables->utf8agetab[LEAF_GEN(leaf)] >
399 um->ntab[n]->maxage)
44594c2f
OW
400 ret += utf8clen(s);
401 else if (LEAF_CCC(leaf) == DECOMPOSE)
402 ret += strlen(LEAF_STR(leaf));
403 else
404 ret += utf8clen(s);
405 len -= utf8clen(s);
406 s += utf8clen(s);
407 }
408 return ret;
409}
410EXPORT_SYMBOL(utf8nlen);
411
412/*
413 * Set up an utf8cursor for use by utf8byte().
414 *
415 * u8c : pointer to cursor.
416 * data : const struct utf8data to use for normalization.
417 * s : string.
418 * len : length of s.
419 *
420 * Returns -1 on error, 0 on success.
421 */
6ca99ce7
CH
422int utf8ncursor(struct utf8cursor *u8c, const struct unicode_map *um,
423 enum utf8_normalization n, const char *s, size_t len)
44594c2f 424{
44594c2f
OW
425 if (!s)
426 return -1;
6ca99ce7
CH
427 u8c->um = um;
428 u8c->n = n;
44594c2f
OW
429 u8c->s = s;
430 u8c->p = NULL;
431 u8c->ss = NULL;
432 u8c->sp = NULL;
433 u8c->len = len;
434 u8c->slen = 0;
435 u8c->ccc = STOPPER;
436 u8c->nccc = STOPPER;
437 /* Check we didn't clobber the maximum length. */
438 if (u8c->len != len)
439 return -1;
440 /* The first byte of s may not be an utf8 continuation. */
441 if (len > 0 && (*s & 0xC0) == 0x80)
442 return -1;
443 return 0;
444}
445EXPORT_SYMBOL(utf8ncursor);
446
44594c2f
OW
447/*
448 * Get one byte from the normalized form of the string described by u8c.
449 *
450 * Returns the byte cast to an unsigned char on succes, and -1 on failure.
451 *
452 * The cursor keeps track of the location in the string in u8c->s.
453 * When a character is decomposed, the current location is stored in
454 * u8c->p, and u8c->s is set to the start of the decomposition. Note
455 * that bytes from a decomposition do not count against u8c->len.
456 *
457 * Characters are emitted if they match the current CCC in u8c->ccc.
458 * Hitting end-of-string while u8c->ccc == STOPPER means we're done,
459 * and the function returns 0 in that case.
460 *
461 * Sorting by CCC is done by repeatedly scanning the string. The
462 * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
463 * the start of the scan. The first pass finds the lowest CCC to be
464 * emitted and stores it in u8c->nccc, the second pass emits the
465 * characters with this CCC and finds the next lowest CCC. This limits
466 * the number of passes to 1 + the number of different CCCs in the
467 * sequence being scanned.
468 *
469 * Therefore:
470 * u8c->p != NULL -> a decomposition is being scanned.
471 * u8c->ss != NULL -> this is a repeating scan.
472 * u8c->ccc == -1 -> this is the first scan of a repeating scan.
473 */
474int utf8byte(struct utf8cursor *u8c)
475{
476 utf8leaf_t *leaf;
477 int ccc;
478
479 for (;;) {
480 /* Check for the end of a decomposed character. */
481 if (u8c->p && *u8c->s == '\0') {
482 u8c->s = u8c->p;
483 u8c->p = NULL;
484 }
485
486 /* Check for end-of-string. */
487 if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) {
488 /* There is no next byte. */
489 if (u8c->ccc == STOPPER)
490 return 0;
491 /* End-of-string during a scan counts as a stopper. */
492 ccc = STOPPER;
493 goto ccc_mismatch;
494 } else if ((*u8c->s & 0xC0) == 0x80) {
495 /* This is a continuation of the current character. */
496 if (!u8c->p)
497 u8c->len--;
498 return (unsigned char)*u8c->s++;
499 }
500
501 /* Look up the data for the current character. */
a8384c68 502 if (u8c->p) {
6ca99ce7 503 leaf = utf8lookup(u8c->um, u8c->n, u8c->hangul, u8c->s);
a8384c68 504 } else {
6ca99ce7 505 leaf = utf8nlookup(u8c->um, u8c->n, u8c->hangul,
a8384c68
OW
506 u8c->s, u8c->len);
507 }
44594c2f
OW
508
509 /* No leaf found implies that the input is a binary blob. */
510 if (!leaf)
511 return -1;
512
513 ccc = LEAF_CCC(leaf);
514 /* Characters that are too new have CCC 0. */
2b3d0478 515 if (u8c->um->tables->utf8agetab[LEAF_GEN(leaf)] >
6ca99ce7 516 u8c->um->ntab[u8c->n]->maxage) {
44594c2f
OW
517 ccc = STOPPER;
518 } else if (ccc == DECOMPOSE) {
519 u8c->len -= utf8clen(u8c->s);
520 u8c->p = u8c->s + utf8clen(u8c->s);
521 u8c->s = LEAF_STR(leaf);
522 /* Empty decomposition implies CCC 0. */
523 if (*u8c->s == '\0') {
524 if (u8c->ccc == STOPPER)
525 continue;
526 ccc = STOPPER;
527 goto ccc_mismatch;
528 }
a8384c68 529
6ca99ce7 530 leaf = utf8lookup(u8c->um, u8c->n, u8c->hangul, u8c->s);
15f0d8d0
TT
531 if (!leaf)
532 return -1;
a8384c68 533 ccc = LEAF_CCC(leaf);
44594c2f
OW
534 }
535
536 /*
537 * If this is not a stopper, then see if it updates
538 * the next canonical class to be emitted.
539 */
540 if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc)
541 u8c->nccc = ccc;
542
543 /*
544 * Return the current byte if this is the current
545 * combining class.
546 */
547 if (ccc == u8c->ccc) {
548 if (!u8c->p)
549 u8c->len--;
550 return (unsigned char)*u8c->s++;
551 }
552
553 /* Current combining class mismatch. */
554ccc_mismatch:
555 if (u8c->nccc == STOPPER) {
556 /*
557 * Scan forward for the first canonical class
558 * to be emitted. Save the position from
559 * which to restart.
560 */
561 u8c->ccc = MINCCC - 1;
562 u8c->nccc = ccc;
563 u8c->sp = u8c->p;
564 u8c->ss = u8c->s;
565 u8c->slen = u8c->len;
566 if (!u8c->p)
567 u8c->len -= utf8clen(u8c->s);
568 u8c->s += utf8clen(u8c->s);
569 } else if (ccc != STOPPER) {
570 /* Not a stopper, and not the ccc we're emitting. */
571 if (!u8c->p)
572 u8c->len -= utf8clen(u8c->s);
573 u8c->s += utf8clen(u8c->s);
574 } else if (u8c->nccc != MAXCCC + 1) {
575 /* At a stopper, restart for next ccc. */
576 u8c->ccc = u8c->nccc;
577 u8c->nccc = MAXCCC + 1;
578 u8c->s = u8c->ss;
579 u8c->p = u8c->sp;
580 u8c->len = u8c->slen;
581 } else {
582 /* All done, proceed from here. */
583 u8c->ccc = STOPPER;
584 u8c->nccc = STOPPER;
585 u8c->sp = NULL;
586 u8c->ss = NULL;
587 u8c->slen = 0;
588 }
589 }
590}
591EXPORT_SYMBOL(utf8byte);
This page took 0.276948 seconds and 4 git commands to generate.