/home/andy/git/oilshell/oil/data_lang/utf8_impls/bjoern_dfa.h
Line | Count | Source |
1 | | #ifndef BJOERN_DFA |
2 | | #define BJOERN_DFA |
3 | | |
4 | | // Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de> |
5 | | // See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. |
6 | | |
7 | | #include "stdint.h" |
8 | | |
9 | | #if 0 |
10 | | |
#define UTF8_ACCEPT 0
#define UTF8_REJECT 1

// Legacy table layout, kept for reference (this branch is disabled by the
// enclosing "#if 0").  The first 256 entries map a byte to a character
// class; the remaining entries are a transition table with 16 entries per
// state, indexed by state * 16 + class.
static const uint8_t utf8d[] = {
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
  8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
  0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
  0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff

  0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
  1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
  1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
  1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
};

// Feed one byte to the UTF-8 decoding automaton.
//
//   state  in/out: current automaton state; start a new sequence with
//          UTF8_ACCEPT.
//   codep  in/out: code point being accumulated; it is overwritten when
//          *state is UTF8_ACCEPT on entry and extended by six bits otherwise.
//   byte   next input byte; only the low 8 bits select the character class.
//
// Returns the new value of *state.  Once *state is UTF8_REJECT it stays
// UTF8_REJECT for every following byte (row s1 of the table is all 1s).
//
// "static inline" (rather than plain "inline") so that every translation
// unit including this header gets its own definition and non-inlined calls
// still link.
static inline uint32_t
decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
  // Mask the table index: a sign-extended char (e.g. 0xFFFFFFC3) would
  // otherwise read far outside the 256-entry class table.
  uint32_t type = utf8d[byte & 0xffu];

  *codep = (*state != UTF8_ACCEPT) ?
    (byte & 0x3fu) | (*codep << 6) :
    (0xffu >> type) & (byte);

  *state = utf8d[256 + *state*16 + type];
  return *state;
}
43 | | |
44 | | #else |
45 | | |
46 | | // Newer version, lower on the page |
47 | | |
#define UTF8_ACCEPT 0
#define UTF8_REJECT 12

static const uint8_t utf8d[] = {
  // The first part of the table maps bytes to character classes, which
  // reduce the size of the transition table and are used to create bitmasks.
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
   8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,

  // The second part is a transition table that maps a combination
  // of a state of the automaton and a character class to a state.
  // State values are multiples of 12 (the number of character classes),
  // so "state + class" indexes the table directly.
   0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
  12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
  12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
  12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
  12,36,12,12,12,12,12,12,12,12,12,12,
};

// Feed one byte to the UTF-8 decoding automaton.
//
//   state  in/out: current automaton state; start a new sequence with
//          UTF8_ACCEPT.  Must hold a value previously produced by this
//          function (or UTF8_ACCEPT), since it is used as a table offset.
//   codep  in/out: code point being accumulated; it is overwritten when
//          *state is UTF8_ACCEPT on entry and extended by six bits otherwise.
//   byte   next input byte; only the low 8 bits select the character class.
//
// Returns the new value of *state.  Once *state is UTF8_REJECT it stays
// UTF8_REJECT for every following byte (the row at offset 12 is all 12s).
static inline
uint32_t decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
  // Mask the table index: a sign-extended char (e.g. 0xFFFFFFC3) would
  // otherwise read far outside the 256-entry class table.
  uint32_t type = utf8d[byte & 0xffu];

  // Continuation: append the low six payload bits.
  // Sequence start: the class doubles as a shift count that strips the
  // length-marker bits from the lead byte.
  *codep = (*state != UTF8_ACCEPT) ?
    (byte & 0x3fu) | (*codep << 6) :
    (0xffu >> type) & (byte);

  *state = utf8d[256 + *state + type];
  return *state;
}
83 | | |
84 | | #endif |
85 | | |
86 | | #endif // BJOERN_DFA |