/home/andy/git/oilshell/oil/data_lang/j8.h
Line | Count | Source (jump to first uncovered line) |
1 | | #ifndef DATA_LANG_J8_H |
2 | | #define DATA_LANG_J8_H |
3 | | |
4 | | #include <stdio.h> // sprintf |
5 | | #include <string.h> // memcmp |
6 | | |
7 | | #include "data_lang/utf8_impls/bjoern_dfa.h" |
8 | | |
9 | | #define J8_OUT(ch) \ |
10 | 1.11k | **p_out = (ch); \ |
11 | 1.11k | (*p_out)++ |
12 | | |
13 | | static inline int J8EncodeOne(unsigned char** p_in, unsigned char** p_out, |
14 | 1.04k | int j8_escape) { |
15 | | // Encode ONE character. We use a slightly weird double pointer style because |
16 | | // *p_in may be advanced by 1 to 4 bytes (depending on whether it's UTF-8) |
17 | | // *p_out may be advanced by 1 to 6 bytes (depending on escaping; J8_OUT writes one byte) |
18 | | |
19 | | // IMPORTANT: J8EncodeOne(), BourneShellEncodeOne(), BashDollarEncodeOne() |
20 | | // all call Bjoern DFA decode(), and there's a subtle issue: the input at *p_in |
21 | | // MUST have a NUL terminator. This is so INCOMPLETE UTF-8 sequences |
22 | | // are terminated with an INVALID byte that the state machine can process, and |
23 | | // 0x00 can only be ITSELF, never part of a sequence. An alternative would be |
24 | | // to do more bounds checks in these functions. |
25 | | |
26 | | // CALLER MUST CHECK that we are able to write up to 6 bytes, plus 1 byte for |
27 | | // the NUL that sprintf() appends. The longest output is \u001f or \u{1f} for |
28 | | // control chars, since we don't emit escapes like \u{1f926} right now |
29 | | // |
30 | | // j8_escape: Whether to use j8 escapes, i.e. LOSSLESS encoding of data |
31 | | // \yff instead of Unicode replacement char |
32 | | // \u{1} instead of \u0001 for unprintable low chars; \' is escaped instead of \" |
33 | | |
34 | | // Returns: |
35 | | // 0 wrote valid UTF-8 (escaped or copied as-is) |
36 | | // 1 the byte at *p_in was invalid UTF-8; wrote \yXX (j8_escape) or U+FFFD |
37 | | |
38 | 1.04k | unsigned char ch = **p_in; |
39 | | |
40 | | // |
41 | | // Handle \\ \b \f \n \r \t |
42 | | // |
43 | | |
44 | | // clang-format off |
45 | 1.04k | switch (ch) { |
46 | 12 | case '\\': J8_OUT('\\'); J8_OUT('\\'); (*p_in)++; return 0; |
47 | 15 | case '\b': J8_OUT('\\'); J8_OUT('b'); (*p_in)++; return 0; |
48 | 15 | case '\f': J8_OUT('\\'); J8_OUT('f'); (*p_in)++; return 0; |
49 | 15 | case '\n': J8_OUT('\\'); J8_OUT('n'); (*p_in)++; return 0; |
50 | 15 | case '\r': J8_OUT('\\'); J8_OUT('r'); (*p_in)++; return 0; |
51 | 15 | case '\t': J8_OUT('\\'); J8_OUT('t'); (*p_in)++; return 0; |
52 | 1.04k | } |
53 | | // clang-format on |
54 | | |
55 | | // |
56 | | // Conditionally handle \' and \" |
57 | | // |
58 | 961 | if (ch == '\'' && j8_escape) { // J8-style strings \' |
59 | 0 | J8_OUT('\\'); |
60 | 0 | J8_OUT('\''); |
61 | 0 | (*p_in)++; |
62 | 0 | return 0; |
63 | 0 | } |
64 | 961 | if (ch == '"' && !j8_escape) { // JSON-style strings \" |
65 | 0 | J8_OUT('\\'); |
66 | 0 | J8_OUT('"'); |
67 | 0 | (*p_in)++; |
68 | 0 | return 0; |
69 | 0 | } |
70 | | |
71 | | // |
72 | | // Unprintable ASCII control codes |
73 | | // |
74 | 961 | if (ch < 0x20) { |
75 | 189 | if (j8_escape) { |
76 | | // printf("Writing for %04x %p\n", ch, *p_out); |
77 | 75 | int n = sprintf((char*)*p_out, "\\u{%x}", ch); |
78 | | // printf("! Wrote %d bytes for %04x\n", n, ch); |
79 | 75 | *p_out += n; |
80 | 114 | } else { |
81 | | // printf("Writing for %04x %p\n", ch, *p_out); |
82 | 114 | int n = sprintf((char*)*p_out, "\\u%04x", ch); |
83 | 114 | *p_out += n; |
84 | | // printf("Wrote %d bytes for %04x\n", n, ch); |
85 | 114 | } |
86 | 189 | (*p_in)++; |
87 | 189 | return 0; |
88 | 189 | } |
89 | | |
90 | | // |
91 | | // UTF-8 encoded runes and invalid bytes |
92 | | // |
93 | 772 | unsigned char* start = *p_in; // save start position |
94 | 772 | uint32_t codepoint = 0; |
95 | 772 | uint32_t state = UTF8_ACCEPT; |
96 | | |
97 | 835 | while (1) { |
98 | 835 | decode(&state, &codepoint, ch); |
99 | | // printf(" state %d\n", state); |
100 | 835 | switch (state) { |
101 | 78 | case UTF8_REJECT: { |
102 | 78 | if (j8_escape) { |
103 | 30 | int n = sprintf((char*)*p_out, "\\y%02x", *start); |
104 | 30 | *p_out += n; |
105 | 48 | } else { |
106 | | // Unicode replacement char is U+FFFD, so write encoded form |
107 | | // >>> '\ufffd'.encode('utf-8') |
108 | | // b'\xef\xbf\xbd' |
109 | 48 | J8_OUT('\xef'); |
110 | 48 | J8_OUT('\xbf'); |
111 | 48 | J8_OUT('\xbd'); |
112 | 48 | } |
113 | 78 | (*p_in) = start; // REWIND because we might have consumed NUL terminator! |
114 | 78 | (*p_in)++; // Advance past the byte we wrote |
115 | 78 | return 1; |
116 | 0 | } |
117 | 694 | case UTF8_ACCEPT: { |
118 | 694 | (*p_in)++; |
119 | | // printf("start %p p_in %p\n", start, *p_in); |
120 | 1.41k | while (start < *p_in) { |
121 | 721 | J8_OUT(*start); |
122 | 721 | start++; |
123 | 721 | } |
124 | 694 | return 0; |
125 | 0 | } |
126 | 63 | default: |
127 | 63 | (*p_in)++; // advance, next UTF8_ACCEPT will write it |
128 | 63 | ch = **p_in; |
129 | 63 | break; |
130 | 835 | } |
131 | 835 | } |
132 | | // Unreachable |
133 | 772 | } data_lang.cc:_ZL11J8EncodeOnePPhS0_i Line | Count | Source | 14 | 710 | int j8_escape) { | 15 | | // We use a slightly weird double pointer style because | 16 | | // *p_in may be advanced by 1 to 4 bytes (depending on whether it's UTF-8) | 17 | | // *p_out may be advanced by 1 to 6 bytes (depending on escaping) | 18 | | | 19 | | // IMPORTANT: J8EncodeOne(), BourneShellEncodeOne(), BashDollarEncodeOne() | 20 | | // all call Bjoern DFA decode(), and there's a subtle issue where p_in MUST | 21 | | // have a NUL terminator is required. This is so INCOMPLETE UTF-8 sequences | 22 | | // are terminated with an INVALID byte that the state machine can accept, and | 23 | | // 0x00 can only be ITSELF, never part of a sequence. An alternative would be | 24 | | // to do more bounds checks in these functions. | 25 | | | 26 | | // CALLER MUST CHECK that we are able to write up to 6 bytes! | 27 | | // Because the longest output is \u001f or \u{1f} for control chars, since | 28 | | // we don't escapes like \u{1f926} right now | 29 | | // | 30 | | // j8_escape: Whether to use j8 escapes, i.e. 
LOSSLESS encoding of data | 31 | | // \yff instead of Unicode replacement char | 32 | | // \u{1} instead of \u0001 for unprintable low chars | 33 | | | 34 | | // Returns: | 35 | | // 0 wrote valid UTF-8 (encoded or not) | 36 | | // 1 wrote byte that's invalid UTF-8 | 37 | | | 38 | 710 | unsigned char ch = **p_in; | 39 | | | 40 | | // | 41 | | // Handle \\ \b \f \n \r \t | 42 | | // | 43 | | | 44 | | // clang-format off | 45 | 710 | switch (ch) { | 46 | 8 | case '\\': J8_OUT('\\'); J8_OUT('\\'); (*p_in)++; return 0; | 47 | 10 | case '\b': J8_OUT('\\'); J8_OUT('b'); (*p_in)++; return 0; | 48 | 10 | case '\f': J8_OUT('\\'); J8_OUT('f'); (*p_in)++; return 0; | 49 | 10 | case '\n': J8_OUT('\\'); J8_OUT('n'); (*p_in)++; return 0; | 50 | 10 | case '\r': J8_OUT('\\'); J8_OUT('r'); (*p_in)++; return 0; | 51 | 10 | case '\t': J8_OUT('\\'); J8_OUT('t'); (*p_in)++; return 0; | 52 | 710 | } | 53 | | // clang-format on | 54 | | | 55 | | // | 56 | | // Conditionally handle \' and \" | 57 | | // | 58 | 652 | if (ch == '\'' && j8_escape) { // J8-style strings \' | 59 | 0 | J8_OUT('\\'); | 60 | 0 | J8_OUT('\''); | 61 | 0 | (*p_in)++; | 62 | 0 | return 0; | 63 | 0 | } | 64 | 652 | if (ch == '"' && !j8_escape) { // JSON-style strings \" | 65 | 0 | J8_OUT('\\'); | 66 | 0 | J8_OUT('"'); | 67 | 0 | (*p_in)++; | 68 | 0 | return 0; | 69 | 0 | } | 70 | | | 71 | | // | 72 | | // Unprintable ASCII control codes | 73 | | // | 74 | 652 | if (ch < 0x20) { | 75 | 126 | if (j8_escape) { | 76 | | // printf("Writing for %04x %p\n", ch, *p_out); | 77 | 50 | int n = sprintf((char*)*p_out, "\\u{%x}", ch); | 78 | | // printf("! 
Wrote %d bytes for %04x\n", n, ch); | 79 | 50 | *p_out += n; | 80 | 76 | } else { | 81 | | // printf("Writing for %04x %p\n", ch, *p_out); | 82 | 76 | int n = sprintf((char*)*p_out, "\\u%04x", ch); | 83 | 76 | *p_out += n; | 84 | | // printf("Wrote %d bytes for %04x\n", n, ch); | 85 | 76 | } | 86 | 126 | (*p_in)++; | 87 | 126 | return 0; | 88 | 126 | } | 89 | | | 90 | | // | 91 | | // UTF-8 encoded runes and invalid bytes | 92 | | // | 93 | 526 | unsigned char* start = *p_in; // save start position | 94 | 526 | uint32_t codepoint = 0; | 95 | 526 | uint32_t state = UTF8_ACCEPT; | 96 | | | 97 | 568 | while (1) { | 98 | 568 | decode(&state, &codepoint, ch); | 99 | | // printf(" state %d\n", state); | 100 | 568 | switch (state) { | 101 | 52 | case UTF8_REJECT: { | 102 | 52 | if (j8_escape) { | 103 | 20 | int n = sprintf((char*)*p_out, "\\y%02x", *start); | 104 | 20 | *p_out += n; | 105 | 32 | } else { | 106 | | // Unicode replacement char is U+FFFD, so write encoded form | 107 | | // >>> '\ufffd'.encode('utf-8') | 108 | | // b'\xef\xbf\xbd' | 109 | 32 | J8_OUT('\xef'); | 110 | 32 | J8_OUT('\xbf'); | 111 | 32 | J8_OUT('\xbd'); | 112 | 32 | } | 113 | 52 | (*p_in) = start; // REWIND because we might have consumed NUL terminator! | 114 | 52 | (*p_in)++; // Advance past the byte we wrote | 115 | 52 | return 1; | 116 | 0 | } | 117 | 474 | case UTF8_ACCEPT: { | 118 | 474 | (*p_in)++; | 119 | | // printf("start %p p_in %p\n", start, *p_in); | 120 | 966 | while (start < *p_in) { | 121 | 492 | J8_OUT(*start); | 122 | 492 | start++; | 123 | 492 | } | 124 | 474 | return 0; | 125 | 0 | } | 126 | 42 | default: | 127 | 42 | (*p_in)++; // advance, next UTF8_ACCEPT will write it | 128 | 42 | ch = **p_in; | 129 | 42 | break; | 130 | 568 | } | 131 | 568 | } | 132 | | // Unreachable | 133 | 526 | } |
j8_libc.c:_ZL11J8EncodeOnePPhS0_i Line | Count | Source | 14 | 338 | int j8_escape) { | 15 | | // We use a slightly weird double pointer style because | 16 | | // *p_in may be advanced by 1 to 4 bytes (depending on whether it's UTF-8) | 17 | | // *p_out may be advanced by 1 to 6 bytes (depending on escaping) | 18 | | | 19 | | // IMPORTANT: J8EncodeOne(), BourneShellEncodeOne(), BashDollarEncodeOne() | 20 | | // all call Bjoern DFA decode(), and there's a subtle issue where p_in MUST | 21 | | // have a NUL terminator is required. This is so INCOMPLETE UTF-8 sequences | 22 | | // are terminated with an INVALID byte that the state machine can accept, and | 23 | | // 0x00 can only be ITSELF, never part of a sequence. An alternative would be | 24 | | // to do more bounds checks in these functions. | 25 | | | 26 | | // CALLER MUST CHECK that we are able to write up to 6 bytes! | 27 | | // Because the longest output is \u001f or \u{1f} for control chars, since | 28 | | // we don't escapes like \u{1f926} right now | 29 | | // | 30 | | // j8_escape: Whether to use j8 escapes, i.e. 
LOSSLESS encoding of data | 31 | | // \yff instead of Unicode replacement char | 32 | | // \u{1} instead of \u0001 for unprintable low chars | 33 | | | 34 | | // Returns: | 35 | | // 0 wrote valid UTF-8 (encoded or not) | 36 | | // 1 wrote byte that's invalid UTF-8 | 37 | | | 38 | 338 | unsigned char ch = **p_in; | 39 | | | 40 | | // | 41 | | // Handle \\ \b \f \n \r \t | 42 | | // | 43 | | | 44 | | // clang-format off | 45 | 338 | switch (ch) { | 46 | 4 | case '\\': J8_OUT('\\'); J8_OUT('\\'); (*p_in)++; return 0; | 47 | 5 | case '\b': J8_OUT('\\'); J8_OUT('b'); (*p_in)++; return 0; | 48 | 5 | case '\f': J8_OUT('\\'); J8_OUT('f'); (*p_in)++; return 0; | 49 | 5 | case '\n': J8_OUT('\\'); J8_OUT('n'); (*p_in)++; return 0; | 50 | 5 | case '\r': J8_OUT('\\'); J8_OUT('r'); (*p_in)++; return 0; | 51 | 5 | case '\t': J8_OUT('\\'); J8_OUT('t'); (*p_in)++; return 0; | 52 | 338 | } | 53 | | // clang-format on | 54 | | | 55 | | // | 56 | | // Conditionally handle \' and \" | 57 | | // | 58 | 309 | if (ch == '\'' && j8_escape) { // J8-style strings \' | 59 | 0 | J8_OUT('\\'); | 60 | 0 | J8_OUT('\''); | 61 | 0 | (*p_in)++; | 62 | 0 | return 0; | 63 | 0 | } | 64 | 309 | if (ch == '"' && !j8_escape) { // JSON-style strings \" | 65 | 0 | J8_OUT('\\'); | 66 | 0 | J8_OUT('"'); | 67 | 0 | (*p_in)++; | 68 | 0 | return 0; | 69 | 0 | } | 70 | | | 71 | | // | 72 | | // Unprintable ASCII control codes | 73 | | // | 74 | 309 | if (ch < 0x20) { | 75 | 63 | if (j8_escape) { | 76 | | // printf("Writing for %04x %p\n", ch, *p_out); | 77 | 25 | int n = sprintf((char*)*p_out, "\\u{%x}", ch); | 78 | | // printf("! 
Wrote %d bytes for %04x\n", n, ch); | 79 | 25 | *p_out += n; | 80 | 38 | } else { | 81 | | // printf("Writing for %04x %p\n", ch, *p_out); | 82 | 38 | int n = sprintf((char*)*p_out, "\\u%04x", ch); | 83 | 38 | *p_out += n; | 84 | | // printf("Wrote %d bytes for %04x\n", n, ch); | 85 | 38 | } | 86 | 63 | (*p_in)++; | 87 | 63 | return 0; | 88 | 63 | } | 89 | | | 90 | | // | 91 | | // UTF-8 encoded runes and invalid bytes | 92 | | // | 93 | 246 | unsigned char* start = *p_in; // save start position | 94 | 246 | uint32_t codepoint = 0; | 95 | 246 | uint32_t state = UTF8_ACCEPT; | 96 | | | 97 | 267 | while (1) { | 98 | 267 | decode(&state, &codepoint, ch); | 99 | | // printf(" state %d\n", state); | 100 | 267 | switch (state) { | 101 | 26 | case UTF8_REJECT: { | 102 | 26 | if (j8_escape) { | 103 | 10 | int n = sprintf((char*)*p_out, "\\y%02x", *start); | 104 | 10 | *p_out += n; | 105 | 16 | } else { | 106 | | // Unicode replacement char is U+FFFD, so write encoded form | 107 | | // >>> '\ufffd'.encode('utf-8') | 108 | | // b'\xef\xbf\xbd' | 109 | 16 | J8_OUT('\xef'); | 110 | 16 | J8_OUT('\xbf'); | 111 | 16 | J8_OUT('\xbd'); | 112 | 16 | } | 113 | 26 | (*p_in) = start; // REWIND because we might have consumed NUL terminator! | 114 | 26 | (*p_in)++; // Advance past the byte we wrote | 115 | 26 | return 1; | 116 | 0 | } | 117 | 220 | case UTF8_ACCEPT: { | 118 | 220 | (*p_in)++; | 119 | | // printf("start %p p_in %p\n", start, *p_in); | 120 | 449 | while (start < *p_in) { | 121 | 229 | J8_OUT(*start); | 122 | 229 | start++; | 123 | 229 | } | 124 | 220 | return 0; | 125 | 0 | } | 126 | 21 | default: | 127 | 21 | (*p_in)++; // advance, next UTF8_ACCEPT will write it | 128 | 21 | ch = **p_in; | 129 | 21 | break; | 130 | 267 | } | 131 | 267 | } | 132 | | // Unreachable | 133 | 246 | } |
|
134 | | |
135 | | // Like J8EncodeOne() above, but with these differences: |
136 | | // |
137 | | // \xff instead of \yff for invalid UTF-8 bytes |
138 | | // \u001f always, never \u{1f} |
139 | | // No j8_escape flag (no JSON vs. J8 distinction) |
140 | | // No \" escape ever; \' is always escaped |
141 | | // No errors -- it can encode everything, so it returns void |
142 | | |
143 | | static inline void BashDollarEncodeOne(unsigned char** p_in, |
144 | 0 | unsigned char** p_out) { |
145 | 0 | unsigned char ch = **p_in; |
146 | | |
147 | | // |
148 | | // Handle \\ \b \f \n \r \t \' |
149 | | // |
150 | | |
151 | | // clang-format off |
152 | 0 | switch (ch) { |
153 | 0 | case '\\': J8_OUT('\\'); J8_OUT('\\'); (*p_in)++; return; |
154 | 0 | case '\b': J8_OUT('\\'); J8_OUT('b'); (*p_in)++; return; |
155 | 0 | case '\f': J8_OUT('\\'); J8_OUT('f'); (*p_in)++; return; |
156 | 0 | case '\n': J8_OUT('\\'); J8_OUT('n'); (*p_in)++; return; |
157 | 0 | case '\r': J8_OUT('\\'); J8_OUT('r'); (*p_in)++; return; |
158 | 0 | case '\t': J8_OUT('\\'); J8_OUT('t'); (*p_in)++; return; |
159 | 0 | case '\'': J8_OUT('\\'); J8_OUT('\''); (*p_in)++; return; |
160 | 0 | } |
161 | | // clang-format on |
162 | | |
163 | | // |
164 | | // Unprintable ASCII control codes |
165 | | // |
166 | 0 | if (ch < 0x20) { |
167 | | // printf("Writing for %04x %p\n", ch, *p_out); |
168 | 0 | int n = sprintf((char*)*p_out, "\\u%04x", ch); |
169 | 0 | *p_out += n; |
170 | | // printf("Wrote %d bytes for %04x\n", n, ch); |
171 | 0 | (*p_in)++; |
172 | 0 | return; |
173 | 0 | } |
174 | | |
175 | | // |
176 | | // UTF-8 encoded runes and invalid bytes |
177 | | // |
178 | 0 | unsigned char* start = *p_in; // save start position |
179 | 0 | uint32_t codepoint = 0; |
180 | 0 | uint32_t state = UTF8_ACCEPT; |
181 |

182 | 0 | while (1) { |
183 | | // unsigned char byte = **p_in; |
184 | 0 | decode(&state, &codepoint, ch); |
185 | | // printf(" state %d ch %d\n", state, ch); |
186 | 0 | switch (state) { |
187 | | // NOTE: we don't reject IMMEDIATELY |
188 | | // |
189 | | // We could be in another state for up to 4 chars |
190 | | // And then we hit REJECT |
191 | | // Emitting every consumed byte then would need \xff\xff\xff\xff, i.e. SIXTEEN bytes at once |
192 | | // Instead the case below escapes only *start and rewinds, so the rest is retried on later calls |
193 | | |
194 | 0 | case UTF8_REJECT: { |
195 | 0 | int n = sprintf((char*)*p_out, "\\x%02x", *start); |
196 | 0 | *p_out += n; |
197 | 0 | (*p_in) = start; // REWIND because we might have consumed NUL terminator! |
198 | 0 | (*p_in)++; // Advance past the byte we wrote |
199 | 0 | return; |
200 | 0 | } |
201 | 0 | case UTF8_ACCEPT: { |
202 | 0 | (*p_in)++; |
203 | | // printf("start %p p_in %p\n", start, *p_in); |
204 | 0 | while (start < *p_in) { |
205 | 0 | J8_OUT(*start); |
206 | 0 | start++; |
207 | 0 | } |
208 | 0 | return; |
209 | 0 | } |
210 | 0 | default: |
211 | 0 | (*p_in)++; // advance, next UTF8_ACCEPT will write it |
212 | 0 | ch = **p_in; |
213 | | // printf(" => ch %d\n", ch); |
214 | 0 | break; |
215 | 0 | } |
216 | 0 | } |
217 | | // Unreachable |
218 | 0 | } Unexecuted instantiation: data_lang.cc:_ZL19BashDollarEncodeOnePPhS0_ Unexecuted instantiation: j8_libc.c:_ZL19BashDollarEncodeOnePPhS0_ |
219 | | |
220 | | // BourneShellEncodeOne: copies one character unchanged, or returns 1 if it breaks a rule. |
221 | | // Rules (returns 0 after copying; returns 1 and writes nothing otherwise): |
222 | | // must be valid UTF-8 |
223 | | // no control chars |
224 | | // no ' -- this one is required |
225 | | // no \ -- not required, but avoids ambiguous '\n' |
226 | | // |
227 | | // For example we write $'\\' or b'\\' not '\' |
228 | | // The latter should be written r'\', but we're not outputting that form (TODO confirm; original sentence was unfinished) |
229 | | |
230 | | static inline int BourneShellEncodeOne(unsigned char** p_in, |
231 | 0 | unsigned char** p_out) { |
232 | 0 | unsigned char ch = **p_in; |
233 |

234 | 0 | if (ch == '\'' || ch == '\\') { // can't encode these in Bourne shell '' |
235 | 0 | return 1; |
236 | 0 | } |
237 | 0 | if (ch < 0x20) { // Unprintable ASCII control codes |
238 | 0 | return 1; |
239 | 0 | } |
240 | | |
241 | | // UTF-8 encoded runes and invalid bytes |
242 | 0 | unsigned char* start = *p_in; // save start position |
243 | 0 | uint32_t codepoint = 0; |
244 | 0 | uint32_t state = UTF8_ACCEPT; |
245 |

246 | 0 | while (1) { |
247 | 0 | decode(&state, &codepoint, ch); |
248 | | // NOTE: on REJECT we return 1 without rewinding *p_in and without writing anything |
249 | 0 | switch (state) { |
250 | 0 | case UTF8_REJECT: { |
251 | 0 | return 1; |
252 | 0 | } |
253 | 0 | case UTF8_ACCEPT: { |
254 | 0 | (*p_in)++; |
255 | | // printf("start %p p_in %p\n", start, *p_in); |
256 | 0 | while (start < *p_in) { |
257 | 0 | J8_OUT(*start); |
258 | 0 | start++; |
259 | 0 | } |
260 | 0 | return 0; |
261 | 0 | } |
262 | 0 | default: |
263 | 0 | (*p_in)++; // advance, next UTF8_ACCEPT will write it |
264 | 0 | ch = **p_in; |
265 | 0 | break; |
266 | 0 | } |
267 | 0 | } |
268 | | // Unreachable |
269 | 0 | } Unexecuted instantiation: data_lang.cc:_ZL20BourneShellEncodeOnePPhS0_ Unexecuted instantiation: j8_libc.c:_ZL20BourneShellEncodeOnePPhS0_ |
270 | | |
271 | | // Right now \u001f and \u{1f} are the longest output sequences for a byte. |
272 | | // Bug fix: we need 6 + 1 for the NUL terminator that sprintf() writes! (The NUL |
273 | | // is overwritten by the next write, but sprintf() stores it, so the room must exist.) |
274 | | |
275 | | // Bug: we may need up to 16 bytes: \yaa\yaa\yaa\yaa |
276 | | // If this is too small, we would enter an infinite loop |
277 | | // +1 for NUL terminator. All three *EncodeChunk() loops below require this much room before each step. |
278 | | |
279 | 1.10k | #define J8_MAX_BYTES_PER_INPUT_BYTE 7 |
280 | | |
281 | | // The minimum capacity must be more than the number above. |
282 | | // TODO: Tune this for our allocator? We call buf->EnsureMoreSpace(capacity); |
283 | 166 | #define J8_MIN_CAPACITY 16 |
284 | | |
285 | | static inline int J8EncodeChunk(unsigned char** p_in, unsigned char* in_end, |
286 | | unsigned char** p_out, unsigned char* out_end, |
287 | 187 | int j8_escape) { |
288 | 1.18k | while (*p_in < in_end && (*p_out + J8_MAX_BYTES_PER_INPUT_BYTE) <= out_end) { |
289 | | // Loop until the input is used up or fewer than J8_MAX_BYTES_PER_INPUT_BYTE output bytes remain |
290 | 1.04k | int invalid_utf8 = J8EncodeOne(p_in, p_out, j8_escape); |
291 | 1.04k | if (invalid_utf8 && !j8_escape) { // first JSON pass got binary data? |
292 | 48 | return invalid_utf8; // early return |
293 | 48 | } |
294 | 1.04k | } |
295 | 139 | return 0; |
296 | 187 | } data_lang.cc:_ZL13J8EncodeChunkPPhS_S0_S_i Line | Count | Source | 287 | 131 | int j8_escape) { | 288 | 809 | while (*p_in < in_end && (*p_out + J8_MAX_BYTES_PER_INPUT_BYTE) <= out_end) { | 289 | | // printf("iter %d %p < %p \n", i++, *p_out, out_end); | 290 | 710 | int invalid_utf8 = J8EncodeOne(p_in, p_out, j8_escape); | 291 | 710 | if (invalid_utf8 && !j8_escape) { // first JSON pass got binary data? | 292 | 32 | return invalid_utf8; // early return | 293 | 32 | } | 294 | 710 | } | 295 | 99 | return 0; | 296 | 131 | } |
j8_libc.c:_ZL13J8EncodeChunkPPhS_S0_S_i Line | Count | Source | 287 | 56 | int j8_escape) { | 288 | 378 | while (*p_in < in_end && (*p_out + J8_MAX_BYTES_PER_INPUT_BYTE) <= out_end) { | 289 | | // printf("iter %d %p < %p \n", i++, *p_out, out_end); | 290 | 338 | int invalid_utf8 = J8EncodeOne(p_in, p_out, j8_escape); | 291 | 338 | if (invalid_utf8 && !j8_escape) { // first JSON pass got binary data? | 292 | 16 | return invalid_utf8; // early return | 293 | 16 | } | 294 | 338 | } | 295 | 40 | return 0; | 296 | 56 | } |
|
297 | | |
298 | | static inline int BashDollarEncodeChunk(unsigned char** p_in, |
299 | | unsigned char* in_end, |
300 | | unsigned char** p_out, |
301 | 0 | unsigned char* out_end) { |
302 | 0 | while (*p_in < in_end && (*p_out + J8_MAX_BYTES_PER_INPUT_BYTE) <= out_end) { |
303 | 0 | BashDollarEncodeOne(p_in, p_out); |
304 | 0 | } |
305 | 0 | return 0; |
306 | 0 | } Unexecuted instantiation: data_lang.cc:_ZL21BashDollarEncodeChunkPPhS_S0_S_ Unexecuted instantiation: j8_libc.c:_ZL21BashDollarEncodeChunkPPhS_S0_S_ |
307 | | |
308 | | static inline int BourneShellEncodeChunk(unsigned char** p_in, |
309 | | unsigned char* in_end, |
310 | | unsigned char** p_out, |
311 | 0 | unsigned char* out_end) { |
312 | 0 | while (*p_in < in_end && (*p_out + J8_MAX_BYTES_PER_INPUT_BYTE) <= out_end) { |
313 | 0 | int cannot_encode = BourneShellEncodeOne(p_in, p_out); |
314 | 0 | if (cannot_encode) { // we need escaping, e.g. \u0001 or \' |
315 | 0 | return cannot_encode; // early return |
316 | 0 | } |
317 | 0 | } |
318 | 0 | return 0; |
319 | 0 | } Unexecuted instantiation: data_lang.cc:_ZL22BourneShellEncodeChunkPPhS_S0_S_ Unexecuted instantiation: j8_libc.c:_ZL22BourneShellEncodeChunkPPhS_S0_S_ |
320 | | |
321 | 82 | static inline int CanOmitQuotes(unsigned char* s, int len) { |
322 | 82 | if (len == 0) { // empty string has to be quoted |
323 | 1 | return 0; |
324 | 1 | } |
325 | | |
326 | | // 3 special case keywords (null, true, false) always keep their quotes |
327 | 81 | if (len == 4) { |
328 | 0 | if (memcmp(s, "null", 4) == 0) { |
329 | 0 | return 0; |
330 | 0 | } |
331 | 0 | if (memcmp(s, "true", 4) == 0) { |
332 | 0 | return 0; |
333 | 0 | } |
334 | 0 | } |
335 | 81 | if (len == 5) { |
336 | 0 | if (memcmp(s, "false", 5) == 0) { |
337 | 0 | return 0; |
338 | 0 | } |
339 | 0 | } |
340 | | |
341 | 129 | for (int i = 0; i < len; ++i) { |
342 | 82 | unsigned char ch = s[i]; |
343 | | |
344 | | // Every byte must match the regex [a-zA-Z0-9./_-] |
345 | 82 | if ('a' <= ch && ch <= 'z') { |
346 | 21 | continue; |
347 | 21 | } |
348 | 61 | if ('A' <= ch && ch <= 'Z') { |
349 | 0 | continue; |
350 | 0 | } |
351 | 61 | if ('0' <= ch && ch <= '9') { |
352 | 26 | continue; |
353 | 26 | } |
354 | 35 | if (ch == '.' || ch == '/' || ch == '_' || ch == '-') { |
355 | 1 | continue; |
356 | 1 | } |
357 | | // any other byte requires quotes, so return 0 |
358 | | // Not including UTF-8 here because it can have chars that look like space |
359 | | // or quotes |
360 | 34 | return 0; |
361 | 35 | } |
362 | 47 | return 1; // everything OK: the quotes can be omitted |
363 | 81 | } data_lang.cc:_ZL13CanOmitQuotesPhi Line | Count | Source | 321 | 82 | static inline int CanOmitQuotes(unsigned char* s, int len) { | 322 | 82 | if (len == 0) { // empty string has to be quoted | 323 | 1 | return 0; | 324 | 1 | } | 325 | | | 326 | | // 3 special case keywords | 327 | 81 | if (len == 4) { | 328 | 0 | if (memcmp(s, "null", 4) == 0) { | 329 | 0 | return 0; | 330 | 0 | } | 331 | 0 | if (memcmp(s, "true", 4) == 0) { | 332 | 0 | return 0; | 333 | 0 | } | 334 | 0 | } | 335 | 81 | if (len == 5) { | 336 | 0 | if (memcmp(s, "false", 5) == 0) { | 337 | 0 | return 0; | 338 | 0 | } | 339 | 0 | } | 340 | | | 341 | 129 | for (int i = 0; i < len; ++i) { | 342 | 82 | unsigned char ch = s[i]; | 343 | | | 344 | | // Corresponds to regex [a-zA-Z0-9./_-] | 345 | 82 | if ('a' <= ch && ch <= 'z') { | 346 | 21 | continue; | 347 | 21 | } | 348 | 61 | if ('A' <= ch && ch <= 'Z') { | 349 | 0 | continue; | 350 | 0 | } | 351 | 61 | if ('0' <= ch && ch <= '9') { | 352 | 26 | continue; | 353 | 26 | } | 354 | 35 | if (ch == '.' || ch == '/' || ch == '_' || ch == '-') { | 355 | 1 | continue; | 356 | 1 | } | 357 | | // some byte requires quotes | 358 | | // Not including UTF-8 here because it can have chars that look like space | 359 | | // or quotes | 360 | 34 | return 0; | 361 | 35 | } | 362 | 47 | return 1; // everything OK | 363 | 81 | } |
Unexecuted instantiation: j8_libc.c:_ZL13CanOmitQuotesPhi |
364 | | |
365 | | #endif // DATA_LANG_J8_H |