cpp

Coverage Report

Created: 2024-03-13 14:13

/home/andy/git/oilshell/oil/data_lang/j8.h
Line
Count
Source (jump to first uncovered line)
1
#ifndef DATA_LANG_J8_H
2
#define DATA_LANG_J8_H
3
4
#include <stdio.h>   // sprintf
5
#include <string.h>  // memcmp
6
7
#include "data_lang/utf8_impls/bjoern_dfa.h"
8
9
#define J8_OUT(ch) \
10
1.11k
  **p_out = (ch);  \
11
1.11k
  (*p_out)++
12
13
static inline int J8EncodeOne(unsigned char** p_in, unsigned char** p_out,
14
1.04k
                              int j8_escape) {
15
  // We use a slightly weird double pointer style because
16
  //   *p_in may be advanced by 1 to 4 bytes (depending on whether it's UTF-8)
17
  //   *p_out may be advanced by 1 to 6 bytes (depending on escaping)
18
19
  // IMPORTANT: J8EncodeOne(), BourneShellEncodeOne(), BashDollarEncodeOne()
20
  // all call Bjoern DFA decode(), and there's a subtle issue where p_in MUST
21
  // have a NUL terminator. This is so INCOMPLETE UTF-8 sequences
22
  // are terminated with an INVALID byte that the state machine can accept, and
23
  // 0x00 can only be ITSELF, never part of a sequence. An alternative would be
24
  // to do more bounds checks in these functions.
25
26
  // CALLER MUST CHECK that we are able to write up to 6 bytes!
27
  //   Because the longest output is \u001f or \u{1f} for control chars, since
28
  //   we don't emit escapes like \u{1f926} right now
29
  //
30
  // j8_escape: Whether to use j8 escapes, i.e. LOSSLESS encoding of data
31
  //   \yff instead of Unicode replacement char
32
  //   \u{1} instead of \u0001 for unprintable low chars
33
34
  // Returns:
35
  //   0   wrote valid UTF-8 (encoded or not)
36
  //   1   wrote byte that's invalid UTF-8
37
38
1.04k
  unsigned char ch = **p_in;
39
40
  //
41
  // Handle \\ \b \f \n \r \t
42
  //
43
44
  // clang-format off
45
1.04k
  switch (ch) {
46
12
  case '\\': J8_OUT('\\'); J8_OUT('\\'); (*p_in)++; return 0;
47
15
  case '\b': J8_OUT('\\'); J8_OUT('b'); (*p_in)++; return 0;
48
15
  case '\f': J8_OUT('\\'); J8_OUT('f'); (*p_in)++; return 0;
49
15
  case '\n': J8_OUT('\\'); J8_OUT('n'); (*p_in)++; return 0;
50
15
  case '\r': J8_OUT('\\'); J8_OUT('r'); (*p_in)++; return 0;
51
15
  case '\t': J8_OUT('\\'); J8_OUT('t'); (*p_in)++; return 0;
52
1.04k
  }
53
  // clang-format on
54
55
  //
56
  // Conditionally handle \' and \"
57
  //
58
961
  if (ch == '\'' && j8_escape) {  // J8-style strings \'
59
0
    J8_OUT('\\');
60
0
    J8_OUT('\'');
61
0
    (*p_in)++;
62
0
    return 0;
63
0
  }
64
961
  if (ch == '"' && !j8_escape) {  // JSON-style strings \"
65
0
    J8_OUT('\\');
66
0
    J8_OUT('"');
67
0
    (*p_in)++;
68
0
    return 0;
69
0
  }
70
71
  //
72
  // Unprintable ASCII control codes
73
  //
74
961
  if (ch < 0x20) {
75
189
    if (j8_escape) {
76
      // printf("Writing for %04x %p\n", ch, *p_out);
77
75
      int n = sprintf((char*)*p_out, "\\u{%x}", ch);
78
      // printf("! Wrote %d bytes for %04x\n", n, ch);
79
75
      *p_out += n;
80
114
    } else {
81
      // printf("Writing for %04x %p\n", ch, *p_out);
82
114
      int n = sprintf((char*)*p_out, "\\u%04x", ch);
83
114
      *p_out += n;
84
      // printf("Wrote %d bytes for %04x\n", n, ch);
85
114
    }
86
189
    (*p_in)++;
87
189
    return 0;
88
189
  }
89
90
  //
91
  // UTF-8 encoded runes and invalid bytes
92
  //
93
772
  unsigned char* start = *p_in;  // save start position
94
772
  uint32_t codepoint = 0;
95
772
  uint32_t state = UTF8_ACCEPT;
96
97
835
  while (1) {
98
835
    decode(&state, &codepoint, ch);
99
    // printf("  state %d\n", state);
100
835
    switch (state) {
101
78
    case UTF8_REJECT: {
102
78
      if (j8_escape) {
103
30
        int n = sprintf((char*)*p_out, "\\y%02x", *start);
104
30
        *p_out += n;
105
48
      } else {
106
        // Unicode replacement char is U+FFFD, so write encoded form
107
        // >>> '\ufffd'.encode('utf-8')
108
        // b'\xef\xbf\xbd'
109
48
        J8_OUT('\xef');
110
48
        J8_OUT('\xbf');
111
48
        J8_OUT('\xbd');
112
48
      }
113
78
      (*p_in) = start;  // REWIND because we might have consumed NUL terminator!
114
78
      (*p_in)++;        // Advance past the byte we wrote
115
78
      return 1;
116
0
    }
117
694
    case UTF8_ACCEPT: {
118
694
      (*p_in)++;
119
      // printf("start %p p_in %p\n", start, *p_in);
120
1.41k
      while (start < *p_in) {
121
721
        J8_OUT(*start);
122
721
        start++;
123
721
      }
124
694
      return 0;
125
0
    }
126
63
    default:
127
63
      (*p_in)++;  // advance, next UTF8_ACCEPT will write it
128
63
      ch = **p_in;
129
63
      break;
130
835
    }
131
835
  }
132
  // Unreachable
133
772
}
data_lang.cc:_ZL11J8EncodeOnePPhS0_i
Line
Count
Source
14
710
                              int j8_escape) {
15
  // We use a slightly weird double pointer style because
16
  //   *p_in may be advanced by 1 to 4 bytes (depending on whether it's UTF-8)
17
  //   *p_out may be advanced by 1 to 6 bytes (depending on escaping)
18
19
  // IMPORTANT: J8EncodeOne(), BourneShellEncodeOne(), BashDollarEncodeOne()
20
  // all call Bjoern DFA decode(), and there's a subtle issue where p_in MUST
21
  // have a NUL terminator. This is so INCOMPLETE UTF-8 sequences
22
  // are terminated with an INVALID byte that the state machine can accept, and
23
  // 0x00 can only be ITSELF, never part of a sequence. An alternative would be
24
  // to do more bounds checks in these functions.
25
26
  // CALLER MUST CHECK that we are able to write up to 6 bytes!
27
  //   Because the longest output is \u001f or \u{1f} for control chars, since
28
  //   we don't emit escapes like \u{1f926} right now
29
  //
30
  // j8_escape: Whether to use j8 escapes, i.e. LOSSLESS encoding of data
31
  //   \yff instead of Unicode replacement char
32
  //   \u{1} instead of \u0001 for unprintable low chars
33
34
  // Returns:
35
  //   0   wrote valid UTF-8 (encoded or not)
36
  //   1   wrote byte that's invalid UTF-8
37
38
710
  unsigned char ch = **p_in;
39
40
  //
41
  // Handle \\ \b \f \n \r \t
42
  //
43
44
  // clang-format off
45
710
  switch (ch) {
46
8
  case '\\': J8_OUT('\\'); J8_OUT('\\'); (*p_in)++; return 0;
47
10
  case '\b': J8_OUT('\\'); J8_OUT('b'); (*p_in)++; return 0;
48
10
  case '\f': J8_OUT('\\'); J8_OUT('f'); (*p_in)++; return 0;
49
10
  case '\n': J8_OUT('\\'); J8_OUT('n'); (*p_in)++; return 0;
50
10
  case '\r': J8_OUT('\\'); J8_OUT('r'); (*p_in)++; return 0;
51
10
  case '\t': J8_OUT('\\'); J8_OUT('t'); (*p_in)++; return 0;
52
710
  }
53
  // clang-format on
54
55
  //
56
  // Conditionally handle \' and \"
57
  //
58
652
  if (ch == '\'' && j8_escape) {  // J8-style strings \'
59
0
    J8_OUT('\\');
60
0
    J8_OUT('\'');
61
0
    (*p_in)++;
62
0
    return 0;
63
0
  }
64
652
  if (ch == '"' && !j8_escape) {  // JSON-style strings \"
65
0
    J8_OUT('\\');
66
0
    J8_OUT('"');
67
0
    (*p_in)++;
68
0
    return 0;
69
0
  }
70
71
  //
72
  // Unprintable ASCII control codes
73
  //
74
652
  if (ch < 0x20) {
75
126
    if (j8_escape) {
76
      // printf("Writing for %04x %p\n", ch, *p_out);
77
50
      int n = sprintf((char*)*p_out, "\\u{%x}", ch);
78
      // printf("! Wrote %d bytes for %04x\n", n, ch);
79
50
      *p_out += n;
80
76
    } else {
81
      // printf("Writing for %04x %p\n", ch, *p_out);
82
76
      int n = sprintf((char*)*p_out, "\\u%04x", ch);
83
76
      *p_out += n;
84
      // printf("Wrote %d bytes for %04x\n", n, ch);
85
76
    }
86
126
    (*p_in)++;
87
126
    return 0;
88
126
  }
89
90
  //
91
  // UTF-8 encoded runes and invalid bytes
92
  //
93
526
  unsigned char* start = *p_in;  // save start position
94
526
  uint32_t codepoint = 0;
95
526
  uint32_t state = UTF8_ACCEPT;
96
97
568
  while (1) {
98
568
    decode(&state, &codepoint, ch);
99
    // printf("  state %d\n", state);
100
568
    switch (state) {
101
52
    case UTF8_REJECT: {
102
52
      if (j8_escape) {
103
20
        int n = sprintf((char*)*p_out, "\\y%02x", *start);
104
20
        *p_out += n;
105
32
      } else {
106
        // Unicode replacement char is U+FFFD, so write encoded form
107
        // >>> '\ufffd'.encode('utf-8')
108
        // b'\xef\xbf\xbd'
109
32
        J8_OUT('\xef');
110
32
        J8_OUT('\xbf');
111
32
        J8_OUT('\xbd');
112
32
      }
113
52
      (*p_in) = start;  // REWIND because we might have consumed NUL terminator!
114
52
      (*p_in)++;        // Advance past the byte we wrote
115
52
      return 1;
116
0
    }
117
474
    case UTF8_ACCEPT: {
118
474
      (*p_in)++;
119
      // printf("start %p p_in %p\n", start, *p_in);
120
966
      while (start < *p_in) {
121
492
        J8_OUT(*start);
122
492
        start++;
123
492
      }
124
474
      return 0;
125
0
    }
126
42
    default:
127
42
      (*p_in)++;  // advance, next UTF8_ACCEPT will write it
128
42
      ch = **p_in;
129
42
      break;
130
568
    }
131
568
  }
132
  // Unreachable
133
526
}
j8_libc.c:_ZL11J8EncodeOnePPhS0_i
Line
Count
Source
14
338
                              int j8_escape) {
15
  // We use a slightly weird double pointer style because
16
  //   *p_in may be advanced by 1 to 4 bytes (depending on whether it's UTF-8)
17
  //   *p_out may be advanced by 1 to 6 bytes (depending on escaping)
18
19
  // IMPORTANT: J8EncodeOne(), BourneShellEncodeOne(), BashDollarEncodeOne()
20
  // all call Bjoern DFA decode(), and there's a subtle issue where p_in MUST
21
  // have a NUL terminator. This is so INCOMPLETE UTF-8 sequences
22
  // are terminated with an INVALID byte that the state machine can accept, and
23
  // 0x00 can only be ITSELF, never part of a sequence. An alternative would be
24
  // to do more bounds checks in these functions.
25
26
  // CALLER MUST CHECK that we are able to write up to 6 bytes!
27
  //   Because the longest output is \u001f or \u{1f} for control chars, since
28
  //   we don't emit escapes like \u{1f926} right now
29
  //
30
  // j8_escape: Whether to use j8 escapes, i.e. LOSSLESS encoding of data
31
  //   \yff instead of Unicode replacement char
32
  //   \u{1} instead of \u0001 for unprintable low chars
33
34
  // Returns:
35
  //   0   wrote valid UTF-8 (encoded or not)
36
  //   1   wrote byte that's invalid UTF-8
37
38
338
  unsigned char ch = **p_in;
39
40
  //
41
  // Handle \\ \b \f \n \r \t
42
  //
43
44
  // clang-format off
45
338
  switch (ch) {
46
4
  case '\\': J8_OUT('\\'); J8_OUT('\\'); (*p_in)++; return 0;
47
5
  case '\b': J8_OUT('\\'); J8_OUT('b'); (*p_in)++; return 0;
48
5
  case '\f': J8_OUT('\\'); J8_OUT('f'); (*p_in)++; return 0;
49
5
  case '\n': J8_OUT('\\'); J8_OUT('n'); (*p_in)++; return 0;
50
5
  case '\r': J8_OUT('\\'); J8_OUT('r'); (*p_in)++; return 0;
51
5
  case '\t': J8_OUT('\\'); J8_OUT('t'); (*p_in)++; return 0;
52
338
  }
53
  // clang-format on
54
55
  //
56
  // Conditionally handle \' and \"
57
  //
58
309
  if (ch == '\'' && j8_escape) {  // J8-style strings \'
59
0
    J8_OUT('\\');
60
0
    J8_OUT('\'');
61
0
    (*p_in)++;
62
0
    return 0;
63
0
  }
64
309
  if (ch == '"' && !j8_escape) {  // JSON-style strings \"
65
0
    J8_OUT('\\');
66
0
    J8_OUT('"');
67
0
    (*p_in)++;
68
0
    return 0;
69
0
  }
70
71
  //
72
  // Unprintable ASCII control codes
73
  //
74
309
  if (ch < 0x20) {
75
63
    if (j8_escape) {
76
      // printf("Writing for %04x %p\n", ch, *p_out);
77
25
      int n = sprintf((char*)*p_out, "\\u{%x}", ch);
78
      // printf("! Wrote %d bytes for %04x\n", n, ch);
79
25
      *p_out += n;
80
38
    } else {
81
      // printf("Writing for %04x %p\n", ch, *p_out);
82
38
      int n = sprintf((char*)*p_out, "\\u%04x", ch);
83
38
      *p_out += n;
84
      // printf("Wrote %d bytes for %04x\n", n, ch);
85
38
    }
86
63
    (*p_in)++;
87
63
    return 0;
88
63
  }
89
90
  //
91
  // UTF-8 encoded runes and invalid bytes
92
  //
93
246
  unsigned char* start = *p_in;  // save start position
94
246
  uint32_t codepoint = 0;
95
246
  uint32_t state = UTF8_ACCEPT;
96
97
267
  while (1) {
98
267
    decode(&state, &codepoint, ch);
99
    // printf("  state %d\n", state);
100
267
    switch (state) {
101
26
    case UTF8_REJECT: {
102
26
      if (j8_escape) {
103
10
        int n = sprintf((char*)*p_out, "\\y%02x", *start);
104
10
        *p_out += n;
105
16
      } else {
106
        // Unicode replacement char is U+FFFD, so write encoded form
107
        // >>> '\ufffd'.encode('utf-8')
108
        // b'\xef\xbf\xbd'
109
16
        J8_OUT('\xef');
110
16
        J8_OUT('\xbf');
111
16
        J8_OUT('\xbd');
112
16
      }
113
26
      (*p_in) = start;  // REWIND because we might have consumed NUL terminator!
114
26
      (*p_in)++;        // Advance past the byte we wrote
115
26
      return 1;
116
0
    }
117
220
    case UTF8_ACCEPT: {
118
220
      (*p_in)++;
119
      // printf("start %p p_in %p\n", start, *p_in);
120
449
      while (start < *p_in) {
121
229
        J8_OUT(*start);
122
229
        start++;
123
229
      }
124
220
      return 0;
125
0
    }
126
21
    default:
127
21
      (*p_in)++;  // advance, next UTF8_ACCEPT will write it
128
21
      ch = **p_in;
129
21
      break;
130
267
    }
131
267
  }
132
  // Unreachable
133
246
}
134
135
// Like the above, but
136
//
137
//   \xff instead of \yff
138
//   \u001f always, never \u{1f}
139
//   No JSON vs. J8
140
//     No \" escape ever
141
//     No errors -- it can encode everything
142
143
static inline void BashDollarEncodeOne(unsigned char** p_in,
144
0
                                       unsigned char** p_out) {
145
0
  unsigned char ch = **p_in;
146
147
  //
148
  // Handle \\ \b \f \n \r \t \'
149
  //
150
151
  // clang-format off
152
0
  switch (ch) {
153
0
  case '\\': J8_OUT('\\'); J8_OUT('\\'); (*p_in)++; return;
154
0
  case '\b': J8_OUT('\\'); J8_OUT('b'); (*p_in)++; return;
155
0
  case '\f': J8_OUT('\\'); J8_OUT('f'); (*p_in)++; return;
156
0
  case '\n': J8_OUT('\\'); J8_OUT('n'); (*p_in)++; return;
157
0
  case '\r': J8_OUT('\\'); J8_OUT('r'); (*p_in)++; return;
158
0
  case '\t': J8_OUT('\\'); J8_OUT('t'); (*p_in)++; return;
159
0
  case '\'': J8_OUT('\\'); J8_OUT('\''); (*p_in)++; return;
160
0
  }
161
  // clang-format on
162
163
  //
164
  // Unprintable ASCII control codes
165
  //
166
0
  if (ch < 0x20) {
167
    // printf("Writing for %04x %p\n", ch, *p_out);
168
0
    int n = sprintf((char*)*p_out, "\\u%04x", ch);
169
0
    *p_out += n;
170
    // printf("Wrote %d bytes for %04x\n", n, ch);
171
0
    (*p_in)++;
172
0
    return;
173
0
  }
174
175
  //
176
  // UTF-8 encoded runes and invalid bytes
177
  //
178
0
  unsigned char* start = *p_in;  // save start position
179
0
  uint32_t codepoint = 0;
180
0
  uint32_t state = UTF8_ACCEPT;
181
182
0
  while (1) {
183
    // unsigned char byte = **p_in;
184
0
    decode(&state, &codepoint, ch);
185
    // printf("  state %d    ch %d\n", state, ch);
186
0
    switch (state) {
187
      // BUG: we don't reject IMMEDIATELY
188
      //
189
      // We could be in another state for up to 4 chars
190
      // And then we hit REJECT
191
      // And then we need to output \yff\yff\yff\yff
192
      // OK that's actually SIXTEEN at once?
193
194
0
    case UTF8_REJECT: {
195
0
      int n = sprintf((char*)*p_out, "\\x%02x", *start);
196
0
      *p_out += n;
197
0
      (*p_in) = start;  // REWIND because we might have consumed NUL terminator!
198
0
      (*p_in)++;        // Advance past the byte we wrote
199
0
      return;
200
0
    }
201
0
    case UTF8_ACCEPT: {
202
0
      (*p_in)++;
203
      // printf("start %p p_in %p\n", start, *p_in);
204
0
      while (start < *p_in) {
205
0
        J8_OUT(*start);
206
0
        start++;
207
0
      }
208
0
      return;
209
0
    }
210
0
    default:
211
0
      (*p_in)++;  // advance, next UTF8_ACCEPT will write it
212
0
      ch = **p_in;
213
      // printf(" => ch %d\n", ch);
214
0
      break;
215
0
    }
216
0
  }
217
  // Unreachable
218
0
}
Unexecuted instantiation: data_lang.cc:_ZL19BashDollarEncodeOnePPhS0_
Unexecuted instantiation: j8_libc.c:_ZL19BashDollarEncodeOnePPhS0_
219
220
// BourneShellEncodeOne rules:
221
//
222
//   must be valid UTF-8
223
//   no control chars
224
//   no ' -- required
225
//   no \ -- not required, but avoids ambiguous '\n'
226
//
227
// For example we write $'\\' or b'\\' not '\'
228
// The latter should be written r'\', but we're not outputting that style
229
230
static inline int BourneShellEncodeOne(unsigned char** p_in,
231
0
                                       unsigned char** p_out) {
232
0
  unsigned char ch = **p_in;
233
234
0
  if (ch == '\'' || ch == '\\') {  // can't encode these in Bourne shell ''
235
0
    return 1;
236
0
  }
237
0
  if (ch < 0x20) {  // Unprintable ASCII control codes
238
0
    return 1;
239
0
  }
240
241
  // UTF-8 encoded runes and invalid bytes
242
0
  unsigned char* start = *p_in;  // save start position
243
0
  uint32_t codepoint = 0;
244
0
  uint32_t state = UTF8_ACCEPT;
245
246
0
  while (1) {
247
0
    decode(&state, &codepoint, ch);
248
    // printf("  state %d\n", state);
249
0
    switch (state) {
250
0
    case UTF8_REJECT: {
251
0
      return 1;
252
0
    }
253
0
    case UTF8_ACCEPT: {
254
0
      (*p_in)++;
255
      // printf("start %p p_in %p\n", start, *p_in);
256
0
      while (start < *p_in) {
257
0
        J8_OUT(*start);
258
0
        start++;
259
0
      }
260
0
      return 0;
261
0
    }
262
0
    default:
263
0
      (*p_in)++;  // advance, next UTF8_ACCEPT will write it
264
0
      ch = **p_in;
265
0
      break;
266
0
    }
267
0
  }
268
  // Unreachable
269
0
}
Unexecuted instantiation: data_lang.cc:_ZL20BourneShellEncodeOnePPhS0_
Unexecuted instantiation: j8_libc.c:_ZL20BourneShellEncodeOnePPhS0_
270
271
// Right now \u001f and \u{1f} are the longest output sequences for a byte.
272
// Bug fix: we need 6 + 1 for the NUL terminator that sprintf() writes!  (Even
273
// though we don't technically need it)
274
275
// Bug: we may need up to 16 bytes: \yaa\yaa\yaa\yaa
276
// If this is too small, we would enter an infinite loop
277
// +1 for NUL terminator
278
279
1.10k
#define J8_MAX_BYTES_PER_INPUT_BYTE 7
280
281
// The minimum capacity must be more than the number above.
282
// TODO: Tune this for our allocator?  We call buf->EnsureMoreSpace(capacity);
283
166
#define J8_MIN_CAPACITY 16
284
285
static inline int J8EncodeChunk(unsigned char** p_in, unsigned char* in_end,
286
                                unsigned char** p_out, unsigned char* out_end,
287
187
                                int j8_escape) {
288
1.18k
  while (*p_in < in_end && (*p_out + J8_MAX_BYTES_PER_INPUT_BYTE) <= out_end) {
289
    // printf("iter %d  %p < %p \n", i++, *p_out, out_end);
290
1.04k
    int invalid_utf8 = J8EncodeOne(p_in, p_out, j8_escape);
291
1.04k
    if (invalid_utf8 && !j8_escape) {  // first JSON pass got binary data?
292
48
      return invalid_utf8;             // early return
293
48
    }
294
1.04k
  }
295
139
  return 0;
296
187
}
data_lang.cc:_ZL13J8EncodeChunkPPhS_S0_S_i
Line
Count
Source
287
131
                                int j8_escape) {
288
809
  while (*p_in < in_end && (*p_out + J8_MAX_BYTES_PER_INPUT_BYTE) <= out_end) {
289
    // printf("iter %d  %p < %p \n", i++, *p_out, out_end);
290
710
    int invalid_utf8 = J8EncodeOne(p_in, p_out, j8_escape);
291
710
    if (invalid_utf8 && !j8_escape) {  // first JSON pass got binary data?
292
32
      return invalid_utf8;             // early return
293
32
    }
294
710
  }
295
99
  return 0;
296
131
}
j8_libc.c:_ZL13J8EncodeChunkPPhS_S0_S_i
Line
Count
Source
287
56
                                int j8_escape) {
288
378
  while (*p_in < in_end && (*p_out + J8_MAX_BYTES_PER_INPUT_BYTE) <= out_end) {
289
    // printf("iter %d  %p < %p \n", i++, *p_out, out_end);
290
338
    int invalid_utf8 = J8EncodeOne(p_in, p_out, j8_escape);
291
338
    if (invalid_utf8 && !j8_escape) {  // first JSON pass got binary data?
292
16
      return invalid_utf8;             // early return
293
16
    }
294
338
  }
295
40
  return 0;
296
56
}
297
298
static inline int BashDollarEncodeChunk(unsigned char** p_in,
299
                                        unsigned char* in_end,
300
                                        unsigned char** p_out,
301
0
                                        unsigned char* out_end) {
302
0
  while (*p_in < in_end && (*p_out + J8_MAX_BYTES_PER_INPUT_BYTE) <= out_end) {
303
0
    BashDollarEncodeOne(p_in, p_out);
304
0
  }
305
0
  return 0;
306
0
}
Unexecuted instantiation: data_lang.cc:_ZL21BashDollarEncodeChunkPPhS_S0_S_
Unexecuted instantiation: j8_libc.c:_ZL21BashDollarEncodeChunkPPhS_S0_S_
307
308
static inline int BourneShellEncodeChunk(unsigned char** p_in,
309
                                         unsigned char* in_end,
310
                                         unsigned char** p_out,
311
0
                                         unsigned char* out_end) {
312
0
  while (*p_in < in_end && (*p_out + J8_MAX_BYTES_PER_INPUT_BYTE) <= out_end) {
313
0
    int cannot_encode = BourneShellEncodeOne(p_in, p_out);
314
0
    if (cannot_encode) {     // we need escaping, e.g. \u0001 or \'
315
0
      return cannot_encode;  // early return
316
0
    }
317
0
  }
318
0
  return 0;
319
0
}
Unexecuted instantiation: data_lang.cc:_ZL22BourneShellEncodeChunkPPhS_S0_S_
Unexecuted instantiation: j8_libc.c:_ZL22BourneShellEncodeChunkPPhS_S0_S_
320
321
82
static inline int CanOmitQuotes(unsigned char* s, int len) {
322
82
  if (len == 0) {  // empty string has to be quoted
323
1
    return 0;
324
1
  }
325
326
  // 3 special case keywords
327
81
  if (len == 4) {
328
0
    if (memcmp(s, "null", 4) == 0) {
329
0
      return 0;
330
0
    }
331
0
    if (memcmp(s, "true", 4) == 0) {
332
0
      return 0;
333
0
    }
334
0
  }
335
81
  if (len == 5) {
336
0
    if (memcmp(s, "false", 5) == 0) {
337
0
      return 0;
338
0
    }
339
0
  }
340
341
129
  for (int i = 0; i < len; ++i) {
342
82
    unsigned char ch = s[i];
343
344
    // Corresponds to regex [a-zA-Z0-9./_-]
345
82
    if ('a' <= ch && ch <= 'z') {
346
21
      continue;
347
21
    }
348
61
    if ('A' <= ch && ch <= 'Z') {
349
0
      continue;
350
0
    }
351
61
    if ('0' <= ch && ch <= '9') {
352
26
      continue;
353
26
    }
354
35
    if (ch == '.' || ch == '/' || ch == '_' || ch == '-') {
355
1
      continue;
356
1
    }
357
    // some byte requires quotes
358
    // Not including UTF-8 here because it can have chars that look like space
359
    // or quotes
360
34
    return 0;
361
35
  }
362
47
  return 1;  // everything OK
363
81
}
data_lang.cc:_ZL13CanOmitQuotesPhi
Line
Count
Source
321
82
static inline int CanOmitQuotes(unsigned char* s, int len) {
322
82
  if (len == 0) {  // empty string has to be quoted
323
1
    return 0;
324
1
  }
325
326
  // 3 special case keywords
327
81
  if (len == 4) {
328
0
    if (memcmp(s, "null", 4) == 0) {
329
0
      return 0;
330
0
    }
331
0
    if (memcmp(s, "true", 4) == 0) {
332
0
      return 0;
333
0
    }
334
0
  }
335
81
  if (len == 5) {
336
0
    if (memcmp(s, "false", 5) == 0) {
337
0
      return 0;
338
0
    }
339
0
  }
340
341
129
  for (int i = 0; i < len; ++i) {
342
82
    unsigned char ch = s[i];
343
344
    // Corresponds to regex [a-zA-Z0-9./_-]
345
82
    if ('a' <= ch && ch <= 'z') {
346
21
      continue;
347
21
    }
348
61
    if ('A' <= ch && ch <= 'Z') {
349
0
      continue;
350
0
    }
351
61
    if ('0' <= ch && ch <= '9') {
352
26
      continue;
353
26
    }
354
35
    if (ch == '.' || ch == '/' || ch == '_' || ch == '-') {
355
1
      continue;
356
1
    }
357
    // some byte requires quotes
358
    // Not including UTF-8 here because it can have chars that look like space
359
    // or quotes
360
34
    return 0;
361
35
  }
362
47
  return 1;  // everything OK
363
81
}
Unexecuted instantiation: j8_libc.c:_ZL13CanOmitQuotesPhi
364
365
#endif  // DATA_LANG_J8_H