cpp

Coverage Report

Created: 2024-03-13 14:13

/home/andy/git/oilshell/oil/data_lang/utf8_impls/bjoern_dfa.h
Line
Count
Source
1
#ifndef BJOERN_DFA
2
#define BJOERN_DFA
3
4
// Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
5
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
6
7
#include "stdint.h"
8
9
#if 0
10
11
#define UTF8_ACCEPT 0
12
#define UTF8_REJECT 1
13
14
// Lookup table for the older DFA variant.  This definition sits in the
// `#if 0` branch of the header, so it is not compiled.
//
// Layout, as used by the decode() that follows it:
//   - entries 0..255: indexed by the input byte, giving its character
//     class (values 0..11 appear here);
//   - entries 256.. : state transitions, 16 entries per state, indexed as
//     256 + state*16 + class.  Each entry is the next state number, where
//     0 is UTF8_ACCEPT and 1 is UTF8_REJECT.  Row s1 (the reject state)
//     contains only 1, so the automaton stays rejected once it gets there.
static const uint8_t utf8d[] = {
15
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
16
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
17
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
18
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
19
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
20
  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
21
  8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
22
  0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
23
  0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
24
25
  0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
26
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
27
  1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
28
  1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
29
  1,3,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
30
};
31
32
// Older decode() variant; it sits in the `#if 0` branch and is not compiled.
//
// Feeds one input byte to the DFA.
//   state: in/out, current automaton state (UTF8_ACCEPT == 0 is treated as
//          "not inside a multi-byte sequence"); overwritten with the next
//          state.
//   codep: in/out, code point accumulator.
//   byte:  next input byte.  It indexes utf8d[] directly with no range
//          check, so it must be in 0..255.
// Returns the new value of *state.
uint32_t inline
33
decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
34
  // Character class of this byte, from the first 256 table entries.
  uint32_t type = utf8d[byte];
35
36
  // Mid-sequence: append the low 6 bits of the byte to the accumulator.
  // Otherwise: start a new value, keeping only the bits of the byte that
  // survive the mask 0xff >> class.
  *codep = (*state != UTF8_ACCEPT) ?
37
    (byte & 0x3fu) | (*codep << 6) :
38
    (0xff >> type) & (byte);
39
40
  // Transition rows in this variant are 16 entries wide, hence state*16.
  *state = utf8d[256 + *state*16 + type];
41
  return *state;
42
}
43
44
#else
45
46
// Newer version, lower on the page
47
48
2.31k
#define UTF8_ACCEPT 0
49
84
#define UTF8_REJECT 12
50
51
// Lookup table for the active DFA variant.
//
// Entries 0..255 are indexed by the input byte and give its character
// class (0..11).  The entries after that are the transition table, indexed
// by decode() as 256 + state + class.  States are stored as multiples of
// 12 (0, 12, 24, ... 96), i.e. already scaled to the start of their
// 12-entry row, so no multiplication is needed at lookup time.
// 0 is UTF8_ACCEPT and 12 is UTF8_REJECT; the row at offset 12 contains
// only 12, so the automaton stays rejected once it gets there.
static const uint8_t utf8d[] = {
52
  // The first part of the table maps bytes to character classes in order
53
  // to reduce the size of the transition table and create bitmasks.
54
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
55
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
56
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
57
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
58
   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
59
   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
60
   8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
61
  10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
62
63
  // The second part is a transition table that maps a combination
64
  // of a state of the automaton and a character class to a state.
65
   0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
66
  12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
67
  12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
68
  12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
69
  12,36,12,12,12,12,12,12,12,12,12,12,
70
};
71
72
// Feeds one input byte to the UTF-8 DFA (active variant).
//
//   state: in/out, current automaton state.  UTF8_ACCEPT (0) is treated as
//          "not inside a multi-byte sequence"; the value is overwritten
//          with the next state.
//   codep: in/out, code point accumulator.
//   byte:  next input byte.  It indexes utf8d[] directly with no range
//          check, so it must be in 0..255.
//
// Returns the new value of *state.
// NOTE(review): presumably callers start with *state == UTF8_ACCEPT, read
// *codep whenever UTF8_ACCEPT is returned, and treat UTF8_REJECT as invalid
// input -- confirm against the call sites and the page referenced in the
// file header.
static inline
73
841
uint32_t decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
74
841
  // Character class of this byte, from the first 256 table entries.
  uint32_t type = utf8d[byte];
75
76
841
  // Mid-sequence: append the low 6 bits of the byte to the accumulator.
  // Otherwise: start a new value, keeping only the bits of the byte that
  // survive the mask 0xff >> class.
  *codep = (*state != UTF8_ACCEPT) ?
77
63
    (byte & 0x3fu) | (*codep << 6) :
78
841
    (0xff >> type) & (byte);
79
80
841
  // States are stored pre-multiplied by the row width (12), so the state
  // value is added directly as the row offset.
  *state = utf8d[256 + *state + type];
81
841
  return *state;
82
841
}
data_lang.cc:_ZL6decodePjS_j
Line
Count
Source
73
574
// Same decode() body as source lines 73-82 of this header; this listing
// carries the execution counts reported under the data_lang.cc label.
// Advances the DFA by one byte: updates *codep and *state, returns *state.
uint32_t decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
74
574
  uint32_t type = utf8d[byte];
75
76
574
  *codep = (*state != UTF8_ACCEPT) ?
77
42
    (byte & 0x3fu) | (*codep << 6) :
78
574
    (0xff >> type) & (byte);
79
80
574
  *state = utf8d[256 + *state + type];
81
574
  return *state;
82
574
}
j8_libc.c:_ZL6decodePjS_j
Line
Count
Source
73
267
// Same decode() body as source lines 73-82 of this header; this listing
// carries the execution counts reported under the j8_libc.c label.
// Advances the DFA by one byte: updates *codep and *state, returns *state.
uint32_t decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
74
267
  uint32_t type = utf8d[byte];
75
76
267
  *codep = (*state != UTF8_ACCEPT) ?
77
21
    (byte & 0x3fu) | (*codep << 6) :
78
267
    (0xff >> type) & (byte);
79
80
267
  *state = utf8d[256 + *state + type];
81
267
  return *state;
82
267
}
83
84
#endif
85
86
#endif  // BJOERN_DFA