"""
match.py - match with generated re2c code or Python regexes.
"""

from _devbuild.gen.id_kind_asdl import Id, Id_t
from _devbuild.gen.types_asdl import lex_mode_t
from frontend import lexer_def

from typing import Tuple, Callable, Dict, List, Any, TYPE_CHECKING

# bin/osh should work without compiling fastlex?  But we want all the unit
# tests to run with a known version of it.
try:
    import fastlex
except ImportError:
    fastlex = None

if fastlex:
    # Shouldn't use re module in this case
    re = None
else:
    import re  # type: ignore

if TYPE_CHECKING:
    SRE_Pattern = Any  # Do we need a .pyi file for re or _sre?
    SimpleMatchFunc = Callable[[str, int], Tuple[Id_t, int]]
    LexerPairs = List[Tuple[SRE_Pattern, Id_t]]


def _LongestMatch(re_list, line, start_pos):
    # type: (LexerPairs, str, int) -> Tuple[Id_t, int]

    # Simulate the rule for \x00, which we generate in frontend/match.re2c.h
    if start_pos >= len(line):
        return Id.Eol_Tok, start_pos
    # Simulate C-style string handling: \x00 is empty string.
    if line[start_pos] == '\0':
        return Id.Eol_Tok, start_pos

    matches = []
    for regex, tok_type in re_list:
        m = regex.match(line, start_pos)  # left-anchored
        if m:
            matches.append((m.end(0), tok_type, m.group(0)))
    if not matches:
        raise AssertionError('no match at position %d: %r' % (start_pos, line))
    end_pos, tok_type, tok_val = max(matches, key=lambda m: m[0])
    #util.log('%s %s', tok_type, end_pos)
    return tok_type, end_pos
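
# A minimal sketch of the longest-match rule above, assuming the pure-Python
# path (fastlex absent).  Every pattern is tried at start_pos, and the pair
# whose match extends furthest wins; on a tie, max() keeps the earlier pair
# in re_list.  The Id values here are just for illustration:
#
#     pairs = _CompileAll([
#         (True, r'[a-zA-Z_][a-zA-Z0-9_]*', Id.Lit_Chars),
#         (False, 'if', Id.KW_If),
#     ])
#     _LongestMatch(pairs, 'ifs=x', 0)  # -> (Id.Lit_Chars, 3), not (Id.KW_If, 2)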


def _CompileAll(pat_list):
    # type: (List[Tuple[bool, str, Id_t]]) -> LexerPairs
    result = []
    for is_regex, pat, token_id in pat_list:
        if not is_regex:
            pat = re.escape(pat)  # type: ignore  # turn $ into \$
        result.append((re.compile(pat), token_id))  # type: ignore
    return result
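
# For example (a hedged sketch; these triples mirror the (is_regex, pattern,
# id) shape of the entries in frontend/lexer_def.py, with illustrative Ids):
#
#     _CompileAll([
#         (False, '$', Id.Lit_Dollar),       # literal: re.escape() turns $ into \$
#         (True, r'[0-9]+', Id.Lit_Digits),  # already a regex, compiled as-is
#     ])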


class _MatchOshToken_Slow(object):
    """An abstract matcher that doesn't depend on OSH."""

    def __init__(self, lexer_def):
        # type: (Dict[lex_mode_t, List[Tuple[bool, str, Id_t]]]) -> None
        self.lexer_def = {}  # type: Dict[lex_mode_t, LexerPairs]
        for lex_mode, pat_list in lexer_def.items():
            self.lexer_def[lex_mode] = _CompileAll(pat_list)

    def __call__(self, lex_mode, line, start_pos):
        # type: (lex_mode_t, str, int) -> Tuple[Id_t, int]
        """Returns (id, end_pos)."""
        re_list = self.lexer_def[lex_mode]

        return _LongestMatch(re_list, line, start_pos)


def _MatchOshToken_Fast(lex_mode, line, start_pos):
    # type: (lex_mode_t, str, int) -> Tuple[Id_t, int]
    """Returns (Id, end_pos)."""
    tok_type, end_pos = fastlex.MatchOshToken(lex_mode, line, start_pos)
    # IMPORTANT: We're reusing Id instances here.  Ids are very common, so this
    # saves memory.
    return tok_type, end_pos


class _MatchTokenSlow(object):
    def __init__(self, pat_list):
        # type: (List[Tuple[bool, str, Id_t]]) -> None
        self.pat_list = _CompileAll(pat_list)

    def __call__(self, line, start_pos):
        # type: (str, int) -> Tuple[Id_t, int]
        return _LongestMatch(self.pat_list, line, start_pos)
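
# Note: the fast and slow single-string matchers are interchangeable because
# both have the SimpleMatchFunc shape, (str, int) -> (Id_t, int).  A sketch,
# assuming the pure-Python path is active:
#
#     matcher = _MatchTokenSlow(lexer_def.ECHO_E_DEF)
#     tok_id, end_pos = matcher(r'\n', 0)  # same call shape as the _Fast funcs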


def _MatchEchoToken_Fast(line, start_pos):
    # type: (str, int) -> Tuple[Id_t, int]
    """Returns (id, end_pos)."""
    tok_type, end_pos = fastlex.MatchEchoToken(line, start_pos)
    return tok_type, end_pos


def _MatchGlobToken_Fast(line, start_pos):
    # type: (str, int) -> Tuple[Id_t, int]
    """Returns (id, end_pos)."""
    tok_type, end_pos = fastlex.MatchGlobToken(line, start_pos)
    return tok_type, end_pos


def _MatchPS1Token_Fast(line, start_pos):
    # type: (str, int) -> Tuple[Id_t, int]
    """Returns (id, end_pos)."""
    tok_type, end_pos = fastlex.MatchPS1Token(line, start_pos)
    return tok_type, end_pos


def _MatchHistoryToken_Fast(line, start_pos):
    # type: (str, int) -> Tuple[Id_t, int]
    """Returns (id, end_pos)."""
    tok_type, end_pos = fastlex.MatchHistoryToken(line, start_pos)
    return tok_type, end_pos


def _MatchBraceRangeToken_Fast(line, start_pos):
    # type: (str, int) -> Tuple[Id_t, int]
    """Returns (id, end_pos)."""
    tok_type, end_pos = fastlex.MatchBraceRangeToken(line, start_pos)
    return tok_type, end_pos


#def _MatchQsnToken_Fast(line, start_pos):
#    # type: (str, int) -> Tuple[Id_t, int]
#    """Returns (id, end_pos)."""
#    tok_type, end_pos = fastlex.MatchQsnToken(line, start_pos)
#    return tok_type, end_pos

if fastlex:
    OneToken = _MatchOshToken_Fast
    ECHO_MATCHER = _MatchEchoToken_Fast
    GLOB_MATCHER = _MatchGlobToken_Fast
    PS1_MATCHER = _MatchPS1Token_Fast
    HISTORY_MATCHER = _MatchHistoryToken_Fast
    BRACE_RANGE_MATCHER = _MatchBraceRangeToken_Fast
    #QSN_MATCHER = _MatchQsnToken_Fast
    IsValidVarName = fastlex.IsValidVarName
    ShouldHijack = fastlex.ShouldHijack
    LooksLikeInteger = fastlex.LooksLikeInteger
    LooksLikeFloat = fastlex.LooksLikeFloat
else:
    OneToken = _MatchOshToken_Slow(lexer_def.LEXER_DEF)
    ECHO_MATCHER = _MatchTokenSlow(lexer_def.ECHO_E_DEF)
    GLOB_MATCHER = _MatchTokenSlow(lexer_def.GLOB_DEF)
    PS1_MATCHER = _MatchTokenSlow(lexer_def.PS1_DEF)
    HISTORY_MATCHER = _MatchTokenSlow(lexer_def.HISTORY_DEF)
    BRACE_RANGE_MATCHER = _MatchTokenSlow(lexer_def.BRACE_RANGE_DEF)
    #QSN_MATCHER = _MatchTokenSlow(lexer_def.QSN_DEF)

    # Used by osh/cmd_parse.py to validate a for-loop name.  Note that the
    # pattern must be anchored on the right.
    _VAR_NAME_RE = re.compile(lexer_def.VAR_NAME_RE + '$')  # type: ignore

    def IsValidVarName(s):
        # type: (str) -> bool
        return bool(_VAR_NAME_RE.match(s))
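
    # The '$' suffix matters: _VAR_NAME_RE.match() is only left-anchored, so
    # without it a string like 'foo-bar' would pass on its 'foo' prefix.
    # A sketch, assuming lexer_def.VAR_NAME_RE is the usual identifier pattern:
    #
    #     IsValidVarName('foo')      # True
    #     IsValidVarName('foo-bar')  # False, thanks to the right anchor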

    # yapf: disable
    _SHOULD_HIJACK_RE = re.compile(lexer_def.SHOULD_HIJACK_RE + '$')  # type: ignore

    def ShouldHijack(s):
        # type: (str) -> bool
        return bool(_SHOULD_HIJACK_RE.match(s))

    _LOOKS_LIKE_INTEGER_RE = re.compile(lexer_def.LOOKS_LIKE_INTEGER + '$')  # type: ignore

    def LooksLikeInteger(s):
        # type: (str) -> bool
        return bool(_LOOKS_LIKE_INTEGER_RE.match(s))

    _LOOKS_LIKE_FLOAT_RE = re.compile(lexer_def.LOOKS_LIKE_FLOAT + '$')  # type: ignore
    # yapf: enable

    def LooksLikeFloat(s):
        # type: (str) -> bool
        return bool(_LOOKS_LIKE_FLOAT_RE.match(s))
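
    # A hedged sketch of how these predicates behave, assuming the
    # lexer_def.LOOKS_LIKE_* patterns accept an optional sign and, for
    # floats, a decimal point:
    #
    #     LooksLikeInteger('-42')   # True
    #     LooksLikeInteger('42.0')  # False
    #     LooksLikeFloat('42.0')    # True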


class SimpleLexer(object):
    def __init__(self, match_func, s):
        # type: (SimpleMatchFunc, str) -> None
        self.match_func = match_func
        self.s = s
        self.pos = 0

    def Next(self):
        # type: () -> Tuple[Id_t, str]
        """
        Note: match_func will return Id.Eol_Tok repeatedly at the terminating
        NUL.
        """
        tok_id, end_pos = self.match_func(self.s, self.pos)
        val = self.s[self.pos:end_pos]
        self.pos = end_pos
        return tok_id, val

    def Tokens(self):
        # type: () -> List[Tuple[Id_t, str]]
        tokens = []  # type: List[Tuple[Id_t, str]]
        while True:
            tok_id, val = self.Next()
            if tok_id == Id.Eol_Tok:  # NUL terminator
                break
            tokens.append((tok_id, val))
        return tokens
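
# A usage sketch: SimpleLexer pairs a match function with one string and
# yields (Id, value) pairs until Id.Eol_Tok.  For instance, with the echo
# matcher (the exact Ids produced depend on lexer_def.ECHO_E_DEF):
#
#     lex = EchoLexer(r'a\nb')
#     while True:
#         tok_id, val = lex.Next()
#         if tok_id == Id.Eol_Tok:
#             break
#         # ... handle (tok_id, val), e.g. translate \n to a newline
#
# or simply EchoLexer(r'a\nb').Tokens() to slurp the same list at once.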


# Iterated over in osh/builtin_pure.py
def EchoLexer(s):
    # type: (str) -> SimpleLexer
    return SimpleLexer(ECHO_MATCHER, s)


def BraceRangeLexer(s):
    # type: (str) -> SimpleLexer
    return SimpleLexer(BRACE_RANGE_MATCHER, s)


#def QsnLexer(s):
#    # type: (str) -> SimpleLexer
#    return SimpleLexer(QSN_MATCHER, s)


def GlobLexer(s):
    # type: (str) -> SimpleLexer
    return SimpleLexer(GLOB_MATCHER, s)

# These tokens are "slurped": the whole string is consumed in one call,
# rather than lexed lazily.


def HistoryTokens(s):
    # type: (str) -> List[Tuple[Id_t, str]]
    lex = SimpleLexer(HISTORY_MATCHER, s)
    return lex.Tokens()


def Ps1Tokens(s):
    # type: (str) -> List[Tuple[Id_t, str]]
    lex = SimpleLexer(PS1_MATCHER, s)
    return lex.Tokens()


#
# osh/builtin_bracket
#


def BracketUnary(s):
    # type: (str) -> Id_t
    from _devbuild.gen.id_kind import TEST_UNARY_LOOKUP  # break circular dep
    return TEST_UNARY_LOOKUP.get(s, Id.Undefined_Tok)


def BracketBinary(s):
    # type: (str) -> Id_t
    from _devbuild.gen.id_kind import TEST_BINARY_LOOKUP
    return TEST_BINARY_LOOKUP.get(s, Id.Undefined_Tok)


def BracketOther(s):
    # type: (str) -> Id_t
    from _devbuild.gen.id_kind import TEST_OTHER_LOOKUP
    return TEST_OTHER_LOOKUP.get(s, Id.Undefined_Tok)
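
# For example, osh/builtin_bracket uses these lookups to classify arguments
# to [ / test; unknown strings fall back to Id.Undefined_Tok.  A sketch (the
# specific Id name below is illustrative):
#
#     BracketUnary('-f')   # -> Id.BoolUnary_f
#     BracketUnary('-q')   # -> Id.Undefined_Tok (not a test operator)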