# Copyright 2016 Andy Chu. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
"""
lexer.py - Library for lexing.
"""

from _devbuild.gen.syntax_asdl import Token, SourceLine
from _devbuild.gen.types_asdl import lex_mode_t, lex_mode_e
from _devbuild.gen.id_kind_asdl import Id_t, Id, Id_str, Kind
from asdl import runtime
from mycpp.mylib import log
from frontend import consts
from frontend import match

unused = log, Id_str

from typing import List, Tuple, Optional, Counter, TYPE_CHECKING
if TYPE_CHECKING:
    from core.alloc import Arena
    from frontend.reader import _Reader


def IsPlusEquals(tok):
    # type: (Token) -> bool
    """Common pattern to test if we got foo= or foo+="""
    index = tok.col + tok.length - 2
    return tok.line.content[index] == '+'
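# Illustrative sketch of the arithmetic above (hypothetical values, not from
# the original source): for a line 'x+=1' with a token spanning 'x+='
# (col=0, length=3), index = 0 + 3 - 2 = 1 and content[1] == '+', so
# IsPlusEquals() returns True.  For a token spanning 'x=' (col=0, length=2),
# index = 0 and content[0] == 'x', so it returns False.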


def TokenVal(tok):
    # type: (Token) -> str
    """Compute string value on demand."""
    return tok.line.content[tok.col:tok.col + tok.length]


def TokenSliceLeft(tok, left_index):
    # type: (Token, int) -> str
    """Slice token directly, without creating intermediate string."""
    assert left_index > 0
    left = tok.col + left_index
    return tok.line.content[left:tok.col + tok.length]


def TokenSliceRight(tok, right_index):
    # type: (Token, int) -> str
    """Slice token directly, without creating intermediate string."""
    assert right_index < 0
    right = tok.col + tok.length + right_index
    return tok.line.content[tok.col:right]
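# Illustrative sketch (hypothetical token, not from the original source): for
# a SourceLine whose content is 'echo $foo', a Token covering '$foo' has
# col=5 and length=4, so:
#
#     TokenVal(tok)             == '$foo'   # content[5:9]
#     TokenSliceLeft(tok, 1)    == 'foo'    # drops 1 char from the left
#     TokenSliceRight(tok, -1)  == '$fo'    # drops 1 char from the right
#
# The slice helpers index into line.content directly, avoiding the
# intermediate string that TokenVal() would allocate.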


def DummyToken(id_, val):
    # type: (int, str) -> Token

    col = -1
    length = -1
    return Token(id_, col, length, runtime.NO_SPID, None, val)


class LineLexer(object):
    def __init__(self, arena):
        # type: (Arena) -> None
        self.arena = arena
        self.replace_last_token = False  # For MaybeUnreadOne

        # Singleton instance because we don't allow globals.
        # 2023-09: I tried LineLexer::Read() returning None, but that is subtly
        # incorrect, e.g. in Lexer::Read() with NUL bytes.
        self.eol_tok = DummyToken(Id.Eol_Tok, '')

        self.Reset(None, 0)  # Invalid src_line to start

    def __repr__(self):
        # type: () -> str
        return '<LineLexer at pos %d of line %r>' % (self.line_pos,
                                                     self.src_line)

    def Reset(self, src_line, line_pos):
        # type: (SourceLine, int) -> None
        #assert line, repr(line)  # can't be empty or None
        self.src_line = src_line
        self.line_pos = line_pos

    def MaybeUnreadOne(self):
        # type: () -> bool
        """Return True if we can unread one character, or False otherwise.

        NOTE: Only call this when you know the last token was exactly one character!
        """
        if self.line_pos == 0:
            return False
        else:
            self.line_pos -= 1
            self.replace_last_token = True  # don't add the next token to the arena
            return True
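    # Illustrative sketch (hypothetical caller, not from the original source):
    # a parser that has just read a single-character token and decides it
    # belongs to a different construct can back up one position:
    #
    #     tok = line_lexer.Read(lex_mode_e.ShCommand)
    #     if not looks_right(tok) and line_lexer.MaybeUnreadOne():
    #         ...  # the next Read() re-lexes from the unread position
    #
    # Setting replace_last_token makes the next Read() call arena.UnreadOne(),
    # so the re-lexed token replaces the one we backed over.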

    def GetEofToken(self, id_):
        # type: (int) -> Token
        """Create a new span ID for syntax errors involving the EOF token."""
        if self.src_line is None:
            # There are ZERO lines now.  Add a dummy line 0 so the Token has a
            # source to display errors.
            src_line = self.arena.AddLine('', 0)
        else:
            src_line = self.src_line

        return self.arena.NewToken(id_, self.line_pos, 0, src_line, '')

    def LookAheadOne(self, lex_mode):
        # type: (lex_mode_t) -> Id_t
        """Look ahead exactly one token in the given lexer mode."""
        pos = self.line_pos
        line_str = self.src_line.content
        n = len(line_str)
        if pos == n:
            return Id.Unknown_Tok
        else:
            tok_type, _ = match.OneToken(lex_mode, line_str, pos)
            return tok_type

    def AssertAtEndOfLine(self):
        # type: () -> None
        assert self.line_pos == len(self.src_line.content), \
            '%d %s' % (self.line_pos, self.src_line.content)

    def LookPastSpace(self, lex_mode):
        # type: (lex_mode_t) -> Id_t
        """Look ahead in current line for non-space token, using given lexer
        mode.

        Does NOT advance self.line_pos.

        Called with at least the following modes:
          lex_mode_e.Arith -- for ${a[@]} vs ${a[1+2]}
          lex_mode_e.VSub_1
          lex_mode_e.ShCommand

        Note: Only ShCommand emits Id.WS_Space; other lexer modes don't.
        """
        pos = self.line_pos
        line_str = self.src_line.content
        n = len(line_str)
        #print('Look ahead from pos %d, line %r' % (pos, self.line))
        while True:
            if pos == n:
                # We don't allow lookahead while already at end of line, because
                # it would involve interacting with the line reader, and we
                # never need it.  In lex_mode_e.ShCommand, there is an explicit
                # newline token, but lex_mode_e.Arith doesn't have it.
                return Id.Unknown_Tok

            tok_type, end_pos = match.OneToken(lex_mode, line_str, pos)

            # NOTE: Instead of hard-coding this token, we could pass it in.
            #   LookPastSpace(lex_mode, past_token_type)
            # - WS_Space is only given in lex_mode_e.ShCommand
            # - Id.Ignored_Space is given in lex_mode_e.Expr
            if tok_type != Id.WS_Space and tok_type != Id.Ignored_Space:
                break
            pos = end_pos

        return tok_type
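    # Illustrative sketch (hypothetical caller, not from the original source):
    # a parser can peek at the next significant token id to pick a grammar
    # branch, e.g. for the ${a[@]} vs ${a[1+2]} case mentioned above:
    #
    #     id_ = line_lexer.LookPastSpace(lex_mode_e.Arith)
    #     if id_ == Id.Unknown_Tok:
    #         ...  # end of line; nothing to peek at
    #     else:
    #         ...  # dispatch on id_ without consuming it
    #
    # Since line_pos is not advanced, a later Read() still returns the peeked
    # token (preceded by any space token in ShCommand mode).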

    def LookAheadFuncParens(self, unread):
        # type: (int) -> bool
        """For finding the () in 'f ( ) { echo hi; }'.

        Args:
          unread: either 0 or 1, for the number of characters to go back

        The lookahead is limited to the current line, which sacrifices a rare
        corner case.  This is not recognized as a function:

            foo\
            () {}

        whereas this is:

            foo()
            {}
        """
        pos = self.line_pos - unread
        assert pos > 0
        tok_type, _ = match.OneToken(lex_mode_e.FuncParens,
                                     self.src_line.content, pos)
        return tok_type == Id.LookAhead_FuncParens

    def ByteLookAhead(self):
        # type: () -> str
        """Lookahead a single byte.

        Useful when you know the token is one char.
        """
        pos = self.line_pos
        if pos == len(self.src_line.content):
            return ''
        else:
            return self.src_line.content[pos]

    def ByteLookBack(self):
        # type: () -> int
        """A little hack for stricter proc arg list syntax.

        There has to be a space before the paren.

        Yes:  json write (x)
        No:   json write(x)
        """
        pos = self.line_pos - 2
        if pos < 0:
            return -1
        else:
            return ord(self.src_line.content[pos])
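    # Illustrative sketch (hypothetical positions, not from the original
    # source): after reading the '(' in 'json write (x)', line_pos points just
    # past the paren, so content[line_pos - 2] is the byte before it:
    # ord(' ') == 32 here.  In the rejected 'json write(x)', that byte is
    # ord('e'), so a caller can require the space by checking the result.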

    def Read(self, lex_mode):
        # type: (lex_mode_t) -> Token

        # Inner loop optimization
        if self.src_line:
            line_str = self.src_line.content
        else:
            line_str = ''
        line_pos = self.line_pos

        tok_type, end_pos = match.OneToken(lex_mode, line_str, line_pos)
        if tok_type == Id.Eol_Tok:  # Do NOT add a span for this sentinel!
            # LineLexer tells Lexer to read a new line.
            return self.eol_tok

        # TODO: can inline this function with formula on 16-bit Id.
        kind = consts.GetKind(tok_type)

        # Save on allocations!  We often don't look at the token value.
        # Whitelist doesn't work well?  Use blacklist for now.
        # - Kind.KW is sometimes a literal in a word
        # - Kind.Right is for " in here docs.  Lexer isn't involved.
        # - Got an error with Kind.Left too that I don't understand
        # - Kind.ControlFlow doesn't work because we word_.StaticEval()
        # if kind in (Kind.Lit, Kind.VSub, Kind.Redir, Kind.Char, Kind.Backtick, Kind.KW, Kind.Right):
        if kind in (Kind.Arith, Kind.Op, Kind.VTest, Kind.VOp0, Kind.VOp2,
                    Kind.VOp3, Kind.WS, Kind.Ignored, Kind.Eof):
            tok_val = None  # type: Optional[str]
        else:
            tok_val = line_str[line_pos:end_pos]

        # NOTE: We're putting the arena hook in LineLexer and not Lexer because
        # we want it to be "low level".  The only thing fabricated here is a
        # newline added at the last line, so we don't end with \0.
        if self.replace_last_token:  # make another token from the last span
            self.arena.UnreadOne()
            self.replace_last_token = False

        tok_len = end_pos - line_pos
        t = self.arena.NewToken(tok_type, line_pos, tok_len, self.src_line,
                                tok_val)

        self.line_pos = end_pos
        return t


class Lexer(object):
    """Read lines from the line_reader, split them into tokens with line_lexer,
    returning them in a stream."""

    def __init__(self, line_lexer, line_reader):
        # type: (LineLexer, _Reader) -> None
        """
        Args:
          line_lexer: Underlying object to get tokens from
          line_reader: get new lines from here
        """
        self.line_lexer = line_lexer
        self.line_reader = line_reader

        self.line_id = -1  # Invalid one
        self.translation_stack = []  # type: List[Tuple[Id_t, Id_t]]
        self.emit_comp_dummy = False

    def ResetInputObjects(self):
        # type: () -> None
        self.line_lexer.Reset(None, 0)

    def MaybeUnreadOne(self):
        # type: () -> bool
        return self.line_lexer.MaybeUnreadOne()

    def LookAheadOne(self, lex_mode):
        # type: (lex_mode_t) -> Id_t
        return self.line_lexer.LookAheadOne(lex_mode)

    def LookPastSpace(self, lex_mode):
        # type: (lex_mode_t) -> Id_t
        return self.line_lexer.LookPastSpace(lex_mode)

    def LookAheadFuncParens(self, unread):
        # type: (int) -> bool
        return self.line_lexer.LookAheadFuncParens(unread)

    def ByteLookAhead(self):
        # type: () -> str
        return self.line_lexer.ByteLookAhead()

    def ByteLookBack(self):
        # type: () -> int
        return self.line_lexer.ByteLookBack()

    def EmitCompDummy(self):
        # type: () -> None
        """Emit Id.Lit_CompDummy right before EOF, for completion."""
        self.emit_comp_dummy = True

    def PushHint(self, old_id, new_id):
        # type: (Id_t, Id_t) -> None
        """Use cases: Id.Op_RParen -> Id.Right_Subshell -- disambiguate
        Id.Op_RParen -> Id.Eof_RParen.

        Problems for $() nesting.

        - posix:
          - case foo) and case (foo)
          - func() {}
          - subshell ( )
        - bash extensions:
          - precedence in [[, e.g. [[ (1 == 2) && (2 == 3) ]]
          - arrays: a=(1 2 3), a+=(4 5)
        """
        #log('   PushHint %s ==> %s', Id_str(old_id), Id_str(new_id))
        self.translation_stack.append((old_id, new_id))
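    # Illustrative sketch (hypothetical caller, not from the original source),
    # based on the use cases in the docstring above: before parsing a subshell
    # body, a parser can ask for the matching ')' to come back as a different
    # id:
    #
    #     lexer.PushHint(Id.Op_RParen, Id.Right_Subshell)
    #     ...  # parse the subshell body; the next ')' read is translated
    #
    # The translation is applied (and popped) in _Read() below when a token
    # with the old id is seen.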

    def MoveToNextLine(self):
        # type: () -> None
        """For lookahead on the next line.

        This is required by `ParseYshCase` and is used in `_NewlineOkForYshCase`.

        We use this because otherwise calling `LookPastSpace` would return
        `Id.Unknown_Tok` when the lexer has reached the end of the line.  For
        example, take this case:

          case (x) {
                   ^--- We are here

            (else) {
            ^--- We want lookahead to here

              echo test
            }
          }

        But without `MoveToNextLine`, it is impossible to peek the '(' without
        consuming it.  And consuming it would be a problem once we want to hand
        off pattern parsing to the expression parser.
        """
        # Only call this when you've seen \n
        self.line_lexer.AssertAtEndOfLine()

        src_line, line_pos = self.line_reader.GetLine()
        self.line_lexer.Reset(src_line, line_pos)  # fill with a new line

    def _Read(self, lex_mode):
        # type: (lex_mode_t) -> Token
        """Read from the normal line buffer, not an alias."""
        t = self.line_lexer.Read(lex_mode)
        if t.id == Id.Eol_Tok:  # We hit \0 aka Eol_Tok, read a new line
            src_line, line_pos = self.line_reader.GetLine()

            if src_line is None:  # no more lines
                if self.emit_comp_dummy:
                    id_ = Id.Lit_CompDummy
                    self.emit_comp_dummy = False  # emit EOF the next time
                else:
                    id_ = Id.Eof_Real
                return self.line_lexer.GetEofToken(id_)

            self.line_lexer.Reset(src_line, line_pos)  # fill with a new line
            t = self.line_lexer.Read(lex_mode)

        # e.g. translate ) or ` into EOF
        if len(self.translation_stack):
            old_id, new_id = self.translation_stack[-1]  # top
            if t.id == old_id:
                #log('==> TRANSLATING %s ==> %s', Id_str(t.id), Id_str(new_id))
                self.translation_stack.pop()
                t.id = new_id

        return t

    def Read(self, lex_mode):
        # type: (lex_mode_t) -> Token
        while True:
            t = self._Read(lex_mode)
            # TODO: Change to ALL IGNORED types, once you have SPACE_TOK.  This
            # means we don't have to handle them in the VSub_1/VSub_2/etc. states.
            if t.id != Id.Ignored_LineCont:
                break

        #ID_HIST[t.id] += 1
        #log('> Read() Returning %s', t)
        return t
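    # Illustrative sketch (hypothetical driver, not from the original source):
    # a parser typically pulls tokens in a loop, switching lex_mode as its
    # grammar requires, until it sees the EOF id emitted by _Read():
    #
    #     while True:
    #         tok = lexer.Read(lex_mode_e.ShCommand)
    #         if tok.id == Id.Eof_Real:
    #             break
    #         ...  # dispatch on tok.id or consts.GetKind(tok.id)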


if 0:  # mylib.PYTHON: not: breaks tarball build
    import collections
    ID_HIST = collections.Counter()  # type: Counter[Id_t]