# Copyright 2016 Andy Chu. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
"""
lexer.py - Library for lexing.
"""

from _devbuild.gen.syntax_asdl import Token, SourceLine
from _devbuild.gen.types_asdl import lex_mode_t, lex_mode_e
from _devbuild.gen.id_kind_asdl import Id_t, Id, Id_str, Kind
from asdl import runtime
from mycpp.mylib import log
from frontend import consts
from frontend import match

unused = log, Id_str

from typing import List, Tuple, Optional, Counter, TYPE_CHECKING
if TYPE_CHECKING:
    from core.alloc import Arena
    from frontend.reader import _Reader


def IsPlusEquals(tok):
    # type: (Token) -> bool
    """Common pattern to test if we got foo= or foo+="""
    index = tok.col + tok.length - 2
    return tok.line.content[index] == '+'
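
# A minimal illustration (editor's sketch, hypothetical values): for a token
# covering 'foo+=' with tok.col == 0 and tok.length == 5, the second-to-last
# byte is at col + length - 2 == 3, and content[3] == '+', so IsPlusEquals()
# returns True.  For a token covering 'foo=', that index holds 'o', so it
# returns False.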


def TokenVal(tok):
    # type: (Token) -> str
    """Compute string value on demand."""
    return tok.line.content[tok.col:tok.col + tok.length]


def TokenSliceLeft(tok, left_index):
    # type: (Token, int) -> str
    """Slice token directly, without creating an intermediate string."""
    assert left_index > 0
    left = tok.col + left_index
    return tok.line.content[left:tok.col + tok.length]


def TokenSliceRight(tok, right_index):
    # type: (Token, int) -> str
    """Slice token directly, without creating an intermediate string."""
    assert right_index < 0
    right = tok.col + tok.length + right_index
    return tok.line.content[tok.col:right]
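
# How these relate (editor's sketch, hypothetical values): for a token
# covering '${foo}' with col=0 and length=6,
#
#   TokenVal(tok)            == '${foo}'
#   TokenSliceLeft(tok, 2)   == 'foo}'   # same as TokenVal(tok)[2:]
#   TokenSliceRight(tok, -1) == '${foo'  # same as TokenVal(tok)[:-1]
#
# but the slices index tok.line.content directly, skipping the allocation
# that TokenVal() would do.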


def DummyToken(id_, val):
    # type: (int, str) -> Token
    """Make a Token that isn't backed by a SourceLine (line is None)."""
    col = -1
    length = -1
    return Token(id_, col, length, runtime.NO_SPID, None, val)


class LineLexer(object):
    def __init__(self, arena):
        # type: (Arena) -> None
        self.arena = arena
        self.replace_last_token = False  # For MaybeUnreadOne

        # Singleton instance because we don't allow globals.
        # 2023-09: I tried LineLexer::Read() returning None, but that is subtly
        # incorrect, e.g. in Lexer::Read() with NUL bytes.
        self.eol_tok = DummyToken(Id.Eol_Tok, '')

        self.Reset(None, 0)  # Invalid src_line to start

    def __repr__(self):
        # type: () -> str
        return '<LineLexer at pos %d of line %r>' % (self.line_pos,
                                                     self.src_line)

    def Reset(self, src_line, line_pos):
        # type: (SourceLine, int) -> None
        #assert line, repr(line)  # can't be empty or None
        self.src_line = src_line
        self.line_pos = line_pos

    def MaybeUnreadOne(self):
        # type: () -> bool
        """Return True if we can unread one character, or False otherwise.

        NOTE: Only call this when you know the last token was exactly one
        character!
        """
        if self.line_pos == 0:
            return False
        else:
            self.line_pos -= 1
            self.replace_last_token = True  # don't add the next token to the arena
            return True

    def GetEofToken(self, id_):
        # type: (int) -> Token
        """Create a zero-length Token for syntax errors involving EOF."""
        if self.src_line is None:
            # There are ZERO lines now.  Add a dummy line 0 so the Token has
            # a source line to display errors with.
            src_line = self.arena.AddLine('', 0)
        else:
            src_line = self.src_line

        return self.arena.NewToken(id_, self.line_pos, 0, src_line, '')

    def LookAheadOne(self, lex_mode):
        # type: (lex_mode_t) -> Id_t
        """Look ahead exactly one token in the given lexer mode."""
        pos = self.line_pos
        line_str = self.src_line.content
        n = len(line_str)
        if pos == n:
            return Id.Unknown_Tok
        else:
            tok_type, _ = match.OneToken(lex_mode, line_str, pos)
            return tok_type

    def AssertAtEndOfLine(self):
        # type: () -> None
        assert self.line_pos == len(self.src_line.content), \
            '%d %s' % (self.line_pos, self.src_line.content)

    def LookPastSpace(self, lex_mode):
        # type: (lex_mode_t) -> Id_t
        """Look ahead in the current line for a non-space token, using the
        given lexer mode.

        Does NOT advance self.line_pos.

        Called with at least the following modes:
          lex_mode_e.Arith -- for ${a[@]} vs ${a[1+2]}
          lex_mode_e.VSub_1
          lex_mode_e.ShCommand

        Note: Only ShCommand emits Id.WS_Space; other lexer modes don't.
        """
        pos = self.line_pos
        line_str = self.src_line.content
        n = len(line_str)
        #print('Look ahead from pos %d, line %r' % (pos, self.line))
        while True:
            if pos == n:
                # We don't allow lookahead while already at end of line,
                # because it would involve interacting with the line reader,
                # and we never need it.  In lex_mode_e.ShCommand, there is an
                # explicit newline token, but lex_mode_e.Arith doesn't have it.
                return Id.Unknown_Tok

            tok_type, end_pos = match.OneToken(lex_mode, line_str, pos)

            # NOTE: Instead of hard-coding this token, we could pass it in.
            #   LookPastSpace(lex_mode, past_token_type)
            # - WS_Space only given in lex_mode_e.ShCommand
            # - Id.Ignored_Space given in lex_mode_e.Expr
            if tok_type != Id.WS_Space and tok_type != Id.Ignored_Space:
                break
            pos = end_pos

        return tok_type

    def LookAheadFuncParens(self, unread):
        # type: (int) -> bool
        """For finding the () in 'f ( ) { echo hi; }'.

        Args:
          unread: either 0 or 1, for the number of characters to go back

        The lookahead is limited to the current line, which sacrifices a rare
        corner case.  This is not recognized as a function:

            foo\
            () {}

        whereas this is:

            foo()
            {}
        """
        pos = self.line_pos - unread
        assert pos > 0
        tok_type, _ = match.OneToken(lex_mode_e.FuncParens,
                                     self.src_line.content, pos)
        return tok_type == Id.LookAhead_FuncParens

    def ByteLookAhead(self):
        # type: () -> str
        """Lookahead a single byte.

        Useful when you know the token is one char.
        """
        pos = self.line_pos
        if pos == len(self.src_line.content):
            return ''
        else:
            return self.src_line.content[pos]

    def ByteLookBack(self):
        # type: () -> int
        """A little hack for stricter proc arg list syntax.

        There has to be a space before the paren.

        Yes: json write (x)
        No:  json write(x)
        """
        pos = self.line_pos - 2
        if pos < 0:
            return -1
        else:
            return ord(self.src_line.content[pos])

    def Read(self, lex_mode):
        # type: (lex_mode_t) -> Token

        # Inner loop optimization
        if self.src_line:
            line_str = self.src_line.content
        else:
            line_str = ''
        line_pos = self.line_pos

        tok_type, end_pos = match.OneToken(lex_mode, line_str, line_pos)
        if tok_type == Id.Eol_Tok:  # Do NOT add a span for this sentinel!
            # LineLexer tells Lexer to read a new line.
            return self.eol_tok

        # TODO: we can inline this function with a formula on the 16-bit Id.
        kind = consts.GetKind(tok_type)

        # Save on allocations!  We often don't look at the token value.
        # A whitelist doesn't work well, so use a blacklist for now:
        # - Kind.KW is sometimes a literal in a word
        # - Kind.Right is for " in here docs.  The lexer isn't involved.
        # - Got an error with Kind.Left too that I don't understand
        # - Kind.ControlFlow doesn't work because of word_.StaticEval()
        #if kind in (Kind.Lit, Kind.VSub, Kind.Redir, Kind.Char, Kind.Backtick, Kind.KW, Kind.Right):
        if kind in (Kind.Arith, Kind.Op, Kind.VTest, Kind.VOp0, Kind.VOp2,
                    Kind.VOp3, Kind.WS, Kind.Ignored, Kind.Eof):
            tok_val = None  # type: Optional[str]
        else:
            tok_val = line_str[line_pos:end_pos]

        # NOTE: We put the arena hook in LineLexer and not Lexer because we
        # want it to be "low level".  The only thing fabricated here is the
        # newline added at the last line, so we don't end with \0.
        if self.replace_last_token:  # make another token from the last span
            self.arena.UnreadOne()
            self.replace_last_token = False

        tok_len = end_pos - line_pos
        t = self.arena.NewToken(tok_type, line_pos, tok_len, self.src_line,
                                tok_val)

        self.line_pos = end_pos
        return t


class Lexer(object):
    """Read lines from the line_reader, split them into tokens with
    line_lexer, and return them in a stream."""

    def __init__(self, line_lexer, line_reader):
        # type: (LineLexer, _Reader) -> None
        """
        Args:
          line_lexer: underlying object to get tokens from
          line_reader: get new lines from here
        """
        self.line_lexer = line_lexer
        self.line_reader = line_reader

        self.line_id = -1  # Invalid one
        self.translation_stack = []  # type: List[Tuple[Id_t, Id_t]]
        self.emit_comp_dummy = False

    def ResetInputObjects(self):
        # type: () -> None
        self.line_lexer.Reset(None, 0)

    def MaybeUnreadOne(self):
        # type: () -> bool
        return self.line_lexer.MaybeUnreadOne()

    def LookAheadOne(self, lex_mode):
        # type: (lex_mode_t) -> Id_t
        return self.line_lexer.LookAheadOne(lex_mode)

    def LookPastSpace(self, lex_mode):
        # type: (lex_mode_t) -> Id_t
        return self.line_lexer.LookPastSpace(lex_mode)

    def LookAheadFuncParens(self, unread):
        # type: (int) -> bool
        return self.line_lexer.LookAheadFuncParens(unread)

    def ByteLookAhead(self):
        # type: () -> str
        return self.line_lexer.ByteLookAhead()

    def ByteLookBack(self):
        # type: () -> int
        return self.line_lexer.ByteLookBack()

    def EmitCompDummy(self):
        # type: () -> None
        """Emit Id.Lit_CompDummy right before EOF, for completion."""
        self.emit_comp_dummy = True

    def PushHint(self, old_id, new_id):
        # type: (Id_t, Id_t) -> None
        """Set a hint to translate one token Id into another.

        Use cases: disambiguate Id.Op_RParen -> Id.Right_Subshell, or
        Id.Op_RParen -> Id.Eof_RParen.

        Problems for $() nesting.

        - posix:
          - case foo) and case (foo)
          - func() {}
          - subshell ( )
        - bash extensions:
          - precedence in [[, e.g. [[ (1 == 2) && (2 == 3) ]]
          - arrays: a=(1 2 3), a+=(4 5)
        """
        #log('   PushHint %s ==> %s', Id_str(old_id), Id_str(new_id))
        self.translation_stack.append((old_id, new_id))

    def MoveToNextLine(self):
        # type: () -> None
        """For lookahead on the next line.

        This is required by `ParseYshCase` and is used in
        `_NewlineOkForYshCase`.

        We use this because otherwise calling `LookPastSpace` would return
        `Id.Unknown_Tok` when the lexer has reached the end of the line.  For
        example, take this case:

          case (x) {
                   ^--- We are here

            (else) {
            ^--- We want lookahead to here

              echo test
            }
          }

        But without `MoveToNextLine`, it is impossible to peek at the '('
        without consuming it.  And consuming it would be a problem once we
        want to hand off pattern parsing to the expression parser.
        """
        # Only call this when you've seen \n
        self.line_lexer.AssertAtEndOfLine()

        src_line, line_pos = self.line_reader.GetLine()
        self.line_lexer.Reset(src_line, line_pos)  # fill with a new line

    def _Read(self, lex_mode):
        # type: (lex_mode_t) -> Token
        """Read from the normal line buffer, not an alias."""
        t = self.line_lexer.Read(lex_mode)
        if t.id == Id.Eol_Tok:  # We hit \0 aka Eol_Tok, read a new line
            src_line, line_pos = self.line_reader.GetLine()

            if src_line is None:  # no more lines
                if self.emit_comp_dummy:
                    id_ = Id.Lit_CompDummy
                    self.emit_comp_dummy = False  # emit EOF the next time
                else:
                    id_ = Id.Eof_Real
                return self.line_lexer.GetEofToken(id_)

            self.line_lexer.Reset(src_line, line_pos)  # fill with a new line
            t = self.line_lexer.Read(lex_mode)

        # e.g. translate ) or ` into EOF
        if len(self.translation_stack):
            old_id, new_id = self.translation_stack[-1]  # top
            if t.id == old_id:
                #log('==> TRANSLATING %s ==> %s', Id_str(t.id), Id_str(new_id))
                self.translation_stack.pop()
                t.id = new_id

        return t

    def Read(self, lex_mode):
        # type: (lex_mode_t) -> Token
        while True:
            t = self._Read(lex_mode)
            # TODO: Change this to ALL IGNORED types, once you have SPACE_TOK.
            # This means we don't have to handle them in the
            # VSub_1/VSub_2/etc. states.
            if t.id != Id.Ignored_LineCont:
                break

        #ID_HIST[t.id] += 1
        #log('> Read() Returning %s', t)
        return t


if 0:  # Was 'if mylib.PYTHON:', but that breaks the tarball build
    import collections
    ID_HIST = collections.Counter()  # type: Counter[Id_t]
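
# Typical wiring (editor's sketch; Arena and StringLineReader live elsewhere
# in this repo, and the exact signatures below are assumptions):
#
#   arena = alloc.Arena()
#   line_reader = reader.StringLineReader('echo hi\n', arena)
#   lx = Lexer(LineLexer(arena), line_reader)
#   tok = lx.Read(lex_mode_e.ShCommand)   # first token covers 'echo'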