# Data types for the Oils AST, aka "Lossless Syntax Tree".
#
# Invariant: the source text can be reconstructed byte-for-byte from this
# tree.  The test/arena.sh file partially verifies this.
#
# Exceptions:
# - <<- here docs with leading tabs, since we don't want those for
#   conversion.  We don't want files with mixed tabs and spaces.
# - Distinguishing between function styles wasn't necessary:
#   - foo() { }  vs  function foo { }  # ksh style

# We usually try to preserve the physical order of the source in the ASDL
# fields.  One exception is the order of redirects:
#
#     echo >out.txt hi
#     # versus
#     echo hi >out.txt

# Unrepresented:
# - let arithmetic (rarely used)
# - coprocesses  # one with arg and one without
# - select block
# - case fallthrough ;& and ;;&

# Possible refactorings:
#
#   printf_part = Literal %Token | ...
#
#   # %CompoundWord as first class variant:
#   bool_expr = WordTest %CompoundWord | ...
#
#   # Can DoubleQuoted have a subset of parts compared with CompoundWord?
#   string_part = ...  # subset of word_part
#
# - Distinguish word_t with BracedTree vs. those without?  seq_word_t?
# - Remove command.NoOp ?
module syntax
{
  # More efficient than the List[bool] pattern we've been using
  BoolParamBox = (bool b)
  IntParamBox = (int i)

  # core/main_loop.py
  parse_result = EmptyLine | Eof | Node(command cmd)

  # 'source' represents the location of a line / token.
  source =
    Interactive
  | Headless
  | Unused(str comment)  # completion and history never show parse errors?
  | CFlag
  | Stdin(str comment)

    # TODO: if it's not the main script, it's sourced, and you could provide
    # a chain of locations back to the sourced script!
    # MainFile(str path) or SourcedFile(str path, loc location)
  | MainFile(str path)
  | SourcedFile(str path, loc location)

    # code parsed from a word
    # used for 'eval', 'trap', 'printf', 'complete -W', etc.
  | ArgvWord(str what, loc location)

    # code parsed from the value of a variable
    # used for $PS1 $PROMPT_COMMAND
  | Variable(str var_name, loc location)

    # Point to the original variable reference
  | VarRef(Token orig_tok)

    # alias expansion (location of first word)
  | Alias(str argv0, loc argv0_loc)

    # 2 kinds of reparsing: backticks, and x+1 in a[x+1]=y
    # TODO: use this for eval_unsafe_arith instead of Variable
  | Reparsed(str what, Token left_token, Token right_token)

    # For --location-str
  | Synthetic(str s)

  SourceLine = (int line_num, str content, source src)

  # Token TODO:
  # - remove .tval field.  If necessary, the string value should be manually
  #   computed and attached to specific LST nodes.
  # - maybe get rid of span_id, and re-compute length on demand too
  Token = (id id, int col, int length, int span_id, SourceLine? line, str tval)

  # Slight ASDL bug: CompoundWord has to be defined before using it as a shared
  # variant.  The _product_counter algorithm should be moved into a separate
  # tag-assigning pass, and shared between gen_python.py and gen_cpp.py.
  CompoundWord = (List[word_part] parts)

  # Source location for errors
  loc =
    Missing  # equivalent of runtime.NO_SPID
  | Token %Token
    # Very common case: argv arrays need original location
  | ArgWord %CompoundWord
  | WordPart(word_part p)
  | Word(word w)
  | Arith(arith_expr a)
    # e.g. for errexit blaming
  | Command(command c)

  debug_frame =
    Main(str dollar0)
    # call_tok => BASH_LINENO
    # call_tok may be None with new --source flag?
  | Source(Token? call_tok, str source_name)
    # def_tok => BASH_SOURCE
    # call_tok may be None if invoked via RunFuncForCompletion?
  | Call(Token? call_tok, Token def_tok, str func_name)

  #
  # Shell language
  #

  bracket_op =
    WholeArray(id op_id)  # * or @
  | ArrayIndex(arith_expr expr)

  suffix_op =
    Nullary %Token  # ${x@Q} or ${!prefix@} (which also has prefix_op)
  | Unary(Token op, rhs_word arg_word)  # e.g. ${v:-default}
    # TODO: Implement YSH ${x|html} and ${x %.3f}
  | Static(Token tok, str arg)
  | PatSub(CompoundWord pat, rhs_word replace, id replace_mode, Token slash_tok)
    # begin is optional with ${array::1}
  | Slice(arith_expr? begin, arith_expr? length)

  BracedVarSub = (
      Token left,        # in dynamic ParseVarRef, same as name_tok
      Token token,       # location for the name
      str var_name,      # the name
      Token? prefix_op,  # prefix # or ! operators
      bracket_op? bracket_op,
      suffix_op? suffix_op,
      Token right        # in dynamic ParseVarRef, same as name_tok
  )

  # Variants:
  # - Look at left token ID for $'' c'' vs r'' '' e.g. Id.Left_DollarSingleQuote
  # - And """ and ''' e.g. Id.Left_TDoubleQuote
  DoubleQuoted = (Token left, List[word_part] parts, Token right)
  SingleQuoted = (Token left, List[Token] tokens, Token right)

  SimpleVarSub = (Token left, str var_name)

  CommandSub = (Token left_token, command child, Token right)

  # - can contain word.BracedTree
  # - no 'Token right' for now, doesn't appear to be used
  ShArrayLiteral = (Token left, List[word] words, Token right)

  # Unevaluated, typed arguments for func and proc.
  # Note that ...arg is expr.Spread.
  ArgList = (
    Token left, List[expr] pos_args, List[NamedArg] named_args, Token right
  )

  # One key=value entry of a BashAssocLiteral
  AssocPair = (CompoundWord key, CompoundWord value)

  word_part =
    ShArrayLiteral %ShArrayLiteral
  | BashAssocLiteral(Token left, List[AssocPair] pairs, Token right)
  | Literal %Token
    # escaped case is separate so the evaluator doesn't have to check token ID
  | EscapedLiteral(Token token, str ch)
  | SingleQuoted %SingleQuoted
  | DoubleQuoted %DoubleQuoted
  | SimpleVarSub %SimpleVarSub
  | BracedVarSub %BracedVarSub
    # For command sub and process sub: $(...)  <(...)  >(...)
  | CommandSub %CommandSub
    # ~ or ~bob
  | TildeSub(Token token, str? user_name)
  | ArithSub(Token left, arith_expr anode, Token right)
    # {a,b,c}
  | BracedTuple(List[CompoundWord] words)
    # {1..10} or {-5..10..2} or {01..10} (leading zeros matter)
    # {a..f} or {a..f..2} or {a..f..-2}
    # the whole range is one Token
  | BracedRange(Token blame_tok, id kind, str start, str end, int step)
    # note: optional int may need special handling in ASDL
    # extended globs are parsed statically, unlike globs
  | ExtGlob(Token op, List[CompoundWord] arms, Token right)

    # YSH word_part extensions

    # @myarray
  | Splice(Token blame_tok, str var_name)
    # $[d.key], etc.
  | ExprSub(Token left, expr child, Token right)

  # Use cases for Empty: RHS of 'x=', the argument in "${x:-}".
  # The latter is semantically necessary.  (See osh/word_parse.py).
  # At runtime: RHS of 'declare x='.
  rhs_word = Empty | Compound %CompoundWord

  word =
    # Returns from WordParser, but not generally stored in LST
    Operator %Token
    # A Compound word can contain any word_part except the Braced*Part.
    # We could model this with another variant type but it incurs runtime
    # overhead and seems like overkill.  Note that DoubleQuoted can't
    # contain a SingleQuoted, etc. either.
  | Compound %CompoundWord
    # For word sequences command.Simple, ShArrayLiteral, for_iter.Words
    # Could be its own type
  | BracedTree(List[word_part] parts)
    # For dynamic parsing of test aka [ - the string is already evaluated.
  | String(id id, str s, CompoundWord? blame_loc)

  # Note: the name 'foo' is derived from token value 'foo=' or 'foo+='
  sh_lhs_expr =
    Name(Token left, str name)
  | IndexedName(Token left, str name, arith_expr index)
  | UnparsedIndex(Token left, str name, str index)  # for translation

  arith_expr =
    VarSub %SimpleVarSub  # e.g. $(( x ))
  | Word %CompoundWord    # e.g. $(( 123'456'$y ))

  | UnaryAssign(id op_id, arith_expr child)
  | BinaryAssign(id op_id, arith_expr left, arith_expr right)

  | Unary(id op_id, arith_expr child)
    # TODO: op should be token, e.g. for divide by zero
  | Binary(id op_id, arith_expr left, arith_expr right)
  | TernaryOp(arith_expr cond, arith_expr true_expr, arith_expr false_expr)

  bool_expr =
    WordTest(word w)  # e.g. [[ myword ]]
  | Binary(id op_id, word left, word right)
  | Unary(id op_id, word child)
  | LogicalNot(bool_expr child)
  | LogicalAnd(bool_expr left, bool_expr right)
  | LogicalOr(bool_expr left, bool_expr right)

  redir_loc =
    Fd(int fd) | VarName(str name)

  redir_param =
    Word %CompoundWord
  | HereDoc(word here_begin,      # e.g. EOF or 'EOF'
            Token? here_end_tok,  # Token consisting of the whole line
                                  # It's always filled in AFTER creation, but
                                  # temporarily so optional
            List[word_part] stdin_parts  # one for each line
           )

  Redir = (Token op, redir_loc loc, redir_param arg)

  assign_op = Equal | PlusEqual
  AssignPair = (Token left, sh_lhs_expr lhs, assign_op op, rhs_word rhs)
  EnvPair = (Token left, str name, rhs_word val)

  condition =
    Shell(List[command] commands)  # if false; true; then echo hi; fi
  | YshExpr(expr e)                # if (x > 0) { echo hi }
    # TODO: add more specific blame location

  # Each arm tests one word against multiple words
  # shell:  *.cc|*.h) echo C++ ;;
  # YSH:    *.cc|*.h { echo C++ }
  #
  # Three location tokens:
  # 1. left   - shell has ( or *.cc    ysh has *.cc
  # 2. middle - shell has )            ysh has {
  # 3. right  - shell has optional ;;  ysh has required }
  #
  # For YSH typed case, left can be ( and /
  # And case_pat may contain more details
  CaseArm = (
      Token left, pat pattern, Token middle, List[command] action,
      Token? right
  )

  # The argument to match against in a case command
  # In YSH-style case commands we match against an `expr`, but in sh-style case
  # commands we match against a word.
  case_arg =
    Word(word w)
  | YshExpr(expr e)

  pat =
    Else
  | Words(List[word] words)
  | YshExprs(List[expr] exprs)
  | Eggex(re eggex)

  # Each if arm starts with either an "if" or "elif" keyword
  # In YSH, the then keyword is not used (replaced by braces {})
  IfArm = (
    Token keyword, condition cond, Token? then_kw, List[command] action,
    List[int] spids)

  for_iter =
    Args  # for x; do echo $x; done  # implicit "$@"
  | Words(List[word] words)  # for x in 'foo' *.py { echo $x }
    # like ShArrayLiteral, but no location for %(
  | YshExpr(expr e, Token blame)  # for x in (mylist) { echo $x }

  BraceGroup = (
      Token left, Token? doc_token, List[command] children,
      List[Redir] redirects, Token right
  )

  # Retain references to lines
  BlockArg = (BraceGroup brace_group, List[SourceLine] lines)

  command =
    NoOp
  | Simple(Token? blame_tok,  # TODO: make required (BracedTuple?)
           List[EnvPair] more_env,
           List[word] words, List[Redir] redirects,
           ArgList? typed_args, BlockArg? block,
           # do_fork is semantic, not syntactic
           bool do_fork)
    # This doesn't technically belong in the LST, but it's convenient for
    # execution
  | ExpandedAlias(command child, List[Redir] redirects, List[EnvPair] more_env)
  | Sentence(command child, Token terminator)
    # Represents "bare assignment"
    # Token left is redundant with pairs[0].left
  | ShAssignment(Token left, List[AssignPair] pairs, List[Redir] redirects)
  | Retval(Token keyword, expr val)
  | ControlFlow(Token keyword, word? arg_word)
    # ops are | |&
  | Pipeline(Token? negated, List[command] children, List[Token] ops)
    # ops are && ||
  | AndOr(List[command] children, List[Token] ops)
    # Part of for, while, until (but not if, case, ShFunction).  No redirects.
  | DoGroup(Token left, List[command] children, Token right)
    # A brace group is a compound command, with redirects.
  | BraceGroup %BraceGroup
    # Contains a single child, like CommandSub
  | Subshell(Token left, command child, Token right, List[Redir] redirects)
  | DParen(Token left, arith_expr child, Token right, List[Redir] redirects)
  | DBracket(Token left, bool_expr expr, Token right, List[Redir] redirects)
    # up to 3 iterations variables
  | ForEach(Token keyword, List[str] iter_names, for_iter iterable,
            Token? semi_tok, command body, List[Redir] redirects)
    # C-style for loop.  Any of the 3 expressions can be omitted.
    # Note: body is required, but only optional here because of initialization
    # order.
  | ForExpr(Token keyword, arith_expr? init, arith_expr? cond,
            arith_expr? update, command? body, List[Redir] redirects)
  | WhileUntil(Token keyword, condition cond, command body, List[Redir] redirects)
  | If(Token if_kw, List[IfArm] arms, Token? else_kw, List[command] else_action,
       Token? fi_kw, List[Redir] redirects)
  | Case(Token case_kw, case_arg to_match, Token arms_start, List[CaseArm] arms,
         Token arms_end, List[Redir] redirects)
    # The keyword is optional in the case of bash-style functions
    # (ie. "foo() { ... }") which do not have one.
  | ShFunction(Token? keyword, Token name_tok, str name, command body)
  | TimeBlock(Token keyword, command pipeline)
    # Some nodes optimize it out as List[command], but we use CommandList for
    # 1. the top level
    # 2. ls ; ls & ls  (same line)
    # 3. CommandSub  # single child that's a CommandList
    # 4. Subshell  # single child that's a CommandList
  | CommandList(List[command] children)

    # YSH command constructs

    # For 'x = myexpr'.  There's no type and no comma allowed.
  | BareDecl(Token lhs, expr rhs)
    # var, const
  | VarDecl(Token? keyword, List[NameType] lhs, expr rhs)
    # setvar, maybe 'auto' later
  | PlaceMutation(Token keyword, List[place_expr] lhs, Token op, expr rhs)
    # = keyword
  | Expr(Token keyword, expr e)
  | Proc(Token keyword, Token name, proc_sig sig, command body)
  | Func(Token keyword, Token name,
         List[Param] pos_params, RestParam? rest_of_pos,
         List[Param] named_params, RestParam? rest_of_named,
         command body)

    # Tea

  | TeaFunc(Token name,
            List[Param] pos_params, RestParam? pos_splat,
            List[Param] named_params, RestParam? named_splat,
            List[TypeExpr] return_types, command body)
  | Data(Token name, List[Param] params)
  | Enum(Token name, List[Variant] variants)
  | Class(Token name, Token? extends, List[class_item] items)
  | Import(SingleQuoted path, Token? alias, List[ImportName] names)
  | For(List[NameType] targets, expr iterable, command body)
  | While(expr test, command body)
  | Break | Continue
  | Return(expr? value)

  # Binary(x expr, y expr) or Nullary %Token
  # In the first case we have a tag, and an anonymous type.
  variant_type =
    Anon(List[Param] params)
  | Ref(Token type_name)

  # tags and types are separate in our model of algebraic data types
  Variant = (Token tag_name, variant_type? typ)

  class_item =
    Data(Token keyword, List[NameType] fields)
  | Method()

  ImportName = (Token name, Token? alias)

  Param = (Token blame_tok, str name, TypeExpr? type, expr? default_val)
  RestParam = (Token blame_tok, str name)

  # 'open' is for proc p { }; closed is for proc p () { }
  proc_sig =
    Open
  | Closed(List[Param] word_params, RestParam? rest_of_words,
           List[Param] pos_params, RestParam? rest_of_pos,
           List[Param] named_params, RestParam? rest_of_named,
           RestParam? block_param)

  #
  # Glob representation, for converting ${x//} to extended regexes.
  #

  # Example: *.[ch] is:
  #   GlobOp(<Glob_Star '*'>),
  #   GlobLit(Glob_OtherLiteral, '.'),
  #   CharClass(False, ['ch'])  # from Glob_CleanLiterals token

  glob_part =
    Literal(id id, str s)
  | Operator(id op_id)  # * or ?
  | CharClass(bool negated, List[str] strs)

  # Char classes are opaque for now.  If we ever need them:
  # - Collating symbols are [. .]
  # - Equivalence classes are [= =]

  printf_part =
    Literal(Token token)
    # flags are 0 hyphen space + #
    # type is 's' for %s, etc.
  | Percent(List[Token] flags, Token? width, Token? precision, Token type)

  #
  # YSH Language
  #
  # Copied and modified from Python-3.7/Parser/Python.asdl !

  expr_context = Load | Store | Del | AugLoad | AugStore | Param

  # Type expressions:   Int   List[Int]   Dict[Str, Any]
  # Do we have Func[Int, Int => Int] ?  I guess we can parse that into this
  # system.
  TypeExpr = (Token tok, str name, List[TypeExpr] params)

  # LHS binding in loops, list comprehensions, and var/const
  NameType = (Token name, TypeExpr? typ)

  # TODO: Inline this into GenExp and ListComp?  Just use a flag there?
  Comprehension = (List[NameType] lhs, expr iter, expr? cond)

  # Named arguments supplied to call.  Token is null for f(; ...named).
  NamedArg = (Token? name, expr value)

  # Subscripts are lists of expressions
  #   a[:i, n]  (we don't have matrices, but we have data frames)
  Subscript = (expr obj, expr index)

  # Attributes are obj.attr, d->key, name::scope,
  Attribute = (expr obj, Token op, Token attr, expr_context ctx)

  # Places that can be mutated.
  place_expr =
    Var(Token name)  # TODO: add str var_name
  | Subscript %Subscript
  | Attribute %Attribute

  expr =
    # a variable name to evaluate
    Var(Token name)  # TODO: add str var_name
    # For null, Bool, Int, Float
    # Python uses Num(object n), which doesn't respect our "LST" invariant.
  | Const(Token c)
    # @(one 'two' "$three")
  | ShArrayLiteral %ShArrayLiteral
    # @[a b c] @[1 2 3] @[(1+1) (2+2)]
  | RegexLiteral(Token left, re regex, List[Token] flags, Token? trans_pref)

  | SimpleVarSub %SimpleVarSub
  | BracedVarSub %BracedVarSub
  | CommandSub %CommandSub
  | SingleQuoted %SingleQuoted
  | DoubleQuoted %DoubleQuoted

  | BlockArg %BlockArg

  | Lambda(List[NameType] params, expr body)

  | Unary(Token op, expr child)
  | Binary(Token op, expr left, expr right)
    # x < 4 < 3 and (x < 4) < 3
  | Compare(expr left, List[Token] ops, List[expr] comparators)
  | FuncCall(expr func, ArgList args)

    # TODO: Need a representation for method call.  We don't just want
    # Attribute() and then Call()

  | IfExp(expr test, expr body, expr orelse)
  | Tuple(List[expr] elts, expr_context ctx)

  | List(List[expr] elts, expr_context ctx)
  | Dict(List[expr] keys, List[expr] values)
    # For the values in {n1, n2}
  | Implicit

  | ListComp(expr elt, List[Comprehension] generators)
    # not implemented
  | DictComp(expr key, expr value, List[Comprehension] generators)
  | GeneratorExp(expr elt, List[Comprehension] generators)

    # Ranges are written 1:2, with first class expression syntax.  There is no
    # step as in Python.  Use range(0, 10, step=2) for that.
  | Range(expr lower, expr upper)

    # Slices occur within [] only.  Unlike ranges, the start/end can be
    # implicit.  Like ranges, denote a step with slice(0, 10, step=2).
    #   a[3:]   a[:i]
  | Slice(expr? lower, expr? upper)

  | Subscript %Subscript
  | Attribute %Attribute

    # Ellipsis is like 'Starred' within Python, which are valid on the LHS in
    # Python for unpacking, and within list literals for splicing.
    # (Starred is NOT used for {k:v, **a}.  That used a blank "keys"
    # attribute.)

    # I think we can use { **pairs } like Python
  | Spread(expr child, expr_context ctx)

  #
  # Regex Language (Eggex)
  #

  # e.g. alnum digit
  PosixClass = (Token? negated, str name)
  # e.g. d w s
  PerlClass = (Token? negated, str name)

  # Note: .NET has && in character classes, making it a recursive language

  class_literal_term =
    PosixClass %PosixClass
  | PerlClass %PerlClass
    # [a-z]   ~[a-z]   TODO: Doesn't respect LST invariant

  | Range(Token start, Token end)
  | CharLiteral(Token tok)

  | SingleQuoted %SingleQuoted
    # @chars
  | Splice(Token name, str var_name)

  # Char Sets and Ranges both use Char Codes
  # with u_braced == true : \u{ff}
  # with u_braced == false: \xff \\ 'a' a '0' 0
  # ERE doesn't make a distinction, but compiling to Python/PCRE can use it
  CharCode = (int i, bool u_braced, Token blame_tok)

  # evaluated version of class_literal_term (could be in runtime.asdl)
  char_class_term =
    PosixClass %PosixClass
  | PerlClass %PerlClass

  | Range(CharCode start, CharCode end)

    # For [ \x00 \\ ]
  | CharCode %CharCode

  # NOTE: modifier is unused now, can represent L or P
  re_repeat =
    Op(Token op)
  | Num(Token times)
    # dot{1,2}
  | Range(Token? lower, Token? upper)
    # Haven't implemented the modifier, e.g. x{+ P}
    # | Num(Token times, id modifier)
    # | Range(Token? lower, Token? upper, id modifier)

  re =
    # e.g.  .  ^  $  %begin  \u123
    Token %Token
  | PosixClass %PosixClass
  | PerlClass %PerlClass
    # syntax [ $x \n ]
  | CharClassLiteral(bool negated, List[class_literal_term] terms)
    # evaluated [ 'abc' \n ]
  | CharClass(bool negated, List[char_class_term] terms)

    # @D
  | Splice(Token name, str var_name)

  | SingleQuoted %SingleQuoted

    # Compound:
  | Repeat(re child, re_repeat op)
  | Seq(List[re] children)
  | Alt(List[re] children)

  | Group(re child)
    # TODO: <d+ : month Int> needs Token? type field
  | Capture(re child, Token? var_name)
  | Backtracking(bool negated, Token name, re child)

    # These nodes are never parsed; they're part of execution.
    # Right now we do them in _EvalRegex, though many transformations could be
    # done as constant evaluation.

  | Primitive(id id)  # . dot etc.
    # String substitutions are evaluated into literals
  | LiteralChars(str s, Token blame_tok)
}