1 # Grammar for Oil.
2 # Adapted from the Python 3.7 expression grammar, with several changes!
3 #
4 # TODO:
5 # - funcs in expression context, maybe disable lambdas like |x| x+1 for now?
6 # - Are tuples and heterogeneous lists in Oil or Tea?
7 #
8 # Tea also needs:
9 # - cast expressions
10 # - although cast(Int, foo) works I suppose. It feels like it has a runtime
11 # cost
12 #
13 # - What about list comprehensions?
14 # - I think this could be replaced with implicit vectorization, like
15 # @len(x) or len.(x) ? It's shorter, but it maps and doesn't filter.
16 # - Generator expressions?
17
18 # Note: trailing commas are allowed:
19 # {k: mydict,}
20 # [mylist,]
21 # mytuple,
22 # f(args,)
23 # func f(params,)
24 #
25 # Kinds used:
26 # VSub, Left, Right, Expr, Op, Arith, Char, Eof, Unknown
27
28 # Oil patch: removed @= **= //=
29 # We're missing div= and xor=, which now look weird. ^= is
30 # exponentiation. |= has a use case.
31 augassign: ( # augmented-assignment operators; used by oil_place_mutation and the 'set' form of func_item
32 '+=' | '-=' | '*=' | '/=' | '%=' | '&=' | '|=' | '^=' | '<<=' | '>>='
33 )
34 # For normal and annotated assignments, additional restrictions enforced by the interpreter
35
36 test: or_test ['if' or_test 'else' test] | lambdef # an or_test with an optional 'if'/'else' tail (the 'else' branch recurses into test), or a lambda
37
38 # We follow the same rules as Python lambdas:
39 #
40 # |x| 1, 2 == (|x| 1), 2
41 # |x| x if True else 42 == |x| (x if True else 42)
42 #
43 # Python also had a test_nocond production like this: We don't need it because
44 # we can't have multiple ifs.
45 # [x for x in range(3) if lambda x: x if 1]
46 #
47 # The zero arg syntax like || 1 annoys me -- but this also works:
48 # func() { return 1 }
49 #
50 # We used name_type_list rather than param_group because a default value like
51 # x|y (bitwise or) conflicts with the | delimiter!
52
53 lambdef: '|' [name_type_list] '|' test # e.g. |x| x+1; the parameter list may be empty, and the body is a full 'test'
54
55 or_test: and_test ('or' and_test)* # 'or' binds looser than 'and'
56 and_test: not_test ('and' not_test)* # 'and' binds looser than 'not'
57 not_test: 'not' not_test | comparison # prefix 'not' may be repeated
58 comparison: range_expr (comp_op range_expr)* # zero or more comp_op links between range_expr operands
59
60 # Here the beginning and end are required
61 range_expr: expr [':' expr] # a plain expr, or begin ':' end with both sides present
62
63 # Oil patch: removed legacy <>, use === and !==
64 comp_op: ( # operators accepted between the operands of 'comparison'; 'not' 'in' and 'is' 'not' are two tokens each
65 '<'|'>'|'==='|'>='|'<='|'!=='|'in'|'not' 'in'|'is'|'is' 'not'|
66 '~' | '!~' | '~~' | '!~~' | '~=='
67 )
68
69 # For lists and dicts. Note: In Python this was star_expr *foo
70 splat_expr: '...' expr # prefix splat; referenced by testlist_comp, i.e. inside ( ) and [ ] atoms
71
72 expr: xor_expr ('|' xor_expr)* # top of the binary-operator chain: each rule below binds tighter than the one above
73 xor_expr: and_expr ('^' and_expr)*
74 and_expr: shift_expr ('&' shift_expr)*
75 shift_expr: arith_expr (('<<'|'>>') arith_expr)*
76 # Oil: add concatenation with same precedence as +
77 arith_expr: term (('+'|'-'|'++') term)*
78 # Oil patch: removed '@'
79 term: factor (('*'|'/'|'//'|'%') factor)*
80 factor: ('+'|'-'|'~') factor | power # unary prefix operators, may be repeated
81 # Oil patch: removed Python 3 'await'
82 power: atom trailer* ['**' factor] # the right operand of '**' is a 'factor', so it may carry a unary prefix and nests to the right
83
84 testlist_comp: (test|splat_expr) ( comp_for | (',' (test|splat_expr))* [','] ) # one item followed by either a comp_for clause or a comma-separated tail with optional trailing comma
85
86 atom: ( # the primary operands of an expression; 'power' attaches trailers to these
87 '(' [testlist_comp] ')'
88 | '[' [testlist_comp] ']'
89 # Note: newlines are significant inside {}, unlike inside () and []
90 | '{' [Op_Newline] [dict] '}'
91 | '/' regex [re_flags] '/' # regex literal; same token sequence as pat_eggex
92 # NOTE: These atoms are allowed in typed array literals
93 | Expr_Name | Expr_Null | Expr_True | Expr_False
94 # TODO: Allow suffixes on floats and decimals?
95 # You could frame it as multiplication, so 100 M is 100 * M, where
96 # M = 1_000_000
97 | Expr_Float | Expr_DecInt | Expr_BinInt | Expr_OctInt | Expr_HexInt
98
99 | Char_OneChar # char literal \n \\ etc.
100 | Char_UBraced # char literal \u{3bc}
101 | Char_Pound # char literal #'A' etc.
102
103 | dq_string | sq_string
104 # Expr_Symbol could be %mykey
105
106 # $foo is disallowed, but $? is allowed. Should be "$foo" to indicate a
107 # string, or ${foo:-}
108 | simple_var_sub
109 | sh_command_sub | braced_var_sub
110 | sh_array_literal
111 | old_sh_array_literal
112 # Anonymous function. Is this only in Tea mode?
113 | 'func' tea_func
114 )
115
116 # Tea can run a limited form of procs. The first word must be a name, and NO
117 # BARE WORDS.
118 #
119 # Example:
120 # log "hello $name" # valid in OSH, Oil, Tea
121 # myproc $(hostname) # ditto
122 #
123 # my-proc '/' $foo # OSH and Oil
124 # run 'my-proc' '/' $foo # Tea. 'run' is similar to 'command' and 'builtin'
125 #
126
127 tea_word: ( # a quoted string or a substitution -- no bare words; referenced by the last alternative of func_item
128 dq_string | sq_string
129 | sh_command_sub | braced_var_sub | simple_var_sub
130 )
131
132 # var f = f(x)
133 trailer: ( # suffixes that 'power' may repeat after an atom: call, subscript, and three name-access forms
134 '(' [arglist] ')'
135 | '[' subscriptlist ']'
136
137 # TODO:
138 # - {} %() :() trailers for passing a single arg that's a collection.
139 # - and maybe %"" too
140 # Lazy evaluation:
141 # - f %(a b c) could be f(%(a b c))
142 # - f {a=1, b=2} could be f({a=1, b=2})
143 # - although we might want that for Point {x: 1, y: 2}
144 # - f :(x, y) could be equivalent to f(:[x], :[y])
145
146 | '.' Expr_Name
147 | '->' Expr_Name
148 | '::' Expr_Name
149 )
150
151 # Oil patch: this is 'expr' instead of 'test'
152 # - 1:(3<4) doesn't make any sense.
153 # - And then this allows us to support a[3:] and a[:i] as special cases.
154 # - First class slices have to be written 0:n.
155
156 subscriptlist: subscript (',' subscript)* [','] # comma-separated subscripts, trailing comma allowed
157
158 # TODO: Add => as low precedence operator, for Func[Str, Int => Str]
159 subscript: expr | [expr] ':' [expr] # an index, or a slice where either bound may be omitted
160
161 # TODO: => should be even lower precedence here too
162 testlist: test (',' test)* [','] # comma-separated tests, trailing comma allowed
163
164 # Dict syntax resembles JavaScript
165 # https://stackoverflow.com/questions/38948306/what-is-javascript-shorthand-property
166 #
167 # Examples:
168 # {age: 20} is like {'age': 20}
169 #
170 # x = 'age'
171 # d = %{[x]: 20} # Evaluate x as a variable
172 # d = %{["foo$x"]: 20} # Another expression
173 # d = %{[x, y]: 20} # Tuple key
174 # d = %{key1, key2: 123}
175 # Notes:
176 # - Value is optional when the key is a name, because it can be taken from the
177 # environment.
178 # - We don't have:
179 # - dict comprehensions. Maybe wait until LR parsing?
180 # - Splatting with **
181
182 dict_pair: ( # one dict entry; only a bare-name key may omit its ':' value
183 Expr_Name [':' test]
184 | '[' testlist ']' ':' test # bracketed key expression(s)
185 | sq_string ':' test
186 | dq_string ':' test
187 )
188
189 dict: dict_pair (comma_newline dict_pair)* [comma_newline] # entries separated by comma_newline, trailing separator allowed
190
191 # This is how Python implemented dict comprehensions. We can probably do the
192 # same.
193 #
194 # dictorsetmaker: ( ((test ':' test | '**' expr)
195 # (comp_for | (',' (test ':' test | '**' expr))* [','])) |
196 # ((test | splat_expr)
197 # (comp_for | (',' (test | splat_expr))* [','])) )
198
199 # The reason that keywords are test nodes instead of NAME is that using NAME
200 # results in an ambiguity. ast.c makes sure it's a NAME.
201 # "test '=' test" is really "keyword '=' test", but we have no such token.
202 # These need to be in a single rule to avoid grammar that is ambiguous
203 # to our LL(1) parser. Even though 'test' includes '*expr' in splat_expr,
204 # we explicitly match '*' here, too, to give it proper precedence.
205 # Illegal combinations and orderings are blocked in ast.c:
206 # multiple (test comp_for) arguments are blocked; keyword unpackings
207 # that precede iterable unpackings are blocked; etc.
208
209 argument: ( # one call-site argument: positional (optionally with comp_for), named, or splat
210 test [comp_for]
211 # named arg
212 | test '=' test
213 # splat. Note we're using prefix syntax to be consistent with Python, JS,
214 # and the prefix @ operator.
215 | '...' test
216 )
217
218 # The grammar at call sites is less restrictive than at declaration sites.
219 # ... can appear anywhere. Keyword args can appear anywhere too.
220 arglist: argument (',' argument)* [','] [';' argument (',' argument)* [',']] # one argument group, optionally followed by ';' and a second group; each allows a trailing comma
221
222
223 # Oil patch: test_nocond -> or_test. I believe this was trying to prevent the
224 # "double if" ambiguity here:
225 #
226 # [x for x in range(3) if lambda x: x if 1]
227 #
228 # but Oil doesn't support "nested loops", so we don't have this problem.
229 comp_for: 'for' name_type_list 'in' or_test ['if' or_test] # a single 'for' clause with at most one 'if' filter; it cannot be followed by another comp_for
230
231
232 #
233 # Expressions that are New in Oil
234 #
235
236 # Notes:
237 # - Most of these occur in 'atom' above
238 # - You can write $mystr but not mystr. It has to be (mystr)
239 array_item: ( # literals, strings, substitutions, or a parenthesized test; no rule visible in this file references it -- NOTE(review): presumably a parser entry point, confirm
240 Expr_Null | Expr_True | Expr_False
241 | Expr_Float | Expr_DecInt | Expr_BinInt | Expr_OctInt | Expr_HexInt
242 | dq_string | sq_string
243 | sh_command_sub | braced_var_sub | simple_var_sub
244 | '(' test ')'
245 )
246 sh_array_literal: ':|' Expr_CastedDummy Op_Pipe # NOTE(review): Expr_CastedDummy presumably stands in for content parsed outside this grammar -- confirm
247
248 # TODO: remove this
249 old_sh_array_literal: '%(' Expr_CastedDummy Right_ShArrayLiteral
250 sh_command_sub: ( '$(' | '@(' | '^(' ) Expr_CastedDummy Eof_RParen # three opening tokens share one closing token
251
252 # Note: could add c"" too
253 dq_string: (Left_DoubleQuote | Left_TDoubleQuote) Expr_CastedDummy Right_DoubleQuote
254 sq_string: ( # six opening-quote tokens, all closed by Right_SingleQuote
255 Left_SingleQuote | Left_RSingleQuote | Left_DollarSingleQuote |
256 Left_TSingleQuote | Left_RTSingleQuote | Left_DollarTSingleQuote
257 ) Expr_CastedDummy Right_SingleQuote
258
259 braced_var_sub: '${' Expr_CastedDummy Right_DollarBrace
260
261 simple_var_sub: ( # a single unbraced variable-substitution token
262 # This is everything in Kind.VSub except VSub_Name, which is braced: ${foo}
263 #
264 # Note: we could allow $foo and $0, but disallow the rest in favor of ${@}
265 # and ${-}? Meh it's too inconsistent.
266 VSub_DollarName | VSub_Number
267 | VSub_Bang | VSub_At | VSub_Pound | VSub_Dollar | VSub_Star | VSub_Hyphen
268 | VSub_QMark
269 # NOTE: $? should be STATUS because it's an integer.
270 )
271
272 #
273 # Assignment / Type Variables
274 #
275 # Several differences vs. Python:
276 #
277 # - no yield expression on RHS
278 # - no star expressions on either side (Python 3) *x, y = 2, *b
279 # - no multiple assignments like: var x = y = 3
280 # - type annotation syntax is more restrictive # a: (1+2) = 3 is OK in python
281 # - We're validating the lvalue here, instead of doing it in the "transformer".
282 # We have the 'var' prefix which helps.
283
284 # name_type use cases:
285 # for x Int, y Int
286 # [x for x Int, y Int in ...]
287 # var x Int, y Int = 3, 5
288 # func(x Int, y Int)
289 name_type: Expr_Name [type_expr] # a name with an optional type, e.g. x Int
290 name_type_list: name_type (',' name_type)* # comma-separated; unlike testlist, no trailing comma is accepted
291
292 type_expr: Expr_Name [ '[' type_expr (',' type_expr)* ']' ] # a type name with optional bracketed, comma-separated type parameters (recursive)
293
294 # NOTE: Eof_RParen and Eof_Backtick aren't allowed because we don't want 'var'
295 # in command subs.
296 end_stmt: '}' | ';' | Op_Newline | Eof_Real # tokens that terminate oil_var_decl, oil_place_mutation and command_expr
297
298 oil_var_decl: name_type_list '=' testlist end_stmt # the declaration keyword is not part of this rule; it begins at the first name
299
300 # Note: this production is more precise than the following, but it's ambiguous :(
301 #oil_setvar: place augassign testlist end_stmt
302 # | place_list '=' testlist end_stmt
303
304 # Note: for Oil (not Tea), we could accept [':'] expr for setvar :out = 'foo'
305 place_list: expr (',' expr)* # assignment targets are parsed as plain 'expr', so this rule alone does not restrict them to valid places
306 oil_place_mutation: place_list (augassign | '=') testlist end_stmt # one rule for both plain and augmented assignment
307
308 # For $stringfunc(x, y=1) and @arrayfunc(a, b='s')
309 oil_arglist: '(' [arglist] ')' # parenthesized argument list; same token sequence as the call form of 'trailer'
310
311 #
312 # Other Entry Points
313 #
314
315 # for if (x > 0) etc.
316 oil_expr: '(' testlist ')' # parenthesized expression list
317
318 # Example: for (a Int, b Int in expr) { ... }
319 oil_for: '(' name_type_list 'in' testlist ')'
320
321 # e.g. return 1 + 2 * 3
322 command_expr: testlist end_stmt # an expression list terminated by end_stmt
323
324 # $[d->key] etc.
325 oil_expr_sub: testlist ']' # only the closing ']' is matched here; the opening token is not part of this rule
326
327 # Signatures for proc and func.
328
329 # Note: 'proc name-with-hyphens' is allowed, so we can't parse the name in
330 # expression mode.
331 ysh_proc: ( # proc signature: an optional parenthesized list of up to four ';'-separated param groups, then the opening '{'
332 [ '('
333 [ param_group ] # word params, with defaults
334 [ ';' [ param_group ] ] # positional typed params, with defaults
335 [ ';' [ param_group ] ] # named params, with defaults
336 [ ';' [ param_group ] ] # optional block param, with no type or default
337
338 # This causes a pgen2 error? It doesn't know which branch to take
339 # So we have the extra {block} syntax
340 #[ ';' Expr_Name ] # optional block param, with no type or default
341 ')'
342 ]
343 '{' # opening { for pgen2
344 )
345
346 # YSH entry point
347 ysh_func: Expr_Name '(' [param_group] [';' param_group] ')' # function name, then at most two ';'-separated parameter groups
348
349 param: Expr_Name [type_expr] ['=' expr] # name, optional type, optional default; the default is an 'expr', not a 'test'
350
351 # This is an awkward way of writing that '...' has to come last.
352 param_group: ( # comma-separated params where only the last item may be a '...' rest name; trailing comma allowed
353 (param ',')*
354 [ (param | '...' Expr_Name) [','] ]
355 )
356
357 type_expr_list: type_expr (',' type_expr)* # comma-separated types, no trailing comma; used after the ')' in tea_func
358
359 # Note: It may make sense to have ; here, for named params only!
360 data_params: (param ',')* [ param [','] ] # like param_group but without the '...' rest form
361
362 # zero params allowed for consistency with func and class?
363 tea_data: Expr_Name '(' [data_params] ')'
364
365 # e.g. Nullary %Token or Nullary(x Int)
366 variant_type: Expr_Symbol | '(' data_params ')' # data_params is not bracketed as optional here, unlike in tea_data
367 variant: Expr_Name [ variant_type ] # a bare name is a valid variant
368
369 # for dict, tea_enum
370 comma_newline: ',' [Op_Newline] | Op_Newline # a comma, a newline, or a comma followed by a newline
371 # for tea_func, tea_class
372 semi_newline: ';' [Op_Newline] | Op_Newline # a semicolon, a newline, or a semicolon followed by a newline
373
374 #
375 # Experimental "Tea" stuff
376 #
377
378 tea_enum: ( # enum name followed by a braced variant list separated by comma_newline
379 Expr_Name '{' [Op_Newline]
380 # note: braces can be empty
381 [ variant (comma_newline variant)* [comma_newline] ]
382 '}'
383 )
384
385 suite: '{' [Op_Newline] [func_items] '}' # braced statement block; may be empty
386
387 func_item: ( # one statement inside a 'suite'
388 ('var' | 'const') name_type_list '=' testlist # oil_var_decl
389
390 # TODO: if/switch, with, try/except/throw, etc.
391 | 'while' test suite
392 | 'for' name_type_list 'in' test suite
393
394 # In Python, imports, assert, etc. also at this 'small_stmt' level
395 | 'break' | 'continue' | 'return' [testlist]
396
397 # TODO: accept setvar for consistency with Oil?
398 | 'set' place_list (augassign | '=') testlist # oil_place_mutation
399 # x f(x) etc.
400 #
401 # And x = 1. Python uses the same "hack" to fit within pgen2. It also
402 # supports a = b = 1, which we don't want.
403 #
404 # And echo 'hi' 'there'
405 #
406 # TODO: expr_to_ast needs to validate this
407 | testlist (['=' testlist] | tea_word*) # expression statement, assignment, or a command-like call with tea_word arguments
408 )
409
410 # we want to avoid requiring newline or ; before }
411 func_items: func_item (semi_newline func_item)* [semi_newline] # items separated by semi_newline, trailing separator optional
412
413 # This is anonymous
414 tea_func: ( # parameter groups, optional type_expr_list after the ')', then the body
415 '(' [param_group] [';' param_group] ')' [type_expr_list]
416 suite
417 )
418 named_func: Expr_Name tea_func # a tea_func preceded by its name
419
420 # TODO: Methods differ from functions:
421 # super() can be the first arg
422 # shortcut initializer: Parser(this.lexer) { }
423 # abstract, override, virtual
424 # should we allow annotations, like 'public' or 'export'?
425 #
426 # No field initializers for now. Later C++ versions allow it.
427 #
428 # Annotations:
429 #
430 # func Parse() Int
431 # [override const abstract] {
432 # } ?
433
434 class_item: ( # a method introduced by one of four keywords, or a 'var' member declaration
435 ('virtual' | 'override' | 'func' | 'abstract' ) Expr_Name tea_func
436 # Member declaration
437 | 'var' name_type_list
438 )
439
440 # Note: we could restrict separators to newlines.
441 # But then you couldn't do class Foo { var a; var b }
442 class_items: class_item (semi_newline class_item)* [semi_newline]
443
444 tea_class: Expr_Name [':' Expr_Name ] '{' [Op_Newline] [class_items] '}' # class name, optional ':' followed by a second name, then a braced body that may be empty
445
446 # 'import' can't use 'semi_newline' because ending with an unknown number of
447 # tokens doesn't compose with our CommandParser.
448 end_import: ';' | Op_Newline # exactly one terminator token, unlike semi_newline
449
450 import_name: Expr_Name ['as' Expr_Name] # one imported name, optionally followed by 'as' and a second name
451 import_names: import_name (comma_newline import_name)* [comma_newline] # comma_newline-separated names with an optional trailing separator, the same shape as 'dict'; the previous tail [import_name] accepted two names with no separator and rejected a trailing comma or newline before ')'
452
453 # TODO: Should we have a simpler Oil string literal?
454 tea_import: sq_string [ 'as' Expr_Name ] ['(' [Op_Newline] [import_names] ')'] end_import # single-quoted string, optional alias, optional parenthesized name list, then the terminator
455
456 # Top level:
457 # declarations of constants -- with const only?
458 # maybe only const?
459 # use, data, enum, class, func. That's it? OK.
460
461 end_outer: ';' [Op_Newline] | Op_Newline | Eof_Real # terminator for module_item alternatives; unlike end_stmt it does not accept '}'
462
463 module_item: ( # one top-level declaration; every alternative except 'import' ends with end_outer
464 # oil_var_decl, but no mutation
465 ('var' | 'const') name_type_list '=' testlist end_outer
466 | 'import' tea_import # TODO: needs Eof_Real
467 # Also 'export'
468 | 'class' tea_class end_outer
469 | 'data' tea_data end_outer
470 | 'enum' tea_enum end_outer
471 | 'func' Expr_Name tea_func end_outer
472
473 # Might need: typedef? Or typealias?
474 )
475
476 # Eof_Real is valid either before or after the newline
477 tea_module: [Op_Newline] module_item* [Eof_Real] # optional leading newline, any number of items, optional end-of-file token
478
479
480 #
481 # Regex Sublanguage
482 #
483
484 char_literal: Char_OneChar | Char_Hex | Char_UBraced # character-literal tokens usable in regexes; includes Char_Hex, which 'atom' does not list
485
486 # we allow a-z A-Z 0-9 as ranges, but otherwise they have to be quoted
487 # The parser enforces that they are single strings
488 range_char: Expr_Name | Expr_DecInt | sq_string | char_literal # one endpoint of a character-class range
489
490 # digit or a-z
491 # We have to do further validation of ranges later.
492 class_literal_term: ( # one item inside a [ ] character class
493 # NOTE: range_char has sq_string
494 range_char ['-' range_char ]
495 # splice a literal set of characters
496 | '@' Expr_Name
497 | '!' Expr_Name
498 # Reserved for [[.collating sequences.]] (Unicode)
499 | '.' Expr_Name
500 # Reserved for [[=character equivalents=]] (Unicode)
501 | '=' Expr_Name
502 # TODO: Do these char classes actually work in bash/awk/egrep/sed/etc.?
503
504 )
505 class_literal: '[' class_literal_term+ ']' # at least one term is required, so [] is not accepted
506
507 # NOTE: Here is an example of where you can put ^ in the middle of a pattern in
508 # Python, and it matters!
509 # >>> r = re.compile('.f[a-z]*', re.DOTALL|re.MULTILINE)
510 # >>> r.findall('z\nfoo\nbeef\nfood\n')
511 # ['\nfoo', 'ef', '\nfood']
512 # >>> r = re.compile('.^f[a-z]*', re.DOTALL|re.MULTILINE)
513 # >>> r.findall('z\nfoo\nbeef\nfood\n')
514 # ['\nfoo', '\nfood']
515
516 re_atom: ( # a single regex operand; re_alt optionally follows each one with a repeat_op
517 char_literal
518 # builtin regex like 'digit' or a regex reference like 'D'
519 | Expr_Name
520 # %begin or %end
521 | Expr_Symbol
522 | class_literal
523 # !digit or ![a-f]. Note ! %boundary could be \B in Python, but ERE
524 # doesn't have anything like that
525 | '!' (Expr_Name | class_literal)
526
527 # syntactic space for Perl-style backtracking
528 # !!REF 1 !!REF name
529 # !!AHEAD(d+) !!BEHIND(d+) !!NOT_AHEAD(d+) !!NOT_BEHIND(d+)
530 #
531 # Note: !! conflicts with history
532 | '!' '!' Expr_Name (Expr_Name | Expr_DecInt | '(' regex ')')
533
534 # Splice another expression
535 | '@' Expr_Name
536 # any %start %end are preferred
537 | '.' | '^' | '$'
538 # In a language-independent spec, backslashes are disallowed within 'sq'.
539 # Write it with char literals outside strings: 'foo' \\ 'bar' \n
540 #
541 # No double-quoted strings because you can write "x = $x" with 'x = ' @x
542 | sq_string
543
544 # grouping (non-capturing in Perl; capturing in ERE although < > is preferred)
545 | '(' regex ')'
546 # capturing group, with optional name
547 | '<' regex [':' name_type] '>'
548
549 # Might want this obscure conditional construct. Can't use C-style ternary
550 # because '?' is a regex operator.
551 #| '{' regex 'if' regex 'else' regex '}'
552
553 # Others:
554 # PCRE has (?R ) for recursion? That could be !RECURSE()
555 # Note: .NET has && in character classes, making it a recursive language
556 )
557
558 # e.g. a{3} a{3,4} a{3,} a{,4} but not a{,}
559 repeat_range: ( # accepts n / n, / n,m / ,m -- a lone ',' is not accepted
560 Expr_DecInt [',' [Expr_DecInt]]
561 | ',' Expr_DecInt
562 # the 'Expr_DecInt , Expr_DecInt' form is the fully expanded first alternative
563 )
564
565 repeat_op: ( # a bare quantifier, or a braced form with an optional leading name
566 '+' | '*' | '?'
567 # In PCRE, ?? *? +? {}? is lazy/nongreedy and ?+ *+ ++ {}+ is "possessive"
568 # We use N and P modifiers within {}.
569 # a{N +} a{P ?} a{P 3,4} a{P ,4}
570 | '{' [Expr_Name] ('+' | '*' | '?' | repeat_range) '}' # the grammar accepts any Expr_Name here; NOTE(review): presumably restricted to the N/P modifiers elsewhere -- confirm
571 )
572
573 re_alt: (re_atom [repeat_op])+ # a sequence of one or more atoms, each optionally repeated
574
575 regex: [re_alt] (('|'|'or') re_alt)* # alternation with '|' or 'or'; the first branch is optional, so a regex may be empty or start with an operator
576
577 # Patterns are the start of a case arm. Ie,
578 #
579 # case (foo) {
580 # (40 + 2) | (0) { echo number }
581 # ^^^^^^^^^^^^^^-- This is pattern
582 # }
583 #
584 # Due to limitations created from pgen2/cmd_parser interactions, we also parse
585 # the leading '{' token of the case arm body in pgen2. We do this to help pgen2
586 # figure out when to transfer control back to the cmd_parser. For more details
587 # see #oil-dev > Dev Friction / Smells.
588 #
589 # case (foo) {
590 # (40 + 2) | (0) { echo number }
591 # ^-- End of pattern/beginning of case arm body
592 # }
593
594 ysh_case_pat: ( # a case-arm pattern plus the '{' that opens the arm body
595 '(' (pat_else | pat_exprs) # the '(' is consumed here; pat_else and pat_exprs each consume the matching ')'
596 | pat_eggex
597 ) [Op_Newline] '{'
598
599 pat_else: 'else' ')'
600 pat_exprs: expr ')' [Op_Newline] ('|' [Op_Newline] '(' expr ')' [Op_Newline])* # '|'-separated parenthesized exprs; newlines are allowed around each '|'
601 pat_eggex: '/' regex [re_flags] '/' # same token sequence as the regex alternative of 'atom'
602
603 # e.g. /digit+ ; multiline !ignorecase/
604 #
605 # This can express translation preferences:
606 #
607 # / d+ ; %ERE / is '[[:digit:]]+'
608 # / d+ ; %python / is '\d+'
609 # / d+ ; ignorecase %python / is '(?i)\d+'
610
611 re_flag: ['!'] Expr_Name | Expr_Symbol # a name with an optional '!' prefix, or a %symbol
612 re_flags: ';' re_flag+ # a ';' followed by at least one flag
613
614 # Syntax reserved for PCRE/Python, but that's not in ERE:
615 #
616 # non-greedy a{N *}
617 # non-capturing :( digit+ )
618 # backtracking !REF 1 !AHEAD(d+)
619 #
620 # Legacy syntax:
621 #
622 # ^ and $ instead of %start and %end
623 # < and > instead of %start_word and %end_word
624 # . instead of dot
625 # | instead of 'or'