# Grammar for Oil.
# Adapted from the Python 3.7 expression grammar, with several changes!

# Note: trailing commas are allowed:
#   {k: mydict,}
#   [mylist,]
#   mytuple,
#   f(args,)
#   func f(params,)

# Oil patch: removed @= **= //=
# We're missing div= and xor=, which now look weird.  ^= is
# exponentiation.  |= has a use case.
augassign: (
    '+=' | '-=' | '*=' | '/=' | '%='
  | '&=' | '|=' | '^='
  | '<<=' | '>>='
)

# For normal and annotated assignments, additional restrictions enforced by
# the interpreter
test: or_test ['if' or_test 'else' test] | lambdef

# We follow the same rules as Python lambdas:
#
#   |x| 1, 2                ==    (|x| 1), 2
#   |x| x if True else 42   ==    |x| (x if True else 42)
#
# Python also had a test_nocond production like this:  We don't need it because
# we can't have multiple ifs.
#   [x for x in range(3) if lambda x: x if 1]
#
# The zero arg syntax like || 1 annoys me -- but this also works:
#   func() { return 1 }
#
# We used name_type_list rather than func_params because a default value like
# x|y (bitwise or) conflicts with the | delimiter!
lambdef: '|' [name_type_list] '|' test

or_test: and_test ('or' and_test)*
and_test: not_test ('and' not_test)*
not_test: 'not' not_test | comparison
comparison: range_expr (comp_op range_expr)*

# Here the beginning and end are required
range_expr: expr [':' expr]

# Oil patch: removed legacy <>
comp_op: (
    '<'|'>'|'=='|'>='|'<='|'!='|'in'|'not' 'in'|'is'|'is' 'not'|
    Arith_Tilde | Expr_NotTilde
)

# For lists and dicts.  Note: In Python this was star_expr *foo
splat_expr: '...' expr

# Binary operators, from lowest to highest precedence.
expr: xor_expr ('|' xor_expr)*
xor_expr: and_expr ('xor' and_expr)*
and_expr: shift_expr ('&' shift_expr)*
shift_expr: arith_expr (('<<'|'>>') arith_expr)*
arith_expr: term (('+'|'-') term)*

# Oil patch: removed '@' and // -> div, % -> mod
term: factor (('*'|'/'|'mod'|'div') factor)*
factor: ('+'|'-'|'~') factor | power

# Oil patch: ** -> ^
# Also removed Python 3 'await'
power: atom trailer* ['^' factor]

testlist_comp: (test|splat_expr) ( comp_for | (',' (test|splat_expr))* [','] )

atom: (
    '(' [testlist_comp] ')'
  | '[' [testlist_comp] ']'
  | '{' [dict] '}'
    # TODO: translation preference
  | '/' regex [re_flags] '/'
    # NOTE: These atoms are allowed in typed array literals
  | Expr_Name | Expr_Null | Expr_True | Expr_False
    # TODO: Allow suffixes on floats and decimals?  What about in arrays?
  | Expr_Float | Expr_DecInt | Expr_BinInt | Expr_OctInt | Expr_HexInt
  | dq_string | sq_string
  | sh_command_sub | braced_var_sub | simple_var_sub
  | sh_array_literal | array_literal
)

# var f = f(x)
trailer: (
    '(' [arglist] ')'
  | '[' subscriptlist ']'
    # TODO: {} @() @[] trailers for passing a single arg that's a collection.
    # %{} %() %"" too.
    # And [[ ]] for lazy evaluation.
  | '.' Expr_Name
  | '->' Expr_Name
  | '::' Expr_Name
)

# Oil patch: this is 'expr' instead of 'test'
# - 1:(3<4) doesn't make any sense.
# - And then this allows us to support a[3:] and a[:i] as special cases.
# - First class slices have to be written 0:n.
subscriptlist: subscript (',' subscript)* [',']

# TODO: Add => as low precedence operator, for Func[Str, Int => Str]
subscript: expr | [expr] ':' [expr]

# TODO: => should be even lower precedence here too
testlist: test (',' test)* [',']

# Dict syntax resembles JavaScript
# https://stackoverflow.com/questions/38948306/what-is-javascript-shorthand-property
#
# Examples:
#   {age: 20} is like {'age': 20}
#
#   x = 'age'
#   d = {[x]: 20}  # Evaluate x as a variable
#   d = {["foo$x"]: 20}  # Another expression
#   d = {[x, y]: 20}  # Tuple key
#   d = {key1, key1: 123}
# Notes:
# - Value is optional when the key is a name, because it can be taken from the
#   environment.
# - We don't have:
#   - dict comprehensions.  Maybe wait until LR parsing?
#   - Splatting with **
#   - I don't think we want set literals?  It might be @{} or %{} or #{}
dict_pair: (
    Expr_Name [':' test]
  | '[' testlist ']' ':' test
  | sq_string ':' test
  | dq_string ':' test
)

dict: dict_pair (',' dict_pair)* [',']

# This is how Python implemented dict comprehensions.  We can probably do the
# same.
#
# dictorsetmaker: ( ((test ':' test | '**' expr)
#                    (comp_for | (',' (test ':' test | '**' expr))* [','])) |
#                   ((test | splat_expr)
#                    (comp_for | (',' (test | splat_expr))* [','])) )

# The reason that keywords are test nodes instead of NAME is that using NAME
# results in an ambiguity. ast.c makes sure it's a NAME.
# "test '=' test" is really "keyword '=' test", but we have no such token.
# These need to be in a single rule to avoid grammar that is ambiguous
# to our LL(1) parser. Even though 'test' includes '*expr' in splat_expr,
# we explicitly match '*' here, too, to give it proper precedence.
# Illegal combinations and orderings are blocked in ast.c:
# multiple (test comp_for) arguments are blocked; keyword unpackings
# that precede iterable unpackings are blocked; etc.
argument: (
    test [comp_for]
    # named arg
  | test '=' test
    # splat.  Note we're using prefix syntax to be consistent with Python, JS,
    # and the prefix @ operator.
  | '...' test
)

# The grammar at call sites is less restrictive than at declaration sites.
# ... can appear anywhere.  Keyword args can appear anywhere too.
arglist: argument (',' argument)* [','] [';' argument (',' argument)* [',']]

# Oil patch: test_nocond -> or_test.  I believe this was trying to prevent the
# "double if" ambiguity here:
#
#   # [x for x in range(3) if lambda x: x if 1]
#
# but Oil doesn't support "nested loops", so we don't have this problem.
comp_for: 'for' name_type_list 'in' or_test ['if' or_test]


#
# Expressions that are New in Oil
#

# Notes:
# - Most of these occur in 'atom' above
# - You can write $mystr but not mystr.  It has to be (mystr)
array_item: (
    Expr_Null | Expr_True | Expr_False
  | Expr_Float | Expr_DecInt | Expr_BinInt | Expr_OctInt | Expr_HexInt
  | dq_string | sq_string
  | sh_command_sub | braced_var_sub | simple_var_sub
  | '(' test ')'
)
array_literal: '@[' array_item* Op_RBracket

sh_array_literal: '@(' Expr_CastedDummy Right_ShArrayLiteral
sh_command_sub: '$(' Expr_CastedDummy Eof_RParen

# Note: could add c"" too
dq_string: '"' Expr_CastedDummy Right_DoubleQuote
sq_string: (Left_SingleQuoteRaw | Left_SingleQuoteC) Expr_CastedDummy Right_SingleQuote

braced_var_sub: '${' Expr_CastedDummy Right_DollarBrace

simple_var_sub: (
    # This is everything in Kind.VSub except VSub_Name, which is braced: ${foo}
    #
    # Note: we could allow $foo and $0, but disallow the rest in favor of ${@}
    # and ${-}?  Meh it's too inconsistent.
    VSub_DollarName | VSub_Number
  | VSub_Bang | VSub_At | VSub_Pound | VSub_Dollar | VSub_Star | VSub_Hyphen
  | VSub_QMark
    # NOTE: $? should be STATUS because it's an integer.
)

#
# Assignment / Type Variables
#
# Several differences vs. Python:
#
# - no yield expression on RHS
# - no star expressions on either side (Python 3)    *x, y = 2, *b
# - no multiple assignments like: var x = y = 3
# - type annotation syntax is more restrictive    # a: (1+2) = 3 is OK in python
# - We're validating the lvalue here, instead of doing it in the "transformer".
#   We have the 'var' prefix which helps.

# name_type use cases:
#   for x Int, y Int
#   [x for x Int, y Int in ...]
#   var x Int, y Int = 3, 5
#   func(x Int, y Int)
name_type: Expr_Name [type_expr]
name_type_list: name_type (',' name_type)*

type_expr: Expr_Name [ '[' type_expr (',' type_expr)* ']' ]

# NOTE: Eof_RParen and Eof_Backtick aren't allowed because we don't want 'var'
# in command subs.
end_stmt: '}' | ';' | Op_Newline | Eof_Real

oil_var_decl: name_type_list '=' testlist end_stmt

# Note: the following production would be more precise, but it's ambiguous :(
#oil_setvar: place augassign testlist end_stmt
#          | place_list '=' testlist end_stmt

place_list: expr (',' expr)*
oil_place_mutation: place_list (augassign | '=') testlist end_stmt

# For $stringfunc(x, y=1) and @arrayfunc(a, b='s')
oil_arglist: '(' [arglist] ')'

#
# Other Entry Points
#

# for if (x > 0) etc.
oil_expr: '(' testlist ')'

# Example: for (a Int, b Int in expr) { ... }
oil_for: '(' name_type_list 'in' testlist ')'

# e.g. return 1 + 2 * 3
command_expr: testlist end_stmt

# $[d->key] etc.
oil_expr_sub: testlist ']'

# Signatures for proc and func.
#
# Examples:
#   func print(msg Str, v = 0, ...args; span_id Int = 0, ...named) Int {
#     echo hi
#   }
#   proc rule (src, dest, @argv, &block) { echo hi }

# Comparison:
# ()
#   func: required because func f { } doesn't make sense
#   proc: optional
# ...args
#   func: allowed for args of any type
#   proc: doesn't make sense because it takes homogeneous args
# ;
#   func: allowed for named args
#   proc: doesn't make sense
# return types
#   func: optional list
#   proc: disallowed -- it's always int

# proc params can't have types.
proc_param: Expr_Name ['=' expr]

# This is an awkward way of writing that we have params, then '@', then '&' --
# all optional.  If they appear, @ and & have to be last, in that order.
proc_params: (
  (proc_param ',')*
  [ (proc_param | '&' Expr_Name | '@' Expr_Name [',' '&' Expr_Name]) [','] ]
)

# Note: we might have proc name-with-hyphens, so we can't parse the name in
# expression mode.
oil_proc: ['(' [proc_params] ')'] '{'  # opening { for pgen2

func_param: Expr_Name [type_expr] ['=' expr]

# This is an awkward way of writing that '...' has to come last.
func_params: (
  (func_param ',')*
  [ (func_param | '...' Expr_Name) [','] ]
)

type_expr_list: type_expr (',' type_expr)*

oil_func: Expr_Name '(' [func_params] [';' func_params] ')' [type_expr_list] '{'

#
# Regex Sublanguage
#

char_literal: Char_OneChar | Char_Hex | Char_Unicode4 | Char_Unicode8

# we allow a-z A-Z 0-9 as ranges, but otherwise they have to be quoted
# The parser enforces that they are single strings
range_char: Expr_Name | Expr_DecInt | sq_string | char_literal

# digit or a-z
# We have to do further validation of ranges later.
class_literal_term: (
    range_char ['-' range_char ]
  | '~' Expr_Name
    # $mychars or ${mymodule.mychars}
  | simple_var_sub | braced_var_sub
    # e.g. 'abc' or "abc$mychars"
    # NOTE: range_char has sq_string
  | dq_string
    # Reserved for [[.collating sequences.]] (Unicode)
  | '.' Expr_Name
    # Reserved for [[=character equivalents=]] (Unicode)
  | '=' Expr_Name
    # TODO: Do these char classes actually work in bash/awk/egrep/sed/etc.?
)
class_literal: '[' class_literal_term+ ']'

# NOTE: Here is an example of where you can put ^ in the middle of a pattern in
# Python, and it matters!
#
# >>> r = re.compile('.f[a-z]*', re.DOTALL|re.MULTILINE)
# >>> r.findall('z\nfoo\nbeef\nfood\n')
# ['\nfoo', 'ef', '\nfood']
# >>> r = re.compile('.^f[a-z]*', re.DOTALL|re.MULTILINE)
# >>> r.findall('z\nfoo\nbeef\nfood\n')
# ['\nfoo', '\nfood']

re_atom: (
    char_literal
    # builtin regex like 'digit' or a regex reference like 'D'
  | Expr_Name
    # %begin or %end
  | Expr_Symbol
  | class_literal
    # ~digit or ~ %boundary or ~[a-f]
    # The operand is required: a bare '~' negates nothing, so it must not be
    # accepted as a complete atom.
  | '~' (Expr_Name | Expr_Symbol | class_literal)
    # Splice another expression
  | '@' Expr_Name
    # any %start %end are preferred
  | '.' | '^' | '$'
    # egrep has zero-width assertions \< and \>
    # We could make them %< and %> or %startword %endword
  | '<' | '>'
    # literal STRINGS like $foo or ${module.foo}
  | simple_var_sub | braced_var_sub
    # In a language-independent spec, backslashes are disallowed within 'sq'.
    # Write it with char literals outside strings: 'foo' \\ 'bar' \n
  | sq_string | dq_string
    # capturing group
  | '(' regex ['as' name_type] ')'
    # : is syntactic space for non-capturing group.  (! would seem like negation.)
  | ':' '(' regex ')'
    # syntactic space for Perl-style backtracking
    # !REF 1   !REF name
    # !AHEAD(d+)   !BEHIND(d+)  !NOT_AHEAD(d+)  !NOT_BEHIND(d+)
  | '!' Expr_Name (Expr_Name | Expr_DecInt | '(' regex ')')

    # Might want this obscure conditional construct.  Can't use C-style ternary
    # because '?' is a regex operator.
  #| '{' regex 'if' regex 'else' regex '}'

  # Others:
  # PCRE has (?R ) for recursion?  That could be !RECURSE()
  # Note: .NET has && in character classes, making it a recursive language
)

# e.g.   a{3}   a{3,4}  a{3,}   a{,4} but not a{,}
repeat_range: (
    Expr_DecInt [',']
  | ',' Expr_DecInt
  | Expr_DecInt ',' Expr_DecInt
)

repeat_op: (
    '+' | '*' | '?'
    # In PCRE, ?? *? +? {}? is lazy/nongreedy and ?+ *+ ++ {}+ is "possessive"
    # We use N and P modifiers within {}.
    # a{N +}  a{P ?}  a{P 3,4}  a{P ,4}
  | '{' [Expr_Name] ('+' | '*' | '?' | repeat_range) '}'
)

re_alt: (re_atom [repeat_op])+

regex: [re_alt] (('|'|'or') re_alt)*

# /digit+ ; multiline,ignorecase/
re_flag: ['~'] Expr_Name
re_flags: ';' re_flag (',' re_flag)*

# Syntax reserved for PCRE/Python, but that's not in ERE:
#
#   non-greedy     a{N *}
#   non-capturing  :( digit+ )
#   backtracking   !REF 1  !AHEAD(d+)
#
# Legacy syntax:
#
#   ^ and $ instead of %start and %end
#   < and > instead of %start_word and %end_word
#   . instead of dot
#   | instead of 'or'