# Grammar for Oil.
# Adapted from the Python 3.7 expression grammar, with several changes!
#
# TODO:
# - funcs in expression context, maybe disable lambdas like |x| x+1 for now?
# - Are tuples and heterogeneous lists in Oil or Tea?
#
# Tea also needs:
# - cast expressions
#   - although cast(Int, foo) works I suppose.  It feels like it has a
#     runtime cost
#
# - What about list comprehensions?
#   - I think this could be replaced with implicit vectorization, like
#     @len(x) or len.(x) ?  It's shorter, but it maps and doesn't filter.
#   - Generator expressions?

# Note: trailing commas are allowed:
#   {k: mydict,}
#   [mylist,]
#   mytuple,
#   f(args,)
#   func f(params,)
#
# Kinds used:
#   VSub, Left, Right, Expr, Op, Arith, Char, Eof, Unknown

# Oil patch: removed @= **= //=
# We're missing div= and xor=, which now look weird.  ^= is
# exponentiation.  |= has a use case.
augassign: (
    '+='
  | '-='
  | '*='
  | '/='
  | '%='
  | '&='
  | '|='
  | '^='
  | '<<='
  | '>>='
)

# For normal and annotated assignments, additional restrictions are enforced
# by the interpreter.
test:
    or_test ['if' or_test 'else' test]
  | lambdef

# We follow the same rules as Python lambdas:
#
#   |x| 1, 2                 ==   (|x| 1), 2
#   |x| x if True else 42    ==   |x| (x if True else 42)
#
# Python also had a test_nocond production for cases like the one below.  We
# don't need it because we can't have multiple ifs.
#
#   [x for x in range(3) if lambda x: x if 1]
#
# The zero arg syntax like || 1 annoys me -- but this also works:
#
#   func() { return 1 }
#
# We used name_type_list rather than func_params because a default value like
# x|y (bitwise or) conflicts with the | delimiter!
lambdef: '|' [name_type_list] '|' test

or_test: and_test ('or' and_test)*
and_test: not_test ('and' not_test)*
not_test:
    'not' not_test
  | comparison
comparison: range_expr (comp_op range_expr)*

# Here the beginning and end are required
range_expr: expr [':' expr]

# Oil patch: removed legacy <>, use === and !==
comp_op: (
    '<'
  | '>'
  | '==='
  | '>='
  | '<='
  | '!=='
  | 'in'
  | 'not' 'in'
  | 'is'
  | 'is' 'not'
  | '~'
  | '!~'
  | '~~'
  | '!~~'
  | '~=='
)

# For lists and dicts.  Note: In Python this was star_expr *foo
splat_expr: '...' expr

# Binary operators, from lowest to highest precedence.
expr: xor_expr ('|' xor_expr)*
xor_expr: and_expr ('^' and_expr)*
and_expr: shift_expr ('&' shift_expr)*
shift_expr: arith_expr (('<<'|'>>') arith_expr)*

# Oil: add concatenation with same precedence as +
arith_expr: term (('+'|'-'|'++') term)*

# Oil patch: removed '@'
term: factor (('*'|'/'|'//'|'%') factor)*
factor:
    ('+'|'-'|'~') factor
  | power

# Oil patch: removed Python 3 'await'
power: atom trailer* ['**' factor]

testlist_comp: (test|splat_expr) (
    comp_for
  | (',' (test|splat_expr))* [',']
)

atom: (
    '(' [testlist_comp] ')'
  | '[' [testlist_comp] ']'

    # Note: newlines are significant inside {}, unlike inside () and []
  | '{' [Op_Newline] [dict] '}'
  | '/' regex [re_flags] '/'

    # NOTE: These atoms are allowed in typed array literals
  | Expr_Name
  | Expr_Null
  | Expr_True
  | Expr_False

    # TODO: Allow suffixes on floats and decimals?
    # You could frame it as multiplication, so 100 M is 100 * M, where
    # M = 1_000_000
  | Expr_Float
  | Expr_DecInt
  | Expr_BinInt
  | Expr_OctInt
  | Expr_HexInt

  | Char_OneChar   # char literal \n \\ etc.
  | Char_UBraced   # char literal \u{3bc}
  | Char_Pound     # char literal #'A' etc.

  | dq_string
  | sq_string
    # Expr_Symbol could be %mykey

  | sh_command_sub
  | braced_var_sub
  | simple_var_sub
  | sh_array_literal

    # Anonymous function.  Is this only in Tea mode?
  | 'func' tea_func
)

# Tea can run a limited form of procs.  The first word must be a name, and NO
# BARE WORDS.
#
# Example:
#   log "hello $name"        # valid in OSH, Oil, Tea
#   myproc $(hostname)       # ditto
#
#   my-proc '/' $foo         # OSH and Oil
#   run 'my-proc' '/' $foo   # Tea.  'run' is similar to 'command' and 'builtin'
#
tea_word: (
    dq_string
  | sq_string
  | sh_command_sub
  | braced_var_sub
  | simple_var_sub
)

# var f = f(x)
trailer: (
    '(' [arglist] ')'
  | '[' subscriptlist ']'

    # TODO:
    # - {} %() :() trailers for passing a single arg that's a collection.
    # - and maybe %"" too
    # Lazy evaluation:
    # - f %(a b c) could be f(%(a b c))
    # - f {a=1, b=2} could be f({a=1, b=2})
    #   - although we might want that for Point {x: 1, y: 2}
    # - f :(x, y) could be equivalent to f(:[x], :[y])

  | '.' Expr_Name
  | '->' Expr_Name
  | '::' Expr_Name
)

# Oil patch: this is 'expr' instead of 'test'
# - 1:(3<4) doesn't make any sense.
# - And then this allows us to support a[3:] and a[:i] as special cases.
# - First class slices have to be written 0:n.
subscriptlist: subscript (',' subscript)* [',']

# TODO: Add => as low precedence operator, for Func[Str, Int => Str]
subscript:
    expr
  | [expr] ':' [expr]

# TODO: => should be even lower precedence here too
testlist: test (',' test)* [',']

# Dict syntax resembles JavaScript
# https://stackoverflow.com/questions/38948306/what-is-javascript-shorthand-property
#
# Examples:
#   {age: 20} is like {'age': 20}
#
#   x = 'age'
#   d = %{[x]: 20}          # Evaluate x as a variable
#   d = %{["foo$x"]: 20}    # Another expression
#   d = %{[x, y]: 20}       # Tuple key
#   d = %{key1, key1: 123}
#
# Notes:
# - Value is optional when the key is a name, because it can be taken from the
#   environment.
# - We don't have:
#   - dict comprehensions.  Maybe wait until LR parsing?
#   - Splatting with **
dict_pair: (
    Expr_Name [':' test]
  | '[' testlist ']' ':' test
  | sq_string ':' test
  | dq_string ':' test
)

dict: dict_pair (comma_newline dict_pair)* [comma_newline]

# This is how Python implemented dict comprehensions.  We can probably do the
# same.
#
# dictorsetmaker: ( ((test ':' test | '**' expr)
#                    (comp_for | (',' (test ':' test | '**' expr))* [','])) |
#                   ((test | splat_expr)
#                    (comp_for | (',' (test | splat_expr))* [','])) )

# The reason that keywords are test nodes instead of NAME is that using NAME
# results in an ambiguity.  ast.c makes sure it's a NAME.
# "test '=' test" is really "keyword '=' test", but we have no such token.
# These need to be in a single rule to avoid grammar that is ambiguous
# to our LL(1) parser.  Even though 'test' includes '*expr' in splat_expr,
# we explicitly match '*' here, too, to give it proper precedence.
# Illegal combinations and orderings are blocked in ast.c:
# multiple (test comp_for) arguments are blocked; keyword unpackings
# that precede iterable unpackings are blocked; etc.
argument: (
    test [comp_for]

    # named arg
  | test '=' test

    # splat.  Note we're using prefix syntax to be consistent with Python, JS,
    # and the prefix @ operator.
  | '...' test
)

# The grammar at call sites is less restrictive than at declaration sites.
# ... can appear anywhere.  Keyword args can appear anywhere too.
arglist:
    argument (',' argument)* [',']
    [';' argument (',' argument)* [',']]

# Oil patch: test_nocond -> or_test.  I believe this was trying to prevent the
# "double if" ambiguity here:
#
#   [x for x in range(3) if lambda x: x if 1]
#
# but Oil doesn't support "nested loops", so we don't have this problem.
comp_for: 'for' name_type_list 'in' or_test ['if' or_test]

#
# Expressions that are New in Oil
#

# Notes:
# - Most of these occur in 'atom' above
# - You can write $mystr but not mystr.
#   It has to be (mystr)
array_item: (
    Expr_Null
  | Expr_True
  | Expr_False
  | Expr_Float
  | Expr_DecInt
  | Expr_BinInt
  | Expr_OctInt
  | Expr_HexInt
  | dq_string
  | sq_string
  | sh_command_sub
  | braced_var_sub
  | simple_var_sub
  | '(' test ')'
)

sh_array_literal: '%(' Expr_CastedDummy Right_ShArrayLiteral

sh_command_sub: ( '$(' | '@(' | '^(' ) Expr_CastedDummy Eof_RParen

# Note: could add c"" too
dq_string: (
    Left_DoubleQuote
  | Left_TDoubleQuote
) Expr_CastedDummy Right_DoubleQuote

sq_string: (
    Left_SingleQuote
  | Left_RSingleQuote
  | Left_DollarSingleQuote
  | Left_TSingleQuote
  | Left_RTSingleQuote
  | Left_DollarTSingleQuote
) Expr_CastedDummy Right_SingleQuote

braced_var_sub: '${' Expr_CastedDummy Right_DollarBrace

simple_var_sub: (
    # This is everything in Kind.VSub except VSub_Name, which is braced: ${foo}
    #
    # Note: we could allow $foo and $0, but disallow the rest in favor of ${@}
    # and ${-}?  Meh it's too inconsistent.
    VSub_DollarName
  | VSub_Number
  | VSub_Bang
  | VSub_At
  | VSub_Pound
  | VSub_Dollar
  | VSub_Star
  | VSub_Hyphen
  | VSub_QMark
    # NOTE: $? should be STATUS because it's an integer.
)

#
# Assignment / Type Variables
#
# Several differences vs. Python:
#
# - no yield expression on RHS
# - no star expressions on either side (Python 3)    *x, y = 2, *b
# - no multiple assignments like: var x = y = 3
# - type annotation syntax is more restrictive
#     a: (1+2) = 3 is OK in python
# - We're validating the lvalue here, instead of doing it in the "transformer".
#   We have the 'var' prefix which helps.

# name_type use cases:
#   for x Int, y Int
#   [x for x Int, y Int in ...]
#   var x Int, y Int = 3, 5
#   func(x Int, y Int)
name_type: Expr_Name [type_expr]
name_type_list: name_type (',' name_type)*

type_expr: Expr_Name [ '[' type_expr (',' type_expr)* ']' ]

# NOTE: Eof_RParen and Eof_Backtick aren't allowed because we don't want 'var'
# in command subs.
end_stmt:
    '}'
  | ';'
  | Op_Newline
  | Eof_Real

oil_var_decl: name_type_list '=' testlist end_stmt

# Note: the following production is more precise, but it's ambiguous :(
#oil_setvar: place augassign testlist end_stmt
#          | place_list '=' testlist end_stmt

# Note: for Oil (not Tea), we could accept [':'] expr for setvar :out = 'foo'
place_list: expr (',' expr)*
oil_place_mutation: place_list (augassign | '=') testlist end_stmt

# For $stringfunc(x, y=1) and @arrayfunc(a, b='s')
oil_arglist: '(' [arglist] ')'

#
# Other Entry Points
#

# for if (x > 0) etc.
oil_expr: '(' testlist ')'

# Example: for (a Int, b Int in expr) { ... }
oil_for: '(' name_type_list 'in' testlist ')'

# e.g. return 1 + 2 * 3
command_expr: testlist end_stmt

# $[d->key] etc.
oil_expr_sub: testlist ']'

# Signatures for proc and func.
#
# Examples:
#   func print(msg Str, v = 0, ...args; span_id Int = 0, ...named) Int {
#     echo hi
#   }
#   proc rule [src, dest, @argv, &block] { echo hi }

# Comparison:
#   ()
#     func: required because func f { } doesn't make sense
#     proc: optional
#   ...args
#     func: allowed for args of any type
#     proc: doesn't make sense because it takes homogeneous args
#   ;
#     func: allowed for named args
#     proc: doesn't make sense
#   return types
#     func: optional list
#     proc: disallowed -- it's always int

# proc_params is an overapproximation of what's allowed in the signature.  It
# really should look like
#
#   untyped_params* typed_param*
#
# where untyped_params are   mystr :myref @rest
# and typed_params are       myblock Block   myexpr Expr
#
# Example: proc p(x, :out, @rest, cond Expr, block Block) {
#
# These two styles are hard to express within the LL(1) constraint, so we
# have a single proc_param production and then sort it out in expr_to_ast.py.
#
# Note: might want typed splat ...
#   later
proc_param: (
    ':' Expr_Name
  | '@' Expr_Name
  | Expr_Name [Expr_Name] ['=' expr]   # type is either Expr or Block
)
proc_params: (proc_param ',')* [ proc_param [','] ]

# Note: we might have proc name-with-hyphens, so we can't parse the name in
# expression mode.
oil_proc: ['(' [proc_params] ')'] '{'   # opening { for pgen2

func_param: Expr_Name [type_expr] ['=' expr]

# This is an awkward way of writing that '...' has to come last.
func_params: (
    (func_param ',')* [ (func_param | '...' Expr_Name) [','] ]
)

type_expr_list: type_expr (',' type_expr)*

# Note: It may make sense to have ; here, for named params only!
data_params: (func_param ',')* [ func_param [','] ]

# zero params allowed for consistency with func and class?
tea_data: Expr_Name '(' [data_params] ')'

# e.g. Nullary %Token or Nullary(x Int)
variant_type:
    Expr_Symbol
  | '(' data_params ')'
variant: Expr_Name [ variant_type ]

# for dict, tea_enum
comma_newline:
    ',' [Op_Newline]
  | Op_Newline

# for tea_func, tea_class
semi_newline:
    ';' [Op_Newline]
  | Op_Newline

tea_enum: (
    Expr_Name '{' [Op_Newline]
    # note: braces can be empty
    [ variant (comma_newline variant)* [comma_newline] ]
    '}'
)

suite: '{' [Op_Newline] [func_items] '}'

func_item: (
    ('var' | 'const') name_type_list '=' testlist   # oil_var_decl

    # TODO: if/switch, with, try/except/throw, etc.
  | 'while' test suite
  | 'for' name_type_list 'in' test suite

    # In Python, imports, assert, etc. also at this 'small_stmt' level
  | 'break'
  | 'continue'
  | 'return' [testlist]

    # TODO: accept setvar for consistency with Oil?
  | 'set' place_list (augassign | '=') testlist   # oil_place_mutation

    # x  f(x)  etc.
    #
    # And x = 1.  Python uses the same "hack" to fit within pgen2.  It also
    # supports a = b = 1, which we don't want.
    #
    # And echo 'hi' 'there'
    #
    # TODO: expr_to_ast needs to validate this
  | testlist (['=' testlist] | tea_word*)
)

# we want to avoid requiring newline or ; before }
func_items: func_item (semi_newline func_item)* [semi_newline]

# This is anonymous
tea_func: (
    '(' [func_params] [';' func_params] ')' [type_expr_list]
    suite
)

# Oil entry point
named_func: Expr_Name tea_func

# TODO: Methods differ from functions:
#   super() can be the first arg
#   shortcut initializer: Parser(this.lexer) { }
#   abstract, override, virtual
#   should we allow annotations, like 'public' or 'export'?
#
# No field initializers for now.  Later C++ versions allow it.
#
# Annotations:
#
#   func Parse() Int
#     [override const abstract] {
#   } ?
class_item: (
    ('virtual' | 'override' | 'func' | 'abstract' ) Expr_Name tea_func

    # Member declaration
  | 'var' name_type_list
)

# Note: we could restrict separators to newlines.
# But then you couldn't do class Foo { var a; var b }
class_items: class_item (semi_newline class_item)* [semi_newline]

tea_class: Expr_Name [':' Expr_Name ] '{' [Op_Newline] [class_items] '}'

# 'import' can't use 'semi_newline' because ending with an unknown number of
# tokens doesn't compose with our CommandParser.
end_import:
    ';'
  | Op_Newline

import_name: Expr_Name ['as' Expr_Name]

# The list may end with an optional separator, like dict, tea_enum, func_items
# and class_items.  This lets the closing ')' of tea_import sit on its own
# line (comma_newline includes Op_Newline) and permits a trailing comma.
# It previously ended with [import_name], which rejected both of those and
# instead accepted two names with no separator between them.
import_names: import_name (comma_newline import_name)* [comma_newline]

# TODO: Should we have a simpler Oil string literal?
tea_import: (
    sq_string [ 'as' Expr_Name ]
    ['(' [Op_Newline] [import_names] ')']
    end_import
)

# Top level:
#   declarations of constants -- with const only?
#   maybe only const?
#   use, data, enum, class, func.  That's it?  OK.
end_outer:
    ';' [Op_Newline]
  | Op_Newline
  | Eof_Real

module_item: (
    # oil_var_decl, but no mutation
    ('var' | 'const') name_type_list '=' testlist end_outer
  | 'import' tea_import   # TODO: needs Eof_Real
    # Also 'export'
  | 'class' tea_class end_outer
  | 'data' tea_data end_outer
  | 'enum' tea_enum end_outer
  | 'func' Expr_Name tea_func end_outer

    # Might need: typedef?  Or typealias?
)

# Eof_Real either after newline or before newline are both valid
tea_module: [Op_Newline] module_item* [Eof_Real]

#
# Regex Sublanguage
#

char_literal:
    Char_OneChar
  | Char_Hex
  | Char_UBraced

# we allow a-z A-Z 0-9 as ranges, but otherwise they have to be quoted
# The parser enforces that they are single strings
range_char:
    Expr_Name
  | Expr_DecInt
  | sq_string
  | char_literal

# digit or a-z
# We have to do further validation of ranges later.
class_literal_term: (
    range_char ['-' range_char ]

  | '!' Expr_Name

    # $mychars or ${mymodule.mychars}
  | simple_var_sub
  | braced_var_sub

    # e.g. 'abc' or "abc$mychars"
    # NOTE: range_char has sq_string
  | dq_string

    # Reserved for [[.collating sequences.]] (Unicode)
  | '.' Expr_Name

    # Reserved for [[=character equivalents=]] (Unicode)
  | '=' Expr_Name

    # TODO: Do these char classes actually work in bash/awk/egrep/sed/etc.?
)

class_literal: '[' class_literal_term+ ']'

# NOTE: Here is an example of where you can put ^ in the middle of a pattern in
# Python, and it matters!
#
#   >>> r = re.compile('.f[a-z]*', re.DOTALL|re.MULTILINE)
#   >>> r.findall('z\nfoo\nbeef\nfood\n')
#   ['\nfoo', 'ef', '\nfood']
#   >>> r = re.compile('.^f[a-z]*', re.DOTALL|re.MULTILINE)
#   >>> r.findall('z\nfoo\nbeef\nfood\n')
#   ['\nfoo', '\nfood']
re_atom: (
    char_literal

    # builtin regex like 'digit' or a regex reference like 'D'
  | Expr_Name

    # %begin or %end
  | Expr_Symbol
  | class_literal

    # !digit or ![a-f].  Note ! %boundary could be \B in Python, but ERE
    # doesn't have anything like that
  | '!' (Expr_Name | class_literal)

    # syntactic space for Perl-style backtracking
    #   !!REF 1   !!REF name
    #   !!AHEAD(d+)   !!BEHIND(d+)   !!NOT_AHEAD(d+)   !!NOT_BEHIND(d+)
    #
    # Note: !! conflicts with history
  | '!' '!' Expr_Name (Expr_Name | Expr_DecInt | '(' regex ')')

    # Splice another expression
  | '@' Expr_Name

    # any %start %end are preferred
  | '.'
  | '^'
  | '$'

    # literal STRINGS like $foo or ${module.foo}
  | simple_var_sub
  | braced_var_sub

    # In a language-independent spec, backslashes are disallowed within 'sq'.
    # Write it with char literals outside strings: 'foo' \\ 'bar' \n
  | sq_string
  | dq_string

    # grouping (non-capturing in Perl; capturing in ERE although < > is
    # preferred)
  | '(' regex ')'

    # capturing group, with optional name
  | '<' regex [':' name_type] '>'

    # Might want this obscure conditional construct.  Can't use C-style ternary
    # because '?' is a regex operator.
  #| '{' regex 'if' regex 'else' regex '}'

    # Others:
    # PCRE has (?R ) for recursion?  That could be !RECURSE()
    # Note: .NET has && in character classes, making it a recursive language
)

# e.g.   a{3}   a{3,4}   a{3,}   a{,4}   but not a{,}
repeat_range: (
    Expr_DecInt [',']
  | ',' Expr_DecInt
  | Expr_DecInt ',' Expr_DecInt
)

repeat_op: (
    '+'
  | '*'
  | '?'

    # In PCRE, ?? *? +? {}? is lazy/nongreedy and ?+ *+ ++ {}+ is "possessive"
    # We use N and P modifiers within {}.
    #   a{L +}   a{P ?}   a{P 3,4}   a{P ,4}
    # NOTE(review): the first example uses 'L' although the modifiers named
    # above are N and P -- confirm which spelling is intended.
  | '{' [Expr_Name] ('+' | '*' | '?' | repeat_range) '}'
)

re_alt: (re_atom [repeat_op])+

regex: [re_alt] (('|'|'or') re_alt)*

# e.g. /digit+ ; multiline !ignorecase/
#
# This can express translation preferences:
#
#   / d+ ; %ERE /                 is   '[[:digit:]]+'
#   / d+ ; %python /              is   '\d+'
#   / d+ ; ignorecase %python /   is   '(?i)\d+'
re_flag:
    ['!'] Expr_Name
  | Expr_Symbol
re_flags: ';' re_flag+

# Syntax reserved for PCRE/Python, but that's not in ERE:
#
#   non-greedy     a{N *}
#   non-capturing  :( digit+ )
#   backtracking   !REF 1  !AHEAD(d+)
#
# Legacy syntax:
#
#   ^ and $ instead of %start and %end
#   < and > instead of %start_word and %end_word
#   . instead of dot
#   | instead of 'or'