# Data types for the Oil AST, aka "Lossless Syntax Tree".
#
# Invariant: the source text can be reconstructed byte-for-byte from this
# tree.  The test/arena.sh file partially verifies this.
#
# Exceptions:
# - <<- here docs with leading tabs, since we don't want those for
#   conversion.  We don't want files with mixed tabs and spaces.
# - Distinguishing between function styles wasn't necessary:
#   - foo() { } vs function foo { }  # ksh style
#
# We usually try to preserve the physical order of the source in the ASDL
# fields.  One exception is the order of redirects:
#
#     echo >out.txt hi
#     # versus
#     echo hi >out.txt
#
# Unrepresented:
# - let arithmetic (rarely used)
# - coprocesses  # one with arg and one without
# - select block
# - case fallthrough ;& and ;;&
#
# Possible refactorings:
#
#   # More %Token as first class variants:
#   type_expr = Atom %Token | ...
#   printf_part = Literal %Token | ...
#
#   # %CompoundWord as first class variant:
#   bool_expr = WordTest %CompoundWord | ...
#
#   # Can DoubleQuoted have a subset of parts compared with CompoundWord?
#   string_part = ...  # subset of word_part
#
# - Distinguish word_t with BracedTree vs. those without?  seq_word_t?
# - Remove command.NoOp ?

module syntax
{
  # More efficient than the List[bool] pattern we've been using
  BoolParamBox = (bool b)
  IntParamBox = (int i)

  # core/main_loop.py
  parse_result = EmptyLine | Eof | Node(command cmd)

  # 'source' represents the location of a line / token.
  source =
    Interactive
  | Headless
  | Unused(str comment)  # completion and history never show parse errors?
  | CFlag
  | Stdin(str comment)
    # TODO: if it's not the main script, it's sourced, and you could provide
    # a chain of locations back to the sourced script!
    # MainFile(str path) or SourcedFile(str path, loc location)
  | MainFile(str path)
  | SourcedFile(str path, loc location)
    # code parsed from a word
    # used for 'eval', 'trap', 'printf', 'complete -W', etc.
  | ArgvWord(str what, loc location)
    # code parsed from the value of a variable
    # used for $PS1 $PROMPT_COMMAND
  | Variable(str var_name, loc location)
    # Point to the original variable reference
  | VarRef(Token orig_tok)
    # alias expansion (location of first word)
  | Alias(str argv0, loc argv0_loc)
    # 2 kinds of reparsing: backticks, and x+1 in a[x+1]=y
    # TODO: use this for eval_unsafe_arith instead of Variable
  | Reparsed(str what, Token left_token, Token right_token)
    # For #location-str
  | Synthetic(str s)

  SourceLine = (int line_num, str content, source src)

  # Token TODO:
  # - get rid of redundant span_id
  # - function to compute string val on demand
  # - maybe re-compute length on demand too
  Token = (id id, int col, int length, int span_id, SourceLine? line, str tval)

  # Source location for errors
  # It's possible for word_part and word to have a beginning and end location
  loc =
    Missing  # equivalent of runtime.NO_SPID
    # TODO: remove in favor of using Tokens
  | Span(int span_id)
  | Token %Token
  | WordPart(word_part p)
    # Note: is it possible to only have CompoundWord?
  | Word(word w)
  | Arith(arith_expr a)
    # e.g. for errexit blaming
  | Command(command c)

  #
  # Shell language
  #

  bracket_op =
    WholeArray(id op_id)  # * or @
  | ArrayIndex(arith_expr expr)

  suffix_op =
    Nullary %Token  # ${x@Q} or ${!prefix@} (which also has prefix_op)
  | Unary(Token op, rhs_word arg_word)  # e.g. ${v:-default}
    # TODO: Implement Oil's ${x|html} and ${x %.3f}
  | Static(Token tok, str arg)
  | PatSub(CompoundWord pat, rhs_word replace, id replace_mode, Token slash_tok)
    # begin is optional with ${array::1}
  | Slice(arith_expr? begin, arith_expr? length)

  BracedVarSub = (
      Token left,              # in dynamic ParseVarRef, same as name_tok
      Token token,             # location for the name
      str var_name,            # the name
      Token? prefix_op,        # prefix # or ! operators
      bracket_op? bracket_op,
      suffix_op? suffix_op,
      Token right              # in dynamic ParseVarRef, same as name_tok
  )

  # Variants:
  # - Look at left token ID for $'' c'' vs r'' '', e.g.
  #   Id.Left_DollarSingleQuote
  # - And """ and ''', e.g. Id.Left_TDoubleQuote
  DoubleQuoted = (Token left, List[word_part] parts, Token right)
  SingleQuoted = (Token left, List[Token] tokens, Token right)

  SimpleVarSub = (Token left, str var_name)

  CommandSub = (Token left_token, command child, Token right)

  # - can contain word.BracedTree
  # - no 'Token right' for now, doesn't appear to be used
  ShArrayLiteral = (Token left, List[word] words)

  # Used in both expr and a word_part
  ArgList = (
      Token left, List[expr] positional, List[NamedArg] named, Token right
  )

  # TODO: every word_part has Token left, Token right, and remove int* spids

  # these are used in the main program for alias expansion
  AssocPair = (CompoundWord key, CompoundWord value)

  word_part =
    ShArrayLiteral %ShArrayLiteral
  | AssocArrayLiteral(Token left, List[AssocPair] pairs)
  | Literal %Token
    # escaped case is separate so the evaluator doesn't have to check token ID
  | EscapedLiteral(Token token, str ch)
  | SingleQuoted %SingleQuoted
  | DoubleQuoted %DoubleQuoted
  | SimpleVarSub %SimpleVarSub
  | BracedVarSub %BracedVarSub
    # For command sub and process sub: $(...)  <(...)  >(...)
  | CommandSub %CommandSub
    # ~ or ~bob
  | TildeSub(Token token, str? user_name)
  | ArithSub(arith_expr anode)
    # {a,b,c}
  | BracedTuple(List[CompoundWord] words)
    # {1..10} or {-5..10..2} or {01..10} (leading zeros matter)
    # {a..f} or {a..f..2} or {a..f..-2}
  | BracedRange(id kind, str start, str end,
                int step)  # note: optional int may need special handling in ASDL
    # extended globs are parsed statically, unlike globs
  | ExtGlob(Token op, List[CompoundWord] arms)

    # Oil word_part extensions

    # @myarray
  | Splice(Token name, str var_name)
    # $strfunc(x) and @arrayfunc(x)
  | FuncCall(Token name, ArgList args)
    # $[d->key], etc.
  | ExprSub(Token left, expr child)

  attributes (List[int] spids)

  # TODO: I think Token left should be copied into every CompoundWord
  # either that, or CommandParser needs to reliably have GetToken(word_t)

  # Invariant: CompoundWord always has at least one part (as opposed to
  # rhs_word.Empty)
  CompoundWord = (List[word_part] parts)

  # Use cases for Empty: RHS of 'x=', the argument in "${x:-}".
  # The latter is semantically necessary.  (See osh/word_parse.py).
  # At runtime: RHS of 'declare x='.
  rhs_word = Empty | Compound %CompoundWord

  word =
    # Returns from WordParser, but not generally stored in LST
    Token %Token
    # A Compound word can contain any word_part except the Braced*Part.
    # We could model this with another variant type but it incurs runtime
    # overhead and seems like overkill.  Note that DoubleQuoted can't
    # contain a SingleQuoted, etc. either.
  | Compound %CompoundWord
    # For word sequences command.Simple, ShArrayLiteral, for_iter.Words
    # Could be its own type
  | BracedTree(List[word_part] parts)
    # For dynamic parsing of test/[
    # the string is already evaluated.
    # TODO: try using Token with source info
  | String(id id, str s, int span_id)

  # Note: the name 'foo' is derived from token value 'foo=' or 'foo+='
  sh_lhs_expr =
    Name(Token left, str name)
  | IndexedName(Token left, str name, arith_expr index)
  | UnparsedIndex(Token left, str name, str index)  # for translation

  arith_expr =
    VarSub %SimpleVarSub  # e.g. $(( x ))
  | Word %CompoundWord    # e.g. $(( 123'456'$y ))
  | UnaryAssign(id op_id, arith_expr child)
  | BinaryAssign(id op_id, arith_expr left, arith_expr right)
  | Unary(id op_id, arith_expr child)
    # TODO: op should be token, e.g. for divide by zero
  | Binary(id op_id, arith_expr left, arith_expr right)
  | TernaryOp(arith_expr cond, arith_expr true_expr, arith_expr false_expr)

  bool_expr =
    WordTest(word w)  # e.g. [[ myword ]]
  | Binary(id op_id, word left, word right)
  | Unary(id op_id, word child)
  | LogicalNot(bool_expr child)
  | LogicalAnd(bool_expr left, bool_expr right)
  | LogicalOr(bool_expr left, bool_expr right)

  redir_loc =
    Fd(int fd)
  | VarName(str name)

  redir_param =
    Word %CompoundWord
  | HereDoc(word here_begin,        # e.g. EOF or 'EOF'
            int here_end_span_id,   # span is whole line (for defunct osh2oil)
            List[word_part] stdin_parts  # one for each line
    )

  Redir = (Token op, redir_loc loc, redir_param arg)

  assign_op = Equal | PlusEqual
  AssignPair = (sh_lhs_expr lhs, assign_op op, rhs_word rhs, List[int] spids)
  EnvPair = (str name, rhs_word val, List[int] spids)

  condition =
    Shell(List[command] commands)  # if false; true; then echo hi; fi
  | Oil(expr e)                    # if (x > 0) { echo hi }

  # Each arm tests one word against multiple words
  CaseArm = (List[word] pat_list, List[command] action, List[int] spids)

  # Each if arm starts with either an "if" or "elif" keyword
  # In YSH, the then keyword is not used (replaced by braces {})
  IfArm = (Token keyword, condition cond, Token? then_kw,
           List[command] action, List[int] spids)

  for_iter =
    Args                       # for x; do echo $x; done  # implicit "$@"
  | Words(List[word] words)    # for x in 'foo' *.py { echo $x }
                               # like ShArrayLiteral, but no location for %(
  | Oil(expr e, Token blame)   # for x in (mylist) { echo $x }

  # TODO: Make field names consistent: child vs expr, etc.

  BraceGroup = (
      Token left, Token? doc_token, List[command] children,
      List[Redir] redirects, Token right
  )

  # TODO: every command needs Token left
  # also add Token right_lok (a lok is a token used for the pretty printer ONLY)
  # then Case, ForEach, and ShFunction also have Token in_lok, esac_lok, etc.
  # REMOVE int* spids

  # Retain references to lines
  BlockArg = (BraceGroup brace_group, List[SourceLine] lines)

  command =
    NoOp
    # Note: do_fork is semantic, not syntactic
  | Simple(List[word] words, List[Redir] redirects, List[EnvPair] more_env,
           ArgList? typed_args, BlockArg? block, bool do_fork)
    # This doesn't technically belong in the LST, but it's convenient for
    # execution
  | ExpandedAlias(command child, List[Redir] redirects, List[EnvPair] more_env)
  | Sentence(command child, Token terminator)
    # Note: Only represents "bare assignment"
  | ShAssignment(List[AssignPair] pairs, List[Redir] redirects)
  | ControlFlow(Token keyword, word? arg_word)
    # Note: There are spids for every pipeline operator, parallel to
    # stderr_indices
  | Pipeline(List[command] children, bool negated, List[int] stderr_indices)
  | AndOr(List[id] ops, List[command] children)
    # Part of for, while, until (but not if, case, ShFunction).  No redirects.
  | DoGroup(Token left, List[command] children, Token right)
    # A brace group is a compound command, with redirects.
  | BraceGroup %BraceGroup
    # Contains a single child, like CommandSub
  | Subshell(command child, List[Redir] redirects)
  | DParen(arith_expr child, List[Redir] redirects)
  | DBracket(bool_expr expr, List[Redir] redirects)
    # up to 3 iterations variables
  | ForEach(Token keyword, List[str] iter_names, for_iter iterable,
            command body, List[Redir] redirects)
    # C-style for loop.  Any of the 3 expressions can be omitted.
    # Note: body is required, but only optional here because of initialization
    # order.
  | ForExpr(Token keyword, arith_expr? init, arith_expr? cond,
            arith_expr? update, command? body, List[Redir] redirects)
  | WhileUntil(Token keyword, condition cond, command body,
               List[Redir] redirects)
  | If(Token if_kw, List[IfArm] arms, Token? else_kw,
       List[command] else_action, Token? fi_kw, List[Redir] redirects)
  | Case(word to_match, List[CaseArm] arms, List[Redir] redirects)
  | ShFunction(str name, command body)
  | TimeBlock(command pipeline)
    # Some nodes optimize it out as List[command], but we use CommandList for
    # 1. the top level
    # 2. ls ; ls & ls  (same line)
    # 3. CommandSub  # single child that's a CommandList
    # 4. Subshell    # single child that's a CommandList
    # Similar to DoGroup, except that has do and done spids.
  | CommandList(List[command] children)

    # Oil stuff

    # For 'x = myexpr'.  There's no type and no comma allowed.
  | BareDecl(Token lhs, expr rhs)
    # var, const
  | VarDecl(Token? keyword, List[NameType] lhs, expr rhs)
    # setvar/set, auto.
  | PlaceMutation(Token? keyword, List[place_expr] lhs, Token op, expr rhs)
    # = keyword
  | Expr(Token keyword, expr e)
  | Proc(Token keyword, Token name, proc_sig sig, command body)

    # Tea
  | Func(Token name,
         List[Param] pos_params, Token? pos_splat,
         List[Param] named_params, Token? named_splat,
         List[type_expr] return_types, command body)
  | Data(Token name, List[Param] params)
  | Enum(Token name, List[Variant] variants)
  | Class(Token name, Token? extends, List[class_item] items)
  | Import(SingleQuoted path, Token? alias, List[ImportName] names)
  | For(List[NameType] targets, expr iterable, command body)
  | While(expr test, command body)
  | Break
  | Continue
  | Return(expr? value)

  attributes (List[int] spids)

  # Binary(x expr, y expr) or Nullary %Token
  # In the first case we have a tag, and an anonymous type.
  variant_type =
    Anon(List[Param] params)
  | Ref(Token type_name)

  # tags and types are separate in our model of algebraic data types
  Variant = (Token tag_name, variant_type? typ)

  class_item =
    Data(Token keyword, List[NameType] fields)
  | Method()

  ImportName = (Token name, Token? alias)

  # ref is always ':' or empty
  UntypedParam = (Token? ref, Token name, expr? default_val)

  # type is Expr or Block
  TypedParam = (Token name, Token type, expr? default_val)

  # 'open' is for proc p { }; closed is for proc p () { }
  proc_sig =
    Open
  | Closed(List[UntypedParam] untyped, Token? rest, List[TypedParam] typed)

  # prefix is : for out param, @ for proc splat, ... for func splat
  # procs only have types Expr, Block (and Str is implicit)
  Param = (Token? prefix, Token name, type_expr? type, expr? default_val)

  #
  # Glob representation, for converting ${x//} to extended regexes.
  #

  # Example: *.[ch] is:
  #   GlobOp(),
  #   GlobLit(Glob_OtherLiteral, '.'),
  #   CharClass(False, ['ch'])

  # from Glob_CleanLiterals token
  glob_part =
    Literal(id id, str s)
  | Operator(id op_id)  # * or ?
  | CharClass(bool negated, List[str] strs)

  # Char classes are opaque for now.  If we ever need them:
  # - Collating symbols are [. .]
  # - Equivalence classes are [= =]

  printf_part =
    Literal(Token token)
    # flags are 0 hyphen space + #
    # type is 's' for %s, etc.
  | Percent(List[Token] flags, Token? width, Token? precision, Token type)

  #
  # OIL LANGUAGE
  #
  # Copied and modified from Python-3.7/Parser/Python.asdl !

  expr_context = Load | Store | Del | AugLoad | AugStore | Param

  # type expressions: Int   Array[Int]   Dict[Str, Any]
  type_expr =
    Simple(Token name)
  | Compound(Token name, List[type_expr] params)

  # LHS binding in loops, list comprehensions, and var/const
  NameType = (Token name, type_expr? typ)

  # TODO: Inline this into GenExp and ListComp?  Just use a flag there?
  Comprehension = (List[NameType] lhs, expr iter, expr? cond)

  # named arguments supplied to call.  token is null for f(; ...named).
  NamedArg = (Token? name, expr value)

  # Subscripts are lists of expressions
  # a[:i, n]  (we don't have matrices, but we have data frames)
  Subscript = (expr obj, List[expr] indices)

  # Attributes are obj.attr, d->key, name::scope,
  Attribute = (expr obj, Token op, Token attr, expr_context ctx)

  # Places that can be mutated.
  place_expr =
    Var(Token name)  # TODO: could be Var %Token
  | Subscript %Subscript
  | Attribute %Attribute

  expr =
    # a variable name to evaluate
    Var(Token name)  # TODO: could be Var %Token
    # For null, Bool, Int, Float
    # Python uses Num(object n), which doesn't respect our "LST" invariant.
  | Const(Token c)
    # @(one 'two' "$three")
  | ShArrayLiteral %ShArrayLiteral
    # @[a b c] @[1 2 3] @[(1+1) (2+2)]
  | RegexLiteral(Token left, re regex, List[Token] flags, Token? trans_pref)
  | SimpleVarSub %SimpleVarSub
  | BracedVarSub %BracedVarSub
  | CommandSub %CommandSub
  | SingleQuoted %SingleQuoted
  | DoubleQuoted %DoubleQuoted
  | BlockArg %BlockArg
  | Lambda(List[NameType] params, expr body)
  | Unary(Token op, expr child)
  | Binary(Token op, expr left, expr right)
    # x < 4 < 3 and (x < 4) < 3
  | Compare(expr left, List[Token] ops, List[expr] comparators)
  | FuncCall(expr func, ArgList args)
    # TODO: Need a representation for method call.  We don't just want
    # Attribute() and then Call()
  | IfExp(expr test, expr body, expr orelse)
  | Tuple(List[expr] elts, expr_context ctx)
  | List(List[expr] elts, expr_context ctx)
  | Dict(List[expr] keys, List[expr] values)
    # For the values in {n1, n2}
  | Implicit
  | ListComp(expr elt, List[Comprehension] generators)
    # not implemented
  | DictComp(expr key, expr value, List[Comprehension] generators)
  | GeneratorExp(expr elt, List[Comprehension] generators)
    # Ranges are written 1:2, with first class expression syntax.  There is no
    # step as in Python.  Use range(0, 10, step=2) for that.
  | Range(expr lower, expr upper)
    # Slices occur within [] only.  Unlike ranges, the start/end can be
    # implicit.  Like ranges, denote a step with slice(0, 10, step=2).
    #   a[3:]   a[:i]
  | Slice(expr? lower, expr? upper)
  | Subscript %Subscript
  | Attribute %Attribute
    # Ellipsis is like 'Starred' within Python, which are valid on the LHS in
    # Python for unpacking, and # within list literals for splicing.
    # (Starred is NOT used for {k:v, **a}.  That used a blank "keys"
    # attribute.)
    # In Oil, "spreading" will be @[1 ...array2] [b, ...list2] and
    # {k: v, ...dict2}.  We don't need two different symbols.
  | Spread(expr child, expr_context ctx)

  #
  # Regex Language (Eggex)
  #

  # e.g. alnum digit
  PosixClass = (Token? negated, str name)
  # e.g. d w s
  PerlClass = (Token? negated, str name)

  # Note: .NET has && in character classes, making it a recursive language
  class_literal_term =
    PosixClass %PosixClass
  | PerlClass %PerlClass
    # [a-z]   ~[a-z]   TODO: Doesn't respect LST invariant
  | Range(Token start, Token end)
  | CharLiteral(Token tok)
  | SimpleVarSub %SimpleVarSub
  | BracedVarSub %BracedVarSub
  | SingleQuoted %SingleQuoted
  | DoubleQuoted %DoubleQuoted

  # Char Sets and Ranges both use Char Codes
  # with u_braced == true : \u{ff}
  # with u_braced == false: \xff \\ 'a' a '0' 0
  # ERE doesn't make a distinction, but compiling to Python/PCRE can use it
  CharCode = (int i, bool u_braced, int spid)

  # evaluated version of class_literal_term (could be in runtime.asdl)
  char_class_term =
    PosixClass %PosixClass
  | PerlClass %PerlClass
  | Range(CharCode start, CharCode end)
    # For [ \x00 \\ ]
  | CharCode %CharCode

  # NOTE: modifier is unused now, can represent L or P
  re_repeat =
    Op(Token op)
  | Num(Token times)
    # dot{1,2}
  | Range(Token? lower, Token? upper)
    # Haven't implemented the modifier, e.g. x{+ P}
    # | Num(Token times, id modifier)
    # | Range(Token? lower, Token? upper, id modifier)

  re =
    # e.g.  .  ^  $  %begin  \u123
    Token %Token
  | PosixClass %PosixClass
  | PerlClass %PerlClass
    # syntax [ $x \n ]
  | CharClassLiteral(bool negated, List[class_literal_term] terms)
    # evaluated [ 'abc' \n ]
  | CharClass(bool negated, List[char_class_term] terms)
    # @D
  | Splice(Token name, str var_name)
    # $literal ${literal} 'no-backslashes' "other$foo"
  | SimpleVarSub %SimpleVarSub
  | BracedVarSub %BracedVarSub
  | SingleQuoted %SingleQuoted
  | DoubleQuoted %DoubleQuoted
    # Compound:
  | Repeat(re child, re_repeat op)
  | Seq(List[re] children)
  | Alt(List[re] children)
  | Group(re child)
    # TODO: needs Token? type field
  | Capture(re child, Token? var_name)
  | Backtracking(bool negated, Token name, re child)
    # Regex Evaluation Shares the Same Structure, but uses slightly different
    # nodes.
    # - Token (syntactic concepts) -> Primitive (logical)
    # - Splice -> re_t
    # - All Strings -> Literal
  | Primitive(id id)  # . dot etc.
    # String substitutions are evaluated into literals
  | LiteralChars(str s, int spid)
}