-- Data types for the Oil AST, aka "Lossless Syntax Tree". -- -- Invariant: the source text can be reconstructed byte-for-byte from this -- tree. -- -- Exceptions: -- * <<- here docs with leading tabs, since we don't want those for -- conversion. We don't want files with mixed tabs and spaces. -- * Distinguishing between function styles wasn't necessary: -- * foo() { } vs function foo { } -- ksh -- -- The AST is composed of the builtin ASDL types (string, int, bool) and our -- application type 'id', which is core.id_kind.Id. -- Unrepresented: -- * let arithmetic (rarely used) -- * coprocesses -- one with arg and one without -- * select block -- * case fallthrough ;& and ;;& -- We usually try to preserve the physical order of the source in the ASDL -- fields. Exception: -- * order of redirects: 'echo >out.txt hi' vs echo hi >out.txt -- Refactorings: -- -- # word.{Empty,BracedTree} not allowed here -- cmd_token = Word %compound_word | Token %Token -- -- # Makes sense for DoubleQuoted and TripleQuoted -- string_part = ... # subset of word_part -- -- # Simplify many %Token references: -- type_expr = Atom %Token | ... -- printf_part ... -- -- # Simplify compound_word reference -- bool_expr = WordTest %compound_word | ... module syntax { -- core/main_loop.py parse_result = EmptyLine | Eof | Node(command cmd) -- 'source' represents the location of a line / token. source = Interactive | Headless | Unused(string comment) -- completion and history never show parse errors? | CFlag | Stdin(string comment) -- TODO: if it's not the main script, it's sourced, and you could provide -- a chain of locations back to the sourced script! -- MainFile(string path) or SourcedFile(string path, int spid) | MainFile(string path) | SourcedFile(string path, int spid) -- code parsed from a word -- used for 'eval', 'trap', 'printf', 'complete -W', etc. | ArgvWord(string what, int span_id) -- code parsed from the value of a variable -- used for $PS1 $PROMPT_COMMAND eval_unsafe_arith | Variable(string var_name, int span_id) -- alias expansion (location of first word) | Alias(string argv0, int argv0_spid) -- 2 kinds of reparsing: backticks, and x+1 in a[x+1]=y | Reparsed(string what, int left_spid, int right_spid) -- For --location-str | Synthetic(string s) -- Logically, here's our record for a physical line. For compactness, we -- TRANSPOSE it into 3 parallel arrays, so this record is UNUSED. -- line_record = (int line_num, string val, source src) -- A line_span represents the source location of a token, which never crosses -- lines. Encoded into LST with 'int spid'. line_span = (int line_id, int col, int length) -- NOTE: 'val' and 'span_id' fields redundant but useful. 'val' is for -- execution, and 'span_id' is for error messages and translation. -- TODO: new representation that points to a line? Token = (id id, int span_id, string val) -- After parsing, we often don't need the token 'string val', so use a -- 'speck'. Notes: -- * It's 12 bytes with the GC header -- * It's a "slab" to the GC (no pointers to follow) speck = (id id, int span_id) -- Note: like token -> speck, token -> strand is for when we don't need the id. -- We're not using it since it's probably 4 + 4 + 8 = 16 bytes, which doesn't -- save any space over the token! -- strand = (int span_id, string val) -- -- Shell language -- bracket_op = WholeArray(id op_id) -- * or @ | ArrayIndex(arith_expr expr) suffix_op = Nullary %Token -- ${x@Q} or ${!prefix@} (which also has prefix_op) | Unary(Token tok, word arg_word) -- e.g. ${v:-default} -- Oil's ${x|html} and ${x %.3f} have static arguments | Static(Token tok, string arg) | PatSub(word pat, word? replace, id replace_mode) -- begin is optional with ${array::1} | Slice(arith_expr? begin, arith_expr? length) attributes (int* spids) -- Variants: -- * Look at left token for $'' c'' vs r'' '' -- * """ and ''' strings have a boolean set double_quoted = (Token left, word_part* parts, bool multiline) attributes (int* spids) single_quoted = (Token left, Token* tokens, bool multiline) attributes (int* spids) -- Note: simple_var_sub could be split up into speck for $?, (speck, int) for -- $0 $1, token for $foo. simple_var_sub = (Token token) -- TODO: Add ${x %.3f} ${x|html} braced_var_sub = ( Token token, -- the name speck? prefix_op, -- prefix # or ! operators bracket_op? bracket_op, suffix_op? suffix_op ) attributes (int* spids) command_sub = (Token left_token, command child) attributes (int* spids) -- Note: can be word.BracedTree sh_array_literal = (Token left, word* words) attributes (int* spids) -- Used in both expr and a word_part ArgList = (expr* positional, named_arg* named) attributes (int* spids) word_part = ShArrayLiteral %sh_array_literal -- alternating key and value (saving some space) -- Note that word.Empty isn't needed here | AssocArrayLiteral(Token left, compound_word* pairs) | Literal %Token -- escaped case is separate so the evaluator doesn't have to check token ID | EscapedLiteral(Token token) | SingleQuoted %single_quoted | DoubleQuoted %double_quoted | SimpleVarSub %simple_var_sub | BracedVarSub %braced_var_sub -- For command sub and process sub: $(...) <(...) >(...) | CommandSub %command_sub -- This should be token tilde, token rest | TildeSub(Token token) | ArithSub(arith_expr anode) -- {a,b,c} | BracedTuple(compound_word* words) -- {1..10} or {-5..10..2} or {01..10} (leading zeros matter) -- {a..f} or {a..f..2} or {a..f..-2} | BracedRange(id kind, string start, string end, int step) -- note: optional int may need special handling in ASDL -- extended globs are parsed statically, unlike globs | ExtGlob(Token op, compound_word* arms) -- Oil word_part extensions -- @myarray | Splice(Token name) -- $strfunc(x) and @arrayfunc(x) | FuncCall(Token name, ArgList args) -- $[d->key], etc. | ExprSub(Token left, expr child) attributes (int* spids) compound_word = (word_part* parts) -- dedent is calculated from the first part -- TODO: could be string_part string_line = (int dedent, word_part* part) -- min_dedent is calculated from each line using Julia's rules triple_quoted = (int min_dedent, string_line* lines) word = -- Use cases for Empty: RHS of 'x=', the argument in "${x:-}". -- The latter is semantically necessary. (See osh/word_parse.py). -- At runtime: RHS of 'declare x='. -- We don't always use it, e.g. the RHS of 'A=([k]=)' is a compound_word -- with zero parts. Empty -- Returns from WordParser, but not generally stored in LST (TODO: cmd_token) | Token %Token -- A Compound word can contain any word_part except the Braced*Part. -- We could model this with another variant type but it incurs runtime -- overhead and seems like overkill. Note that DoubleQuoted can't -- contain a SingleQuoted, etc. either. | Compound %compound_word -- Similar to CompoundWord, but leading space is stripped | BracedTree(word_part* parts) -- For dynamic parsing of test/[ -- the string is already evaluated. -- note: this could also be Token | String(id id, string s, int span_id) sh_lhs_expr = Name(string name) | IndexedName(string name, arith_expr index) | UnparsedIndex(string name, string index) -- for translation attributes (int* spids) arith_expr = VarRef %Token -- variable without $ | Word %compound_word -- a string that looks like an integer | UnaryAssign(id op_id, arith_expr child) | BinaryAssign(id op_id, arith_expr left, arith_expr right) | Unary(id op_id, arith_expr child) -- TODO: op should be token, e.g. for divide by zero | Binary(id op_id, arith_expr left, arith_expr right) | TernaryOp(arith_expr cond, arith_expr true_expr, arith_expr false_expr) bool_expr = WordTest(word w) -- e.g. [[ myword ]] | Binary(id op_id, word left, word right) | Unary(id op_id, word child) | LogicalNot(bool_expr child) | LogicalAnd(bool_expr left, bool_expr right) | LogicalOr(bool_expr left, bool_expr right) redir_loc = Fd(int fd) | VarName(string name) redir_param = Word %compound_word | HereDoc(word here_begin, -- e.g. EOF or 'EOF' int here_end_span_id, -- span is whole line (for defunct osh2oil) word_part* stdin_parts -- one for each line ) redir = (Token op, redir_loc loc, redir_param arg) assign_op = Equal | PlusEqual assign_pair = (sh_lhs_expr lhs, assign_op op, word? rhs, int* spids) env_pair = (string name, word val, int* spids) condition = Shell(command* commands) -- if false; true; then echo hi; fi | Oil(expr e) -- if (x > 0) { echo hi } -- Each arm tests one word against multiple words case_arm = (word* pat_list, command* action, int* spids) if_arm = (condition cond, command* action, int* spids) for_iter = Args -- for x; do echo $x; done -- implicit "$@" | Words(word* words) -- for x in 'foo' *.py { echo $x } -- like ShArrayLiteral, but no location for %( | Oil(expr e, Token blame) -- for x in (mylist) { echo $x } -- TODO: Make field names consistent: child vs expr, etc. BraceGroup = (Token? doc_token, command* children, redir* redirects) attributes (int* spids) command = NoOp -- Note: do_fork is semantic, not syntactic | Simple(word* words, redir* redirects, env_pair* more_env, ArgList? typed_args, BraceGroup? block, bool do_fork) -- This doesn't technically belong in the LST, but it's convenient for -- execution | ExpandedAlias(command child, redir* redirects, env_pair* more_env) | Sentence(command child, Token terminator) -- Note: Only represents "bare assignment" | ShAssignment(assign_pair* pairs, redir* redirects) | ControlFlow(Token token, word? arg_word) -- Note: There are spids for every pipeline operator, parallel to -- stderr_indices | Pipeline(command* children, bool negated, int* stderr_indices) | AndOr(id* ops, command* children) -- Part of for, while, until (but not if, case, ShFunction). No redirects. | DoGroup(command* children) -- A brace group is a compound command, with redirects. | BraceGroup %BraceGroup -- Contains a single child, like CommandSub | Subshell(command child, redir* redirects) | DParen(arith_expr child, redir* redirects) | DBracket(bool_expr expr, redir* redirects) -- up to 3 iterations variables | ForEach(string* iter_names, for_iter iterable, command body, redir* redirects) -- C-style for loop. Any of the 3 expressions can be omitted. -- Note: body is required, but only optional here because of initialization -- order. | ForExpr(arith_expr? init, arith_expr? cond, arith_expr? update, command? body, redir* redirects) | WhileUntil(Token keyword, condition cond, command body, redir* redirects) | If(if_arm* arms, command* else_action, redir* redirects) | Case(word to_match, case_arm* arms, redir* redirects) | ShFunction(string name, command body) | TimeBlock(command pipeline) -- Some nodes optimize it out as command*, but we use CommandList for -- 1. the top level -- 2. ls ; ls & ls (same line) -- 3. command_sub -- single child that's a CommandList -- 4. Subshell -- single child that's a CommandList -- Similar to DoGroup, except that has do and done spids. | CommandList(command* children) -- Oil stuff -- For 'x = myexpr'. There's no type and no comma allowed. | BareDecl(Token lhs, expr rhs) -- var, const | VarDecl(Token? keyword, name_type* lhs, expr rhs) -- setvar/set, auto. Note: op can be a speck | PlaceMutation(Token? keyword, place_expr* lhs, Token op, expr rhs) -- = keyword | Expr(speck keyword, expr e) | Proc(Token name, proc_sig sig, command body) -- Tea | Func(Token name, param* pos_params, Token? pos_splat, param* named_params, Token? named_splat, type_expr* return_types, command body) | Data(Token name, param* params) | Enum(Token name, variant* variants) | Class(Token name, Token? extends, class_item* items) | Import(single_quoted path, Token? alias, import_name* names) | For(name_type* targets, expr iterable, command body) | While(expr test, command body) | Break | Continue | Return(expr? value) attributes (int* spids) -- Binary(x expr, y expr) or Nullary %Token -- In the first case we have a tag, and an anonymous type. variant_type = Anon(param* params) | Ref(Token type_name) -- tags and types are separate in our model of algebraic data types variant = (Token tag_name, variant_type? typ) class_item = Data(Token keyword, name_type* fields) | Method() import_name = (Token name, Token? alias) -- ref is always ':' or empty UntypedParam = (Token? ref, Token name, expr? default_val) -- type is Expr or Block TypedParam = (Token name, Token type, expr? default_val) -- 'open' is for proc p { }; closed is for proc p () { } proc_sig = Open | Closed(UntypedParam* untyped, Token? rest, TypedParam* typed) -- prefix is : for out param, @ for proc splat, ... for func splat -- procs only have types Expr, Block (and Str is implicit) param = (Token? prefix, Token name, type_expr? type, expr? default_val) -- -- Glob representation, for converting ${x//} to extended regexes. -- -- Example: *.[ch] is: -- GlobOp(), -- GlobLit(Glob_OtherLiteral, '.'), -- CharClass(False, ['ch']) # from Glob_CleanLiterals token glob_part = Literal(id id, string s) | Operator(id op_id) -- * or ? | CharClass(bool negated, string* strs) -- Char classes are opaque for now. If we ever need them: -- * Collating symbols are [. .] -- * Equivalence classes are [= printf_part = Literal(Token token) -- flags are 0 hyphen space + # -- type is 's' for %s, etc. | Percent(Token* flags, Token? width, Token? precision, Token type) -- -- OIL LANGUAGE -- -- Copied and modified from Python-3.7/Parser/Python.asdl ! expr_context = Load | Store | Del | AugLoad | AugStore | Param -- type expressions: Int Array[Int] Dict[Str, Any] type_expr = Simple(Token name) | Compound(Token name, type_expr* params) -- LHS binding in loops, list comprehensions, and var/const name_type = (Token name, type_expr? typ) -- TODO: Inline this into GenExp and ListComp? Just use a flag there? comprehension = (name_type* lhs, expr iter, expr? cond) -- named arguments supplied to call. token is null for f(; ...named). named_arg = (Token? name, expr value) -- Subscripts are lists of expressions -- a[:i, n] (we don't have matrices, but we have data frames) subscript = (expr obj, expr* indices) -- Attributes are obj.attr, d->key, name::scope, -- TODO: op should be 'speck' attribute = (expr obj, Token op, Token attr, expr_context ctx) -- Places that can be mutated. place_expr = Var(Token name) -- TODO: could be Var %Token | Subscript %subscript | Attribute %attribute expr = -- a variable name to evaluate Var(Token name) -- TODO: could be Var %Token -- For null, Bool, Int, Float -- Python uses Num(object n), which doesn't respect our "LST" invariant. | Const(Token c) -- @(one 'two' "$three") | ShArrayLiteral %sh_array_literal -- @[a b c] @[1 2 3] @[(1+1) (2+2)] | RegexLiteral(Token left, re regex, Token* flags, Token? trans_pref) | SimpleVarSub %simple_var_sub | BracedVarSub %braced_var_sub | CommandSub %command_sub | SingleQuoted %single_quoted | DoubleQuoted %double_quoted | BlockArg(BraceGroup block) | Lambda(name_type* params, expr body) -- TODO: speck op | Unary(Token op, expr child) | Binary(Token op, expr left, expr right) -- x < 4 < 3 and (x < 4) < 3 | Compare(expr left, speck* ops, expr* comparators) | FuncCall(expr func, ArgList args) -- TODO: Need a representation for method call. We don't just want -- Attribute() and then Call() | IfExp(expr test, expr body, expr orelse) | Tuple(expr* elts, expr_context ctx) | List(expr* elts, expr_context ctx) | Dict(expr* keys, expr* values) -- For the values in {n1, n2} | Implicit | ListComp(expr elt, comprehension* generators) -- not implemented | DictComp(expr key, expr value, comprehension* generators) | GeneratorExp(expr elt, comprehension* generators) -- Ranges are written 1:2, with first class expression syntax. There is no -- step as in Python. Use range(0, 10, step=2) for that. | Range(expr lower, expr upper) -- Slices occur within [] only. Unlike ranges, the start/end can be -- -- implicit. Like ranges, denote a step with slice(0, 10, step=2). -- a[3:] a[:i] | Slice(expr? lower, expr? upper) | Subscript %subscript | Attribute %attribute -- Ellipsis is like 'Starred' within Python, which are valid on the LHS in -- Python for unpacking, and -- within list literals for splicing. -- (Starred is NOT used for {k:v, **a}. That used a blank "keys" -- attribute.) -- In Oil, "spreading" will be @[1 ...array2] [b, ...list2] and -- {k: v, ...dict2}. We don't need two different symbols. | Spread(expr child, expr_context ctx) attributes (int* spids) -- -- Regex Language (Eggex) -- -- e.g. alnum digit posix_class = (speck? negated, string name) -- e.g. d w s perl_class = (speck? negated, string name) -- Note: .NET has && in character classes, making it a recursive language class_literal_term = PosixClass %posix_class | PerlClass %perl_class -- [a-z] ~[a-z] TODO: Doesn't respect LST invariant | Range(string start, string end) | CharLiteral(Token tok) | SimpleVarSub %simple_var_sub | BracedVarSub %braced_var_sub | SingleQuoted %single_quoted | DoubleQuoted %double_quoted -- Each of ['abc' \\ \" \xff ] is evaluated to this -- If there is more than one byte and any of them is > 128, there is -- ambiguity about whether it's a single encoded character or multiple -- bytes. | ByteSet(string bytes, int spid) -- [ \u0100 \u00ff ] is evaluated to this | CodePoint(int i, int spid) -- NOTE: modifier is unused now, can represent L or P re_repeat = -- TODO: could use speck. And add back Op(Token op, id modifier) Op(Token op) | Num(Token times) | Range(Token? lower, Token? upper) -- Haven't implemented the modifier, e.g. x{+ P} -- | Num(Token times, id modifier) -- | Range(Token? lower, Token? upper, id modifier) re = -- . ^ $ Speck %speck -- %begin or \u123 | Token %Token | PosixClass %posix_class | PerlClass %perl_class | ClassLiteral(bool negated, class_literal_term* terms) -- @D | Splice(Token name) -- $literal ${literal} 'no-backslashes' "other$foo" | SimpleVarSub %simple_var_sub | BracedVarSub %braced_var_sub | SingleQuoted %single_quoted | DoubleQuoted %double_quoted -- Compound: | Repeat(re child, re_repeat op) | Seq(re* children) | Alt(re* children) | Group(re child) -- TODO: needs Token? type field | Capture(re child, Token? var_name) | Backtracking(bool negated, Token name, re child) -- Regex Evaluation Shares the Same Structure, but uses slightly different -- nodes. -- * Speck/Token (syntactic concepts) -> Primitive (logical) -- * Splice -> re_t -- * All Strings -> Literal | Primitive(id id) -- . dot etc. -- String substitutions are evaluated into literals | LiteralChars(string s, int spid) } -- Ideas for even more precise error messages -- A temporary value that's NOT stored in the LST. It's the beginning or end -- of a line_span / extent. -- position = (int line_id, int col) -- An extent represents the source locations of a word or word_part, which -- may span multiple lines. Encoded into LST with 'int exid'. -- extent = (int s_line_id, int s_col, int e_line_id, int e_col)