-- Data types for the osh AST, aka "Lossless Syntax Tree". -- -- Invariant: the source text can be reconstructed byte-for-byte from this -- tree. -- -- The AST is composed of the builtin ASDL types (string, int, bool) and our -- application type 'id', which is core.id_kind.Id. -- Unrepresented: -- * let arithmetic (rarely used) -- * coprocesses -- one with arg and one without -- * select block -- * &> redirect both stdout and stderr -- * 1>&2- to close redirect -- * case fallthrough ;& and ;;& -- Represented but Not Parsed: -- * LeftIndex -- a[foo]=bar (the arithmetic version is parsed) -- * ArrayPair -- ([foo]=bar) -- Parsed but Not Implemented -- * extended glob -- * Process sub -- <() and >() -- * <>, >| redirects -- TODO: Preserve these source differences: -- * order of redirects: 'echo >out.txt hi' vs echo hi >out.txt -- * In the printer, I want to preserve line breaks! foo \bar? -- * parens -- * 1 + 2*3 vs. 1 + (2*3) or even 1 + ((2*3)) -- * [[ (1 == 1) ]] vs [[ 1 == 1 ]] -- * $(( 1 + 2 )) vs $[1 + 2] (bash-specific, used by Aboriginal) -- -- Found to be not strictly necessary for oil conversion -- * foo() { } vs function foo { } -- ksh -- * $'n' vs 'n' -- one of them just has EscapedLiteralPart module osh { -- A portion of a line, used for error messages. -- TODO: Need arena_id, for different files line_span = (int line_id, int col, int length) -- A primitive token. NOTE: val is redundant with 'loc' for now. If we -- rewrite the parser in C++, we might care for memory footprint. But for -- now this is convenient. -- NOTE: identical strings can shared, if we care. token = (id id, string val, int? span_id) -- Optional step for {100..50..-15} braced_step = (int val, int negated) bracket_op = WholeArray(id op_id) -- * or @ | ArrayIndex(arith_expr expr) suffix_op = StringUnary(id op_id, word arg_word) -- e.g. ${v:-default} | PatSub(word pat, word? replace, bool do_all, bool do_prefix, bool do_suffix) -- begin is optional with ${array::1} | Slice(arith_expr? begin, arith_expr? length) -- TODO: Constructors should be scoped? array_item::Pair? array_item = ArrayWord(word w) | ArrayPair(word key, word value) word_part = -- TODO: should be array_item* items. They CAN be mixed, like a=([x]=y z) ArrayLiteralPart(word* words) | LiteralPart(token token) | EscapedLiteralPart(token token) | SingleQuotedPart(token* tokens) | DoubleQuotedPart(word_part* parts) | SimpleVarSub(token token) | BracedVarSub(token token, id? prefix_op, -- prefix # or ! operators bracket_op? bracket_op suffix_op? suffix_op) | TildeSubPart(string prefix) -- For command sub and process sub: $(...) <(...) >(...) | CommandSubPart(command command_list, token left_token) | ArithSubPart(arith_expr anode) -- {a,b,c} | BracedAltPart(word* words) -- {1..10} or {1..10..2} | BracedIntRangePart(int start, int end, braced_step? step) -- {a..f} or {a..f..2} or {a..f..-2} | BracedCharRangePart(string start, string end, braced_step? step) -- extended globs are parsed statically, unlike globs | ExtGlobPart(token op, word* arms) word = TokenWord(token token) -- A CompoundWord can contain any word_part except the Braced*Part. -- We could model this with another variant type but it incurs runtime -- overhead and seems like overkill. Note that DoubleQuotedPart can't -- contain a SingleQuotedPart, etc. either. | CompoundWord(word_part* parts) -- A BracedWordTree is a word because it can appear in a command. It can -- contains any type of word_part. | BracedWordTree(word_part* parts) -- For dynamic parsing of test/[ -- the string is already evaluated. | StringWord(id id, string s) -- TODO: Might want to preserve token here. lhs_expr = LhsName(string name) | LhsIndexedName(string name, arith_expr index) -- Need to preserve physical parens? Maybe need Parens node. arith_expr = -- TODO: need token ArithVarRef(string name) -- variable without $ | ArithWord(word w) -- a string that looks like an integer | UnaryAssign(id op_id, lhs_expr child) | BinaryAssign(id op_id, lhs_expr left, arith_expr right) | ArithUnary(id op_id, arith_expr child) -- TODO: add token for divide by zero | ArithBinary(id op_id, arith_expr left, arith_expr right) | TernaryOp(arith_expr cond, arith_expr true_expr, arith_expr false_expr) | FuncCall(arith_expr func, arith_expr* args) bool_expr = WordTest(word w) -- e.g. [[ myword ]] | BoolBinary(id op_id, word left, word right) | BoolUnary(id op_id, word child) | LogicalNot(bool_expr child) | LogicalAnd(bool_expr left, bool_expr right) | LogicalOr(bool_expr left, bool_expr right) -- Notes about here docs: -- * 'body' is required, but must be initialized after construction. -- * here_end could be a word at parse time for the LST. Then the second -- pass could StaticEval it to a string and set do_expansion. -- * To reprint the here doc, we need the here_end delimiter, but it doesn't -- matter at runtime. do_expansion is calculated from it. -- * was_filled is only used during the parse and should be eliminated from -- serialization format. -- TODO : id -> token for translation? redir = Redir(id op_id, int fd, word arg_word) | HereDoc(id op_id, int fd, word? body, int do_expansion, string here_end, bool was_filled) assign_op = Equal | PlusEqual assign_pair = (lhs_expr lhs, assign_op op, word? rhs) env_pair = (string name, word val) -- Each arm tests one word against multiple words case_arm = (word* pat_list, command* action) if_arm = (command* cond, command* action) iterable = IterArgv | IterArray(word* words) -- TODO: Make field names consistent: child vs expr, etc. command = NoOp -- TODO: respect order | SimpleCommand(word* words, redir* redirects, env_pair* more_env) | Sentence(command child, token terminator) | Assignment(id keyword, string* flags, assign_pair* pairs) | ControlFlow(token token, word? arg_word) | Pipeline(command* children, bool negated, int* stderr_indices) -- TODO: Should be left and right | AndOr(command* children, id op_id) -- Part of for/while/until. Can have one or more children. | DoGroup(command* children, redir* redirects) -- A brace group is a compound command, with redirects. Can have one or -- more children. | BraceGroup(command* children, redir* redirects) -- Can have one or more children. | Subshell(command child, redir* redirects) | DParen(arith_expr child, redir* redirects) | DBracket(bool_expr expr, redir* redirects) -- do_arg_iter: whether to implicitly loop over "$@" -- TODO: Make iter_words a sum type. iterable for_words | ForEach(string iter_name, word* iter_words, bool do_arg_iter, command body, redir* redirects) -- C-style for loop. Any of the 3 expressions can be omitted. -- TODO: body is required, but only optional here because of initialization -- order. | ForExpr(arith_expr? init, arith_expr? cond, arith_expr? update, command? body, redir* redirects) | While(command* cond, command body, redir* redirects) | Until(command* cond, command body, redir* redirects) | If(if_arm* arms, command* else_action, redir* redirects) | Case(word to_match, case_arm* arms, redir* redirects) | FuncDef(string name, command body, redir* redirects) | TimeBlock(command pipeline) -- Most nodes optimize it out as command*, but there are a few places where -- this is useful for type safety. | CommandList(command* children) and_or = DAmp | DPipe -- For now, using stderr_indices representation because it's more compact. -- |& in osh; |- in oil. -- pipe_op = Pipe | PipeAndStderr -- NOTE: Do we even need these types? Arena already has methods. They can -- We just need to go from text -> text. For execution, we'll be compiling -- to a different format. We also won't bootstrap with osh code -- only oil -- code. shell can call oil builtins if necessary. -- A node with full debug info -- All other nodes should have span_id? int _loc or int _begin, int _end. -- It can be further compressed perhaps, like a varint. arena = (string* lines, line_span* spans, command root) -- On-disk format for an entire file. Enough info so we can reconstruct the -- text byte-for-byte. whole_file = (string path, arena a) -- In-memory format for all the functions snipped out of a file. partial_file = (string path, arena* funcs) -- -- These types are not used in the LST. But they could be statically -- from values in the LST. -- -- NOTE: This is invalid: ASDL is case-sensitive! -- OperandType = Undefined | Path | Int | Str | Other -- RedirType = Path | Desc | Here -- Also invalid because of duplicate 'Path' -- to fix -- bool_operand_type = Undefined | Path | Int | Str | Other -- redir_type = Path | Desc | Here -- -- APIs -- -- Fifteen lexer modes for osh. -- Possible additional modes: -- nested backticks: echo `echo \`echo foo\` bar` lex_mode = NONE | COMMENT | OUTER | DBRACKET | SQ | DQ | DOLLAR_SQ | ARITH | EXTGLOB | VS_1 | VS_2 | VS_ARG_UNQ | VS_ARG_DQ | BASH_REGEX | BASH_REGEX_CHARS }