# Data types for the Oil AST, aka "Lossless Syntax Tree".
#
# Invariant: the source text can be reconstructed byte-for-byte from this
# tree.  The test/arena.sh file partially verifies this.
#
# Exceptions:
# - <<- here docs with leading tabs, since we don't want those for
#   conversion.  We don't want files with mixed tabs and spaces.
# - Distinguishing between function styles wasn't necessary:
#   - foo() { } vs function foo { }  # ksh style
#
# We usually try to preserve the physical order of the source in the ASDL
# fields.  One exception is the order of redirects:
#
#     echo >out.txt hi
#     # versus
#     echo hi >out.txt
#
# Unrepresented:
# - let arithmetic (rarely used)
# - coprocesses  # one with arg and one without
# - select block
# - case fallthrough ;& and ;;&
#
# Possible refactorings:
#
#   # More %Token as first class variants:
#   type_expr = Atom %Token | ...
#   printf_part = Literal %Token | ...
#
#   # %CompoundWord as first class variant:
#   bool_expr = WordTest %CompoundWord | ...
#
#   # Can DoubleQuoted have a subset of parts compared with CompoundWord?
#   string_part = ...  # subset of word_part
#
# - Distinguish word_t with BracedTree vs. those without?  seq_word_t?
# - Remove command.NoOp ?

module syntax
{
  # More efficient than the List[bool] pattern we've been using
  BoolParamBox = (bool b)
  IntParamBox = (int i)

  # core/main_loop.py
  parse_result = EmptyLine | Eof | Node(command cmd)

  # 'source' represents the location of a line / token.
  source =
    Interactive
  | Headless
  | Unused(str comment)  # completion and history never show parse errors?
  | CFlag
  | Stdin(str comment)
    # TODO: if it's not the main script, it's sourced, and you could provide
    # a chain of locations back to the sourced script!
    # MainFile(str path) or SourcedFile(str path, loc location)
  | MainFile(str path)
  | SourcedFile(str path, loc location)
    # code parsed from a word
    # used for 'eval', 'trap', 'printf', 'complete -W', etc.
  | ArgvWord(str what, loc location)
    # code parsed from the value of a variable
    # used for $PS1 $PROMPT_COMMAND
  | Variable(str var_name, loc location)
    # Point to the original variable reference
  | VarRef(Token orig_tok)
    # alias expansion (location of first word)
  | Alias(str argv0, loc argv0_loc)
    # 2 kinds of reparsing: backticks, and x+1 in a[x+1]=y
    # TODO: use this for eval_unsafe_arith instead of Variable
  | Reparsed(str what, Token left_token, Token right_token)
    # For #location-str
  | Synthetic(str s)

  SourceLine = (int line_num, str content, source src)

  # Token TODO:
  # - get rid of redundant span_id
  # - function to compute string val on demand
  # - maybe re-compute length on demand too
  Token = (id id, int col, int length, int span_id, SourceLine? line, str tval)

  # Source location for errors
  # It's possible for word_part and word to have a beginning and end location
  loc =
    Missing  # equivalent of runtime.NO_SPID
    # TODO: remove in favor of using Tokens
  | Span(int span_id)
  | Token %Token
  | WordPart(word_part p)
    # Note: is it possible to only have CompoundWord?
  | Word(word w)
  | Arith(arith_expr a)
    # e.g. for errexit blaming
  | Command(command c)

  #
  # Shell language
  #

  bracket_op =
    WholeArray(id op_id)  # * or @
  | ArrayIndex(arith_expr expr)

  suffix_op =
    Nullary %Token  # ${x@Q} or ${!prefix@} (which also has prefix_op)
  | Unary(Token op, rhs_word arg_word)  # e.g. ${v:-default}
    # TODO: Implement Oil's ${x|html} and ${x %.3f}
  | Static(Token tok, str arg)
  | PatSub(CompoundWord pat, rhs_word replace, id replace_mode, Token slash_tok)
    # begin is optional with ${array::1}
  | Slice(arith_expr? begin, arith_expr? length)

  BracedVarSub = (
      Token left,              # in dynamic ParseVarRef, same as name_tok
      Token token,             # location for the name
      str var_name,            # the name
      Token? prefix_op,        # prefix # or ! operators
      bracket_op? bracket_op,
      suffix_op? suffix_op,
      Token right              # in dynamic ParseVarRef, same as name_tok
  )

  # Variants:
  # - Look at left token ID for $'' c'' vs r'' '', e.g.
  #   Id.Left_DollarSingleQuote
  # - And """ and ''', e.g. Id.Left_TDoubleQuote
  DoubleQuoted = (Token left, List[word_part] parts, Token right)
  SingleQuoted = (Token left, List[Token] tokens, Token right)

  SimpleVarSub = (Token left, str var_name)

  CommandSub = (Token left_token, command child, Token right)

  # - can contain word.BracedTree
  # - no 'Token right' for now, doesn't appear to be used
  ShArrayLiteral = (Token left, List[word] words)

  # Used in both expr and a word_part
  ArgList = (
      Token left, List[expr] positional, List[NamedArg] named, Token right
  )

  # TODO: every word_part has Token left, Token right, and remove int* spids

  # these are used in the main program for alias expansion
  AssocPair = (CompoundWord key, CompoundWord value)

  word_part =
    ShArrayLiteral %ShArrayLiteral
  | AssocArrayLiteral(Token left, List[AssocPair] pairs)
  | Literal %Token
    # escaped case is separate so the evaluator doesn't have to check token ID
  | EscapedLiteral(Token token, str ch)
  | SingleQuoted %SingleQuoted
  | DoubleQuoted %DoubleQuoted
  | SimpleVarSub %SimpleVarSub
  | BracedVarSub %BracedVarSub
    # For command sub and process sub: $(...)  <(...)  >(...)
  | CommandSub %CommandSub
    # ~ or ~bob
  | TildeSub(Token token, str? user_name)
  | ArithSub(arith_expr anode)
    # {a,b,c}
  | BracedTuple(List[CompoundWord] words)
    # {1..10} or {-5..10..2} or {01..10} (leading zeros matter)
    # {a..f} or {a..f..2} or {a..f..-2}
  | BracedRange(id kind, str start, str end,
                int step)  # note: optional int may need special handling in ASDL
    # extended globs are parsed statically, unlike globs
  | ExtGlob(Token op, List[CompoundWord] arms)

    # Oil word_part extensions

    # @myarray
  | Splice(Token name, str var_name)
    # $strfunc(x) and @arrayfunc(x)
  | FuncCall(Token name, ArgList args)
    # $[d->key], etc.
  | ExprSub(Token left, expr child)

  attributes (List[int] spids)

  # TODO: I think Token left should be copied into every CompoundWord
  # either that, or CommandParser needs to reliably have GetToken(word_t)

  # Invariant: CompoundWord always has at least one part (as opposed to
  # rhs_word.Empty)
  CompoundWord = (List[word_part] parts)

  # Use cases for Empty: RHS of 'x=', the argument in "${x:-}".
  # The latter is semantically necessary.  (See osh/word_parse.py).
  # At runtime: RHS of 'declare x='.
  rhs_word = Empty | Compound %CompoundWord

  word =
    # Returns from WordParser, but not generally stored in LST
    Token %Token
    # A Compound word can contain any word_part except the Braced*Part.
    # We could model this with another variant type but it incurs runtime
    # overhead and seems like overkill.  Note that DoubleQuoted can't
    # contain a SingleQuoted, etc. either.
  | Compound %CompoundWord
    # For word sequences command.Simple, ShArrayLiteral, for_iter.Words
    # Could be its own type
  | BracedTree(List[word_part] parts)
    # For dynamic parsing of test/[
    # the string is already evaluated.
    # TODO: try using Token with source info
  | String(id id, str s, int span_id)

  # Note: the name 'foo' is derived from token value 'foo=' or 'foo+='
  sh_lhs_expr =
    Name(Token left, str name)
  | IndexedName(Token left, str name, arith_expr index)
  | UnparsedIndex(Token left, str name, str index)  # for translation

  arith_expr =
    VarSub %SimpleVarSub  # e.g. $(( x ))
  | Word %CompoundWord    # e.g. $(( 123'456'$y ))
  | UnaryAssign(id op_id, arith_expr child)
  | BinaryAssign(id op_id, arith_expr left, arith_expr right)
  | Unary(id op_id, arith_expr child)
    # TODO: op should be token, e.g. for divide by zero
  | Binary(id op_id, arith_expr left, arith_expr right)
  | TernaryOp(arith_expr cond, arith_expr true_expr, arith_expr false_expr)

  bool_expr =
    WordTest(word w)  # e.g. [[ myword ]]
  | Binary(id op_id, word left, word right)
  | Unary(id op_id, word child)
  | LogicalNot(bool_expr child)
  | LogicalAnd(bool_expr left, bool_expr right)
  | LogicalOr(bool_expr left, bool_expr right)

  redir_loc =
    Fd(int fd)
  | VarName(str name)

  redir_param =
    Word %CompoundWord
  | HereDoc(word here_begin,        # e.g. EOF or 'EOF'
            int here_end_span_id,   # span is whole line (for defunct osh2oil)
            List[word_part] stdin_parts  # one for each line
    )

  Redir = (Token op, redir_loc loc, redir_param arg)

  assign_op = Equal | PlusEqual
  AssignPair = (sh_lhs_expr lhs, assign_op op, rhs_word rhs, List[int] spids)
  EnvPair = (str name, rhs_word val, List[int] spids)

  condition =
    Shell(List[command] commands)  # if false; true; then echo hi; fi
  | Oil(expr e)                    # if (x > 0) { echo hi }

  # Each arm tests one word against multiple words
  CaseArm = (List[word] pat_list, List[command] action, List[int] spids)

  # Each if arm starts with either an "if" or "elif" keyword
  # In YSH, the then keyword is not used (replaced by braces {})
  IfArm = (Token keyword, condition cond, Token? then_kw,
           List[command] action, List[int] spids)

  for_iter =
    Args                       # for x; do echo $x; done  # implicit "$@"
  | Words(List[word] words)    # for x in 'foo' *.py { echo $x }
                               # like ShArrayLiteral, but no location for %(
  | Oil(expr e, Token blame)   # for x in (mylist) { echo $x }

  # TODO: Make field names consistent: child vs expr, etc.

  BraceGroup = (
      Token left, Token? doc_token, List[command] children,
      List[Redir] redirects, Token right
  )

  # TODO: every command needs Token left
  # also add Token right_lok (a lok is a token used for the pretty printer ONLY)
  # then Case, ForEach, and ShFunction also have Token in_lok, esac_lok, etc.
  # REMOVE int* spids

  # Retain references to lines
  BlockArg = (BraceGroup brace_group, List[SourceLine] lines)

  command =
    NoOp
    # Note: do_fork is semantic, not syntactic
  | Simple(List[word] words, List[Redir] redirects, List[EnvPair] more_env,
           ArgList? typed_args, BlockArg? block, bool do_fork)
    # This doesn't technically belong in the LST, but it's convenient for
    # execution
  | ExpandedAlias(command child, List[Redir] redirects, List[EnvPair] more_env)
  | Sentence(command child, Token terminator)
    # Note: Only represents "bare assignment"
  | ShAssignment(List[AssignPair] pairs, List[Redir] redirects)
  | ControlFlow(Token keyword, word? arg_word)
    # Note: There are spids for every pipeline operator, parallel to
    # stderr_indices
  | Pipeline(List[command] children, bool negated, List[int] stderr_indices)
  | AndOr(List[id] ops, List[command] children)
    # Part of for, while, until (but not if, case, ShFunction).  No redirects.
  | DoGroup(Token left, List[command] children, Token right)
    # A brace group is a compound command, with redirects.
  | BraceGroup %BraceGroup
    # Contains a single child, like CommandSub
  | Subshell(command child, List[Redir] redirects)
  | DParen(arith_expr child, List[Redir] redirects)
  | DBracket(bool_expr expr, List[Redir] redirects)
    # up to 3 iterations variables
  | ForEach(Token keyword, List[str] iter_names, for_iter iterable,
            command body, List[Redir] redirects)
    # C-style for loop.  Any of the 3 expressions can be omitted.
    # Note: body is required, but only optional here because of initialization
    # order.
  | ForExpr(Token keyword, arith_expr? init, arith_expr? cond,
            arith_expr? update, command? body, List[Redir] redirects)
  | WhileUntil(Token keyword, condition cond, command body,
               List[Redir] redirects)
  | If(Token if_kw, List[IfArm] arms, Token? else_kw,
       List[command] else_action, Token? fi_kw, List[Redir] redirects)
  | Case(word to_match, List[CaseArm] arms, List[Redir] redirects)
  | ShFunction(str name, command body)
  | TimeBlock(command pipeline)
    # Some nodes optimize it out as List[command], but we use CommandList for
    # 1. the top level
    # 2. ls ; ls & ls  (same line)
    # 3. CommandSub  # single child that's a CommandList
    # 4. Subshell    # single child that's a CommandList
    # Similar to DoGroup, except that has do and done spids.
  | CommandList(List[command] children)

    # Oil stuff

    # For 'x = myexpr'.  There's no type and no comma allowed.
  | BareDecl(Token lhs, expr rhs)
    # var, const
  | VarDecl(Token? keyword, List[NameType] lhs, expr rhs)
    # setvar/set, auto.
  | PlaceMutation(Token? keyword, List[place_expr] lhs, Token op, expr rhs)
    # = keyword
  | Expr(Token keyword, expr e)
  | Proc(Token keyword, Token name, proc_sig sig, command body)

    # Tea
  | Func(Token name,
         List[Param] pos_params, Token? pos_splat,
         List[Param] named_params, Token? named_splat,
         List[type_expr] return_types, command body)
  | Data(Token name, List[Param] params)
  | Enum(Token name, List[Variant] variants)
  | Class(Token name, Token? extends, List[class_item] items)
  | Import(SingleQuoted path, Token? alias, List[ImportName] names)
  | For(List[NameType] targets, expr iterable, command body)
  | While(expr test, command body)
  | Break
  | Continue
  | Return(expr? value)

  attributes (List[int] spids)

  # Binary(x expr, y expr) or Nullary %Token
  # In the first case we have a tag, and an anonymous type.
  variant_type =
    Anon(List[Param] params)
  | Ref(Token type_name)

  # tags and types are separate in our model of algebraic data types
  Variant = (Token tag_name, variant_type? typ)

  class_item =
    Data(Token keyword, List[NameType] fields)
  | Method()

  ImportName = (Token name, Token? alias)

  # ref is always ':' or empty
  UntypedParam = (Token? ref, Token name, expr? default_val)

  # type is Expr or Block
  TypedParam = (Token name, Token type, expr? default_val)

  # 'open' is for proc p { }; closed is for proc p () { }
  proc_sig =
    Open
  | Closed(List[UntypedParam] untyped, Token? rest, List[TypedParam] typed)

  # prefix is : for out param, @ for proc splat, ... for func splat
  # procs only have types Expr, Block (and Str is implicit)
  Param = (Token? prefix, Token name, type_expr? type, expr? default_val)

  #
  # Glob representation, for converting ${x//} to extended regexes.
  #

  # Example: *.[ch] is:
  #   GlobOp(),
  #   GlobLit(Glob_OtherLiteral, '.'),
  #   CharClass(False, ['ch'])

  # from Glob_CleanLiterals token
  glob_part =
    Literal(id id, str s)
  | Operator(id op_id)  # * or ?
  | CharClass(bool negated, List[str] strs)

  # Char classes are opaque for now.  If we ever need them:
  # - Collating symbols are [. .]
  # - Equivalence classes are [= =]

  printf_part =
    Literal(Token token)
    # flags are 0 hyphen space + #
    # type is 's' for %s, etc.
  | Percent(List[Token] flags, Token? width, Token? precision, Token type)

  #
  # OIL LANGUAGE
  #
  # Copied and modified from Python-3.7/Parser/Python.asdl !

  expr_context = Load | Store | Del | AugLoad | AugStore | Param

  # type expressions: Int   Array[Int]   Dict[Str, Any]
  type_expr =
    Simple(Token name)
  | Compound(Token name, List[type_expr] params)

  # LHS binding in loops, list comprehensions, and var/const
  NameType = (Token name, type_expr? typ)

  # TODO: Inline this into GenExp and ListComp?  Just use a flag there?
  Comprehension = (List[NameType] lhs, expr iter, expr? cond)

  # named arguments supplied to call.  token is null for f(; ...named).
  NamedArg = (Token? name, expr value)

  # Subscripts are lists of expressions
  # a[:i, n]  (we don't have matrices, but we have data frames)
  Subscript = (expr obj, List[expr] indices)

  # Attributes are obj.attr, d->key, name::scope,
  Attribute = (expr obj, Token op, Token attr, expr_context ctx)

  # Places that can be mutated.
  place_expr =
    Var(Token name)  # TODO: could be Var %Token
  | Subscript %Subscript
  | Attribute %Attribute

  expr =
    # a variable name to evaluate
    Var(Token name)  # TODO: could be Var %Token
    # For null, Bool, Int, Float
    # Python uses Num(object n), which doesn't respect our "LST" invariant.
  | Const(Token c)
    # @(one 'two' "$three")
  | ShArrayLiteral %ShArrayLiteral
    # @[a b c] @[1 2 3] @[(1+1) (2+2)]
  | RegexLiteral(Token left, re regex, List[Token] flags, Token? trans_pref)
  | SimpleVarSub %SimpleVarSub
  | BracedVarSub %BracedVarSub
  | CommandSub %CommandSub
  | SingleQuoted %SingleQuoted
  | DoubleQuoted %DoubleQuoted
  | BlockArg %BlockArg
  | Lambda(List[NameType] params, expr body)
  | Unary(Token op, expr child)
  | Binary(Token op, expr left, expr right)
    # x < 4 < 3 and (x < 4) < 3
  | Compare(expr left, List[Token] ops, List[expr] comparators)
  | FuncCall(expr func, ArgList args)
    # TODO: Need a representation for method call.  We don't just want
    # Attribute() and then Call()
  | IfExp(expr test, expr body, expr orelse)
  | Tuple(List[expr] elts, expr_context ctx)
  | List(List[expr] elts, expr_context ctx)
  | Dict(List[expr] keys, List[expr] values)
    # For the values in {n1, n2}
  | Implicit
  | ListComp(expr elt, List[Comprehension] generators)
    # not implemented
  | DictComp(expr key, expr value, List[Comprehension] generators)
  | GeneratorExp(expr elt, List[Comprehension] generators)
    # Ranges are written 1:2, with first class expression syntax.  There is no
    # step as in Python.  Use range(0, 10, step=2) for that.
  | Range(expr lower, expr upper)
    # Slices occur within [] only.  Unlike ranges, the start/end can be
    # implicit.  Like ranges, denote a step with slice(0, 10, step=2).
    #   a[3:]   a[:i]
  | Slice(expr? lower, expr? upper)
  | Subscript %Subscript
  | Attribute %Attribute
    # Ellipsis is like 'Starred' within Python, which are valid on the LHS in
    # Python for unpacking, and # within list literals for splicing.
    # (Starred is NOT used for {k:v, **a}.  That used a blank "keys"
    # attribute.)
    # In Oil, "spreading" will be @[1 ...array2] [b, ...list2] and
    # {k: v, ...dict2}.  We don't need two different symbols.
  | Spread(expr child, expr_context ctx)

  #
  # Regex Language (Eggex)
  #

  # e.g. alnum digit
  PosixClass = (Token? negated, str name)
  # e.g. d w s
  PerlClass = (Token? negated, str name)

  # Note: .NET has && in character classes, making it a recursive language
  class_literal_term =
    PosixClass %PosixClass
  | PerlClass %PerlClass
    # [a-z]   ~[a-z]   TODO: Doesn't respect LST invariant
  | Range(Token start, Token end)
  | CharLiteral(Token tok)
  | SimpleVarSub %SimpleVarSub
  | BracedVarSub %BracedVarSub
  | SingleQuoted %SingleQuoted
  | DoubleQuoted %DoubleQuoted

  # Char Sets and Ranges both use Char Codes
  # with u_braced == true : \u{ff}
  # with u_braced == false: \xff \\ 'a' a '0' 0
  # ERE doesn't make a distinction, but compiling to Python/PCRE can use it
  CharCode = (int i, bool u_braced, int spid)

  # evaluated version of class_literal_term (could be in runtime.asdl)
  char_class_term =
    PosixClass %PosixClass
  | PerlClass %PerlClass
  | Range(CharCode start, CharCode end)
    # For [ \x00 \\ ]
  | CharCode %CharCode

  # NOTE: modifier is unused now, can represent L or P
  re_repeat =
    Op(Token op)
  | Num(Token times)
    # dot{1,2}
  | Range(Token? lower, Token? upper)
    # Haven't implemented the modifier, e.g. x{+ P}
    # | Num(Token times, id modifier)
    # | Range(Token? lower, Token? upper, id modifier)

  re =
    # e.g.  .  ^  $  %begin  \u123
    Token %Token
  | PosixClass %PosixClass
  | PerlClass %PerlClass
    # syntax [ $x \n ]
  | CharClassLiteral(bool negated, List[class_literal_term] terms)
    # evaluated [ 'abc' \n ]
  | CharClass(bool negated, List[char_class_term] terms)
    # @D
  | Splice(Token name, str var_name)
    # $literal ${literal} 'no-backslashes' "other$foo"
  | SimpleVarSub %SimpleVarSub
  | BracedVarSub %BracedVarSub
  | SingleQuoted %SingleQuoted
  | DoubleQuoted %DoubleQuoted
    # Compound:
  | Repeat(re child, re_repeat op)
  | Seq(List[re] children)
  | Alt(List[re] children)
  | Group(re child)
    # TODO: needs Token? type field
  | Capture(re child, Token? var_name)
  | Backtracking(bool negated, Token name, re child)
    # Regex Evaluation Shares the Same Structure, but uses slightly different
    # nodes.
    # - Token (syntactic concepts) -> Primitive (logical)
    # - Splice -> re_t
    # - All Strings -> Literal
  | Primitive(id id)  # . dot etc.
    # String substitutions are evaluated into literals
  | LiteralChars(str s, int spid)
}