1 #!/usr/bin/env python2
2 """regex_translate.py."""
3 from __future__ import print_function
4
5 from _devbuild.gen.syntax_asdl import (
6 PosixClass,
7 PerlClass,
8 CharCode,
9 char_class_term,
10 char_class_term_e,
11 char_class_term_t,
12 re,
13 re_e,
14 re_repeat,
15 re_repeat_e,
16 )
17 from _devbuild.gen.runtime_asdl import value
18 from _devbuild.gen.id_kind_asdl import Id
19
20 from core.error import e_die
21 from mycpp.mylib import log, tagswitch
22 from osh import glob_ # for ExtendedRegexEscape
23
24 from typing import List, TYPE_CHECKING, cast
25
26 if TYPE_CHECKING:
27 from _devbuild.gen.syntax_asdl import re_t
28
29 _ = log
30
31 PERL_CLASS = {
32 'd': '[:digit:]',
33 # Python's docs say it's [a-zA-Z0-9_] when NO LOCALE is set.
34 'w': '[:alpha:][:digit:]_',
35 # Python's doc says \s is [ \t\n\r\f\v] when NO LCOALE
36 's': '[:space:]',
37 }
38
39 # ERE's in POSIX:
40 # https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html
41 #
42 # NOTE: They don't support \x00 or \u1234 as in Perl/Python!
43 #
44 # It's hard to grep for tabs with BRE or ERE syntax. You have to use:
45 #
46 # (1) a literal tab with Ctrl-V, or
47 # (2) bash syntax: grep $'\t' foo.txt
48 # (3) POSIX shell syntax: grep "$(echo -e '\t')" foo.txt.
49 #
50 # I ran into this in test/lint.sh !!!
51 #
52 # https://stackoverflow.com/questions/1825552/grep-a-tab-in-unix
53
54 # Algorithm:
55 # - Unicode escapes in BREs disallowed
56 # - ASCII codes 1-255 allowed LITERALLY, but NUL \0 disallowed
57 #
58 # What about utf-8 encoded characters? Those work within literals, but are
59 # problematic within character sets. There's no way to disallow those in
60 # general though.
61
62 CH_RBRACKET = 0x5d
63 CH_BACKSLASH = 0x5c
64 CH_CARET = 0x5e
65 CH_HYPHEN = 0x2d
66
67 FLAG_RBRACKET = 0b0001
68 FLAG_BACKSLASH = 0b0010
69 FLAG_CARET = 0b0100
70 FLAG_HYPHEN = 0b1000
71
72
73 def _CharCodeToEre(term, parts, special_char_flags):
74 # type: (CharCode, List[str], List[int]) -> None
75 """special_char_flags: list of single int that is mutated."""
76
77 char_int = term.i
78 if char_int >= 128 and term.u_braced:
79 # \u{ff} can't be represented in ERE because we don't know the encoding
80 # \xff can be represented
81 e_die("ERE can't express char code %d" % char_int, term.blame_tok)
82
83 # note: mycpp doesn't handle
84 # special_char_flags[0] |= FLAG_HYPHEN
85 mask = special_char_flags[0]
86
87 if char_int == CH_HYPHEN:
88 mask |= FLAG_HYPHEN
89 elif char_int == CH_CARET:
90 mask |= FLAG_CARET
91 elif char_int == CH_RBRACKET:
92 mask |= FLAG_RBRACKET
93 elif char_int == CH_BACKSLASH:
94 mask |= FLAG_BACKSLASH
95 else:
96 parts.append(chr(char_int))
97
98 special_char_flags[0] = mask
99
100
101 def _CharClassTermToEre(term, parts, special_char_flags):
102 # type: (char_class_term_t, List[str], List[int]) -> None
103 """special_char_flags: list of single int that is mutated."""
104
105 UP_term = term
106 with tagswitch(term) as case:
107 if case(char_class_term_e.Range):
108 term = cast(char_class_term.Range, UP_term)
109
110 # Create our own flags
111 range_no_special = [0]
112
113 _CharCodeToEre(term.start, parts, range_no_special)
114 if range_no_special[0] != 0:
115 e_die(
116 "Can't use char %d as start of range in ERE syntax" %
117 term.start.i, term.start.blame_tok)
118
119 parts.append('-') # a-b
120
121 _CharCodeToEre(term.end, parts, range_no_special)
122 if range_no_special[0] != 0:
123 e_die(
124 "Can't use char %d as end of range in ERE syntax" %
125 term.end.i, term.end.blame_tok)
126
127 elif case(char_class_term_e.CharCode):
128 term = cast(CharCode, UP_term)
129
130 _CharCodeToEre(term, parts, special_char_flags)
131
132 elif case(char_class_term_e.PerlClass):
133 term = cast(PerlClass, UP_term)
134 n = term.name
135 chars = PERL_CLASS[term.name] # looks like '[:digit:]'
136 if term.negated:
137 e_die("Perl classes can't be negated in ERE", term.negated)
138 else:
139 pat = '%s' % chars
140 parts.append(pat)
141
142 elif case(char_class_term_e.PosixClass):
143 term = cast(PosixClass, UP_term)
144 n = term.name # looks like 'digit'
145 if term.negated:
146 e_die("POSIX classes can't be negated in ERE", term.negated)
147 else:
148 pat = '[:%s:]' % n
149 parts.append(pat)
150
151 else:
152 raise AssertionError(term)
153
154
155 def _AsPosixEre(node, parts):
156 # type: (re_t, List[str]) -> None
157 """Translate an Oil regex to a POSIX ERE.
158
159 Appends to a list of parts that you have to join.
160 """
161 UP_node = node
162 tag = node.tag()
163
164 if tag == re_e.Primitive:
165 node = cast(re.Primitive, UP_node)
166 if node.id == Id.Re_Dot:
167 parts.append('.')
168 elif node.id == Id.Re_Start:
169 parts.append('^')
170 elif node.id == Id.Re_End:
171 parts.append('$')
172 else:
173 raise AssertionError(node.id)
174 return
175
176 if tag == re_e.LiteralChars:
177 node = cast(re.LiteralChars, UP_node)
178 # The bash [[ x =~ "." ]] construct also has to do this
179
180 # TODO: What about \0 and unicode escapes?
181 # Those won't be as LiteralChars I don't think?
182 # Unless you put them there through \0
183 # Maybe DISALLOW those.
184 # "Unprintable chars should be written as \0 or \x00 or \u0000"
185
186 parts.append(glob_.ExtendedRegexEscape(node.s))
187 return
188
189 if tag == re_e.Seq:
190 node = cast(re.Seq, UP_node)
191 for c in node.children:
192 _AsPosixEre(c, parts)
193 return
194
195 if tag == re_e.Alt:
196 node = cast(re.Alt, UP_node)
197 for i, c in enumerate(node.children):
198 if i != 0:
199 parts.append('|')
200 _AsPosixEre(c, parts)
201 return
202
203 if tag == re_e.Repeat:
204 node = cast(re.Repeat, UP_node)
205 # 'foo' or "foo" or $x or ${x} evaluated to too many chars
206 if node.child.tag() == re_e.LiteralChars:
207 child = cast(re.LiteralChars, node.child)
208 if len(child.s) > 1:
209 # Note: Other regex dialects have non-capturing groups since we don't
210 # need this.
211 e_die(
212 "POSIX EREs don't have groups without capture, so this node "
213 "needs () around it.", child.blame_tok)
214
215 _AsPosixEre(node.child, parts)
216 op = node.op
217 op_tag = op.tag()
218 UP_op = op
219
220 if op_tag == re_repeat_e.Op:
221 op = cast(re_repeat.Op, UP_op)
222 op_id = op.op.id
223 if op_id == Id.Arith_Plus:
224 parts.append('+')
225 elif op_id == Id.Arith_Star:
226 parts.append('*')
227 elif op_id == Id.Arith_QMark:
228 parts.append('?')
229 else:
230 raise AssertionError(op_id)
231 return
232
233 if op_tag == re_repeat_e.Num:
234 op = cast(re_repeat.Num, UP_op)
235 parts.append('{%s}' % op.times.tval)
236 return
237
238 if op_tag == re_repeat_e.Range:
239 op = cast(re_repeat.Range, UP_op)
240 lower = op.lower.tval if op.lower else ''
241 upper = op.upper.tval if op.upper else ''
242 parts.append('{%s,%s}' % (lower, upper))
243 return
244
245 raise NotImplementedError(op_tag)
246
247 # Special case for familiarity: () is acceptable as a group in ERE
248 if tag in (re_e.Group, re_e.Capture):
249 node = cast(re.Group, UP_node)
250 parts.append('(')
251 _AsPosixEre(node.child, parts)
252 parts.append(')')
253 return
254
255 if tag == re_e.PerlClass:
256 node = cast(PerlClass, UP_node)
257 n = node.name
258 chars = PERL_CLASS[node.name] # looks like [:digit:]
259 if node.negated:
260 pat = '[^%s]' % chars
261 else:
262 pat = '[%s]' % chars
263 parts.append(pat)
264 return
265
266 if tag == re_e.PosixClass:
267 node = cast(PosixClass, UP_node)
268 n = node.name # looks like 'digit'
269 if node.negated:
270 pat = '[^[:%s:]]' % n
271 else:
272 pat = '[[:%s:]]' % n
273 parts.append(pat)
274 return
275
276 if tag == re_e.CharClass:
277 node = cast(re.CharClass, UP_node)
278
279 # HYPHEN CARET RBRACKET BACKSLASH
280 special_char_flags = [0]
281 non_special_parts = [] # type: List[str]
282
283 for term in node.terms:
284 _CharClassTermToEre(term, non_special_parts, special_char_flags)
285
286 parts.append('[')
287 if node.negated:
288 parts.append('^')
289
290 # Help the user with some of terrible corner cases
291
292 # - move literal - to end [ab-] not [a-b]
293 # - move literal ^ to end [x^-] not [^x-]
294 # - move literal ] to beginning: []x] not [x]]
295 # - double up \\ because of Gawk extension [\\]
296
297 if special_char_flags[0] & FLAG_RBRACKET:
298 parts.append(']')
299
300 parts.extend(non_special_parts)
301
302 if special_char_flags[0] & FLAG_BACKSLASH:
303 parts.append('\\\\') # TWO backslashes
304
305 if special_char_flags[0] & FLAG_CARET:
306 parts.append('^')
307
308 if special_char_flags[0] & FLAG_HYPHEN:
309 parts.append('-')
310
311 parts.append(']')
312 return
313
314 raise NotImplementedError(tag)
315
316
317 def AsPosixEre(eggex):
318 # type: (value.Eggex) -> str
319 if eggex.as_ere is not None:
320 return eggex.as_ere
321
322 parts = [] # type: List[str]
323 _AsPosixEre(eggex.expr, parts)
324 eggex.as_ere = ''.join(parts)
325 return eggex.as_ere