cpp

Coverage Report

Created: 2024-03-13 14:13

/home/andy/git/oilshell/oil/mycpp/gc_str.cc
Line
Count
Source (jump to first uncovered line)
1
#include "mycpp/gc_str.h"
2
3
#include <ctype.h>  // isalpha(), isdigit()
4
#include <stdarg.h>
5
6
#include <regex>
7
8
#include "mycpp/common.h"
9
#include "mycpp/gc_alloc.h"     // NewStr()
10
#include "mycpp/gc_builtins.h"  // StringToInt()
11
#include "mycpp/gc_list.h"      // join(), split() use it
12
13
GLOBAL_STR(kEmptyString, "");
14
15
static const std::regex gStrFmtRegex("([^%]*)(?:%(-?[0-9]*)(.))?");
16
static const int kMaxFmtWidth = 256;  // arbitrary...
17
18
9
int BigStr::find(BigStr* needle, int pos) {
19
9
  int length = len(this);
20
9
  DCHECK(len(needle) == 1);  // Oil's usage
21
0
  char c = needle->data_[0];
22
29
  for (int i = pos; i < length; ++i) {
23
26
    if (data_[i] == c) {
24
6
      return i;
25
6
    }
26
26
  }
27
3
  return -1;
28
9
}
29
30
6
int BigStr::rfind(BigStr* needle) {
31
6
  int length = len(this);
32
6
  DCHECK(len(needle) == 1);  // Oil's usage
33
0
  char c = needle->data_[0];
34
24
  for (int i = length - 1; i >= 0; --i) {
35
22
    if (data_[i] == c) {
36
4
      return i;
37
4
    }
38
22
  }
39
2
  return -1;
40
6
}
41
42
51
bool BigStr::isdigit() {
43
51
  int n = len(this);
44
51
  if (n == 0) {
45
2
    return false;  // special case
46
2
  }
47
65
  for (int i = 0; i < n; ++i) {
48
49
    if (!::isdigit(data_[i])) {
49
33
      return false;
50
33
    }
51
49
  }
52
16
  return true;
53
49
}
54
55
35
bool BigStr::isalpha() {
56
35
  int n = len(this);
57
35
  if (n == 0) {
58
0
    return false;  // special case
59
0
  }
60
53
  for (int i = 0; i < n; ++i) {
61
39
    if (!::isalpha(data_[i])) {
62
21
      return false;
63
21
    }
64
39
  }
65
14
  return true;
66
35
}
67
68
// e.g. for osh/braces.py
69
8
bool BigStr::isupper() {
70
8
  int n = len(this);
71
8
  if (n == 0) {
72
2
    return false;  // special case
73
2
  }
74
12
  for (int i = 0; i < n; ++i) {
75
8
    if (!::isupper(data_[i])) {
76
2
      return false;
77
2
    }
78
8
  }
79
4
  return true;
80
6
}
81
82
21
bool BigStr::startswith(BigStr* s) {
83
21
  int n = len(s);
84
21
  if (n > len(this)) {
85
0
    return false;
86
0
  }
87
21
  return memcmp(data_, s->data_, n) == 0;
88
21
}
89
90
12
bool BigStr::endswith(BigStr* s) {
91
12
  int len_s = len(s);
92
12
  int len_this = len(this);
93
12
  if (len_s > len_this) {
94
1
    return false;
95
1
  }
96
11
  const char* start = data_ + len_this - len_s;
97
11
  return memcmp(start, s->data_, len_s) == 0;
98
12
}
99
100
// Get a string with one character
101
95
BigStr* BigStr::at(int i) {
102
95
  int length = len(this);
103
95
  if (i < 0) {
104
2
    i = length + i;
105
2
  }
106
95
  DCHECK(0 <= i);
107
95
  DCHECK(i < length);  // had a problem here!
108
109
0
  BigStr* result = NewStr(1);
110
95
  result->data_[0] = data_[i];
111
95
  return result;
112
95
}
113
114
// s[begin:]
115
6
BigStr* BigStr::slice(int begin) {
116
6
  return slice(begin, len(this));
117
6
}
118
119
// s[begin:end]
120
636
BigStr* BigStr::slice(int begin, int end) {
121
636
  int length = len(this);
122
636
  SLICE_ADJUST(begin, end, length);
123
124
636
  DCHECK(0 <= begin && begin <= length);
125
636
  DCHECK(0 <= end && end <= length);
126
127
0
  int new_len = end - begin;
128
636
  DCHECK(0 <= new_len && new_len <= length);
129
130
0
  BigStr* result = NewStr(new_len);
131
636
  memcpy(result->data_, data_ + begin, new_len);
132
133
636
  return result;
134
636
}
135
136
// Used by 'help' builtin and --help, neither of which translate yet.
137
138
0
List<BigStr*>* BigStr::splitlines(bool keep) {
139
0
  DCHECK(keep == true);
140
0
  FAIL(kNotImplemented);
141
0
}
142
143
9
BigStr* BigStr::upper() {
144
9
  int length = len(this);
145
9
  BigStr* result = NewStr(length);
146
9
  char* buffer = result->data();
147
56
  for (int char_index = 0; char_index < length; ++char_index) {
148
47
    buffer[char_index] = toupper(data_[char_index]);
149
47
  }
150
9
  return result;
151
9
}
152
153
6
BigStr* BigStr::lower() {
154
6
  int length = len(this);
155
6
  BigStr* result = NewStr(length);
156
6
  char* buffer = result->data();
157
38
  for (int char_index = 0; char_index < length; ++char_index) {
158
32
    buffer[char_index] = tolower(data_[char_index]);
159
32
  }
160
6
  return result;
161
6
}
162
163
30
BigStr* BigStr::ljust(int width, BigStr* fillchar) {
164
30
  DCHECK(len(fillchar) == 1);
165
166
0
  int length = len(this);
167
30
  int num_fill = width - length;
168
30
  if (num_fill < 0) {
169
10
    return this;
170
20
  } else {
171
20
    BigStr* result = NewStr(width);
172
20
    char c = fillchar->data_[0];
173
20
    memcpy(result->data_, data_, length);
174
42
    for (int i = length; i < width; ++i) {
175
22
      result->data_[i] = c;
176
22
    }
177
20
    return result;
178
20
  }
179
30
}
180
181
30
BigStr* BigStr::rjust(int width, BigStr* fillchar) {
182
30
  DCHECK(len(fillchar) == 1);
183
184
0
  int length = len(this);
185
30
  int num_fill = width - length;
186
30
  if (num_fill < 0) {
187
10
    return this;
188
20
  } else {
189
20
    BigStr* result = NewStr(width);
190
20
    char c = fillchar->data_[0];
191
42
    for (int i = 0; i < num_fill; ++i) {
192
22
      result->data_[i] = c;
193
22
    }
194
20
    memcpy(result->data_ + num_fill, data_, length);
195
20
    return result;
196
20
  }
197
30
}
198
199
729
BigStr* BigStr::replace(BigStr* old, BigStr* new_str) {
200
  // Use -1 as in python2: "aaaa".replace(-1) -> "AAAA"
201
729
  return replace(old, new_str, -1);
202
729
}
203
204
729
BigStr* BigStr::replace(BigStr* old, BigStr* new_str, int count) {
205
  // log("replacing %s with %s", old_data, new_str->data_);
206
729
  const char* old_data = old->data_;
207
208
729
  int this_len = len(this);
209
729
  int old_len = len(old);
210
211
729
  const char* last_possible = data_ + this_len - old_len;
212
213
729
  const char* p_this = data_;  // advances through 'this'
214
215
  // First pass: Calculate number of replacements, and hence new length
216
729
  int replace_count = 0;
217
729
  if (old_len == 0) {
218
0
    replace_count = this_len + 1;
219
0
    if (count > 0) {
220
0
      replace_count = min(replace_count, count);
221
0
    }
222
729
  } else {
223
93.3k
    while (p_this <= last_possible) {
224
92.6k
      if (replace_count != count &&  // limit replacements (if count != -1)
225
92.6k
          memcmp(p_this, old_data, old_len) == 0) {  // equal
226
758
        replace_count++;
227
758
        p_this += old_len;
228
91.9k
      } else {
229
91.9k
        p_this++;
230
91.9k
      }
231
92.6k
    }
232
729
  }
233
234
  // log("replacements %d", replace_count);
235
236
729
  if (replace_count == 0) {
237
4
    return this;  // Reuse the string if there were no replacements
238
4
  }
239
240
725
  int new_str_len = len(new_str);
241
725
  int result_len =
242
725
      this_len - (replace_count * old_len) + (replace_count * new_str_len);
243
244
725
  BigStr* result = NewStr(result_len);
245
246
725
  const char* new_data = new_str->data_;
247
725
  const size_t new_len = new_str_len;
248
249
  // Second pass: Copy pieces into 'result'
250
725
  p_this = data_;                  // back to beginning
251
725
  char* p_result = result->data_;  // advances through 'result'
252
725
  replace_count = 0;
253
254
725
  if (old_len == 0) {
255
    // Should place new_str between each char in this
256
0
    while (p_this < last_possible && replace_count != count) {
257
0
      replace_count++;
258
0
      memcpy(p_result, new_data, new_len);  // Copy from new_str
259
0
      p_result += new_len;                  // Move past new_str
260
261
      // Write a char from this
262
0
      *p_result = *p_this;
263
0
      p_this++;
264
0
      p_result++;
265
0
    }
266
267
0
    if (replace_count != count) {
268
      // Write a copy of new_str at the end
269
0
      assert(p_this == last_possible);
270
0
      memcpy(p_result, new_data, new_len);
271
0
    } else if (p_this <= last_possible) {
272
      // Write the last part of string
273
0
      memcpy(p_result, p_this, data_ + this_len - p_this);
274
0
    }
275
725
  } else {
276
93.3k
    while (p_this <= last_possible) {
277
      // Note: would be more efficient if we remembered the match positions
278
92.6k
      if (replace_count != count &&  // limit replacements (if count != -1)
279
92.6k
          memcmp(p_this, old_data, old_len) == 0) {  // equal
280
758
        memcpy(p_result, new_data, new_len);         // Copy from new_str
281
758
        replace_count++;
282
758
        p_result += new_len;
283
758
        p_this += old_len;
284
91.8k
      } else {  // copy 1 byte
285
91.8k
        *p_result = *p_this;
286
91.8k
        p_result++;
287
91.8k
        p_this++;
288
91.8k
      }
289
92.6k
    }
290
725
    memcpy(p_result, p_this, data_ + this_len - p_this);  // last part of string
291
725
  }
292
293
0
  return result;
294
729
}
295
296
enum class StripWhere {
297
  Left,
298
  Right,
299
  Both,
300
};
301
302
const int kWhitespace = -1;
303
304
162
bool OmitChar(uint8_t ch, int what) {
305
162
  if (what == kWhitespace) {
306
122
    return isspace(ch);
307
122
  } else {
308
40
    return what == ch;
309
40
  }
310
162
}
311
312
// StripAny is modeled after CPython's do_strip() in stringobject.c, and can
313
// implement 6 functions:
314
//
315
//   strip / lstrip / rstrip
316
//   strip(char) / lstrip(char) / rstrip(char)
317
//
318
// Args:
319
//   where: which ends to strip from
320
//   what: kWhitespace, or an ASCII code 0-255
321
322
62
BigStr* StripAny(BigStr* s, StripWhere where, int what) {
323
62
  int length = len(s);
324
62
  const char* char_data = s->data();
325
326
62
  int i = 0;
327
62
  if (where != StripWhere::Right) {
328
90
    while (i < length && OmitChar(char_data[i], what)) {
329
52
      i++;
330
52
    }
331
38
  }
332
333
62
  int j = length;
334
62
  if (where != StripWhere::Left) {
335
98
    do {
336
98
      j--;
337
98
    } while (j >= i && OmitChar(char_data[j], what));
338
46
    j++;
339
46
  }
340
341
62
  if (i == j) {  // Optimization to reuse existing object
342
18
    return kEmptyString;
343
18
  }
344
345
44
  if (i == 0 && j == length) {  // nothing stripped
346
8
    return s;
347
8
  }
348
349
  // Note: makes a copy in leaky version, and will in GC version too
350
36
  int new_len = j - i;
351
36
  BigStr* result = NewStr(new_len);
352
36
  memcpy(result->data(), s->data() + i, new_len);
353
36
  return result;
354
44
}
355
356
22
BigStr* BigStr::strip() {
357
22
  return StripAny(this, StripWhere::Both, kWhitespace);
358
22
}
359
360
// Used for CommandSub in osh/cmd_exec.py
361
8
BigStr* BigStr::rstrip(BigStr* chars) {
362
8
  DCHECK(len(chars) == 1);
363
0
  int c = chars->data_[0];
364
8
  return StripAny(this, StripWhere::Right, c);
365
8
}
366
367
16
BigStr* BigStr::rstrip() {
368
16
  return StripAny(this, StripWhere::Right, kWhitespace);
369
16
}
370
371
8
BigStr* BigStr::lstrip(BigStr* chars) {
372
8
  DCHECK(len(chars) == 1);
373
0
  int c = chars->data_[0];
374
8
  return StripAny(this, StripWhere::Left, c);
375
8
}
376
377
8
BigStr* BigStr::lstrip() {
378
8
  return StripAny(this, StripWhere::Left, kWhitespace);
379
8
}
380
381
24
BigStr* BigStr::join(List<BigStr*>* items) {
382
24
  int length = 0;
383
384
24
  int num_parts = len(items);
385
386
  // " ".join([]) == ""
387
24
  if (num_parts == 0) {
388
9
    return kEmptyString;
389
9
  }
390
391
  // Common case
392
  // 'anything'.join(["foo"]) == "foo"
393
15
  if (num_parts == 1) {
394
4
    return items->at(0);
395
4
  }
396
397
317
  for (int i = 0; i < num_parts; ++i) {
398
306
    length += len(items->at(i));
399
306
  }
400
  // add length of all the separators
401
11
  int this_len = len(this);
402
11
  length += this_len * (num_parts - 1);
403
404
11
  BigStr* result = NewStr(length);
405
11
  char* p_result = result->data_;  // advances through
406
407
317
  for (int i = 0; i < num_parts; ++i) {
408
    // log("i %d", i);
409
306
    if (i != 0 && this_len) {             // optimize common case of ''.join()
410
16
      memcpy(p_result, data_, this_len);  // copy the separator
411
16
      p_result += this_len;
412
      // log("this_len %d", this_len);
413
16
    }
414
415
306
    int n = len(items->at(i));
416
    // log("n: %d", n);
417
306
    memcpy(p_result, items->at(i)->data_, n);  // copy the list item
418
306
    p_result += n;
419
306
  }
420
421
11
  return result;
422
15
}
423
424
98
static void AppendPart(List<BigStr*>* result, BigStr* s, int left, int right) {
425
98
  int new_len = right - left;
426
98
  BigStr* part;
427
98
  if (new_len == 0) {
428
42
    part = kEmptyString;
429
56
  } else {
430
56
    part = NewStr(new_len);
431
56
    memcpy(part->data_, s->data_ + left, new_len);
432
56
  }
433
98
  result->append(part);
434
98
}
435
436
// Split BigStr into List<BigStr*> of parts separated by 'sep'.
437
// The code structure is taken from CPython's Objects/stringlib/split.h.
438
38
List<BigStr*>* BigStr::split(BigStr* sep, int max_split) {
439
38
  DCHECK(sep != nullptr);
440
38
  DCHECK(len(sep) == 1);  // we can only split one char
441
0
  char sep_char = sep->data_[0];
442
443
38
  int str_len = len(this);
444
38
  if (str_len == 0) {
445
    // weird case consistent with Python: ''.split(':') == ['']
446
4
    return NewList<BigStr*>({kEmptyString});
447
4
  }
448
449
34
  List<BigStr*>* result = NewList<BigStr*>({});
450
34
  int left = 0;
451
34
  int right = 0;
452
34
  int num_parts = 0;  // 3 splits results in 4 parts
453
454
114
  while (right < str_len && num_parts < max_split) {
455
    // search for separator
456
186
    for (; right < str_len; right++) {
457
174
      if (data_[right] == sep_char) {
458
68
        AppendPart(result, this, left, right);
459
68
        right++;
460
68
        left = right;
461
68
        num_parts++;
462
68
        break;
463
68
      }
464
174
    }
465
80
  }
466
34
  if (num_parts == 0) {  // Optimization when there is no split
467
4
    result->append(this);
468
30
  } else if (left <= str_len) {  // Last part
469
30
    AppendPart(result, this, left, str_len);
470
30
  }
471
472
34
  return result;
473
38
}
474
475
32
List<BigStr*>* BigStr::split(BigStr* sep) {
476
32
  return this->split(sep, len(this));
477
32
}
478
479
4.29k
unsigned BigStr::hash(HashFunc h) {
480
4.29k
  if (!is_hashed_) {
481
280
    hash_ = h(data_, len(this)) >> 1;
482
280
    is_hashed_ = 1;
483
280
  }
484
4.29k
  return hash_;
485
4.29k
}
486
487
501
static inline BigStr* _StrFormat(const char* fmt, int fmt_len, va_list args) {
488
501
  auto beg = std::cregex_iterator(fmt, fmt + fmt_len, gStrFmtRegex);
489
501
  auto end = std::cregex_iterator();
490
491
501
  char int_buf[kMaxFmtWidth];
492
501
  std::string buf;
493
1.11k
  for (std::cregex_iterator it = beg; it != end; ++it) {
494
1.11k
    const std::cmatch& match = *it;
495
496
1.11k
    const std::csub_match& lit_m = match[1];
497
1.11k
    DCHECK(lit_m.matched);
498
0
    const std::string& lit_s = lit_m.str();
499
1.11k
    buf.append(lit_s);
500
501
1.11k
    int width = 0;
502
1.11k
    bool zero_pad = false;
503
1.11k
    bool pad_back = false;
504
1.11k
    const std::csub_match& width_m = match[2];
505
1.11k
    const std::string& width_s = width_m.str();
506
1.11k
    bool ok = false;
507
1.11k
    if (width_m.matched && !width_s.empty()) {
508
23
      if (width_s[0] == '0') {
509
5
        zero_pad = true;
510
5
        DCHECK(width_s.size() > 1);
511
0
        ok = StringToInt(width_s.c_str() + 1, width_s.size() - 1, 10, &width);
512
5
        DCHECK(ok);
513
0
        (void)ok;  // silence unused var warning in opt
514
18
      } else {
515
18
        ok = StringToInt(width_s.c_str(), width_s.size(), 10, &width);
516
18
        DCHECK(ok);
517
18
      }
518
23
      if (width < 0) {
519
2
        pad_back = true;
520
2
        width *= -1;
521
2
      }
522
23
      DCHECK(0 <= width && width < kMaxFmtWidth);
523
23
    }
524
525
0
    char const* str_to_add = nullptr;
526
1.11k
    int add_len = 0;
527
1.11k
    const std::csub_match& code_m = match[3];
528
1.11k
    const std::string& code_s = code_m.str();
529
1.11k
    if (!code_m.matched) {
530
501
      DCHECK(!width_m.matched);  // python errors on invalid format operators
531
0
      break;
532
501
    }
533
615
    DCHECK(code_s.size() == 1);
534
0
    switch (code_s[0]) {
535
14
    case '%': {
536
14
      str_to_add = code_s.c_str();
537
14
      add_len = 1;
538
14
      break;
539
0
    }
540
310
    case 's': {
541
310
      BigStr* s = va_arg(args, BigStr*);
542
      // Check type unconditionally because mycpp doesn't always check it
543
310
      CHECK(ObjHeader::FromObject(s)->type_tag == TypeTag::BigStr);
544
545
0
      str_to_add = s->data();
546
310
      add_len = len(s);
547
310
      zero_pad = false;  // python ignores the 0 directive for strings
548
310
      break;
549
0
    }
550
32
    case 'r': {
551
32
      BigStr* s = va_arg(args, BigStr*);
552
      // Check type unconditionally because mycpp doesn't always check it
553
32
      CHECK(ObjHeader::FromObject(s)->type_tag == TypeTag::BigStr);
554
555
0
      s = repr(s);
556
32
      str_to_add = s->data();
557
32
      add_len = len(s);
558
32
      zero_pad = false;  // python ignores the 0 directive for strings
559
32
      break;
560
0
    }
561
250
    case 'd':  // fallthrough
562
259
    case 'o': {
563
259
      int d = va_arg(args, int);
564
259
      add_len = snprintf(int_buf, kMaxFmtWidth,
565
259
                         match.str().c_str() + lit_s.size(), d);
566
259
      DCHECK(add_len > 0);
567
0
      str_to_add = int_buf;
568
259
      break;
569
250
    }
570
0
    default:
571
0
      DCHECK(0);
572
0
      break;
573
615
    }
574
615
    DCHECK(str_to_add != nullptr);
575
576
615
    if (pad_back) {
577
2
      buf.append(str_to_add, add_len);
578
2
    }
579
615
    if (add_len < width) {
580
42
      for (int i = 0; i < width - add_len; ++i) {
581
36
        buf.push_back(zero_pad ? '0' : ' ');
582
36
      }
583
6
    }
584
615
    if (!pad_back) {
585
613
      buf.append(str_to_add, add_len);
586
613
    }
587
615
  }
588
589
501
  return StrFromC(buf.c_str(), buf.size());
590
501
}
591
592
43
BigStr* StrIter::Value() {  // similar to at()
593
43
  BigStr* result = NewStr(1);
594
43
  result->data_[0] = s_->data_[i_];
595
43
  DCHECK(result->data_[1] == '\0');
596
0
  return result;
597
43
}
598
599
494
BigStr* StrFormat(const char* fmt, ...) {
600
494
  va_list args;
601
494
  va_start(args, fmt);
602
494
  BigStr* ret = _StrFormat(fmt, strlen(fmt), args);
603
494
  va_end(args);
604
494
  return ret;
605
494
}
606
607
7
BigStr* StrFormat(BigStr* fmt, ...) {
608
7
  va_list args;
609
7
  va_start(args, fmt);
610
7
  BigStr* ret = _StrFormat(fmt->data(), len(fmt), args);
611
7
  va_end(args);
612
7
  return ret;
613
7
}