cpp

Coverage Report

Created: 2023-09-13 01:07

/home/andy/git/oilshell/oil/mycpp/gc_str.cc
Line
Count
Source (jump to first uncovered line)
1
#include "mycpp/gc_str.h"
2
3
#include <ctype.h>  // isalpha(), isdigit()
4
#include <stdarg.h>
5
6
#include <regex>
7
8
#include "mycpp/common.h"
9
#include "mycpp/gc_alloc.h"     // NewStr()
10
#include "mycpp/gc_builtins.h"  // StringToInteger()
11
#include "mycpp/gc_list.h"      // join(), split() use it
12
13
// Shared empty string.  Functions in this file (StripAny, Str::join,
// AppendPart, Str::split) return it instead of allocating a new zero-length
// Str.  NOTE(review): GLOBAL_STR is defined elsewhere; presumably it creates
// a statically allocated Str -- confirm against its definition.
GLOBAL_STR(kEmptyString, "");
14
15
// Tokenizer for printf-style format strings used by _StrFormat().  Each
// match is a run of non-'%' literal text (group 1), optionally followed by
// '%', an optional width made of an optional '-' and digits (group 2), and a
// single conversion character (group 3).
static const std::regex gStrFmtRegex("([^%]*)(?:%(-?[0-9]*)(.))?");
16
// Size of the scratch buffer _StrFormat() uses for formatted integers, and
// the exclusive upper bound it asserts on a parsed field width.
static const int kMaxFmtWidth = 256;  // arbitrary...
17
18
6
// Return the index of the first occurrence of the one-byte string 'needle'
// at or after position 'pos', or -1 if it does not occur.
//
// A negative 'pos' counts from the end of the string and is clamped to 0,
// matching Python's s.find(c, pos).  Previously a negative 'pos' was used
// directly as an array index, which read memory before data_.
int Str::find(Str* needle, int pos) {
  int len_ = len(this);
  assert(len(needle) == 1);  // Oil's usage
  char c = needle->data_[0];

  if (pos < 0) {
    pos += len_;
    if (pos < 0) {
      pos = 0;
    }
  }

  for (int i = pos; i < len_; ++i) {
    if (data_[i] == c) {
      return i;
    }
  }
  return -1;
}
29
30
6
// Return the index of the last occurrence of the one-byte string 'needle',
// or -1 if it does not occur.
int Str::rfind(Str* needle) {
  assert(len(needle) == 1);  // Oil's usage
  const char target = needle->data_[0];

  // Walk backwards from the final byte.
  int i = len(this);
  while (i-- > 0) {
    if (data_[i] == target) {
      return i;
    }
  }
  return -1;
}
41
42
51
bool Str::isdigit() {
43
51
  int n = len(this);
44
51
  if (n == 0) {
45
2
    return false;  // special case
46
2
  }
47
65
  for (int i = 0; i < n; ++i) {
48
49
    if (!::isdigit(data_[i])) {
49
33
      return false;
50
33
    }
51
49
  }
52
16
  return true;
53
49
}
54
55
35
bool Str::isalpha() {
56
35
  int n = len(this);
57
35
  if (n == 0) {
58
0
    return false;  // special case
59
0
  }
60
53
  for (int i = 0; i < n; ++i) {
61
39
    if (!::isalpha(data_[i])) {
62
21
      return false;
63
21
    }
64
39
  }
65
14
  return true;
66
35
}
67
68
// e.g. for osh/braces.py
69
8
bool Str::isupper() {
70
8
  int n = len(this);
71
8
  if (n == 0) {
72
2
    return false;  // special case
73
2
  }
74
12
  for (int i = 0; i < n; ++i) {
75
8
    if (!::isupper(data_[i])) {
76
2
      return false;
77
2
    }
78
8
  }
79
4
  return true;
80
6
}
81
82
18
// Return true if this string begins with the bytes of 's'.
bool Str::startswith(Str* s) {
  const int prefix_len = len(s);
  // A prefix longer than the string can never match; otherwise compare the
  // leading bytes.
  return prefix_len <= len(this) && memcmp(data_, s->data_, prefix_len) == 0;
}
89
90
8
// Return true if this string ends with the bytes of 's'.
bool Str::endswith(Str* s) {
  const int suffix_len = len(s);
  const int total_len = len(this);
  if (suffix_len > total_len) {
    return false;  // suffix cannot be longer than the string
  }
  // Compare against the last suffix_len bytes.
  const int offset = total_len - suffix_len;
  return memcmp(data_ + offset, s->data_, suffix_len) == 0;
}
99
100
// Get a string with one character
101
54
Str* Str::index_(int i) {
102
54
  int len_ = len(this);
103
54
  if (i < 0) {
104
2
    i = len_ + i;
105
2
  }
106
54
  assert(i >= 0);
107
0
  assert(i < len_);  // had a problem here!
108
109
0
  Str* result = NewStr(1);
110
54
  result->data_[0] = data_[i];
111
54
  return result;
112
54
}
113
114
// s[begin:end:step]
115
2
Str* Str::slice(int begin, int end, int step) {
116
2
  int len_ = len(this);
117
2
  begin = std::min(begin, len_);
118
2
  end = std::min(end, len_);
119
120
2
  assert(begin <= len_);
121
0
  assert(end <= len_);
122
123
2
  if (begin < 0) {
124
0
    begin = len_ + begin;
125
0
  }
126
127
2
  if (end < 0) {
128
0
    end = len_ + end;
129
0
  }
130
131
2
  begin = std::min(begin, len_);
132
2
  end = std::min(end, len_);
133
134
2
  begin = std::max(begin, 0);
135
2
  end = std::max(end, 0);
136
137
2
  assert(begin >= 0);
138
0
  assert(begin <= len_);
139
140
0
  assert(end >= 0);
141
0
  assert(end <= len_);
142
143
0
  int astep = std::abs(step);
144
2
  int new_len = (end - begin);
145
2
  new_len = new_len / astep + (new_len % astep);
146
147
  // Tried to use std::clamp() here but we're not compiling against cxx-17
148
2
  new_len = std::max(new_len, 0);
149
2
  new_len = std::min(new_len, len_);
150
151
  // printf("len(%d) [%d, %d] newlen(%d)\n",  len_, begin, end, new_len);
152
153
2
  assert(new_len >= 0);
154
0
  assert(new_len <= len_);
155
156
0
  Str* result = NewStr(new_len + 1);
157
  // step might be negative
158
2
  int j = 0;
159
12
  for (int i = begin; begin <= i && i < end; i += step, j++) {
160
10
    result->data_[j] = data_[i];
161
10
  }
162
2
  result->data_[new_len] = '\0';
163
164
2
  return result;
165
2
}
166
167
// s[begin:end]
168
631
Str* Str::slice(int begin, int end) {
169
631
  int len_ = len(this);
170
631
  begin = std::min(begin, len_);
171
631
  end = std::min(end, len_);
172
173
631
  assert(begin <= len_);
174
0
  assert(end <= len_);
175
176
631
  if (begin < 0) {
177
284
    begin = len_ + begin;
178
284
  }
179
180
631
  if (end < 0) {
181
287
    end = len_ + end;
182
287
  }
183
184
631
  begin = std::min(begin, len_);
185
631
  end = std::min(end, len_);
186
187
631
  begin = std::max(begin, 0);
188
631
  end = std::max(end, 0);
189
190
631
  assert(begin >= 0);
191
0
  assert(begin <= len_);
192
193
0
  assert(end >= 0);
194
0
  assert(end <= len_);
195
196
0
  int new_len = end - begin;
197
198
  // Tried to use std::clamp() here but we're not compiling against cxx-17
199
631
  new_len = std::max(new_len, 0);
200
631
  new_len = std::min(new_len, len_);
201
202
  // printf("len(%d) [%d, %d] newlen(%d)\n",  len_, begin, end, new_len);
203
204
631
  assert(new_len >= 0);
205
0
  assert(new_len <= len_);
206
207
0
  Str* result = NewStr(new_len);
208
631
  memcpy(result->data_, data_ + begin, new_len);
209
210
631
  return result;
211
631
}
212
213
// s[begin:]
214
6
Str* Str::slice(int begin) {
215
6
  int len_ = len(this);
216
6
  if (begin == 0) {
217
0
    return this;  // s[i:] where i == 0 is common in here docs
218
0
  }
219
6
  if (begin < 0) {
220
1
    begin = len_ + begin;
221
1
  }
222
6
  return slice(begin, len_);
223
6
}
224
225
// Used by 'help' builtin and --help, neither of which translate yet.
226
227
0
List<Str*>* Str::splitlines(bool keep) {
228
0
  assert(keep == true);
229
0
  FAIL(kNotImplemented);
230
0
}
231
232
6
// Return a new string of the same length with every byte mapped through
// toupper().
Str* Str::upper() {
  int len_ = len(this);
  Str* result = NewStr(len_);
  char* buffer = result->data();
  for (int char_index = 0; char_index < len_; ++char_index) {
    // toupper() requires a value representable as unsigned char (or EOF);
    // passing a negative plain char is undefined behavior.
    buffer[char_index] = toupper(static_cast<unsigned char>(data_[char_index]));
  }
  return result;
}
241
242
6
// Return a new string of the same length with every byte mapped through
// tolower().
Str* Str::lower() {
  int len_ = len(this);
  Str* result = NewStr(len_);
  char* buffer = result->data();
  for (int char_index = 0; char_index < len_; ++char_index) {
    // tolower() requires a value representable as unsigned char (or EOF);
    // passing a negative plain char is undefined behavior.
    buffer[char_index] = tolower(static_cast<unsigned char>(data_[char_index]));
  }
  return result;
}
251
252
30
// Left-justify: return a string of length 'width' consisting of this string
// followed by copies of the one-byte 'fillchar'.  If this string is already
// longer than 'width' it is returned unchanged.
Str* Str::ljust(int width, Str* fillchar) {
  assert(len(fillchar) == 1);

  const int n = len(this);
  const int pad = width - n;
  if (pad < 0) {
    return this;
  }

  Str* padded = NewStr(width);
  memcpy(padded->data_, data_, n);                  // original text first
  memset(padded->data_ + n, fillchar->data_[0], pad);  // then the fill
  return padded;
}
269
270
30
// Right-justify: return a string of length 'width' consisting of copies of
// the one-byte 'fillchar' followed by this string.  If this string is already
// longer than 'width' it is returned unchanged.
Str* Str::rjust(int width, Str* fillchar) {
  assert(len(fillchar) == 1);

  const int n = len(this);
  const int pad = width - n;
  if (pad < 0) {
    return this;
  }

  Str* padded = NewStr(width);
  memset(padded->data_, fillchar->data_[0], pad);  // fill first
  memcpy(padded->data_ + pad, data_, n);           // then the original text
  return padded;
}
287
288
729
// Return a copy of this string with every non-overlapping occurrence of 'old'
// replaced by 'new_str', scanning left to right.  If nothing is replaced the
// string itself is returned.
//
// An empty 'old' follows Python: new_str is inserted before every byte and at
// the end ('ab'.replace('', '-') == '-a-b-').  Previously an empty 'old'
// matched at the same position forever, because the scan advanced by
// old_len == 0.
//
// The scan uses integer offsets rather than a "last possible" pointer, which
// pointed before data_ whenever 'old' was longer than this string.
Str* Str::replace(Str* old, Str* new_str) {
  // log("replacing %s with %s", old_data, new_str->data_);
  const char* old_data = old->data_;
  const char* new_data = new_str->data_;

  const int this_len = len(this);
  const int old_len = len(old);
  const int new_str_len = len(new_str);

  if (old_len == 0) {
    if (new_str_len == 0) {
      return this;  // replacing nothing with nothing
    }
    Str* result = NewStr(this_len + (this_len + 1) * new_str_len);
    char* p_result = result->data_;
    for (int i = 0; i < this_len; ++i) {
      memcpy(p_result, new_data, new_str_len);
      p_result += new_str_len;
      *p_result++ = data_[i];
    }
    memcpy(p_result, new_data, new_str_len);  // trailing insertion
    return result;
  }

  // Last offset at which 'old' could still fit; negative if it never fits.
  const int last_possible = this_len - old_len;

  // First pass: Calculate number of replacements, and hence new length
  int replace_count = 0;
  for (int i = 0; i <= last_possible;) {
    if (memcmp(data_ + i, old_data, old_len) == 0) {  // equal
      replace_count++;
      i += old_len;
    } else {
      i++;
    }
  }

  // log("replacements %d", replace_count);

  if (replace_count == 0) {
    return this;  // Reuse the string if there were no replacements
  }

  int result_len =
      this_len - (replace_count * old_len) + (replace_count * new_str_len);

  Str* result = NewStr(result_len);

  // Second pass: Copy pieces into 'result'
  char* p_result = result->data_;  // advances through 'result'
  int i = 0;                       // advances through 'this'
  while (i <= last_possible) {
    // Note: would be more efficient if we remembered the match positions
    if (memcmp(data_ + i, old_data, old_len) == 0) {  // equal
      memcpy(p_result, new_data, new_str_len);        // Copy from new_str
      p_result += new_str_len;
      i += old_len;
    } else {  // copy 1 byte
      *p_result++ = data_[i];
      i++;
    }
  }
  memcpy(p_result, data_ + i, this_len - i);  // last part of string
  return result;
}
344
345
// Which end(s) of a string StripAny() trims.
enum class StripWhere {
  Left,
  Right,
  Both,
};

// Sentinel for the 'what' argument meaning "any whitespace byte", as opposed
// to one specific byte value.
const int kWhitespace = -1;

// Return true if byte 'ch' should be stripped: when 'what' is kWhitespace,
// any byte for which isspace() is non-zero; otherwise only the byte equal to
// 'what'.
bool OmitChar(uint8_t ch, int what) {
  return (what == kWhitespace) ? (isspace(ch) != 0) : (what == ch);
}
360
361
// StripAny is modeled after CPython's do_strip() in stringobject.c, and can
362
// implement 6 functions:
363
//
364
//   strip / lstrip / rstrip
365
//   strip(char) / lstrip(char) / rstrip(char)
366
//
367
// Args:
368
//   where: which ends to strip from
369
//   what: kWhitespace, or an ASCII code 0-255
370
371
62
Str* StripAny(Str* s, StripWhere where, int what) {
372
62
  int length = len(s);
373
62
  const char* char_data = s->data();
374
375
62
  int i = 0;
376
62
  if (where != StripWhere::Right) {
377
90
    while (i < length && OmitChar(char_data[i], what)) {
378
52
      i++;
379
52
    }
380
38
  }
381
382
62
  int j = length;
383
62
  if (where != StripWhere::Left) {
384
98
    do {
385
98
      j--;
386
98
    } while (j >= i && OmitChar(char_data[j], what));
387
46
    j++;
388
46
  }
389
390
62
  if (i == j) {  // Optimization to reuse existing object
391
18
    return kEmptyString;
392
18
  }
393
394
44
  if (i == 0 && j == length) {  // nothing stripped
395
8
    return s;
396
8
  }
397
398
  // Note: makes a copy in leaky version, and will in GC version too
399
36
  int new_len = j - i;
400
36
  Str* result = NewStr(new_len);
401
36
  memcpy(result->data(), s->data() + i, new_len);
402
36
  return result;
403
44
}
404
405
22
// Strip whitespace from both ends; see StripAny() for which object is
// returned.
Str* Str::strip() {
  Str* stripped = StripAny(this, StripWhere::Both, kWhitespace);
  return stripped;
}
408
409
// Used for CommandSub in osh/cmd_exec.py
410
8
Str* Str::rstrip(Str* chars) {
411
8
  assert(len(chars) == 1);
412
0
  int c = chars->data_[0];
413
8
  return StripAny(this, StripWhere::Right, c);
414
8
}
415
416
16
// Strip trailing whitespace; see StripAny() for which object is returned.
Str* Str::rstrip() {
  Str* stripped = StripAny(this, StripWhere::Right, kWhitespace);
  return stripped;
}
419
420
8
// Strip leading occurrences of the single byte in 'chars' (which must have
// length 1).
Str* Str::lstrip(Str* chars) {
  assert(len(chars) == 1);
  // Convert through unsigned char so the code is in 0-255, which is what
  // StripAny()/OmitChar() compare against.  A plain (signed) char would
  // sign-extend: bytes >= 0x80 would never match, and 0xFF would become -1,
  // i.e. kWhitespace.
  int c = static_cast<unsigned char>(chars->data_[0]);
  return StripAny(this, StripWhere::Left, c);
}
425
426
8
// Strip leading whitespace; see StripAny() for which object is returned.
Str* Str::lstrip() {
  Str* stripped = StripAny(this, StripWhere::Left, kWhitespace);
  return stripped;
}
429
430
58
// Concatenate 'items', placing this string between consecutive elements
// (Python's sep.join(items)).  An empty list yields kEmptyString and a
// one-element list yields that element itself.
Str* Str::join(List<Str*>* items) {
  const int count = len(items);

  // " ".join([]) == ""
  if (count == 0) {
    return kEmptyString;
  }

  // Common case
  // 'anything'.join(["foo"]) == "foo"
  if (count == 1) {
    return items->index_(0);
  }

  // Total size = all separators + all items.
  const int sep_len = len(this);
  int total = sep_len * (count - 1);
  for (int i = 0; i < count; ++i) {
    total += len(items->index_(i));
  }

  Str* joined = NewStr(total);
  char* out = joined->data_;  // write cursor

  for (int i = 0; i < count; ++i) {
    if (i > 0 && sep_len > 0) {  // optimize common case of ''.join()
      memcpy(out, data_, sep_len);  // copy the separator
      out += sep_len;
    }

    Str* item = items->index_(i);
    const int item_len = len(item);
    memcpy(out, item->data_, item_len);  // copy the list item
    out += item_len;
  }

  return joined;
}
472
473
98
// Append the bytes s[left:right] to 'result' as a new Str.  A zero-length
// range appends the shared kEmptyString instead of allocating.
static void AppendPart(List<Str*>* result, Str* s, int left, int right) {
  const int part_len = right - left;
  if (part_len == 0) {
    result->append(kEmptyString);
    return;
  }

  Str* part = NewStr(part_len);
  memcpy(part->data_, s->data_ + left, part_len);
  result->append(part);
}
484
485
// Split Str into List<Str*> of parts separated by 'sep'.
486
// The code structure is taken from CPython's Objects/stringlib/split.h.
487
38
List<Str*>* Str::split(Str* sep, int max_split) {
488
38
  DCHECK(sep != nullptr);
489
38
  DCHECK(len(sep) == 1);  // we can only split one char
490
0
  char sep_char = sep->data_[0];
491
492
38
  int str_len = len(this);
493
38
  if (str_len == 0) {
494
    // weird case consistent with Python: ''.split(':') == ['']
495
4
    return NewList<Str*>({kEmptyString});
496
4
  }
497
498
34
  List<Str*>* result = NewList<Str*>({});
499
34
  int left = 0;
500
34
  int right = 0;
501
34
  int num_parts = 0;  // 3 splits results in 4 parts
502
503
114
  while (right < str_len && num_parts < max_split) {
504
    // search for separator
505
186
    for (; right < str_len; right++) {
506
174
      if (data_[right] == sep_char) {
507
68
        AppendPart(result, this, left, right);
508
68
        right++;
509
68
        left = right;
510
68
        num_parts++;
511
68
        break;
512
68
      }
513
174
    }
514
80
  }
515
34
  if (num_parts == 0) {  // Optimization when there is no split
516
4
    result->append(this);
517
30
  } else if (left <= str_len) {  // Last part
518
30
    AppendPart(result, this, left, str_len);
519
30
  }
520
521
34
  return result;
522
38
}
523
524
32
// Split on 'sep' with no effective limit: a string of N bytes can contain at
// most N separators, so N is passed as max_split.
List<Str*>* Str::split(Str* sep) {
  const int no_limit = len(this);
  return this->split(sep, no_limit);
}
527
528
477
// Shared implementation of StrFormat(): expand a printf-style format string
// into a new Str.
//
// Args:
//   fmt, fmt_len: the format string and its length in bytes
//   args: the variadic arguments, consumed left to right
//
// Supported conversions: %% (literal percent), %s (Str*), %r (Str*, passed
// through repr()), %d and %o (int).  An optional width may precede the
// conversion: a leading '-' pads on the right, a leading '0' pads with zeros
// (ignored for %s and %r).
//
// Fixes over the previous version:
//  - StringToInteger() was called inside assert(), so with NDEBUG the width
//    was never parsed and silently stayed 0.  It is now called
//    unconditionally and only its result is asserted.
//  - the snprintf() return value is clamped to what actually fits in
//    int_buf before being used as a length.
static inline Str* _StrFormat(const char* fmt, int fmt_len, va_list args) {
  auto beg = std::cregex_iterator(fmt, fmt + fmt_len, gStrFmtRegex);
  auto end = std::cregex_iterator();

  char int_buf[kMaxFmtWidth];
  std::string buf;
  for (std::cregex_iterator it = beg; it != end; ++it) {
    const std::cmatch& match = *it;

    // Group 1: literal text before the next '%'
    const std::csub_match& lit_m = match[1];
    assert(lit_m.matched);
    const std::string& lit_s = lit_m.str();
    buf.append(lit_s);

    // Group 2: optional width, e.g. "5", "-5" or "05"
    int width = 0;
    bool zero_pad = false;
    bool pad_back = false;
    const std::csub_match& width_m = match[2];
    const std::string& width_s = width_m.str();
    if (width_m.matched && !width_s.empty()) {
      bool parsed = false;
      if (width_s[0] == '0') {
        zero_pad = true;
        assert(width_s.size() > 1);
        parsed = StringToInteger(width_s.c_str() + 1, width_s.size() - 1, 10,
                                 &width);
      } else {
        parsed = StringToInteger(width_s.c_str(), width_s.size(), 10, &width);
      }
      assert(parsed);
      (void)parsed;  // unused when assert() is compiled out

      if (width < 0) {
        pad_back = true;
        width *= -1;
      }
      assert(width >= 0 && width < kMaxFmtWidth);
    }

    // Group 3: the conversion character
    char const* str_to_add = nullptr;
    int add_len = 0;
    const std::csub_match& code_m = match[3];
    const std::string& code_s = code_m.str();
    if (!code_m.matched) {
      assert(!width_m.matched);  // python errors on invalid format operators
      break;
    }
    assert(code_s.size() == 1);
    switch (code_s[0]) {
    case '%': {
      str_to_add = code_s.c_str();
      add_len = 1;
      break;
    }
    case 's': {
      Str* s = va_arg(args, Str*);
      // Check type unconditionally because mycpp doesn't always check it
      CHECK(ObjHeader::FromObject(s)->type_tag == TypeTag::Str);

      str_to_add = s->data();
      add_len = len(s);
      zero_pad = false;  // python ignores the 0 directive for strings
      break;
    }
    case 'r': {
      Str* s = va_arg(args, Str*);
      // Check type unconditionally because mycpp doesn't always check it
      CHECK(ObjHeader::FromObject(s)->type_tag == TypeTag::Str);

      s = repr(s);
      str_to_add = s->data();
      add_len = len(s);
      zero_pad = false;  // python ignores the 0 directive for strings
      break;
    }
    case 'd':  // fallthrough
    case 'o': {
      int d = va_arg(args, int);
      // Reuse the matched directive text (from '%' onward) as the snprintf
      // format, so snprintf applies the width and padding itself.
      add_len = snprintf(int_buf, kMaxFmtWidth,
                         match.str().c_str() + lit_s.size(), d);
      assert(add_len > 0);
      if (add_len >= kMaxFmtWidth) {
        // Output was truncated; only kMaxFmtWidth - 1 bytes are in int_buf.
        add_len = kMaxFmtWidth - 1;
      }
      str_to_add = int_buf;
      break;
    }
    default:
      assert(0);
      break;
    }
    assert(str_to_add != nullptr);

    // Emit the value, with padding before it (default) or after it ('-').
    if (pad_back) {
      buf.append(str_to_add, add_len);
    }
    if (add_len < width) {
      for (int i = 0; i < width - add_len; ++i) {
        buf.push_back(zero_pad ? '0' : ' ');
      }
    }
    if (!pad_back) {
      buf.append(str_to_add, add_len);
    }
  }

  return StrFromC(buf.c_str(), buf.size());
}
629
630
159
// Return the byte at the iterator's current position (s_->data_[i_]) as a
// new one-character string.  Similar to index_().
Str* StrIter::Value() {
  Str* one_char = NewStr(1);
  one_char->data_[0] = s_->data_[i_];
  // The byte after the single character is expected to be the terminator.
  DCHECK(one_char->data_[1] == '\0');
  return one_char;
}
636
637
470
// Format the variadic arguments according to the NUL-terminated C string
// 'fmt' and return the result as a new Str.  See _StrFormat() for the
// supported conversions.
Str* StrFormat(const char* fmt, ...) {
  va_list ap;
  va_start(ap, fmt);
  const int fmt_len = strlen(fmt);
  Str* formatted = _StrFormat(fmt, fmt_len, ap);
  va_end(ap);
  return formatted;
}
644
645
7
// Format the variadic arguments according to the Str 'fmt' and return the
// result as a new Str.  See _StrFormat() for the supported conversions.
Str* StrFormat(Str* fmt, ...) {
  va_list ap;
  va_start(ap, fmt);
  const char* fmt_data = fmt->data();
  const int fmt_len = len(fmt);
  Str* formatted = _StrFormat(fmt_data, fmt_len, ap);
  va_end(ap);
  return formatted;
}