/home/andy/git/oilshell/oil/mycpp/gc_str.cc

Source (jump to first uncovered line)
#include "mycpp/gc_str.h"

#include <ctype.h>  // isalpha(), isdigit()
#include <stdarg.h>

#include <regex>

#include "mycpp/common.h"
#include "mycpp/gc_alloc.h"     // NewStr()
#include "mycpp/gc_builtins.h"  // StringToInt()
#include "mycpp/gc_list.h"      // join(), split() use it

// Global empty string. Functions in this file return it instead of allocating
// a new zero-length BigStr.
GLOBAL_STR(kEmptyString, "");

// Tokenizer for the format strings handled by _StrFormat(). Each match has
// three capture groups:
//   1: a (possibly empty) run of literal text that contains no '%'
//   2: an optional width: an optional '-' followed by digits
//   3: the single character after the width, i.e. the conversion code
// Groups 2 and 3 belong to an optional "%..." part.
static const std::regex gStrFmtRegex("([^%]*)(?:%(-?[0-9]*)(.))?");
// Size of the scratch buffer used when formatting integers; parsed widths are
// DCHECK'd to be below this bound.
static const int kMaxFmtWidth = 256;  // arbitrary...

// Returns the index of the first byte equal to the one-byte string 'needle',
// searching from index 'pos' to the end, or -1 when no such byte exists.
int BigStr::find(BigStr* needle, int pos) {
  const int n = len(this);
  DCHECK(len(needle) == 1);  // Oil's usage
  const char target = needle->data_[0];

  int i = pos;
  while (i < n) {
    if (data_[i] == target) {
      return i;
    }
    ++i;
  }
  return -1;
}

// Returns the index of the last byte equal to the one-byte string 'needle',
// or -1 when the byte does not occur.
int BigStr::rfind(BigStr* needle) {
  const int n = len(this);
  DCHECK(len(needle) == 1);  // Oil's usage
  const char target = needle->data_[0];

  // Walk backwards; 'i' is decremented before each comparison.
  int i = n;
  while (i > 0) {
    --i;
    if (data_[i] == target) {
      return i;
    }
  }
  return -1;
}

// Returns true when the string is non-empty and every byte is a decimal digit
// according to ::isdigit(); returns false for the empty string.
bool BigStr::isdigit() {
  int n = len(this);
  if (n == 0) {
    return false;  // special case
  }
  for (int i = 0; i < n; ++i) {
    // The <ctype.h> functions require an argument representable as unsigned
    // char (or EOF). Plain char may be signed, so convert before the call.
    if (!::isdigit(static_cast<unsigned char>(data_[i]))) {
      return false;
    }
  }
  return true;
}

// Returns true when the string is non-empty and every byte is alphabetic
// according to ::isalpha(); returns false for the empty string.
bool BigStr::isalpha() {
  int n = len(this);
  if (n == 0) {
    return false;  // special case
  }
  for (int i = 0; i < n; ++i) {
    // The <ctype.h> functions require an argument representable as unsigned
    // char (or EOF). Plain char may be signed, so convert before the call.
    if (!::isalpha(static_cast<unsigned char>(data_[i]))) {
      return false;
    }
  }
  return true;
}

// e.g. for osh/braces.py
//
// Returns true when the string is non-empty and every byte is an uppercase
// letter according to ::isupper(); returns false for the empty string.
bool BigStr::isupper() {
  int n = len(this);
  if (n == 0) {
    return false;  // special case
  }
  for (int i = 0; i < n; ++i) {
    // The <ctype.h> functions require an argument representable as unsigned
    // char (or EOF). Plain char may be signed, so convert before the call.
    if (!::isupper(static_cast<unsigned char>(data_[i]))) {
      return false;
    }
  }
  return true;
}

// Returns true when the first len(s) bytes of this string equal 's'.
// A prefix longer than this string never matches.
bool BigStr::startswith(BigStr* s) {
  const int prefix_len = len(s);
  const bool fits = prefix_len <= len(this);
  // Short-circuit keeps memcmp() from reading past the end of this string.
  return fits && memcmp(data_, s->data_, prefix_len) == 0;
}

// Returns true when the last len(s) bytes of this string equal 's'.
// A suffix longer than this string never matches.
bool BigStr::endswith(BigStr* s) {
  const int suffix_len = len(s);
  const int offset = len(this) - suffix_len;  // where the suffix would begin
  if (offset < 0) {
    return false;
  }
  return memcmp(data_ + offset, s->data_, suffix_len) == 0;
}

// Returns a newly allocated one-byte string holding the byte at index i.
// A negative i is taken relative to the end of the string (length + i).
BigStr* BigStr::at(int i) {
  int length = len(this);
  i = (i < 0) ? length + i : i;
  DCHECK(0 <= i);
  DCHECK(i < length);  // had a problem here!

  BigStr* one_char = NewStr(1);
  one_char->data_[0] = data_[i];
  return one_char;
}

// s[begin:] -- delegates to the two-argument slice() with end = len(this)
BigStr* BigStr::slice(int begin) {
  const int end = len(this);
  return slice(begin, end);
}

// s[begin:end] -- returns a newly allocated copy of the bytes in [begin, end)
BigStr* BigStr::slice(int begin, int end) {
  int length = len(this);
  // SLICE_ADJUST is defined elsewhere; presumably it normalizes negative and
  // out-of-range indices. The DCHECKs below state what must hold afterwards.
  SLICE_ADJUST(begin, end, length);

  DCHECK(0 <= begin && begin <= length);
  DCHECK(0 <= end && end <= length);

  int new_len = end - begin;
  DCHECK(0 <= new_len && new_len <= length);

  BigStr* piece = NewStr(new_len);
  const char* src = data_ + begin;
  memcpy(piece->data_, src, new_len);
  return piece;
}

// Used by 'help' builtin and --help, neither of which translate yet.

// Placeholder for Python's str.splitlines(); it has no implementation here.
// NOTE(review): there is no return statement, so FAIL() is presumably
// noreturn -- confirm against its definition.
List<BigStr*>* BigStr::splitlines(bool keep) {
  DCHECK(keep == true);  // only keep == true is expected
  FAIL(kNotImplemented);
}

// Returns a newly allocated string of the same length in which every byte has
// been passed through toupper().
BigStr* BigStr::upper() {
  int length = len(this);
  BigStr* result = NewStr(length);
  char* buffer = result->data();
  for (int char_index = 0; char_index < length; ++char_index) {
    // toupper() requires an argument representable as unsigned char (or EOF).
    // Plain char may be signed, so convert before the call.
    unsigned char ch = static_cast<unsigned char>(data_[char_index]);
    buffer[char_index] = static_cast<char>(toupper(ch));
  }
  return result;
}

// Returns a newly allocated string of the same length in which every byte has
// been passed through tolower().
BigStr* BigStr::lower() {
  int length = len(this);
  BigStr* result = NewStr(length);
  char* buffer = result->data();
  for (int char_index = 0; char_index < length; ++char_index) {
    // tolower() requires an argument representable as unsigned char (or EOF).
    // Plain char may be signed, so convert before the call.
    unsigned char ch = static_cast<unsigned char>(data_[char_index]);
    buffer[char_index] = static_cast<char>(tolower(ch));
  }
  return result;
}

// Left-justifies this string in a field of 'width' bytes, padding on the right
// with the one-byte string 'fillchar'. Returns 'this' unchanged when the
// string is already longer than 'width'; otherwise returns a new string.
BigStr* BigStr::ljust(int width, BigStr* fillchar) {
  DCHECK(len(fillchar) == 1);

  const int length = len(this);
  const int pad = width - length;
  if (pad < 0) {
    return this;
  }

  BigStr* padded = NewStr(width);
  memcpy(padded->data_, data_, length);                     // original text
  memset(padded->data_ + length, fillchar->data_[0], pad);  // then the fill
  return padded;
}

// Right-justifies this string in a field of 'width' bytes, padding on the left
// with the one-byte string 'fillchar'. Returns 'this' unchanged when the
// string is already longer than 'width'; otherwise returns a new string.
BigStr* BigStr::rjust(int width, BigStr* fillchar) {
  DCHECK(len(fillchar) == 1);

  const int length = len(this);
  const int pad = width - length;
  if (pad < 0) {
    return this;
  }

  BigStr* padded = NewStr(width);
  memset(padded->data_, fillchar->data_[0], pad);  // fill first
  memcpy(padded->data_ + pad, data_, length);      // then the original text
  return padded;
}

// Replaces every occurrence of 'old' with 'new_str' by delegating to the
// three-argument overload with a count of -1 (no limit).
BigStr* BigStr::replace(BigStr* old, BigStr* new_str) {
  const int no_limit = -1;
  return replace(old, new_str, no_limit);
}

// Returns this string with occurrences of 'old' replaced by 'new_str',
// scanning left to right; matches do not overlap.
//
// Args:
//   old: text to search for. When it is empty, new_str is inserted before each
//        byte of this string and once more at the end.
//   new_str: replacement text
//   count: maximum number of replacements. A negative value means no limit;
//          0 means no replacements at all.
//
// Returns 'this' itself (no copy) when nothing is replaced. The work is done
// in two passes: the first counts replacements to size the result, the second
// copies the pieces.
BigStr* BigStr::replace(BigStr* old, BigStr* new_str, int count) {
  // log("replacing %s with %s", old_data, new_str->data_);

  // count == 0 requests no replacements. Without this early return, the
  // empty-'old' path below sized the result for this_len + 1 insertions but
  // then performed none, leaving a result that was too long.
  if (count == 0) {
    return this;
  }

  const char* old_data = old->data_;

  int this_len = len(this);
  int old_len = len(old);

  // Last position at which a match of 'old' could start
  const char* last_possible = data_ + this_len - old_len;

  const char* p_this = data_;  // advances through 'this'

  // First pass: Calculate number of replacements, and hence new length
  int replace_count = 0;
  if (old_len == 0) {
    // One insertion before each byte, plus one at the end
    replace_count = this_len + 1;
    if (count > 0) {
      replace_count = min(replace_count, count);
    }
  } else {
    while (p_this <= last_possible) {
      if (replace_count != count &&  // limit replacements (if count != -1)
          memcmp(p_this, old_data, old_len) == 0) {  // equal
        replace_count++;
        p_this += old_len;
      } else {
        p_this++;
      }
    }
  }

  // log("replacements %d", replace_count);

  if (replace_count == 0) {
    return this;  // Reuse the string if there were no replacements
  }

  int new_str_len = len(new_str);
  int result_len =
      this_len - (replace_count * old_len) + (replace_count * new_str_len);

  BigStr* result = NewStr(result_len);

  const char* new_data = new_str->data_;
  const size_t new_len = new_str_len;

  // Second pass: Copy pieces into 'result'
  p_this = data_;                  // back to beginning
  char* p_result = result->data_;  // advances through 'result'
  replace_count = 0;

  if (old_len == 0) {
    // Should place new_str between each char in this
    // (with old_len == 0, last_possible is the end of this string)
    while (p_this < last_possible && replace_count != count) {
      replace_count++;
      memcpy(p_result, new_data, new_len);  // Copy from new_str
      p_result += new_len;                  // Move past new_str

      // Write a char from this
      *p_result = *p_this;
      p_this++;
      p_result++;
    }

    if (replace_count != count) {
      // Write a copy of new_str at the end
      assert(p_this == last_possible);
      memcpy(p_result, new_data, new_len);
    } else if (p_this <= last_possible) {
      // Write the last part of string
      memcpy(p_result, p_this, data_ + this_len - p_this);
    }
  } else {
    while (p_this <= last_possible) {
      // Note: would be more efficient if we remembered the match positions
      if (replace_count != count &&  // limit replacements (if count != -1)
          memcmp(p_this, old_data, old_len) == 0) {  // equal
        memcpy(p_result, new_data, new_len);         // Copy from new_str
        replace_count++;
        p_result += new_len;
        p_this += old_len;
      } else {  // copy 1 byte
        *p_result = *p_this;
        p_result++;
        p_this++;
      }
    }
    memcpy(p_result, p_this, data_ + this_len - p_this);  // last part of string
  }

  return result;
}

// Selects which end(s) of a string StripAny() removes characters from.
enum class StripWhere {
  Left,   // leading characters only
  Right,  // trailing characters only
  Both,   // both ends
};

// Value of the 'what' argument of OmitChar() / StripAny() that means "any
// whitespace character" rather than one specific byte value.
const int kWhitespace = -1;

// Decides whether byte 'ch' should be stripped: with what == kWhitespace any
// byte accepted by isspace() qualifies, otherwise only the byte equal to
// 'what'.
bool OmitChar(uint8_t ch, int what) {
  return (what == kWhitespace) ? (isspace(ch) != 0) : (what == ch);
}

// StripAny is modeled after CPython's do_strip() in stringobject.c. It is the
// shared implementation behind 6 functions:
//
//   strip / lstrip / rstrip
//   strip(char) / lstrip(char) / rstrip(char)
//
// Args:
//   where: which ends to strip from
//   what: kWhitespace, or an ASCII code 0-255
//
// Returns kEmptyString when nothing remains, 's' itself when nothing was
// stripped, and a newly allocated copy of the kept range otherwise.

BigStr* StripAny(BigStr* s, StripWhere where, int what) {
  const int length = len(s);
  const char* char_data = s->data();

  // 'lo' ends up as the index of the first byte to keep.
  int lo = 0;
  if (where != StripWhere::Right) {
    for (; lo < length && OmitChar(char_data[lo], what); ++lo) {
    }
  }

  // 'hi' ends up one past the last byte to keep; it never drops below 'lo'.
  int hi = length;
  if (where != StripWhere::Left) {
    while (hi > lo && OmitChar(char_data[hi - 1], what)) {
      --hi;
    }
  }

  if (lo == hi) {  // Optimization to reuse existing object
    return kEmptyString;
  }

  if (lo == 0 && hi == length) {  // nothing stripped
    return s;
  }

  // Note: makes a copy in leaky version, and will in GC version too
  const int new_len = hi - lo;
  BigStr* result = NewStr(new_len);
  memcpy(result->data(), s->data() + lo, new_len);
  return result;
}

// Removes whitespace from both ends of the string; see StripAny() for which
// object is returned.
BigStr* BigStr::strip() {
  BigStr* stripped = StripAny(this, StripWhere::Both, kWhitespace);
  return stripped;
}

// Used for CommandSub in osh/cmd_exec.py
//
// Removes trailing occurrences of the single byte in 'chars', which must be a
// one-byte string.
BigStr* BigStr::rstrip(BigStr* chars) {
  DCHECK(len(chars) == 1);
  // OmitChar() compares 'what' against a uint8_t in the range 0-255. Where
  // plain char is signed, a byte >= 0x80 would widen to a negative int that
  // never matches, and 0xFF would equal kWhitespace (-1). Convert through
  // uint8_t so the code is always in 0-255.
  int c = static_cast<uint8_t>(chars->data_[0]);
  return StripAny(this, StripWhere::Right, c);
}

// Removes trailing whitespace; see StripAny() for which object is returned.
BigStr* BigStr::rstrip() {
  BigStr* stripped = StripAny(this, StripWhere::Right, kWhitespace);
  return stripped;
}

// Removes leading occurrences of the single byte in 'chars', which must be a
// one-byte string.
BigStr* BigStr::lstrip(BigStr* chars) {
  DCHECK(len(chars) == 1);
  // OmitChar() compares 'what' against a uint8_t in the range 0-255. Where
  // plain char is signed, a byte >= 0x80 would widen to a negative int that
  // never matches, and 0xFF would equal kWhitespace (-1). Convert through
  // uint8_t so the code is always in 0-255.
  int c = static_cast<uint8_t>(chars->data_[0]);
  return StripAny(this, StripWhere::Left, c);
}

// Removes leading whitespace; see StripAny() for which object is returned.
BigStr* BigStr::lstrip() {
  BigStr* stripped = StripAny(this, StripWhere::Left, kWhitespace);
  return stripped;
}

// Concatenates the strings in 'items', placing this string between each pair
// of neighbours (Python's sep.join(items)).
//
// Returns kEmptyString for an empty list and the sole element itself for a
// one-element list; otherwise a newly allocated string.
BigStr* BigStr::join(List<BigStr*>* items) {
  const int num_parts = len(items);

  // " ".join([]) == ""
  if (num_parts == 0) {
    return kEmptyString;
  }

  // Common case
  // 'anything'.join(["foo"]) == "foo"
  if (num_parts == 1) {
    return items->at(0);
  }

  // Total size = all items + one separator between each pair
  const int sep_len = len(this);
  int total = 0;
  for (int i = 0; i < num_parts; ++i) {
    total += len(items->at(i));
  }
  total += sep_len * (num_parts - 1);

  BigStr* joined = NewStr(total);
  char* out = joined->data_;  // write cursor into 'joined'

  for (int i = 0; i < num_parts; ++i) {
    if (i != 0 && sep_len) {  // optimize common case of ''.join()
      memcpy(out, data_, sep_len);  // copy the separator
      out += sep_len;
    }

    BigStr* item = items->at(i);
    const int n = len(item);
    memcpy(out, item->data_, n);  // copy the list item
    out += n;
  }

  return joined;
}

// Appends the substring s[left:right] to 'result'. An empty range appends the
// shared kEmptyString instead of allocating.
static void AppendPart(List<BigStr*>* result, BigStr* s, int left, int right) {
  const int n = right - left;
  BigStr* piece = kEmptyString;
  if (n != 0) {
    piece = NewStr(n);
    memcpy(piece->data_, s->data_ + left, n);
  }
  result->append(piece);
}

// Split BigStr into List<BigStr*> of parts separated by 'sep'.
// The code structure is taken from CPython's Objects/stringlib/split.h.
//
// Args:
//   sep: the separator; must be a one-byte string
//   max_split: upper bound on the number of splits performed. The loop runs
//              only while the split count is below this value, so a value
//              <= 0 performs no split.
//
// An empty string yields [''], and a string with no split yields a list
// holding 'this' itself.
List<BigStr*>* BigStr::split(BigStr* sep, int max_split) {
  DCHECK(sep != nullptr);
  DCHECK(len(sep) == 1);  // we can only split one char
  const char sep_char = sep->data_[0];

  const int str_len = len(this);
  if (str_len == 0) {
    // weird case consistent with Python: ''.split(':') == ['']
    return NewList<BigStr*>({kEmptyString});
  }

  List<BigStr*>* result = NewList<BigStr*>({});
  int left = 0;       // start of the part being scanned
  int right = 0;      // scan position
  int num_parts = 0;  // 3 splits results in 4 parts

  while (num_parts < max_split) {
    // search for separator
    while (right < str_len && data_[right] != sep_char) {
      right++;
    }
    if (right >= str_len) {
      break;  // no further separator
    }
    AppendPart(result, this, left, right);
    right++;  // step over the separator
    left = right;
    num_parts++;
  }

  if (num_parts == 0) {  // Optimization when there is no split
    result->append(this);
  } else if (left <= str_len) {  // Last part
    AppendPart(result, this, left, str_len);
  }

  return result;
}

// Splits on every occurrence of 'sep'. The string's own length is passed as
// max_split: a string of n bytes cannot contain more than n separators.
List<BigStr*>* BigStr::split(BigStr* sep) {
  const int max_split = len(this);
  return this->split(sep, max_split);
}

// Returns the hash of this string's bytes. It is computed with 'h' on the
// first call and cached in hash_ / is_hashed_; later calls return the cached
// value, regardless of the function passed.
unsigned BigStr::hash(HashFunc h) {
  if (is_hashed_) {
    return hash_;
  }
  // NOTE(review): the result is shifted right by one bit, presumably because
  // hash_ holds fewer bits than h() returns -- confirm against the header.
  hash_ = h(data_, len(this)) >> 1;
  is_hashed_ = 1;
  return hash_;
}

// Shared implementation of both StrFormat() overloads: expands a format
// string into a new BigStr.
//
// Args:
//   fmt, fmt_len: the format string and its length in bytes
//   args: the variadic arguments, one consumed per %s, %r, %d or %o
//
// Supported conversions are %%, %s, %r (repr of a string), %d and %o. Each may
// carry a width such as "5", "05" (zero pad) or "-5" (pad on the right).
// The format is tokenized with gStrFmtRegex; each match is a run of literal
// text optionally followed by one conversion.
static inline BigStr* _StrFormat(const char* fmt, int fmt_len, va_list args) {
  auto beg = std::cregex_iterator(fmt, fmt + fmt_len, gStrFmtRegex);
  auto end = std::cregex_iterator();

  char int_buf[kMaxFmtWidth];  // scratch space for %d / %o
  std::string buf;             // accumulates the output
  for (std::cregex_iterator it = beg; it != end; ++it) {
    const std::cmatch& match = *it;

    // Group 1: literal text before the conversion, copied through verbatim
    const std::csub_match& lit_m = match[1];
    DCHECK(lit_m.matched);
    const std::string& lit_s = lit_m.str();
    buf.append(lit_s);

    // Group 2: optional width. A leading '0' requests zero padding; a negative
    // width means the padding goes after the value.
    int width = 0;
    bool zero_pad = false;
    bool pad_back = false;
    const std::csub_match& width_m = match[2];
    const std::string& width_s = width_m.str();
    bool ok = false;
    if (width_m.matched && !width_s.empty()) {
      if (width_s[0] == '0') {
        zero_pad = true;
        DCHECK(width_s.size() > 1);
        // Parse the digits after the leading '0'
        ok = StringToInt(width_s.c_str() + 1, width_s.size() - 1, 10, &width);
        DCHECK(ok);
        (void)ok;  // silence unused var warning in opt
      } else {
        ok = StringToInt(width_s.c_str(), width_s.size(), 10, &width);
        DCHECK(ok);
      }
      if (width < 0) {
        pad_back = true;
        width *= -1;
      }
      DCHECK(0 <= width && width < kMaxFmtWidth);
    }

    // Group 3: the conversion code. A match without one is trailing literal
    // text, which has already been appended, so the loop ends here.
    char const* str_to_add = nullptr;
    int add_len = 0;
    const std::csub_match& code_m = match[3];
    const std::string& code_s = code_m.str();
    if (!code_m.matched) {
      DCHECK(!width_m.matched);  // python errors on invalid format operators
      break;
    }
    DCHECK(code_s.size() == 1);
    switch (code_s[0]) {
    case '%': {
      // "%%" emits a literal percent sign
      str_to_add = code_s.c_str();
      add_len = 1;
      break;
    }
    case 's': {
      BigStr* s = va_arg(args, BigStr*);
      // Check type unconditionally because mycpp doesn't always check it
      CHECK(ObjHeader::FromObject(s)->type_tag == TypeTag::BigStr);

      str_to_add = s->data();
      add_len = len(s);
      zero_pad = false;  // python ignores the 0 directive for strings
      break;
    }
    case 'r': {
      BigStr* s = va_arg(args, BigStr*);
      // Check type unconditionally because mycpp doesn't always check it
      CHECK(ObjHeader::FromObject(s)->type_tag == TypeTag::BigStr);

      s = repr(s);
      str_to_add = s->data();
      add_len = len(s);
      zero_pad = false;  // python ignores the 0 directive for strings
      break;
    }
    case 'd':  // fallthrough
    case 'o': {
      int d = va_arg(args, int);
      // The matched text minus its literal prefix is the "%<width><code>"
      // spec itself, so it is passed to snprintf() as the format and
      // snprintf() applies the width.
      add_len = snprintf(int_buf, kMaxFmtWidth,
                         match.str().c_str() + lit_s.size(), d);
      DCHECK(add_len > 0);
      str_to_add = int_buf;
      break;
    }
    default:
      DCHECK(0);  // unsupported conversion code
      break;
    }
    DCHECK(str_to_add != nullptr);

    // Emit the value with any remaining padding: value first when padding goes
    // at the back, padding first otherwise.
    if (pad_back) {
      buf.append(str_to_add, add_len);
    }
    if (add_len < width) {
      for (int i = 0; i < width - add_len; ++i) {
        buf.push_back(zero_pad ? '0' : ' ');
      }
    }
    if (!pad_back) {
      buf.append(str_to_add, add_len);
    }
  }

  return StrFromC(buf.c_str(), buf.size());
}

// Returns a newly allocated one-byte string holding the byte at the
// iterator's current position i_ in s_.
BigStr* StrIter::Value() {  // similar to at()
  BigStr* result = NewStr(1);
  char* dest = result->data_;
  dest[0] = s_->data_[i_];
  // The byte after the single character is expected to be NUL already
  DCHECK(result->data_[1] == '\0');
  return result;
}

// Formats the variadic arguments according to the NUL-terminated C string
// 'fmt' and returns the result; the work is done by _StrFormat().
BigStr* StrFormat(const char* fmt, ...) {
  va_list ap;
  va_start(ap, fmt);
  BigStr* formatted = _StrFormat(fmt, strlen(fmt), ap);
  va_end(ap);
  return formatted;
}

// Same as the const char* overload, but the format is a BigStr; its length
// comes from len(fmt) rather than strlen().
BigStr* StrFormat(BigStr* fmt, ...) {
  va_list ap;
  va_start(ap, fmt);
  BigStr* formatted = _StrFormat(fmt->data(), len(fmt), ap);
  va_end(ap);
  return formatted;
}

cpp

Coverage Report

Created: 2024-03-13 14:13

Line	Count	Source (jump to first uncovered line)
1		#include "mycpp/gc_str.h"
2
3		#include <ctype.h> // isalpha(), isdigit()
4		#include <stdarg.h>
5
6		#include <regex>
7
8		#include "mycpp/common.h"
9		#include "mycpp/gc_alloc.h" // NewStr()
10		#include "mycpp/gc_builtins.h" // StringToInt()
11		#include "mycpp/gc_list.h" // join(), split() use it
12
13		GLOBAL_STR(kEmptyString, "");
14
15		static const std::regex gStrFmtRegex("([^%]*)(?:%(-?[0-9]*)(.))?");
16		static const int kMaxFmtWidth = 256; // arbitrary...
17
18	9	int BigStr::find(BigStr* needle, int pos) {
19	9	int length = len(this);
20	9	DCHECK(len(needle) == 1); // Oil's usage
21	0	char c = needle->data_[0];
22	29	for (int i = pos; i < length; ++i) {
23	26	if (data_[i] == c) {
24	6	return i;
25	6	}
26	26	}
27	3	return -1;
28	9	}
29
30	6	int BigStr::rfind(BigStr* needle) {
31	6	int length = len(this);
32	6	DCHECK(len(needle) == 1); // Oil's usage
33	0	char c = needle->data_[0];
34	24	for (int i = length - 1; i >= 0; --i) {
35	22	if (data_[i] == c) {
36	4	return i;
37	4	}
38	22	}
39	2	return -1;
40	6	}
41
42	51	bool BigStr::isdigit() {
43	51	int n = len(this);
44	51	if (n == 0) {
45	2	return false; // special case
46	2	}
47	65	for (int i = 0; i < n; ++i) {
48	49	if (!::isdigit(data_[i])) {
49	33	return false;
50	33	}
51	49	}
52	16	return true;
53	49	}
54
55	35	bool BigStr::isalpha() {
56	35	int n = len(this);
57	35	if (n == 0) {
58	0	return false; // special case
59	0	}
60	53	for (int i = 0; i < n; ++i) {
61	39	if (!::isalpha(data_[i])) {
62	21	return false;
63	21	}
64	39	}
65	14	return true;
66	35	}
67
68		// e.g. for osh/braces.py
69	8	bool BigStr::isupper() {
70	8	int n = len(this);
71	8	if (n == 0) {
72	2	return false; // special case
73	2	}
74	12	for (int i = 0; i < n; ++i) {
75	8	if (!::isupper(data_[i])) {
76	2	return false;
77	2	}
78	8	}
79	4	return true;
80	6	}
81
82	21	bool BigStr::startswith(BigStr* s) {
83	21	int n = len(s);
84	21	if (n > len(this)) {
85	0	return false;
86	0	}
87	21	return memcmp(data_, s->data_, n) == 0;
88	21	}
89
90	12	bool BigStr::endswith(BigStr* s) {
91	12	int len_s = len(s);
92	12	int len_this = len(this);
93	12	if (len_s > len_this) {
94	1	return false;
95	1	}
96	11	const char* start = data_ + len_this - len_s;
97	11	return memcmp(start, s->data_, len_s) == 0;
98	12	}
99
100		// Get a string with one character
101	95	BigStr* BigStr::at(int i) {
102	95	int length = len(this);
103	95	if (i < 0) {
104	2	i = length + i;
105	2	}
106	95	DCHECK(0 <= i);
107	95	DCHECK(i < length); // had a problem here!
108
109	0	BigStr* result = NewStr(1);
110	95	result->data_[0] = data_[i];
111	95	return result;
112	95	}
113
114		// s[begin:]
115	6	BigStr* BigStr::slice(int begin) {
116	6	return slice(begin, len(this));
117	6	}
118
119		// s[begin:end]
120	636	BigStr* BigStr::slice(int begin, int end) {
121	636	int length = len(this);
122	636	SLICE_ADJUST(begin, end, length);
123
124	636	DCHECK(0 <= begin && begin <= length);
125	636	DCHECK(0 <= end && end <= length);
126
127	0	int new_len = end - begin;
128	636	DCHECK(0 <= new_len && new_len <= length);
129
130	0	BigStr* result = NewStr(new_len);
131	636	memcpy(result->data_, data_ + begin, new_len);
132
133	636	return result;
134	636	}
135
136		// Used by 'help' builtin and --help, neither of which translate yet.
137
138	0	List<BigStr*>* BigStr::splitlines(bool keep) {
139	0	DCHECK(keep == true);
140	0	FAIL(kNotImplemented);
141	0	}
142
143	9	BigStr* BigStr::upper() {
144	9	int length = len(this);
145	9	BigStr* result = NewStr(length);
146	9	char* buffer = result->data();
147	56	for (int char_index = 0; char_index < length; ++char_index) {
148	47	buffer[char_index] = toupper(data_[char_index]);
149	47	}
150	9	return result;
151	9	}
152
153	6	BigStr* BigStr::lower() {
154	6	int length = len(this);
155	6	BigStr* result = NewStr(length);
156	6	char* buffer = result->data();
157	38	for (int char_index = 0; char_index < length; ++char_index) {
158	32	buffer[char_index] = tolower(data_[char_index]);
159	32	}
160	6	return result;
161	6	}
162
163	30	BigStr* BigStr::ljust(int width, BigStr* fillchar) {
164	30	DCHECK(len(fillchar) == 1);
165
166	0	int length = len(this);
167	30	int num_fill = width - length;
168	30	if (num_fill < 0) {
169	10	return this;
170	20	} else {
171	20	BigStr* result = NewStr(width);
172	20	char c = fillchar->data_[0];
173	20	memcpy(result->data_, data_, length);
174	42	for (int i = length; i < width; ++i) {
175	22	result->data_[i] = c;
176	22	}
177	20	return result;
178	20	}
179	30	}
180
181	30	BigStr* BigStr::rjust(int width, BigStr* fillchar) {
182	30	DCHECK(len(fillchar) == 1);
183
184	0	int length = len(this);
185	30	int num_fill = width - length;
186	30	if (num_fill < 0) {
187	10	return this;
188	20	} else {
189	20	BigStr* result = NewStr(width);
190	20	char c = fillchar->data_[0];
191	42	for (int i = 0; i < num_fill; ++i) {
192	22	result->data_[i] = c;
193	22	}
194	20	memcpy(result->data_ + num_fill, data_, length);
195	20	return result;
196	20	}
197	30	}
198
199	729	BigStr* BigStr::replace(BigStr* old, BigStr* new_str) {
200		// Use -1 as in python2: "aaaa".replace(-1) -> "AAAA"
201	729	return replace(old, new_str, -1);
202	729	}
203
204	729	BigStr* BigStr::replace(BigStr* old, BigStr* new_str, int count) {
205		// log("replacing %s with %s", old_data, new_str->data_);
206	729	const char* old_data = old->data_;
207
208	729	int this_len = len(this);
209	729	int old_len = len(old);
210
211	729	const char* last_possible = data_ + this_len - old_len;
212
213	729	const char* p_this = data_; // advances through 'this'
214
215		// First pass: Calculate number of replacements, and hence new length
216	729	int replace_count = 0;
217	729	if (old_len == 0) {
218	0	replace_count = this_len + 1;
219	0	if (count > 0) {
220	0	replace_count = min(replace_count, count);
221	0	}
222	729	} else {
223	93.3k	while (p_this <= last_possible) {
224	92.6k	if (replace_count != count && // limit replacements (if count != -1)
225	92.6k	memcmp(p_this, old_data, old_len) == 0) { // equal
226	758	replace_count++;
227	758	p_this += old_len;
228	91.9k	} else {
229	91.9k	p_this++;
230	91.9k	}
231	92.6k	}
232	729	}
233
234		// log("replacements %d", replace_count);
235
236	729	if (replace_count == 0) {
237	4	return this; // Reuse the string if there were no replacements
238	4	}
239
240	725	int new_str_len = len(new_str);
241	725	int result_len =
242	725	this_len - (replace_count * old_len) + (replace_count * new_str_len);
243
244	725	BigStr* result = NewStr(result_len);
245
246	725	const char* new_data = new_str->data_;
247	725	const size_t new_len = new_str_len;
248
249		// Second pass: Copy pieces into 'result'
250	725	p_this = data_; // back to beginning
251	725	char* p_result = result->data_; // advances through 'result'
252	725	replace_count = 0;
253
254	725	if (old_len == 0) {
255		// Should place new_str between each char in this
256	0	while (p_this < last_possible && replace_count != count) {
257	0	replace_count++;
258	0	memcpy(p_result, new_data, new_len); // Copy from new_str
259	0	p_result += new_len; // Move past new_str
260
261		// Write a char from this
262	0	*p_result = *p_this;
263	0	p_this++;
264	0	p_result++;
265	0	}
266
267	0	if (replace_count != count) {
268		// Write a copy of new_str at the end
269	0	assert(p_this == last_possible);
270	0	memcpy(p_result, new_data, new_len);
271	0	} else if (p_this <= last_possible) {
272		// Write the last part of string
273	0	memcpy(p_result, p_this, data_ + this_len - p_this);
274	0	}
275	725	} else {
276	93.3k	while (p_this <= last_possible) {
277		// Note: would be more efficient if we remembered the match positions
278	92.6k	if (replace_count != count && // limit replacements (if count != -1)
279	92.6k	memcmp(p_this, old_data, old_len) == 0) { // equal
280	758	memcpy(p_result, new_data, new_len); // Copy from new_str
281	758	replace_count++;
282	758	p_result += new_len;
283	758	p_this += old_len;
284	91.8k	} else { // copy 1 byte
285	91.8k	*p_result = *p_this;
286	91.8k	p_result++;
287	91.8k	p_this++;
288	91.8k	}
289	92.6k	}
290	725	memcpy(p_result, p_this, data_ + this_len - p_this); // last part of string
291	725	}
292
293	0	return result;
294	729	}
295
296		enum class StripWhere {
297		Left,
298		Right,
299		Both,
300		};
301
302		const int kWhitespace = -1;
303
304	162	bool OmitChar(uint8_t ch, int what) {
305	162	if (what == kWhitespace) {
306	122	return isspace(ch);
307	122	} else {
308	40	return what == ch;
309	40	}
310	162	}
311
312		// StripAny is modeled after CPython's do_strip() in stringobject.c, and can
313		// implement 6 functions:
314		//
315		// strip / lstrip / rstrip
316		// strip(char) / lstrip(char) / rstrip(char)
317		//
318		// Args:
319		// where: which ends to strip from
320		// what: kWhitespace, or an ASCII code 0-255
321
322	62	BigStr* StripAny(BigStr* s, StripWhere where, int what) {
323	62	int length = len(s);
324	62	const char* char_data = s->data();
325
326	62	int i = 0;
327	62	if (where != StripWhere::Right) {
328	90	while (i < length && OmitChar(char_data[i], what)) {
329	52	i++;
330	52	}
331	38	}
332
333	62	int j = length;
334	62	if (where != StripWhere::Left) {
335	98	do {
336	98	j--;
337	98	} while (j >= i && OmitChar(char_data[j], what));
338	46	j++;
339	46	}
340
341	62	if (i == j) { // Optimization to reuse existing object
342	18	return kEmptyString;
343	18	}
344
345	44	if (i == 0 && j == length) { // nothing stripped
346	8	return s;
347	8	}
348
349		// Note: makes a copy in leaky version, and will in GC version too
350	36	int new_len = j - i;
351	36	BigStr* result = NewStr(new_len);
352	36	memcpy(result->data(), s->data() + i, new_len);
353	36	return result;
354	44	}
355
356	22	BigStr* BigStr::strip() {
357	22	return StripAny(this, StripWhere::Both, kWhitespace);
358	22	}
359
360		// Used for CommandSub in osh/cmd_exec.py
361	8	BigStr* BigStr::rstrip(BigStr* chars) {
362	8	DCHECK(len(chars) == 1);
363	0	int c = chars->data_[0];
364	8	return StripAny(this, StripWhere::Right, c);
365	8	}
366
367	16	BigStr* BigStr::rstrip() {
368	16	return StripAny(this, StripWhere::Right, kWhitespace);
369	16	}
370
371	8	BigStr* BigStr::lstrip(BigStr* chars) {
372	8	DCHECK(len(chars) == 1);
373	0	int c = chars->data_[0];
374	8	return StripAny(this, StripWhere::Left, c);
375	8	}
376
377	8	BigStr* BigStr::lstrip() {
378	8	return StripAny(this, StripWhere::Left, kWhitespace);
379	8	}
380
381	24	BigStr* BigStr::join(List<BigStr*>* items) {
382	24	int length = 0;
383
384	24	int num_parts = len(items);
385
386		// " ".join([]) == ""
387	24	if (num_parts == 0) {
388	9	return kEmptyString;
389	9	}
390
391		// Common case
392		// 'anything'.join(["foo"]) == "foo"
393	15	if (num_parts == 1) {
394	4	return items->at(0);
395	4	}
396
397	317	for (int i = 0; i < num_parts; ++i) {
398	306	length += len(items->at(i));
399	306	}
400		// add length of all the separators
401	11	int this_len = len(this);
402	11	length += this_len * (num_parts - 1);
403
404	11	BigStr* result = NewStr(length);
405	11	char* p_result = result->data_; // advances through
406
407	317	for (int i = 0; i < num_parts; ++i) {
408		// log("i %d", i);
409	306	if (i != 0 && this_len) { // optimize common case of ''.join()
410	16	memcpy(p_result, data_, this_len); // copy the separator
411	16	p_result += this_len;
412		// log("this_len %d", this_len);
413	16	}
414
415	306	int n = len(items->at(i));
416		// log("n: %d", n);
417	306	memcpy(p_result, items->at(i)->data_, n); // copy the list item
418	306	p_result += n;
419	306	}
420
421	11	return result;
422	15	}
423
424	98	static void AppendPart(List<BigStr*>* result, BigStr* s, int left, int right) {
425	98	int new_len = right - left;
426	98	BigStr* part;
427	98	if (new_len == 0) {
428	42	part = kEmptyString;
429	56	} else {
430	56	part = NewStr(new_len);
431	56	memcpy(part->data_, s->data_ + left, new_len);
432	56	}
433	98	result->append(part);
434	98	}
435
436		// Split BigStr into List<BigStr*> of parts separated by 'sep'.
437		// The code structure is taken from CPython's Objects/stringlib/split.h.
438	38	List<BigStr*>* BigStr::split(BigStr* sep, int max_split) {
439	38	DCHECK(sep != nullptr);
440	38	DCHECK(len(sep) == 1); // we can only split one char
441	0	char sep_char = sep->data_[0];
442
443	38	int str_len = len(this);
444	38	if (str_len == 0) {
445		// weird case consistent with Python: ''.split(':') == ['']
446	4	return NewList<BigStr*>({kEmptyString});
447	4	}
448
449	34	List<BigStr*>* result = NewList<BigStr*>({});
450	34	int left = 0;
451	34	int right = 0;
452	34	int num_parts = 0; // 3 splits results in 4 parts
453
454	114	while (right < str_len && num_parts < max_split) {
455		// search for separator
456	186	for (; right < str_len; right++) {
457	174	if (data_[right] == sep_char) {
458	68	AppendPart(result, this, left, right);
459	68	right++;
460	68	left = right;
461	68	num_parts++;
462	68	break;
463	68	}
464	174	}
465	80	}
466	34	if (num_parts == 0) { // Optimization when there is no split
467	4	result->append(this);
468	30	} else if (left <= str_len) { // Last part
469	30	AppendPart(result, this, left, str_len);
470	30	}
471
472	34	return result;
473	38	}
474
475	32	List<BigStr*>* BigStr::split(BigStr* sep) {
476	32	return this->split(sep, len(this));
477	32	}
478
479	4.29k	unsigned BigStr::hash(HashFunc h) {
480	4.29k	if (!is_hashed_) {
481	280	hash_ = h(data_, len(this)) >> 1;
482	280	is_hashed_ = 1;
483	280	}
484	4.29k	return hash_;
485	4.29k	}
486
487	501	static inline BigStr* _StrFormat(const char* fmt, int fmt_len, va_list args) {
488	501	auto beg = std::cregex_iterator(fmt, fmt + fmt_len, gStrFmtRegex);
489	501	auto end = std::cregex_iterator();
490
491	501	char int_buf[kMaxFmtWidth];
492	501	std::string buf;
493	1.11k	for (std::cregex_iterator it = beg; it != end; ++it) {
494	1.11k	const std::cmatch& match = *it;
495
496	1.11k	const std::csub_match& lit_m = match[1];
497	1.11k	DCHECK(lit_m.matched);
498	0	const std::string& lit_s = lit_m.str();
499	1.11k	buf.append(lit_s);
500
501	1.11k	int width = 0;
502	1.11k	bool zero_pad = false;
503	1.11k	bool pad_back = false;
504	1.11k	const std::csub_match& width_m = match[2];
505	1.11k	const std::string& width_s = width_m.str();
506	1.11k	bool ok = false;
507	1.11k	if (width_m.matched && !width_s.empty()) {
508	23	if (width_s[0] == '0') {
509	5	zero_pad = true;
510	5	DCHECK(width_s.size() > 1);
511	0	ok = StringToInt(width_s.c_str() + 1, width_s.size() - 1, 10, &width);
512	5	DCHECK(ok);
513	0	(void)ok; // silence unused var warning in opt
514	18	} else {
515	18	ok = StringToInt(width_s.c_str(), width_s.size(), 10, &width);
516	18	DCHECK(ok);
517	18	}
518	23	if (width < 0) {
519	2	pad_back = true;
520	2	width *= -1;
521	2	}
522	23	DCHECK(0 <= width && width < kMaxFmtWidth);
523	23	}
524
525	0	char const* str_to_add = nullptr;
526	1.11k	int add_len = 0;
527	1.11k	const std::csub_match& code_m = match[3];
528	1.11k	const std::string& code_s = code_m.str();
529	1.11k	if (!code_m.matched) {
530	501	DCHECK(!width_m.matched); // python errors on invalid format operators
531	0	break;
532	501	}
533	615	DCHECK(code_s.size() == 1);
534	0	switch (code_s[0]) {
535	14	case '%': {
536	14	str_to_add = code_s.c_str();
537	14	add_len = 1;
538	14	break;
539	0	}
540	310	case 's': {
541	310	BigStr* s = va_arg(args, BigStr*);
542		// Check type unconditionally because mycpp doesn't always check it
543	310	CHECK(ObjHeader::FromObject(s)->type_tag == TypeTag::BigStr);
544
545	0	str_to_add = s->data();
546	310	add_len = len(s);
547	310	zero_pad = false; // python ignores the 0 directive for strings
548	310	break;
549	0	}
550	32	case 'r': {
551	32	BigStr* s = va_arg(args, BigStr*);
552		// Check type unconditionally because mycpp doesn't always check it
553	32	CHECK(ObjHeader::FromObject(s)->type_tag == TypeTag::BigStr);
554
555	0	s = repr(s);
556	32	str_to_add = s->data();
557	32	add_len = len(s);
558	32	zero_pad = false; // python ignores the 0 directive for strings
559	32	break;
560	0	}
561	250	case 'd': // fallthrough
562	259	case 'o': {
563	259	int d = va_arg(args, int);
564	259	add_len = snprintf(int_buf, kMaxFmtWidth,
565	259	match.str().c_str() + lit_s.size(), d);
566	259	DCHECK(add_len > 0);
567	0	str_to_add = int_buf;
568	259	break;
569	250	}
570	0	default:
571	0	DCHECK(0);
572	0	break;
573	615	}
574	615	DCHECK(str_to_add != nullptr);
575
576	615	if (pad_back) {
577	2	buf.append(str_to_add, add_len);
578	2	}
579	615	if (add_len < width) {
580	42	for (int i = 0; i < width - add_len; ++i) {
581	36	buf.push_back(zero_pad ? '0' : ' ');
582	36	}
583	6	}
584	615	if (!pad_back) {
585	613	buf.append(str_to_add, add_len);
586	613	}
587	615	}
588
589	501	return StrFromC(buf.c_str(), buf.size());
590	501	}
591
592	43	BigStr* StrIter::Value() { // similar to at()
593	43	BigStr* result = NewStr(1);
594	43	result->data_[0] = s_->data_[i_];
595	43	DCHECK(result->data_[1] == '\0');
596	0	return result;
597	43	}
598
599	494	BigStr* StrFormat(const char* fmt, ...) {
600	494	va_list args;
601	494	va_start(args, fmt);
602	494	BigStr* ret = _StrFormat(fmt, strlen(fmt), args);
603	494	va_end(args);
604	494	return ret;
605	494	}
606
607	7	BigStr* StrFormat(BigStr* fmt, ...) {
608	7	va_list args;
609	7	va_start(args, fmt);
610	7	BigStr* ret = _StrFormat(fmt->data(), len(fmt), args);
611	7	va_end(args);
612	7	return ret;
613	7	}