/home/andy/git/oilshell/oil/mycpp/gc_str.cc

Source (jump to first uncovered line)
#include "mycpp/gc_str.h"

#include <ctype.h>  // isalpha(), isdigit()
#include <stdarg.h>

#include <regex>

#include "mycpp/common.h"
#include "mycpp/gc_alloc.h"     // NewStr()
#include "mycpp/gc_builtins.h"  // StringToInteger()
#include "mycpp/gc_list.h"      // join(), split() use it

// The empty string, defined through the GLOBAL_STR macro (declared
// elsewhere).  Functions in this file return it for empty results instead of
// calling NewStr().
GLOBAL_STR(kEmptyString, "");

// Matches one chunk of a printf-style format string:
//   group 1: literal text containing no '%' (may be empty)
//   group 2: optional width text such as "5", "-5" or "05" (may be empty)
//   group 3: the single character that follows '%' and the width
// Groups 2 and 3 do not participate when the chunk has no '%' directive.
static const std::regex gStrFmtRegex("([^%]*)(?:%(-?[0-9]*)(.))?");
// Size of the integer formatting buffer in _StrFormat(), and the exclusive
// upper bound it asserts on a parsed field width.
static const int kMaxFmtWidth = 256;  // arbitrary...

// Returns the index of the first byte equal to the one-byte 'needle' at or
// after position 'pos', or -1 if there is none.
int Str::find(Str* needle, int pos) {
  assert(len(needle) == 1);  // Oil's usage
  const int n = len(this);
  const char target = needle->data_[0];

  int i = pos;
  while (i < n) {
    if (data_[i] == target) {
      return i;
    }
    ++i;
  }
  return -1;
}

// Returns the index of the last byte equal to the one-byte 'needle', or -1 if
// there is none.
int Str::rfind(Str* needle) {
  assert(len(needle) == 1);  // Oil's usage
  const char target = needle->data_[0];

  // Walk backwards from the final byte
  int i = len(this);
  while (i > 0) {
    --i;
    if (data_[i] == target) {
      return i;
    }
  }
  return -1;
}

// Returns true when the string is non-empty and every byte satisfies the C
// library's ::isdigit().
bool Str::isdigit() {
  int n = len(this);
  if (n == 0) {
    return false;  // special case
  }
  for (int i = 0; i < n; ++i) {
    // <ctype.h> functions have undefined behavior for negative arguments
    // other than EOF.  Plain char may be signed, so bytes >= 0x80 must be
    // converted through unsigned char first.
    if (!::isdigit(static_cast<unsigned char>(data_[i]))) {
      return false;
    }
  }
  return true;
}

// Returns true when the string is non-empty and every byte satisfies the C
// library's ::isalpha().
bool Str::isalpha() {
  int n = len(this);
  if (n == 0) {
    return false;  // special case
  }
  for (int i = 0; i < n; ++i) {
    // <ctype.h> functions have undefined behavior for negative arguments
    // other than EOF.  Plain char may be signed, so bytes >= 0x80 must be
    // converted through unsigned char first.
    if (!::isalpha(static_cast<unsigned char>(data_[i]))) {
      return false;
    }
  }
  return true;
}

// e.g. for osh/braces.py
//
// Returns true when the string is non-empty and every byte satisfies the C
// library's ::isupper().
bool Str::isupper() {
  int n = len(this);
  if (n == 0) {
    return false;  // special case
  }
  for (int i = 0; i < n; ++i) {
    // <ctype.h> functions have undefined behavior for negative arguments
    // other than EOF.  Plain char may be signed, so bytes >= 0x80 must be
    // converted through unsigned char first.
    if (!::isupper(static_cast<unsigned char>(data_[i]))) {
      return false;
    }
  }
  return true;
}

// Returns true when the bytes of 's' appear at the very start of this string.
bool Str::startswith(Str* s) {
  const int prefix_len = len(s);
  // A prefix longer than the string can never match; the length test also
  // keeps memcmp() inside this string's bytes.
  return prefix_len <= len(this) && memcmp(data_, s->data_, prefix_len) == 0;
}

// Returns true when the bytes of 's' appear at the very end of this string.
bool Str::endswith(Str* s) {
  const int suffix_len = len(s);
  // Offset at which the suffix would have to begin
  const int offset = len(this) - suffix_len;
  if (offset < 0) {
    return false;  // suffix is longer than the string
  }
  return memcmp(data_ + offset, s->data_, suffix_len) == 0;
}

// Get a string with one character
//
// A negative 'i' counts back from the end of the string.  The resulting
// position is only checked by assert().
Str* Str::index_(int i) {
  const int n = len(this);
  const int pos = (i < 0) ? n + i : i;
  assert(pos >= 0);
  assert(pos < n);  // had a problem here!

  Str* one_char = NewStr(1);
  one_char->data_[0] = data_[pos];
  return one_char;
}

// s[begin:end]
//
// A negative index counts back from the end of the string.  Both indices are
// then clamped into [0, len], and an inverted range produces a zero-length
// result.  The result is always obtained from NewStr().
Str* Str::slice(int begin, int end) {
  const int n = len(this);

  // Resolve each index: negative values are relative to the end (floored at
  // 0), non-negative values are capped at the string length.
  const int lo = (begin < 0) ? std::max(begin + n, 0) : std::min(begin, n);
  const int hi = (end < 0) ? std::max(end + n, 0) : std::min(end, n);

  assert(lo >= 0 && lo <= n);
  assert(hi >= 0 && hi <= n);

  // Tried to use std::clamp() here but we're not compiling against cxx-17
  const int new_len = std::max(hi - lo, 0);
  assert(new_len <= n);

  Str* result = NewStr(new_len);
  memcpy(result->data_, data_ + lo, new_len);
  return result;
}

// s[begin:]
//
// Returns 'this' unchanged when begin == 0; otherwise delegates to
// slice(begin, end) with end set to the string length.
Str* Str::slice(int begin) {
  if (begin == 0) {
    return this;  // s[i:] where i == 0 is common in here docs
  }
  // slice(begin, end) resolves negative indices itself.  Adding the length
  // here as well applies the offset twice whenever begin < -len: "abc"[-5:]
  // would become slice(-2, 3) == "bc" instead of the whole string.
  return slice(begin, len(this));
}

// Used by 'help' builtin and --help, neither of which translate yet.
//
// Not implemented: asserts that 'keep' is true, then unconditionally reports
// kNotImplemented through the FAIL() macro (defined elsewhere).

List<Str*>* Str::splitlines(bool keep) {
  assert(keep == true);
  FAIL(kNotImplemented);
}

// Returns a new string of the same length with every byte mapped through the
// C library's toupper().
Str* Str::upper() {
  int len_ = len(this);
  Str* result = NewStr(len_);
  char* buffer = result->data();
  for (int char_index = 0; char_index < len_; ++char_index) {
    // toupper() has undefined behavior for negative arguments other than
    // EOF, which is what a byte >= 0x80 is when plain char is signed.
    buffer[char_index] = static_cast<char>(
        ::toupper(static_cast<unsigned char>(data_[char_index])));
  }
  return result;
}

// Returns a new string of the same length with every byte mapped through the
// C library's tolower().
Str* Str::lower() {
  int len_ = len(this);
  Str* result = NewStr(len_);
  char* buffer = result->data();
  for (int char_index = 0; char_index < len_; ++char_index) {
    // tolower() has undefined behavior for negative arguments other than
    // EOF, which is what a byte >= 0x80 is when plain char is signed.
    buffer[char_index] = static_cast<char>(
        ::tolower(static_cast<unsigned char>(data_[char_index])));
  }
  return result;
}

// Left-justifies this string in a field of 'width' bytes, filling on the
// right with the one-byte 'fillchar'.  Returns 'this' when the string is
// already longer than 'width'.
Str* Str::ljust(int width, Str* fillchar) {
  assert(len(fillchar) == 1);

  const int n = len(this);
  if (width < n) {
    return this;
  }

  const int pad = width - n;
  Str* padded = NewStr(width);
  memcpy(padded->data_, data_, n);                     // original text first
  memset(padded->data_ + n, fillchar->data_[0], pad);  // then the fill
  return padded;
}

// Right-justifies this string in a field of 'width' bytes, filling on the
// left with the one-byte 'fillchar'.  Returns 'this' when the string is
// already longer than 'width'.
Str* Str::rjust(int width, Str* fillchar) {
  assert(len(fillchar) == 1);

  const int n = len(this);
  if (width < n) {
    return this;
  }

  const int pad = width - n;
  Str* padded = NewStr(width);
  memset(padded->data_, fillchar->data_[0], pad);  // fill first
  memcpy(padded->data_ + pad, data_, n);           // then the original text
  return padded;
}

// Returns a copy of this string with every non-overlapping occurrence of
// 'old' replaced by 'new_str', scanning left to right.
//
// - Returns 'this' itself when nothing is replaced.
// - An empty 'old' follows Python: new_str is inserted before every byte and
//   at the end, e.g. 'abc'.replace('', '-') == '-a-b-c-'.  (Without this case
//   the scan below would never advance.)
Str* Str::replace(Str* old, Str* new_str) {
  // log("replacing %s with %s", old_data, new_str->data_);
  const char* old_data = old->data_;
  const char* new_data = new_str->data_;

  const int this_len = len(this);
  const int old_len = len(old);
  const int new_len = len(new_str);

  if (old_len == 0) {
    if (new_len == 0) {
      return this;  // replacing nothing with nothing
    }
    Str* result = NewStr(this_len + (this_len + 1) * new_len);
    char* p_result = result->data_;
    for (int i = 0; i < this_len; ++i) {
      memcpy(p_result, new_data, new_len);
      p_result += new_len;
      *p_result++ = data_[i];
    }
    memcpy(p_result, new_data, new_len);  // trailing insertion
    return result;
  }

  // Last index at which 'old' could start.  Negative when 'old' is longer
  // than this string, in which case neither loop below runs.  Using an index
  // rather than a pointer avoids forming a pointer before the buffer.
  const int last_start = this_len - old_len;

  // First pass: Calculate number of replacements, and hence new length
  int replace_count = 0;
  for (int i = 0; i <= last_start;) {
    if (memcmp(data_ + i, old_data, old_len) == 0) {  // equal
      replace_count++;
      i += old_len;
    } else {
      i++;
    }
  }

  // log("replacements %d", replace_count);

  if (replace_count == 0) {
    return this;  // Reuse the string if there were no replacements
  }

  const int result_len = this_len + replace_count * (new_len - old_len);
  Str* result = NewStr(result_len);

  // Second pass: Copy pieces into 'result'
  char* p_result = result->data_;  // advances through 'result'
  int i = 0;                       // advances through 'this'
  while (i <= last_start) {
    // Note: would be more efficient if we remembered the match positions
    if (memcmp(data_ + i, old_data, old_len) == 0) {  // equal
      memcpy(p_result, new_data, new_len);            // Copy from new_str
      p_result += new_len;
      i += old_len;
    } else {  // copy 1 byte
      *p_result++ = data_[i++];
    }
  }
  memcpy(p_result, data_ + i, this_len - i);  // last part of string
  return result;
}

// Selects which end(s) of the string StripAny() removes bytes from.
enum class StripWhere {
  Left,   // leading bytes only
  Right,  // trailing bytes only
  Both,   // leading and trailing bytes
};

// Sentinel value of 'what' meaning "any whitespace byte" rather than one
// specific byte value.
const int kWhitespace = -1;

// Returns true when byte 'ch' should be stripped: it is whitespace according
// to isspace() if 'what' is kWhitespace, otherwise it must equal 'what'.
bool OmitChar(uint8_t ch, int what) {
  return (what == kWhitespace) ? (isspace(ch) != 0) : (what == ch);
}

// StripAny is modeled after CPython's do_strip() in stringobject.c.  It is
// the shared implementation behind six methods:
//
//   strip / lstrip / rstrip
//   strip(char) / lstrip(char) / rstrip(char)
//
// Args:
//   where: which ends to strip from
//   what: kWhitespace, or an ASCII code 0-255
//
// Returns kEmptyString when nothing remains, 's' itself when nothing was
// stripped, and otherwise a new Str holding the remaining bytes.

Str* StripAny(Str* s, StripWhere where, int what) {
  const int length = len(s);
  const char* bytes = s->data();

  // Advance 'start' past the leading bytes to omit
  int start = 0;
  if (where != StripWhere::Right) {
    for (; start < length && OmitChar(bytes[start], what); ++start) {
    }
  }

  // Pull 'stop' (exclusive) back past the trailing bytes to omit, never
  // crossing 'start'
  int stop = length;
  if (where != StripWhere::Left) {
    while (stop > start && OmitChar(bytes[stop - 1], what)) {
      --stop;
    }
  }

  if (start == stop) {  // Optimization to reuse existing object
    return kEmptyString;
  }
  if (start == 0 && stop == length) {  // nothing stripped
    return s;
  }

  // Note: makes a copy in leaky version, and will in GC version too
  const int new_len = stop - start;
  Str* result = NewStr(new_len);
  memcpy(result->data(), s->data() + start, new_len);
  return result;
}

// Strips whitespace from both ends of the string by delegating to StripAny()
// with kWhitespace.
Str* Str::strip() {
  return StripAny(this, StripWhere::Both, kWhitespace);
}

// Used for CommandSub in osh/cmd_exec.py
//
// Strips trailing occurrences of the single byte in 'chars'.
Str* Str::rstrip(Str* chars) {
  assert(len(chars) == 1);
  // Widen through unsigned char so the value is 0..255, which is what
  // OmitChar() compares against.  Plain char may be signed: a byte >= 0x80
  // would then become negative and never match, and 0xFF would become -1,
  // which is kWhitespace.
  int c = static_cast<unsigned char>(chars->data_[0]);
  return StripAny(this, StripWhere::Right, c);
}

// Strips trailing whitespace by delegating to StripAny() with kWhitespace.
Str* Str::rstrip() {
  return StripAny(this, StripWhere::Right, kWhitespace);
}

// Strips leading occurrences of the single byte in 'chars'.
Str* Str::lstrip(Str* chars) {
  assert(len(chars) == 1);
  // Widen through unsigned char so the value is 0..255, which is what
  // OmitChar() compares against.  Plain char may be signed: a byte >= 0x80
  // would then become negative and never match, and 0xFF would become -1,
  // which is kWhitespace.
  int c = static_cast<unsigned char>(chars->data_[0]);
  return StripAny(this, StripWhere::Left, c);
}

// Strips leading whitespace by delegating to StripAny() with kWhitespace.
Str* Str::lstrip() {
  return StripAny(this, StripWhere::Left, kWhitespace);
}

// Concatenates 'items', placing this string between consecutive elements.
//
//   " ".join([]) == ""               -> returns kEmptyString
//   'anything'.join(["foo"]) == "foo" -> returns the element itself
Str* Str::join(List<Str*>* items) {
  const int num_parts = len(items);
  if (num_parts == 0) {
    return kEmptyString;
  }
  if (num_parts == 1) {  // Common case
    return items->index_(0);
  }

  // Total size: one separator between each pair, plus every item
  const int sep_len = len(this);
  int total = sep_len * (num_parts - 1);
  for (int i = 0; i < num_parts; ++i) {
    total += len(items->index_(i));
  }

  Str* result = NewStr(total);
  char* out = result->data_;  // write cursor into 'result'

  for (int i = 0; i < num_parts; ++i) {
    if (i > 0 && sep_len > 0) {  // optimize common case of ''.join()
      memcpy(out, data_, sep_len);
      out += sep_len;
    }

    Str* item = items->index_(i);
    const int item_len = len(item);
    memcpy(out, item->data_, item_len);
    out += item_len;
  }

  return result;
}

// Appends the bytes s[left:right] to 'result' as a Str.  An empty range
// appends the shared kEmptyString instead of allocating.
static void AppendPart(List<Str*>* result, Str* s, int left, int right) {
  const int part_len = right - left;
  if (part_len == 0) {
    result->append(kEmptyString);
    return;
  }

  Str* piece = NewStr(part_len);
  memcpy(piece->data_, s->data_ + left, part_len);
  result->append(piece);
}

// Split Str into List<Str*> of parts separated by 'sep'.
// The code structure is taken from CPython's Objects/stringlib/split.h.
//
// 'sep' must be exactly one byte.  At most 'max_split' separators are
// consumed; whatever follows the last consumed separator becomes the final
// part.  When no separator is consumed the list holds 'this' itself.
List<Str*>* Str::split(Str* sep, int max_split) {
  DCHECK(sep != nullptr);
  DCHECK(len(sep) == 1);  // we can only split one char
  const char sep_char = sep->data_[0];

  const int str_len = len(this);
  if (str_len == 0) {
    // weird case consistent with Python: ''.split(':') == ['']
    return NewList<Str*>({kEmptyString});
  }

  List<Str*>* result = NewList<Str*>({});
  int left = 0;        // start of the part currently being scanned
  int num_splits = 0;  // 3 splits results in 4 parts

  // Single left-to-right scan that stops once the split budget is used up
  for (int right = 0; right < str_len && num_splits < max_split; ++right) {
    if (data_[right] == sep_char) {
      AppendPart(result, this, left, right);
      left = right + 1;
      ++num_splits;
    }
  }

  if (num_splits == 0) {  // Optimization when there is no split
    result->append(this);
  } else {  // Last part; 'left' never exceeds str_len
    AppendPart(result, this, left, str_len);
  }

  return result;
}

// Splits on every occurrence of 'sep'.  The string length is passed as
// max_split, which is an upper bound on the number of separators it can
// contain.
List<Str*>* Str::split(Str* sep) {
  return this->split(sep, len(this));
}

// Implements printf-style formatting for the StrFormat() overloads.
//
// The format string is consumed chunk by chunk with gStrFmtRegex: each chunk
// is a run of literal text optionally followed by one directive of the form
// %[width]code.  Supported codes:
//
//   %%  a literal '%'
//   %s  a Str* argument
//   %r  a Str* argument, passed through repr()
//   %d  an int argument, formatted by snprintf() with the directive as written
//   %o  likewise
//
// A leading '0' in the width requests zero padding (ignored for %s and %r),
// and a negative width pads on the right instead of the left.
//
// Args:
//   fmt, fmt_len: the format string and its length in bytes
//   args: the variadic arguments, one per directive that consumes a value
static inline Str* _StrFormat(const char* fmt, int fmt_len, va_list args) {
  auto beg = std::cregex_iterator(fmt, fmt + fmt_len, gStrFmtRegex);
  auto end = std::cregex_iterator();

  char int_buf[kMaxFmtWidth];
  std::string buf;
  for (std::cregex_iterator it = beg; it != end; ++it) {
    const std::cmatch& match = *it;

    // Literal text preceding the directive, if any
    const std::csub_match& lit_m = match[1];
    assert(lit_m.matched);
    const std::string& lit_s = lit_m.str();
    buf.append(lit_s);

    int width = 0;
    bool zero_pad = false;
    bool pad_back = false;
    const std::csub_match& width_m = match[2];
    const std::string& width_s = width_m.str();
    if (width_m.matched && !width_s.empty()) {
      // StringToInteger() must be called outside assert(): with NDEBUG the
      // assert argument is not evaluated at all, which would silently leave
      // 'width' at 0.
      bool ok = false;
      if (width_s[0] == '0') {
        zero_pad = true;
        assert(width_s.size() > 1);
        ok = StringToInteger(width_s.c_str() + 1, width_s.size() - 1, 10,
                             &width);
      } else {
        ok = StringToInteger(width_s.c_str(), width_s.size(), 10, &width);
      }
      assert(ok);
      (void)ok;  // silence unused variable warning when asserts are disabled

      if (width < 0) {
        pad_back = true;
        width *= -1;
      }
      assert(width >= 0 && width < kMaxFmtWidth);
    }

    char const* str_to_add = nullptr;
    int add_len = 0;
    const std::csub_match& code_m = match[3];
    const std::string& code_s = code_m.str();
    if (!code_m.matched) {
      // Chunk without a directive: nothing left to format
      assert(!width_m.matched);  // python errors on invalid format operators
      break;
    }
    assert(code_s.size() == 1);
    switch (code_s[0]) {
    case '%': {
      str_to_add = code_s.c_str();
      add_len = 1;
      break;
    }
    case 's': {
      Str* s = va_arg(args, Str*);
      // Check type unconditionally because mycpp doesn't always check it
      CHECK(s->header_.type_tag == TypeTag::Str);

      str_to_add = s->data();
      add_len = len(s);
      zero_pad = false;  // python ignores the 0 directive for strings
      break;
    }
    case 'r': {
      Str* s = va_arg(args, Str*);
      // Check type unconditionally because mycpp doesn't always check it
      CHECK(s->header_.type_tag == TypeTag::Str);

      s = repr(s);
      str_to_add = s->data();
      add_len = len(s);
      zero_pad = false;  // python ignores the 0 directive for strings
      break;
    }
    case 'd':  // fallthrough
    case 'o': {
      int d = va_arg(args, int);
      // Skip the literal prefix so that snprintf() sees just the directive,
      // e.g. "%05d"; it applies the width itself.
      add_len = snprintf(int_buf, kMaxFmtWidth,
                         match.str().c_str() + lit_s.size(), d);
      assert(add_len > 0);
      str_to_add = int_buf;
      break;
    }
    default:
      assert(0);
      break;
    }
    assert(str_to_add != nullptr);

    // Emit the value, with padding after it (pad_back) or before it
    if (pad_back) {
      buf.append(str_to_add, add_len);
    }
    if (add_len < width) {
      for (int i = 0; i < width - add_len; ++i) {
        buf.push_back(zero_pad ? '0' : ' ');
      }
    }
    if (!pad_back) {
      buf.append(str_to_add, add_len);
    }
  }

  return StrFromC(buf.c_str(), buf.size());
}

// Returns a new one-byte Str holding the byte of s_ at the iterator's current
// position i_.
Str* StrIter::Value() {  // similar to index_()
  Str* one_char = NewStr(1);
  one_char->data_[0] = s_->data_[i_];
  // The byte after the payload is expected to already be a NUL terminator
  DCHECK(one_char->data_[1] == '\0');
  return one_char;
}

// Formats the variadic arguments according to the NUL-terminated C string
// 'fmt' and returns the result as a Str.  The length is taken with strlen()
// and the work is done by _StrFormat().
Str* StrFormat(const char* fmt, ...) {
  va_list args;
  va_start(args, fmt);
  Str* ret = _StrFormat(fmt, strlen(fmt), args);
  va_end(args);
  return ret;
}

// Same as the const char* overload, but the format string is a Str; its
// length comes from len(fmt) rather than strlen().
Str* StrFormat(Str* fmt, ...) {
  va_list args;
  va_start(args, fmt);
  Str* ret = _StrFormat(fmt->data(), len(fmt), args);
  va_end(args);
  return ret;
}

cpp

Coverage Report

Created: 2023-03-07 20:24

Line	Count	Source (jump to first uncovered line)
1		#include "mycpp/gc_str.h"
2
3		#include <ctype.h> // isalpha(), isdigit()
4		#include <stdarg.h>
5
6		#include <regex>
7
8		#include "mycpp/common.h"
9		#include "mycpp/gc_alloc.h" // NewStr()
10		#include "mycpp/gc_builtins.h" // StringToInteger()
11		#include "mycpp/gc_list.h" // join(), split() use it
12
13		GLOBAL_STR(kEmptyString, "");
14
15		static const std::regex gStrFmtRegex("([^%]*)(?:%(-?[0-9]*)(.))?");
16		static const int kMaxFmtWidth = 256; // arbitrary...
17
18	6	int Str::find(Str* needle, int pos) {
19	6	int len_ = len(this);
20	6	assert(len(needle) == 1); // Oil's usage
21	0	char c = needle->data_[0];
22	24	for (int i = pos; i < len_; ++i) {
23	22	if (data_[i] == c) {
24	4	return i;
25	4	}
26	22	}
27	2	return -1;
28	6	}
29
30	6	int Str::rfind(Str* needle) {
31	6	int len_ = len(this);
32	6	assert(len(needle) == 1); // Oil's usage
33	0	char c = needle->data_[0];
34	24	for (int i = len_ - 1; i >= 0; --i) {
35	22	if (data_[i] == c) {
36	4	return i;
37	4	}
38	22	}
39	2	return -1;
40	6	}
41
42	51	bool Str::isdigit() {
43	51	int n = len(this);
44	51	if (n == 0) {
45	2	return false; // special case
46	2	}
47	65	for (int i = 0; i < n; ++i) {
48	49	if (!::isdigit(data_[i])) {
49	33	return false;
50	33	}
51	49	}
52	16	return true;
53	49	}
54
55	35	bool Str::isalpha() {
56	35	int n = len(this);
57	35	if (n == 0) {
58	0	return false; // special case
59	0	}
60	53	for (int i = 0; i < n; ++i) {
61	39	if (!::isalpha(data_[i])) {
62	21	return false;
63	21	}
64	39	}
65	14	return true;
66	35	}
67
68		// e.g. for osh/braces.py
69	8	bool Str::isupper() {
70	8	int n = len(this);
71	8	if (n == 0) {
72	2	return false; // special case
73	2	}
74	12	for (int i = 0; i < n; ++i) {
75	8	if (!::isupper(data_[i])) {
76	2	return false;
77	2	}
78	8	}
79	4	return true;
80	6	}
81
82	18	bool Str::startswith(Str* s) {
83	18	int n = len(s);
84	18	if (n > len(this)) {
85	0	return false;
86	0	}
87	18	return memcmp(data_, s->data_, n) == 0;
88	18	}
89
90	8	bool Str::endswith(Str* s) {
91	8	int len_s = len(s);
92	8	int len_this = len(this);
93	8	if (len_s > len_this) {
94	0	return false;
95	0	}
96	8	const char* start = data_ + len_this - len_s;
97	8	return memcmp(start, s->data_, len_s) == 0;
98	8	}
99
100		// Get a string with one character
101	54	Str* Str::index_(int i) {
102	54	int len_ = len(this);
103	54	if (i < 0) {
104	2	i = len_ + i;
105	2	}
106	54	assert(i >= 0);
107	0	assert(i < len_); // had a problem here!
108
109	0	Str* result = NewStr(1);
110	54	result->data_[0] = data_[i];
111	54	return result;
112	54	}
113
114		// s[begin:end]
115	631	Str* Str::slice(int begin, int end) {
116	631	int len_ = len(this);
117	631	begin = std::min(begin, len_);
118	631	end = std::min(end, len_);
119
120	631	assert(begin <= len_);
121	0	assert(end <= len_);
122
123	631	if (begin < 0) {
124	284	begin = len_ + begin;
125	284	}
126
127	631	if (end < 0) {
128	287	end = len_ + end;
129	287	}
130
131	631	begin = std::min(begin, len_);
132	631	end = std::min(end, len_);
133
134	631	begin = std::max(begin, 0);
135	631	end = std::max(end, 0);
136
137	631	assert(begin >= 0);
138	0	assert(begin <= len_);
139
140	0	assert(end >= 0);
141	0	assert(end <= len_);
142
143	0	int new_len = end - begin;
144
145		// Tried to use std::clamp() here but we're not compiling against cxx-17
146	631	new_len = std::max(new_len, 0);
147	631	new_len = std::min(new_len, len_);
148
149		// printf("len(%d) [%d, %d] newlen(%d)\n", len_, begin, end, new_len);
150
151	631	assert(new_len >= 0);
152	0	assert(new_len <= len_);
153
154	0	Str* result = NewStr(new_len);
155	631	memcpy(result->data_, data_ + begin, new_len);
156
157	631	return result;
158	631	}
159
160		// s[begin:]
161	6	Str* Str::slice(int begin) {
162	6	int len_ = len(this);
163	6	if (begin == 0) {
164	0	return this; // s[i:] where i == 0 is common in here docs
165	0	}
166	6	if (begin < 0) {
167	1	begin = len_ + begin;
168	1	}
169	6	return slice(begin, len_);
170	6	}
171
172		// Used by 'help' builtin and --help, neither of which translate yet.
173
174	0	List<Str*>* Str::splitlines(bool keep) {
175	0	assert(keep == true);
176	0	FAIL(kNotImplemented);
177	0	}
178
179	6	Str* Str::upper() {
180	6	int len_ = len(this);
181	6	Str* result = NewStr(len_);
182	6	char* buffer = result->data();
183	38	for (int char_index = 0; char_index < len_; ++char_index) {
184	32	buffer[char_index] = toupper(data_[char_index]);
185	32	}
186	6	return result;
187	6	}
188
189	6	Str* Str::lower() {
190	6	int len_ = len(this);
191	6	Str* result = NewStr(len_);
192	6	char* buffer = result->data();
193	38	for (int char_index = 0; char_index < len_; ++char_index) {
194	32	buffer[char_index] = tolower(data_[char_index]);
195	32	}
196	6	return result;
197	6	}
198
199	30	Str* Str::ljust(int width, Str* fillchar) {
200	30	assert(len(fillchar) == 1);
201
202	0	int len_ = len(this);
203	30	int num_fill = width - len_;
204	30	if (num_fill < 0) {
205	10	return this;
206	20	} else {
207	20	Str* result = NewStr(width);
208	20	char c = fillchar->data_[0];
209	20	memcpy(result->data_, data_, len_);
210	42	for (int i = len_; i < width; ++i) {
211	22	result->data_[i] = c;
212	22	}
213	20	return result;
214	20	}
215	30	}
216
217	30	Str* Str::rjust(int width, Str* fillchar) {
218	30	assert(len(fillchar) == 1);
219
220	0	int len_ = len(this);
221	30	int num_fill = width - len_;
222	30	if (num_fill < 0) {
223	10	return this;
224	20	} else {
225	20	Str* result = NewStr(width);
226	20	char c = fillchar->data_[0];
227	42	for (int i = 0; i < num_fill; ++i) {
228	22	result->data_[i] = c;
229	22	}
230	20	memcpy(result->data_ + num_fill, data_, len_);
231	20	return result;
232	20	}
233	30	}
234
235	729	Str* Str::replace(Str* old, Str* new_str) {
236		// log("replacing %s with %s", old_data, new_str->data_);
237	729	const char* old_data = old->data_;
238
239	729	int this_len = len(this);
240	729	int old_len = len(old);
241
242	729	const char* last_possible = data_ + this_len - old_len;
243
244	729	const char* p_this = data_; // advances through 'this'
245
246		// First pass: Calculate number of replacements, and hence new length
247	729	int replace_count = 0;
248	93.3k	while (p_this <= last_possible) {
249	92.6k	if (memcmp(p_this, old_data, old_len) == 0) { // equal
250	758	replace_count++;
251	758	p_this += old_len;
252	91.9k	} else {
253	91.9k	p_this++;
254	91.9k	}
255	92.6k	}
256
257		// log("replacements %d", replace_count);
258
259	729	if (replace_count == 0) {
260	4	return this; // Reuse the string if there were no replacements
261	4	}
262
263	725	int new_str_len = len(new_str);
264	725	int result_len =
265	725	this_len - (replace_count * old_len) + (replace_count * new_str_len);
266
267	725	Str* result = NewStr(result_len);
268
269	725	const char* new_data = new_str->data_;
270	725	const size_t new_len = new_str_len;
271
272		// Second pass: Copy pieces into 'result'
273	725	p_this = data_; // back to beginning
274	725	char* p_result = result->data_; // advances through 'result'
275
276	93.3k	while (p_this <= last_possible) {
277		// Note: would be more efficient if we remembered the match positions
278	92.6k	if (memcmp(p_this, old_data, old_len) == 0) { // equal
279	758	memcpy(p_result, new_data, new_len); // Copy from new_str
280	758	p_result += new_len;
281	758	p_this += old_len;
282	91.8k	} else { // copy 1 byte
283	91.8k	*p_result = *p_this;
284	91.8k	p_result++;
285	91.8k	p_this++;
286	91.8k	}
287	92.6k	}
288	725	memcpy(p_result, p_this, data_ + this_len - p_this); // last part of string
289	725	return result;
290	729	}
291
292		enum class StripWhere {
293		Left,
294		Right,
295		Both,
296		};
297
298		const int kWhitespace = -1;
299
300	162	bool OmitChar(uint8_t ch, int what) {
301	162	if (what == kWhitespace) {
302	122	return isspace(ch);
303	122	} else {
304	40	return what == ch;
305	40	}
306	162	}
307
308		// StripAny is modeled after CPython's do_strip() in stringobject.c, and can
309		// implement 6 functions:
310		//
311		// strip / lstrip / rstrip
312		// strip(char) / lstrip(char) / rstrip(char)
313		//
314		// Args:
315		// where: which ends to strip from
316		// what: kWhitespace, or an ASCII code 0-255
317
318	62	Str* StripAny(Str* s, StripWhere where, int what) {
319	62	int length = len(s);
320	62	const char* char_data = s->data();
321
322	62	int i = 0;
323	62	if (where != StripWhere::Right) {
324	90	while (i < length && OmitChar(char_data[i], what)) {
325	52	i++;
326	52	}
327	38	}
328
329	62	int j = length;
330	62	if (where != StripWhere::Left) {
331	98	do {
332	98	j--;
333	98	} while (j >= i && OmitChar(char_data[j], what));
334	46	j++;
335	46	}
336
337	62	if (i == j) { // Optimization to reuse existing object
338	18	return kEmptyString;
339	18	}
340
341	44	if (i == 0 && j == length) { // nothing stripped
342	8	return s;
343	8	}
344
345		// Note: makes a copy in leaky version, and will in GC version too
346	36	int new_len = j - i;
347	36	Str* result = NewStr(new_len);
348	36	memcpy(result->data(), s->data() + i, new_len);
349	36	return result;
350	44	}
351
352	22	Str* Str::strip() {
353	22	return StripAny(this, StripWhere::Both, kWhitespace);
354	22	}
355
356		// Used for CommandSub in osh/cmd_exec.py
357	8	Str* Str::rstrip(Str* chars) {
358	8	assert(len(chars) == 1);
359	0	int c = chars->data_[0];
360	8	return StripAny(this, StripWhere::Right, c);
361	8	}
362
363	16	Str* Str::rstrip() {
364	16	return StripAny(this, StripWhere::Right, kWhitespace);
365	16	}
366
367	8	Str* Str::lstrip(Str* chars) {
368	8	assert(len(chars) == 1);
369	0	int c = chars->data_[0];
370	8	return StripAny(this, StripWhere::Left, c);
371	8	}
372
373	8	Str* Str::lstrip() {
374	8	return StripAny(this, StripWhere::Left, kWhitespace);
375	8	}
376
377	58	Str* Str::join(List<Str*>* items) {
378	58	int length = 0;
379
380	58	int num_parts = len(items);
381
382		// " ".join([]) == ""
383	58	if (num_parts == 0) {
384	9	return kEmptyString;
385	9	}
386
387		// Common case
388		// 'anything'.join(["foo"]) == "foo"
389	49	if (num_parts == 1) {
390	4	return items->index_(0);
391	4	}
392
393	199	for (int i = 0; i < num_parts; ++i) {
394	154	length += len(items->index_(i));
395	154	}
396		// add length of all the separators
397	45	int this_len = len(this);
398	45	length += this_len * (num_parts - 1);
399
400	45	Str* result = NewStr(length);
401	45	char* p_result = result->data_; // advances through
402
403	199	for (int i = 0; i < num_parts; ++i) {
404		// log("i %d", i);
405	154	if (i != 0 && this_len) { // optimize common case of ''.join()
406	16	memcpy(p_result, data_, this_len); // copy the separator
407	16	p_result += this_len;
408		// log("this_len %d", this_len);
409	16	}
410
411	154	int n = len(items->index_(i));
412		// log("n: %d", n);
413	154	memcpy(p_result, items->index_(i)->data_, n); // copy the list item
414	154	p_result += n;
415	154	}
416
417	45	return result;
418	49	}
419
420	98	static void AppendPart(List<Str*>* result, Str* s, int left, int right) {
421	98	int new_len = right - left;
422	98	Str* part;
423	98	if (new_len == 0) {
424	42	part = kEmptyString;
425	56	} else {
426	56	part = NewStr(new_len);
427	56	memcpy(part->data_, s->data_ + left, new_len);
428	56	}
429	98	result->append(part);
430	98	}
431
432		// Split Str into List<Str*> of parts separated by 'sep'.
433		// The code structure is taken from CPython's Objects/stringlib/split.h.
434	38	List<Str*>* Str::split(Str* sep, int max_split) {
435	38	DCHECK(sep != nullptr);
436	38	DCHECK(len(sep) == 1); // we can only split one char
437	0	char sep_char = sep->data_[0];
438
439	38	int str_len = len(this);
440	38	if (str_len == 0) {
441		// weird case consistent with Python: ''.split(':') == ['']
442	4	return NewList<Str*>({kEmptyString});
443	4	}
444
445	34	List<Str*>* result = NewList<Str*>({});
446	34	int left = 0;
447	34	int right = 0;
448	34	int num_parts = 0; // 3 splits results in 4 parts
449
450	114	while (right < str_len && num_parts < max_split) {
451		// search for separator
452	186	for (; right < str_len; right++) {
453	174	if (data_[right] == sep_char) {
454	68	AppendPart(result, this, left, right);
455	68	right++;
456	68	left = right;
457	68	num_parts++;
458	68	break;
459	68	}
460	174	}
461	80	}
462	34	if (num_parts == 0) { // Optimization when there is no split
463	4	result->append(this);
464	30	} else if (left <= str_len) { // Last part
465	30	AppendPart(result, this, left, str_len);
466	30	}
467
468	34	return result;
469	38	}
470
471	32	List<Str*>* Str::split(Str* sep) {
472	32	return this->split(sep, len(this));
473	32	}
474
475	473	static inline Str* _StrFormat(const char* fmt, int fmt_len, va_list args) {
476	473	auto beg = std::cregex_iterator(fmt, fmt + fmt_len, gStrFmtRegex);
477	473	auto end = std::cregex_iterator();
478
479	473	char int_buf[kMaxFmtWidth];
480	473	std::string buf;
481	1.05k	for (std::cregex_iterator it = beg; it != end; ++it) {
482	1.05k	const std::cmatch& match = *it;
483
484	1.05k	const std::csub_match& lit_m = match[1];
485	1.05k	assert(lit_m.matched);
486	0	const std::string& lit_s = lit_m.str();
487	1.05k	buf.append(lit_s);
488
489	1.05k	int width = 0;
490	1.05k	bool zero_pad = false;
491	1.05k	bool pad_back = false;
492	1.05k	const std::csub_match& width_m = match[2];
493	1.05k	const std::string& width_s = width_m.str();
494	1.05k	if (width_m.matched && !width_s.empty()) {
495	23	if (width_s[0] == '0') {
496	5	zero_pad = true;
497	5	assert(width_s.size() > 1);
498	0	assert(StringToInteger(width_s.c_str() + 1, width_s.size() - 1, 10,
499	5	&width));
500	18	} else {
501	18	assert(StringToInteger(width_s.c_str(), width_s.size(), 10, &width));
502	18	}
503	23	if (width < 0) {
504	2	pad_back = true;
505	2	width *= -1;
506	2	}
507	23	assert(width >= 0 && width < kMaxFmtWidth);
508	23	}
509
510	0	char const* str_to_add = nullptr;
511	1.05k	int add_len = 0;
512	1.05k	const std::csub_match& code_m = match[3];
513	1.05k	const std::string& code_s = code_m.str();
514	1.05k	if (!code_m.matched) {
515	473	assert(!width_m.matched); // python errors on invalid format operators
516	0	break;
517	473	}
518	586	assert(code_s.size() == 1);
519	0	switch (code_s[0]) {
520	14	case '%': {
521	14	str_to_add = code_s.c_str();
522	14	add_len = 1;
523	14	break;
524	0	}
525	294	case 's': {
526	294	Str* s = va_arg(args, Str*);
527		// Check type unconditionally because mycpp doesn't always check it
528	294	CHECK(s->header_.type_tag == TypeTag::Str);
529
530	0	str_to_add = s->data();
531	294	add_len = len(s);
532	294	zero_pad = false; // python ignores the 0 directive for strings
533	294	break;
534	0	}
535	30	case 'r': {
536	30	Str* s = va_arg(args, Str*);
537		// Check type unconditionally because mycpp doesn't always check it
538	30	CHECK(s->header_.type_tag == TypeTag::Str);
539
540	0	s = repr(s);
541	30	str_to_add = s->data();
542	30	add_len = len(s);
543	30	zero_pad = false; // python ignores the 0 directive for strings
544	30	break;
545	0	}
546	239	case 'd': // fallthrough
547	248	case 'o': {
548	248	int d = va_arg(args, int);
549	248	add_len = snprintf(int_buf, kMaxFmtWidth,
550	248	match.str().c_str() + lit_s.size(), d);
551	248	assert(add_len > 0);
552	0	str_to_add = int_buf;
553	248	break;
554	239	}
555	0	default:
556	0	assert(0);
557	0	break;
558	586	}
559	586	assert(str_to_add != nullptr);
560
561	586	if (pad_back) {
562	2	buf.append(str_to_add, add_len);
563	2	}
564	586	if (add_len < width) {
565	42	for (int i = 0; i < width - add_len; ++i) {
566	36	buf.push_back(zero_pad ? '0' : ' ');
567	36	}
568	6	}
569	586	if (!pad_back) {
570	584	buf.append(str_to_add, add_len);
571	584	}
572	586	}
573
574	473	return StrFromC(buf.c_str(), buf.size());
575	473	}
576
577	159	Str* StrIter::Value() { // similar to index_()
578	159	Str* result = NewStr(1);
579	159	result->data_[0] = s_->data_[i_];
580	159	DCHECK(result->data_[1] == '\0');
581	0	return result;
582	159	}
583
584	466	Str* StrFormat(const char* fmt, ...) {
585	466	va_list args;
586	466	va_start(args, fmt);
587	466	Str* ret = _StrFormat(fmt, strlen(fmt), args);
588	466	va_end(args);
589	466	return ret;
590	466	}
591
592	7	Str* StrFormat(Str* fmt, ...) {
593	7	va_list args;
594	7	va_start(args, fmt);
595	7	Str* ret = _StrFormat(fmt->data(), len(fmt), args);
596	7	va_end(args);
597	7	return ret;
598	7	}