cpp

Coverage Report

Created: 2024-03-13 14:13

/home/andy/git/oilshell/oil/mycpp/gc_str.h
Line
Count
Source (jump to first uncovered line)
1
#ifndef MYCPP_GC_STR_H
2
#define MYCPP_GC_STR_H
3
4
#include "mycpp/common.h"  // DISALLOW_COPY_AND_ASSIGN
5
#include "mycpp/gc_obj.h"  // GC_OBJ
6
#include "mycpp/hash.h"    // HashFunc
7
8
template <typename T>
9
class List;
10
11
class BigStr {  // string object: len_, hash bits, then the character data stored inline
12
 public:
13
  // Don't call this directly.  Call NewStr() instead, which calls this.
14
6.01k
  BigStr() {  // intentionally empty: no field is initialized here
15
6.01k
  }
16
17
553
  char* data() {  // mutable pointer to the first byte of data_
18
553
    return data_;
19
553
  }
20
21
  // Call this after writing into a buffer created by OverAllocatedStr(); it sets len_ to str_len and NUL-terminates.
22
  void MaybeShrink(int str_len);
23
24
  BigStr* at(int i);
25
26
  // TODO: take (start, end) for equality without allocation (not pos)
27
  int find(BigStr* needle, int pos = 0);
28
  int rfind(BigStr* needle);
29
30
  BigStr* slice(int begin);
31
  BigStr* slice(int begin, int end);
32
33
  BigStr* strip();
34
  // Used for CommandSub in osh/cmd_exec.py
35
  BigStr* rstrip(BigStr* chars);
36
  BigStr* rstrip();
37
38
  BigStr* lstrip(BigStr* chars);
39
  BigStr* lstrip();
40
41
  BigStr* ljust(int width, BigStr* fillchar);
42
  BigStr* rjust(int width, BigStr* fillchar);
43
44
  // TODO: could take (start, end) so Tokens can be compared without allocation
45
  bool startswith(BigStr* s);
46
  bool endswith(BigStr* s);
47
48
  BigStr* replace(BigStr* old, BigStr* new_str);
49
  BigStr* replace(BigStr* old, BigStr* new_str, int count);
50
  BigStr* join(List<BigStr*>* items);
51
52
  List<BigStr*>* split(BigStr* sep);
53
  List<BigStr*>* split(BigStr* sep, int max_split);
54
  List<BigStr*>* splitlines(bool keep);
55
56
  // TODO: Move unicode functions out of mycpp runtime?  Because we won't match
57
  // Python exactly
58
  bool isdigit();
59
  bool isalpha();
60
  bool isupper();
61
62
  BigStr* upper();
63
  BigStr* lower();
64
65
  // Other options for fast comparison / hashing / string interning:
66
  // - unique_id_: an index into intern table.  I don't think this works unless
67
  //   you want to deal with rehashing all strings when the set grows.
68
  //   - although note that the JVM has -XX:StringTableSize=FIXED, which means
69
  //     it can degrade into linked list performance
70
  // - Hashed strings become GLOBAL_STR().  Never deallocated.
71
  // - Hashed strings become part of the "large object space", which might be
72
  //   managed by mark and sweep.  This requires linked list overhead.
73
  //   (doubly-linked?)
74
  // - Intern strings at GARBAGE COLLECTION TIME, with
75
  //   LayoutForwarded::new_location_?  Is this possible?  Does it introduce
76
  //   too much coupling between strings, hash tables, and GC?
77
78
6.01k
  static constexpr ObjHeader obj_header() {  // header value used for BigStr objects
79
6.01k
    return ObjHeader::BigStr();
80
6.01k
  }
81
82
  unsigned hash(HashFunc h);  // presumably caches its result in hash_ / is_hashed_ -- confirm in the .cc
83
84
  int len_;  // length in bytes, not counting the NUL terminator stored at data_[len_]
85
  unsigned hash_ : 31;  // 31-bit hash field
86
  unsigned is_hashed_ : 1;  // 1-bit flag; presumably set once hash_ is valid -- confirm in hash()
87
  char data_[1];  // flexible array: bytes are written up to data_[len_]; kStrHeaderSize is its offset
88
89
 private:
90
  int _strip_left_pos();
91
  int _strip_right_pos();
92
93
  DISALLOW_COPY_AND_ASSIGN(BigStr)
94
};
95
96
constexpr int kStrHeaderSize = offsetof(BigStr, data_);  // number of bytes in a BigStr that precede its character data
97
98
// Note: for SmallStr, we might copy into the VALUE
99
212
inline void BigStr::MaybeShrink(int str_len) {  // str_len: new length; not checked against the buffer size here
100
212
  len_ = str_len;  // only the recorded length changes; nothing is reallocated in this function
101
212
  data_[len_] = '\0';  // NUL terminate at the new end
102
212
}
103
104
13.3k
inline int len(const BigStr* s) {  // free-function length accessor; s is dereferenced without a null check
105
13.3k
  return s->len_;
106
13.3k
}
107
108
BigStr* StrFormat(const char* fmt, ...);  // variadic; presumably printf-style formatting -- confirm in the .cc
109
BigStr* StrFormat(BigStr* fmt, ...);  // same, with the format string passed as a BigStr*
110
111
// NOTE: This iterates over bytes.
112
class StrIter {  // index-based cursor over a BigStr
113
 public:
114
10
  explicit StrIter(BigStr* s) : s_(s), i_(0), len_(len(s)) {  // the length is read once, at construction
115
    // Cheney only: s_ could be moved during iteration, so it would have to be rooted:
116
    // gHeap.PushRoot(reinterpret_cast<RawObject**>(&s_));
117
10
  }
118
10
  ~StrIter() {
119
    // gHeap.PopRoot();  (counterpart of the disabled PushRoot in the constructor)
120
10
  }
121
42
  void Next() {  // advance the index by one
122
42
    i_++;
123
42
  }
124
52
  bool Done() {  // true once the index has reached the length captured at construction
125
52
    return i_ >= len_;
126
52
  }
127
  BigStr* Value();  // similar to at()
128
129
 private:
130
  BigStr* s_;  // string being iterated
131
  int i_;  // current index, starts at 0
132
  int len_;  // len(s) as of construction
133
134
  DISALLOW_COPY_AND_ASSIGN(StrIter)
135
};
136
137
extern BigStr* kEmptyString;  // declaration only; the definition is not in this header
138
139
// GlobalStr notes:
140
// - sizeof("foo") == 4, for the NUL terminator.
141
// - gc_heap_test.cc has a static_assert that GlobalStr matches BigStr.  We
142
// don't put it here because it triggers -Winvalid-offsetof
143
144
template <int N>
145
class GlobalStr {
146
  // A template type with the same layout as BigStr with length N-1 (which needs
147
  // a buffer of size N).  Used for initializing global constant instances.
148
 public:
149
  int len_;  // same fields, in the same order, as BigStr
150
  unsigned hash_ : 31;
151
  unsigned is_hashed_ : 1;
152
  const char data_[N];  // N counts the NUL terminator: GLOBAL_STR instantiates this with sizeof(val)
153
154
  DISALLOW_COPY_AND_ASSIGN(GlobalStr)
155
};
156
157
union Str {  // string handle passed by value; at present it only ever wraps a BigStr*
158
 public:
159
  // Instead of this at the start of every function:
160
  //   Str* s = nullptr;
161
  // It will now be:
162
  //   Str s(nullptr);
163
  //
164
  //   StackRoot _root(&s);
165
12
  explicit Str(BigStr* big) : big_(big) {  // stores the pointer; nullptr is accepted (see usage above)
166
12
  }
167
168
10
  char* data() {  // forwards to BigStr::data(); big_ is dereferenced without a null check
169
10
    return big_->data();
170
10
  }
171
172
10
  Str at(int i) {  // forwards to BigStr::at() and rewraps the result as a Str
173
10
    return Str(big_->at(i));
174
10
  }
175
176
0
  Str upper() {  // forwards to BigStr::upper() and rewraps the result as a Str
177
0
    return Str(big_->upper());
178
0
  }
179
180
  uint64_t raw_bytes_;  // overlays big_; presumably reserved for the planned SmallStr -- confirm
181
  BigStr* big_;  // the wrapped heap string
182
  // TODO: add SmallStr, see mycpp/small_str_test.cc
183
};
184
185
12
inline int len(const Str s) {  // Str overload: delegates to len(const BigStr*) on the wrapped pointer
186
12
  return len(s.big_);
187
12
}
188
189
// This macro is a workaround for the fact that it's impossible to have a
190
// constexpr initializer for char[N].  The "String Literals as Non-Type
191
// Template Parameters" feature of C++ 20 would have done it, but it's not
192
// there.
193
//
194
// https://old.reddit.com/r/cpp_questions/comments/j0khh6/how_to_constexpr_initialize_class_member_thats/
195
// https://stackoverflow.com/questions/10422487/how-can-i-initialize-char-arrays-in-a-constructor
196
//
197
// TODO: Can we hash values at compile time so they can be in the intern table?  For now hash_ and is_hashed_ start at 0.
198
199
#define GLOBAL_STR(name, val)                                                \
200
  GcGlobal<GlobalStr<sizeof(val)>> _##name = {                               \
201
      ObjHeader::Global(TypeTag::BigStr),                                    \
202
      {.len_ = sizeof(val) - 1, .hash_ = 0, .is_hashed_ = 0, .data_ = val}}; \
203
  BigStr* name = reinterpret_cast<BigStr*>(&_##name.obj);
204
205
// New style for SmallStr compatibility: same storage as GLOBAL_STR, but `name` is a Str wrapping it instead of a BigStr*
206
#define GLOBAL_STR2(name, val)                                               \
207
  GcGlobal<GlobalStr<sizeof(val)>> _##name = {                               \
208
      ObjHeader::Global(TypeTag::BigStr),                                    \
209
      {.len_ = sizeof(val) - 1, .hash_ = 0, .is_hashed_ = 0, .data_ = val}}; \
210
  Str name(reinterpret_cast<BigStr*>(&_##name.obj));
211
212
#endif  // MYCPP_GC_STR_H