/home/andy/git/oilshell/oil/mycpp/gc_str.h
Line | Count | Source (jump to first uncovered line) |
1 | | #ifndef MYCPP_GC_STR_H |
2 | | #define MYCPP_GC_STR_H |
3 | | |
4 | | #include "mycpp/common.h" // DISALLOW_COPY_AND_ASSIGN |
5 | | #include "mycpp/gc_obj.h" // GC_OBJ |
6 | | #include "mycpp/hash.h" // HashFunc |
7 | | |
8 | | template <typename T> |
9 | | class List; |
10 | | |
11 | | class BigStr { |
12 | | public: |
13 | | // Don't call this directly. Call NewStr() instead, which calls this. |
14 | 6.01k | BigStr() { |
15 | 6.01k | } |
16 | | |
17 | 553 | char* data() { |
18 | 553 | return data_; |
19 | 553 | } |
20 | | |
21 | | // Call this after writing into buffer created by OverAllocatedStr() |
22 | | void MaybeShrink(int str_len); |
23 | | |
24 | | BigStr* at(int i); |
25 | | |
26 | | // TODO: take (start, end) for equality without allocation (not pos) |
27 | | int find(BigStr* needle, int pos = 0); |
28 | | int rfind(BigStr* needle); |
29 | | |
30 | | BigStr* slice(int begin); |
31 | | BigStr* slice(int begin, int end); |
32 | | |
33 | | BigStr* strip(); |
34 | | // Used for CommandSub in osh/cmd_exec.py |
35 | | BigStr* rstrip(BigStr* chars); |
36 | | BigStr* rstrip(); |
37 | | |
38 | | BigStr* lstrip(BigStr* chars); |
39 | | BigStr* lstrip(); |
40 | | |
41 | | BigStr* ljust(int width, BigStr* fillchar); |
42 | | BigStr* rjust(int width, BigStr* fillchar); |
43 | | |
44 | | // Can take (start, end) so Tokens can be compared without allocation |
45 | | bool startswith(BigStr* s); |
46 | | bool endswith(BigStr* s); |
47 | | |
48 | | BigStr* replace(BigStr* old, BigStr* new_str); |
49 | | BigStr* replace(BigStr* old, BigStr* new_str, int count); |
50 | | BigStr* join(List<BigStr*>* items); |
51 | | |
52 | | List<BigStr*>* split(BigStr* sep); |
53 | | List<BigStr*>* split(BigStr* sep, int max_split); |
54 | | List<BigStr*>* splitlines(bool keep); |
55 | | |
56 | | // TODO: Move unicode functions out of mycpp runtime? Because we won't match |
57 | | // Python exactly |
58 | | bool isdigit(); |
59 | | bool isalpha(); |
60 | | bool isupper(); |
61 | | |
62 | | BigStr* upper(); |
63 | | BigStr* lower(); |
64 | | |
65 | | // Other options for fast comparison / hashing / string interning: |
66 | | // - unique_id_: an index into intern table. I don't think this works unless |
67 | | // you want to deal with rehashing all strings when the set grows. |
68 | | // - although note that the JVM has -XX:StringTableSize=FIXED, which means |
69 | | // - it can degrade into linked list performance |
70 | | // - Hashed strings become GLOBAL_STR(). Never deallocated. |
71 | | // - Hashed strings become part of the "large object space", which might be |
72 | | // managed by mark and sweep. This requires linked list overhead. |
73 | | // (doubly-linked?) |
74 | | // - Intern strings at GARBAGE COLLECTION TIME, with |
75 | | // LayoutForwarded::new_location_? Is this possible? Does it introduce |
76 | | // too much coupling between strings, hash tables, and GC? |
77 | | |
78 | 6.01k | static constexpr ObjHeader obj_header() { |
79 | 6.01k | return ObjHeader::BigStr(); |
80 | 6.01k | } |
81 | | |
82 | | unsigned hash(HashFunc h); |
83 | | |
84 | | int len_; |
85 | | unsigned hash_ : 31; |
86 | | unsigned is_hashed_ : 1; |
87 | | char data_[1]; // flexible array |
88 | | |
89 | | private: |
90 | | int _strip_left_pos(); |
91 | | int _strip_right_pos(); |
92 | | |
93 | | DISALLOW_COPY_AND_ASSIGN(BigStr) |
94 | | }; |
95 | | |
96 | | constexpr int kStrHeaderSize = offsetof(BigStr, data_); |
97 | | |
98 | | // Note: for SmallStr, we might copy into the VALUE |
99 | 212 | inline void BigStr::MaybeShrink(int str_len) { |
100 | 212 | len_ = str_len; |
101 | 212 | data_[len_] = '\0'; // NUL terminate |
102 | 212 | } |
103 | | |
104 | 13.3k | inline int len(const BigStr* s) { |
105 | 13.3k | return s->len_; |
106 | 13.3k | } |
107 | | |
108 | | BigStr* StrFormat(const char* fmt, ...); |
109 | | BigStr* StrFormat(BigStr* fmt, ...); |
110 | | |
111 | | // NOTE: This iterates over bytes. |
112 | | class StrIter { |
113 | | public: |
114 | 10 | explicit StrIter(BigStr* s) : s_(s), i_(0), len_(len(s)) { |
115 | | // Cheney only: s_ could be moved during iteration. |
116 | | // gHeap.PushRoot(reinterpret_cast<RawObject**>(&s_)); |
117 | 10 | } |
118 | 10 | ~StrIter() { |
119 | | // gHeap.PopRoot(); |
120 | 10 | } |
121 | 42 | void Next() { |
122 | 42 | i_++; |
123 | 42 | } |
124 | 52 | bool Done() { |
125 | 52 | return i_ >= len_; |
126 | 52 | } |
127 | | BigStr* Value(); // similar to at() |
128 | | |
129 | | private: |
130 | | BigStr* s_; |
131 | | int i_; |
132 | | int len_; |
133 | | |
134 | | DISALLOW_COPY_AND_ASSIGN(StrIter) |
135 | | }; |
136 | | |
137 | | extern BigStr* kEmptyString; |
138 | | |
139 | | // GlobalStr notes: |
140 | | // - sizeof("foo") == 4, for the NUL terminator. |
141 | | // - gc_heap_test.cc has a static_assert that GlobalStr matches BigStr. We |
142 | | // don't put it here because it triggers -Winvalid-offsetof |
143 | | |
144 | | template <int N> |
145 | | class GlobalStr { |
146 | | // A template type with the same layout as BigStr with length N-1 (which needs |
147 | | // a buffer of size N). For initializing global constant instances. |
148 | | public: |
149 | | int len_; |
150 | | unsigned hash_ : 31; |
151 | | unsigned is_hashed_ : 1; |
152 | | const char data_[N]; |
153 | | |
154 | | DISALLOW_COPY_AND_ASSIGN(GlobalStr) |
155 | | }; |
156 | | |
157 | | union Str { |
158 | | public: |
159 | | // Instead of this at the start of every function: |
160 | | // Str* s = nullptr; |
161 | | // It will now be: |
162 | | // Str s(nullptr); |
163 | | // |
164 | | // StackRoot _root(&s); |
165 | 12 | explicit Str(BigStr* big) : big_(big) { |
166 | 12 | } |
167 | | |
168 | 10 | char* data() { |
169 | 10 | return big_->data(); |
170 | 10 | } |
171 | | |
172 | 10 | Str at(int i) { |
173 | 10 | return Str(big_->at(i)); |
174 | 10 | } |
175 | | |
176 | 0 | Str upper() { |
177 | 0 | return Str(big_->upper()); |
178 | 0 | } |
179 | | |
180 | | uint64_t raw_bytes_; |
181 | | BigStr* big_; |
182 | | // TODO: add SmallStr, see mycpp/small_str_test.cc |
183 | | }; |
184 | | |
185 | 12 | inline int len(const Str s) { |
186 | 12 | return len(s.big_); |
187 | 12 | } |
188 | | |
189 | | // This macro is a workaround for the fact that it's impossible to have a |
190 | | // a constexpr initializer for char[N]. The "String Literals as Non-Type |
191 | | // Template Parameters" feature of C++ 20 would have done it, but it's not |
192 | | // there. |
193 | | // |
194 | | // https://old.reddit.com/r/cpp_questions/comments/j0khh6/how_to_constexpr_initialize_class_member_thats/ |
195 | | // https://stackoverflow.com/questions/10422487/how-can-i-initialize-char-arrays-in-a-constructor |
196 | | // |
197 | | // TODO: Can we hash values at compile time so they can be in the intern table? |
198 | | |
199 | | #define GLOBAL_STR(name, val) \ |
200 | | GcGlobal<GlobalStr<sizeof(val)>> _##name = { \ |
201 | | ObjHeader::Global(TypeTag::BigStr), \ |
202 | | {.len_ = sizeof(val) - 1, .hash_ = 0, .is_hashed_ = 0, .data_ = val}}; \ |
203 | | BigStr* name = reinterpret_cast<BigStr*>(&_##name.obj); |
204 | | |
205 | | // New style for SmallStr compatibility |
206 | | #define GLOBAL_STR2(name, val) \ |
207 | | GcGlobal<GlobalStr<sizeof(val)>> _##name = { \ |
208 | | ObjHeader::Global(TypeTag::BigStr), \ |
209 | | {.len_ = sizeof(val) - 1, .hash_ = 0, .is_hashed_ = 0, .data_ = val}}; \ |
210 | | Str name(reinterpret_cast<BigStr*>(&_##name.obj)); |
211 | | |
212 | | #endif // MYCPP_GC_STR_H |