/home/andy/git/oilshell/oil/mycpp/gc_str.h
Line | Count | Source |
1 | | #ifndef MYCPP_GC_STR_H |
2 | | #define MYCPP_GC_STR_H |
3 | | |
4 | | #include "mycpp/common.h" // DISALLOW_COPY_AND_ASSIGN |
5 | | #include "mycpp/gc_obj.h" // GC_OBJ |
6 | | |
7 | | template <typename T> |
8 | | class List; |
9 | | |
10 | | class Str { |
11 | | public: |
12 | | // Don't call this directly. Call NewStr() instead, which calls this. |
13 | 5.28k | Str() { |
14 | 5.28k | } |
15 | | |
16 | 553 | char* data() { |
17 | 553 | return data_; |
18 | 553 | } |
19 | | |
20 | | // Call this after writing into the buffer created by OverAllocatedStr() |
21 | | void MaybeShrink(int str_len); |
22 | | |
23 | | Str* index_(int i); |
24 | | |
25 | | int find(Str* needle, int pos = 0); |
26 | | int rfind(Str* needle); |
27 | | |
28 | | Str* slice(int begin); |
29 | | Str* slice(int begin, int end); |
30 | | Str* slice(int begin, int end, int step); |
31 | | |
32 | | Str* strip(); |
33 | | // Used for CommandSub in osh/cmd_exec.py |
34 | | Str* rstrip(Str* chars); |
35 | | Str* rstrip(); |
36 | | |
37 | | Str* lstrip(Str* chars); |
38 | | Str* lstrip(); |
39 | | |
40 | | Str* ljust(int width, Str* fillchar); |
41 | | Str* rjust(int width, Str* fillchar); |
42 | | |
43 | | bool startswith(Str* s); |
44 | | bool endswith(Str* s); |
45 | | |
46 | | Str* replace(Str* old, Str* new_str); |
47 | | Str* join(List<Str*>* items); |
48 | | |
49 | | List<Str*>* split(Str* sep); |
50 | | List<Str*>* split(Str* sep, int max_split); |
51 | | List<Str*>* splitlines(bool keep); |
52 | | |
53 | | bool isdigit(); |
54 | | bool isalpha(); |
55 | | bool isupper(); |
56 | | |
57 | | Str* upper(); |
58 | | Str* lower(); |
59 | | |
60 | | // Other options for fast comparison / hashing / string interning: |
61 | | // - unique_id_: an index into intern table. I don't think this works unless |
62 | | // you want to deal with rehashing all strings when the set grows. |
63 | | // - although note that the JVM has -XX:StringTableSize=FIXED, which means |
64 | | // it can degrade into linked-list performance |
65 | | // - Hashed strings become GLOBAL_STR(). Never deallocated. |
66 | | // - Hashed strings become part of the "large object space", which might be |
67 | | // managed by mark and sweep. This requires linked list overhead. |
68 | | // (doubly-linked?) |
69 | | // - Intern strings at GARBAGE COLLECTION TIME, with |
70 | | // LayoutForwarded::new_location_? Is this possible? Does it introduce |
71 | | // too much coupling between strings, hash tables, and GC? |
72 | | |
73 | 5.28k | static constexpr ObjHeader obj_header() { |
74 | 5.28k | return ObjHeader::Str(); |
75 | 5.28k | } |
76 | | |
77 | | int len_; |
78 | | int hash_value_; |
79 | | char data_[1]; // flexible array |
80 | | |
81 | | private: |
82 | | int _strip_left_pos(); |
83 | | int _strip_right_pos(); |
84 | | |
85 | | DISALLOW_COPY_AND_ASSIGN(Str) |
86 | | }; |
87 | | |
88 | | constexpr int kStrHeaderSize = offsetof(Str, data_); |
89 | | |
90 | | // Note: for SmallStr, we might copy into the VALUE |
91 | 156 | inline void Str::MaybeShrink(int str_len) { |
92 | 156 | len_ = str_len; |
93 | 156 | data_[len_] = '\0'; // NUL terminate |
94 | 156 | } |
95 | | |
96 | 13.4k | inline int len(const Str* s) { |
97 | 13.4k | return s->len_; |
98 | 13.4k | } |
99 | | |
100 | | Str* StrFormat(const char* fmt, ...); |
101 | | Str* StrFormat(Str* fmt, ...); |
102 | | |
103 | | // NOTE: This iterates over bytes. |
104 | | class StrIter { |
105 | | public: |
106 | 126 | explicit StrIter(Str* s) : s_(s), i_(0), len_(len(s)) { |
107 | | // Cheney only: s_ could be moved during iteration. |
108 | | // gHeap.PushRoot(reinterpret_cast<RawObject**>(&s_)); |
109 | 126 | } |
110 | 126 | ~StrIter() { |
111 | | // gHeap.PopRoot(); |
112 | 126 | } |
113 | 158 | void Next() { |
114 | 158 | i_++; |
115 | 158 | } |
116 | 284 | bool Done() { |
117 | 284 | return i_ >= len_; |
118 | 284 | } |
119 | | Str* Value(); // similar to index_() |
120 | | |
121 | | private: |
122 | | Str* s_; |
123 | | int i_; |
124 | | int len_; |
125 | | |
126 | | DISALLOW_COPY_AND_ASSIGN(StrIter) |
127 | | }; |
128 | | |
129 | | bool maybe_str_equals(Str* left, Str* right); |
130 | | |
131 | | extern Str* kEmptyString; |
132 | | |
133 | | // GlobalStr notes: |
134 | | // - sizeof("foo") == 4, for the NUL terminator. |
135 | | // - gc_heap_test.cc has a static_assert that GlobalStr matches Str. We don't |
136 | | // put it here because it triggers -Winvalid-offsetof |
137 | | |
138 | | template <int N> |
139 | | class GlobalStr { |
140 | | // A template type with the same layout as Str with length N-1 (which needs a |
141 | | // buffer of size N). For initializing global constant instances. |
142 | | public: |
143 | | int len_; |
144 | | int hash_value_; |
145 | | const char data_[N]; |
146 | | |
147 | | DISALLOW_COPY_AND_ASSIGN(GlobalStr) |
148 | | }; |
149 | | |
150 | | // This macro is a workaround for the fact that it's impossible to have a |
151 | | // constexpr initializer for char[N]. The "String Literals as Non-Type |
152 | | // Template Parameters" feature of C++ 20 would have done it, but it's not |
153 | | // there. |
154 | | // |
155 | | // https://old.reddit.com/r/cpp_questions/comments/j0khh6/how_to_constexpr_initialize_class_member_thats/ |
156 | | // https://stackoverflow.com/questions/10422487/how-can-i-initialize-char-arrays-in-a-constructor |
157 | | // |
158 | | // TODO: Can we hash values at compile time so they can be in the intern table? |
159 | | |
160 | | #define GLOBAL_STR(name, val) \ |
161 | | GcGlobal<GlobalStr<sizeof(val)>> _##name = { \ |
162 | | ObjHeader::Global(TypeTag::Str), \ |
163 | | {.len_ = sizeof(val) - 1, .hash_value_ = 0, .data_ = val}}; \ |
164 | | Str* name = reinterpret_cast<Str*>(&_##name.obj); |
165 | | |
166 | | #endif // MYCPP_GC_STR_H |