Line data Source code
1 : /* src/vm/utf8.hpp - utf8 string functions
2 :
3 : Copyright (C) 1996-2014
4 : CACAOVM - Verein zur Foerderung der freien virtuellen Maschine CACAO
5 :
6 : This file is part of CACAO.
7 :
8 : This program is free software; you can redistribute it and/or
9 : modify it under the terms of the GNU General Public License as
10 : published by the Free Software Foundation; either version 2, or (at
11 : your option) any later version.
12 :
13 : This program is distributed in the hope that it will be useful, but
14 : WITHOUT ANY WARRANTY; without even the implied warranty of
15 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 : General Public License for more details.
17 :
18 : You should have received a copy of the GNU General Public License
19 : along with this program; if not, write to the Free Software
20 : Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
21 : 02110-1301, USA.
22 :
23 : */
24 :
25 :
26 : #ifndef UTF8_HPP_
27 : #define UTF8_HPP_ 1
28 :
29 : #include "config.h" // used in utf8.inc
30 :
31 : #include <cstddef> // for size_t
32 : #include <cstdio> // for FILE
33 : #include <cstring> // for strlen
34 : #include <stdint.h> // for uint16_t (also uint32_t, uint8_t)
35 :
36 : #include <iterator>
37 :
38 : namespace cacao { class OStream; }
39 : struct utf;
40 :
/* Utf8String ******************************************************************

	A container for strings in Java's modified UTF-8 encoding.

	A Utf8String always contains either a valid (possibly empty) UTF-8 string
	or NULL.
	You can check for NULL like you would with any normal pointer.
	Invoking any method except operator void*() or c_ptr() on a NULL string
	leads to undefined behaviour.

	Use a Utf8String like a pointer, i.e. always pass by value
	(the object consists of a single pointer).

	The contents of a Utf8String are zero terminated, and it never contains any
	zero bytes except the one at the end, so any C string processing functions
	work properly.

*******************************************************************************/

class Utf8String {
public:
	/*** GLOBAL INITIALIZATION **********************************/

	// initialize the utf8 subsystem
	// MUST be called before any Utf8String can be constructed
	static void initialize();

	// check if utf8 subsystem is initialized
	static bool is_initialized();

	/*** CONSTRUCTORS ******************************************/

	// constructs a null string
	Utf8String() : _data(0) {}

	// construct from a buffer with a given length
	// validates that input is really UTF-8
	// constructs a null string on error
	static Utf8String from_utf8(const char*, size_t);
	static Utf8String from_utf8_dot_to_slash(const char*, size_t);
	static Utf8String from_utf8_slash_to_dot(const char*, size_t);

	// convenience overloads for zero terminated C strings,
	// the length is determined with strlen (cs must not be NULL)
	static Utf8String from_utf8(const char *cs) {
		return from_utf8(cs, std::strlen(cs));
	}
	static Utf8String from_utf8_dot_to_slash(const char *cs) {
		return from_utf8_dot_to_slash(cs, std::strlen(cs));
	}

	// construct from a Utf8String
	static Utf8String from_utf8_slash_to_dot(Utf8String);

	// construct from a UTF-16 string with a given length
	static Utf8String from_utf16(const uint16_t*, size_t);
	static Utf8String from_utf16_dot_to_slash(const uint16_t*, size_t);

	// constructs a Utf8String with a given content
	// is only public for interop with legacy C code
	// NOTE: does NOT perform any checks, the pointer is simply
	//       reinterpreted as the internal representation
	Utf8String(struct utf *u) : _data(reinterpret_cast<Data*>(u)) {}

	/*** ITERATION     ******************************************/

	// iterator over the bytes in a string
	typedef const char* byte_iterator;

	byte_iterator begin() const { return _data->text; }
	byte_iterator end()   const { return begin() + size(); }

	// iterator over UTF-16 codepoints in a string
	struct utf16_iterator {
		typedef std::input_iterator_tag iterator_category;
		typedef std::ptrdiff_t          difference_type;
		typedef uint16_t                value_type;
		typedef const value_type*       pointer;
		typedef const value_type&       reference;

		// defined out of line
		// NOTE(review): not const, presumably because decoding moves
		//               `next` past the current codepoint -- confirm
		uint16_t operator*();

		// step to the position stored in `next`
		// returns *this so the iterator can be used like a standard one
		utf16_iterator& operator++() {
			current = next;
			return *this;
		}

		// iterators are compared by the byte position they point at
		bool operator==(const utf16_iterator& it) const {
			return current == it.current;
		}
		bool operator!=(const utf16_iterator& it) const {
			return current != it.current;
		}
	private:
		// only Utf8String may create iterators
		utf16_iterator(byte_iterator it) : current(it), next(it) {}

		byte_iterator current, next;

		friend class Utf8String;
	};

	utf16_iterator utf16_begin() const { return utf16_iterator(begin()); }
	utf16_iterator utf16_end()   const { return utf16_iterator(end());   }

	/*** HASHING       ******************************************/

	// returns the hash value cached in the string
	size_t hash() const { return _data->hash; }

	/*** COMPARISONS   ******************************************/

	/// check if this utf-8 string contains the same utf-16
	/// codepoints as a utf-16 string
	// NOTE(review): defined out of line and not const; making it const
	//               requires changing the definition as well
	bool equals(const uint16_t *cs, size_t sz);

	/// check if this utf-8 string contains the same bytes as a C string
	bool equals(const char *cs) const {
		return std::strcmp(begin(), cs) == 0;
	}

	/*** ACCESSORS     ******************************************/

	// access first element
	// for an empty string this is the zero terminator
	char front() const { return begin()[0]; }

	// access last element
	// for an empty string this yields '\0' (like front()) instead of
	// indexing with size() - 1, which would wrap around
	char back() const {
		const size_t sz = size();

		return sz ? begin()[sz - 1] : '\0';
	}

	// access the byte at a given index, no bounds checking is performed
	char operator[](size_t idx) const { return begin()[idx]; }

	// get the number of bytes in string, excluding zero terminator.
	size_t size() const { return _data->utf8_size; }

	// get the number of utf16 codepoints in string
	size_t utf16_size() const { return _data->utf16_size; }

	// for checking against NULL,
	// also allows interop with legacy C code
	operator void*() const { return _data; }

	utf* c_ptr() const { return reinterpret_cast<utf*>(_data); }

	// create substring
	Utf8String substring(size_t from) const;
	Utf8String substring(size_t from, size_t to) const;

	/*** MISC          ******************************************/

	bool is_valid_name() const;

	// TODO: remove (only used in loader.cpp)
	static const size_t sizeof_utf;
private:
	// MUST be a POD type
	struct Data {
		size_t hash;        // cached hash of the string
		size_t utf8_size;   // text length in bytes (does NOT include zero terminator)
		size_t utf16_size;  // number of utf16 codepoints in string

		char   text[sizeof(void*)]; // string content
		                            // directly embedded in struct utf
		                            // aligned to pointer size
	};

	static inline Data *alloc(size_t hash,
	                          size_t utf8_size,
	                          size_t utf16_size);

	static void free(Utf8String u);

	// the only data member, NULL for a null string
	Data *_data;

	template<typename Iterator>
	friend struct FromUtf8Builder;

	template<typename Iterator>
	friend struct FromUtf16Builder;
};
208 :
209 :
210 : // ***** UTF-8 HELPER FUNCTIONS
211 :
212 : namespace utf8 {
213 : // count UTF-16 codepoints, -1 on error
214 : extern long num_codepoints(const char*, size_t);
215 :
216 : // count how many bytes a utf-8 version would need
217 : extern size_t num_bytes(const uint16_t*, size_t);
218 :
219 : extern size_t compute_hash(const uint16_t *cs, size_t);
220 :
221 : // named constants for common utf8 strings
222 : #define UTF8(NAME, STR) extern Utf8String NAME;
223 : #include "vm/utf8.inc"
224 : }
225 :
226 : // these are only used in old logging code
227 :
228 : void utf_display_printable_ascii(Utf8String u);
229 : void utf_display_printable_ascii_classname(Utf8String u);
230 :
231 : void utf_fprint_printable_ascii(FILE *file, Utf8String u);
232 : void utf_fprint_printable_ascii_classname(FILE *file, Utf8String u);
233 :
234 : // OStream operators
235 : namespace cacao {
236 : class OStream;
237 :
238 : OStream& operator<<(OStream& os, const Utf8String &u);
239 :
240 : }
241 :
242 : ////////////////////////////////////////////////////////////////////////////////
243 : ////////////////////////////////////////////////////////////////////////////////
244 : // LEGACY C API
245 : ////////////////////////////////////////////////////////////////////////////////
246 : ////////////////////////////////////////////////////////////////////////////////
247 :
248 : // these are only used in jvmti and cacaodbg
249 :
250 : #define UTF_END(u) utf8_end(u)
251 : #define UTF_SIZE(u) utf8_size(u)
252 :
253 : extern const char *utf8_end(utf*);
254 : extern size_t utf8_size(utf*);
255 :
256 : void utf_sprint_convert_to_latin1(char *buffer, Utf8String u);
257 : void utf_sprint_convert_to_latin1_classname(char *buffer, Utf8String u);
258 :
259 : void utf_strcat_convert_to_latin1(char *buffer, Utf8String u);
260 : void utf_strcat_convert_to_latin1_classname(char *buffer, Utf8String u);
261 :
262 : #endif // UTF8_HPP_
263 :
264 :
265 : /*
266 : * These are local overrides for various environment variables in Emacs.
267 : * Please do not remove this and leave it at the end of the file, where
268 : * Emacs will automagically detect them.
269 : * ---------------------------------------------------------------------
270 : * Local variables:
271 : * mode: c++
272 : * indent-tabs-mode: t
273 : * c-basic-offset: 4
274 : * tab-width: 4
275 : * End:
276 : * vim:noexpandtab:sw=4:ts=4:
277 : */
|