CACAO
utf8.hpp
Go to the documentation of this file.
1 /* src/vm/utf8.hpp - utf8 string functions
2 
3  Copyright (C) 1996-2014
4  CACAOVM - Verein zur Foerderung der freien virtuellen Maschine CACAO
5 
6  This file is part of CACAO.
7 
8  This program is free software; you can redistribute it and/or
9  modify it under the terms of the GNU General Public License as
10  published by the Free Software Foundation; either version 2, or (at
11  your option) any later version.
12 
13  This program is distributed in the hope that it will be useful, but
14  WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16  General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program; if not, write to the Free Software
20  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
21  02110-1301, USA.
22 
23 */
24 
25 
26 #ifndef UTF8_HPP_
27 #define UTF8_HPP_ 1
28 
29 #include "config.h" // used in utf8.inc
30 
31 #include <cstddef> // for size_t
32 #include <cstdio> // for FILE
33 #include <cstring> // for strlen
34 #include <stdint.h> // for uint32_t, uint8_t
35 
36 #include <iterator>
37 
38 namespace cacao { class OStream; }
39 struct utf;
40 
41 /* Utf8String ******************************************************************
42 
43  A container for strings in Java's modified UTF-8 encoding.
44 
45  A Utf8String always contains either a valid (possibly empty) UTF-8 string
46  or NULL.
47  You can check for NULL like you would with any normal pointer.
48  Invoking any method except operator void*() or c_ptr() on a NULL string leads to
49  undefined behaviour.
50 
51  Use a Utf8String like a pointer, i.e. always pass by value.
52 
53  The contents of a Utf8String are zero terminated, and it never contains any
54  zero bytes except the one at the end, so any C string processing functions
55  work properly.
56 
57 *******************************************************************************/
58 
59 class Utf8String {
60  public:
61  /*** GLOBAL INITIALIZATION **********************************/
62 
63  // initialize the utf8 subsystem
64  // MUST be called before any Utf8String can be constructed
65  static void initialize();
66 
67  // check if the utf8 subsystem is initialized
68  static bool is_initialized();
69 
70  /*** CONSTRUCTORS ******************************************/
71 
72  // constructs a null string
73  Utf8String() : _data(0) {}
74 
75  // construct from a buffer with a given length
76  // validates that input is really UTF-8
77  // constructs a null string on error
78  static Utf8String from_utf8(const char*, size_t);
79  static Utf8String from_utf8_dot_to_slash(const char*, size_t);
80  static Utf8String from_utf8_slash_to_dot(const char*, size_t);
81  // convenience overloads for NUL-terminated C strings: the byte length is measured with strlen
82  static Utf8String from_utf8(const char *cs) {
83  return Utf8String::from_utf8(cs, std::strlen(cs));
84  }
85  static Utf8String from_utf8_dot_to_slash(const char *cs) {
86  return Utf8String::from_utf8_dot_to_slash(cs, std::strlen(cs));
87  }
88 
89  // construct from a UTF8String
91 
92  // construct from a UTF-16 string with a given length
93  static Utf8String from_utf16(const uint16_t*, size_t);
94  static Utf8String from_utf16_dot_to_slash(const uint16_t*, size_t);
95 
96  // wraps an existing utf record; only the pointer is stored
97  // public solely so that legacy C code can hand over raw utf pointers
98  // NOTE: the pointer is reinterpreted as-is, nothing is validated
99  Utf8String(utf *u) : _data(reinterpret_cast<Data*>(u)) {}
100 
101  /*** ITERATION ******************************************/
102 
103  // read-only iterator over the raw bytes of the string (a plain const char*)
104  typedef const char* byte_iterator;
105  // [begin(), end()) covers exactly size() bytes; both dereference _data, so the string must not be null
106  byte_iterator begin() const { return _data->text; } // first byte of the embedded text
107  byte_iterator end() const { return begin() + size(); } // begin() advanced by size() bytes
108 
109  // single-pass iterator over the UTF-16 codepoints of a string (value_type is uint16_t)
110  struct utf16_iterator {
111  typedef std::input_iterator_tag iterator_category;
112  typedef std::ptrdiff_t difference_type;
113  typedef uint16_t value_type;
114  typedef const value_type* pointer;
115  typedef const value_type& reference;
116  // yields the value at the current position; defined out of line
117  uint16_t operator*();
118  // step to the position held in `next`; NOTE(review): `next` is presumably set by operator*(), so dereference before incrementing -- confirm in utf8.cpp
119  void operator++() { current = next; }
120  // iterators differ when their byte positions differ; const so that const iterators can be compared too
121  bool operator!=(const utf16_iterator& it) const {
122  return current != it.current;
123  }
124  private:
126 
128 
129  friend class Utf8String;
130  };
131 
134 
135  /*** HASHING ******************************************/
136  // returns the hash cached in the string's Data header; dereferences _data, so the string must not be null
137  size_t hash() const { return _data->hash; }
138 
139  /*** COMPARISONS ******************************************/
140 
141  /// check if this UTF-8 string contains the same UTF-16
142  /// codepoints as the given UTF-16 string
143  bool equals(const uint16_t *cs, size_t sz);
144 
145  /// check if this UTF-8 string contains the same bytes as the NUL-terminated C string cs (const: the string is only read)
146  bool equals(const char *cs) const {
147  return strcmp(begin(), cs) == 0;
148  }
149 
150  /*** ACCESSORS ******************************************/
151 
152  // access first byte (for an empty string this reads the zero terminator)
153  char front() const { return begin()[0]; }
154 
155  // access last byte; NOTE: size() - 1 wraps around when size() == 0, so only call this on non-empty strings
156  char back() const { return begin()[size() - 1]; }
157  // byte at offset idx; no bounds checking is performed
158  char operator[](size_t idx) const { return begin()[idx]; }
159 
160  // get the number of bytes in string, excluding zero terminator.
161  size_t size() const { return _data->utf8_size; }
162 
163  // get the number of utf16 codepoints in string
164  size_t utf16_size() const { return _data->utf16_size; }
165 
166  // implicit conversion so that a Utf8String can be tested against NULL like a pointer
167  // and passed to legacy C code that expects an untyped pointer
168  operator void*() const { return static_cast<void*>(_data); }
169  // the same pointer, typed as the legacy utf struct
170  utf* c_ptr() const { return reinterpret_cast<utf*>(_data); }
171 
172  // create substring
173  Utf8String substring(size_t from ) const;
174  Utf8String substring(size_t from, size_t to ) const;
175 
176  /*** MISC ******************************************/
177 
178  bool is_valid_name() const;
179 
180  // TODO: remove (only used in loader.cpp)
181  static const size_t sizeof_utf;
182  private:
183  // header and text of one string, stored together; MUST be a POD type
184  struct Data {
185  size_t hash; // cached hash of the string
186  size_t utf8_size; // text length in bytes (does NOT include zero terminator)
187  size_t utf16_size; // number of utf16 codepoints in string
188  // NOTE(review): text is declared with only sizeof(void*) bytes; presumably alloc() reserves extra room behind the struct for longer strings -- confirm in utf8.cpp
189  char text[sizeof(void*)]; // string content
190  // directly embedded in struct utf
191  // aligned to pointer size
192  };
193 
194  static inline Data *alloc(size_t hash,
195  size_t utf8_size,
196  size_t utf16_size);
197 
198  static void free(Utf8String u);
199 
201 
202  template<typename Iterator>
203  friend struct FromUtf8Builder;
204 
205  template<typename Iterator>
206  friend struct FromUtf16Builder;
207 };
208 
209 
210 // ***** UTF-8 HELPER FUNCTIONS
211 
212 namespace utf8 {
213  // count UTF-16 codepoints, -1 on error
214  extern long num_codepoints(const char*, size_t);
215 
216  // count how many bytes a utf-8 version would need
217  extern size_t num_bytes(const uint16_t*, size_t);
218  // NOTE(review): presumably hashes the UTF-16 buffer cs of the given length -- confirm in utf8.cpp
219  extern size_t compute_hash(const uint16_t *cs, size_t);
220  // NOTE(review): UTF8 is not #undef'd in this header -- check whether utf8.inc does it
221  // named constants for common utf8 strings: each UTF8(NAME, STR) entry of utf8.inc expands to an extern declaration of NAME (STR is unused here)
222  #define UTF8(NAME, STR) extern Utf8String NAME;
223  #include "vm/utf8.inc"
224 }
225 
226 // these are only used in old logging code
227 
230 
231 void utf_fprint_printable_ascii(FILE *file, Utf8String u);
233 
234 // OStream operators
235 namespace cacao {
236 class OStream; // same forward declaration as at the top of this header
237 // stream insertion operator taking a Utf8String; only declared here
238 OStream& operator<<(OStream& os, const Utf8String &u);
239 
240 }
241 
242 ////////////////////////////////////////////////////////////////////////////////
243 ////////////////////////////////////////////////////////////////////////////////
244 // LEGACY C API
245 ////////////////////////////////////////////////////////////////////////////////
246 ////////////////////////////////////////////////////////////////////////////////
247 
248 // these are only used in jvmti and cacaodbg
249 // macro spellings that simply forward to the two functions declared below
250 #define UTF_END(u) utf8_end(u)
251 #define UTF_SIZE(u) utf8_size(u)
252 // NOTE(review): presumably the raw-pointer counterparts of Utf8String::end() and Utf8String::size() -- confirm in utf8.cpp
253 extern const char *utf8_end(utf*);
254 extern size_t utf8_size(utf*);
255 
256 void utf_sprint_convert_to_latin1(char *buffer, Utf8String u);
258 
259 void utf_strcat_convert_to_latin1(char *buffer, Utf8String u);
261 
262 #endif // UTF8_HPP_
263 
264 
265 /*
266  * These are local overrides for various environment variables in Emacs.
267  * Please do not remove this and leave it at the end of the file, where
268  * Emacs will automagically detect them.
269  * ---------------------------------------------------------------------
270  * Local variables:
271  * mode: c++
272  * indent-tabs-mode: t
273  * c-basic-offset: 4
274  * tab-width: 4
275  * End:
276  * vim:noexpandtab:sw=4:ts=4:
277  */
void utf_strcat_convert_to_latin1_classname(char *buffer, Utf8String u)
Definition: utf8.cpp:637
size_t utf16_size
Definition: utf8.hpp:187
Utf8String substring(size_t from) const
Definition: utf8.cpp:378
static const size_t sizeof_utf
Definition: utf8.hpp:181
Definition: os.hpp:123
static Utf8String from_utf8_dot_to_slash(const char *, size_t)
Definition: utf8.cpp:339
bool operator!=(const utf16_iterator &it)
Definition: utf8.hpp:121
argument_type from
size_t utf16_size() const
Definition: utf8.hpp:164
static Data * alloc(size_t hash, size_t utf8_size, size_t utf16_size)
allocate a Utf8String with given hash and size You still have to fill in the strings text! ...
Definition: utf8.cpp:144
static Utf8String from_utf8_slash_to_dot(const char *, size_t)
Definition: utf8.cpp:343
std::ptrdiff_t difference_type
Definition: utf8.hpp:112
size_t compute_hash(const uint16_t *cs, size_t)
Definition: utf8.cpp:487
byte_iterator end() const
Definition: utf8.hpp:107
void utf_display_printable_ascii_classname(Utf8String u)
Definition: utf8.cpp:552
size_t size() const
Definition: utf8.hpp:161
static void initialize()
Definition: utf8.cpp:110
byte_iterator current
Definition: utf8.hpp:127
Data * _data
Definition: utf8.hpp:200
char front() const
Definition: utf8.hpp:153
void utf_fprint_printable_ascii_classname(FILE *file, Utf8String u)
Definition: utf8.cpp:665
static Utf8String from_utf16_dot_to_slash(const uint16_t *, size_t)
Definition: utf8.cpp:355
bool equals(const char *cs)
check if utf-8 strings contains same bytes as C string
Definition: utf8.hpp:146
size_t hash() const
Definition: utf8.hpp:137
void utf_strcat_convert_to_latin1(char *buffer, utf *u)
Definition: utf8.cpp:623
void utf_sprint_convert_to_latin1_classname(char *buffer, Utf8String u)
Definition: utf8.cpp:604
bool equals(const uint16_t *cs, size_t sz)
check if utf-8 strings contains the same utf-16 codepoints as a utf-16 string
char operator[](size_t idx) const
Definition: utf8.hpp:158
const char * byte_iterator
Definition: utf8.hpp:104
size_t hash
Definition: utf8.hpp:185
static void free(Utf8String u)
static bool is_initialized()
Definition: utf8.cpp:133
bool is_valid_name() const
Definition: utf8.cpp:393
utf16_iterator utf16_begin() const
Definition: utf8.hpp:132
static Utf8String from_utf8(const char *, size_t)
Definition: utf8.cpp:335
void utf_sprint_convert_to_latin1(char *buffer, Utf8String u)
Definition: utf8.cpp:584
static Utf8String from_utf8(const char *cs)
Definition: utf8.hpp:82
OStream & operator<<(OStream &OS, const std::string &t)
Definition: OStream.hpp:459
byte_iterator next
Definition: utf8.hpp:127
void utf_fprint_printable_ascii(FILE *file, Utf8String u)
Definition: utf8.cpp:650
void utf_display_printable_ascii(Utf8String u)
Definition: utf8.cpp:532
Utf8String(utf *u)
Definition: utf8.hpp:99
static Utf8String from_utf16(const uint16_t *, size_t)
Definition: utf8.cpp:351
utf16_iterator(byte_iterator it)
Definition: utf8.hpp:125
long num_codepoints(const char *, size_t)
Definition: utf8.cpp:440
byte_iterator begin() const
Definition: utf8.hpp:106
std::input_iterator_tag iterator_category
Definition: utf8.hpp:111
char back() const
Definition: utf8.hpp:156
utf16_iterator utf16_end() const
Definition: utf8.hpp:133
const value_type * pointer
Definition: utf8.hpp:114
utf * c_ptr() const
Definition: utf8.hpp:170
static Utf8String from_utf8_dot_to_slash(const char *cs)
Definition: utf8.hpp:85
const value_type & reference
Definition: utf8.hpp:115
size_t utf8_size(utf *u)
Definition: utf8.cpp:508
char text[sizeof(void *)]
Definition: utf8.hpp:189
const char * utf8_end(utf *u)
Definition: utf8.cpp:506
size_t utf8_size
Definition: utf8.hpp:186
Utf8String()
Definition: utf8.hpp:73
size_t num_bytes(const uint16_t *, size_t)
Definition: utf8.cpp:463