Line data Source code
1 : /* src/vm/utf8.cpp - utf8 string functions
2 :
3 : Copyright (C) 1996-2014
4 : CACAOVM - Verein zur Foerderung der freien virtuellen Maschine CACAO
5 :
6 : This file is part of CACAO.
7 :
8 : This program is free software; you can redistribute it and/or
9 : modify it under the terms of the GNU General Public License as
10 : published by the Free Software Foundation; either version 2, or (at
11 : your option) any later version.
12 :
13 : This program is distributed in the hope that it will be useful, but
14 : WITHOUT ANY WARRANTY; without even the implied warranty of
15 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 : General Public License for more details.
17 :
18 : You should have received a copy of the GNU General Public License
19 : along with this program; if not, write to the Free Software
20 : Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
21 : 02110-1301, USA.
22 :
23 : */
24 :
25 : #include "vm/utf8.hpp"
26 : #include <algorithm> // for std::equal
27 : #include "mm/memory.hpp" // for mem_alloc, mem_free
28 : #include "toolbox/logging.hpp" // for OStream
29 : #include "toolbox/intern_table.hpp" // for InternTable
30 : #include "toolbox/utf_utils.hpp" // for transform, Tag, etc
31 : #include "vm/options.hpp"
32 : #include "vm/statistics.hpp"
33 : #include "toolbox/assert.hpp"
34 :
35 : using namespace cacao;
36 :
37 : STAT_REGISTER_VAR(int,count_utf_new,0,"utf new","Calls of utf_new")
38 : STAT_DECLARE_VAR(int,count_utf_len,0)
39 :
40 : //****************************************************************************//
41 : //***** GLOBAL UTF8-STRING INTERN TABLE *****//
42 : //****************************************************************************//
43 :
44 : // empty marker types used for tag dispatch: a key's tag() selects which InternedUtf8String::equal() overload compares it
45 : struct utf8_tag {}; // key text is compared byte by byte
46 : struct utf16_tag {}; // key text is compared UTF-16 unit by UTF-16 unit
47 :
48 114688 : struct InternedUtf8String { // entry type of the global intern table; wraps a single Utf8String
49 2899968 : InternedUtf8String() : string(0) {} // empty slot: no string attached
50 : InternedUtf8String(Utf8String u) : string(u) {}
51 :
52 : /// Interface to HashTable
53 : // slot state is derived from the wrapped string: null means empty, anything else means occupied
54 16549534 : bool is_empty() const { return string == ((utf*) 0); }
55 114688 : bool is_occupied() const { return string != ((utf*) 0); }
56 12553131 : bool is_deleted() const { return false; } // a slot never reports itself as deleted
57 : // fills the slot with whatever string the key produces from its get_string()
58 : template<typename T>
59 1995024 : void set_occupied(const T& t) { string = t.get_string(); }
60 :
61 : // template<typename Iterator>
62 : // bool operator==(const FromUtf16Builder<Iterator>& t) const;
63 : // compares this entry with a lookup key; the key's tag() picks the UTF-8 or the UTF-16 equal() overload
64 : template<typename T>
65 12553131 : bool operator==(const T& t) const {
66 12553131 : return equal(t.hash(), t.size(), t.begin(), t.tag());
67 : }
68 : // UTF-8 key: equal when the hash, the byte count and all _size bytes match
69 : template<typename Iterator>
70 12515627 : bool equal(size_t _hash, size_t _size, Iterator it, utf8_tag) const {
71 : return hash() == _hash
72 : && size() == _size
73 12515627 : && std::equal(it, it + _size, begin());
74 : }
75 : // UTF-16 key: equal when the hash, the UTF-16 length and all _size UTF-16 units match
76 : template<typename Iterator>
77 37504 : bool equal(size_t _hash, size_t _size, Iterator it, utf16_tag) const {
78 : return hash() == _hash
79 : && utf16_size() == _size
80 37504 : && std::equal(it, it + _size, utf16_begin());
81 : }
82 :
83 : /// used by operator==
84 : // an entry can itself serve as a UTF-8 key
85 : utf8_tag tag() const { return utf8_tag(); }
86 :
87 12650599 : size_t hash() const { return string.hash(); }
88 4053467 : size_t size() const { return string.size(); }
89 :
90 7981 : size_t utf16_size() const { return string.utf16_size(); }
91 :
92 4053467 : Utf8String::byte_iterator begin() const { return string.begin(); }
93 : Utf8String::byte_iterator end() const { return string.end(); }
94 :
95 7981 : Utf8String::utf16_iterator utf16_begin() const { return string.utf16_begin(); }
96 : Utf8String::utf16_iterator utf16_end() const { return string.utf16_end(); }
97 :
98 : /// used by set_occupied
99 : // also what the builders' finish() returns to their callers after interning
100 6056472 : Utf8String get_string() const { return string; }
101 : private:
102 : Utf8String string; // the wrapped string; null while the slot is empty
103 : };
104 :
105 165 : static InternTable<InternedUtf8String> intern_table; // file-local table through which all strings built in this file are interned
106 :
107 : // initial size of the intern table, passed to intern_table.initialize()
108 : #define HASHTABLE_UTF_SIZE 16384
109 :
110 163 : void Utf8String::initialize(void) // sets up the intern table and the predefined utf8:: string constants
111 : {
112 163 : TRACESUBSYSTEMINITIALIZATION("utf8_init");
113 : // must not be called twice: the intern table has to be uninitialized at this point
114 163 : assert(!is_initialized());
115 :
116 163 : intern_table.initialize(HASHTABLE_UTF_SIZE);
117 : // statistics bookkeeping only: one utf* per initial table slot
118 : STATISTICS(count_utf_len += sizeof(utf*) * HASHTABLE_UTF_SIZE);
119 :
120 : // create utf-symbols for pointer comparison of frequently used strings
121 : // (presumably vm/utf8.inc is a list of UTF8(NAME, STR) lines, each expanding to one interning assignment - confirm)
122 : #define UTF8(NAME, STR) utf8::NAME = Utf8String::from_utf8(STR);
123 : #include "vm/utf8.inc"
124 163 : }
125 :
126 :
127 : /* Utf8String::is_initialized **************************************************
128 :
129 : Check if the utf8 subsystem is initialized
130 :
131 : *******************************************************************************/
132 :
133 163 : bool Utf8String::is_initialized(void)
134 : {
135 163 : return intern_table.is_initialized(); // the intern table's state stands in for the whole subsystem
136 : }
137 :
138 : //****************************************************************************//
139 : //***** INTERNAL DATA REPRESENTATION *****//
140 : //****************************************************************************//
141 :
142 : /// allocate the storage for a Utf8String and record its hash and its sizes
143 : /// You still have to fill in the string's text (and the terminating NUL)!
144 1897556 : inline Utf8String::Data* Utf8String::alloc(size_t hash,
145 : size_t utf8_size,
146 : size_t utf16_size) {
147 1897556 : Data* str = (Data*) mem_alloc(offsetof(Data,text) + utf8_size + 1); // header up to text + text bytes + 1 extra byte for the terminator
148 :
149 : STATISTICS(count_utf_new++);
150 :
151 1897556 : str->hash = hash;
152 1897556 : str->utf8_size = utf8_size;
153 1897556 : str->utf16_size = utf16_size;
154 : // text is left uninitialized on purpose, see above
155 1897556 : return str;
156 : }
157 :
158 :
159 : //****************************************************************************//
160 : //***** HASHING *****//
161 : //****************************************************************************//
162 :
163 : /* init/update/finish_hash *****************************************************
164 :
165 : These routines are used to compute the hash for a utf-8 string byte by byte.
166 :
167 : Use like this:
168 : size_t hash = 0;
169 :
170 : for each byte in string:
171 : hash = update_hash( hash, byte );
172 :
173 : hash = finish_hash(hash);
174 :
175 : The algorithm is the "One-at-a-time" algorithm as published
176 : by Bob Jenkins on http://burtleburtle.net/bob/hash/doobs.html.
177 :
178 : *******************************************************************************/
179 :
180 137431584 : static inline size_t update_hash(size_t hash, uint8_t byte) // folds one more byte into a running hash and returns the new running value
181 : {
182 137431584 : hash += byte;
183 137431584 : hash += (hash << 10);
184 137431584 : hash ^= (hash >> 6);
185 : // all arithmetic is done in size_t, so the value depends on the width of size_t
186 137431584 : return hash;
187 : }
188 :
189 5973911 : static inline size_t finish_hash(size_t hash) // final mixing step, applied once after the last update_hash() call
190 : {
191 5973911 : hash += (hash << 3);
192 5973911 : hash ^= (hash >> 11);
193 5973911 : hash += (hash << 15);
194 : // the result is the value stored with the string and compared during intern table lookups
195 5973911 : return hash;
196 : }
197 :
198 :
199 : //****************************************************************************//
200 : //***** UTF-8 STRING *****//
201 : //****************************************************************************//
202 :
203 : // create & intern string
204 :
205 : // Builds a new utf8 string from UTF-8 text read through Iterator.
206 : // Only allocates a new string if the string was not already interned.
207 : template<typename Iterator>
208 : struct FromUtf8Builder : utf8::VisitorBase<Utf8String, utf8::ABORT_ON_ERROR> {
209 5949026 : FromUtf8Builder(Iterator text, size_t utf8_size)
210 5949026 : : _hash(0), _utf8_size(utf8_size), _utf16_size(0), _text(text) {}
211 :
212 : /// interface to utf8::transform
213 :
214 : typedef Utf8String ReturnType;
215 : // every byte is folded into the running hash
216 136777027 : void utf8 (uint8_t c) {
217 136777027 : _hash = update_hash(_hash, c);
218 136777027 : }
219 : // only the number of UTF-16 units is recorded, the decoded value is not used
220 133237744 : void utf16(uint16_t c) {
221 133237744 : _utf16_size++;
222 133237744 : }
223 : // finalizes the hash and interns the text; the returned string comes from the table entry
224 5949026 : Utf8String finish() {
225 5949026 : _hash = finish_hash(_hash);
226 :
227 5949026 : return intern_table.intern(*this).get_string();
228 : }
229 : // error result is a null string (presumably requested by utf8::transform for invalid input - confirm)
230 0 : Utf8String abort() {
231 0 : return 0;
232 : }
233 :
234 : /// interface to HashTable
235 :
236 24413679 : size_t hash() const { return _hash; }
237 :
238 : /// interface to InternTableEntry
239 : // marks this key as UTF-8 text so that entries compare it byte by byte
240 12515627 : utf8_tag tag() const { return utf8_tag(); }
241 :
242 12515627 : Iterator begin() const { return _text; }
243 :
244 12515627 : size_t size() const { return _utf8_size; }
245 : // allocates the string and copies the text into it; reached through InternedUtf8String::set_occupied
246 1895559 : Utf8String get_string() const {
247 1895559 : Utf8String::Data *u = Utf8String::alloc(_hash, _utf8_size, _utf16_size);
248 1895559 : char *cs = u->text;
249 : // copy through the iterator, then terminate using the extra byte reserved by alloc()
250 1895559 : cs = std::copy(_text, _text + _utf8_size, cs);
251 1895559 : *cs = '\0';
252 :
253 1895559 : return (utf*) u;
254 : }
255 : private:
256 : size_t _hash; // running hash, final once finish() has run
257 : size_t _utf8_size; // number of input bytes, given by the caller
258 : size_t _utf16_size; // number of UTF-16 units, counted by utf16()
259 : Iterator _text; // start of the input text
260 : };
261 :
262 :
263 : // Builds a new utf8 string from an utf16 string.
264 : // Only allocates a new string if the string was not already interned.
265 : template<typename Iterator>
266 : struct FromUtf16Builder : utf8::VisitorBase<Utf8String, utf8::ABORT_ON_ERROR> {
267 9978 : FromUtf16Builder(Iterator text, size_t utf16_size)
268 9978 : : _hash(0), _utf8_size(0), _utf16_size(utf16_size), _text(text) {}
269 :
270 : /// interface to utf8::transform
271 :
272 : typedef Utf8String ReturnType;
273 : // every produced UTF-8 byte is hashed and counted; the count sizes the allocation in get_string()
274 530355 : void utf8 (uint8_t c) {
275 530355 : _hash = update_hash(_hash, c);
276 530355 : _utf8_size++;
277 530355 : }
278 : // finalizes the hash and interns the text; the returned string comes from the table entry
279 9978 : Utf8String finish() {
280 9978 : _hash = finish_hash(_hash);
281 :
282 9978 : return intern_table.intern(*this).get_string();
283 : }
284 : // error result is a null string
285 : Utf8String abort() {
286 : return 0;
287 : }
288 :
289 : /// interface to HashTable
290 :
291 57460 : size_t hash() const { return _hash; }
292 :
293 : /// interface to InternTableEntry
294 : // marks this key as UTF-16 text so that entries compare it unit by unit
295 37504 : utf16_tag tag() const { return utf16_tag(); }
296 :
297 37504 : Iterator begin() const { return _text; }
298 :
299 37504 : size_t size() const { return _utf16_size; }
300 : // allocates the string and encodes the UTF-16 input into it; reached through InternedUtf8String::set_occupied
301 1997 : Utf8String get_string() const {
302 1997 : Utf8String::Data *u = Utf8String::alloc(_hash, _utf8_size, _utf16_size);
303 1997 : char *cs = u->text;
304 : // cs keeps pointing at the start of the text, so the terminator is placed by index
305 1997 : utf16::encode(_text, _text + _utf16_size, cs);
306 1997 : cs[_utf8_size] = '\0';
307 :
308 1997 : return (utf*) u;
309 : }
310 : private:
311 : size_t _hash; // running hash over the produced UTF-8 bytes
312 : size_t _utf8_size; // number of UTF-8 bytes, counted by utf8()
313 : size_t _utf16_size; // number of input UTF-16 units, given by the caller
314 : Iterator _text; // start of the input text
315 : };
316 :
317 :
318 : template<typename Iterator>
319 5949026 : static inline Utf8String string_from_utf8(const char *cs, size_t size) { // interns the size bytes at cs, read through Iterator
320 5949026 : Iterator begin = cs;
321 5949026 : Iterator end = cs + size;
322 : // the builder hashes the text during the transform and interns it in finish()
323 5949026 : return utf8::transform(begin, end, FromUtf8Builder<Iterator>(begin, size));
324 : }
325 :
326 : template<typename Iterator>
327 9978 : static inline Utf8String string_from_utf16(const uint16_t *cs, size_t size) { // interns the size UTF-16 units at cs, read through Iterator
328 9978 : Iterator begin = cs;
329 9978 : Iterator end = cs + size;
330 : // the builder hashes and counts the UTF-8 bytes during the transform and interns the text in finish()
331 9978 : return utf16::transform(begin, end, FromUtf16Builder<Iterator>(begin, size));
332 : }
333 :
334 :
335 5909455 : Utf8String Utf8String::from_utf8(const char *cs, size_t sz) { // interns sz bytes of UTF-8 text unchanged
336 5909455 : return string_from_utf8<const char*>(cs, sz);
337 : }
338 :
339 740 : Utf8String Utf8String::from_utf8_dot_to_slash(const char *cs, size_t sz) { // like from_utf8, but reads the text through utf8::DotToSlash (presumably mapping '.' to '/' - confirm)
340 740 : return string_from_utf8<utf8::DotToSlash>(cs, sz);
341 : }
342 :
343 38831 : Utf8String Utf8String::from_utf8_slash_to_dot(const char *cs, size_t sz) { // like from_utf8, but reads the text through utf8::SlashToDot (presumably mapping '/' to '.' - confirm)
344 38831 : return string_from_utf8<utf8::SlashToDot>(cs, sz);
345 : }
346 :
347 0 : Utf8String Utf8String::from_utf8_slash_to_dot(Utf8String u) { // overload taking an existing string; re-interns its bytes read through utf8::SlashToDot
348 0 : return string_from_utf8<utf8::SlashToDot>(u.begin(), u.size());
349 : }
350 :
351 5504 : Utf8String Utf8String::from_utf16(const uint16_t *cs, size_t sz) { // interns the UTF-8 form of sz UTF-16 units
352 5504 : return string_from_utf16<const uint16_t*>(cs, sz);
353 : }
354 :
355 4474 : Utf8String Utf8String::from_utf16_dot_to_slash(const uint16_t *cs, size_t sz) { // like from_utf16, but reads the text through utf16::DotToSlash (presumably mapping '.' to '/' - confirm)
356 4474 : return string_from_utf16<utf16::DotToSlash>(cs, sz);
357 : }
358 :
359 : /* Utf8String::utf16_iterator **************************************************
360 :
361 : A forward iterator over the utf16 codepoints in a Utf8String
362 :
363 : *******************************************************************************/
364 :
365 672050 : uint16_t Utf8String::utf16_iterator::operator*() // yields the UTF-16 unit decoded at the iterator's current position
366 : {
367 672050 : return utf8::decode_char(next); // NOTE(review): whether decode_char advances `next` is not visible here - check its signature
368 : }
369 :
370 :
371 : /* Utf8String::substring *******************************************************
372 :
373 : Returns the interned string made of the bytes in the range [from, to);
374 : the one-argument form extends to the end of this string.
375 :
376 : *******************************************************************************/
377 :
378 569 : Utf8String Utf8String::substring(size_t from) const
379 : {
380 569 : return substring(from, size()); // delegate with `to` set to the full length
381 : }
382 :
383 750 : Utf8String Utf8String::substring(size_t from, size_t to) const // interns the bytes [from, to) of this string as a new Utf8String
384 : {
385 : EXPENSIVE_ASSERT(_data);
386 : EXPENSIVE_ASSERT(from > 0); // NOTE(review): rejects offset 0 although the code below would handle it - confirm this is intended
387 : EXPENSIVE_ASSERT(from <= to);
388 : EXPENSIVE_ASSERT(to <= size());
389 : // offsets are byte offsets into the UTF-8 text, not character indices
390 750 : return Utf8String::from_utf8(begin() + from, to - from);
391 : }
392 :
393 1918054 : bool Utf8String::is_valid_name() const { // true unless the text contains a byte below 0x20 or the two-byte sequence 0xc0 0x80
394 1918054 : Utf8String::byte_iterator it = this->begin();
395 1918054 : Utf8String::byte_iterator end = this->end();
396 :
397 30307314 : for (; it != end; it++) {
398 28389260 : unsigned char c = *it; // compare as unsigned so that bytes >= 0x80 are not seen as negative
399 :
400 28389260 : if (c < 0x20)
401 0 : return false; // disallow control characters
402 28389260 : if (c == 0xc0 && ((unsigned char) it[1]) == 0x80) // it[1] may be one past the last byte; the builders in this file store a NUL there
403 0 : return false; // disallow zero
404 : }
405 :
406 1918054 : return true;
407 : }
408 :
409 : //****************************************************************************//
410 : //***** PUBLIC UTF-8 FUNCTIONS *****//
411 : //****************************************************************************//
412 :
413 : /* Utf8String::initialize ******************************************************
414 :
415 : Initializes the utf8 subsystem. (The function itself is defined near the top of this file.)
416 :
417 : *******************************************************************************/
418 :
419 : /* utf8::num_codepoints ********************************************************
420 :
421 : Count number of UTF-16 code points in UTF-8 string.
422 :
423 : Returns -1 on error
424 :
425 : *******************************************************************************/
426 :
427 : struct SafeCodePointCounter : utf8::VisitorBase<long, utf8::ABORT_ON_ERROR> { // visitor that counts utf16() callbacks and yields -1 when aborted
428 : typedef long ReturnType;
429 :
430 13699 : SafeCodePointCounter() : count(0) {}
431 : // one call per decoded UTF-16 unit; the value itself is ignored
432 356173 : void utf16(uint16_t) { count++; }
433 :
434 13699 : long finish() { return count; }
435 0 : long abort() { return -1; } // error marker, cannot collide with a valid count
436 : private:
437 : long count;
438 : };
439 :
440 13699 : long utf8::num_codepoints(const char *cs, size_t sz) { // number of UTF-16 units in the sz bytes at cs, or -1 if the counter was aborted
441 13699 : return utf8::transform(cs, cs + sz, SafeCodePointCounter());
442 : }
443 :
444 : /* utf8::num_bytes *************************************************************
445 :
446 : Calculate how many bytes a UTF-8 encoded version of a UTF-16 string
447 : would need.
448 :
449 : *******************************************************************************/
450 :
451 : struct ByteCounter : utf8::VisitorBase<size_t, utf8::IGNORE_ERRORS> { // visitor that counts utf8() callbacks; it defines no abort() and uses the IGNORE_ERRORS policy
452 : typedef size_t ReturnType;
453 :
454 0 : ByteCounter() : count(0) {}
455 : // one call per UTF-8 byte; the value itself is ignored
456 0 : void utf8(uint8_t) { count++; }
457 :
458 0 : size_t finish() { return count; }
459 : private:
460 : size_t count;
461 : };
462 :
463 0 : size_t utf8::num_bytes(const uint16_t *cs, size_t sz) // number of UTF-8 bytes reported while transforming the sz UTF-16 units at cs
464 : {
465 0 : return utf16::transform(cs, cs + sz, ByteCounter());
466 : }
467 :
468 :
469 : /***
470 : * Compute the hash of a UTF-16 string.
471 : * The hash will be the same as for the UTF-8 encoded version of this string
472 : */
473 : struct Utf16Hasher : utf16::VisitorBase<size_t> { // visitor that hashes the UTF-8 bytes reported for UTF-16 input
474 : typedef size_t ReturnType;
475 :
476 14907 : Utf16Hasher() : hash(0) {}
477 : // same per-byte step as the string builders use, hence the matching hash values
478 124202 : void utf8(uint8_t c) {
479 124202 : hash = update_hash(hash, c);
480 124202 : }
481 :
482 14907 : size_t finish() { return finish_hash(hash); }
483 : private:
484 : size_t hash; // running, not yet finalized hash
485 : };
486 :
487 14907 : size_t utf8::compute_hash(const uint16_t *cs, size_t sz) { // hash of the sz UTF-16 units at cs, computed over their UTF-8 bytes
488 14907 : return utf16::transform(cs, cs + sz, Utf16Hasher());
489 : }
490 :
491 :
492 : //****************************************************************************//
493 : //***** GLOBAL UTF8-STRING CONSTANTS *****//
494 : //****************************************************************************//
495 :
496 : #define UTF8( NAME, STR ) Utf8String utf8::NAME;
497 : #include "vm/utf8.inc" // with the macro above, each UTF8 entry becomes the definition of one utf8:: constant; values are assigned in Utf8String::initialize
498 :
499 : ////////////////////////////////////////////////////////////////////////////////
500 : ////////////////////////////////////////////////////////////////////////////////
501 : // LEGACY C API
502 : ////////////////////////////////////////////////////////////////////////////////
503 : ////////////////////////////////////////////////////////////////////////////////
504 :
505 0 : extern const char *utf8_text(utf *u) { return Utf8String(u).begin(); } // start of the string's bytes
506 0 : extern const char *utf8_end (utf *u) { return Utf8String(u).end(); } // end iterator of the string's bytes
507 : // each accessor wraps the raw utf* in a Utf8String and forwards to the matching member
508 0 : extern size_t utf8_size(utf *u) { return Utf8String(u).size(); } // size as reported by Utf8String::size()
509 0 : extern size_t utf8_hash(utf *u) { return Utf8String(u).hash(); } // hash as reported by Utf8String::hash()
510 :
511 : /* utf_display_printable_ascii *************************************************
512 :
513 : Write utf symbol to stdout (for debugging purposes).
514 : Non-printable and non-ASCII characters are printed as '?'.
515 :
516 : *******************************************************************************/
517 :
518 : struct DisplayPrintableAscii : utf8::VisitorBase<void, utf8::IGNORE_ERRORS> { // visitor that writes a string to a FILE*, replacing everything outside printable ASCII with '?'
519 : typedef void ReturnType;
520 : // dst is the stream written to; it is flushed by finish() but never closed here
521 0 : DisplayPrintableAscii(FILE *dst) : _dst(dst) {}
522 : // printable ASCII is 0x20..0x7e; 127 (DEL) is a control character and is therefore replaced as well
523 0 : void utf8(uint8_t c) {
524 0 : fputc((c >= 32 && c < 127) ? c : '?', _dst);
525 0 : }
526 :
527 0 : void finish() {fflush(_dst);}
528 : private:
529 : FILE *_dst;
530 : };
531 :
532 0 : void utf_display_printable_ascii(Utf8String u)
533 : {
534 0 : if (u == NULL) { // a null string is shown as the literal text NULL
535 0 : printf("NULL");
536 0 : fflush(stdout);
537 0 : return;
538 : }
539 : // the visitor writes to stdout and flushes it when the transform finishes
540 0 : utf8::transform(u, DisplayPrintableAscii(stdout));
541 : }
542 :
543 :
544 : /* utf_display_printable_ascii_classname ***************************************
545 :
546 : Write utf symbol to stdout with `/' converted to `.' (for debugging
547 : purposes).
548 : Non-printable and non-ASCII characters are printed as '?'.
549 :
550 : *******************************************************************************/
551 :
552 0 : void utf_display_printable_ascii_classname(Utf8String u)
553 : {
554 0 : if (u == NULL) { // a null string is shown as the literal text NULL
555 0 : printf("NULL");
556 0 : fflush(stdout);
557 0 : return;
558 : }
559 : // same as utf_display_printable_ascii, but the text is passed through utf8::slash_to_dot first
560 0 : utf8::transform(utf8::slash_to_dot(u), DisplayPrintableAscii(stdout));
561 : }
562 :
563 :
564 : /* utf_sprint_convert_to_latin1 ************************************************
565 :
566 : Write utf symbol into c-string (for debugging purposes).
567 : Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
568 : invalid results.
569 :
570 : *******************************************************************************/
571 :
572 : struct SprintConvertToLatin1 : utf8::VisitorBase<void, utf8::IGNORE_ERRORS> { // visitor that writes one char per UTF-16 unit into a caller-supplied buffer
573 : typedef void ReturnType;
574 : // dst must have room for every unit plus the terminator; no length is passed in or checked
575 0 : SprintConvertToLatin1(char* dst) : _dst(dst) {}
576 : // the 16-bit unit is narrowed to char, so values that do not fit are stored truncated
577 0 : void utf16(uint16_t c) { *_dst++ = c; }
578 :
579 0 : void finish() { *_dst = '\0'; } // terminate the output
580 : private:
581 : char *_dst; // next write position
582 : };
583 :
584 0 : void utf_sprint_convert_to_latin1(char *buffer, Utf8String u) // buffer receives the converted text; its capacity is not checked
585 : {
586 0 : if (!u) { // a null string is rendered as the literal text NULL
587 0 : strcpy(buffer, "NULL");
588 0 : return;
589 : }
590 : // the visitor writes one char per UTF-16 unit and terminates the output
591 0 : utf8::transform(u, SprintConvertToLatin1(buffer));
592 : }
593 :
594 :
595 : /* utf_sprint_convert_to_latin1_classname **************************************
596 :
597 : Write utf symbol into c-string with `/' converted to `.' (for debugging
598 : purposes).
599 : Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
600 : invalid results.
601 :
602 : *******************************************************************************/
603 :
604 0 : void utf_sprint_convert_to_latin1_classname(char *buffer, Utf8String u) // buffer receives the converted text; its capacity is not checked
605 : {
606 0 : if (!u) { // a null string is rendered as the literal text NULL
607 0 : strcpy(buffer, "NULL");
608 0 : return;
609 : }
610 : // same as utf_sprint_convert_to_latin1, but the text is passed through utf8::slash_to_dot first
611 0 : utf8::transform(utf8::slash_to_dot(u), SprintConvertToLatin1(buffer));
612 : }
613 :
614 :
615 : /* utf_strcat_convert_to_latin1 ************************************************
616 :
617 : Like libc strcat, but uses an utf8 string.
618 : Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
619 : invalid results.
620 :
621 : *******************************************************************************/
622 :
623 0 : void utf_strcat_convert_to_latin1(char *buffer, utf *u) // buffer must already hold a NUL-terminated string
624 : {
625 0 : utf_sprint_convert_to_latin1(buffer + strlen(buffer), u); // write at the current end of buffer; a null u appends the text NULL
626 0 : }
627 :
628 :
629 : /* utf_strcat_convert_to_latin1_classname **************************************
630 :
631 : Like libc strcat, but uses an utf8 string.
632 : Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
633 : invalid results.
634 :
635 : *******************************************************************************/
636 :
637 0 : void utf_strcat_convert_to_latin1_classname(char *buffer, Utf8String u) // buffer must already hold a NUL-terminated string
638 : {
639 0 : utf_sprint_convert_to_latin1_classname(buffer + strlen(buffer), u); // write at the current end of buffer; a null u appends the text NULL
640 0 : }
641 :
642 :
643 : /* utf_fprint_printable_ascii **************************************************
644 :
645 : Write utf symbol into file.
646 : Non-printable and non-ASCII characters are printed as '?'.
647 :
648 : *******************************************************************************/
649 :
650 0 : void utf_fprint_printable_ascii(FILE *file, Utf8String u)
651 : {
652 0 : if (!u) return; // unlike the display variants, a null string writes nothing
653 : // the visitor writes to file and flushes it when the transform finishes
654 0 : utf8::transform(u, DisplayPrintableAscii(file));
655 : }
656 :
657 :
658 : /* utf_fprint_printable_ascii_classname ****************************************
659 :
660 : Write utf symbol into file with `/' converted to `.'.
661 : Non-printable and non-ASCII characters are printed as '?'.
662 :
663 : *******************************************************************************/
664 :
665 0 : void utf_fprint_printable_ascii_classname(FILE *file, Utf8String u)
666 : {
667 0 : if (!u) return; // unlike the display variants, a null string writes nothing
668 : // same as utf_fprint_printable_ascii, but the text is passed through utf8::slash_to_dot first
669 0 : utf8::transform(utf8::slash_to_dot(u), DisplayPrintableAscii(file));
670 : }
671 :
672 : const size_t Utf8String::sizeof_utf = sizeof(Utf8String::Data); // size of the Data record as reported by sizeof, exported as a constant
673 :
674 : namespace cacao {
675 :
676 : // OStream operators: a Utf8String is printed as its raw bytes, a null string as "(nil)"
677 0 : OStream& operator<<(OStream& os, const Utf8String &u) {
678 0 : return os << (u ? u.begin() : "(nil)");
679 : }
680 :
681 495 : } // end namespace cacao
682 :
683 : /*
684 : * These are local overrides for various environment variables in Emacs.
685 : * Please do not remove this and leave it at the end of the file, where
686 : * Emacs will automagically detect them.
687 : * ---------------------------------------------------------------------
688 : * Local variables:
689 : * mode: c++
690 : * indent-tabs-mode: t
691 : * c-basic-offset: 4
692 : * tab-width: 4
693 : * End:
694 : * vim:noexpandtab:sw=4:ts=4:
695 : */
|