Line data Source code
1 : /* src/toolbox/utf8_transform.inc - implementation of utf8 decoder
2 :
3 : Copyright (C) 1996-2013
4 : CACAOVM - Verein zur Foerderung der freien virtuellen Maschine CACAO
5 :
6 : This file is part of CACAO.
7 :
8 : This program is free software; you can redistribute it and/or
9 : modify it under the terms of the GNU General Public License as
10 : published by the Free Software Foundation; either version 2, or (at
11 : your option) any later version.
12 :
13 : This program is distributed in the hope that it will be useful, but
14 : WITHOUT ANY WARRANTY; without even the implied warranty of
15 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 : General Public License for more details.
17 :
18 : You should have received a copy of the GNU General Public License
19 : along with this program; if not, write to the Free Software
20 : Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
21 : 02110-1301, USA.
22 :
23 : */
24 :
25 : #ifndef UTF8_TRANSFORM_INC
26 : #define UTF8_TRANSFORM_INC 1
27 :
28 : namespace utf8 {
29 : namespace impl {
30 : struct CopyUtf8ToUtf16 : utf8::VisitorBase<bool, ABORT_ON_ERROR> {
31 : typedef bool ReturnType;
32 :
33 84654 : CopyUtf8ToUtf16(uint16_t *dst) : dst(dst) {}
34 :
35 3259302 : inline void utf16(uint16_t c) { *dst++ = c; }
36 :
37 84654 : inline bool finish() { return true; }
38 0 : inline bool abort() { return false; }
39 : private:
40 : uint16_t *dst;
41 : };
42 : } // end namespace impl
43 : } // end namespace utf8
44 :
45 : template<typename Iterator, typename Fn>
46 6046564 : inline typename Fn::ReturnType utf8::transform(Iterator it, Iterator end, Fn fn) {
47 : using namespace ::utf8::impl;
48 :
49 : #define UTF8_HANDLE_ERROR { \
50 : if (fn.error_action() == ABORT_ON_ERROR) { \
51 : return fn.abort(); \
52 : } \
53 : }
54 :
55 148895039 : while (it != end) {
56 136801911 : unsigned byte = *it++;
57 :
58 136801911 : if (byte & 0x80) {
59 : // highest bit set, non-ASCII character
60 :
61 3911496 : if ((byte & 0xe0) == 0xc0) {
62 : // 2-byte: should be 110..... 10......
63 :
64 816736 : if (it == end)
65 0 : UTF8_HANDLE_ERROR
66 :
67 816736 : unsigned byte2 = *it++;
68 :
69 816736 : if ((byte2 & 0xc0) != 0x80)
70 0 : UTF8_HANDLE_ERROR
71 :
72 816736 : fn.utf8(byte);
73 816736 : fn.utf8(byte2);
74 816736 : fn.utf16(((byte & 0x1f) << 6) | (byte2 & 0x3f));
75 3094760 : } else if ((byte & 0xf0) == 0xe0) {
76 : // 3-byte: should be 1110.... 10...... 10......
77 :
78 3094760 : if (it + 2 > end)
79 0 : UTF8_HANDLE_ERROR
80 :
81 3094760 : unsigned byte2 = *it++;
82 :
83 3094760 : if ((byte2 & 0xc0) != 0x80)
84 0 : UTF8_HANDLE_ERROR
85 :
86 3094760 : unsigned byte3 = *it++;
87 :
88 3094760 : if ((byte3 & 0xc0) != 0x80)
89 0 : UTF8_HANDLE_ERROR
90 :
91 3094760 : fn.utf8(byte);
92 3094760 : fn.utf8(byte2);
93 3094760 : fn.utf8(byte3);
94 3094760 : fn.utf16(((byte & 0x0f) << 12) | ((byte2 & 0x3f) << 6) | (byte3 & 0x3f));
95 : } else {
96 0 : UTF8_HANDLE_ERROR
97 : }
98 : } else {
99 : // Java forbids zero bytes in UTF8
100 132890415 : if (byte == 0)
101 0 : UTF8_HANDLE_ERROR
102 :
103 : // ASCII character: highest bit not set, at least one other bit set
104 132890415 : fn.utf8(byte);
105 132890415 : fn.utf16(byte);
106 : }
107 : }
108 :
109 6046564 : return fn.finish();
110 :
111 : #undef UTF8_HANDLE_ERROR
112 : }
113 :
114 :
115 654039 : inline uint16_t utf8::decode_char(const char*& src) {
116 : uint16_t ch1, ch2, ch3;
117 :
118 654039 : ch1 = src[0];
119 :
120 654039 : switch (((uint8_t) ch1) >> 4) {
121 : default: // 1 byte (ASCII)
122 653313 : src++;
123 653313 : return ch1;
124 : case 0xC:
125 : case 0xD: // 2 bytes
126 : // mask out non-data bits
127 723 : ch1 = ch1 & 0x1F;
128 723 : ch2 = src[1] & 0x3F;
129 723 : src += 2;
130 :
131 : // stitch together data bits from individual bytes
132 723 : return (ch1 << 6) | ch2;
133 : case 0xE: // 3 bytes
134 : // mask out non-data bits
135 3 : ch1 = ch1 & 0x1F;
136 3 : ch2 = src[1] & 0x3F;
137 3 : ch3 = src[2] & 0x3F;
138 3 : src += 3;
139 :
140 : // stitch together data bits from individual bytes
141 3 : return (ch1 << 12) | (ch2 << 6) | ch3;
142 : }
143 : }
144 :
145 : template<typename Utf8Iterator>
146 84654 : inline bool utf8::decode(Utf8Iterator begin, Utf8Iterator end, uint16_t *dst) {
147 84654 : return ::utf8::transform(begin, end, ::utf8::impl::CopyUtf8ToUtf16(dst));
148 : }
149 :
150 : #endif // UTF8_TRANSFORM_INC
151 :
152 :
153 : /*
154 : * These are local overrides for various environment variables in Emacs.
155 : * Please do not remove this and leave it at the end of the file, where
156 : * Emacs will automagically detect them.
157 : * ---------------------------------------------------------------------
158 : * Local variables:
159 : * mode: c++
160 : * indent-tabs-mode: t
161 : * c-basic-offset: 4
162 : * tab-width: 4
163 : * End:
164 : * vim:noexpandtab:sw=4:ts=4:
165 : */
|