Line data Source code
1 : /* src/toolbox/utf_utils.hpp - functions for handling utf8/utf16
2 :
3 : Copyright (C) 1996-2013
4 : CACAOVM - Verein zur Foerderung der freien virtuellen Maschine CACAO
5 :
6 : This file is part of CACAO.
7 :
8 : This program is free software; you can redistribute it and/or
9 : modify it under the terms of the GNU General Public License as
10 : published by the Free Software Foundation; either version 2, or (at
11 : your option) any later version.
12 :
13 : This program is distributed in the hope that it will be useful, but
14 : WITHOUT ANY WARRANTY; without even the implied warranty of
15 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 : General Public License for more details.
17 :
18 : You should have received a copy of the GNU General Public License
19 : along with this program; if not, write to the Free Software
20 : Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
21 : 02110-1301, USA.
22 :
23 : */
24 :
25 : #ifndef UTF_UTILS_HPP_
26 : #define UTF_UTILS_HPP_ 1
27 :
#include <cassert>
#include <cstddef>
#include <stdint.h>
#include <iterator>
31 :
32 : // TODO: Maybe rename transform functions, this is not a transform in the STL sense.
33 : // It's a reduction, like std::accumulate.
34 :
35 : namespace utf_utils {
/***
 * A STL style read-only forward iterator.
 * Iterates over a `const Char*` but yields '.' wherever the underlying
 * sequence contains '/'. Every other code unit is passed through unchanged.
 *
 * The iterator only stores the pointer; it does not own the sequence and
 * performs no bounds checks.
 */
template<typename Char>
struct SlashToDot {
	typedef std::forward_iterator_tag iterator_category;
	typedef const Char*               pointer;
	typedef const Char&               reference;
	typedef Char                      value_type;
	// A signed integer type, as std::iterator_traits and std::distance
	// expect; a pointer type cannot represent a distance.
	typedef std::ptrdiff_t            difference_type;

	// Deliberately not explicit, so a raw `const Char*` can be passed
	// wherever this iterator is expected.
	SlashToDot(const Char *cs) : cs(cs) {}

	// Comparisons look at the wrapped address only.
	bool operator==(const SlashToDot& it) const { return cs == it.cs; }
	bool operator!=(const SlashToDot& it) const { return cs != it.cs; }
	bool operator> (const SlashToDot& it) const { return cs >  it.cs; }

	// Pre-increment: step to the next code unit.
	SlashToDot& operator++() {
		++cs;
		return *this;
	}

	// Post-increment: step forward, return the previous position.
	SlashToDot operator++(int) {
		SlashToDot previous(*this);
		++(*this);
		return previous;
	}

	// Iterator positioned `sz` code units further on.
	SlashToDot operator+(size_t sz) const { return SlashToDot(cs + sz); }

	// Current code unit, with '/' replaced by '.'.
	Char operator*() const {
		// Read at full width: going through a plain `char` would truncate
		// 16-bit code units (0x012F would be mistaken for '/').
		const Char c = *cs;

		return (c == '/') ? static_cast<Char>('.') : c;
	}
private:
	const Char *cs;
};
74 :
/***
 * A STL style read-only forward iterator.
 * Iterates over a `const Char*` but yields '/' wherever the underlying
 * sequence contains '.'. Every other code unit is passed through unchanged.
 *
 * The iterator only stores the pointer; it does not own the sequence and
 * performs no bounds checks.
 */
template<typename Char>
struct DotToSlash {
	typedef std::forward_iterator_tag iterator_category;
	typedef const Char*               pointer;
	typedef const Char&               reference;
	typedef Char                      value_type;
	// A signed integer type, as std::iterator_traits and std::distance
	// expect; a pointer type cannot represent a distance.
	typedef std::ptrdiff_t            difference_type;

	// Deliberately not explicit, so a raw `const Char*` can be passed
	// wherever this iterator is expected.
	DotToSlash(const Char *cs) : cs(cs) {}

	// Comparisons look at the wrapped address only.
	bool operator==(const DotToSlash& it) const { return cs == it.cs; }
	bool operator!=(const DotToSlash& it) const { return cs != it.cs; }
	bool operator> (const DotToSlash& it) const { return cs >  it.cs; }

	// Pre-increment: step to the next code unit.
	DotToSlash& operator++() {
		++cs;
		return *this;
	}

	// Post-increment: step forward, return the previous position.
	DotToSlash operator++(int) {
		DotToSlash previous(*this);
		++(*this);
		return previous;
	}

	// Iterator positioned `sz` code units further on.
	DotToSlash operator+(size_t sz) const { return DotToSlash(cs + sz); }

	// Current code unit, with '.' replaced by '/'.
	Char operator*() const {
		// Read at full width: going through a plain `char` would truncate
		// 16-bit code units (0x012E would be mistaken for '.').
		const Char c = *cs;

		return (c == '.') ? static_cast<Char>('/') : c;
	}
private:
	const Char *cs;
};
113 :
/***
 * Helper that wraps a pair of iterators so that they can be handed around
 * as a single object exposing begin() and end().
 */
template<typename Iterator>
struct Range {
	// Builds the range from anything that offers begin()/end() whose results
	// convert to Iterator.
	// NOTE: `t` is received by value and the stored iterators are taken from
	// that copy, so T must be a type whose iterators stay valid after the
	// copy is destroyed (a non-owning view or handle, not an owning container).
	template<typename T>
	Range(T t) : _begin(t.begin()), _end(t.end()) {}

	// Builds the range from an explicit iterator pair.
	Range(Iterator b, Iterator e) : _begin(b), _end(e) {}

	// Accessors are const so that a const Range can be iterated as well.
	Iterator begin() const { return _begin; }
	Iterator end()   const { return _end; }
private:
	Iterator _begin, _end;
};
128 : }
129 :
130 : namespace utf8 {
// Policy telling the decoder what to do when it encounters an error.
enum ErrorAction {
	// Invalid input leads to undefined behaviour.
	IGNORE_ERRORS  = 0,

	// The decoding is aborted and the result of
	// Visitor::abort() is returned.
	ABORT_ON_ERROR = 1
};
138 :
139 : /***
140 : * utf8::transform
141 : *
142 : * Iterates over an UTF-8 string and calls a visitor for every UTF-8 byte and
143 : * UTF-16 codepoint encountered.
144 : * How the visitor handles errors is controlled via the enum ErrorAction.
145 : *
146 : * A visitor must conform to the following interface:
147 : * (The class VisitorBase stubs out all of these methods and can be used
148 : * as a convenient base class)
149 : *
150 : * struct Visitor {
151 : * typedef ... ReturnType;
152 : *
153 : * ErrorAction error_action(); // called when an error is encountered
154 : *
155 : * void utf8(uint8_t); // called for every valid UTF-8 byte
156 : * void utf16(uint16_t); // called for every valid UTF-16 codepoint
157 : *
158 : * ReturnType finish(); // called on success
159 : * ReturnType abort(); // called on error
160 : * // (iff ErrorAction is ABORT_ON_ERROR)
161 : * };
162 : *
163 : * @Cpp11 Use decltype to get return type of Fn::finish without forcing
164 : * Fn to explicitly contain a typedef.
165 : * We could do this now with GCCs typeof, but that's non-standard.
166 : */
167 : template<typename Iterator, typename Fn>
168 : typename Fn::ReturnType transform(Iterator begin, Iterator end, Fn);
169 :
170 : template<typename T, typename Fn>
171 0 : inline typename Fn::ReturnType transform(T t, Fn fn) {
172 0 : return ::utf8::transform(t.begin(), t.end(), fn);
173 : }
174 :
175 :
176 : /***
177 : * Handy base class for implementing visitors
178 : */
179 : template<typename ReturnType, ErrorAction action>
180 6058538 : struct VisitorBase {
181 0 : ErrorAction error_action() const { return action; }
182 :
183 7066348 : void utf8(uint8_t) const {}
184 660364 : void utf16(uint16_t) const {}
185 :
186 : ReturnType finish() const { return ReturnType(); }
187 0 : ReturnType abort() const { return ReturnType(); }
188 : };
189 :
/***
 * Decodes one UTF-16 code unit from the input and automatically advances
 * the input pointer to the start of the next one.
 *
 * Input MUST be valid UTF-8.
 */
196 : uint16_t decode_char(const char*&);
197 :
/***
 * Tells whether a byte is valid ASCII, i.e. lies in [0, 127].
 */
inline bool is_ascii(uint8_t c)
{
	// ASCII bytes are exactly those with the top bit clear.
	return (c & 0x80) == 0;
}
202 :
203 : /***
204 : * decode utf8 string into utf16 string, destination must have enough space.
205 : * returns false on error
206 : */
207 : template<typename Utf8Iterator>
208 : inline bool decode(Utf8Iterator begin, Utf8Iterator end, uint16_t *dst);
209 :
210 : typedef utf_utils::SlashToDot<char> SlashToDot;
211 : typedef utf_utils::DotToSlash<char> DotToSlash;
212 :
213 : /***
214 : * Wrap iterators of container with SlashToDot
215 : */
216 : template<typename T>
217 0 : utf_utils::Range<SlashToDot> slash_to_dot(T t) { return utf_utils::Range<SlashToDot>(t); }
218 :
219 : template<typename It>
220 : utf_utils::Range<SlashToDot> slash_to_dot(It a, It b) { return utf_utils::Range<SlashToDot>(a, b); }
221 :
222 : /***
223 : * Wrap iterators of container with DotToSlash
224 : */
225 : template<typename T>
226 : utf_utils::Range<DotToSlash> dot_to_slash(T t) { return utf_utils::Range<DotToSlash>(t); }
227 :
228 : template<typename It>
229 : utf_utils::Range<DotToSlash> dot_to_slash(It a, It b) { return utf_utils::Range<DotToSlash>(a, b); }
230 :
231 : } // end namespace utf8
232 :
233 : namespace utf16 {
234 : /***
235 : * utf16::transform
236 : *
237 : * Iterates over an UTF-16 string and calls a visitor for every UTF-8 byte and
238 : * UTF-16 codepoint encountered.
239 : *
240 : * A visitor must conform to the following interface:
241 : * (The class VisitorBase stubs out all these methods and can be used
242 : * as a convenient base class)
243 : *
244 : * struct Visitor {
245 : * typedef ... ReturnType;
246 : *
247 : * void utf8(uint8_t); // called for every UTF-8 byte
248 : * void utf16(uint16_t); // called for every UTF-16 codepoint
249 : *
250 : * ReturnType finish(); // called on success
251 : * };
252 : *
253 : */
254 : template<typename Iterator, typename Fn>
255 : typename Fn::ReturnType transform(Iterator begin, Iterator end, Fn);
256 :
257 : template<typename T, typename Fn>
258 : inline typename Fn::ReturnType transform(T t, Fn fn) {
259 : return ::utf16::transform(t.begin(), t.end(), fn);
260 : }
261 :
262 :
/***
 * Convenience base class for writing visitors.
 *
 * Both callbacks are no-ops and finish() hands back a value-initialized
 * ReturnType.
 */
template<typename ReturnType>
struct VisitorBase {
	// Default callbacks: the byte / code unit is ignored.
	void utf8(uint8_t) const {
	}
	void utf16(uint16_t) const {
	}

	// Default result.
	ReturnType finish() const {
		return ReturnType();
	}
};
273 :
/***
 * Tells whether a 16-bit code unit is valid ASCII, i.e. lies in [0, 127].
 */
inline bool is_ascii(uint16_t c)
{
	return c <= 0x7F;
}
278 :
279 : /***
280 : * encode utf16 string into utf8 string, destination must have enough space.
281 : */
282 : template<typename Utf16Iterator>
283 : void encode(Utf16Iterator begin, Utf16Iterator end, char *dst);
284 :
285 : typedef utf_utils::SlashToDot<uint16_t> SlashToDot;
286 : typedef utf_utils::DotToSlash<uint16_t> DotToSlash;
287 :
288 : /***
289 : * Wrap iterators of container with SlashToDot
290 : */
291 : template<typename T>
292 : utf_utils::Range<SlashToDot> slash_to_dot(T t) { return utf_utils::Range<SlashToDot>(t); }
293 :
294 : template<typename It>
295 : utf_utils::Range<SlashToDot> slash_to_dot(It a, It b) { return utf_utils::Range<SlashToDot>(a, b); }
296 :
297 : /***
298 : * Wrap iterators of container with DotToSlash
299 : */
300 : template<typename T>
301 : utf_utils::Range<DotToSlash> dot_to_slash(T t) { return utf_utils::Range<DotToSlash>(t); }
302 :
303 : template<typename It>
304 : utf_utils::Range<DotToSlash> dot_to_slash(It a, It b) { return utf_utils::Range<DotToSlash>(a, b); }
305 :
306 : } // end namespace utf16
307 :
308 : /*******************************************************************************
309 : IMPLEMENTATION
310 : *******************************************************************************/
311 :
312 : #include "toolbox/utf8_transform.inc"
313 : #include "toolbox/utf16_transform.inc"
314 :
315 : #endif // UTF_UTILS_HPP_
316 :
317 :
318 : /*
319 : * These are local overrides for various environment variables in Emacs.
320 : * Please do not remove this and leave it at the end of the file, where
321 : * Emacs will automagically detect them.
322 : * ---------------------------------------------------------------------
323 : * Local variables:
324 : * mode: c++
325 : * indent-tabs-mode: t
326 : * c-basic-offset: 4
327 : * tab-width: 4
328 : * End:
329 : * vim:noexpandtab:sw=4:ts=4:
330 : */
|