CACAO
utf_utils.hpp
Go to the documentation of this file.
1 /* src/toolbox/utf_utils.hpp - functions for handling utf8/utf16
2 
3  Copyright (C) 1996-2013
4  CACAOVM - Verein zur Foerderung der freien virtuellen Maschine CACAO
5 
6  This file is part of CACAO.
7 
8  This program is free software; you can redistribute it and/or
9  modify it under the terms of the GNU General Public License as
10  published by the Free Software Foundation; either version 2, or (at
11  your option) any later version.
12 
13  This program is distributed in the hope that it will be useful, but
14  WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16  General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program; if not, write to the Free Software
20  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
21  02110-1301, USA.
22 
23 */
24 
25 #ifndef UTF_UTILS_HPP_
26 #define UTF_UTILS_HPP_ 1
27 
28 #include <cassert>
29 #include <stdint.h>
30 #include <iterator>
31 
32 // TODO: Maybe rename transform functions, this is not a transform in the STL sense.
33 // It's a reduction, like std::accumulate.
34 
35 namespace utf_utils {
/***
 * An STL-style read-only forward iterator.
 * Iterates over a sequence of Char but yields '.' wherever the underlying
 * sequence contains '/'. Every other value is passed through unchanged.
 *
 * Dereferencing returns the (possibly substituted) value by copy; the
 * underlying sequence is never written to.
 */
template<typename Char>
struct SlashToDot {
    typedef std::forward_iterator_tag iterator_category;
    typedef const Char*               pointer;
    typedef const Char&               reference;
    typedef Char                      value_type;
    // A difference type has to be a signed integer, not a pointer.
    // Reuse the one <iterator> already defines for the wrapped raw pointer.
    typedef typename std::iterator_traits<const Char*>::difference_type difference_type;

    SlashToDot(const Char *cs) : cs(cs) {}

    // Comparisons are made on the wrapped pointer.
    bool operator==(const SlashToDot& it) const { return cs == it.cs; }
    bool operator!=(const SlashToDot& it) const { return cs != it.cs; }
    bool operator> (const SlashToDot& it) const { return cs >  it.cs; }

    // Pre-increment: step to the next element.
    SlashToDot& operator++() {
        cs++;
        return *this;
    }

    // Post-increment: step to the next element, return the old position.
    SlashToDot operator++(int) {
        SlashToDot it(*this);
        ++(*this);
        return it;
    }

    // Returns an iterator positioned sz elements further on.
    SlashToDot operator+(size_t sz) const { return SlashToDot(cs + sz); }

    // Returns the current element, with '/' replaced by '.'.
    Char operator*() const {
        // Read as Char, not as char: Char may be wider than char (e.g. a
        // 16-bit code unit), and narrowing it would both corrupt the value
        // and make values such as 0x012F look like '/'.
        const Char c = *cs;

        return (c == static_cast<Char>('/')) ? static_cast<Char>('.') : c;
    }
private:
    const Char *cs;
};
74 
/***
 * An STL-style read-only forward iterator.
 * Iterates over a sequence of Char but yields '/' wherever the underlying
 * sequence contains '.'. Every other value is passed through unchanged.
 *
 * Dereferencing returns the (possibly substituted) value by copy; the
 * underlying sequence is never written to.
 */
template<typename Char>
struct DotToSlash {
    typedef std::forward_iterator_tag iterator_category;
    typedef const Char*               pointer;
    typedef const Char&               reference;
    typedef Char                      value_type;
    // A difference type has to be a signed integer, not a pointer.
    // Reuse the one <iterator> already defines for the wrapped raw pointer.
    typedef typename std::iterator_traits<const Char*>::difference_type difference_type;

    DotToSlash(const Char *cs) : cs(cs) {}

    // Comparisons are made on the wrapped pointer.
    bool operator==(const DotToSlash& it) const { return cs == it.cs; }
    bool operator!=(const DotToSlash& it) const { return cs != it.cs; }
    bool operator> (const DotToSlash& it) const { return cs >  it.cs; }

    // Pre-increment: step to the next element.
    DotToSlash& operator++() {
        cs++;
        return *this;
    }

    // Post-increment: step to the next element, return the old position.
    DotToSlash operator++(int) {
        DotToSlash it(*this);
        ++(*this);
        return it;
    }

    // Returns an iterator positioned sz elements further on.
    DotToSlash operator+(size_t sz) const { return DotToSlash(cs + sz); }

    // Returns the current element, with '.' replaced by '/'.
    Char operator*() const {
        // Read as Char, not as char: Char may be wider than char (e.g. a
        // 16-bit code unit), and narrowing it would both corrupt the value
        // and make values such as 0x012E look like '.'.
        const Char c = *cs;

        return (c == static_cast<Char>('.')) ? static_cast<Char>('/') : c;
    }
private:
    const Char *cs;
};
113 
/***
 * Helper that wraps a pair of iterators so that they can be passed around
 * as a single object offering begin() and end().
 */
template<typename Iterator>
struct Range {
    // Builds the range from any object offering begin() and end(); their
    // results must be convertible to Iterator.
    // NOTE(review): t is received by value, so the stored iterators are
    // taken from the copy. This presumably expects T to be a cheap,
    // non-owning handle whose iterators outlive the copy -- confirm before
    // using it with an owning container.
    template<typename T>
    Range(T t) : _begin(t.begin()), _end(t.end()) {}

    // Builds the range [b, e) from an explicit iterator pair.
    Range(Iterator b, Iterator e) : _begin(b), _end(e) {}

    // Accessors are const so that a const Range can be iterated as well.
    Iterator begin() const { return _begin; }
    Iterator end()   const { return _end; }
private:
    Iterator _begin, _end;
};
128 }
129 
130 namespace utf8 {
131  // Selects what the decoder should do when it encounters an error.
132  enum ErrorAction {
133  IGNORE_ERRORS, // Invalid input leads to undefined behaviour.
134 
135  ABORT_ON_ERROR // The decoding is aborted and the result of
136  // Visitor::abort() is returned.
137  };
138 
139  /***
140  * utf8::transform
141  *
142  * Iterates over an UTF-8 string and calls a visitor for every UTF-8 byte and
143  * UTF-16 codepoint encountered.
144  * How the visitor handles errors is controlled via the enum ErrorAction.
145  *
146  * A visitor must conform to the following interface:
147  * (The class VisitorBase stubs out all of these methods and can be used
148  * as a convenient base class)
149  *
150  * struct Visitor {
151  * typedef ... ReturnType;
152  *
153  * ErrorAction error_action(); // called when an error is encountered
154  *
155  * void utf8(uint8_t); // called for every valid UTF-8 byte
156  * void utf16(uint16_t); // called for every valid UTF-16 codepoint
157  *
158  * ReturnType finish(); // called on success
159  * ReturnType abort(); // called on error
160  * // (iff ErrorAction is ABORT_ON_ERROR)
161  * };
162  *
163  * @Cpp11 Use decltype to get return type of Fn::finish without forcing
164  * Fn to explicitly contain a typedef.
165  * We could do this now with GCCs typeof, but that's non-standard.
166  */
167  template<typename Iterator, typename Fn>
168  typename Fn::ReturnType transform(Iterator begin, Iterator end, Fn);
169 
// Convenience overload: forwards t.begin(), t.end() and fn to the
// iterator-pair version of utf8::transform and returns its result.
// Both t and fn are taken by value.
170  template<typename T, typename Fn>
171  inline typename Fn::ReturnType transform(T t, Fn fn) {
172  return ::utf8::transform(t.begin(), t.end(), fn);
173  }
174 
175 
176  /***
177  * Handy base class for implementing visitors
178  */
179  template<typename ReturnType, ErrorAction action>
180  struct VisitorBase {
181  ErrorAction error_action() const { return action; }
182 
183  void utf8(uint8_t) const {}
184  void utf16(uint16_t) const {}
185 
186  ReturnType finish() const { return ReturnType(); }
187  ReturnType abort() const { return ReturnType(); }
188  };
189 
190  /***
191  * Decodes one UTF-16 codepoint from the input and advances the input
192  * pointer to the start of the next codepoint.
193  *
194  * Input MUST be valid UTF-8.
195  */
196  uint16_t decode_char(const char*&);
197 
/***
 * Tells whether a UTF-8 byte lies in the 7-bit ASCII range (0 to 127).
 */
inline bool is_ascii(uint8_t c) {
    const uint8_t highest_ascii = 0x7F;

    return c <= highest_ascii;
}
202 
203  /***
204  * decode utf8 string into utf16 string, destination must have enough space.
205  * returns false on error
206  */
207  template<typename Utf8Iterator>
208  inline bool decode(Utf8Iterator begin, Utf8Iterator end, uint16_t *dst);
209 
212 
213  /***
214  * Wrap iterators of container with SlashToDot
215  */
216  template<typename T>
218 
219  template<typename It>
221 
222  /***
223  * Wrap iterators of container with DotToSlash
224  */
225  template<typename T>
227 
228  template<typename It>
230 
231 } // end namespace utf8
232 
233 namespace utf16 {
234  /***
235  * utf16::transform
236  *
237  * Iterates over an UTF-16 string and calls a visitor for every UTF-8 byte and
238  * UTF-16 codepoint encountered.
239  *
240  * A visitor must conform to the following interface:
241  * (The class VisitorBase stubs out all these methods and can be used
242  * as a convenient base class)
243  *
244  * struct Visitor {
245  * typedef ... ReturnType;
246  *
247  * void utf8(uint8_t); // called for every UTF-8 byte
248  * void utf16(uint16_t); // called for every UTF-16 codepoint
249  *
250  * ReturnType finish(); // called on success
251  * };
252  *
253  */
254  template<typename Iterator, typename Fn>
255  typename Fn::ReturnType transform(Iterator begin, Iterator end, Fn);
256 
// Convenience overload: forwards t.begin(), t.end() and fn to the
// iterator-pair version of utf16::transform and returns its result.
// Both t and fn are taken by value.
257  template<typename T, typename Fn>
258  inline typename Fn::ReturnType transform(T t, Fn fn) {
259  return ::utf16::transform(t.begin(), t.end(), fn);
260  }
261 
262 
/***
 * Handy base class for implementing visitors.
 *
 * Supplies a default for every member of the visitor interface, so a
 * derived visitor only has to declare the members it wants to replace:
 *   - ReturnType     is the template argument,
 *   - utf8()/utf16() do nothing,
 *   - finish()       returns a value-initialized ReturnType.
 */
template<typename Result>
struct VisitorBase {
    // A template parameter is not visible as a member type, so the typedef
    // the visitor interface asks for is published explicitly. A derived
    // visitor may still declare its own ReturnType, which then takes
    // precedence.
    typedef Result ReturnType;

    void utf8(uint8_t) const {}
    void utf16(uint16_t) const {}

    ReturnType finish() const { return ReturnType(); }
};
273 
/***
 * Tells whether a UTF-16 value lies in the 7-bit ASCII range (0 to 127).
 */
inline bool is_ascii(uint16_t c) {
    const uint16_t highest_ascii = 0x7F;

    return c <= highest_ascii;
}
278 
279  /***
280  * encode utf16 string into utf8 string, destination must have enough space.
281  */
282  template<typename Utf16Iterator>
283  void encode(Utf16Iterator begin, Utf16Iterator end, char *dst);
284 
287 
288  /***
289  * Wrap iterators of container with SlashToDot
290  */
291  template<typename T>
293 
294  template<typename It>
296 
297  /***
298  * Wrap iterators of container with DotToSlash
299  */
300  template<typename T>
302 
303  template<typename It>
305 
306 } // end namespace utf16
307 
308 /*******************************************************************************
309  IMPLEMENTATION
310 *******************************************************************************/
311 
314 
315 #endif // UTF_UTILS_HPP_
316 
317 
318 /*
319  * These are local overrides for various environment variables in Emacs.
320  * Please do not remove this and leave it at the end of the file, where
321  * Emacs will automagically detect them.
322  * ---------------------------------------------------------------------
323  * Local variables:
324  * mode: c++
325  * indent-tabs-mode: t
326  * c-basic-offset: 4
327  * tab-width: 4
328  * End:
329  * vim:noexpandtab:sw=4:ts=4:
330  */
bool operator!=(const SlashToDot &it) const
Definition: utf_utils.hpp:51
const Char * difference_type
Definition: utf_utils.hpp:85
void utf8(uint8_t) const
Definition: utf_utils.hpp:268
SlashToDot(const Char *cs)
Definition: utf_utils.hpp:48
bool operator==(const DotToSlash &it) const
Definition: utf_utils.hpp:89
utf_utils::Range< SlashToDot > slash_to_dot(T t)
Definition: utf_utils.hpp:217
uint16_t decode_char(const char *&)
Range(Iterator b, Iterator e)
Definition: utf_utils.hpp:121
const Char * pointer
Definition: utf_utils.hpp:82
std::forward_iterator_tag iterator_category
Definition: utf_utils.hpp:42
bool operator==(const SlashToDot &it) const
Definition: utf_utils.hpp:50
SlashToDot operator++(int)
Definition: utf_utils.hpp:58
std::forward_iterator_tag iterator_category
Definition: utf_utils.hpp:81
Fn::ReturnType transform(Iterator begin, Iterator end, Fn)
SlashToDot & operator++()
Definition: utf_utils.hpp:54
ReturnType finish() const
Definition: utf_utils.hpp:186
Char operator*() const
Definition: utf_utils.hpp:66
SlashToDot operator+(size_t sz) const
Definition: utf_utils.hpp:64
Iterator begin()
Definition: utf_utils.hpp:123
Char operator*() const
Definition: utf_utils.hpp:105
utf_utils::DotToSlash< uint16_t > DotToSlash
Definition: utf_utils.hpp:286
void encode(Utf16Iterator begin, Utf16Iterator end, char *dst)
const Char & reference
Definition: utf_utils.hpp:83
utf_utils::SlashToDot< uint16_t > SlashToDot
Definition: utf_utils.hpp:285
DotToSlash operator++(int)
Definition: utf_utils.hpp:97
const Char * pointer
Definition: utf_utils.hpp:43
bool is_ascii(uint16_t c)
Definition: utf_utils.hpp:277
utf_utils::Range< DotToSlash > dot_to_slash(T t)
Definition: utf_utils.hpp:301
DotToSlash(const Char *cs)
Definition: utf_utils.hpp:87
ReturnType abort() const
Definition: utf_utils.hpp:187
void utf16(uint16_t) const
Definition: utf_utils.hpp:269
ErrorAction error_action() const
Definition: utf_utils.hpp:181
bool is_ascii(uint8_t c)
Definition: utf_utils.hpp:201
void utf16(uint16_t) const
Definition: utf_utils.hpp:184
bool operator>(const SlashToDot &it) const
Definition: utf_utils.hpp:52
void utf8(uint8_t) const
Definition: utf_utils.hpp:183
utf_utils::DotToSlash< char > DotToSlash
Definition: utf_utils.hpp:211
ReturnType finish() const
Definition: utf_utils.hpp:271
utf_utils::Range< DotToSlash > dot_to_slash(T t)
Definition: utf_utils.hpp:226
MIIterator e
DotToSlash & operator++()
Definition: utf_utils.hpp:93
DotToSlash operator+(size_t sz) const
Definition: utf_utils.hpp:103
Fn::ReturnType transform(Iterator begin, Iterator end, Fn)
Iterator end()
Definition: utf_utils.hpp:124
bool operator!=(const DotToSlash &it) const
Definition: utf_utils.hpp:90
bool decode(Utf8Iterator begin, Utf8Iterator end, uint16_t *dst)
ErrorAction
Definition: utf_utils.hpp:132
const Char * difference_type
Definition: utf_utils.hpp:46
utf_utils::SlashToDot< char > SlashToDot
Definition: utf_utils.hpp:210
bool operator>(const DotToSlash &it) const
Definition: utf_utils.hpp:91
const Char & reference
Definition: utf_utils.hpp:44
utf_utils::Range< SlashToDot > slash_to_dot(T t)
Definition: utf_utils.hpp:292