CACAO
utf8_transform.inc
Go to the documentation of this file.
1 /* src/toolbox/utf8_transform.inc - implementation of utf8 decoder
2 
3  Copyright (C) 1996-2013
4  CACAOVM - Verein zur Foerderung der freien virtuellen Maschine CACAO
5 
6  This file is part of CACAO.
7 
8  This program is free software; you can redistribute it and/or
9  modify it under the terms of the GNU General Public License as
10  published by the Free Software Foundation; either version 2, or (at
11  your option) any later version.
12 
13  This program is distributed in the hope that it will be useful, but
14  WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16  General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program; if not, write to the Free Software
20  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
21  02110-1301, USA.
22 
23 */
24 
25 #ifndef UTF8_TRANSFORM_INC
26 #define UTF8_TRANSFORM_INC 1
27 
28 namespace utf8 {
29 namespace impl {
30  struct CopyUtf8ToUtf16 : utf8::VisitorBase<bool, ABORT_ON_ERROR> {
31  typedef bool ReturnType;
32 
33  CopyUtf8ToUtf16(uint16_t *dst) : dst(dst) {}
34 
35  inline void utf16(uint16_t c) { *dst++ = c; }
36 
37  inline bool finish() { return true; }
38  inline bool abort() { return false; }
39  private:
40  uint16_t *dst;
41  };
42 } // end namespace impl
43 } // end namespace utf8
44 
45 template<typename Iterator, typename Fn>
46 inline typename Fn::ReturnType utf8::transform(Iterator it, Iterator end, Fn fn) {
47  using namespace ::utf8::impl;
48 
49 #define UTF8_HANDLE_ERROR { \
50  if (fn.error_action() == ABORT_ON_ERROR) { \
51  return fn.abort(); \
52  } \
53 }
54 
55  while (it != end) {
56  unsigned byte = *it++;
57 
58  if (byte & 0x80) {
59  // highest bit set, non-ASCII character
60 
61  if ((byte & 0xe0) == 0xc0) {
62  // 2-byte: should be 110..... 10......
63 
64  if (it == end)
65  UTF8_HANDLE_ERROR
66 
67  unsigned byte2 = *it++;
68 
69  if ((byte2 & 0xc0) != 0x80)
70  UTF8_HANDLE_ERROR
71 
72  fn.utf8(byte);
73  fn.utf8(byte2);
74  fn.utf16(((byte & 0x1f) << 6) | (byte2 & 0x3f));
75  } else if ((byte & 0xf0) == 0xe0) {
76  // 3-byte: should be 1110.... 10...... 10......
77 
78  if (it + 2 > end)
79  UTF8_HANDLE_ERROR
80 
81  unsigned byte2 = *it++;
82 
83  if ((byte2 & 0xc0) != 0x80)
84  UTF8_HANDLE_ERROR
85 
86  unsigned byte3 = *it++;
87 
88  if ((byte3 & 0xc0) != 0x80)
89  UTF8_HANDLE_ERROR
90 
91  fn.utf8(byte);
92  fn.utf8(byte2);
93  fn.utf8(byte3);
94  fn.utf16(((byte & 0x0f) << 12) | ((byte2 & 0x3f) << 6) | (byte3 & 0x3f));
95  } else {
96  UTF8_HANDLE_ERROR
97  }
98  } else {
99  // Java forbids zero bytes in UTF8
100  if (byte == 0)
101  UTF8_HANDLE_ERROR
102 
103  // ASCII character: highest bit not set, at least one other bit set
104  fn.utf8(byte);
105  fn.utf16(byte);
106  }
107  }
108 
109  return fn.finish();
110 
111 #undef UTF8_HANDLE_ERROR
112 }
113 
114 
115 inline uint16_t utf8::decode_char(const char*& src) {
116  uint16_t ch1, ch2, ch3;
117 
118  ch1 = src[0];
119 
120  switch (((uint8_t) ch1) >> 4) {
121  default: // 1 byte (ASCII)
122  src++;
123  return ch1;
124  case 0xC:
125  case 0xD: // 2 bytes
126  // mask out non-data bits
127  ch1 = ch1 & 0x1F;
128  ch2 = src[1] & 0x3F;
129  src += 2;
130 
131  // stitch together data bits from individual bytes
132  return (ch1 << 6) | ch2;
133  case 0xE: // 3 bytes
134  // mask out non-data bits
135  ch1 = ch1 & 0x1F;
136  ch2 = src[1] & 0x3F;
137  ch3 = src[2] & 0x3F;
138  src += 3;
139 
140  // stitch together data bits from individual bytes
141  return (ch1 << 12) | (ch2 << 6) | ch3;
142  }
143 }
144 
145 template<typename Utf8Iterator>
146 inline bool utf8::decode(Utf8Iterator begin, Utf8Iterator end, uint16_t *dst) {
147  return ::utf8::transform(begin, end, ::utf8::impl::CopyUtf8ToUtf16(dst));
148 }
149 
150 #endif // UTF8_TRANSFORM_INC
151 
152 
153 /*
154  * These are local overrides for various environment variables in Emacs.
155  * Please do not remove this and leave it at the end of the file, where
156  * Emacs will automagically detect them.
157  * ---------------------------------------------------------------------
158  * Local variables:
159  * mode: c++
160  * indent-tabs-mode: t
161  * c-basic-offset: 4
162  * tab-width: 4
163  * End:
164  * vim:noexpandtab:sw=4:ts=4:
165  */
uint16_t decode_char(const char *&)
Fn::ReturnType transform(Iterator begin, Iterator end, Fn)
#define abort
Definition: md-asm.hpp:112
bool decode(Utf8Iterator begin, Utf8Iterator end, uint16_t *dst)