LCOV - code coverage report
Current view: top level - toolbox - utf8_transform.inc (source / functions) Hit Total Coverage
Test: coverage.info Lines: 44 52 84.6 %
Date: 2017-07-14 10:03:36 Functions: 12 21 57.1 %

          Line data    Source code
       1             : /* src/toolbox/utf8_transform.inc - implementation of utf8 decoder
       2             : 
       3             :    Copyright (C) 1996-2013
       4             :    CACAOVM - Verein zur Foerderung der freien virtuellen Maschine CACAO
       5             : 
       6             :    This file is part of CACAO.
       7             : 
       8             :    This program is free software; you can redistribute it and/or
       9             :    modify it under the terms of the GNU General Public License as
      10             :    published by the Free Software Foundation; either version 2, or (at
      11             :    your option) any later version.
      12             : 
      13             :    This program is distributed in the hope that it will be useful, but
      14             :    WITHOUT ANY WARRANTY; without even the implied warranty of
      15             :    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
      16             :    General Public License for more details.
      17             : 
      18             :    You should have received a copy of the GNU General Public License
      19             :    along with this program; if not, write to the Free Software
      20             :    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
      21             :    02110-1301, USA.
      22             : 
      23             : */
      24             : 
      25             : #ifndef UTF8_TRANSFORM_INC
      26             : #define UTF8_TRANSFORM_INC 1
      27             : 
      28             : namespace utf8 {
      29             : namespace impl {
      30             :         struct CopyUtf8ToUtf16 : utf8::VisitorBase<bool, ABORT_ON_ERROR> {
      31             :                 typedef bool ReturnType;
      32             : 
      33       84655 :                 CopyUtf8ToUtf16(uint16_t *dst) : dst(dst) {}
      34             : 
      35     3275403 :                 inline void utf16(uint16_t c) { *dst++ = c; }
      36             : 
      37       84655 :                 inline bool finish() { return true;  }
      38           0 :                 inline bool abort()  { return false; }
      39             :         private:
      40             :                 uint16_t *dst;
      41             :         };
      42             : } // end namespace impl
      43             : } // end namespace utf8
      44             : 
      45             : template<typename Iterator, typename Fn>
      46     6047380 : inline typename Fn::ReturnType utf8::transform(Iterator it, Iterator end, Fn fn) {
      47             :         using namespace ::utf8::impl;
      48             : 
      49             : #define UTF8_HANDLE_ERROR {                                \
      50             :         if (fn.error_action() == ABORT_ON_ERROR) {             \
      51             :                 return fn.abort();                                 \
      52             :         }                                                      \
      53             : }
      54             : 
      55   148964080 :         while (it != end) {
      56   136869320 :                 unsigned byte = *it++;
      57             : 
      58   136869320 :                 if (byte & 0x80) {
      59             :                         // highest bit set, non-ASCII character
      60             : 
      61     3911496 :                         if ((byte & 0xe0) == 0xc0) {
      62             :                                 // 2-byte: should be 110..... 10......
      63             : 
      64      816736 :                                 if (it == end)
      65           0 :                                         UTF8_HANDLE_ERROR
      66             : 
      67      816736 :                                 unsigned byte2 = *it++;
      68             : 
      69      816736 :                                 if ((byte2 & 0xc0) != 0x80)
      70           0 :                                         UTF8_HANDLE_ERROR
      71             : 
      72      816736 :                                 fn.utf8(byte);
      73      816736 :                                 fn.utf8(byte2);
      74      816736 :                                 fn.utf16(((byte & 0x1f) << 6) | (byte2 & 0x3f));
      75     3094760 :                         } else if ((byte & 0xf0) == 0xe0) {
      76             :                                 // 3-byte: should be 1110.... 10...... 10......
      77             : 
      78     3094760 :                                 if (it + 2 > end)
      79           0 :                                         UTF8_HANDLE_ERROR
      80             : 
      81     3094760 :                                 unsigned byte2 = *it++;
      82             : 
      83     3094760 :                                 if ((byte2 & 0xc0) != 0x80)
      84           0 :                                         UTF8_HANDLE_ERROR
      85             : 
      86     3094760 :                                 unsigned byte3 = *it++;
      87             : 
      88     3094760 :                                 if ((byte3 & 0xc0) != 0x80)
      89           0 :                                         UTF8_HANDLE_ERROR
      90             : 
      91     3094760 :                                 fn.utf8(byte);
      92     3094760 :                                 fn.utf8(byte2);
      93     3094760 :                                 fn.utf8(byte3);
      94     3094760 :                                 fn.utf16(((byte & 0x0f) << 12) | ((byte2 & 0x3f) <<  6) | (byte3 & 0x3f));
      95             :                         } else {
      96           0 :                                 UTF8_HANDLE_ERROR
      97             :                         }
      98             :                 } else {
      99             :                         // Java forbids zero bytes in UTF8
     100   132957824 :                         if (byte == 0)
     101           0 :                                 UTF8_HANDLE_ERROR
     102             : 
     103             :                         // ASCII character: highest bit not set, at least one other bit set
     104   132957824 :                         fn.utf8(byte);
     105   132957824 :                         fn.utf16(byte);
     106             :                 }
     107             :         }
     108             : 
     109     6047380 :         return fn.finish();
     110             : 
     111             : #undef UTF8_HANDLE_ERROR
     112             : }
     113             : 
     114             : 
     115      672050 : inline uint16_t utf8::decode_char(const char*& src) {
     116             :         uint16_t ch1, ch2, ch3;
     117             : 
     118      672050 :         ch1 = src[0];
     119             : 
     120      672050 :         switch (((uint8_t) ch1) >> 4) {
     121             :         default:  // 1 byte (ASCII)
     122      671324 :                 src++;
     123      671324 :                 return ch1;
     124             :         case 0xC:
     125             :         case 0xD: // 2 bytes
     126             :                 // mask out non-data bits
     127         723 :                 ch1  = ch1    & 0x1F;
     128         723 :                 ch2  = src[1] & 0x3F;
     129         723 :                 src += 2;
     130             : 
     131             :                 // stitch together data bits from individual bytes
     132         723 :                 return (ch1 << 6) | ch2;
     133             :         case 0xE: // 3 bytes
     134             :                 // mask out non-data bits
     135           3 :                 ch1  = ch1    & 0x1F;
     136           3 :                 ch2  = src[1] & 0x3F;
     137           3 :                 ch3  = src[2] & 0x3F;
     138           3 :                 src += 3;
     139             : 
     140             :                 // stitch together data bits from individual bytes
     141           3 :                 return (ch1 << 12) | (ch2 << 6) | ch3;
     142             :         }
     143             : }
     144             : 
     145             : template<typename Utf8Iterator>
     146       84655 : inline bool utf8::decode(Utf8Iterator begin, Utf8Iterator end, uint16_t *dst) {
     147       84655 :         return ::utf8::transform(begin, end, ::utf8::impl::CopyUtf8ToUtf16(dst));
     148             : }
     149             : 
     150             : #endif // UTF8_TRANSFORM_INC
     151             : 
     152             : 
     153             : /*
     154             :  * These are local overrides for various environment variables in Emacs.
     155             :  * Please do not remove this and leave it at the end of the file, where
     156             :  * Emacs will automagically detect them.
     157             :  * ---------------------------------------------------------------------
     158             :  * Local variables:
     159             :  * mode: c++
     160             :  * indent-tabs-mode: t
     161             :  * c-basic-offset: 4
     162             :  * tab-width: 4
     163             :  * End:
     164             :  * vim:noexpandtab:sw=4:ts=4:
     165             :  */

Generated by: LCOV version 1.11