LCOV - code coverage report
Current view: top level - toolbox - utf_utils.hpp (source / functions) Hit Total Coverage
Test: coverage.info Lines: 32 44 72.7 %
Date: 2015-06-10 18:10:59 Functions: 28 50 56.0 %

          Line data    Source code
       1             : /* src/toolbox/utf_utils.hpp - functions for handling utf8/utf16
       2             : 
       3             :    Copyright (C) 1996-2013
       4             :    CACAOVM - Verein zur Foerderung der freien virtuellen Maschine CACAO
       5             : 
       6             :    This file is part of CACAO.
       7             : 
       8             :    This program is free software; you can redistribute it and/or
       9             :    modify it under the terms of the GNU General Public License as
      10             :    published by the Free Software Foundation; either version 2, or (at
      11             :    your option) any later version.
      12             : 
      13             :    This program is distributed in the hope that it will be useful, but
      14             :    WITHOUT ANY WARRANTY; without even the implied warranty of
      15             :    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
      16             :    General Public License for more details.
      17             : 
      18             :    You should have received a copy of the GNU General Public License
      19             :    along with this program; if not, write to the Free Software
      20             :    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
      21             :    02110-1301, USA.
      22             : 
      23             : */
      24             : 
      25             : #ifndef UTF_UTILS_HPP_
      26             : #define UTF_UTILS_HPP_ 1
      27             : 
      28             : #include <cassert>
      29             : #include <stdint.h>
      30             : #include <iterator>
      31             : 
      32             : // TODO: Maybe rename transform functions, this is not a transform in the STL sense.
      33             : //       It's a reduction, like std::accumulate.
      34             : 
      35             : namespace utf_utils {
      36             :         /***
      37             :          * A STL style read-only forward iterator.
      38             :          * Iterates over a char* but replaces '/' with '.'
      39             :          */
      40             :         template<typename Char>
      41             :         struct SlashToDot {
      42             :                 typedef std::forward_iterator_tag iterator_category;
      43             :                 typedef const Char*               pointer;
      44             :                 typedef const Char&               reference;
      45             :                 typedef Char                      value_type;
      46             :                 typedef const Char*               difference_type;
      47             : 
      48      133104 :                 SlashToDot(const Char *cs) : cs(cs) {}
      49             : 
      50        5537 :                 bool operator==(const SlashToDot& it) const { return cs == it.cs; }
      51     1124976 :                 bool operator!=(const SlashToDot& it) const { return cs != it.cs; }
      52           0 :                 bool operator> (const SlashToDot& it) const { return cs >  it.cs; }
      53             : 
      54     1041777 :                 SlashToDot& operator++() {
      55     1041777 :                         cs++;
      56     1041777 :                         return *this;
      57             :                 }
      58      565734 :                 SlashToDot operator++(int) {
      59      565734 :                         SlashToDot it(*this);
      60      565734 :                         ++(*this);
      61      565734 :                         return it;
      62             :                 }
      63             : 
      64       38831 :                 SlashToDot operator+(size_t sz) const { return SlashToDot(cs + sz); }
      65             : 
      66     1041777 :                 Char operator*() const {
      67     1041777 :                         char c = *cs;
      68             : 
      69     1041777 :                         return (c == '/') ? '.' : c;
      70             :                 }
      71             :         private:
      72             :                 const Char *cs;
      73             :         };
      74             : 
      75             :         /***
      76             :          * A STL style read-only forward iterator.
      77             :          * Iterates over a char* but replaces '.' with '/'
      78             :          */
      79             :         template<typename Char>
      80             :         struct DotToSlash {
      81             :                 typedef std::forward_iterator_tag iterator_category;
      82             :                 typedef const Char*               pointer;
      83             :                 typedef const Char&               reference;
      84             :                 typedef Char                      value_type;
      85             :                 typedef const Char*               difference_type;
      86             : 
      87       15639 :                 DotToSlash(const Char *cs) : cs(cs) {}
      88             : 
      89           0 :                 bool operator==(const DotToSlash& it) const { return cs == it.cs; }
      90      236030 :                 bool operator!=(const DotToSlash& it) const { return cs != it.cs; }
      91           0 :                 bool operator> (const DotToSlash& it) const { return cs >  it.cs; }
      92             : 
      93      225604 :                 DotToSlash& operator++() {
      94      225604 :                         cs++;
      95      225604 :                         return *this;
      96             :                 }
      97       15230 :                 DotToSlash operator++(int) {
      98       15230 :                         DotToSlash it(*this);
      99       15230 :                         ++(*this);
     100       15230 :                         return it;
     101             :                 }
     102             : 
     103        5213 :                 DotToSlash operator+(size_t sz) const { return DotToSlash(cs + sz); }
     104             : 
     105      225604 :                 Char operator*() const {
     106      225604 :                         char c = *cs;
     107             : 
     108      225604 :                         return (c == '.') ? '/' : c;
     109             :                 }
     110             :         private:
     111             :                 const Char *cs;
     112             :         };
     113             : 
     114             :         /***
     115             :          * Helper that wraps a pair of iterators
     116             :          */
     117             :         template<typename Iterator>
     118             :         struct Range {
     119             :                 template<typename T>
     120           0 :                 Range(T t)                    : _begin(t.begin()), _end(t.end()) {}
     121             :                 Range(Iterator b, Iterator e) : _begin(b),         _end(e)       {}
     122             : 
     123           0 :                 Iterator begin() { return _begin; }
     124           0 :                 Iterator end()   { return _end;   }
     125             :         private:
     126             :                 Iterator _begin, _end;
     127             :         };
     128             : }
     129             : 
     130             : namespace utf8 {
     131             :         // what the decoder should do when it encounters an error
     132             :         enum ErrorAction {
     133             :                 IGNORE_ERRORS,    // Invalid input leads to undefined behaviour.
     134             : 
     135             :                 ABORT_ON_ERROR    // The decoding is aborted and the result of
     136             :                                   // Visitor::abort() is returned.
     137             :         };
     138             : 
     139             :         /***
     140             :          *      utf8::transform
     141             :          *
     142             :          *      Iterates over a UTF-8 string and calls a visitor for every UTF-8 byte and
     143             :          *      UTF-16 codepoint encountered.
     144             :          *      How the visitor handles errors is controlled via the enum ErrorAction.
     145             :          *
     146             :          *      A visitor must conform to the following interface:
     147             :          *       (The class VisitorBase stubs out all of these methods and can be used
     148             :          *       as a convenient base class)
     149             :          *
     150             :          *              struct Visitor {
     151             :          *                      typedef ... ReturnType;
     152             :          *
     153             :          *                      ErrorAction error_action(); // called when an error is encountered
     154             :          *
     155             :          *                      void utf8(uint8_t);     // called for every valid UTF-8 byte
     156             :          *                      void utf16(uint16_t);   // called for every valid UTF-16 codepoint
     157             :          *
     158             :          *                      ReturnType finish();    // called on success
     159             :          *                      ReturnType abort();     // called on error
     160             :          *                                              // (iff ErrorAction is ABORT_ON_ERROR)
     161             :          *      };
     162             :          *
     163             :          * @Cpp11 Use decltype to get return type of Fn::finish without forcing
     164             :          *        Fn to explicitly contain a typedef.
     165             :          *        We could do this now with GCCs typeof, but that's non-standard.
     166             :          */
     167             :         template<typename Iterator, typename Fn>
     168             :         typename Fn::ReturnType transform(Iterator begin, Iterator end, Fn);
     169             : 
     170             :         template<typename T, typename Fn>
     171           0 :         inline typename Fn::ReturnType transform(T t, Fn fn) {
     172           0 :                 return ::utf8::transform(t.begin(), t.end(), fn);
     173             :         }
     174             : 
     175             : 
     176             :         /***
     177             :          * Handy base class for implementing visitors
     178             :          */
     179             :         template<typename ReturnType, ErrorAction action>
     180     6058538 :         struct VisitorBase {
     181           0 :                 ErrorAction error_action() const { return action; }
     182             : 
     183     7066348 :                 void utf8(uint8_t)   const {}
     184      660364 :                 void utf16(uint16_t) const {}
     185             : 
     186             :                 ReturnType finish() const { return ReturnType(); }
     187           0 :                 ReturnType abort()  const { return ReturnType(); }
     188             :         };
     189             : 
     190             :         /***
     191             :          * Decodes one utf-16 codepoint from input, automatically advances the input
     192             :          * pointer to start of next codepoint.
     193             :          *
     194             :          * Input MUST be valid UTF-8.
     195             :          */
     196             :         uint16_t decode_char(const char*&);
     197             : 
     198             :         /***
     199             :          * check if char is valid ascii
     200             :          */
     201             :         inline bool is_ascii(uint8_t c) { return c < 128; }
     202             : 
     203             :         /***
     204             :          * decode utf8 string into utf16 string, destination must have enough space.
     205             :          * returns false on error
     206             :          */
     207             :         template<typename Utf8Iterator>
     208             :         inline bool decode(Utf8Iterator begin, Utf8Iterator end, uint16_t *dst);
     209             : 
     210             :         typedef utf_utils::SlashToDot<char> SlashToDot;
     211             :         typedef utf_utils::DotToSlash<char> DotToSlash;
     212             : 
     213             :         /***
     214             :          * Wrap iterators of container with SlashToDot
     215             :          */
     216             :         template<typename T>
     217           0 :         utf_utils::Range<SlashToDot> slash_to_dot(T t) { return utf_utils::Range<SlashToDot>(t); }
     218             : 
     219             :         template<typename It>
     220             :         utf_utils::Range<SlashToDot> slash_to_dot(It a, It b) { return utf_utils::Range<SlashToDot>(a, b); }
     221             : 
     222             :         /***
     223             :          * Wrap iterators of container with DotToSlash
     224             :          */
     225             :         template<typename T>
     226             :         utf_utils::Range<DotToSlash> dot_to_slash(T t) { return utf_utils::Range<DotToSlash>(t); }
     227             : 
     228             :         template<typename It>
     229             :         utf_utils::Range<DotToSlash> dot_to_slash(It a, It b) { return utf_utils::Range<DotToSlash>(a, b); }
     230             : 
     231             : } // end namespace utf8
     232             : 
     233             : namespace utf16 {
     234             :         /***
     235             :          *      utf16::transform
     236             :          *
     237             :          *      Iterates over a UTF-16 string and calls a visitor for every UTF-8 byte and
     238             :          *      UTF-16 codepoint encountered.
     239             :          *
     240             :          *      A visitor must conform to the following interface:
     241             :          *       (The class VisitorBase stubs out all these methods and can be used
     242             :          *       as a convenient base class)
     243             :          *
     244             :          *              struct Visitor {
     245             :          *                      typedef ... ReturnType;
     246             :          *
     247             :          *                      void utf8(uint8_t);     // called for every UTF-8 byte
     248             :          *                      void utf16(uint16_t);   // called for every UTF-16 codepoint
     249             :          *
     250             :          *                      ReturnType finish();    // called on success
     251             :          *      };
     252             :          *
     253             :          */
     254             :         template<typename Iterator, typename Fn>
     255             :         typename Fn::ReturnType transform(Iterator begin, Iterator end, Fn);
     256             : 
     257             :         template<typename T, typename Fn>
     258             :         inline typename Fn::ReturnType transform(T t, Fn fn) {
     259             :                 return ::utf16::transform(t.begin(), t.end(), fn);
     260             :         }
     261             : 
     262             : 
     263             :         /***
     264             :          * Handy base class for implementing visitors
     265             :          */
     266             :         template<typename ReturnType>
     267       14907 :         struct VisitorBase {
     268             :                 void utf8(uint8_t)   const {}
     269      124202 :                 void utf16(uint16_t) const {}
     270             : 
     271           0 :                 ReturnType finish() const { return ReturnType(); }
     272             :         };
     273             : 
     274             :         /***
     275             :          * check if char is valid ascii
     276             :          */
     277             :         inline bool is_ascii(uint16_t c) { return c < 128; }
     278             : 
     279             :         /***
     280             :          * encode utf16 string into utf8 string, destination must have enough space.
     281             :          */
     282             :         template<typename Utf16Iterator>
     283             :         void encode(Utf16Iterator begin, Utf16Iterator end, char *dst);
     284             : 
     285             :         typedef utf_utils::SlashToDot<uint16_t> SlashToDot;
     286             :         typedef utf_utils::DotToSlash<uint16_t> DotToSlash;
     287             : 
     288             :         /***
     289             :          * Wrap iterators of container with SlashToDot
     290             :          */
     291             :         template<typename T>
     292             :         utf_utils::Range<SlashToDot> slash_to_dot(T t) { return utf_utils::Range<SlashToDot>(t); }
     293             : 
     294             :         template<typename It>
     295             :         utf_utils::Range<SlashToDot> slash_to_dot(It a, It b) { return utf_utils::Range<SlashToDot>(a, b); }
     296             : 
     297             :         /***
     298             :          * Wrap iterators of container with DotToSlash
     299             :          */
     300             :         template<typename T>
     301             :         utf_utils::Range<DotToSlash> dot_to_slash(T t) { return utf_utils::Range<DotToSlash>(t); }
     302             : 
     303             :         template<typename It>
     304             :         utf_utils::Range<DotToSlash> dot_to_slash(It a, It b) { return utf_utils::Range<DotToSlash>(a, b); }
     305             : 
     306             : } // end namespace utf16
     307             : 
     308             : /*******************************************************************************
     309             :         IMPLEMENTATION
     310             : *******************************************************************************/
     311             : 
     312             : #include "toolbox/utf8_transform.inc"
     313             : #include "toolbox/utf16_transform.inc"
     314             : 
     315             : #endif // UTF_UTILS_HPP_
     316             : 
     317             : 
     318             : /*
     319             :  * These are local overrides for various environment variables in Emacs.
     320             :  * Please do not remove this and leave it at the end of the file, where
     321             :  * Emacs will automagically detect them.
     322             :  * ---------------------------------------------------------------------
     323             :  * Local variables:
     324             :  * mode: c++
     325             :  * indent-tabs-mode: t
     326             :  * c-basic-offset: 4
     327             :  * tab-width: 4
     328             :  * End:
     329             :  * vim:noexpandtab:sw=4:ts=4:
     330             :  */

Generated by: LCOV version 1.11