LCOV - code coverage report
Current view: top level - vm - utf8.hpp (source / functions) Hit Total Coverage
Test: coverage.info Lines: 18 18 100.0 %
Date: 2015-06-10 18:10:59 Functions: 16 16 100.0 %

          Line data    Source code
       1             : /* src/vm/utf8.hpp - utf8 string functions
       2             : 
       3             :    Copyright (C) 1996-2014
       4             :    CACAOVM - Verein zur Foerderung der freien virtuellen Maschine CACAO
       5             : 
       6             :    This file is part of CACAO.
       7             : 
       8             :    This program is free software; you can redistribute it and/or
       9             :    modify it under the terms of the GNU General Public License as
      10             :    published by the Free Software Foundation; either version 2, or (at
      11             :    your option) any later version.
      12             : 
      13             :    This program is distributed in the hope that it will be useful, but
      14             :    WITHOUT ANY WARRANTY; without even the implied warranty of
      15             :    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
      16             :    General Public License for more details.
      17             : 
      18             :    You should have received a copy of the GNU General Public License
      19             :    along with this program; if not, write to the Free Software
      20             :    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
      21             :    02110-1301, USA.
      22             : 
      23             : */
      24             : 
      25             : 
      26             : #ifndef UTF8_HPP_
      27             : #define UTF8_HPP_ 1
      28             : 
      29             : #include "config.h"                     // used in utf8.inc
      30             : 
      31             : #include <cstddef>                      // for size_t
      32             : #include <cstdio>                       // for FILE
      33             : #include <cstring>                      // for strlen
      34             : #include <stdint.h>                     // for uint32_t, uint8_t
      35             : 
      36             : #include <iterator>
      37             : 
      38             : namespace cacao { class OStream; }
      39             : struct utf;
      40             : 
      41             : /* Utf8String ******************************************************************
      42             : 
      43             :         A container for strings in Java's modified UTF-8 encoding.
      44             : 
      45             :         A Utf8String always contains either a valid (possibly empty) UTF-8 string
      46             :         or NULL.
      47             :         You can check for NULL like you would with any normal pointer.
      48             :         Invoking any method except operator void*() or c_ptr() on a NULL string leads to
      49             :         undefined behaviour.
      50             : 
      51             :         Use a Utf8String like a pointer, i.e. always pass by value.
      52             : 
      53             :         The contents of a Utf8String are zero terminated, and it never contains any
      54             :         zero bytes except the one at the end, so any C string processing functions
      55             :         work properly.
      56             : 
      57             : *******************************************************************************/
      58             : 
      59        3571 : class Utf8String {
      60             :         public:
      61             :                 /*** GLOBAL INITIALIZATION **********************************/
      62             : 
      63             :                 // initialize the utf8 subsystem
      64             :                 // MUST be called before any Utf8String can be constructed
      65             :                 static void initialize();
      66             : 
      67             :                 // check if utf8 subsystem is initialized
      68             :                 static bool is_initialized();
      69             : 
      70             :                 /*** CONSTRUCTORS  ******************************************/
      71             : 
      72             :                 // constructs a null string
      73    33066657 :                 Utf8String() : _data(0) {}
      74             : 
      75             :                 // construct from a buffer with a given length
      76             :                 // validates that input is really UTF-8
      77             :                 // constructs a null string on error
      78             :                 static Utf8String from_utf8(const char*, size_t);
      79             :                 static Utf8String from_utf8_dot_to_slash(const char*, size_t);
      80             :                 static Utf8String from_utf8_slash_to_dot(const char*, size_t);
      81             : 
      82      126304 :                 static Utf8String from_utf8(const char *cs) {
      83      126304 :                         return from_utf8(cs, strlen(cs));
      84             :                 }
      85         740 :                 static Utf8String from_utf8_dot_to_slash(const char *cs) {
      86         740 :                         return from_utf8_dot_to_slash(cs, strlen(cs));
      87             :                 }
      88             : 
      89             :                 // construct from a Utf8String
      90             :                 static Utf8String from_utf8_slash_to_dot(Utf8String);
      91             : 
      92             :                 // construct from a UTF-16 string with a given length
      93             :                 static Utf8String from_utf16(const uint16_t*, size_t);
      94             :                 static Utf8String from_utf16_dot_to_slash(const uint16_t*, size_t);
      95             : 
      96             :                 // constructs a Utf8String with a given content
      97             :                 // is only public for interop with legacy C code
      98             :                 // NOTE: does NOT perform any checks
      99     9544669 :                 Utf8String(utf *u) : _data((Data*) u) {}
     100             : 
     101             :                 /*** ITERATION     ******************************************/
     102             : 
     103             :                 // iterator over the bytes in a string
     104             :                 typedef const char* byte_iterator;
     105             : 
     106    16971132 :                 byte_iterator begin() const { return _data->text; }
     107     4378817 :                 byte_iterator end()   const { return begin() + size(); }
     108             : 
     109             :                 // iterator over UTF-16 codepoints in a string
     110             :                 struct utf16_iterator {
     111             :                         typedef std::input_iterator_tag iterator_category;
     112             :                         typedef std::ptrdiff_t          difference_type;
     113             :                         typedef uint16_t                value_type;
     114             :                         typedef const value_type*       pointer;
     115             :                         typedef const value_type&       reference;
     116             : 
     117             :                         uint16_t operator*();
     118             : 
     119      648207 :                         void operator++() { current = next; }
     120             : 
     121             :                         bool operator!=(const utf16_iterator& it) {
     122             :                                 return current != it.current;
     123             :                         }
     124             :                 private:
     125       51474 :                         utf16_iterator(byte_iterator it) : current(it), next(it) {}
     126             : 
     127             :                         byte_iterator current, next;
     128             : 
     129             :                         friend class Utf8String;
     130             :                 };
     131             : 
     132       51474 :                 utf16_iterator utf16_begin() const { return utf16_iterator(begin()); }
     133             :                 utf16_iterator utf16_end()   const { return utf16_iterator(end());   }
     134             : 
     135             :                 /*** HASHING       ******************************************/
     136             : 
     137    20897502 :                 size_t hash() const { return _data->hash; }
     138             : 
     139             :                 /*** COMPARISONS   ******************************************/
     140             : 
     141             :                 /// check if this utf-8 string contains the same utf-16
     142             :                 /// codepoints as a utf-16 string
     143             :                 bool equals(const uint16_t *cs, size_t sz);
     144             : 
     145             :                 /// check if this utf-8 string contains the same bytes as a C string
     146             :                 bool equals(const char *cs) {
     147             :                         return strcmp(begin(), cs) == 0;
     148             :                 }
     149             : 
     150             :                 /*** ACCESSORS     ******************************************/
     151             : 
     152             :                 // access first element
     153             :                 char front() const { return begin()[0]; }
     154             : 
     155             :                 // access last element
     156             :                 char back() const { return begin()[size() - 1]; }
     157             : 
     158     3437091 :                 char operator[](size_t idx) const { return begin()[idx]; }
     159             : 
     160             :                 // get the number of bytes in string, excluding zero terminator.
     161     9339880 :                 size_t size() const { return _data->utf8_size; }
     162             : 
     163             :                 // get the number of utf16 codepoints in string
     164      116597 :                 size_t utf16_size() const { return _data->utf16_size; }
     165             : 
     166             :                 // for checking against NULL,
     167             :                 // also allows interop with legacy C code
     168   118866154 :                 operator void*() const { return _data; }
     169             : 
     170     3212064 :                 utf* c_ptr() const { return (utf*) _data; }
     171             : 
     172             :                 // create substring
     173             :                 Utf8String substring(size_t from ) const;
     174             :                 Utf8String substring(size_t from, size_t to ) const;
     175             : 
     176             :                 /*** MISC ******************************************/
     177             : 
     178             :                 bool is_valid_name() const;
     179             : 
     180             :                 // TODO: remove (only used in loader.cpp)
     181             :                 static const size_t sizeof_utf;
     182             :         private:
     183             :                 // MUST be a POD type
     184             :                 struct Data {
     185             :                         size_t hash;       // cached hash of the string
     186             :                         size_t utf8_size;  // text length in bytes (does NOT include zero terminator)
     187             :                         size_t utf16_size; // number of utf16 codepoints in string
     188             : 
     189             :                         char   text[sizeof(void*)]; // string content
     190             :                                                         // directly embedded in struct utf
     191             :                                                         // aligned to pointer size
     192             :                 };
     193             : 
     194             :                 static inline Data *alloc(size_t hash,
     195             :                                           size_t utf8_size,
     196             :                                           size_t utf16_size);
     197             : 
     198             :                 static void free(Utf8String u);
     199             : 
     200             :                 Data *_data;
     201             : 
     202             :                 template<typename Iterator>
     203             :                 friend struct FromUtf8Builder;
     204             : 
     205             :                 template<typename Iterator>
     206             :                 friend struct FromUtf16Builder;
     207             : };
     208             : 
     209             : 
     210             : // ***** UTF-8 HELPER FUNCTIONS
     211             : 
     212             : namespace utf8 {
     213             :         // count UTF-16 codepoints, -1 on error
     214             :         extern long num_codepoints(const char*, size_t);
     215             : 
     216             :         // count how many bytes a utf-8 version would need
     217             :         extern size_t num_bytes(const uint16_t*, size_t);
     218             : 
     219             :         extern size_t compute_hash(const uint16_t *cs, size_t);
     220             : 
     221             :         // named constants for common utf8 strings
     222             :         #define UTF8(NAME, STR) extern Utf8String NAME;
     223             :         #include "vm/utf8.inc"
     224             : }
     225             : 
     226             : // these are only used in old logging code
     227             : 
     228             : void utf_display_printable_ascii(Utf8String u);
     229             : void utf_display_printable_ascii_classname(Utf8String u);
     230             : 
     231             : void utf_fprint_printable_ascii(FILE *file, Utf8String u);
     232             : void utf_fprint_printable_ascii_classname(FILE *file, Utf8String u);
     233             : 
     234             : // OStream operators
     235             : namespace cacao {
     236             : class OStream;
     237             : 
     238             : OStream& operator<<(OStream& os, const Utf8String &u);
     239             : 
     240             : }
     241             : 
     242             : ////////////////////////////////////////////////////////////////////////////////
     243             : ////////////////////////////////////////////////////////////////////////////////
     244             : // LEGACY C API
     245             : ////////////////////////////////////////////////////////////////////////////////
     246             : ////////////////////////////////////////////////////////////////////////////////
     247             : 
     248             : // these are only used in jvmti and cacaodbg
     249             : 
     250             : #define UTF_END(u)     utf8_end(u)
     251             : #define UTF_SIZE(u)    utf8_size(u)
     252             : 
     253             : extern const char *utf8_end(utf*);
     254             : extern size_t      utf8_size(utf*);
     255             : 
     256             : void utf_sprint_convert_to_latin1(char *buffer, Utf8String u);
     257             : void utf_sprint_convert_to_latin1_classname(char *buffer, Utf8String u);
     258             : 
     259             : void utf_strcat_convert_to_latin1(char *buffer, Utf8String u);
     260             : void utf_strcat_convert_to_latin1_classname(char *buffer, Utf8String u);
     261             : 
     262             : #endif // UTF8_HPP_
     263             : 
     264             : 
     265             : /*
     266             :  * These are local overrides for various environment variables in Emacs.
     267             :  * Please do not remove this and leave it at the end of the file, where
     268             :  * Emacs will automagically detect them.
     269             :  * ---------------------------------------------------------------------
     270             :  * Local variables:
     271             :  * mode: c++
     272             :  * indent-tabs-mode: t
     273             :  * c-basic-offset: 4
     274             :  * tab-width: 4
     275             :  * End:
     276             :  * vim:noexpandtab:sw=4:ts=4:
     277             :  */

Generated by: LCOV version 1.11