summary refs log tree commit diff
path: root/include/utf8.h
diff options
context:
space:
mode:
Diffstat (limited to 'include/utf8.h')
-rw-r--r-- include/utf8.h 229
1 files changed, 229 insertions, 0 deletions
diff --git a/include/utf8.h b/include/utf8.h
new file mode 100644
index 0000000..b4f6f98
--- /dev/null
+++ b/include/utf8.h
@@ -0,0 +1,229 @@
+#ifndef UTF8_H_
+#define UTF8_H_
+/*
+ * This program source code file is part of KiCad, a free EDA CAD application.
+ *
+ * Copyright (C) 2013 SoftPLC Corporation, Dick Hollenbeck <dick@softplc.com>
+ * Copyright (C) 2013 KiCad Developers, see CHANGELOG.TXT for contributors.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you may find one here:
+ * http://www.gnu.org/licenses/old-licenses/gpl-2.0.html
+ * or you may search the http://www.gnu.org website for the version 2 license,
+ * or you may write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+ */
+
+#include <string>
+#include <wx/string.h>
+
+/**
+ * Class UTF8
+ * is an 8 bit std::string that is assuredly encoded in UTF8, and supplies special
+ * conversion support to and from wxString, and has iteration over unicode characters.
+ *
+ * <p>I've been careful to supply only conversion facilities and not try
+ * and duplicate wxString() with many member functions. In the end it is
+ * to be a std::string. There are multiple ways to create text into a std::string
+ * without the need of too many member functions:
+ *
+ * <ul>
+ * <li>richio.h's StrPrintf()</li>
+ * <li>std::ostringstream.</li>
+ * </ul>
+ *
+ * <p>Because this class uses no virtuals, it should be possible to cast any
+ * std::string into a UTF8 using this kind of cast: (UTF8 &) without construction
+ * or copying being the effect of the cast. Be sure the source std::string holds
+ * UTF8 encoded text before you do that.
+ *
+ * @author Dick Hollenbeck
+ */
+class UTF8 : public std::string
+{
+public:
+ /// Construct from a wxString; the conversion is defined out of line, not in this header.
+ UTF8( const wxString& o );
+
+ /// Construct from a C string whose bytes are copied as-is; nothing here checks the encoding,
+ /// so you could end up with non-UTF8 content, but that would be your fault.
+ UTF8( const char* txt ) :
+ std::string( txt )
+ {
+ }
+
+ /// For use with _() function on wx 2.8.
+ /// BTW _() on wx >= 2.9 returns wxString, not wchar_t* like on 2.8.
+ UTF8( const wchar_t* txt );
+ /// Construct from a std::string by copying its bytes unchanged (no encoding check is made).
+ UTF8( const std::string& o ) :
+ std::string( o )
+ {
+ }
+ /// Construct an empty string.
+ UTF8() :
+ std::string()
+ {
+ }
+
+ ~UTF8() // Needed mainly to build python wrapper
+ {
+ }
+ /// Assign from a wxString; defined out of line, not in this header.
+ UTF8& operator=( const wxString& o );
+ /// Assign from a std::string by forwarding to std::string::operator=.
+ UTF8& operator=( const std::string& o )
+ {
+ std::string::operator=( o );
+ return *this;
+ }
+ /// Assign from a C string by forwarding to std::string::operator=.
+ UTF8& operator=( const char* s )
+ {
+ std::string::operator=( s );
+ return *this;
+ }
+ /// Assign a single char by forwarding to std::string::operator=.
+ UTF8& operator=( char c )
+ {
+ std::string::operator=( c );
+ return *this;
+ }
+ /// Forwards to std::string::substr(), so @a pos and @a len count bytes, not unicode characters.
+ UTF8 substr( size_t pos = 0, size_t len = npos ) const
+ {
+ return std::string::substr( pos, len );
+ }
+ /// Convert to a wxString; defined out of line, not in this header.
+ operator wxString () const;
+
+ /// Implicit conversion to char*; std::string has no such operator, which might be reason
+ /// enough to remove this one. Casts away the const of c_str(): do not write through it.
+ operator char* () const
+ {
+ return (char*) c_str();
+ }
+
+ /**
+ * Function uni_forward
+ * advances over a single UTF8 encoded multibyte character, capturing the
+ * unicode character as it goes, and returning the number of bytes consumed.
+ *
+ * @param aSequence is the UTF8 byte sequence, must be aligned on start of character.
+ * @param aResult is where to put the unicode character, and may be NULL if no interest.
+ * @return int - the count of bytes consumed.
+ */
+ static int uni_forward( const unsigned char* aSequence, unsigned* aResult = NULL );
+
+ /**
+ * class uni_iter
+ * is a non-mutating iterator that walks through unicode code points in the UTF8 encoded
+ * string. The ++(), ++(int), and *() operators are supported for read only access
+ * (->() is currently commented out); *() returns an unsigned holding the unicode
+ * character at the current position. Comparison operators compare byte positions.
+ */
+ class uni_iter
+ {
+ friend class UTF8;
+ /// Points at the current byte position in the sequence being iterated.
+ const unsigned char* it;
+
+ // private constructor, used by UTF8::ubegin() and UTF8::uend() via the friend declaration.
+ uni_iter( const char* start ) :
+ it( (const unsigned char*) start )
+ {
+ // for the human: assert( sizeof(unsigned) >= 4 );
+ }
+
+
+ public:
+
+ uni_iter() // Needed only to build python wrapper, not used outside the wrapper
+ {
+ it = NULL;
+ }
+ /// Copy constructor: the copy refers to the same byte position as @a o.
+ uni_iter( const uni_iter& o )
+ {
+ it = o.it;
+ }
+
+ /// pre-increment and return uni_iter at new position
+ const uni_iter& operator++()
+ {
+ it += uni_forward( it );
+ return *this;
+ }
+
+ /// post-increment and return uni_iter at initial position
+ uni_iter operator++( int )
+ {
+ uni_iter ret = *this;
+
+ it += uni_forward( it );
+ return ret;
+ }
+
+ /* NOTE(review): disabled; presumably because operator->() must return a pointer or class type - confirm.
+ /// return unicode at current position
+ unsigned operator->() const
+ {
+ unsigned result;
+
+ // grab the result, do not advance
+ uni_forward( it, &result );
+ return result;
+ }
+ */
+
+ /// return unicode at current position
+ unsigned operator*() const
+ {
+ unsigned result;
+
+ // grab the result, do not advance
+ uni_forward( it, &result );
+ return result;
+ }
+
+ bool operator==( const uni_iter& other ) const { return it == other.it; }
+ bool operator!=( const uni_iter& other ) const { return it != other.it; }
+
+ /// Since the ++ operators advance more than one byte, this is your best
+ /// loop termination test, < end(), not == end().
+ bool operator< ( const uni_iter& other ) const { return it < other.it; }
+ bool operator<=( const uni_iter& other ) const { return it <= other.it; }
+ bool operator> ( const uni_iter& other ) const { return it > other.it; }
+ bool operator>=( const uni_iter& other ) const { return it >= other.it; }
+ };
+
+ /**
+ * Function ubegin
+ * returns a @a uni_iter initialized to the start of "this" UTF8 byte sequence.
+ */
+ uni_iter ubegin() const
+ {
+ return uni_iter( data() );
+ }
+
+ /**
+ * Function uend
+ * returns a @a uni_iter initialized to the end of "this" UTF8 byte sequence.
+ */
+ uni_iter uend() const
+ {
+ return uni_iter( data() + size() );
+ }
+};
+
+#endif // UTF8_H_