1 files changed, 229 insertions, 0 deletions
diff --git a/include/utf8.h b/include/utf8.h
new file mode 100644
index 0000000..b4f6f98
--- /dev/null
+++ b/include/utf8.h
@@ -0,0 +1,229 @@
+#ifndef UTF8_H_
+#define UTF8_H_
+/*
+ * This program source code file is part of KiCad, a free EDA CAD application.
+ *
+ * Copyright (C) 2013 SoftPLC Corporation, Dick Hollenbeck <dick@softplc.com>
+ * Copyright (C) 2013 KiCad Developers, see CHANGELOG.TXT for contributors.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you may find one here:
+ * http://www.gnu.org/licenses/old-licenses/gpl-2.0.html
+ * or you may search the http://www.gnu.org website for the version 2 license,
+ * or you may write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
+ */
+
+#include <string>
+#include <wx/string.h>
+
+/**
+ * Class UTF8
+ * is an 8 bit std::string that is assuredly encoded in UTF8, and supplies special
+ * conversion support to and from wxString, and has iteration over unicode characters.
+ *
+ * <p>I've been careful to supply only conversion facilities and not try
+ * and duplicate wxString() with many member functions.  In the end it is
+ * to be a std::string.  There are multiple ways to create text into a std::string
+ * without the need of too many member functions:
+ *
+ * <ul>
+ *  <li>richio.h's StrPrintf()</li>
+ *  <li>std::ostringstream.</li>
+ * </ul>
+ *
+ * <p>Because this class used no virtuals, it should be possible to cast any
+ * std::string into a UTF8 using this kind of cast: (UTF8 &) without construction
+ * or copying being the effect of the cast.  Be sure the source std::string holds
+ * UTF8 encoded text before you do that.
+ *
+ * @author Dick Hollenbeck
+ */
+class UTF8 : public std::string
+{
+public:
+
+    UTF8( const wxString& o );
+
+    /// This is a constructor for which you could end up with
+    /// non-UTF8 encoding, but that would be your fault.
+    UTF8( const char* txt ) :
+        std::string( txt )
+    {
+    }
+
+    /// For use with _() function on wx 2.8.
+    /// BTW _() on wx >= 2.9 returns wxString, not wchar_t* like on 2.8.
+    UTF8( const wchar_t* txt );
+
+    UTF8( const std::string& o ) :
+        std::string( o )
+    {
+    }
+
+    UTF8() :
+        std::string()
+    {
+    }
+
+    ~UTF8()     // Needed mainly to build python wrapper
+    {
+    }
+
+    UTF8& operator=( const wxString& o );
+
+    UTF8& operator=( const std::string& o )
+    {
+        std::string::operator=( o );
+        return *this;
+    }
+
+    UTF8& operator=( const char* s )
+    {
+        std::string::operator=( s );
+        return *this;
+    }
+
+    UTF8& operator=( char c )
+    {
+        std::string::operator=( c );
+        return *this;
+    }
+
+    UTF8 substr( size_t pos = 0, size_t len = npos ) const
+    {
+        return std::string::substr( pos, len );
+    }
+
+    operator wxString () const;
+
+    /// This one is not in std::string, and one wonders why... might be a solid
+    /// enough reason to remove it still.
+    operator char* () const
+    {
+        return (char*) c_str();
+    }
+
+    /**
+     * Function uni_forward
+     * advances over a single UTF8 encoded multibyte character, capturing the
+     * unicode character as it goes, and returning the number of bytes consumed.
+     *
+     * @param aSequence is the UTF8 byte sequence, must be aligned on start of character.
+     * @param aResult is where to put the unicode character, and may be NULL if no interest.
+     * @return int - the count of bytes consumed.
+     */
+    static int uni_forward( const unsigned char* aSequence, unsigned* aResult = NULL );
+
+    /**
+     * class uni_iter
+     * is a non-muting iterator that walks through unicode code points in the UTF8 encoded
+     * string.  The normal ++(), ++(int), ->(), and *() operators are all supported
+     * for read only access and some return an unsigned holding the unicode character
+     * appropriate for the respective operator.
+     */
+    class uni_iter
+    {
+        friend class UTF8;
+
+        const unsigned char* it;
+
+        // private constructor.
+        uni_iter( const char* start ) :
+            it( (const unsigned char*) start )
+        {
+            // for the human: assert( sizeof(unsigned) >= 4 );
+        }
+
+
+    public:
+
+        uni_iter()  // Needed only to build python wrapper, not used outside the wrapper
+        {
+            it = NULL;
+        }
+
+        uni_iter( const uni_iter& o )
+        {
+            it = o.it;
+        }
+
+        /// pre-increment and return uni_iter at new position
+        const uni_iter& operator++()
+        {
+            it += uni_forward( it );
+            return *this;
+        }
+
+        /// post-increment and return uni_iter at initial position
+        uni_iter operator++( int )
+        {
+            uni_iter ret = *this;
+
+            it += uni_forward( it );
+            return ret;
+        }
+
+        /*
+        /// return unicode at current position
+        unsigned operator->() const
+        {
+            unsigned    result;
+
+            // grab the result, do not advance
+            uni_forward( it, &result );
+            return result;
+        }
+        */
+
+        /// return unicode at current position
+        unsigned operator*() const
+        {
+            unsigned    result;
+
+            // grab the result, do not advance
+            uni_forward( it, &result );
+            return result;
+        }
+
+        bool operator==( const uni_iter& other ) const  { return it == other.it; }
+        bool operator!=( const uni_iter& other ) const  { return it != other.it; }
+
+        /// Since the ++ operators advance more than one byte, this is your best
+        /// loop termination test, < end(), not == end().
+        bool operator< ( const uni_iter& other ) const  { return it <  other.it; }
+        bool operator<=( const uni_iter& other ) const  { return it <= other.it; }
+        bool operator> ( const uni_iter& other ) const  { return it >  other.it; }
+        bool operator>=( const uni_iter& other ) const  { return it >= other.it; }
+    };
+
+    /**
+     * Function ubegin
+     * returns a @a uni_iter initialized to the start of "this" UTF8 byte sequence.
+     */
+    uni_iter ubegin() const
+    {
+        return uni_iter( data() );
+    }
+
+    /**
+     * Function uend
+     * returns a @a uni_iter initialized to the end of "this" UTF8 byte sequence.
+     */
+    uni_iter uend() const
+    {
+        return uni_iter( data() + size() );
+    }
+};
+
+#endif // UTF8_H_