/* * GDevelop Core * Copyright 2015 Victor Levasseur (victorlevasseur52@gmail.com). * This project is released under the MIT License. */ #ifndef GDCORE_UTF8_STRING_H #define GDCORE_UTF8_STRING_H #include #include #include #include #include #include #include #include "GDCore/Utf8/utf8.h" namespace sf {class String;}; namespace gd { class String; /** * \brief String represents an UTF8 encoded string. * * This class represents an UTF8 encoded string. It provides almost the same features as the STL std::string class * but is UTF8 aware (size() returns the number of characters, not the number of bytes for example). */ class GD_CORE_API String { public: using value_type = char32_t; using reference = char32_t&; using const_reference = const char32_t&; using pointer = char32_t*; using const_pointer = const char32_t*; using size_type = std::string::size_type; using difference_type = std::string::difference_type; static constexpr size_type npos = -1; template class GD_CORE_API StringIterator : public std::iterator { friend class String; friend class StringIterator; public: StringIterator() : strIt() {}; StringIterator(const StringIterator &other) : strIt(other.strIt) {} template StringIterator(const StringIterator &other) : strIt(other.strIt) {} //Convert from const_iterator to iterator StringIterator& operator=(const StringIterator &other) { strIt = other.strIt; return *this; } String::value_type operator*() {return ::utf8::unchecked::peek_next(strIt);} StringIterator& operator++() { ::utf8::unchecked::next(strIt); return *this; } StringIterator operator++(int) { StringIterator tmp(*this); operator++(); return tmp; } StringIterator& operator--() { ::utf8::unchecked::prior(strIt); return *this; } StringIterator operator--(int) { StringIterator tmp(*this); operator--(); return tmp; } bool operator==(const StringIterator &other) { return (strIt == other.strIt); } bool operator!=(const StringIterator &other) { return !operator==(other); } bool operator<(const StringIterator 
&other) { return (strIt < other.strIt); } bool operator<=(const StringIterator &other) { return (strIt <= other.strIt); } bool operator>(const StringIterator &other) { return (strIt > other.strIt); } bool operator>=(const StringIterator &other) { return (strIt >= other.strIt); } T base() const {return strIt;} private: StringIterator(T strIt) : strIt(strIt) {}; T strIt; }; using iterator = StringIterator; using const_iterator = StringIterator; using reverse_iterator = std::reverse_iterator; using const_reverse_iterator = std::reverse_iterator; /** * \name Constructors * \{ */ /** * Constructs an empty string. */ String(); /** * Constructs a string from an array of char **representing a string encoded * in UTF8**. * * Useful to implicitly create a String object from a string literal. * * **Usage:** * \code * gd::String str(u8"A little sentence."); * \endcode */ String(const char *characters); /** * Constructs a String from a std::u32string. * * **Usage:** * * \code * gd::String str(U"A UTF32 encoded string."); * \endcode */ String(const std::u32string &string); /** * Constructs a string from an sf::String. */ String(const sf::String &string); /** * \} */ /** * \name Assignment (implicit conversions) * \{ */ /** * Assign the String using a string literal (it assumes that the **string * literal is encoded in UTF8**). * * Usage: * \code * gd::String str; * str = u8"This is a test string."; * \endcode */ String& operator=(const char *characters); String& operator=(const sf::String &string); String& operator=(const std::u32string &string); /** * \} */ /** * \name Size * \{ */ /** * \brief Returns true if the string is empty. */ bool empty() const { return m_string.size() == 0; } /** * \brief Returns the string's length. */ size_type size() const; /** * \brief Returns the string's length. */ size_type length() const { return size(); }; /** * \brief Clear the string. * * **Iterators :** Obviously, all iterators are invalidated. 
*/ void clear() { m_string.clear(); } void reserve(gd::String::size_type size) { m_string.reserve(size); } /** * \} */ /** * \name Iterators * \{ */ /** * \brief Get a beginning iterator. */ String::iterator begin(); /** * \brief Get a constant beginning iterator. */ String::const_iterator begin() const; /** * \brief Get a ending iterator. */ String::iterator end(); /** * \brief Get a constant ending iterator. */ String::const_iterator end() const; /** * \} */ /** * \name Convert from/to numbers * \{ */ /** * \brief Method to create a gd::String from a number (float, double, int, ...) * \return a gd::String created from **value**. */ template static String From(T value) { static_assert(!std::is_same::value, "Can't use gd::String::From with std::string."); static_assert(!std::is_same::value, "Can't use gd::String::From with sf::String."); std::ostringstream oss; oss << value; return gd::String(oss.str().c_str()); } /** * \brief Method to convert the string to a number * \return the string converted to the type **T** */ template T To() const { static_assert(!std::is_same::value, "Can't use gd::String::To with std::string."); static_assert(!std::is_same::value, "Can't use gd::String::To with sf::String."); T value; std::istringstream oss(m_string); oss >> value; return value; } /** * \} */ /** * \name Conversions from other string types * \{ */ /** * \return a String created from a std::string encoded in the current * locale. * * See \ref Conversions2 for more information. */ static String FromLocale( const std::string &localizedString ); /** * \return a String created from a std::u32string. */ static String FromUTF32( const std::u32string &string ); /** * \return a String created from a sf::String (UTF32). * * See \ref Conversions1 for more information. */ static String FromSfString( const sf::String &sfString ); /** * \return a String created an UTF8 encoded std::string. 
*/
    static String FromUTF8( const std::string &utf8Str );

    /**
     * \return a String created from a std::wstring (UTF32 on Linux and UCS-2 on Windows)
     */
    static String FromWide( const std::wstring &wstr );

/**
 * \}
 */

/**
 * \name Conversions to other string types
 * \{
 */

    /**
     * \return a localized std::string from the current string.
     *
     * See \ref Conversions2 for more information.
     */
    std::string ToLocale() const;

    /**
     * \return a std::u32string.
     */
    std::u32string ToUTF32() const;

    /**
     * \return a sf::String from the current string.
     *
     * See \ref Conversions1 for more information.
     */
    sf::String ToSfString() const;

    /**
     * Implicit conversion operator to sf::String.
     *
     * See \ref Conversions1 for more information.
     */
    operator sf::String() const;

    /**
     * \return a UTF8 encoded std::string from the current string.
     */
    std::string ToUTF8() const;

    /**
     * \return a wide string (std::wstring) encoded in UTF32 on Linux and in UCS-2 on Windows
     * \note On Windows, this is possibly a lossy conversion.
     */
    std::wstring ToWide() const;

/**
 * \}
 */

/**
 * \name UTF8 tools
 * \{
 */

    /**
     * \return true if the string is valid.
     */
    bool IsValid() const;

    /**
     * \brief Searches the string for invalid characters and replaces them with **replacement**.
     * \param replacement the code point written in place of each invalid character (U+FFFD by default)
     * \return *this
     */
    String& ReplaceInvalid( value_type replacement = 0xfffd );

/**
 * \}
 */

/**
 * \name Element access / Internal string access
 * \{
 */

    /**
     * \brief Returns the code point at the specified position
     * \warning This operator has a linear complexity on the character's
     * position. You should avoid using it in a loop and use the iterators
     * provided by this class instead.
     */
    value_type operator[]( const size_type position ) const;

    /**
     * \brief Get the raw UTF8-encoded std::string
     *
     * The returned reference is the internal container itself, so the caller
     * can modify it directly.
     */
    std::string& Raw() { return m_string; }

    /**
     * \brief Get the raw UTF8-encoded std::string (read-only access)
     */
    const std::string& Raw() const { return m_string; }

    /**
     * \brief Get the C-string.
*/
    const char* c_str() const { return m_string.c_str(); }

/**
 * \}
 */

/**
 * \name String modifiers
 * \{
 */

    //Compound assignment overloads taking a String, a C-string or a single
    //code point. They are only declared here (defined out of line).
    String& operator+=( const String &other );

    String& operator+=( const char *other );

    String& operator+=( value_type character );

    /**
     * \brief Add a character (from its codepoint) at the end of the String.
     *
     * **Iterators :** All iterators may be invalidated (in particular if the
     * string is reallocated).
     */
    void push_back( value_type character );

    /**
     * \brief Remove the last character of the String.
     *
     * **Iterators :** All iterators may be invalidated (in particular if the
     * string is reallocated).
     */
    void pop_back();

    /**
     * \brief Inserts characters right before the character at **pos**.
     *
     * \return *this
     *
     * **Iterators :** All iterators may be invalidated.
     */
    String& insert( size_type pos, const String &str );

    /**
     * \brief Replace the portion of the String between **i1** and **i2** (**i2** not
     * included) by the String **str**.
     * \return *this
     *
     * **Iterators :** All iterators may be invalidated.
     */
    String& replace( iterator i1, iterator i2, const String &str );

    /**
     * \brief Replace the portion of the String between **i1** and **i2** (**i2** not
     * included) by **n** consecutive copies of character **c**.
     * \return *this
     *
     * **Iterators :** All iterators may be invalidated.
     */
    String& replace( iterator i1, iterator i2, size_type n, const char c );

    /**
     * \brief Replace the portion of the String between **pos** and **pos** + **len**
     * (the character at **pos** + **len** is not included) with **str**.
     * \return *this
     *
     * **Iterators :** All iterators may be invalidated.
     */
    String& replace( size_type pos, size_type len, const String &str );

    /**
     * \brief Replace the portion of the String between **pos** and **pos** + **len**
     * (the character at **pos** + **len** is not included) with the character **c**.
     * \return *this
     *
     * **Iterators :** All iterators may be invalidated.
*/ String& replace( size_type pos, size_type len, const char c ); /** * \brief Search in the portion of the String between **i1** and **i2** (**i2** not * included) for characters matching predicate function **p** and replace them * by the String **str**. * \return *this * * **Iterators :** All iterators may be invalidated. */ String& replace_if( iterator i1, iterator i2, std::function p, const String &str ); /** * \brief Remove consecutive occurrences of the character **c** in the portion of the * between **i1** and **i2** (**i2** not included) to replace it by a single occurrence. * \return *this * * **Iterators :** All iterators may be invalidated. */ String& RemoveConsecutiveOccurrences(iterator i1, iterator i2, const char c); /** * \brief Erase the characters between **first** and **last** (**last** not included). * \param first an iterator to the first character to remove * \param last an iterator to the character next to the last one to remove * \return an iterator pointing at the old position of the first deleted character */ iterator erase( iterator first, iterator last ); /** * \brief Erase the character pointed by **p**. * \param p an iterator pointing to the character to be erased * \return an interator pointing at the old position of the deleted character */ iterator erase( iterator p ); /** * \brief Erase the characters between the positions **pos** and **pos** + **len** * (**pos** + **len** not included). 
* \param pos the position of the first character to remove * \param len the number of characters to remove from **pos** */ void erase( size_type pos = 0, size_type len = npos ); /** * \} */ /** * \name String operations * \{ */ /** * \brief Split the string with a delimiter * \param delimiter delimiter (an UTF32 codepoint) * \return a std::vector containing all the gd::String objects * * **Usage:** * * \code * gd::String str = u8"10;20;30;40"; * std::vector splittedStr = str.Split(U';'); * //the U prefix is mandatory to get a char32_t from the literal * //Now the vector contains "10", "20", "30" and "40" as gd::String objects * \endcode */ std::vector Split( value_type delimiter ) const; /** * \brief Returns the case-folded string. * \note This string is almost but not totally suitable for case-insensitive comparison because you have to make sure * that it is normalized. So, to do a case-insensitive comparison, do : * \code * str1.CaseFold().Normalize() == str2.CaseFold().Normalize() * \endcode * You can also use gd::CaseInsensitiveEquiv(); */ String CaseFold() const; /** * \brief Returns the string in uppercase. * \note Some characters that maps to multiple characters when uppercased may not be processed, e.g. the german etzett. */ String UpperCase() const; /** * \brief Returns the string in lowercase. * \note Some characters that maps to multiple characters when lowercased may not be processed, e.g. double SS to etzett in german. */ String LowerCase() const; /** * \brief Searches a string for a specified substring and returns a new string where all occurrences of this substring is replaced. * \param search The string that will be replaced by the new string. * \param replacement The value to replace the old substring with. * \param all If set to false, only the first matching substring will be replaced. 
*/ String FindAndReplace(String search, String replacement, bool all = true) const; /** * \brief Removes the specified characters (by default all the "whitespaces" and line breaks) from the beginning of the string, * and return the new string. */ String LeftTrim(const gd::String& chars = " \t\n\v\f\r") { String trimmedString(*this); trimmedString.erase(0, trimmedString.find_first_not_of(chars)); return trimmedString; } /** * \brief Removes the specified characters (by default all the "whitespaces" and line breaks) from the end of the string, * and return the new string. */ String RightTrim(const gd::String& chars = " \t\n\v\f\r") { String trimmedString(*this); trimmedString.erase(trimmedString.find_last_not_of(chars) + 1); return trimmedString; } /** * \brief Removes the specified characters (by default all the "whitespaces" and line breaks) from the * beginning and the end of the string and return the new string. */ String Trim(const gd::String& chars = " \t\n\v\f\r") { return LeftTrim(chars).RightTrim(chars); } /** * Normalization form */ enum NormForm { NFD, ///< Normalization Form Decomposition: characters are decomposed by canonical equivalence, and multiple combining characters are arranged in a specific order. NFC, ///< Normalization Form Composition: characters are decomposed and then recomposed by canonical equivalence. NFKD, ///< Normalization Form Compatibility Decomposition: characters are decomposed by compatibility, and multiple combining characters are arranged in a specific order. NFKC, ///< Normalization Form Compatibility Composition: characters are decomposed by compatibility, then recomposed by canonical equivalence. }; /** * Normalize the string using the normalization form **form**. * \return *this */ String& Normalize(NormForm form = NFC); /** * Returns a sub-string starting from **start** and with length **length**. 
*/ String substr( size_type start = 0, size_type length = npos ) const; /** * \return the position of the first occurence of **search** starting from **pos**. */ size_type find( const String &search, size_type pos = 0 ) const; /** * \return the position of the first occurence of **search** starting from **pos**. */ size_type find( const char *search, size_type pos = 0 ) const; /** * \return the position of the first occurence of **search** starting from **pos**. */ size_type find( const value_type search, size_type pos = 0 ) const; /** * \return the position of the last occurence of **search** starting before **pos**. */ size_type rfind( const String &search, size_type pos = npos ) const; /** * \return the position of the last occurence of **search** starting before **pos**. */ size_type rfind( const char *search, size_type pos = npos ) const; /** * \return the position of the last occurence of **search** starting before **pos**. */ size_type rfind( const value_type &search, size_type pos = npos ) const; /** * \brief Searches the string for the first character that matches any of the characters specified in * its arguments. * \param match the characters that will be looked for in the String * \param startPos where to start the search * \return the position of the first found character */ size_type find_first_of( const String &match, size_type startPos = 0 ) const; /** * \brief Searches the string for the first character that doesn't match any of the characters * specified in its arguments. * \param not_match the characters that will be looked for in the String * \param startPos where to start the search * \return the position of the first found character */ size_type find_first_not_of( const String ¬_match, size_type startPos = 0 ) const; /** * \brief Searches the string for the last character that matches any of the characters specified in * its arguments. 
* \param match the characters that will be looked for in the String * \param endPos where to end the search (this is the last character considered in the * search) * \return the position of the last found character */ size_type find_last_of( const String &match, size_type endPos = npos ) const; /** * \brief Searches the string for the last character that doesn't match any of the characters * specified in its arguments. * \param not_match the characters that will be looked for in the String * \param endPos where to end the search (this is the last character considered in the * search) * \return the position of the last found character */ size_type find_last_not_of( const String ¬_match, size_type endPos = npos ) const; /** * \brief Compares the current string with another. */ int compare( const String &other ) const; /** * \brief Do a case-insensitive search * \return the position of the first occurence of **search** starting from **pos**. * * \note This method isn't very efficient as it is linear on the string size times the * search string size */ size_type FindCaseInsensitive( const String &search, size_type pos = 0 ) const; /** * \} */ private: std::string m_string; ///< Internal std::string container }; /** * \name Non-member operators * \{ */ /** * \relates String * \return a String containing the concatenation of lhs and rhs. */ String GD_CORE_API operator+(String lhs, const String &rhs); /** * \relates String * \return a String containing the concatenation of lhs and rhs (rhs is * converted to gd::String assuming it's encoded in UTF8). */ String GD_CORE_API operator+(String lhs, const char *rhs); /** * \relates String * \return a String containing the concatenation of lhs and rhs (lhs is * converted to gd::String assuming it's encoded in UTF8). 
*/ String GD_CORE_API operator+(const char *lhs, const String &rhs); /** * \} */ /** * \name Relational operators * \{ */ ///\relates String bool GD_CORE_API operator==( const String &lhs, const String &rhs ); ///\relates String bool GD_CORE_API operator==( const String &lhs, const char *rhs ); ///\relates String bool GD_CORE_API operator==( const char *lhs, const String &rhs ); ///\relates String bool GD_CORE_API operator!=( const String &lhs, const String &rhs ); ///\relates String bool GD_CORE_API operator!=( const String &lhs, const char *rhs ); ///\relates String bool GD_CORE_API operator!=( const char *lhs, const String &rhs ); ///\relates String bool GD_CORE_API operator<( const String &lhs, const String &rhs ); ///\relates String bool GD_CORE_API operator<( const String &lhs, const char *rhs ); ///\relates String bool GD_CORE_API operator<( const char *lhs, const String &rhs ); ///\relates String bool GD_CORE_API operator<=( const String &lhs, const String &rhs ); ///\relates String bool GD_CORE_API operator<=( const String &lhs, const char *rhs ); ///\relates String bool GD_CORE_API operator<=( const char *lhs, const String &rhs ); ///\relates String bool GD_CORE_API operator>( const String &lhs, const String &rhs ); ///\relates String bool GD_CORE_API operator>( const String &lhs, const char *rhs ); ///\relates String bool GD_CORE_API operator>( const char *lhs, const String &rhs ); ///\relates String bool GD_CORE_API operator>=( const String &lhs, const String &rhs ); ///\relates String bool GD_CORE_API operator>=( const String &lhs, const char *rhs ); ///\relates String bool GD_CORE_API operator>=( const char *lhs, const String &rhs ); /** * \} */ /** * \name Stream operators * \{ */ /** * \relates String * Outputs the string in a stream. * \note The string is converted to the current locale before. 
If you want to stream the string * as UTF8, do : * \code * std::cout << myString.Raw(); * \endcode */ std::ostream& GD_CORE_API operator<<(std::ostream &os, const String &str); /** * \relates String * Extracts a string from an input string assuming the stream inputs characters encoded in the * current locale. * \note The content of the string is replaced. */ std::istream& GD_CORE_API operator>>(std::istream &is, String &str); /** * \} */ /** * \relates String * \param compat if true, the strings are normalized using a compatibility normalization form to remove characters special appearance. * \return true if the two string are equivalent (in a case-sensitive way). */ bool GD_CORE_API CaseSensitiveEquiv( String lhs, String rhs, bool compat = true ); /** * \relates String * \param compat if true, the strings are normalized using a compatibility normalization form to remove characters special appearance. * \return true if the two string are equivalent (in a case-insensitive way). */ bool GD_CORE_API CaseInsensitiveEquiv( const String &lhs, const String &rhs, bool compat = true ); } namespace std { /** * std::hash specialization for gd::String */ template <> struct GD_CORE_API hash { size_t operator()(const gd::String & x) const { return hash()(x.Raw()); } }; } #endif /** * \class gd::String * * \section WhatIsUTF8 What is UTF8 and Unicode ? * (from https://en.wikipedia.org/wiki/Unicode and https://en.wikipedia.org/wiki/UTF-8) * Unicode is a computing industry standard for the consistent encoding, representation, and handling of text * expressed in most of the world's writing systems. * Unicode can be implemented by different character encodings. The most commonly used encodings are UTF-8, UTF-16 * and the now-obsolete UCS-2. * * UTF-8 is a character encoding capable of encoding all possible characters, or code points, in Unicode. * The encoding is variable-length (not every codepoint is 1 byte long) and uses 8-bit code units. 
It was designed
 * for backward compatibility with ASCII.
 * UTF-8 encodes each of the 1,112,064 valid code points in the Unicode code space using one to four 8-bit bytes
 * (a group of 8 bits is known as an octet in the Unicode Standard). Code points with lower numerical values
 * (i.e., earlier code positions in the Unicode character set, which tend to occur more frequently) are encoded using
 * fewer bytes. The first 128 characters of Unicode, which correspond one-to-one with ASCII, are encoded using a
 * single octet with the same binary value as ASCII, making valid ASCII text valid UTF-8-encoded Unicode as well.
 *
 * \section Limitations Limitations
 * The String class internally stores the string as an UTF8 encoded std::string. This results in some limitations: it's
 * impossible to edit a single character through operator[]() because the new character length might not be the same.
 *
 * **The gd::String class supports almost all Unicode characters, except the ones that can't be represented as a single
 * codepoint (obviously, a codepoint can be represented by 1 to 4 bytes, as codepoints are encoded in UTF8).** For example,
 * some special letters are composed of multiple codepoints (a letter, and the accents). Most of them can be combined into a
 * single codepoint but some can't. These are the unsupported ones. See \ref Normalization.
 *
 * \section Performance Performance
 * The UTF8 encoding has the advantage of reducing the RAM consumption compared to UTF16 or UTF32 for strings using a lot
 * of latin characters. But the variable length of the characters brings some performance issues compared to a fixed size encoding.
 * That's why the complexity of each method is written in its documentation. For instance, the size() method is linear
 * on the string size and so is operator[]().
 *
 * \section Conversion Conversions from/to other string types
 * The String handles implicit conversion with sf::String (implicit constructor and implicit conversion
 * operator).
*
 * **However, this is not the case with std::string** as this conversion is often lossy (mostly on Windows).
 * You need to explicitly call gd::String::FromLocale or gd::String::FromUTF8
 * to convert a std::string to a String. However, if you want to get a String object from a string literal, you can
 * directly use the operator=() or the constructor as they support const char* as argument (it assumes the string
 * literal is encoded in UTF8, so you'll need to put the u8 prefix).
 *
 * \subsection Conversions1 Implicit conversion from/to sf::String
 * \code
 * //Get a String from sf::String
 * sf::String sfmlStr("This is a test ! ");
 * gd::String str1(sfmlStr); //Now contains "This is a test ! " encoded in UTF8
 *
 * //Get a sf::String from String
 * gd::String str2 = u8"Another test ! ";
 * sf::String anotherSfmlString = str2; //anotherSfmlString now contains "Another test ! "
 * \endcode
 *
 * \subsection Conversions2 Conversion from/to std::string
 * \code
 * //Get a String from a std::string encoded in the current locale
 * std::string ansiStr = "Some beautiful localized characters. "; //Encoded in ANSI on Windows, UTF8 on Linux
 * gd::String str = gd::String::FromLocale(ansiStr);
 *
 * //Create a String using a string literal encoded in UTF8
 * gd::String anotherStr = u8"This is an UTF8 string";
 * //The same as gd::String anotherStr = gd::String::FromUTF8(u8"This is an UTF8 string");
 * //But it works only with string literals.
 *
 * gd::String finalStr = str + anotherStr; //Concatenates the two Strings
 * std::cout << finalStr.ToLocale() << std::endl; //Shows "Some beautiful localized characters. This is an UTF8 string"
 * \endcode
 *
 * \section Normalization Normalization
 * This class stores Unicode strings normalized with NFC which means that all characters are combined (if they can be). For example, the "à"
 * character can be written in two ways according to the Unicode standard: U+00E0 (the "à" in a single codepoint) or
 * U+0061 (the "a" letter codepoint) + U+0300 (the "`" combining accent).
We say that they are canonically equivalent.
 * However, this can cause problems when comparing strings, that's why **this class normalizes the string when constructed** using
 * the **Normalization Form Composition** (all characters are combined, e.g. "à" is represented by a single codepoint).
 * If the string **is invalid when constructed, the string is not normalized**: it will be **normalized when the invalid characters
 * are removed using gd::String::ReplaceInvalid()**.
 *
 * \section CaseInsensitiveComparison Case-insensitive comparison
 * In Unicode, uppercasing/lowercasing strings to compare them in a case-insensitive way is not recommended.
 * That's why the function gd::CaseInsensitiveEquiv exists to compare two strings in a case-insensitive way.
 */