Files
GDevelop/Core/GDCore/String.h

896 lines
28 KiB
C++

/*
* GDevelop Core
* Copyright 2015 Victor Levasseur (victorlevasseur52@gmail.com).
* This project is released under the MIT License.
*/
#ifndef GDCORE_UTF8_STRING_H
#define GDCORE_UTF8_STRING_H
#include <functional>
#include <iostream>
#include <iterator>
#include <sstream>
#include <string>
#include <vector>
#include "GDCore/Utf8/utf8.h"
namespace gd
{
class String;
/**
* \brief String represents an UTF8 encoded string.
*
* This class represents an UTF8 encoded string. It provides almost the same features as the STL std::string class
* but is UTF8 aware (size() returns the number of characters, not the number of bytes for example).
*/
class GD_CORE_API String
{
public:
using value_type = char32_t;
using reference = char32_t&;
using const_reference = const char32_t&;
using pointer = char32_t*;
using const_pointer = const char32_t*;
using size_type = std::string::size_type;
using difference_type = std::string::difference_type;
static constexpr size_type npos = -1;
template<class T>
class GD_CORE_API StringIterator : public std::iterator<std::bidirectional_iterator_tag, String::value_type, String::difference_type>
{
friend class String;
friend class StringIterator<const T>;
public:
StringIterator() : strIt() {};
StringIterator(const StringIterator<T> &other) : strIt(other.strIt) {}
template<class U> StringIterator(const StringIterator<U> &other) : strIt(other.strIt) {} //Convert from const_iterator to iterator
StringIterator<T>& operator=(const StringIterator<T> &other) { strIt = other.strIt; return *this; }
String::value_type operator*() {return ::utf8::unchecked::peek_next(strIt);}
StringIterator<T>& operator++() { ::utf8::unchecked::next(strIt); return *this; }
StringIterator<T> operator++(int) { StringIterator<T> tmp(*this); operator++(); return tmp; }
StringIterator<T>& operator--() { ::utf8::unchecked::prior(strIt); return *this; }
StringIterator<T> operator--(int) { StringIterator<T> tmp(*this); operator--(); return tmp; }
bool operator==(const StringIterator<T> &other) { return (strIt == other.strIt); }
bool operator!=(const StringIterator<T> &other) { return !operator==(other); }
bool operator<(const StringIterator<T> &other) { return (strIt < other.strIt); }
bool operator<=(const StringIterator<T> &other) { return (strIt <= other.strIt); }
bool operator>(const StringIterator<T> &other) { return (strIt > other.strIt); }
bool operator>=(const StringIterator<T> &other) { return (strIt >= other.strIt); }
T base() const {return strIt;}
private:
StringIterator(T strIt) : strIt(strIt) {};
T strIt;
};
using iterator = StringIterator<std::string::iterator>;
using const_iterator = StringIterator<std::string::const_iterator>;
using reverse_iterator = std::reverse_iterator<iterator>;
using const_reverse_iterator = std::reverse_iterator<const_iterator>;
/**
* \name Constructors
* \{
*/
/**
* Constructs an empty string.
*/
String();
/**
* Constructs a string from an array of char **representing a string encoded
* in UTF8**.
*
* Useful to implicitly create a String object from a string literal.
*
* **Usage:**
* \code
* gd::String str(u8"A little sentence.");
* \endcode
*/
String(const char *characters);
/**
* Constructs a String from a std::u32string.
*
* **Usage:**
*
* \code
* gd::String str(U"A UTF32 encoded string.");
* \endcode
*/
String(const std::u32string &string);
/**
* \}
*/
/**
* \name Assignment (implicit conversions)
* \{
*/
/**
* Assign the String using a string literal (it assumes that the **string
* literal is encoded in UTF8**).
*
* Usage:
* \code
* gd::String str;
* str = u8"This is a test string.";
* \endcode
*/
String& operator=(const char *characters);
String& operator=(const std::u32string &string);
/**
* \}
*/
/**
* \name Size
* \{
*/
/**
* \brief Returns true if the string is empty.
*/
bool empty() const { return m_string.size() == 0; }
/**
* \brief Returns the string's length.
*/
size_type size() const;
/**
* \brief Returns the string's length.
*/
size_type length() const { return size(); };
/**
* \brief Clear the string.
*
* **Iterators :** Obviously, all iterators are invalidated.
*/
void clear() { m_string.clear(); }
void reserve(gd::String::size_type size) { m_string.reserve(size); }
/**
* \}
*/
/**
* \name Iterators
* \{
*/
/**
* \brief Get a beginning iterator.
*/
String::iterator begin();
/**
* \brief Get a constant beginning iterator.
*/
String::const_iterator begin() const;
/**
* \brief Get a ending iterator.
*/
String::iterator end();
/**
* \brief Get a constant ending iterator.
*/
String::const_iterator end() const;
/**
* \}
*/
/**
* \name Convert from/to numbers
* \{
*/
/**
* \brief Method to create a gd::String from a number (float, double, int, ...)
* \return a gd::String created from **value**.
*/
template<typename T>
static String From(T value)
{
static_assert(!std::is_same<T, std::string>::value, "Can't use gd::String::From with std::string.");
std::ostringstream oss;
oss << value;
return gd::String(oss.str().c_str());
}
/**
* \brief Method to convert the string to a number
* \return the string converted to the type **T**
*/
template<typename T>
T To() const
{
static_assert(!std::is_same<T, std::string>::value, "Can't use gd::String::To with std::string.");
T value;
std::istringstream oss(m_string);
oss >> value;
return value;
}
/**
* \}
*/
/**
* \name Conversions from other string types
* \{
*/
/**
* \return a String created from a std::string encoded in the current
* locale.
*
* See \ref Conversions2 for more information.
*/
static String FromLocale( const std::string &localizedString );
/**
* \return a String created from a std::u32string.
*/
static String FromUTF32( const std::u32string &string );
/**
* \return a String created an UTF8 encoded std::string.
*/
static String FromUTF8( const std::string &utf8Str );
/**
* \return a String created from a std::wstring (UTF32 on Linux and UCS-2 on Windows)
*/
static String FromWide( const std::wstring &wstr );
/**
* \}
*/
/**
* \name Conversions to other string types
* \{
*/
/**
* \return a localized std::string from the current string.
*
* See \ref Conversions2 for more information.
*/
std::string ToLocale() const;
/**
* \return a std::u32string.
*/
std::u32string ToUTF32() const;
/**
* \return a UTF8 encoded std::string from the current string.
*/
std::string ToUTF8() const;
/**
* \return a wide string (std::wstring) encoded in UTF32 on Linux and in UCS-2 on Windows
* \note On Windows, this is possibly a lossy conversion.
*/
std::wstring ToWide() const;
/**
* \}
*/
/**
* \name UTF8 tools
* \{
*/
/**
* \return true if the string is valid.
*/
bool IsValid() const;
/**
* \brief Searches the string for invalid characters and replaces them with **replacement**.
* \return *this
*/
String& ReplaceInvalid( value_type replacement = 0xfffd );
/**
* \}
*/
/**
* \name Element access / Internal string access
* \{
*/
/**
* \brief Returns the code point at the specified position
* \warning This operator has a linear complexity on the character's
* position. You should avoid to use it in a loop and use the iterators
* provided by this class instead.
*/
value_type operator[]( const size_type position ) const;
/**
* \brief Get the raw UTF8-encoded std::string
*/
std::string& Raw() { return m_string; }
/**
* \brief Get the raw UTF8-encoded std::string
*/
const std::string& Raw() const { return m_string; }
/**
* \brief Get the C-string.
*/
const char* c_str() const { return m_string.c_str(); }
/**
* \}
*/
/**
* \name String modifiers
* \{
*/
String& operator+=( const String &other );
String& operator+=( const char *other );
String& operator+=( value_type character );
/**
* \brief Add a character (from its codepoint) at the end of the String.
*
* **Iterators : ** All iterators may be invalidated (in particular if the
* string is reallocated).
*/
void push_back( value_type character );
/**
* \brief Remove the last character of the String.
*
* **Iterators : ** All iterators may be invalidated (in particular if the
* string is reallocated).
*/
void pop_back();
/**
* \brief Inserts characters right before the character at **pos**.
*
* \return *this
*
* **Iterators :** All iterators may be invalidated.
*/
String& insert( size_type pos, const String &str );
/**
* \brief Replace the portion of the String between **i1** and **i2** (**i2** not
* included) by the String **str**.
* \return *this
*
* **Iterators :** All iterators may be invalidated.
*/
String& replace( iterator i1, iterator i2, const String &str );
/**
* \brief Replace the portion of the String between **i1** and **i2** (**i2** not
* included) by **n** consecutive copies of character **c**.
* \return *this
*
* **Iterators :** All iterators may be invalidated.
*/
String& replace( iterator i1, iterator i2, size_type n, const char c );
/**
* \brief Replace the portion of the String between **pos** and **pos** + **len**
* (the character at **pos** + **len** is not included) with **str**.
* \return *this
*
* **Iterators :** All iterators may be invalidated.
*/
String& replace( size_type pos, size_type len, const String &str );
/**
* \brief Replace the portion of the String between **pos** and **pos** + **len**
* (the character at **pos** + **len** is not included) with the character **c**.
* \return *this
*
* **Iterators :** All iterators may be invalidated.
*/
String& replace( size_type pos, size_type len, const char c );
/**
* \brief Search in the portion of the String between **i1** and **i2** (**i2** not
* included) for characters matching predicate function **p** and replace them
* by the String **str**.
* \return *this
*
* **Iterators :** All iterators may be invalidated.
*/
String& replace_if( iterator i1, iterator i2, std::function<bool(char32_t)> p, const String &str );
/**
* \brief Remove consecutive occurrences of the character **c** in the portion of the
* between **i1** and **i2** (**i2** not included) to replace it by a single occurrence.
* \return *this
*
* **Iterators :** All iterators may be invalidated.
*/
String& RemoveConsecutiveOccurrences(iterator i1, iterator i2, const char c);
/**
* \brief Erase the characters between **first** and **last** (**last** not included).
* \param first an iterator to the first character to remove
* \param last an iterator to the character next to the last one to remove
* \return an iterator pointing at the old position of the first deleted character
*/
iterator erase( iterator first, iterator last );
/**
* \brief Erase the character pointed by **p**.
* \param p an iterator pointing to the character to be erased
* \return an interator pointing at the old position of the deleted character
*/
iterator erase( iterator p );
/**
* \brief Erase the characters between the positions **pos** and **pos** + **len**
* (**pos** + **len** not included).
* \param pos the position of the first character to remove
* \param len the number of characters to remove from **pos**
*/
void erase( size_type pos = 0, size_type len = npos );
/**
* \}
*/
/**
* \name String operations
* \{
*/
/**
* \brief Split the string with a delimiter
* \param delimiter delimiter (an UTF32 codepoint)
* \return a std::vector containing all the gd::String objects
*
* **Usage:**
*
* \code
* gd::String str = u8"10;20;30;40";
* std::vector<gd::String> splittedStr = str.Split(U';');
* //the U prefix is mandatory to get a char32_t from the literal
* //Now the vector contains "10", "20", "30" and "40" as gd::String objects
* \endcode
*/
std::vector<String> Split( value_type delimiter ) const;
/**
* \brief Returns the case-folded string.
* \note This string is almost but not totally suitable for case-insensitive comparison because you have to make sure
* that it is normalized. So, to do a case-insensitive comparison, do :
* \code
* str1.CaseFold().Normalize() == str2.CaseFold().Normalize()
* \endcode
* You can also use gd::CaseInsensitiveEquiv();
*/
String CaseFold() const;
/**
* \brief Returns the string in uppercase.
* \note Some characters that maps to multiple characters when uppercased may not be processed, e.g. the german etzett.
*/
String UpperCase() const;
/**
* \brief Returns the string in lowercase.
* \note Some characters that maps to multiple characters when lowercased may not be processed, e.g. double SS to etzett in german.
*/
String LowerCase() const;
/**
* \brief Searches a string for a specified substring and returns a new string where all occurrences of this substring is replaced.
* \param search The string that will be replaced by the new string.
* \param replacement The value to replace the old substring with.
* \param all If set to false, only the first matching substring will be replaced.
*/
String FindAndReplace(String search, String replacement, bool all = true) const;
/**
* \brief Removes the specified characters (by default all the "whitespaces" and line breaks) from the beginning of the string,
* and return the new string.
*/
String LeftTrim(const gd::String& chars = " \t\n\v\f\r")
{
String trimmedString(*this);
trimmedString.erase(0, trimmedString.find_first_not_of(chars));
return trimmedString;
}
/**
* \brief Removes the specified characters (by default all the "whitespaces" and line breaks) from the end of the string,
* and return the new string.
*/
String RightTrim(const gd::String& chars = " \t\n\v\f\r")
{
String trimmedString(*this);
trimmedString.erase(trimmedString.find_last_not_of(chars) + 1);
return trimmedString;
}
/**
* \brief Removes the specified characters (by default all the "whitespaces" and line breaks) from the
* beginning and the end of the string and return the new string.
*/
String Trim(const gd::String& chars = " \t\n\v\f\r")
{
return LeftTrim(chars).RightTrim(chars);
}
/**
* Normalization form
*/
enum NormForm
{
NFD, ///< Normalization Form Decomposition: characters are decomposed by canonical equivalence, and multiple combining characters are arranged in a specific order.
NFC, ///< Normalization Form Composition: characters are decomposed and then recomposed by canonical equivalence.
NFKD, ///< Normalization Form Compatibility Decomposition: characters are decomposed by compatibility, and multiple combining characters are arranged in a specific order.
NFKC, ///< Normalization Form Compatibility Composition: characters are decomposed by compatibility, then recomposed by canonical equivalence.
};
/**
* Normalize the string using the normalization form **form**.
* \return *this
*/
String& Normalize(NormForm form = NFC);
/**
* Returns a sub-string starting from **start** and with length **length**.
*/
String substr( size_type start = 0, size_type length = npos ) const;
/**
* \return the position of the first occurrence of **search** starting from **pos**.
*/
size_type find( const String &search, size_type pos = 0 ) const;
/**
* \return the position of the first occurrence of **search** starting from **pos**.
*/
size_type find( const char *search, size_type pos = 0 ) const;
/**
* \return the position of the first occurrence of **search** starting from **pos**.
*/
size_type find( const value_type search, size_type pos = 0 ) const;
/**
* \return the position of the last occurrence of **search** starting before **pos**.
*/
size_type rfind( const String &search, size_type pos = npos ) const;
/**
* \return the position of the last occurrence of **search** starting before **pos**.
*/
size_type rfind( const char *search, size_type pos = npos ) const;
/**
* \return the position of the last occurrence of **search** starting before **pos**.
*/
size_type rfind( const value_type &search, size_type pos = npos ) const;
/**
* \brief Searches the string for the first character that matches any of the characters specified in
* its arguments.
* \param match the characters that will be looked for in the String
* \param startPos where to start the search
* \return the position of the first found character
*/
size_type find_first_of( const String &match, size_type startPos = 0 ) const;
/**
* \brief Searches the string for the first character that doesn't match any of the characters
* specified in its arguments.
* \param not_match the characters that will be looked for in the String
* \param startPos where to start the search
* \return the position of the first found character
*/
size_type find_first_not_of( const String &not_match, size_type startPos = 0 ) const;
/**
* \brief Searches the string for the last character that matches any of the characters specified in
* its arguments.
* \param match the characters that will be looked for in the String
* \param endPos where to end the search (this is the last character considered in the
* search)
* \return the position of the last found character
*/
size_type find_last_of( const String &match, size_type endPos = npos ) const;
/**
* \brief Searches the string for the last character that doesn't match any of the characters
* specified in its arguments.
* \param not_match the characters that will be looked for in the String
* \param endPos where to end the search (this is the last character considered in the
* search)
* \return the position of the last found character
*/
size_type find_last_not_of( const String &not_match, size_type endPos = npos ) const;
/**
* \brief Compares the current string with another.
*/
int compare( const String &other ) const;
/**
* \brief Do a case-insensitive search
* \return the position of the first occurrence of **search** starting from **pos**.
*
* \note This method isn't very efficient as it is linear on the string size times the
* search string size
*/
size_type FindCaseInsensitive( const String &search, size_type pos = 0 ) const;
/**
* \}
*/
private:
std::string m_string; ///< Internal std::string container
};
/**
* \name Non-member operators
* \{
*/
/**
* \relates String
* \return a String containing the concatenation of lhs and rhs.
*/
String GD_CORE_API operator+(String lhs, const String &rhs);
/**
* \relates String
* \return a String containing the concatenation of lhs and rhs (rhs is
* converted to gd::String assuming it's encoded in UTF8).
*/
String GD_CORE_API operator+(String lhs, const char *rhs);
/**
* \relates String
* \return a String containing the concatenation of lhs and rhs (lhs is
* converted to gd::String assuming it's encoded in UTF8).
*/
String GD_CORE_API operator+(const char *lhs, const String &rhs);
const String& GD_CORE_API operator||(const String &lhs, const String &rhs);
String GD_CORE_API operator||(String lhs, const char *rhs);
/**
* \}
*/
/**
* \name Relational operators
* \{
*/
///\relates String
bool GD_CORE_API operator==( const String &lhs, const String &rhs );
///\relates String
bool GD_CORE_API operator==( const String &lhs, const char *rhs );
///\relates String
bool GD_CORE_API operator==( const char *lhs, const String &rhs );
///\relates String
bool GD_CORE_API operator!=( const String &lhs, const String &rhs );
///\relates String
bool GD_CORE_API operator!=( const String &lhs, const char *rhs );
///\relates String
bool GD_CORE_API operator!=( const char *lhs, const String &rhs );
///\relates String
bool GD_CORE_API operator<( const String &lhs, const String &rhs );
///\relates String
bool GD_CORE_API operator<( const String &lhs, const char *rhs );
///\relates String
bool GD_CORE_API operator<( const char *lhs, const String &rhs );
///\relates String
bool GD_CORE_API operator<=( const String &lhs, const String &rhs );
///\relates String
bool GD_CORE_API operator<=( const String &lhs, const char *rhs );
///\relates String
bool GD_CORE_API operator<=( const char *lhs, const String &rhs );
///\relates String
bool GD_CORE_API operator>( const String &lhs, const String &rhs );
///\relates String
bool GD_CORE_API operator>( const String &lhs, const char *rhs );
///\relates String
bool GD_CORE_API operator>( const char *lhs, const String &rhs );
///\relates String
bool GD_CORE_API operator>=( const String &lhs, const String &rhs );
///\relates String
bool GD_CORE_API operator>=( const String &lhs, const char *rhs );
///\relates String
bool GD_CORE_API operator>=( const char *lhs, const String &rhs );
/**
* \}
*/
/**
* \name Stream operators
* \{
*/
/**
* \relates String
* Outputs the string in a stream.
* \note The string is converted to the current locale before. If you want to stream the string
* as UTF8, do :
* \code
* std::cout << myString.Raw();
* \endcode
*/
std::ostream& GD_CORE_API operator<<(std::ostream &os, const String &str);
/**
* \relates String
* Extracts a string from an input string assuming the stream inputs characters encoded in the
* current locale.
* \note The content of the string is replaced.
*/
std::istream& GD_CORE_API operator>>(std::istream &is, String &str);
/**
* \}
*/
/**
* \relates String
* \param compat if true, the strings are normalized using a compatibility normalization form to remove characters special appearance.
* \return true if the two string are equivalent (in a case-sensitive way).
*/
bool GD_CORE_API CaseSensitiveEquiv( String lhs, String rhs, bool compat = true );
/**
* \relates String
* \param compat if true, the strings are normalized using a compatibility normalization form to remove characters special appearance.
* \return true if the two string are equivalent (in a case-insensitive way).
*/
bool GD_CORE_API CaseInsensitiveEquiv( const String &lhs, const String &rhs, bool compat = true );
}
namespace std
{
/**
* std::hash specialization for gd::String
*/
template <> struct GD_CORE_API hash<gd::String>
{
size_t operator()(const gd::String & x) const
{
return hash<std::string>()(x.Raw());
}
};
}
#endif
/**
* \class gd::String
*
* \section WhatIsUTF8 What is UTF8 and Unicode ?
* (from https://en.wikipedia.org/wiki/Unicode and https://en.wikipedia.org/wiki/UTF-8)
* Unicode is a computing industry standard for the consistent encoding, representation, and handling of text
* expressed in most of the world's writing systems.
* Unicode can be implemented by different character encodings. The most commonly used encodings are UTF-8, UTF-16
* and the now-obsolete UCS-2.
*
* UTF-8 is a character encoding capable of encoding all possible characters, or code points, in Unicode.
* The encoding is variable-length (not every codepoint is 1 byte long) and uses 8-bit code units. It was designed
* for backward compatibility with ASCII.
* UTF-8 encodes each of the 1,112,064 valid code points in the Unicode code space using one to four 8-bit bytes
* (a group of 8 bits is known as an octet in the Unicode Standard). Code points with lower numerical values
* (i.e., earlier code positions in the Unicode character set, which tend to occur more frequently) are encoded using
* fewer bytes. The first 128 characters of Unicode, which correspond one-to-one with ASCII, are encoded using a
* single octet with the same binary value as ASCII, making valid ASCII text valid UTF-8-encoded Unicode as well.
*
* \section Limitations Limitations
* The String class stores internally the string as an UTF8 encoded std::string. It results in some limitations : it's
* impossible to edit a single character with operator[]() nor at() because the new character length might not be the same.
*
* **The gd::String class supports almost all Unicode characters, except the ones that can't be represented as a single
* codepoint (obviously, a codepoint can be represented by 1 to 4 bytes, as codepoints are encoded in UTF8).** For examples,
* some special letters are composed of multiple codepoints (a letter, and the accents). Most of them can be combined into a
* single codepoint but some can't. These are the not supported ones. See \ref Normalization.
*
* \section Performance Performance
* The UTF8 encoding has the advantage to reduce the RAM consumption compared to UTF16 or UTF32 for strings using a lot
* of latin characters. But the characters variable length brings some performance issues compared to fixed size encoding.
* That's why the complexity of each methods is written in their documentation. For instance, the size() method is linear
* on the string size and so is the operator[]().
*
* \section Conversion Conversions from/to other string types
* The String handles implicit conversion with std::String (implicit constructor and implicit conversion
* operator).
*
* **However, this is not the case with std::string** as this conversion is not often lossless (mostly on Windows).
* You need to explicitly call gd::String::FromLocale or gd::String::FromUTF8
* to convert a std::string to a String. However, if you want to get a String object from a string literal, you can
* directly use the operator=() or the constructor as they are supporting const char* as argument (it assumes the string
* literal is encoded in UTF8, so you'll need to put the u8 prefix).
*
* \subsection Conversions2 Conversion from/to std::string
* \code
* //Get a String from a std::string encoded in the current locale
* std::string ansiStr = "Some beautiful localized characters. "; //Encoded in ANSI on Windows, UTF8 on Linux
* gd::String str = gd::String::FromLocale(ansiStr);
*
* //Create a String using a string literal encoded in UTF8
* gd::String anotherStr = u8"This is an UTF8 string";
* //The same as gd::String anotherStr = gd::utf8::FromUTF8(u8"This is an UTF8 string");
* //But it works only with string literals.
*
* gd::String finalStr = str + anotherStr; //Concatenates the two Strings
* std::cout << finalStr.ToLocale() << std::endl //Shows "Some beautiful localized characters. This is an UTF8 string"
* \endcode
*
* \section Normalization Normalization
* This class stores Unicode strings normalized with NFC which means that all characters are combined (if they can). For example, the "à"
* character can be written in two ways according to the Unicode norm : U+00E0 (the "à" in a single codepoint) or
* U+0061 (the "a" letter codepoint) + U+0300 "the "`" combining accent. We say that they are canonically equivalent.
* However, this can cause problem when comparing strings, that's why **this class normalizes the string when constructed** using
* the **Normalization Form Composition** (all characters are combined, e.g. "à" is represented by a single codepoint).
* If the string **is invalid when constructed, the string is not normalized** : it will be **normalized when the invalid characters
* will be removed using gd::String::ReplaceInvalid()**.
*
* \section CaseInsensitiveComparison Case-insensitive comparison
* In Unicode, uppercasing/lowercasing strings to compare them in a case-insensitive way is not recommended.
* That's why the function gd::CaseInsensitiveEquiv exists to compare two strings in a case-insensitive way.
*/