mptString.h 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446
  1. /*
  2. * mptString.h
  3. * ----------
  4. * Purpose: Small string-related utilities, number and message formatting.
  5. * Notes : Currently none.
  6. * Authors: OpenMPT Devs
  7. * The OpenMPT source code is released under the BSD license. Read LICENSE for more details.
  8. */
  9. #pragma once
  10. #include "openmpt/all/BuildSettings.hpp"
  11. #include "mpt/base/alloc.hpp"
  12. #include "mpt/base/span.hpp"
  13. #include "mpt/string/types.hpp"
  14. #include "mpt/string/utility.hpp"
  15. #include "mptBaseTypes.h"
  16. #include <algorithm>
  17. #include <limits>
  18. #include <string>
  19. #include <string_view>
  20. #include <cstring>
  21. OPENMPT_NAMESPACE_BEGIN
  22. namespace mpt
  23. {
  24. namespace String
  25. {
  26. template <typename Tstring, typename Tstring2, typename Tstring3>
  27. inline Tstring Replace(Tstring str, const Tstring2 &oldStr, const Tstring3 &newStr)
  28. {
  29. return mpt::replace(str, oldStr, newStr);
  30. }
  31. } // namespace String
  32. enum class Charset {
  33. UTF8,
  34. ASCII, // strictly 7-bit ASCII
  35. ISO8859_1,
  36. ISO8859_15,
  37. CP850,
  38. CP437,
  39. CP437AMS,
  40. CP437AMS2,
  41. Windows1252,
  42. Amiga,
  43. RISC_OS,
  44. ISO8859_1_no_C1,
  45. ISO8859_15_no_C1,
  46. Amiga_no_C1,
  47. #if defined(MPT_ENABLE_CHARSET_LOCALE)
  48. Locale, // CP_ACP on windows, current C locale otherwise
  49. #endif // MPT_ENABLE_CHARSET_LOCALE
  50. };
  51. // source code / preprocessor (i.e. # token)
  52. inline constexpr Charset CharsetSource = Charset::ASCII;
  53. // debug log files
  54. inline constexpr Charset CharsetLogfile = Charset::UTF8;
  55. // std::clog / std::cout / std::cerr
  56. #if defined(MODPLUG_TRACKER) && MPT_OS_WINDOWS && defined(MPT_ENABLE_CHARSET_LOCALE)
  57. inline constexpr Charset CharsetStdIO = Charset::Locale;
  58. #else
  59. inline constexpr Charset CharsetStdIO = Charset::UTF8;
  60. #endif
  61. // getenv
  62. #if defined(MPT_ENABLE_CHARSET_LOCALE)
  63. inline constexpr Charset CharsetEnvironment = Charset::Locale;
  64. #else
  65. inline constexpr Charset CharsetEnvironment = Charset::UTF8;
  66. #endif
  67. // std::exception::what()
  68. #if defined(MPT_ENABLE_CHARSET_LOCALE)
  69. inline constexpr Charset CharsetException = Charset::Locale;
  70. #else
  71. inline constexpr Charset CharsetException = Charset::UTF8;
  72. #endif
  73. // Checks if the std::string represents an UTF8 string.
  74. // This is currently implemented as converting to std::wstring and back assuming UTF8 both ways,
  75. // and comparing the result to the original string.
  76. // Caveats:
  77. // - can give false negatives because of possible unicode normalization during conversion
  78. // - can give false positives if the 8bit encoding contains high-ascii only in valid utf8 groups
  79. // - slow because of double conversion
  80. bool IsUTF8(const std::string &str);
  81. #if MPT_WSTRING_CONVERT
  82. // Convert to a wide character string.
  83. // The wide encoding is UTF-16 or UTF-32, based on sizeof(wchar_t).
  84. // If str does not contain any invalid characters, this conversion is lossless.
  85. // Invalid source bytes will be replaced by some replacement character or string.
  86. inline std::wstring ToWide(const std::wstring &str) { return str; }
  87. inline std::wstring ToWide(const wchar_t * str) { return (str ? std::wstring(str) : std::wstring()); }
  88. std::wstring ToWide(Charset from, const std::string &str);
  89. inline std::wstring ToWide(Charset from, const char * str) { return ToWide(from, str ? std::string(str) : std::string()); }
  90. #if defined(MPT_ENABLE_CHARSET_LOCALE)
  91. std::wstring ToWide(const mpt::lstring &str);
  92. #endif // MPT_ENABLE_CHARSET_LOCALE
  93. #endif
  94. // Convert to a string encoded in the 'to'-specified character set.
  95. // If str does not contain any invalid characters,
  96. // this conversion will be lossless iff, and only iff,
  97. // 'to' is UTF8.
  98. // Invalid source bytes or characters that are not representable in the
  99. // destination charset will be replaced by some replacement character or string.
  100. #if MPT_WSTRING_CONVERT
  101. std::string ToCharset(Charset to, const std::wstring &str);
  102. inline std::string ToCharset(Charset to, const wchar_t * str) { return ToCharset(to, str ? std::wstring(str) : std::wstring()); }
  103. #endif
  104. std::string ToCharset(Charset to, Charset from, const std::string &str);
  105. inline std::string ToCharset(Charset to, Charset from, const char * str) { return ToCharset(to, from, str ? std::string(str) : std::string()); }
  106. #if defined(MPT_ENABLE_CHARSET_LOCALE)
  107. std::string ToCharset(Charset to, const mpt::lstring &str);
  108. #endif // MPT_ENABLE_CHARSET_LOCALE
  109. #if defined(MPT_ENABLE_CHARSET_LOCALE)
  110. #if MPT_WSTRING_CONVERT
  111. mpt::lstring ToLocale(const std::wstring &str);
  112. inline mpt::lstring ToLocale(const wchar_t * str) { return ToLocale(str ? std::wstring(str): std::wstring()); }
  113. #endif
  114. mpt::lstring ToLocale(Charset from, const std::string &str);
  115. inline mpt::lstring ToLocale(Charset from, const char * str) { return ToLocale(from, str ? std::string(str): std::string()); }
  116. inline mpt::lstring ToLocale(const mpt::lstring &str) { return str; }
  117. #endif // MPT_ENABLE_CHARSET_LOCALE
  118. #if MPT_OS_WINDOWS
  119. #if MPT_WSTRING_CONVERT
  120. mpt::winstring ToWin(const std::wstring &str);
  121. inline mpt::winstring ToWin(const wchar_t * str) { return ToWin(str ? std::wstring(str): std::wstring()); }
  122. #endif
  123. mpt::winstring ToWin(Charset from, const std::string &str);
  124. inline mpt::winstring ToWin(Charset from, const char * str) { return ToWin(from, str ? std::string(str): std::string()); }
  125. #if defined(MPT_ENABLE_CHARSET_LOCALE)
  126. mpt::winstring ToWin(const mpt::lstring &str);
  127. #endif // MPT_ENABLE_CHARSET_LOCALE
  128. #endif // MPT_OS_WINDOWS
  129. #if defined(MPT_WITH_MFC)
  130. #if !(MPT_WSTRING_CONVERT)
  131. #error "MFC depends on MPT_WSTRING_CONVERT)"
  132. #endif
  133. // Convert to a MFC CString. The CString encoding depends on UNICODE.
  134. // This should also be used when converting to TCHAR strings.
  135. // If UNICODE is defined, this is a completely lossless operation.
  136. inline CString ToCString(const CString &str) { return str; }
  137. CString ToCString(const std::wstring &str);
  138. inline CString ToCString(const wchar_t * str) { return ToCString(str ? std::wstring(str) : std::wstring()); }
  139. CString ToCString(Charset from, const std::string &str);
  140. inline CString ToCString(Charset from, const char * str) { return ToCString(from, str ? std::string(str) : std::string()); }
  141. #if defined(MPT_ENABLE_CHARSET_LOCALE)
  142. CString ToCString(const mpt::lstring &str);
  143. mpt::lstring ToLocale(const CString &str);
  144. #endif // MPT_ENABLE_CHARSET_LOCALE
  145. #if MPT_OS_WINDOWS
  146. mpt::winstring ToWin(const CString &str);
  147. #endif // MPT_OS_WINDOWS
  148. // Convert from a MFC CString. The CString encoding depends on UNICODE.
  149. // This should also be used when converting from TCHAR strings.
  150. // If UNICODE is defined, this is a completely lossless operation.
  151. std::wstring ToWide(const CString &str);
  152. std::string ToCharset(Charset to, const CString &str);
  153. #endif // MPT_WITH_MFC
  154. #define UC_(x) MPT_UCHAR(x)
  155. #define UL_(x) MPT_ULITERAL(x)
  156. #define U_(x) MPT_USTRING(x)
  157. #if MPT_USTRING_MODE_WIDE
  158. #if !(MPT_WSTRING_CONVERT)
  159. #error "MPT_USTRING_MODE_WIDE depends on MPT_WSTRING_CONVERT)"
  160. #endif
  161. inline mpt::ustring ToUnicode(const std::wstring &str) { return str; }
  162. inline mpt::ustring ToUnicode(const wchar_t * str) { return (str ? std::wstring(str) : std::wstring()); }
  163. inline mpt::ustring ToUnicode(Charset from, const std::string &str) { return ToWide(from, str); }
  164. inline mpt::ustring ToUnicode(Charset from, const char * str) { return ToUnicode(from, str ? std::string(str) : std::string()); }
  165. #if defined(MPT_ENABLE_CHARSET_LOCALE)
  166. inline mpt::ustring ToUnicode(const mpt::lstring &str) { return ToWide(str); }
  167. #endif // MPT_ENABLE_CHARSET_LOCALE
  168. #if defined(MPT_WITH_MFC)
  169. inline mpt::ustring ToUnicode(const CString &str) { return ToWide(str); }
  170. #endif // MFC
  171. #else // !MPT_USTRING_MODE_WIDE
  172. inline mpt::ustring ToUnicode(const mpt::ustring &str) { return str; }
  173. #if MPT_WSTRING_CONVERT
  174. mpt::ustring ToUnicode(const std::wstring &str);
  175. inline mpt::ustring ToUnicode(const wchar_t * str) { return ToUnicode(str ? std::wstring(str) : std::wstring()); }
  176. #endif
  177. mpt::ustring ToUnicode(Charset from, const std::string &str);
  178. inline mpt::ustring ToUnicode(Charset from, const char * str) { return ToUnicode(from, str ? std::string(str) : std::string()); }
  179. #if defined(MPT_ENABLE_CHARSET_LOCALE)
  180. mpt::ustring ToUnicode(const mpt::lstring &str);
  181. #endif // MPT_ENABLE_CHARSET_LOCALE
  182. #if defined(MPT_WITH_MFC)
  183. mpt::ustring ToUnicode(const CString &str);
  184. #endif // MPT_WITH_MFC
  185. #endif // MPT_USTRING_MODE_WIDE
  186. #if MPT_USTRING_MODE_WIDE
  187. #if !(MPT_WSTRING_CONVERT)
  188. #error "MPT_USTRING_MODE_WIDE depends on MPT_WSTRING_CONVERT)"
  189. #endif
  190. // nothing, std::wstring overloads will catch all stuff
  191. #else // !MPT_USTRING_MODE_WIDE
  192. #if MPT_WSTRING_CONVERT
  193. std::wstring ToWide(const mpt::ustring &str);
  194. #endif
  195. std::string ToCharset(Charset to, const mpt::ustring &str);
  196. #if defined(MPT_ENABLE_CHARSET_LOCALE)
  197. mpt::lstring ToLocale(const mpt::ustring &str);
  198. #endif // MPT_ENABLE_CHARSET_LOCALE
  199. #if MPT_OS_WINDOWS
  200. mpt::winstring ToWin(const mpt::ustring &str);
  201. #endif // MPT_OS_WINDOWS
  202. #if defined(MPT_WITH_MFC)
  203. CString ToCString(const mpt::ustring &str);
  204. #endif // MPT_WITH_MFC
  205. #endif // MPT_USTRING_MODE_WIDE
  206. // The MPT_UTF8 allows specifying UTF8 char arrays.
  207. // The resulting type is mpt::ustring and the construction might require runtime translation,
  208. // i.e. it is NOT generally available at compile time.
  209. // Use explicit UTF8 encoding,
  210. // i.e. U+00FC (LATIN SMALL LETTER U WITH DIAERESIS) would be written as "\xC3\xBC".
  211. #define MPT_UTF8(x) mpt::ToUnicode(mpt::Charset::UTF8, x)
  212. mpt::ustring ToUnicode(uint16 codepage, mpt::Charset fallback, const std::string &str);
  213. char ToLowerCaseAscii(char c);
  214. char ToUpperCaseAscii(char c);
  215. std::string ToLowerCaseAscii(std::string s);
  216. std::string ToUpperCaseAscii(std::string s);
  217. int CompareNoCaseAscii(const char *a, const char *b, std::size_t n);
  218. int CompareNoCaseAscii(std::string_view a, std::string_view b);
  219. int CompareNoCaseAscii(const std::string &a, const std::string &b);
  220. #if defined(MODPLUG_TRACKER)
  221. mpt::ustring ToLowerCase(const mpt::ustring &s);
  222. mpt::ustring ToUpperCase(const mpt::ustring &s);
  223. #endif // MODPLUG_TRACKER
  224. } // namespace mpt
  225. // The AnyString types are meant to be used as function argument types only,
  226. // and only during the transition phase to all-unicode strings in the whole codebase.
  227. // Using an AnyString type as function argument avoids the need to overload a function for all the
  228. // different string types that we currently have.
  229. // Warning: These types will silently do charset conversions. Only use them when this can be tolerated.
  230. // BasicAnyString is convertable to mpt::ustring and constructable from any string at all.
  231. template <mpt::Charset charset = mpt::Charset::UTF8, bool tryUTF8 = true>
  232. class BasicAnyString : public mpt::ustring
  233. {
  234. private:
  235. static mpt::ustring From8bit(const std::string &str)
  236. {
  237. if constexpr(charset == mpt::Charset::UTF8)
  238. {
  239. return mpt::ToUnicode(mpt::Charset::UTF8, str);
  240. } else
  241. {
  242. // auto utf8 detection
  243. if constexpr(tryUTF8)
  244. {
  245. if(mpt::IsUTF8(str))
  246. {
  247. return mpt::ToUnicode(mpt::Charset::UTF8, str);
  248. } else
  249. {
  250. return mpt::ToUnicode(charset, str);
  251. }
  252. } else
  253. {
  254. return mpt::ToUnicode(charset, str);
  255. }
  256. }
  257. }
  258. public:
  259. // 8 bit
  260. BasicAnyString(const char *str) : mpt::ustring(From8bit(str ? str : std::string())) { }
  261. BasicAnyString(const std::string str) : mpt::ustring(From8bit(str)) { }
  262. // locale
  263. #if defined(MPT_ENABLE_CHARSET_LOCALE)
  264. BasicAnyString(const mpt::lstring str) : mpt::ustring(mpt::ToUnicode(str)) { }
  265. #endif // MPT_ENABLE_CHARSET_LOCALE
  266. // unicode
  267. BasicAnyString(const mpt::ustring &str) : mpt::ustring(str) { }
  268. BasicAnyString(mpt::ustring &&str) : mpt::ustring(std::move(str)) { }
  269. #if MPT_USTRING_MODE_UTF8 && MPT_WSTRING_CONVERT
  270. BasicAnyString(const std::wstring &str) : mpt::ustring(mpt::ToUnicode(str)) { }
  271. #endif
  272. #if MPT_WSTRING_CONVERT
  273. BasicAnyString(const wchar_t *str) : mpt::ustring(str ? mpt::ToUnicode(str) : mpt::ustring()) { }
  274. #endif
  275. // mfc
  276. #if defined(MPT_WITH_MFC)
  277. BasicAnyString(const CString &str) : mpt::ustring(mpt::ToUnicode(str)) { }
  278. #endif // MPT_WITH_MFC
  279. // fallback for custom string types
  280. template <typename Tstring> BasicAnyString(const Tstring &str) : mpt::ustring(mpt::ToUnicode(str)) { }
  281. template <typename Tstring> BasicAnyString(Tstring &&str) : mpt::ustring(mpt::ToUnicode(std::forward<Tstring>(str))) { }
  282. };
  283. // AnyUnicodeString is convertable to mpt::ustring and constructable from any unicode string,
  284. class AnyUnicodeString : public mpt::ustring
  285. {
  286. public:
  287. // locale
  288. #if defined(MPT_ENABLE_CHARSET_LOCALE)
  289. AnyUnicodeString(const mpt::lstring &str) : mpt::ustring(mpt::ToUnicode(str)) { }
  290. #endif // MPT_ENABLE_CHARSET_LOCALE
  291. // unicode
  292. AnyUnicodeString(const mpt::ustring &str) : mpt::ustring(str) { }
  293. AnyUnicodeString(mpt::ustring &&str) : mpt::ustring(std::move(str)) { }
  294. #if MPT_USTRING_MODE_UTF8 && MPT_WSTRING_CONVERT
  295. AnyUnicodeString(const std::wstring &str) : mpt::ustring(mpt::ToUnicode(str)) { }
  296. #endif
  297. #if MPT_WSTRING_CONVERT
  298. AnyUnicodeString(const wchar_t *str) : mpt::ustring(str ? mpt::ToUnicode(str) : mpt::ustring()) { }
  299. #endif
  300. // mfc
  301. #if defined(MPT_WITH_MFC)
  302. AnyUnicodeString(const CString &str) : mpt::ustring(mpt::ToUnicode(str)) { }
  303. #endif // MPT_WITH_MFC
  304. // fallback for custom string types
  305. template <typename Tstring> AnyUnicodeString(const Tstring &str) : mpt::ustring(mpt::ToUnicode(str)) { }
  306. template <typename Tstring> AnyUnicodeString(Tstring &&str) : mpt::ustring(mpt::ToUnicode(std::forward<Tstring>(str))) { }
  307. };
  308. // AnyString
  309. // Try to do the smartest auto-magic we can do.
  310. #if defined(MPT_ENABLE_CHARSET_LOCALE)
  311. using AnyString = BasicAnyString<mpt::Charset::Locale, true>;
  312. #elif MPT_OS_WINDOWS
  313. using AnyString = BasicAnyString<mpt::Charset::Windows1252, true>;
  314. #else
  315. using AnyString = BasicAnyString<mpt::Charset::ISO8859_1, true>;
  316. #endif
  317. // AnyStringLocale
  318. // char-based strings are assumed to be in locale encoding.
  319. #if defined(MPT_ENABLE_CHARSET_LOCALE)
  320. using AnyStringLocale = BasicAnyString<mpt::Charset::Locale, false>;
  321. #else
  322. using AnyStringLocale = BasicAnyString<mpt::Charset::UTF8, false>;
  323. #endif
  324. // AnyStringUTF8orLocale
  325. // char-based strings are tried in UTF8 first, if this fails, locale is used.
  326. #if defined(MPT_ENABLE_CHARSET_LOCALE)
  327. using AnyStringUTF8orLocale = BasicAnyString<mpt::Charset::Locale, true>;
  328. #else
  329. using AnyStringUTF8orLocale = BasicAnyString<mpt::Charset::UTF8, false>;
  330. #endif
  331. // AnyStringUTF8
  332. // char-based strings are assumed to be in UTF8.
  333. using AnyStringUTF8 = BasicAnyString<mpt::Charset::UTF8, false>;
  334. OPENMPT_NAMESPACE_END