util.hpp 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214
  1. //
  2. // Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
  3. //
  4. // Distributed under the Boost Software License, Version 1.0.
  5. // https://www.boost.org/LICENSE_1_0.txt
  6. #ifndef BOOST_LOCALE_UTIL_HPP
  7. #define BOOST_LOCALE_UTIL_HPP
  8. #include <boost/locale/generator.hpp>
  9. #include <boost/locale/utf.hpp>
  10. #include <boost/assert.hpp>
  11. #include <boost/cstdint.hpp>
  12. #include <locale>
  13. #include <memory>
  14. #include <typeinfo>
  15. namespace boost { namespace locale {
  16. /// \brief This namespace provides various utility functions useful for Boost.Locale's backend
  17. /// implementations
  18. namespace util {
  19. /// \brief Return default system locale name in POSIX format.
  20. ///
  21. /// This function tries to detect the locale using LC_ALL, LC_CTYPE and LANG environment
  22. /// variables in this order and if all of them are unset, on POSIX platforms it returns "C".
  23. /// On Windows additionally to the above environment variables, this function
  24. /// tries to create the locale name from ISO-639 and ISO-3166 country codes defined
  25. /// for the users default locale.
  26. /// If \a use_utf8_on_windows is true it sets the encoding to UTF-8,
  27. /// otherwise, if the system locale supports ANSI codepages it defines the ANSI encoding, e.g. windows-1252,
  28. /// otherwise (if ANSI codepage is not available) it uses UTF-8 encoding.
  29. BOOST_LOCALE_DECL
  30. std::string get_system_locale(bool use_utf8_on_windows = false);
  31. /// \brief Installs information facet to locale \a in based on locale name \a name
  32. ///
  33. /// This function installs boost::locale::info facet into the locale \a in and returns
  34. /// newly created locale.
  35. ///
  36. /// Note: all information is based only on parsing of string \a name;
  37. ///
  38. /// The name has following format: language[_COUNTRY][.encoding][\@variant]
  39. /// Where language is ISO-639 language code like "en" or "ru", COUNTRY is ISO-3166
  40. /// country identifier like "US" or "RU". the Encoding is a character set name
  41. /// like UTF-8 or ISO-8859-1. Variant is backend specific variant like \c euro or
  42. /// calendar=hebrew.
  43. ///
  44. /// If some parameters are missing they are specified as blanks, default encoding
  45. /// is assumed to be US-ASCII and missing language is assumed to be "C"
  46. BOOST_LOCALE_DECL
  47. std::locale create_info(const std::locale& in, const std::string& name);
  48. /// \brief This class represent a simple stateless converter from UCS-4 and to UCS-4 for
  49. /// each single code point
  50. ///
  51. /// This class is used for creation of std::codecvt facet for converting utf-16/utf-32 encoding
  52. /// to encoding supported by this converter
  53. ///
  54. /// Please note, this converter should be fully stateless. Fully stateless means it should
  55. /// never assume that it is called in any specific order on the text. Even if the
  56. /// encoding itself seems to be stateless like windows-1255 or shift-jis, some
  57. /// encoders (most notably iconv) can actually compose several code-point into one or
  58. /// decompose them in case composite characters are found. So be very careful when implementing
  59. /// these converters for certain character set.
  60. class BOOST_LOCALE_DECL base_converter {
  61. public:
  62. /// This value should be returned when an illegal input sequence or code-point is observed:
  63. /// For example if a UCS-32 code-point is in the range reserved for UTF-16 surrogates
  64. /// or an invalid UTF-8 sequence is found
  65. static constexpr uint32_t illegal = utf::illegal;
  66. /// This value is returned in following cases: The of incomplete input sequence was found or
  67. /// insufficient output buffer was provided so complete output could not be written.
  68. static constexpr uint32_t incomplete = utf::incomplete;
  69. virtual ~base_converter();
  70. /// Return the maximal length that one Unicode code-point can be converted to, for example
  71. /// for UTF-8 it is 4, for Shift-JIS it is 2 and ISO-8859-1 is 1
  72. virtual int max_len() const { return 1; }
  73. /// Returns true if calling the functions from_unicode, to_unicode, and max_len is thread safe.
  74. ///
  75. /// Rule of thumb: if this class' implementation uses simple tables that are unchanged
  76. /// or is purely algorithmic like UTF-8 - so it does not share any mutable bit for
  77. /// independent to_unicode, from_unicode calls, you may set it to true, otherwise,
  78. /// for example if you use iconv_t descriptor or UConverter as conversion object return false,
  79. /// and this object will be cloned for each use.
  80. virtual bool is_thread_safe() const { return false; }
  81. /// Create a polymorphic copy of this object, usually called only if is_thread_safe() return false
  82. virtual base_converter* clone() const
  83. {
  84. BOOST_ASSERT(typeid(*this) == typeid(base_converter));
  85. return new base_converter();
  86. }
  87. /// Convert a single character starting at begin and ending at most at end to Unicode code-point.
  88. ///
  89. /// if valid input sequence found in [\a begin,\a code_point_end) such as \a begin < \a code_point_end && \a
  90. /// code_point_end <= \a end it is converted to its Unicode code point equivalent, \a begin is set to \a
  91. /// code_point_end
  92. ///
  93. /// if incomplete input sequence found in [\a begin,\a end), i.e. there my be such \a code_point_end that \a
  94. /// code_point_end > \a end and [\a begin, \a code_point_end) would be valid input sequence, then \a
  95. /// incomplete is returned begin stays unchanged, for example for UTF-8 conversion a *begin = 0xc2, \a begin
  96. /// +1 = \a end is such situation.
  97. ///
  98. /// if invalid input sequence found, i.e. there is a sequence [\a begin, \a code_point_end) such as \a
  99. /// code_point_end <= \a end that is illegal for this encoding, \a illegal is returned and begin stays
  100. /// unchanged. For example if *begin = 0xFF and begin < end for UTF-8, then \a illegal is returned.
  101. virtual uint32_t to_unicode(const char*& begin, const char* end)
  102. {
  103. if(begin == end)
  104. return incomplete;
  105. unsigned char cp = *begin;
  106. if(cp <= 0x7F) {
  107. begin++;
  108. return cp;
  109. }
  110. return illegal;
  111. }
  112. /// Convert a single code-point \a u into encoding and store it in [begin,end) range.
  113. ///
  114. /// If u is invalid Unicode code-point, or it can not be mapped correctly to represented character set,
  115. /// \a illegal should be returned
  116. ///
  117. /// If u can be converted to a sequence of bytes c1, ... , cN (1<= N <= max_len() ) then
  118. ///
  119. /// -# If end - begin >= N, c1, ... cN are written starting at begin and N is returned
  120. /// -# If end - begin < N, incomplete is returned, it is unspecified what would be
  121. /// stored in bytes in range [begin,end)
  122. virtual uint32_t from_unicode(uint32_t u, char* begin, const char* end)
  123. {
  124. if(begin == end)
  125. return incomplete;
  126. if(u >= 0x80)
  127. return illegal;
  128. *begin = static_cast<char>(u);
  129. return 1;
  130. }
  131. };
  132. /// This function creates a \a base_converter that can be used for conversion between UTF-8 and
  133. /// unicode code points
  134. BOOST_LOCALE_DECL std::unique_ptr<base_converter> create_utf8_converter();
  135. BOOST_DEPRECATED("This function is deprecated, use 'create_utf8_converter()'")
  136. inline std::unique_ptr<base_converter> create_utf8_converter_unique_ptr()
  137. {
  138. return create_utf8_converter();
  139. }
  140. /// This function creates a \a base_converter that can be used for conversion between single byte
  141. /// character encodings like ISO-8859-1, koi8-r, windows-1255 and Unicode code points,
  142. ///
  143. /// If \a encoding is not supported, empty pointer is returned.
  144. /// So you should check whether the returned pointer is valid/non-NULL
  145. BOOST_LOCALE_DECL std::unique_ptr<base_converter> create_simple_converter(const std::string& encoding);
  146. BOOST_DEPRECATED("This function is deprecated, use 'create_simple_converter()'")
  147. inline std::unique_ptr<base_converter> create_simple_converter_unique_ptr(const std::string& encoding)
  148. {
  149. return create_simple_converter(encoding);
  150. }
  151. /// Install codecvt facet into locale \a in and return new locale that is based on \a in and uses new
  152. /// facet.
  153. ///
  154. /// codecvt facet would convert between narrow and wide/char16_t/char32_t encodings using \a cvt converter.
  155. /// If \a cvt is null pointer, always failure conversion would be used that fails on every first input or
  156. /// output.
  157. ///
  158. /// Note: the codecvt facet handles both UTF-16 and UTF-32 wide encodings, it knows to break and join
  159. /// Unicode code-points above 0xFFFF to and from surrogate pairs correctly. \a cvt should be unaware
  160. /// of wide encoding type
  161. BOOST_LOCALE_DECL
  162. std::locale create_codecvt(const std::locale& in, std::unique_ptr<base_converter> cvt, char_facet_t type);
  163. BOOST_DEPRECATED("This function is deprecated, use 'create_codecvt()'")
  164. inline std::locale create_codecvt_from_pointer(const std::locale& in, base_converter* cvt, char_facet_t type)
  165. {
  166. return create_codecvt(in, std::unique_ptr<base_converter>(cvt), type);
  167. }
  168. /// This function creates a \a base_converter that can be used for conversion between UTF-8 and
  169. /// unicode code points
  170. BOOST_LOCALE_DECL base_converter* create_utf8_converter_new_ptr();
  171. /// This function creates a \a base_converter that can be used for conversion between single byte
  172. /// character encodings like ISO-8859-1, koi8-r, windows-1255 and Unicode code points,
  173. ///
  174. /// If \a encoding is not supported, empty pointer is returned. You should check if
  175. /// the returned pointer is NULL.
  176. BOOST_LOCALE_DECL base_converter* create_simple_converter_new_ptr(const std::string& encoding);
  177. /// Install utf8 codecvt to UTF-16 or UTF-32 into locale \a in and return
  178. /// new locale that is based on \a in and uses new facet.
  179. BOOST_LOCALE_DECL
  180. std::locale create_utf8_codecvt(const std::locale& in, char_facet_t type);
  181. /// This function installs codecvt that can be used for conversion between single byte
  182. /// character encodings like ISO-8859-1, koi8-r, windows-1255 and Unicode code points,
  183. ///
  184. /// Throws boost::locale::conv::invalid_charset_error if the character set is not supported or isn't single byte
  185. /// character set
  186. BOOST_LOCALE_DECL
  187. std::locale create_simple_codecvt(const std::locale& in, const std::string& encoding, char_facet_t type);
  188. } // namespace util
  189. }} // namespace boost::locale
  190. #endif