generic_codecvt.hpp 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508
  1. //
  2. // Copyright (c) 2015 Artyom Beilis (Tonkikh)
  3. //
  4. // Distributed under the Boost Software License, Version 1.0.
  5. // https://www.boost.org/LICENSE_1_0.txt
  6. #ifndef BOOST_LOCALE_GENERIC_CODECVT_HPP
  7. #define BOOST_LOCALE_GENERIC_CODECVT_HPP
  #include <boost/locale/utf.hpp>
  #include <boost/cstdint.hpp>
  #include <cstring>
  #include <locale>
  11. namespace boost { namespace locale {
  #ifndef BOOST_LOCALE_DOXYGEN
  //
  // Make sure that mbstate can keep 16 bit of UTF-16 sequence
  //
  static_assert(sizeof(std::mbstate_t) >= 2, "std::mbstate_t is too small");
  #endif

  #if defined(_MSC_VER) && _MSC_VER < 1700
  // MSVC releases with _MSC_VER below 1700 have a non-standard do_length: it counts wide characters
  // instead of narrow ones and does not change mbstate, hence the state parameter is declared const there
  #  define BOOST_LOCALE_DO_LENGTH_MBSTATE_CONST
  #endif
  /// \brief Base class that holds the constants shared by all generic_codecvt specializations
  class generic_codecvt_base {
  public:
      /// Direction a conversion state is created for; passed to `initial_state` of the
      /// implementation class before converting to or from Unicode code points
      enum initial_convertion_state {
          to_unicode_state,  ///< The state would be used by to_unicode functions
          from_unicode_state ///< The state would be used by from_unicode functions
      };
  };
  /// \brief Generic codecvt facet for various stateless encodings to UTF-16 and UTF-32 using wchar_t, char32_t
  /// and char16_t
  ///
  /// Implementations should derive from this class, passing themselves as CodecvtImpl, and provide the following
  /// members. All of them are invoked through a const reference to the implementation, so the member functions
  /// have to be const.
  ///
  /// - `state_type` - a type of special object that allows to store intermediate cached data, for example `iconv_t`
  ///   descriptor
  /// - `state_type initial_state(generic_codecvt_base::initial_convertion_state direction) const` - member function
  ///   that creates initial state
  /// - `int max_encoding_length() const` - the maximal number of chars that represent one Unicode code point, for
  ///   example it is 4 for UTF-8 and 1 for ISO-8859-1
  /// - `utf::code_point to_unicode(state_type &state,char const *&begin,char const *end) const` - extract the first
  ///   code point from the text in range [begin,end). In case of success begin would point to the next character
  ///   sequence to be decoded to the next code point, in case of an incomplete sequence utf::incomplete shall be
  ///   returned, and in case of an invalid input sequence utf::illegal shall be returned and begin would remain
  ///   unmodified
  /// - `utf::code_point from_unicode(state_type &state,utf::code_point u,char *begin,char const *end) const` -
  ///   convert a unicode code point `u` into a character sequence at [begin,end). Return the length of the sequence
  ///   in case of success, utf::incomplete in case of not enough room to encode the code point, or utf::illegal in
  ///   case the conversion can not be performed
  ///
  /// \tparam CharType wide character type of the facet
  /// \tparam CodecvtImpl the deriving implementation class (CRTP)
  /// \tparam CharSize selects the specialization (1, 2 or 4); defaults to sizeof(CharType)
  ///
  /// For example implementation of codecvt for latin1/ISO-8859-1 character set
  ///
  /// \code
  ///
  /// template<typename CharType>
  /// class latin1_codecvt : public boost::locale::generic_codecvt<CharType,latin1_codecvt<CharType> >
  /// {
  /// public:
  ///
  ///     /* Standard codecvt constructor */
  ///     latin1_codecvt(size_t refs = 0): boost::locale::generic_codecvt<CharType,latin1_codecvt<CharType> >(refs)
  ///     {
  ///     }
  ///
  ///     /* State is unused but required by generic_codecvt */
  ///     struct state_type {};
  ///
  ///     state_type initial_state(generic_codecvt_base::initial_convertion_state /*unused*/) const
  ///     {
  ///         return state_type();
  ///     }
  ///
  ///     int max_encoding_length() const
  ///     {
  ///         return 1;
  ///     }
  ///
  ///     boost::locale::utf::code_point to_unicode(state_type &,char const *&begin,char const *end) const
  ///     {
  ///         if(begin == end)
  ///             return boost::locale::utf::incomplete;
  ///         /* go through unsigned char: plain char may be signed and bytes >= 0x80 must not turn negative */
  ///         return static_cast<unsigned char>(*begin++);
  ///     }
  ///
  ///     boost::locale::utf::code_point from_unicode(state_type &,boost::locale::utf::code_point u,char *begin,
  ///                                                 char const *end) const
  ///     {
  ///         if(u >= 256)
  ///             return boost::locale::utf::illegal;
  ///         if(begin == end)
  ///             return boost::locale::utf::incomplete;
  ///         *begin = static_cast<char>(u);
  ///         return 1;
  ///     }
  /// };
  ///
  /// \endcode
  ///
  /// When external tools are used for encoding conversion, the `state_type` is useful to save objects used for
  /// conversions. For example, icu::UConverter can be saved in such a state for an efficient use:
  ///
  /// \code
  /// template<typename CharType>
  /// class icu_codecvt : public boost::locale::generic_codecvt<CharType,icu_codecvt<CharType> >
  /// {
  /// public:
  ///
  ///     /* Standard codecvt constructor */
  ///     icu_codecvt(std::string const &name,size_t refs = 0):
  ///         boost::locale::generic_codecvt<CharType,icu_codecvt<CharType> >(refs)
  ///     { ... }
  ///
  ///     /* State owns a clone of the converter that is created for every conversion call */
  ///     typedef std::unique_ptr<UConverter,void (*)(UConverter*)> state_type;
  ///
  ///     state_type initial_state(generic_codecvt_base::initial_convertion_state /*unused*/) const
  ///     {
  ///         UErrorCode err = U_ZERO_ERROR;
  ///         return state_type(ucnv_safeClone(converter_,0,0,&err),ucnv_close);
  ///     }
  ///
  ///     boost::locale::utf::code_point to_unicode(state_type &ptr,char const *&begin,char const *end) const
  ///     {
  ///         UErrorCode err = U_ZERO_ERROR;
  ///         boost::locale::utf::code_point cp = ucnv_getNextUChar(ptr.get(),&begin,end,&err);
  ///         ...
  ///     }
  ///     ...
  /// };
  /// \endcode
  ///
  template<typename CharType, typename CodecvtImpl, int CharSize = sizeof(CharType)>
  class generic_codecvt;
  137. /// \brief UTF-16 to/from UTF-8 codecvt facet to use with char16_t or wchar_t on Windows
  138. ///
  139. /// Note in order to fit the requirements of usability by std::wfstream it uses mbstate_t
  140. /// to handle intermediate states in handling of variable length UTF-16 sequences
  141. ///
  142. /// Its member functions implement standard virtual functions of basic codecvt
  143. template<typename CharType, typename CodecvtImpl>
  144. class generic_codecvt<CharType, CodecvtImpl, 2> : public std::codecvt<CharType, char, std::mbstate_t>,
  145. public generic_codecvt_base {
  146. public:
  147. typedef CharType uchar;
  148. generic_codecvt(size_t refs = 0) : std::codecvt<CharType, char, std::mbstate_t>(refs) {}
  149. const CodecvtImpl& implementation() const { return *static_cast<const CodecvtImpl*>(this); }
  150. protected:
  151. std::codecvt_base::result do_unshift(std::mbstate_t& s, char* from, char* /*to*/, char*& next) const override
  152. {
  153. boost::uint16_t& state = *reinterpret_cast<boost::uint16_t*>(&s);
  154. if(state != 0)
  155. return std::codecvt_base::error;
  156. next = from;
  157. return std::codecvt_base::ok;
  158. }
  159. int do_encoding() const noexcept override { return 0; }
  160. int do_max_length() const noexcept override { return implementation().max_encoding_length(); }
  161. bool do_always_noconv() const noexcept override { return false; }
  162. int do_length(
  163. #ifdef BOOST_LOCALE_DO_LENGTH_MBSTATE_CONST
  164. const
  165. #endif
  166. std::mbstate_t& std_state,
  167. const char* from,
  168. const char* from_end,
  169. size_t max) const override
  170. {
  171. #ifndef BOOST_LOCALE_DO_LENGTH_MBSTATE_CONST
  172. const char* save_from = from;
  173. boost::uint16_t& state = *reinterpret_cast<boost::uint16_t*>(&std_state);
  174. #else
  175. const size_t start_max = max;
  176. boost::uint16_t state = *reinterpret_cast<const boost::uint16_t*>(&std_state);
  177. #endif
  178. typename CodecvtImpl::state_type cvt_state =
  179. implementation().initial_state(generic_codecvt_base::to_unicode_state);
  180. while(max > 0 && from < from_end) {
  181. const char* prev_from = from;
  182. boost::uint32_t ch = implementation().to_unicode(cvt_state, from, from_end);
  183. if(ch == boost::locale::utf::incomplete || ch == boost::locale::utf::illegal) {
  184. from = prev_from;
  185. break;
  186. }
  187. max--;
  188. if(ch > 0xFFFF) {
  189. if(state == 0) {
  190. from = prev_from;
  191. state = 1;
  192. } else {
  193. state = 0;
  194. }
  195. }
  196. }
  197. #ifndef BOOST_LOCALE_DO_LENGTH_MBSTATE_CONST
  198. return static_cast<int>(from - save_from);
  199. #else
  200. return static_cast<int>(start_max - max);
  201. #endif
  202. }
  203. std::codecvt_base::result do_in(std::mbstate_t& std_state,
  204. const char* from,
  205. const char* from_end,
  206. const char*& from_next,
  207. uchar* to,
  208. uchar* to_end,
  209. uchar*& to_next) const override
  210. {
  211. std::codecvt_base::result r = std::codecvt_base::ok;
  212. // mbstate_t is POD type and should be initialized to 0 (i.a. state = stateT())
  213. // according to standard. We use it to keep a flag 0/1 for surrogate pair writing
  214. //
  215. // if 0 no code above >0xFFFF observed, of 1 a code above 0xFFFF observed
  216. // and first pair is written, but no input consumed
  217. boost::uint16_t& state = *reinterpret_cast<boost::uint16_t*>(&std_state);
  218. typename CodecvtImpl::state_type cvt_state =
  219. implementation().initial_state(generic_codecvt_base::to_unicode_state);
  220. while(to < to_end && from < from_end) {
  221. const char* from_saved = from;
  222. uint32_t ch = implementation().to_unicode(cvt_state, from, from_end);
  223. if(ch == boost::locale::utf::illegal) {
  224. from = from_saved;
  225. r = std::codecvt_base::error;
  226. break;
  227. }
  228. if(ch == boost::locale::utf::incomplete) {
  229. from = from_saved;
  230. r = std::codecvt_base::partial;
  231. break;
  232. }
  233. // Normal codepoints go directly to stream
  234. if(ch <= 0xFFFF) {
  235. *to++ = static_cast<uchar>(ch);
  236. } else {
  237. // For other codepoints we do the following
  238. //
  239. // 1. We can't consume our input as we may find ourselves
  240. // in state where all input consumed but not all output written,i.e. only
  241. // 1st pair is written
  242. // 2. We only write first pair and mark this in the state, we also revert back
  243. // the from pointer in order to make sure this codepoint would be read
  244. // once again and then we would consume our input together with writing
  245. // second surrogate pair
  246. ch -= 0x10000;
  247. boost::uint16_t w1 = static_cast<boost::uint16_t>(0xD800 | (ch >> 10));
  248. boost::uint16_t w2 = static_cast<boost::uint16_t>(0xDC00 | (ch & 0x3FF));
  249. if(state == 0) {
  250. from = from_saved;
  251. *to++ = w1;
  252. state = 1;
  253. } else {
  254. *to++ = w2;
  255. state = 0;
  256. }
  257. }
  258. }
  259. from_next = from;
  260. to_next = to;
  261. if(r == std::codecvt_base::ok && (from != from_end || state != 0))
  262. r = std::codecvt_base::partial;
  263. return r;
  264. }
  265. std::codecvt_base::result do_out(std::mbstate_t& std_state,
  266. const uchar* from,
  267. const uchar* from_end,
  268. const uchar*& from_next,
  269. char* to,
  270. char* to_end,
  271. char*& to_next) const override
  272. {
  273. std::codecvt_base::result r = std::codecvt_base::ok;
  274. // mbstate_t is POD type and should be initialized to 0 (i.a. state = stateT())
  275. // according to standard. We assume that sizeof(mbstate_t) >=2 in order
  276. // to be able to store first observed surrogate pair
  277. //
  278. // State: state!=0 - a first surrogate pair was observed (state = first pair),
  279. // we expect the second one to come and then zero the state
  280. boost::uint16_t& state = *reinterpret_cast<boost::uint16_t*>(&std_state);
  281. typename CodecvtImpl::state_type cvt_state =
  282. implementation().initial_state(generic_codecvt_base::from_unicode_state);
  283. while(to < to_end && from < from_end) {
  284. boost::uint32_t ch = 0;
  285. if(state != 0) {
  286. // if the state indicates that 1st surrogate pair was written
  287. // we should make sure that the second one that comes is actually
  288. // second surrogate
  289. boost::uint16_t w1 = state;
  290. boost::uint16_t w2 = *from;
  291. // we don't forward from as writing may fail to incomplete or
  292. // partial conversion
  293. if(0xDC00 <= w2 && w2 <= 0xDFFF) {
  294. boost::uint16_t vh = w1 - 0xD800;
  295. boost::uint16_t vl = w2 - 0xDC00;
  296. ch = ((uint32_t(vh) << 10) | vl) + 0x10000;
  297. } else {
  298. // Invalid surrogate
  299. r = std::codecvt_base::error;
  300. break;
  301. }
  302. } else {
  303. ch = *from;
  304. if(0xD800 <= ch && ch <= 0xDBFF) {
  305. // if this is a first surrogate pair we put
  306. // it into the state and consume it, note we don't
  307. // go forward as it should be illegal so we increase
  308. // the from pointer manually
  309. state = static_cast<uint16_t>(ch);
  310. from++;
  311. continue;
  312. } else if(0xDC00 <= ch && ch <= 0xDFFF) {
  313. // if we observe second surrogate pair and
  314. // first only may be expected we should break from the loop with error
  315. // as it is illegal input
  316. r = std::codecvt_base::error;
  317. break;
  318. }
  319. }
  320. if(!boost::locale::utf::is_valid_codepoint(ch)) {
  321. r = std::codecvt_base::error;
  322. break;
  323. }
  324. boost::uint32_t len = implementation().from_unicode(cvt_state, ch, to, to_end);
  325. if(len == boost::locale::utf::incomplete) {
  326. r = std::codecvt_base::partial;
  327. break;
  328. } else if(len == boost::locale::utf::illegal) {
  329. r = std::codecvt_base::error;
  330. break;
  331. } else
  332. to += len;
  333. state = 0;
  334. from++;
  335. }
  336. from_next = from;
  337. to_next = to;
  338. if(r == std::codecvt_base::ok && (from != from_end || state != 0))
  339. r = std::codecvt_base::partial;
  340. return r;
  341. }
  342. };
  343. /// \brief UTF-32 to/from UTF-8 codecvt facet to use with char32_t or wchar_t on POSIX platforms
  344. ///
  345. /// Its member functions implement standard virtual functions of basic codecvt.
  346. /// mbstate_t is not used for UTF-32 handling due to fixed length encoding
  347. template<typename CharType, typename CodecvtImpl>
  348. class generic_codecvt<CharType, CodecvtImpl, 4> : public std::codecvt<CharType, char, std::mbstate_t>,
  349. public generic_codecvt_base {
  350. public:
  351. typedef CharType uchar;
  352. generic_codecvt(size_t refs = 0) : std::codecvt<CharType, char, std::mbstate_t>(refs) {}
  353. const CodecvtImpl& implementation() const { return *static_cast<const CodecvtImpl*>(this); }
  354. protected:
  355. std::codecvt_base::result
  356. do_unshift(std::mbstate_t& /*s*/, char* from, char* /*to*/, char*& next) const override
  357. {
  358. next = from;
  359. return std::codecvt_base::ok;
  360. }
  361. int do_encoding() const noexcept override { return 0; }
  362. int do_max_length() const noexcept override { return implementation().max_encoding_length(); }
  363. bool do_always_noconv() const noexcept override { return false; }
  364. int do_length(
  365. #ifdef BOOST_LOCALE_DO_LENGTH_MBSTATE_CONST
  366. const
  367. #endif
  368. std::mbstate_t& /*state*/,
  369. const char* from,
  370. const char* from_end,
  371. size_t max) const override
  372. {
  373. #ifndef BOOST_LOCALE_DO_LENGTH_MBSTATE_CONST
  374. const char* start_from = from;
  375. #else
  376. const size_t start_max = max;
  377. #endif
  378. typename CodecvtImpl::state_type cvt_state =
  379. implementation().initial_state(generic_codecvt_base::to_unicode_state);
  380. while(max > 0 && from < from_end) {
  381. const char* save_from = from;
  382. boost::uint32_t ch = implementation().to_unicode(cvt_state, from, from_end);
  383. if(ch == boost::locale::utf::incomplete || ch == boost::locale::utf::illegal) {
  384. from = save_from;
  385. break;
  386. }
  387. max--;
  388. }
  389. #ifndef BOOST_LOCALE_DO_LENGTH_MBSTATE_CONST
  390. return static_cast<int>(from - start_from);
  391. #else
  392. return static_cast<int>(start_max - max);
  393. #endif
  394. }
  395. std::codecvt_base::result do_in(std::mbstate_t& /*state*/,
  396. const char* from,
  397. const char* from_end,
  398. const char*& from_next,
  399. uchar* to,
  400. uchar* to_end,
  401. uchar*& to_next) const override
  402. {
  403. std::codecvt_base::result r = std::codecvt_base::ok;
  404. // mbstate_t is POD type and should be initialized to 0 (i.a. state = stateT())
  405. // according to standard. We use it to keep a flag 0/1 for surrogate pair writing
  406. //
  407. // if 0 no code above >0xFFFF observed, of 1 a code above 0xFFFF observed
  408. // and first pair is written, but no input consumed
  409. auto cvt_state = implementation().initial_state(generic_codecvt_base::to_unicode_state);
  410. while(to < to_end && from < from_end) {
  411. const char* from_saved = from;
  412. uint32_t ch = implementation().to_unicode(cvt_state, from, from_end);
  413. if(ch == boost::locale::utf::illegal) {
  414. r = std::codecvt_base::error;
  415. from = from_saved;
  416. break;
  417. }
  418. if(ch == boost::locale::utf::incomplete) {
  419. r = std::codecvt_base::partial;
  420. from = from_saved;
  421. break;
  422. }
  423. *to++ = ch;
  424. }
  425. from_next = from;
  426. to_next = to;
  427. if(r == std::codecvt_base::ok && from != from_end)
  428. r = std::codecvt_base::partial;
  429. return r;
  430. }
  431. std::codecvt_base::result do_out(std::mbstate_t& /*std_state*/,
  432. const uchar* from,
  433. const uchar* from_end,
  434. const uchar*& from_next,
  435. char* to,
  436. char* to_end,
  437. char*& to_next) const override
  438. {
  439. std::codecvt_base::result r = std::codecvt_base::ok;
  440. auto cvt_state = implementation().initial_state(generic_codecvt_base::from_unicode_state);
  441. while(to < to_end && from < from_end) {
  442. boost::uint32_t ch = 0;
  443. ch = *from;
  444. if(!boost::locale::utf::is_valid_codepoint(ch)) {
  445. r = std::codecvt_base::error;
  446. break;
  447. }
  448. boost::uint32_t len = implementation().from_unicode(cvt_state, ch, to, to_end);
  449. if(len == boost::locale::utf::incomplete) {
  450. r = std::codecvt_base::partial;
  451. break;
  452. } else if(len == boost::locale::utf::illegal) {
  453. r = std::codecvt_base::error;
  454. break;
  455. }
  456. to += len;
  457. from++;
  458. }
  459. from_next = from;
  460. to_next = to;
  461. if(r == std::codecvt_base::ok && from != from_end)
  462. r = std::codecvt_base::partial;
  463. return r;
  464. }
  465. };
  466. template<typename CharType, typename CodecvtImpl>
  467. class generic_codecvt<CharType, CodecvtImpl, 1> : public std::codecvt<CharType, char, std::mbstate_t>,
  468. public generic_codecvt_base {
  469. public:
  470. typedef CharType uchar;
  471. const CodecvtImpl& implementation() const { return *static_cast<const CodecvtImpl*>(this); }
  472. generic_codecvt(size_t refs = 0) : std::codecvt<char, char, std::mbstate_t>(refs) {}
  473. };
  474. }} // namespace boost::locale
  475. #endif