index.hpp 37 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922
  1. //
  2. // Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
  3. //
  4. // Distributed under the Boost Software License, Version 1.0.
  5. // https://www.boost.org/LICENSE_1_0.txt
  6. #ifndef BOOST_LOCALE_BOUNDARY_INDEX_HPP_INCLUDED
  7. #define BOOST_LOCALE_BOUNDARY_INDEX_HPP_INCLUDED
  8. #include <boost/locale/boundary/boundary_point.hpp>
  9. #include <boost/locale/boundary/facets.hpp>
  10. #include <boost/locale/boundary/segment.hpp>
  11. #include <boost/locale/boundary/types.hpp>
  12. #include <boost/cstdint.hpp>
  13. #include <boost/iterator/iterator_facade.hpp>
  14. #include <algorithm>
  15. #include <iterator>
  16. #include <locale>
  17. #include <memory>
  18. #include <stdexcept>
  19. #include <string>
  20. #include <type_traits>
  21. #include <vector>
  22. #ifdef BOOST_MSVC
  23. # pragma warning(push)
  24. # pragma warning(disable : 4275 4251 4231 4660)
  25. #endif
  26. namespace boost { namespace locale { namespace boundary {
  27. ///
  28. /// \defgroup boundary Boundary Analysis
  29. ///
  30. /// This module contains all operations required for %boundary analysis of text: character, word, line and sentence
  31. /// boundaries
  32. ///
  33. /// @{
  34. ///
  35. /// \cond INTERNAL
  36. namespace detail {
  37. template<typename Char>
  38. const boundary_indexing<Char>& get_boundary_indexing(const std::locale& l)
  39. {
  40. using facet_type = boundary_indexing<Char>;
  41. if(!std::has_facet<facet_type>(l))
  42. throw std::runtime_error("Locale was generated without segmentation support!");
  43. return std::use_facet<facet_type>(l);
  44. }
  45. template<typename IteratorType,
  46. typename CategoryType = typename std::iterator_traits<IteratorType>::iterator_category>
  47. struct mapping_traits {
  48. typedef typename std::iterator_traits<IteratorType>::value_type char_type;
  49. static index_type map(boundary_type t, IteratorType b, IteratorType e, const std::locale& l)
  50. {
  51. std::basic_string<char_type> str(b, e);
  52. return get_boundary_indexing<char_type>(l).map(t, str.c_str(), str.c_str() + str.size());
  53. }
  54. };
  /// Compile-time check whether \a SomeIteratorType is one of the iterator types that are
  /// treated as "linear" for \a CharType: a (const) raw pointer, or a (const) iterator of
  /// std::basic_string<CharType> or std::vector<CharType>.
  template<typename CharType, typename SomeIteratorType>
  struct linear_iterator_traits {
  private:
      using string_type = std::basic_string<CharType>;
      using vector_type = std::vector<CharType>;

      /// True when the examined iterator type is exactly \a Candidate
      template<typename Candidate>
      struct matches : std::is_same<SomeIteratorType, Candidate> {};

  public:
      static constexpr bool is_linear =
        matches<CharType*>::value || matches<const CharType*>::value
        || matches<typename string_type::iterator>::value
        || matches<typename string_type::const_iterator>::value
        || matches<typename vector_type::iterator>::value
        || matches<typename vector_type::const_iterator>::value;
  };
  64. template<typename IteratorType>
  65. struct mapping_traits<IteratorType, std::random_access_iterator_tag> {
  66. typedef typename std::iterator_traits<IteratorType>::value_type char_type;
  67. static index_type map(boundary_type t, IteratorType b, IteratorType e, const std::locale& l)
  68. {
  69. index_type result;
  70. // Optimize for most common cases
  71. //
  72. // C++11 requires that string is continuous in memory and all known
  73. // string implementations do this because of c_str() support.
  74. if(linear_iterator_traits<char_type, IteratorType>::is_linear && b != e) {
  75. const char_type* begin = &*b;
  76. const char_type* end = begin + (e - b);
  77. index_type tmp = get_boundary_indexing<char_type>(l).map(t, begin, end);
  78. result.swap(tmp);
  79. } else {
  80. std::basic_string<char_type> str(b, e);
  81. index_type tmp = get_boundary_indexing<char_type>(l).map(t, str.c_str(), str.c_str() + str.size());
  82. result.swap(tmp);
  83. }
  84. return result;
  85. }
  86. };
  87. template<typename BaseIterator>
  88. class mapping {
  89. public:
  90. typedef BaseIterator base_iterator;
  91. typedef typename std::iterator_traits<base_iterator>::value_type char_type;
  92. mapping(boundary_type type, base_iterator begin, base_iterator end, const std::locale& loc) :
  93. index_(new index_type()), begin_(begin), end_(end)
  94. {
  95. index_type idx = detail::mapping_traits<base_iterator>::map(type, begin, end, loc);
  96. index_->swap(idx);
  97. }
  98. mapping() {}
  99. const index_type& index() const { return *index_; }
  100. base_iterator begin() const { return begin_; }
  101. base_iterator end() const { return end_; }
  102. private:
  103. std::shared_ptr<index_type> index_;
  104. base_iterator begin_, end_;
  105. };
  106. template<typename BaseIterator>
  107. class segment_index_iterator : public boost::iterator_facade<segment_index_iterator<BaseIterator>,
  108. segment<BaseIterator>,
  109. boost::bidirectional_traversal_tag,
  110. const segment<BaseIterator>&> {
  111. public:
  112. typedef BaseIterator base_iterator;
  113. typedef mapping<base_iterator> mapping_type;
  114. typedef segment<base_iterator> segment_type;
  115. segment_index_iterator() : current_(0, 0), map_(0), mask_(0), full_select_(false) {}
  116. segment_index_iterator(base_iterator p, const mapping_type* map, rule_type mask, bool full_select) :
  117. map_(map), mask_(mask), full_select_(full_select)
  118. {
  119. set(p);
  120. }
  121. segment_index_iterator(bool is_begin, const mapping_type* map, rule_type mask, bool full_select) :
  122. map_(map), mask_(mask), full_select_(full_select)
  123. {
  124. if(is_begin)
  125. set_begin();
  126. else
  127. set_end();
  128. }
  129. const segment_type& dereference() const { return value_; }
  130. bool equal(const segment_index_iterator& other) const
  131. {
  132. return map_ == other.map_ && current_.second == other.current_.second;
  133. }
  134. void increment()
  135. {
  136. std::pair<size_t, size_t> next = current_;
  137. if(full_select_) {
  138. next.first = next.second;
  139. while(next.second < size()) {
  140. next.second++;
  141. if(valid_offset(next.second))
  142. break;
  143. }
  144. if(next.second == size())
  145. next.first = next.second - 1;
  146. } else {
  147. while(next.second < size()) {
  148. next.first = next.second;
  149. next.second++;
  150. if(valid_offset(next.second))
  151. break;
  152. }
  153. }
  154. update_current(next);
  155. }
  156. void decrement()
  157. {
  158. std::pair<size_t, size_t> next = current_;
  159. if(full_select_) {
  160. while(next.second > 1) {
  161. next.second--;
  162. if(valid_offset(next.second))
  163. break;
  164. }
  165. next.first = next.second;
  166. while(next.first > 0) {
  167. next.first--;
  168. if(valid_offset(next.first))
  169. break;
  170. }
  171. } else {
  172. while(next.second > 1) {
  173. next.second--;
  174. if(valid_offset(next.second))
  175. break;
  176. }
  177. next.first = next.second - 1;
  178. }
  179. update_current(next);
  180. }
  181. private:
  182. void set_end()
  183. {
  184. current_.first = size() - 1;
  185. current_.second = size();
  186. value_ = segment_type(map_->end(), map_->end(), 0);
  187. }
  188. void set_begin()
  189. {
  190. current_.first = current_.second = 0;
  191. value_ = segment_type(map_->begin(), map_->begin(), 0);
  192. increment();
  193. }
  194. void set(base_iterator p)
  195. {
  196. size_t dist = std::distance(map_->begin(), p);
  197. index_type::const_iterator b = map_->index().begin(), e = map_->index().end();
  198. index_type::const_iterator boundary_point = std::upper_bound(b, e, break_info(dist));
  199. while(boundary_point != e && (boundary_point->rule & mask_) == 0)
  200. boundary_point++;
  201. current_.first = current_.second = boundary_point - b;
  202. if(full_select_) {
  203. while(current_.first > 0) {
  204. current_.first--;
  205. if(valid_offset(current_.first))
  206. break;
  207. }
  208. } else {
  209. if(current_.first > 0)
  210. current_.first--;
  211. }
  212. value_.first = map_->begin();
  213. std::advance(value_.first, get_offset(current_.first));
  214. value_.second = value_.first;
  215. std::advance(value_.second, get_offset(current_.second) - get_offset(current_.first));
  216. update_rule();
  217. }
  218. void update_current(std::pair<size_t, size_t> pos)
  219. {
  220. std::ptrdiff_t first_diff = get_offset(pos.first) - get_offset(current_.first);
  221. std::ptrdiff_t second_diff = get_offset(pos.second) - get_offset(current_.second);
  222. std::advance(value_.first, first_diff);
  223. std::advance(value_.second, second_diff);
  224. current_ = pos;
  225. update_rule();
  226. }
  227. void update_rule()
  228. {
  229. if(current_.second != size()) {
  230. value_.rule(index()[current_.second].rule);
  231. }
  232. }
  233. size_t get_offset(size_t ind) const
  234. {
  235. if(ind == size())
  236. return index().back().offset;
  237. return index()[ind].offset;
  238. }
  239. bool valid_offset(size_t offset) const
  240. {
  241. return offset == 0 || offset == size() // make sure we not acess index[size]
  242. || (index()[offset].rule & mask_) != 0;
  243. }
  244. size_t size() const { return index().size(); }
  245. const index_type& index() const { return map_->index(); }
  246. segment_type value_;
  247. std::pair<size_t, size_t> current_;
  248. const mapping_type* map_;
  249. rule_type mask_;
  250. bool full_select_;
  251. };
  252. template<typename BaseIterator>
  253. class boundary_point_index_iterator : public boost::iterator_facade<boundary_point_index_iterator<BaseIterator>,
  254. boundary_point<BaseIterator>,
  255. boost::bidirectional_traversal_tag,
  256. const boundary_point<BaseIterator>&> {
  257. public:
  258. typedef BaseIterator base_iterator;
  259. typedef mapping<base_iterator> mapping_type;
  260. typedef boundary_point<base_iterator> boundary_point_type;
  261. boundary_point_index_iterator() : current_(0), map_(0), mask_(0) {}
  262. boundary_point_index_iterator(bool is_begin, const mapping_type* map, rule_type mask) :
  263. map_(map), mask_(mask)
  264. {
  265. if(is_begin)
  266. set_begin();
  267. else
  268. set_end();
  269. }
  270. boundary_point_index_iterator(base_iterator p, const mapping_type* map, rule_type mask) :
  271. map_(map), mask_(mask)
  272. {
  273. set(p);
  274. }
  275. const boundary_point_type& dereference() const { return value_; }
  276. bool equal(const boundary_point_index_iterator& other) const
  277. {
  278. return map_ == other.map_ && current_ == other.current_;
  279. }
  280. void increment()
  281. {
  282. size_t next = current_;
  283. while(next < size()) {
  284. next++;
  285. if(valid_offset(next))
  286. break;
  287. }
  288. update_current(next);
  289. }
  290. void decrement()
  291. {
  292. size_t next = current_;
  293. while(next > 0) {
  294. next--;
  295. if(valid_offset(next))
  296. break;
  297. }
  298. update_current(next);
  299. }
  300. private:
  301. void set_end()
  302. {
  303. current_ = size();
  304. value_ = boundary_point_type(map_->end(), 0);
  305. }
  306. void set_begin()
  307. {
  308. current_ = 0;
  309. value_ = boundary_point_type(map_->begin(), 0);
  310. }
  311. void set(base_iterator p)
  312. {
  313. size_t dist = std::distance(map_->begin(), p);
  314. index_type::const_iterator b = index().begin();
  315. index_type::const_iterator e = index().end();
  316. index_type::const_iterator ptr = std::lower_bound(b, e, break_info(dist));
  317. if(ptr == index().end())
  318. current_ = size() - 1;
  319. else
  320. current_ = ptr - index().begin();
  321. while(!valid_offset(current_))
  322. current_++;
  323. std::ptrdiff_t diff = get_offset(current_) - dist;
  324. std::advance(p, diff);
  325. value_.iterator(p);
  326. update_rule();
  327. }
  328. void update_current(size_t pos)
  329. {
  330. std::ptrdiff_t diff = get_offset(pos) - get_offset(current_);
  331. base_iterator i = value_.iterator();
  332. std::advance(i, diff);
  333. current_ = pos;
  334. value_.iterator(i);
  335. update_rule();
  336. }
  337. void update_rule()
  338. {
  339. if(current_ != size()) {
  340. value_.rule(index()[current_].rule);
  341. }
  342. }
  343. size_t get_offset(size_t ind) const
  344. {
  345. if(ind == size())
  346. return index().back().offset;
  347. return index()[ind].offset;
  348. }
  349. bool valid_offset(size_t offset) const
  350. {
  351. return offset == 0 || offset + 1 >= size() // last and first are always valid regardless of mark
  352. || (index()[offset].rule & mask_) != 0;
  353. }
  354. size_t size() const { return index().size(); }
  355. const index_type& index() const { return map_->index(); }
  356. boundary_point_type value_;
  357. size_t current_;
  358. const mapping_type* map_;
  359. rule_type mask_;
  360. };
  361. } // namespace detail
  362. /// \endcond
  // Forward declarations: each of the two index classes declares a constructor taking the other one
  template<typename BaseIterator> class segment_index;
  template<typename BaseIterator> class boundary_point_index;
  367. /// \brief This class holds an index of segments in the text range and allows to iterate over them
  368. ///
  /// This class provides \ref begin() and \ref end() member functions that return bidirectional iterators
  370. /// to the \ref segment objects.
  371. ///
  372. /// It provides two options on way of selecting segments:
  373. ///
  374. /// - \ref rule(rule_type mask) - a mask that allows to select only specific types of segments according to
  375. /// various masks %as \ref word_any.
  376. /// \n
  377. /// The default is to select any types of boundaries.
  378. /// \n
  379. /// For example: using word %boundary analysis, when the provided mask is \ref word_kana then the iterators
  380. /// would iterate only over the words containing Kana letters and \ref word_any would select all types of
  381. /// words excluding ranges that consist of white space and punctuation marks. So iterating over the text
  382. /// "to be or not to be?" with \ref word_any rule would return segments "to", "be", "or", "not", "to", "be",
  383. /// instead of default "to", " ", "be", " ", "or", " ", "not", " ", "to", " ", "be", "?".
  384. /// - \ref full_select(bool how) - a flag that defines the way a range is selected if the rule of the previous
  385. /// %boundary point does not fit the selected rule.
  386. /// \n
  387. /// For example: We want to fetch all sentences from the following text: "Hello! How\nare you?".
  388. /// \n
  389. /// This text contains three %boundary points separating it to sentences by different rules:
  390. /// - The exclamation mark "!" ends the sentence "Hello!"
  391. /// - The line feed that splits the sentence "How\nare you?" into two parts.
  392. /// - The question mark that ends the second sentence.
  393. /// \n
  394. /// If you would only change the \ref rule() to \ref sentence_term then the segment_index would
  395. /// provide two sentences "Hello!" and "are you?" %as only them actually terminated with required
  396. /// terminator "!" or "?". But changing \ref full_select() to true, the selected segment would include
  397. /// all the text up to previous valid %boundary point and would return two expected sentences:
  398. /// "Hello!" and "How\nare you?".
  399. ///
  400. /// This class allows to find a segment according to the given iterator in range using \ref find() member
  401. /// function.
  402. ///
  403. /// \note
  404. ///
  405. /// - Changing any of the options - \ref rule() or \ref full_select() and of course re-indexing the text
  406. /// invalidates existing iterators and they can't be used any more.
  407. /// - segment_index can be created from boundary_point_index or other segment_index that was created with
  408. /// same \ref boundary_type. This is very fast operation %as they shared same index
  409. /// and it does not require its regeneration.
  410. ///
  411. /// \see
  412. ///
  413. /// - \ref boundary_point_index
  414. /// - \ref segment
  415. /// - \ref boundary_point
  416. template<typename BaseIterator>
  417. class segment_index {
  418. public:
  419. /// The type of the iterator used to iterate over the original text
  420. typedef BaseIterator base_iterator;
  421. #ifdef BOOST_LOCALE_DOXYGEN
  422. /// The bidirectional iterator that iterates over \ref value_type objects.
  423. ///
  424. /// - The iterators may be invalidated by use of any non-const member function
  425. /// including but not limited to \ref rule(rule_type) and \ref full_select(bool).
  426. /// - The returned value_type object is valid %as long %as iterator points to it.
  427. /// So this following code is wrong %as t used after p was updated:
  428. /// \code
  429. /// segment_index<some_iterator>::iterator p=index.begin();
  430. /// segment<some_iterator> &t = *p;
  431. /// ++p;
  432. /// std::cout << t.str() << std::endl;
  433. /// \endcode
  434. typedef unspecified_iterator_type iterator;
  435. /// \copydoc iterator
  436. typedef unspecified_iterator_type const_iterator;
  437. #else
  438. typedef detail::segment_index_iterator<base_iterator> iterator;
  439. typedef detail::segment_index_iterator<base_iterator> const_iterator;
  440. #endif
  441. /// The type dereferenced by the \ref iterator and \ref const_iterator. It is
  442. /// an object that represents selected segment.
  443. typedef segment<base_iterator> value_type;
  444. /// Default constructor.
  445. ///
  446. /// \note
  447. ///
  448. /// When this object is constructed by default it does not include a valid index, thus
  449. /// calling \ref begin(), \ref end() or \ref find() member functions would lead to undefined
  450. /// behavior
  451. segment_index() : mask_(0xFFFFFFFFu), full_select_(false) {}
  452. /// Create a segment_index for %boundary analysis \ref boundary_type "type" of the text
  453. /// in range [begin,end) using a rule \a mask for locale \a loc.
  454. segment_index(boundary_type type,
  455. base_iterator begin,
  456. base_iterator end,
  457. rule_type mask,
  458. const std::locale& loc = std::locale()) :
  459. map_(type, begin, end, loc),
  460. mask_(mask), full_select_(false)
  461. {}
  462. /// Create a segment_index for %boundary analysis \ref boundary_type "type" of the text
  463. /// in range [begin,end) selecting all possible segments (full mask) for locale \a loc.
  464. segment_index(boundary_type type,
  465. base_iterator begin,
  466. base_iterator end,
  467. const std::locale& loc = std::locale()) :
  468. map_(type, begin, end, loc),
  469. mask_(0xFFFFFFFFu), full_select_(false)
  470. {}
  471. /// Create a segment_index from a \ref boundary_point_index. It copies all indexing information
  472. /// and used default rule (all possible segments)
  473. ///
  474. /// This operation is very cheap, so if you use boundary_point_index and segment_index on same text
  475. /// range it is much better to create one from another rather then indexing the same
  476. /// range twice.
  477. ///
  478. /// \note \ref rule() flags are not copied
  479. segment_index(const boundary_point_index<base_iterator>&);
  480. /// Copy an index from a \ref boundary_point_index. It copies all indexing information
  481. /// and uses the default rule (all possible segments)
  482. ///
  483. /// This operation is very cheap, so if you use boundary_point_index and segment_index on same text
  484. /// range it is much better to create one from another rather then indexing the same
  485. /// range twice.
  486. ///
  487. /// \note \ref rule() flags are not copied
  488. segment_index& operator=(const boundary_point_index<base_iterator>&);
  489. /// Create a new index for %boundary analysis \ref boundary_type "type" of the text
  490. /// in range [begin,end) for locale \a loc.
  491. ///
  492. /// \note \ref rule() and \ref full_select() remain unchanged.
  493. void map(boundary_type type, base_iterator begin, base_iterator end, const std::locale& loc = std::locale())
  494. {
  495. map_ = mapping_type(type, begin, end, loc);
  496. }
  497. /// Get the \ref iterator on the beginning of the segments range.
  498. ///
  499. /// Preconditions: the segment_index should have a mapping
  500. ///
  501. /// \note
  502. ///
  503. /// The returned iterator is invalidated by access to any non-const member functions of this object
  504. iterator begin() const
  505. {
  506. return iterator(true, &map_, mask_, full_select_);
  507. }
  508. /// Get the \ref iterator on the ending of the segments range.
  509. ///
  510. /// Preconditions: the segment_index should have a mapping
  511. ///
  512. /// The returned iterator is invalidated by access to any non-const member functions of this object
  513. iterator end() const
  514. {
  515. return iterator(false, &map_, mask_, full_select_);
  516. }
  517. /// Find a first valid segment following a position \a p.
  518. ///
  519. /// If \a p is inside a valid segment this segment is selected:
  520. ///
  521. /// For example: For \ref word %boundary analysis with \ref word_any rule():
  522. ///
  523. /// - "to| be or ", would point to "be",
  524. /// - "t|o be or ", would point to "to",
  525. /// - "to be or| ", would point to end.
  526. ///
  527. ///
  528. /// Preconditions: the segment_index should have a mapping and \a p should be valid iterator
  529. /// to the text in the mapped range.
  530. ///
  531. /// The returned iterator is invalidated by access to any non-const member functions of this object
  532. iterator find(base_iterator p) const
  533. {
  534. return iterator(p, &map_, mask_, full_select_);
  535. }
  536. /// Get the mask of rules that are used
  537. rule_type rule() const
  538. {
  539. return mask_;
  540. }
  541. /// Set the mask of rules that are used
  542. void rule(rule_type v)
  543. {
  544. mask_ = v;
  545. }
  546. /// Get the full_select property value - should segment include in the range
  547. /// values that not belong to specific \ref rule() or not.
  548. ///
  549. /// The default value is false.
  550. ///
  551. /// For example for \ref sentence %boundary with rule \ref sentence_term the segments
  552. /// of text "Hello! How\nare you?" are "Hello!\", "are you?" when full_select() is false
  553. /// because "How\n" is selected %as sentence by a rule spits the text by line feed. If full_select()
  554. /// is true the returned segments are "Hello! ", "How\nare you?" where "How\n" is joined with the
  555. /// following part "are you?"
  556. bool full_select() const
  557. {
  558. return full_select_;
  559. }
  560. /// Set the full_select property value - should segment include in the range
  561. /// values that not belong to specific \ref rule() or not.
  562. ///
  563. /// The default value is false.
  564. ///
  565. /// For example for \ref sentence %boundary with rule \ref sentence_term the segments
  566. /// of text "Hello! How\nare you?" are "Hello!\", "are you?" when full_select() is false
  567. /// because "How\n" is selected %as sentence by a rule spits the text by line feed. If full_select()
  568. /// is true the returned segments are "Hello! ", "How\nare you?" where "How\n" is joined with the
  569. /// following part "are you?"
  570. void full_select(bool v)
  571. {
  572. full_select_ = v;
  573. }
  574. private:
  575. friend class boundary_point_index<base_iterator>;
  576. typedef detail::mapping<base_iterator> mapping_type;
  577. mapping_type map_;
  578. rule_type mask_;
  579. bool full_select_;
  580. };
  581. /// \brief This class holds an index of \ref boundary_point "boundary points" and allows iterating
  582. /// over them.
  583. ///
  /// This class provides \ref begin() and \ref end() member functions that return bidirectional iterators
  585. /// to the \ref boundary_point objects.
  586. ///
  587. /// It provides an option that affects selecting %boundary points according to different rules:
  588. /// using \ref rule(rule_type mask) member function. It allows to set a mask that select only specific
  589. /// types of %boundary points like \ref sentence_term.
  590. ///
  591. /// For example for a sentence %boundary analysis of a text "Hello! How\nare you?" when the default
  592. /// rule is used the %boundary points would be:
  593. ///
  594. /// - "|Hello! How\nare you?"
  595. /// - "Hello! |How\nare you?"
  596. /// - "Hello! How\n|are you?"
  597. /// - "Hello! How\nare you?|"
  598. ///
  599. /// However if \ref rule() is set to \ref sentence_term then the selected %boundary points would be:
  600. ///
  601. /// - "|Hello! How\nare you?"
  602. /// - "Hello! |How\nare you?"
  603. /// - "Hello! How\nare you?|"
  604. ///
  605. /// Such that a %boundary point defined by a line feed character would be ignored.
  606. ///
  607. /// This class allows to find a boundary_point according to the given iterator in range using \ref find() member
  608. /// function.
  609. ///
  610. /// \note
  611. /// - Even an empty text range [x,x) considered to have a one %boundary point x.
  612. /// - \a a and \a b points of the range [a,b) are always considered %boundary points
  613. /// regardless the rules used.
  /// - Changing the \ref rule() option or, of course, re-indexing the text
  /// invalidates existing iterators and they can't be used any more.
  616. /// - boundary_point_index can be created from segment_index or other boundary_point_index that was created with
  617. /// same \ref boundary_type. This is very fast operation %as they shared same index
  618. /// and it does not require its regeneration.
  619. ///
  620. /// \see
  621. ///
  622. /// - \ref segment_index
  623. /// - \ref boundary_point
  624. /// - \ref segment
  625. template<typename BaseIterator>
  626. class boundary_point_index {
  627. public:
  628. /// The type of the iterator used to iterate over the original text
  629. typedef BaseIterator base_iterator;
  630. #ifdef BOOST_LOCALE_DOXYGEN
  631. /// The bidirectional iterator that iterates over \ref value_type objects.
  632. ///
  633. /// - The iterators may be invalidated by use of any non-const member function
  634. /// including but not limited to \ref rule(rule_type) member function.
  635. /// - The returned value_type object is valid %as long %as iterator points to it.
  636. /// So this following code is wrong %as t used after p was updated:
  637. /// \code
  638. /// boundary_point_index<some_iterator>::iterator p=index.begin();
  639. /// boundary_point<some_iterator> &t = *p;
  640. /// ++p;
  641. /// rule_type r = t->rule();
  642. /// \endcode
  643. ///
  644. typedef unspecified_iterator_type iterator;
  645. /// \copydoc iterator
  646. typedef unspecified_iterator_type const_iterator;
  647. #else
  648. typedef detail::boundary_point_index_iterator<base_iterator> iterator;
  649. typedef detail::boundary_point_index_iterator<base_iterator> const_iterator;
  650. #endif
  651. /// The type dereferenced by the \ref iterator and \ref const_iterator. It is
  652. /// an object that represents the selected \ref boundary_point "boundary point".
  653. typedef boundary_point<base_iterator> value_type;
  654. /// Default constructor.
  655. ///
  656. /// \note
  657. ///
  658. /// When this object is constructed by default it does not include a valid index, thus
  659. /// calling \ref begin(), \ref end() or \ref find() member functions would lead to undefined
  660. /// behavior
  661. boundary_point_index() : mask_(0xFFFFFFFFu) {}
  /// Create a boundary_point_index for %boundary analysis \ref boundary_type "type" of the text
  663. /// in range [begin,end) using a rule \a mask for locale \a loc.
  664. boundary_point_index(boundary_type type,
  665. base_iterator begin,
  666. base_iterator end,
  667. rule_type mask,
  668. const std::locale& loc = std::locale()) :
  669. map_(type, begin, end, loc),
  670. mask_(mask)
  671. {}
  /// Create a boundary_point_index for %boundary analysis \ref boundary_type "type" of the text
  673. /// in range [begin,end) selecting all possible %boundary points (full mask) for locale \a loc.
  674. boundary_point_index(boundary_type type,
  675. base_iterator begin,
  676. base_iterator end,
  677. const std::locale& loc = std::locale()) :
  678. map_(type, begin, end, loc),
  679. mask_(0xFFFFFFFFu)
  680. {}
  681. /// Create a boundary_point_index from a \ref segment_index. It copies all indexing information
  682. /// and uses the default rule (all possible %boundary points)
  683. ///
  684. /// This operation is very cheap, so if you use boundary_point_index and segment_index on the same text
  685. /// range it is much better to create one from another rather then indexing the same
  686. /// range twice.
  687. ///
  688. /// \note \ref rule() flags are not copied
  689. boundary_point_index(const segment_index<base_iterator>& other);
  690. /// Copy a boundary_point_index from a \ref segment_index. It copies all indexing information
  691. /// and keeps the current \ref rule() unchanged
  692. ///
  693. /// This operation is very cheap, so if you use boundary_point_index and segment_index on the same text
  694. /// range it is much better to create one from another rather then indexing the same
  695. /// range twice.
  696. ///
  697. /// \note \ref rule() flags are not copied
  698. boundary_point_index& operator=(const segment_index<base_iterator>& other);
  699. /// Create a new index for %boundary analysis \ref boundary_type "type" of the text
  700. /// in range [begin,end) for locale \a loc.
  701. ///
  702. /// \note \ref rule() remains unchanged.
  703. void map(boundary_type type, base_iterator begin, base_iterator end, const std::locale& loc = std::locale())
  704. {
  705. map_ = mapping_type(type, begin, end, loc);
  706. }
  707. /// Get the \ref iterator on the beginning of the %boundary points range.
  708. ///
  709. /// Preconditions: this boundary_point_index should have a mapping
  710. ///
  711. /// \note
  712. ///
  713. /// The returned iterator is invalidated by access to any non-const member functions of this object
  714. iterator begin() const
  715. {
  716. return iterator(true, &map_, mask_);
  717. }
  718. /// Get the \ref iterator on the ending of the %boundary points range.
  719. ///
  720. /// Preconditions: this boundary_point_index should have a mapping
  721. ///
  722. /// \note
  723. ///
  724. /// The returned iterator is invalidated by access to any non-const member functions of this object
  725. iterator end() const
  726. {
  727. return iterator(false, &map_, mask_);
  728. }
  729. /// Find a first valid %boundary point on a position \a p or following it.
  730. ///
  731. /// For example: For \ref word %boundary analysis of the text "to be or"
  732. ///
  733. /// - "|to be", would return %boundary point at "|to be",
  734. /// - "t|o be", would point to "to| be"
  735. ///
  736. /// Preconditions: the boundary_point_index should have a mapping and \a p should be valid iterator
  737. /// to the text in the mapped range.
  738. ///
  739. /// The returned iterator is invalidated by access to any non-const member functions of this object
  740. iterator find(base_iterator p) const
  741. {
  742. return iterator(p, &map_, mask_);
  743. }
  744. /// Get the mask of rules that are used
  745. rule_type rule() const
  746. {
  747. return mask_;
  748. }
  749. /// Set the mask of rules that are used
  750. void rule(rule_type v)
  751. {
  752. mask_ = v;
  753. }
  754. private:
  755. friend class segment_index<base_iterator>;
  756. typedef detail::mapping<base_iterator> mapping_type;
  757. mapping_type map_;
  758. rule_type mask_;
  759. };
  760. /// \cond INTERNAL
  761. template<typename BaseIterator>
  762. segment_index<BaseIterator>::segment_index(const boundary_point_index<BaseIterator>& other) :
  763. map_(other.map_), mask_(0xFFFFFFFFu), full_select_(false)
  764. {}
  765. template<typename BaseIterator>
  766. boundary_point_index<BaseIterator>::boundary_point_index(const segment_index<BaseIterator>& other) :
  767. map_(other.map_), mask_(0xFFFFFFFFu)
  768. {}
  769. template<typename BaseIterator>
  770. segment_index<BaseIterator>& segment_index<BaseIterator>::operator=(const boundary_point_index<BaseIterator>& other)
  771. {
  772. map_ = other.map_;
  773. return *this;
  774. }
  775. template<typename BaseIterator>
  776. boundary_point_index<BaseIterator>&
  777. boundary_point_index<BaseIterator>::operator=(const segment_index<BaseIterator>& other)
  778. {
  779. map_ = other.map_;
  780. return *this;
  781. }
  782. /// \endcond
  783. typedef segment_index<std::string::const_iterator> ssegment_index; ///< convenience typedef
  784. typedef segment_index<std::wstring::const_iterator> wssegment_index; ///< convenience typedef
  785. #ifdef BOOST_LOCALE_ENABLE_CHAR16_T
  786. typedef segment_index<std::u16string::const_iterator> u16ssegment_index; ///< convenience typedef
  787. #endif
  788. #ifdef BOOST_LOCALE_ENABLE_CHAR32_T
  789. typedef segment_index<std::u32string::const_iterator> u32ssegment_index; ///< convenience typedef
  790. #endif
  791. typedef segment_index<const char*> csegment_index; ///< convenience typedef
  792. typedef segment_index<const wchar_t*> wcsegment_index; ///< convenience typedef
  793. #ifdef BOOST_LOCALE_ENABLE_CHAR16_T
  794. typedef segment_index<const char16_t*> u16csegment_index; ///< convenience typedef
  795. #endif
  796. #ifdef BOOST_LOCALE_ENABLE_CHAR32_T
  797. typedef segment_index<const char32_t*> u32csegment_index; ///< convenience typedef
  798. #endif
  799. typedef boundary_point_index<std::string::const_iterator> sboundary_point_index; ///< convenience typedef
  800. typedef boundary_point_index<std::wstring::const_iterator> wsboundary_point_index; ///< convenience typedef
  801. #ifdef BOOST_LOCALE_ENABLE_CHAR16_T
  802. typedef boundary_point_index<std::u16string::const_iterator> u16sboundary_point_index; ///< convenience typedef
  803. #endif
  804. #ifdef BOOST_LOCALE_ENABLE_CHAR32_T
  805. typedef boundary_point_index<std::u32string::const_iterator> u32sboundary_point_index; ///< convenience typedef
  806. #endif
  807. typedef boundary_point_index<const char*> cboundary_point_index; ///< convenience typedef
  808. typedef boundary_point_index<const wchar_t*> wcboundary_point_index; ///< convenience typedef
  809. #ifdef BOOST_LOCALE_ENABLE_CHAR16_T
  810. typedef boundary_point_index<const char16_t*> u16cboundary_point_index; ///< convenience typedef
  811. #endif
  812. #ifdef BOOST_LOCALE_ENABLE_CHAR32_T
  813. typedef boundary_point_index<const char32_t*> u32cboundary_point_index; ///< convenience typedef
  814. #endif
  815. }}} // namespace boost::locale::boundary
  816. ///
  817. /// \example boundary.cpp
  818. /// Example of using segment_index
  819. /// \example wboundary.cpp
  820. /// Example of using segment_index over wide strings
  821. ///
  822. #ifdef BOOST_MSVC
  823. # pragma warning(pop)
  824. #endif
  825. #endif