// dictbe.h
  // © 2016 and later: Unicode, Inc. and others.
  // License & terms of use: http://www.unicode.org/copyright.html
  /**
   *******************************************************************************
   * Copyright (C) 2006-2014, International Business Machines Corporation        *
   * and others. All Rights Reserved.                                            *
   *******************************************************************************
   */
  9. #ifndef DICTBE_H
  10. #define DICTBE_H
  11. #include "unicode/utypes.h"
  12. #include "unicode/uniset.h"
  13. #include "unicode/utext.h"
  14. #include "brkeng.h"
  15. #include "hash.h"
  16. #include "mlbe.h"
  17. #include "uvectr32.h"
  18. U_NAMESPACE_BEGIN
  19. class DictionaryMatcher;
  20. class MlBreakEngine;
  21. class Normalizer2;
  22. /*******************************************************************
  23. * DictionaryBreakEngine
  24. */
  25. /**
  26. * <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a
  27. * dictionary to determine language-specific breaks.</p>
  28. *
  29. * <p>After it is constructed a DictionaryBreakEngine may be shared between
  30. * threads without synchronization.</p>
  31. */
  32. class DictionaryBreakEngine : public LanguageBreakEngine {
  33. private:
  34. /**
  35. * The set of characters handled by this engine
  36. * @internal
  37. */
  38. UnicodeSet fSet;
  39. public:
  40. /**
  41. * <p>Constructor </p>
  42. */
  43. DictionaryBreakEngine();
  44. /**
  45. * <p>Virtual destructor.</p>
  46. */
  47. virtual ~DictionaryBreakEngine();
  48. /**
  49. * <p>Indicate whether this engine handles a particular character for
  50. * a particular kind of break.</p>
  51. *
  52. * @param c A character which begins a run that the engine might handle
  53. * @param locale The locale.
  54. * @return true if this engine handles the particular character and break
  55. * type.
  56. */
  57. virtual UBool handles(UChar32 c, const char* locale) const override;
  58. /**
  59. * <p>Find any breaks within a run in the supplied text.</p>
  60. *
  61. * @param text A UText representing the text. The iterator is left at
  62. * the end of the run of characters which the engine is capable of handling
  63. * that starts from the first character in the range.
  64. * @param startPos The start of the run within the supplied text.
  65. * @param endPos The end of the run within the supplied text.
  66. * @param foundBreaks vector of int32_t to receive the break positions
  67. * @param status Information on any errors encountered.
  68. * @return The number of breaks found.
  69. */
  70. virtual int32_t findBreaks( UText *text,
  71. int32_t startPos,
  72. int32_t endPos,
  73. UVector32 &foundBreaks,
  74. UBool isPhraseBreaking,
  75. UErrorCode& status ) const override;
  76. protected:
  77. /**
  78. * <p>Set the character set handled by this engine.</p>
  79. *
  80. * @param set A UnicodeSet of the set of characters handled by the engine
  81. */
  82. virtual void setCharacters( const UnicodeSet &set );
  83. /**
  84. * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
  85. *
  86. * @param text A UText representing the text
  87. * @param rangeStart The start of the range of dictionary characters
  88. * @param rangeEnd The end of the range of dictionary characters
  89. * @param foundBreaks Output of C array of int32_t break positions, or 0
  90. * @param status Information on any errors encountered.
  91. * @return The number of breaks found
  92. */
  93. virtual int32_t divideUpDictionaryRange( UText *text,
  94. int32_t rangeStart,
  95. int32_t rangeEnd,
  96. UVector32 &foundBreaks,
  97. UBool isPhraseBreaking,
  98. UErrorCode& status) const = 0;
  99. };
  100. /*******************************************************************
  101. * ThaiBreakEngine
  102. */
  103. /**
  104. * <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a
  105. * dictionary and heuristics to determine Thai-specific breaks.</p>
  106. *
  107. * <p>After it is constructed a ThaiBreakEngine may be shared between
  108. * threads without synchronization.</p>
  109. */
  110. class ThaiBreakEngine : public DictionaryBreakEngine {
  111. private:
  112. /**
  113. * The set of characters handled by this engine
  114. * @internal
  115. */
  116. UnicodeSet fEndWordSet;
  117. UnicodeSet fBeginWordSet;
  118. UnicodeSet fSuffixSet;
  119. UnicodeSet fMarkSet;
  120. DictionaryMatcher *fDictionary;
  121. public:
  122. /**
  123. * <p>Default constructor.</p>
  124. *
  125. * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
  126. * engine is deleted.
  127. */
  128. ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
  129. /**
  130. * <p>Virtual destructor.</p>
  131. */
  132. virtual ~ThaiBreakEngine();
  133. protected:
  134. /**
  135. * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
  136. *
  137. * @param text A UText representing the text
  138. * @param rangeStart The start of the range of dictionary characters
  139. * @param rangeEnd The end of the range of dictionary characters
  140. * @param foundBreaks Output of C array of int32_t break positions, or 0
  141. * @param status Information on any errors encountered.
  142. * @return The number of breaks found
  143. */
  144. virtual int32_t divideUpDictionaryRange( UText *text,
  145. int32_t rangeStart,
  146. int32_t rangeEnd,
  147. UVector32 &foundBreaks,
  148. UBool isPhraseBreaking,
  149. UErrorCode& status) const override;
  150. };
  151. /*******************************************************************
  152. * LaoBreakEngine
  153. */
  154. /**
  155. * <p>LaoBreakEngine is a kind of DictionaryBreakEngine that uses a
  156. * dictionary and heuristics to determine Lao-specific breaks.</p>
  157. *
  158. * <p>After it is constructed a LaoBreakEngine may be shared between
  159. * threads without synchronization.</p>
  160. */
  161. class LaoBreakEngine : public DictionaryBreakEngine {
  162. private:
  163. /**
  164. * The set of characters handled by this engine
  165. * @internal
  166. */
  167. UnicodeSet fEndWordSet;
  168. UnicodeSet fBeginWordSet;
  169. UnicodeSet fMarkSet;
  170. DictionaryMatcher *fDictionary;
  171. public:
  172. /**
  173. * <p>Default constructor.</p>
  174. *
  175. * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
  176. * engine is deleted.
  177. */
  178. LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
  179. /**
  180. * <p>Virtual destructor.</p>
  181. */
  182. virtual ~LaoBreakEngine();
  183. protected:
  184. /**
  185. * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
  186. *
  187. * @param text A UText representing the text
  188. * @param rangeStart The start of the range of dictionary characters
  189. * @param rangeEnd The end of the range of dictionary characters
  190. * @param foundBreaks Output of C array of int32_t break positions, or 0
  191. * @param status Information on any errors encountered.
  192. * @return The number of breaks found
  193. */
  194. virtual int32_t divideUpDictionaryRange( UText *text,
  195. int32_t rangeStart,
  196. int32_t rangeEnd,
  197. UVector32 &foundBreaks,
  198. UBool isPhraseBreaking,
  199. UErrorCode& status) const override;
  200. };
  201. /*******************************************************************
  202. * BurmeseBreakEngine
  203. */
  204. /**
  205. * <p>BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a
  206. * DictionaryMatcher and heuristics to determine Burmese-specific breaks.</p>
  207. *
  208. * <p>After it is constructed a BurmeseBreakEngine may be shared between
  209. * threads without synchronization.</p>
  210. */
  211. class BurmeseBreakEngine : public DictionaryBreakEngine {
  212. private:
  213. /**
  214. * The set of characters handled by this engine
  215. * @internal
  216. */
  217. UnicodeSet fEndWordSet;
  218. UnicodeSet fBeginWordSet;
  219. UnicodeSet fMarkSet;
  220. DictionaryMatcher *fDictionary;
  221. public:
  222. /**
  223. * <p>Default constructor.</p>
  224. *
  225. * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
  226. * engine is deleted.
  227. */
  228. BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
  229. /**
  230. * <p>Virtual destructor.</p>
  231. */
  232. virtual ~BurmeseBreakEngine();
  233. protected:
  234. /**
  235. * <p>Divide up a range of known dictionary characters.</p>
  236. *
  237. * @param text A UText representing the text
  238. * @param rangeStart The start of the range of dictionary characters
  239. * @param rangeEnd The end of the range of dictionary characters
  240. * @param foundBreaks Output of C array of int32_t break positions, or 0
  241. * @param status Information on any errors encountered.
  242. * @return The number of breaks found
  243. */
  244. virtual int32_t divideUpDictionaryRange( UText *text,
  245. int32_t rangeStart,
  246. int32_t rangeEnd,
  247. UVector32 &foundBreaks,
  248. UBool isPhraseBreaking,
  249. UErrorCode& status) const override;
  250. };
  251. /*******************************************************************
  252. * KhmerBreakEngine
  253. */
  254. /**
  255. * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a
  256. * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p>
  257. *
  258. * <p>After it is constructed a KhmerBreakEngine may be shared between
  259. * threads without synchronization.</p>
  260. */
  261. class KhmerBreakEngine : public DictionaryBreakEngine {
  262. private:
  263. /**
  264. * The set of characters handled by this engine
  265. * @internal
  266. */
  267. UnicodeSet fEndWordSet;
  268. UnicodeSet fBeginWordSet;
  269. UnicodeSet fMarkSet;
  270. DictionaryMatcher *fDictionary;
  271. public:
  272. /**
  273. * <p>Default constructor.</p>
  274. *
  275. * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
  276. * engine is deleted.
  277. */
  278. KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
  279. /**
  280. * <p>Virtual destructor.</p>
  281. */
  282. virtual ~KhmerBreakEngine();
  283. protected:
  284. /**
  285. * <p>Divide up a range of known dictionary characters.</p>
  286. *
  287. * @param text A UText representing the text
  288. * @param rangeStart The start of the range of dictionary characters
  289. * @param rangeEnd The end of the range of dictionary characters
  290. * @param foundBreaks Output of C array of int32_t break positions, or 0
  291. * @param status Information on any errors encountered.
  292. * @return The number of breaks found
  293. */
  294. virtual int32_t divideUpDictionaryRange( UText *text,
  295. int32_t rangeStart,
  296. int32_t rangeEnd,
  297. UVector32 &foundBreaks,
  298. UBool isPhraseBreaking,
  299. UErrorCode& status) const override;
  300. };
  301. #if !UCONFIG_NO_NORMALIZATION
  302. /*******************************************************************
  303. * CjkBreakEngine
  304. */
  // Indicates the language/script that the CjkBreakEngine will handle.
  // Kept as a plain (unscoped) enum so that the enumerator names stay
  // visible to callers without qualification.
  enum LanguageType {
      kKorean,          // Korean
      kChineseJapanese  // Chinese and Japanese
  };
  310. /**
  311. * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a
  312. * dictionary with costs associated with each word and
  313. * Viterbi decoding to determine CJK-specific breaks.</p>
  314. */
  315. class CjkBreakEngine : public DictionaryBreakEngine {
  316. protected:
  317. /**
  318. * The set of characters handled by this engine
  319. * @internal
  320. */
  321. UnicodeSet fHangulWordSet;
  322. UnicodeSet fDigitOrOpenPunctuationOrAlphabetSet;
  323. UnicodeSet fClosePunctuationSet;
  324. DictionaryMatcher *fDictionary;
  325. const Normalizer2 *nfkcNorm2;
  326. MlBreakEngine *fMlBreakEngine;
  327. bool isCj;
  328. private:
  329. // Load Japanese extensions.
  330. void loadJapaneseExtensions(UErrorCode& error);
  331. // Load Japanese Hiragana.
  332. void loadHiragana(UErrorCode& error);
  333. // Initialize fSkipSet by loading Japanese Hiragana and extensions.
  334. void initJapanesePhraseParameter(UErrorCode& error);
  335. Hashtable fSkipSet;
  336. public:
  337. /**
  338. * <p>Default constructor.</p>
  339. *
  340. * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
  341. * engine is deleted. The DictionaryMatcher must contain costs for each word
  342. * in order for the dictionary to work properly.
  343. */
  344. CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status);
  345. /**
  346. * <p>Virtual destructor.</p>
  347. */
  348. virtual ~CjkBreakEngine();
  349. protected:
  350. /**
  351. * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
  352. *
  353. * @param text A UText representing the text
  354. * @param rangeStart The start of the range of dictionary characters
  355. * @param rangeEnd The end of the range of dictionary characters
  356. * @param foundBreaks Output of C array of int32_t break positions, or 0
  357. * @param status Information on any errors encountered.
  358. * @return The number of breaks found
  359. */
  360. virtual int32_t divideUpDictionaryRange( UText *text,
  361. int32_t rangeStart,
  362. int32_t rangeEnd,
  363. UVector32 &foundBreaks,
  364. UBool isPhraseBreaking,
  365. UErrorCode& status) const override;
  366. };
  367. #endif
  368. U_NAMESPACE_END
  369. /* DICTBE_H */
  370. #endif