mlbe.h 4.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116
  1. // © 2022 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. #ifndef MLBREAKENGINE_H
  4. #define MLBREAKENGINE_H
  5. #include "hash.h"
  6. #include "unicode/resbund.h"
  7. #include "unicode/uniset.h"
  8. #include "unicode/utext.h"
  9. #include "uvectr32.h"
  10. U_NAMESPACE_BEGIN
  11. #if !UCONFIG_NO_BREAK_ITERATION
  12. /**
  13. * A machine learning break engine for the phrase breaking in Japanese.
  14. */
  15. class MlBreakEngine : public UMemory {
  16. public:
  17. /**
  18. * Constructor.
  19. *
  20. * @param digitOrOpenPunctuationOrAlphabetSet An UnicodeSet with the digit, open punctuation and
  21. * alphabet.
  22. * @param closePunctuationSet An UnicodeSet with close punctuation.
  23. * @param status Information on any errors encountered.
  24. */
  25. MlBreakEngine(const UnicodeSet &digitOrOpenPunctuationOrAlphabetSet,
  26. const UnicodeSet &closePunctuationSet, UErrorCode &status);
  27. /**
  28. * Virtual destructor.
  29. */
  30. virtual ~MlBreakEngine();
  31. public:
  32. /**
  33. * Divide up a range of characters handled by this break engine.
  34. *
  35. * @param inText A UText representing the text
  36. * @param rangeStart The start of the range of the characters
  37. * @param rangeEnd The end of the range of the characters
  38. * @param foundBreaks Output of C array of int32_t break positions, or 0
  39. * @param inString The normalized string of text ranging from rangeStart to rangeEnd
  40. * @param inputMap The vector storing the native index of inText
  41. * @param status Information on any errors encountered.
  42. * @return The number of breaks found
  43. */
  44. int32_t divideUpRange(UText *inText, int32_t rangeStart, int32_t rangeEnd,
  45. UVector32 &foundBreaks, const UnicodeString &inString,
  46. const LocalPointer<UVector32> &inputMap, UErrorCode &status) const;
  47. private:
  48. /**
  49. * Load the machine learning's model file.
  50. *
  51. * @param error Information on any errors encountered.
  52. */
  53. void loadMLModel(UErrorCode &error);
  54. /**
  55. * In the machine learning's model file, specify the name of the key and value to load the
  56. * corresponding feature and its score.
  57. *
  58. * @param rb A ResouceBundle corresponding to the model file.
  59. * @param keyName The kay name in the model file.
  60. * @param valueName The value name in the model file.
  61. * @param model A hashtable to store the pairs of the feature and its score.
  62. * @param error Information on any errors encountered.
  63. */
  64. void initKeyValue(UResourceBundle *rb, const char *keyName, const char *valueName,
  65. Hashtable &model, UErrorCode &error);
  66. /**
  67. * Initialize the index list from the input string.
  68. *
  69. * @param inString A input string to be segmented.
  70. * @param indexList A code unit index list of inString.
  71. * @param status Information on any errors encountered.
  72. * @return The number of code units of the first four characters in inString.
  73. */
  74. int32_t initIndexList(const UnicodeString &inString, int32_t *indexList,
  75. UErrorCode &status) const;
  76. /**
  77. * Evaluate whether the index is a potential breakpoint.
  78. *
  79. * @param inString A input string to be segmented.
  80. * @param indexList A code unit index list of the inString.
  81. * @param startIdx The start index of the indexList.
  82. * @param numCodeUnits The current code unit boundary of the indexList.
  83. * @param numBreaks The accumulated number of breakpoints.
  84. * @param boundary A vector including the index of the breakpoint.
  85. * @param status Information on any errors encountered.
  86. * @return The number of breakpoints
  87. */
  88. int32_t evaluateBreakpoint(const UnicodeString &inString, int32_t *indexList, int32_t startIdx,
  89. int32_t numCodeUnits, int32_t numBreaks, UVector32 &boundary,
  90. UErrorCode &status) const;
  91. void printUnicodeString(const UnicodeString &s) const;
  92. UnicodeSet fDigitOrOpenPunctuationOrAlphabetSet;
  93. UnicodeSet fClosePunctuationSet;
  94. Hashtable fModel[13]; // {UW1, UW2, ... UW6, BW1, ... BW3, TW1, TW2, ... TW4} 6+3+4= 13
  95. int32_t fNegativeSum;
  96. };
  97. #endif
  98. U_NAMESPACE_END
  99. /* MLBREAKENGINE_H */
  100. #endif