123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116 |
- // © 2022 and later: Unicode, Inc. and others.
- // License & terms of use: http://www.unicode.org/copyright.html
- #ifndef MLBREAKENGINE_H
- #define MLBREAKENGINE_H
- #include "hash.h"
- #include "unicode/resbund.h"
- #include "unicode/uniset.h"
- #include "unicode/utext.h"
- #include "uvectr32.h"
- U_NAMESPACE_BEGIN
- #if !UCONFIG_NO_BREAK_ITERATION
- /**
- * A machine learning break engine for phrase breaking in Japanese.
- *
- * The engine is built from two UnicodeSets and exposes a single public operation,
- * divideUpRange(), which is const. The remaining member functions are private helpers.
- */
- class MlBreakEngine : public UMemory {
- public:
- /**
- * Constructor.
- *
- * @param digitOrOpenPunctuationOrAlphabetSet A UnicodeSet with the digit, open punctuation and
- * alphabet.
- * @param closePunctuationSet A UnicodeSet with close punctuation.
- * @param status Information on any errors encountered.
- */
- MlBreakEngine(const UnicodeSet &digitOrOpenPunctuationOrAlphabetSet,
- const UnicodeSet &closePunctuationSet, UErrorCode &status);
- /**
- * Virtual destructor.
- */
- virtual ~MlBreakEngine();
- public:
- /**
- * Divide up a range of characters handled by this break engine.
- *
- * @param inText A UText representing the text.
- * @param rangeStart The start of the range of the characters.
- * @param rangeEnd The end of the range of the characters.
- * @param foundBreaks Output vector of int32_t break positions.
- * @param inString The normalized string of text ranging from rangeStart to rangeEnd.
- * @param inputMap The vector storing the native index of inText.
- * @param status Information on any errors encountered.
- * @return The number of breaks found.
- */
- int32_t divideUpRange(UText *inText, int32_t rangeStart, int32_t rangeEnd,
- UVector32 &foundBreaks, const UnicodeString &inString,
- const LocalPointer<UVector32> &inputMap, UErrorCode &status) const;
- private:
- /**
- * Load the machine learning model file.
- *
- * @param error Information on any errors encountered.
- */
- void loadMLModel(UErrorCode &error);
- /**
- * In the machine learning model file, specify the name of the key and value to load the
- * corresponding feature and its score.
- *
- * @param rb A ResourceBundle corresponding to the model file.
- * @param keyName The key name in the model file.
- * @param valueName The value name in the model file.
- * @param model A hashtable to store the pairs of the feature and its score.
- * @param error Information on any errors encountered.
- */
- void initKeyValue(UResourceBundle *rb, const char *keyName, const char *valueName,
- Hashtable &model, UErrorCode &error);
- /**
- * Initialize the index list from the input string.
- *
- * @param inString An input string to be segmented.
- * @param indexList A code unit index list of inString. The required capacity is not stated
- * in this header; check the callers in the implementation file.
- * @param status Information on any errors encountered.
- * @return The number of code units of the first four characters in inString.
- */
- int32_t initIndexList(const UnicodeString &inString, int32_t *indexList,
- UErrorCode &status) const;
- /**
- * Evaluate whether the index is a potential breakpoint.
- *
- * @param inString An input string to be segmented.
- * @param indexList A code unit index list of inString.
- * @param startIdx The start index of the indexList.
- * @param numCodeUnits The current code unit boundary of the indexList.
- * @param numBreaks The accumulated number of breakpoints.
- * @param boundary A vector including the index of the breakpoint.
- * @param status Information on any errors encountered.
- * @return The number of breakpoints.
- */
- int32_t evaluateBreakpoint(const UnicodeString &inString, int32_t *indexList, int32_t startIdx,
- int32_t numCodeUnits, int32_t numBreaks, UVector32 &boundary,
- UErrorCode &status) const;
- void printUnicodeString(const UnicodeString &s) const; // NOTE(review): undocumented; presumably a debugging helper that prints s -- confirm in the implementation file.
- UnicodeSet fDigitOrOpenPunctuationOrAlphabetSet; // presumably holds the constructor's first set argument -- TODO confirm
- UnicodeSet fClosePunctuationSet; // presumably holds the constructor's closePunctuationSet argument -- TODO confirm
- Hashtable fModel[13]; // One table per feature: {UW1, UW2, ... UW6, BW1, ... BW3, TW1, TW2, ... TW4}, 6 + 3 + 4 = 13
- int32_t fNegativeSum; // NOTE(review): meaning is not documented in this header; see where the model is loaded in the implementation file.
- };
- #endif
- U_NAMESPACE_END
- /* MLBREAKENGINE_H */
- #endif
|