rbbistbl.cpp 8.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. //
  4. // file: rbbistbl.cpp Implementation of the ICU RBBISymbolTable class
  5. //
  6. /*
  7. ***************************************************************************
  8. * Copyright (C) 2002-2014 International Business Machines Corporation
  9. * and others. All rights reserved.
  10. ***************************************************************************
  11. */
  12. #include "unicode/utypes.h"
  13. #if !UCONFIG_NO_BREAK_ITERATION
  14. #include "unicode/unistr.h"
  15. #include "unicode/uniset.h"
  16. #include "unicode/uchar.h"
  17. #include "unicode/parsepos.h"
  18. #include "cstr.h"
  19. #include "rbbinode.h"
  20. #include "rbbirb.h"
  21. #include "umutex.h"
  22. //
  23. // RBBISymbolTableEntry_deleter Used by the UHashTable to delete the contents
  24. // when the hash table is deleted.
  25. //
  26. U_CDECL_BEGIN
  27. static void U_CALLCONV RBBISymbolTableEntry_deleter(void *p) {
  28. icu::RBBISymbolTableEntry *px = (icu::RBBISymbolTableEntry *)p;
  29. delete px;
  30. }
  31. U_CDECL_END
  32. U_NAMESPACE_BEGIN
  33. RBBISymbolTable::RBBISymbolTable(RBBIRuleScanner *rs, const UnicodeString &rules, UErrorCode &status)
  34. : fRules(rules), fRuleScanner(rs), ffffString(static_cast<char16_t>(0xffff))
  35. {
  36. fHashTable = nullptr;
  37. fCachedSetLookup = nullptr;
  38. fHashTable = uhash_open(uhash_hashUnicodeString, uhash_compareUnicodeString, nullptr, &status);
  39. // uhash_open checks status
  40. if (U_FAILURE(status)) {
  41. return;
  42. }
  43. uhash_setValueDeleter(fHashTable, RBBISymbolTableEntry_deleter);
  44. }
  45. RBBISymbolTable::~RBBISymbolTable()
  46. {
  47. uhash_close(fHashTable);
  48. }
  49. //
  50. // RBBISymbolTable::lookup This function from the abstract symbol table interface
  51. // looks up a variable name and returns a UnicodeString
  52. // containing the substitution text.
  53. //
  54. // The variable name does NOT include the leading $.
  55. //
  56. const UnicodeString *RBBISymbolTable::lookup(const UnicodeString& s) const
  57. {
  58. RBBISymbolTableEntry *el;
  59. RBBINode *varRefNode;
  60. RBBINode *exprNode;
  61. RBBINode *usetNode;
  62. const UnicodeString *retString;
  63. RBBISymbolTable *This = const_cast<RBBISymbolTable*>(this); // cast off const
  64. el = static_cast<RBBISymbolTableEntry*>(uhash_get(fHashTable, &s));
  65. if (el == nullptr) {
  66. return nullptr;
  67. }
  68. varRefNode = el->val;
  69. exprNode = varRefNode->fLeftChild; // Root node of expression for variable
  70. if (exprNode->fType == RBBINode::setRef) {
  71. // The $variable refers to a single UnicodeSet
  72. // return the ffffString, which will subsequently be interpreted as a
  73. // stand-in character for the set by RBBISymbolTable::lookupMatcher()
  74. usetNode = exprNode->fLeftChild;
  75. This->fCachedSetLookup = usetNode->fInputSet;
  76. retString = &ffffString;
  77. }
  78. else
  79. {
  80. // The variable refers to something other than just a set.
  81. // return the original source string for the expression
  82. retString = &exprNode->fText;
  83. This->fCachedSetLookup = nullptr;
  84. }
  85. return retString;
  86. }
  87. //
  88. // RBBISymbolTable::lookupMatcher This function from the abstract symbol table
  89. // interface maps a single stand-in character to a
  90. // pointer to a Unicode Set. The Unicode Set code uses this
  91. // mechanism to get all references to the same $variable
  92. // name to refer to a single common Unicode Set instance.
  93. //
  94. // This implementation cheats a little, and does not maintain a map of stand-in chars
  95. // to sets. Instead, it takes advantage of the fact that the UnicodeSet
  96. // constructor will always call this function right after calling lookup(),
  97. // and we just need to remember what set to return between these two calls.
  98. const UnicodeFunctor *RBBISymbolTable::lookupMatcher(UChar32 ch) const
  99. {
  100. UnicodeSet *retVal = nullptr;
  101. RBBISymbolTable *This = const_cast<RBBISymbolTable*>(this); // cast off const
  102. if (ch == 0xffff) {
  103. retVal = fCachedSetLookup;
  104. This->fCachedSetLookup = nullptr;
  105. }
  106. return retVal;
  107. }
  108. //
  109. // RBBISymbolTable::parseReference This function from the abstract symbol table interface
  110. // looks for a $variable name in the source text.
  111. // It does not look it up, only scans for it.
  112. // It is used by the UnicodeSet parser.
  113. //
  114. // This implementation is lifted pretty much verbatim
  115. // from the rules based transliterator implementation.
  116. // I didn't see an obvious way of sharing it.
  117. //
  118. UnicodeString RBBISymbolTable::parseReference(const UnicodeString& text,
  119. ParsePosition& pos, int32_t limit) const
  120. {
  121. int32_t start = pos.getIndex();
  122. int32_t i = start;
  123. UnicodeString result;
  124. while (i < limit) {
  125. char16_t c = text.charAt(i);
  126. if ((i==start && !u_isIDStart(c)) || !u_isIDPart(c)) {
  127. break;
  128. }
  129. ++i;
  130. }
  131. if (i == start) { // No valid name chars
  132. return result; // Indicate failure with empty string
  133. }
  134. pos.setIndex(i);
  135. text.extractBetween(start, i, result);
  136. return result;
  137. }
  138. //
  139. // RBBISymbolTable::lookupNode Given a key (a variable name), return the
  140. // corresponding RBBI Node. If there is no entry
  141. // in the table for this name, return nullptr.
  142. //
  143. RBBINode *RBBISymbolTable::lookupNode(const UnicodeString &key) const{
  144. RBBINode *retNode = nullptr;
  145. RBBISymbolTableEntry *el;
  146. el = static_cast<RBBISymbolTableEntry*>(uhash_get(fHashTable, &key));
  147. if (el != nullptr) {
  148. retNode = el->val;
  149. }
  150. return retNode;
  151. }
  152. //
  153. // RBBISymbolTable::addEntry Add a new entry to the symbol table.
  154. // Indicate an error if the name already exists -
  155. // this will only occur in the case of duplicate
  156. // variable assignments.
  157. //
  158. void RBBISymbolTable::addEntry (const UnicodeString &key, RBBINode *val, UErrorCode &err) {
  159. RBBISymbolTableEntry *e;
  160. /* test for buffer overflows */
  161. if (U_FAILURE(err)) {
  162. return;
  163. }
  164. e = static_cast<RBBISymbolTableEntry*>(uhash_get(fHashTable, &key));
  165. if (e != nullptr) {
  166. err = U_BRK_VARIABLE_REDFINITION;
  167. return;
  168. }
  169. e = new RBBISymbolTableEntry;
  170. if (e == nullptr) {
  171. err = U_MEMORY_ALLOCATION_ERROR;
  172. return;
  173. }
  174. e->key = key;
  175. e->val = val;
  176. uhash_put( fHashTable, &e->key, e, &err);
  177. }
  178. RBBISymbolTableEntry::RBBISymbolTableEntry() : UMemory(), key(), val(nullptr) {}
  179. RBBISymbolTableEntry::~RBBISymbolTableEntry() {
  180. // The "val" of a symbol table entry is a variable reference node.
  181. // The l. child of the val is the rhs expression from the assignment.
  182. // Unlike other node types, children of variable reference nodes are not
  183. // automatically recursively deleted. We do it manually here.
  184. delete val->fLeftChild;
  185. val->fLeftChild = nullptr;
  186. delete val;
  187. // Note: the key UnicodeString is destructed by virtue of being in the object by value.
  188. }
  //
  //  RBBISymbolTable::rbbiSymtablePrint    Debugging function, dump out the symbol
  //                                        table contents.  Only compiled when
  //                                        RBBI_DEBUG is defined.
  //
  //  Two passes are made over the hash table: the first prints one summary line
  //  per variable, the second prints each variable name followed by its node trees.
  //
  #ifdef RBBI_DEBUG
  void RBBISymbolTable::rbbiSymtablePrint() const {
      RBBIDebugPrintf("Variable Definitions Symbol Table\n"
                      "Name Node serial String Val\n"
                      "-------------------------------------------------------------------\n");

      // Pass 1: name, node address, node serial number, and source text of the expression.
      int32_t iter = UHASH_FIRST;
      const UHashElement *elem = nullptr;
      while ((elem = uhash_nextElement(fHashTable, &iter)) != nullptr) {
          RBBISymbolTableEntry *entry = static_cast<RBBISymbolTableEntry *>(elem->value.pointer);

          RBBIDebugPrintf("%-19s %8p %7d ", CStr(entry->key)(), static_cast<void *>(entry->val), entry->val->fSerialNum);
          RBBIDebugPrintf(" %s\n", CStr(entry->val->fLeftChild->fText)());
      }

      // Pass 2: restart the iteration and print the node trees for each variable.
      RBBIDebugPrintf("\nParsed Variable Definitions\n");
      iter = UHASH_FIRST;
      while ((elem = uhash_nextElement(fHashTable, &iter)) != nullptr) {
          RBBISymbolTableEntry *entry = static_cast<RBBISymbolTableEntry *>(elem->value.pointer);

          RBBIDebugPrintf("%s\n", CStr(entry->key)());
          RBBINode::printTree(entry->val, true);
          RBBINode::printTree(entry->val->fLeftChild, false);
          RBBIDebugPrintf("\n");
      }
  }
  #endif
  223. U_NAMESPACE_END
  224. #endif /* #if !UCONFIG_NO_BREAK_ITERATION */