propname.cpp 10.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. **********************************************************************
  5. * Copyright (c) 2002-2014, International Business Machines
  6. * Corporation and others. All Rights Reserved.
  7. **********************************************************************
  8. * Author: Alan Liu
  9. * Created: October 30 2002
  10. * Since: ICU 2.4
  11. * 2010nov19 Markus Scherer Rewrite for formatVersion 2.
  12. **********************************************************************
  13. */
  14. #include "propname.h"
  15. #include "unicode/uchar.h"
  16. #include "unicode/udata.h"
  17. #include "unicode/uscript.h"
  18. #include "umutex.h"
  19. #include "cmemory.h"
  20. #include "cstring.h"
  21. #include "uarrsort.h"
  22. #include "uinvchar.h"
  23. #define INCLUDED_FROM_PROPNAME_CPP
  24. #include "propname_data.h"
  25. U_CDECL_BEGIN
  26. /**
  27. * Get the next non-ignorable ASCII character from a property name
  28. * and lowercases it.
  29. * @return ((advance count for the name)<<8)|character
  30. */
  31. static inline int32_t
  32. getASCIIPropertyNameChar(const char *name) {
  33. int32_t i;
  34. char c;
  35. /* Ignore delimiters '-', '_', and ASCII White_Space */
  36. for(i=0;
  37. (c=name[i++])==0x2d || c==0x5f ||
  38. c==0x20 || (0x09<=c && c<=0x0d);
  39. ) {}
  40. if(c!=0) {
  41. return (i << 8) | static_cast<uint8_t>(uprv_asciitolower(c));
  42. } else {
  43. return i<<8;
  44. }
  45. }
  46. /**
  47. * Get the next non-ignorable EBCDIC character from a property name
  48. * and lowercases it.
  49. * @return ((advance count for the name)<<8)|character
  50. */
  51. static inline int32_t
  52. getEBCDICPropertyNameChar(const char *name) {
  53. int32_t i;
  54. char c;
  55. /* Ignore delimiters '-', '_', and EBCDIC White_Space */
  56. for(i=0;
  57. (c=name[i++])==0x60 || c==0x6d ||
  58. c==0x40 || c==0x05 || c==0x15 || c==0x25 || c==0x0b || c==0x0c || c==0x0d;
  59. ) {}
  60. if(c!=0) {
  61. return (i << 8) | static_cast<uint8_t>(uprv_ebcdictolower(c));
  62. } else {
  63. return i<<8;
  64. }
  65. }
  66. /**
  67. * Unicode property names and property value names are compared "loosely".
  68. *
  69. * UCD.html 4.0.1 says:
  70. * For all property names, property value names, and for property values for
  71. * Enumerated, Binary, or Catalog properties, use the following
  72. * loose matching rule:
  73. *
  74. * LM3. Ignore case, whitespace, underscore ('_'), and hyphens.
  75. *
  76. * This function does just that, for (char *) name strings.
  77. * It is almost identical to ucnv_compareNames() but also ignores
  78. * C0 White_Space characters (U+0009..U+000d, and U+0085 on EBCDIC).
  79. *
  80. * @internal
  81. */
  82. U_CAPI int32_t U_EXPORT2
  83. uprv_compareASCIIPropertyNames(const char *name1, const char *name2) {
  84. int32_t rc, r1, r2;
  85. for(;;) {
  86. r1=getASCIIPropertyNameChar(name1);
  87. r2=getASCIIPropertyNameChar(name2);
  88. /* If we reach the ends of both strings then they match */
  89. if(((r1|r2)&0xff)==0) {
  90. return 0;
  91. }
  92. /* Compare the lowercased characters */
  93. if(r1!=r2) {
  94. rc=(r1&0xff)-(r2&0xff);
  95. if(rc!=0) {
  96. return rc;
  97. }
  98. }
  99. name1+=r1>>8;
  100. name2+=r2>>8;
  101. }
  102. }
  103. U_CAPI int32_t U_EXPORT2
  104. uprv_compareEBCDICPropertyNames(const char *name1, const char *name2) {
  105. int32_t rc, r1, r2;
  106. for(;;) {
  107. r1=getEBCDICPropertyNameChar(name1);
  108. r2=getEBCDICPropertyNameChar(name2);
  109. /* If we reach the ends of both strings then they match */
  110. if(((r1|r2)&0xff)==0) {
  111. return 0;
  112. }
  113. /* Compare the lowercased characters */
  114. if(r1!=r2) {
  115. rc=(r1&0xff)-(r2&0xff);
  116. if(rc!=0) {
  117. return rc;
  118. }
  119. }
  120. name1+=r1>>8;
  121. name2+=r2>>8;
  122. }
  123. }
  124. U_CDECL_END
  125. U_NAMESPACE_BEGIN
  126. int32_t PropNameData::findProperty(int32_t property) {
  127. int32_t i=1; // valueMaps index, initially after numRanges
  128. for(int32_t numRanges=valueMaps[0]; numRanges>0; --numRanges) {
  129. // Read and skip the start and limit of this range.
  130. int32_t start=valueMaps[i];
  131. int32_t limit=valueMaps[i+1];
  132. i+=2;
  133. if(property<start) {
  134. break;
  135. }
  136. if(property<limit) {
  137. return i+(property-start)*2;
  138. }
  139. i+=(limit-start)*2; // Skip all entries for this range.
  140. }
  141. return 0;
  142. }
  143. int32_t PropNameData::findPropertyValueNameGroup(int32_t valueMapIndex, int32_t value) {
  144. if(valueMapIndex==0) {
  145. return 0; // The property does not have named values.
  146. }
  147. ++valueMapIndex; // Skip the BytesTrie offset.
  148. int32_t numRanges=valueMaps[valueMapIndex++];
  149. if(numRanges<0x10) {
  150. // Ranges of values.
  151. for(; numRanges>0; --numRanges) {
  152. // Read and skip the start and limit of this range.
  153. int32_t start=valueMaps[valueMapIndex];
  154. int32_t limit=valueMaps[valueMapIndex+1];
  155. valueMapIndex+=2;
  156. if(value<start) {
  157. break;
  158. }
  159. if(value<limit) {
  160. return valueMaps[valueMapIndex+value-start];
  161. }
  162. valueMapIndex+=limit-start; // Skip all entries for this range.
  163. }
  164. } else {
  165. // List of values.
  166. int32_t valuesStart=valueMapIndex;
  167. int32_t nameGroupOffsetsStart=valueMapIndex+numRanges-0x10;
  168. do {
  169. int32_t v=valueMaps[valueMapIndex];
  170. if(value<v) {
  171. break;
  172. }
  173. if(value==v) {
  174. return valueMaps[nameGroupOffsetsStart+valueMapIndex-valuesStart];
  175. }
  176. } while(++valueMapIndex<nameGroupOffsetsStart);
  177. }
  178. return 0;
  179. }
  180. const char *PropNameData::getName(const char *nameGroup, int32_t nameIndex) {
  181. int32_t numNames=*nameGroup++;
  182. if(nameIndex<0 || numNames<=nameIndex) {
  183. return nullptr;
  184. }
  185. // Skip nameIndex names.
  186. for(; nameIndex>0; --nameIndex) {
  187. nameGroup=uprv_strchr(nameGroup, 0)+1;
  188. }
  189. if(*nameGroup==0) {
  190. return nullptr; // no name (Property[Value]Aliases.txt has "n/a")
  191. }
  192. return nameGroup;
  193. }
  194. UBool PropNameData::containsName(BytesTrie &trie, const char *name) {
  195. if(name==nullptr) {
  196. return false;
  197. }
  198. UStringTrieResult result=USTRINGTRIE_NO_VALUE;
  199. char c;
  200. while((c=*name++)!=0) {
  201. c=uprv_invCharToLowercaseAscii(c);
  202. // Ignore delimiters '-', '_', and ASCII White_Space.
  203. if(c==0x2d || c==0x5f || c==0x20 || (0x09<=c && c<=0x0d)) {
  204. continue;
  205. }
  206. if(!USTRINGTRIE_HAS_NEXT(result)) {
  207. return false;
  208. }
  209. result = trie.next(static_cast<uint8_t>(c));
  210. }
  211. return USTRINGTRIE_HAS_VALUE(result);
  212. }
  213. const char *PropNameData::getPropertyName(int32_t property, int32_t nameChoice) {
  214. int32_t valueMapIndex=findProperty(property);
  215. if(valueMapIndex==0) {
  216. return nullptr; // Not a known property.
  217. }
  218. return getName(nameGroups+valueMaps[valueMapIndex], nameChoice);
  219. }
  220. const char *PropNameData::getPropertyValueName(int32_t property, int32_t value, int32_t nameChoice) {
  221. int32_t valueMapIndex=findProperty(property);
  222. if(valueMapIndex==0) {
  223. return nullptr; // Not a known property.
  224. }
  225. int32_t nameGroupOffset=findPropertyValueNameGroup(valueMaps[valueMapIndex+1], value);
  226. if(nameGroupOffset==0) {
  227. return nullptr;
  228. }
  229. return getName(nameGroups+nameGroupOffset, nameChoice);
  230. }
  231. int32_t PropNameData::getPropertyOrValueEnum(int32_t bytesTrieOffset, const char *alias) {
  232. BytesTrie trie(bytesTries+bytesTrieOffset);
  233. if(containsName(trie, alias)) {
  234. return trie.getValue();
  235. } else {
  236. return UCHAR_INVALID_CODE;
  237. }
  238. }
  239. int32_t PropNameData::getPropertyEnum(const char *alias) {
  240. return getPropertyOrValueEnum(0, alias);
  241. }
  242. int32_t PropNameData::getPropertyValueEnum(int32_t property, const char *alias) {
  243. int32_t valueMapIndex=findProperty(property);
  244. if(valueMapIndex==0) {
  245. return UCHAR_INVALID_CODE; // Not a known property.
  246. }
  247. valueMapIndex=valueMaps[valueMapIndex+1];
  248. if(valueMapIndex==0) {
  249. return UCHAR_INVALID_CODE; // The property does not have named values.
  250. }
  251. // valueMapIndex is the start of the property's valueMap,
  252. // where the first word is the BytesTrie offset.
  253. return getPropertyOrValueEnum(valueMaps[valueMapIndex], alias);
  254. }
  255. U_NAMESPACE_END
  256. //----------------------------------------------------------------------
  257. // Public API implementation
  258. U_CAPI const char* U_EXPORT2
  259. u_getPropertyName(UProperty property,
  260. UPropertyNameChoice nameChoice) UPRV_NO_SANITIZE_UNDEFINED {
  261. // The nameChoice is really an integer with a couple of named constants.
  262. // Unicode allows for names other than short and long ones.
  263. // If present, these will be returned for U_LONG_PROPERTY_NAME + i, where i=1, 2,...
  264. U_NAMESPACE_USE
  265. return PropNameData::getPropertyName(property, nameChoice);
  266. }
  267. U_CAPI UProperty U_EXPORT2
  268. u_getPropertyEnum(const char* alias) {
  269. U_NAMESPACE_USE
  270. return (UProperty)PropNameData::getPropertyEnum(alias);
  271. }
  272. U_CAPI const char* U_EXPORT2
  273. u_getPropertyValueName(UProperty property,
  274. int32_t value,
  275. UPropertyNameChoice nameChoice) UPRV_NO_SANITIZE_UNDEFINED {
  276. // The nameChoice is really an integer with a couple of named constants.
  277. // Unicode allows for names other than short and long ones.
  278. // If present, these will be returned for U_LONG_PROPERTY_NAME + i, where i=1, 2,...
  279. U_NAMESPACE_USE
  280. return PropNameData::getPropertyValueName(property, value, nameChoice);
  281. }
  282. U_CAPI int32_t U_EXPORT2
  283. u_getPropertyValueEnum(UProperty property,
  284. const char* alias) {
  285. U_NAMESPACE_USE
  286. return PropNameData::getPropertyValueEnum(property, alias);
  287. }
  288. U_CAPI const char* U_EXPORT2
  289. uscript_getName(UScriptCode scriptCode){
  290. return u_getPropertyValueName(UCHAR_SCRIPT, scriptCode,
  291. U_LONG_PROPERTY_NAME);
  292. }
  293. U_CAPI const char* U_EXPORT2
  294. uscript_getShortName(UScriptCode scriptCode){
  295. return u_getPropertyValueName(UCHAR_SCRIPT, scriptCode,
  296. U_SHORT_PROPERTY_NAME);
  297. }