loclikely.cpp 40 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. *******************************************************************************
  5. *
  6. * Copyright (C) 1997-2016, International Business Machines
  7. * Corporation and others. All Rights Reserved.
  8. *
  9. *******************************************************************************
  10. * file name: loclikely.cpp
  11. * encoding: UTF-8
  12. * tab size: 8 (not used)
  13. * indentation:4
  14. *
  15. * created on: 2010feb25
  16. * created by: Markus W. Scherer
  17. *
  18. * Code for likely and minimized locale subtags, separated out from other .cpp files
  19. * that then do not depend on resource bundle code and likely-subtags data.
  20. */
  21. #include "unicode/bytestream.h"
  22. #include "unicode/utypes.h"
  23. #include "unicode/locid.h"
  24. #include "unicode/putil.h"
  25. #include "unicode/uchar.h"
  26. #include "unicode/uloc.h"
  27. #include "unicode/ures.h"
  28. #include "unicode/uscript.h"
  29. #include "bytesinkutil.h"
  30. #include "charstr.h"
  31. #include "cmemory.h"
  32. #include "cstring.h"
  33. #include "ulocimp.h"
  34. #include "ustr_imp.h"
  /**
   * Canonical subtag spellings used when a language, script or region
   * is unknown.
   **/
  static constexpr const char* unknownLanguage = "und";
  static constexpr const char* unknownScript = "Zzzz";
  static constexpr const char* unknownRegion = "ZZ";
  41. /**
  42. * This function looks for the localeID in the likelySubtags resource.
  43. *
  44. * @param localeID The tag to find.
  45. * @param buffer A buffer to hold the matching entry
  46. * @param bufferLength The length of the output buffer
  47. * @return A pointer to "buffer" if found, or a null pointer if not.
  48. */
  49. static const char* U_CALLCONV
  50. findLikelySubtags(const char* localeID,
  51. char* buffer,
  52. int32_t bufferLength,
  53. UErrorCode* err) {
  54. const char* result = nullptr;
  55. if (!U_FAILURE(*err)) {
  56. int32_t resLen = 0;
  57. const char16_t* s = nullptr;
  58. UErrorCode tmpErr = U_ZERO_ERROR;
  59. icu::LocalUResourceBundlePointer subtags(ures_openDirect(nullptr, "likelySubtags", &tmpErr));
  60. if (U_SUCCESS(tmpErr)) {
  61. icu::CharString und;
  62. if (localeID != nullptr) {
  63. if (*localeID == '\0') {
  64. localeID = unknownLanguage;
  65. } else if (*localeID == '_') {
  66. und.append(unknownLanguage, *err);
  67. und.append(localeID, *err);
  68. if (U_FAILURE(*err)) {
  69. return nullptr;
  70. }
  71. localeID = und.data();
  72. }
  73. }
  74. s = ures_getStringByKey(subtags.getAlias(), localeID, &resLen, &tmpErr);
  75. if (U_FAILURE(tmpErr)) {
  76. /*
  77. * If a resource is missing, it's not really an error, it's
  78. * just that we don't have any data for that particular locale ID.
  79. */
  80. if (tmpErr != U_MISSING_RESOURCE_ERROR) {
  81. *err = tmpErr;
  82. }
  83. }
  84. else if (resLen >= bufferLength) {
  85. /* The buffer should never overflow. */
  86. *err = U_INTERNAL_PROGRAM_ERROR;
  87. }
  88. else {
  89. u_UCharsToChars(s, buffer, resLen + 1);
  90. if (resLen >= 3 &&
  91. uprv_strnicmp(buffer, unknownLanguage, 3) == 0 &&
  92. (resLen == 3 || buffer[3] == '_')) {
  93. uprv_memmove(buffer, buffer + 3, resLen - 3 + 1);
  94. }
  95. result = buffer;
  96. }
  97. } else {
  98. *err = tmpErr;
  99. }
  100. }
  101. return result;
  102. }
  103. /**
  104. * Append a tag to a buffer, adding the separator if necessary. The buffer
  105. * must be large enough to contain the resulting tag plus any separator
  106. * necessary. The tag must not be a zero-length string.
  107. *
  108. * @param tag The tag to add.
  109. * @param tagLength The length of the tag.
  110. * @param buffer The output buffer.
  111. * @param bufferLength The length of the output buffer. This is an input/output parameter.
  112. **/
  113. static void U_CALLCONV
  114. appendTag(
  115. const char* tag,
  116. int32_t tagLength,
  117. char* buffer,
  118. int32_t* bufferLength,
  119. UBool withSeparator) {
  120. if (withSeparator) {
  121. buffer[*bufferLength] = '_';
  122. ++(*bufferLength);
  123. }
  124. uprv_memmove(
  125. &buffer[*bufferLength],
  126. tag,
  127. tagLength);
  128. *bufferLength += tagLength;
  129. }
  130. /**
  131. * Create a tag string from the supplied parameters. The lang, script and region
  132. * parameters may be nullptr pointers. If they are, their corresponding length parameters
  133. * must be less than or equal to 0.
  134. *
  135. * If any of the language, script or region parameters are empty, and the alternateTags
  136. * parameter is not nullptr, it will be parsed for potential language, script and region tags
  137. * to be used when constructing the new tag. If the alternateTags parameter is nullptr, or
  138. * it contains no language tag, the default tag for the unknown language is used.
  139. *
  140. * If the length of the new string exceeds the capacity of the output buffer,
  141. * the function copies as many bytes to the output buffer as it can, and returns
  142. * the error U_BUFFER_OVERFLOW_ERROR.
  143. *
  144. * If an illegal argument is provided, the function returns the error
  145. * U_ILLEGAL_ARGUMENT_ERROR.
  146. *
  147. * Note that this function can return the warning U_STRING_NOT_TERMINATED_WARNING if
  148. * the tag string fits in the output buffer, but the null terminator doesn't.
  149. *
  150. * @param lang The language tag to use.
  151. * @param langLength The length of the language tag.
  152. * @param script The script tag to use.
  153. * @param scriptLength The length of the script tag.
  154. * @param region The region tag to use.
  155. * @param regionLength The length of the region tag.
  156. * @param trailing Any trailing data to append to the new tag.
  157. * @param trailingLength The length of the trailing data.
  158. * @param alternateTags A string containing any alternate tags.
  159. * @param sink The output sink receiving the tag string.
  160. * @param err A pointer to a UErrorCode for error reporting.
  161. **/
  162. static void U_CALLCONV
  163. createTagStringWithAlternates(
  164. const char* lang,
  165. int32_t langLength,
  166. const char* script,
  167. int32_t scriptLength,
  168. const char* region,
  169. int32_t regionLength,
  170. const char* trailing,
  171. int32_t trailingLength,
  172. const char* alternateTags,
  173. icu::ByteSink& sink,
  174. UErrorCode* err) {
  175. if (U_FAILURE(*err)) {
  176. goto error;
  177. }
  178. else if (langLength >= ULOC_LANG_CAPACITY ||
  179. scriptLength >= ULOC_SCRIPT_CAPACITY ||
  180. regionLength >= ULOC_COUNTRY_CAPACITY) {
  181. goto error;
  182. }
  183. else {
  184. /**
  185. * ULOC_FULLNAME_CAPACITY will provide enough capacity
  186. * that we can build a string that contains the language,
  187. * script and region code without worrying about overrunning
  188. * the user-supplied buffer.
  189. **/
  190. char tagBuffer[ULOC_FULLNAME_CAPACITY];
  191. int32_t tagLength = 0;
  192. UBool regionAppended = false;
  193. if (langLength > 0) {
  194. appendTag(
  195. lang,
  196. langLength,
  197. tagBuffer,
  198. &tagLength,
  199. /*withSeparator=*/false);
  200. }
  201. else if (alternateTags == nullptr) {
  202. /*
  203. * Use the empty string for an unknown language, if
  204. * we found no language.
  205. */
  206. }
  207. else {
  208. /*
  209. * Parse the alternateTags string for the language.
  210. */
  211. char alternateLang[ULOC_LANG_CAPACITY];
  212. int32_t alternateLangLength = sizeof(alternateLang);
  213. alternateLangLength =
  214. uloc_getLanguage(
  215. alternateTags,
  216. alternateLang,
  217. alternateLangLength,
  218. err);
  219. if(U_FAILURE(*err) ||
  220. alternateLangLength >= ULOC_LANG_CAPACITY) {
  221. goto error;
  222. }
  223. else if (alternateLangLength == 0) {
  224. /*
  225. * Use the empty string for an unknown language, if
  226. * we found no language.
  227. */
  228. }
  229. else {
  230. appendTag(
  231. alternateLang,
  232. alternateLangLength,
  233. tagBuffer,
  234. &tagLength,
  235. /*withSeparator=*/false);
  236. }
  237. }
  238. if (scriptLength > 0) {
  239. appendTag(
  240. script,
  241. scriptLength,
  242. tagBuffer,
  243. &tagLength,
  244. /*withSeparator=*/true);
  245. }
  246. else if (alternateTags != nullptr) {
  247. /*
  248. * Parse the alternateTags string for the script.
  249. */
  250. char alternateScript[ULOC_SCRIPT_CAPACITY];
  251. const int32_t alternateScriptLength =
  252. uloc_getScript(
  253. alternateTags,
  254. alternateScript,
  255. sizeof(alternateScript),
  256. err);
  257. if (U_FAILURE(*err) ||
  258. alternateScriptLength >= ULOC_SCRIPT_CAPACITY) {
  259. goto error;
  260. }
  261. else if (alternateScriptLength > 0) {
  262. appendTag(
  263. alternateScript,
  264. alternateScriptLength,
  265. tagBuffer,
  266. &tagLength,
  267. /*withSeparator=*/true);
  268. }
  269. }
  270. if (regionLength > 0) {
  271. appendTag(
  272. region,
  273. regionLength,
  274. tagBuffer,
  275. &tagLength,
  276. /*withSeparator=*/true);
  277. regionAppended = true;
  278. }
  279. else if (alternateTags != nullptr) {
  280. /*
  281. * Parse the alternateTags string for the region.
  282. */
  283. char alternateRegion[ULOC_COUNTRY_CAPACITY];
  284. const int32_t alternateRegionLength =
  285. uloc_getCountry(
  286. alternateTags,
  287. alternateRegion,
  288. sizeof(alternateRegion),
  289. err);
  290. if (U_FAILURE(*err) ||
  291. alternateRegionLength >= ULOC_COUNTRY_CAPACITY) {
  292. goto error;
  293. }
  294. else if (alternateRegionLength > 0) {
  295. appendTag(
  296. alternateRegion,
  297. alternateRegionLength,
  298. tagBuffer,
  299. &tagLength,
  300. /*withSeparator=*/true);
  301. regionAppended = true;
  302. }
  303. }
  304. /**
  305. * Copy the partial tag from our internal buffer to the supplied
  306. * target.
  307. **/
  308. sink.Append(tagBuffer, tagLength);
  309. if (trailingLength > 0) {
  310. if (*trailing != '@') {
  311. sink.Append("_", 1);
  312. if (!regionAppended) {
  313. /* extra separator is required */
  314. sink.Append("_", 1);
  315. }
  316. }
  317. /*
  318. * Copy the trailing data into the supplied buffer.
  319. */
  320. sink.Append(trailing, trailingLength);
  321. }
  322. return;
  323. }
  324. error:
  325. /**
  326. * An overflow indicates the locale ID passed in
  327. * is ill-formed. If we got here, and there was
  328. * no previous error, it's an implicit overflow.
  329. **/
  330. if (*err == U_BUFFER_OVERFLOW_ERROR ||
  331. U_SUCCESS(*err)) {
  332. *err = U_ILLEGAL_ARGUMENT_ERROR;
  333. }
  334. }
  335. /**
  336. * Create a tag string from the supplied parameters. The lang, script and region
  337. * parameters may be nullptr pointers. If they are, their corresponding length parameters
  338. * must be less than or equal to 0. If the lang parameter is an empty string, the
  339. * default value for an unknown language is written to the output buffer.
  340. *
  341. * If the length of the new string exceeds the capacity of the output buffer,
  342. * the function copies as many bytes to the output buffer as it can, and returns
  343. * the error U_BUFFER_OVERFLOW_ERROR.
  344. *
  345. * If an illegal argument is provided, the function returns the error
  346. * U_ILLEGAL_ARGUMENT_ERROR.
  347. *
  348. * @param lang The language tag to use.
  349. * @param langLength The length of the language tag.
  350. * @param script The script tag to use.
  351. * @param scriptLength The length of the script tag.
  352. * @param region The region tag to use.
  353. * @param regionLength The length of the region tag.
  354. * @param trailing Any trailing data to append to the new tag.
  355. * @param trailingLength The length of the trailing data.
  356. * @param sink The output sink receiving the tag string.
  357. * @param err A pointer to a UErrorCode for error reporting.
  358. **/
  359. static void U_CALLCONV
  360. createTagString(
  361. const char* lang,
  362. int32_t langLength,
  363. const char* script,
  364. int32_t scriptLength,
  365. const char* region,
  366. int32_t regionLength,
  367. const char* trailing,
  368. int32_t trailingLength,
  369. icu::ByteSink& sink,
  370. UErrorCode* err)
  371. {
  372. createTagStringWithAlternates(
  373. lang,
  374. langLength,
  375. script,
  376. scriptLength,
  377. region,
  378. regionLength,
  379. trailing,
  380. trailingLength,
  381. nullptr,
  382. sink,
  383. err);
  384. }
  385. /**
  386. * Parse the language, script, and region subtags from a tag string, and copy the
  387. * results into the corresponding output parameters. The buffers are null-terminated,
  388. * unless overflow occurs.
  389. *
  390. * The langLength, scriptLength, and regionLength parameters are input/output
  391. * parameters, and must contain the capacity of their corresponding buffers on
  392. * input. On output, they will contain the actual length of the buffers, not
  393. * including the null terminator.
  394. *
  395. * If the length of any of the output subtags exceeds the capacity of the corresponding
  396. * buffer, the function copies as many bytes to the output buffer as it can, and returns
  397. * the error U_BUFFER_OVERFLOW_ERROR. It will not parse any more subtags once overflow
  398. * occurs.
  399. *
  400. * If an illegal argument is provided, the function returns the error
  401. * U_ILLEGAL_ARGUMENT_ERROR.
  402. *
  403. * @param localeID The locale ID to parse.
  404. * @param lang The language tag buffer.
  405. * @param langLength The length of the language tag.
  406. * @param script The script tag buffer.
  407. * @param scriptLength The length of the script tag.
  408. * @param region The region tag buffer.
  409. * @param regionLength The length of the region tag.
  410. * @param err A pointer to a UErrorCode for error reporting.
  411. * @return The number of chars of the localeID parameter consumed.
  412. **/
  413. static int32_t U_CALLCONV
  414. parseTagString(
  415. const char* localeID,
  416. char* lang,
  417. int32_t* langLength,
  418. char* script,
  419. int32_t* scriptLength,
  420. char* region,
  421. int32_t* regionLength,
  422. UErrorCode* err)
  423. {
  424. const char* position = localeID;
  425. int32_t subtagLength = 0;
  426. if(U_FAILURE(*err) ||
  427. localeID == nullptr ||
  428. lang == nullptr ||
  429. langLength == nullptr ||
  430. script == nullptr ||
  431. scriptLength == nullptr ||
  432. region == nullptr ||
  433. regionLength == nullptr) {
  434. goto error;
  435. }
  436. subtagLength = ulocimp_getLanguage(position, &position, *err).extract(lang, *langLength, *err);
  437. /*
  438. * Note that we explicit consider U_STRING_NOT_TERMINATED_WARNING
  439. * to be an error, because it indicates the user-supplied tag is
  440. * not well-formed.
  441. */
  442. if(U_FAILURE(*err)) {
  443. goto error;
  444. }
  445. *langLength = subtagLength;
  446. /*
  447. * If no language was present, use the empty string instead.
  448. * Otherwise, move past any separator.
  449. */
  450. if (_isIDSeparator(*position)) {
  451. ++position;
  452. }
  453. subtagLength = ulocimp_getScript(position, &position, *err).extract(script, *scriptLength, *err);
  454. if(U_FAILURE(*err)) {
  455. goto error;
  456. }
  457. *scriptLength = subtagLength;
  458. if (*scriptLength > 0) {
  459. if (uprv_strnicmp(script, unknownScript, *scriptLength) == 0) {
  460. /**
  461. * If the script part is the "unknown" script, then don't return it.
  462. **/
  463. *scriptLength = 0;
  464. }
  465. /*
  466. * Move past any separator.
  467. */
  468. if (_isIDSeparator(*position)) {
  469. ++position;
  470. }
  471. }
  472. subtagLength = ulocimp_getCountry(position, &position, *err).extract(region, *regionLength, *err);
  473. if(U_FAILURE(*err)) {
  474. goto error;
  475. }
  476. *regionLength = subtagLength;
  477. if (*regionLength > 0) {
  478. if (uprv_strnicmp(region, unknownRegion, *regionLength) == 0) {
  479. /**
  480. * If the region part is the "unknown" region, then don't return it.
  481. **/
  482. *regionLength = 0;
  483. }
  484. } else if (*position != 0 && *position != '@') {
  485. /* back up over consumed trailing separator */
  486. --position;
  487. }
  488. exit:
  489. return (int32_t)(position - localeID);
  490. error:
  491. /**
  492. * If we get here, we have no explicit error, it's the result of an
  493. * illegal argument.
  494. **/
  495. if (!U_FAILURE(*err)) {
  496. *err = U_ILLEGAL_ARGUMENT_ERROR;
  497. }
  498. goto exit;
  499. }
  500. static UBool U_CALLCONV
  501. createLikelySubtagsString(
  502. const char* lang,
  503. int32_t langLength,
  504. const char* script,
  505. int32_t scriptLength,
  506. const char* region,
  507. int32_t regionLength,
  508. const char* variants,
  509. int32_t variantsLength,
  510. icu::ByteSink& sink,
  511. UErrorCode* err) {
  512. /**
  513. * ULOC_FULLNAME_CAPACITY will provide enough capacity
  514. * that we can build a string that contains the language,
  515. * script and region code without worrying about overrunning
  516. * the user-supplied buffer.
  517. **/
  518. char likelySubtagsBuffer[ULOC_FULLNAME_CAPACITY];
  519. if(U_FAILURE(*err)) {
  520. goto error;
  521. }
  522. /**
  523. * Try the language with the script and region first.
  524. **/
  525. if (scriptLength > 0 && regionLength > 0) {
  526. const char* likelySubtags = nullptr;
  527. icu::CharString tagBuffer;
  528. {
  529. icu::CharStringByteSink sink(&tagBuffer);
  530. createTagString(
  531. lang,
  532. langLength,
  533. script,
  534. scriptLength,
  535. region,
  536. regionLength,
  537. nullptr,
  538. 0,
  539. sink,
  540. err);
  541. }
  542. if(U_FAILURE(*err)) {
  543. goto error;
  544. }
  545. likelySubtags =
  546. findLikelySubtags(
  547. tagBuffer.data(),
  548. likelySubtagsBuffer,
  549. sizeof(likelySubtagsBuffer),
  550. err);
  551. if(U_FAILURE(*err)) {
  552. goto error;
  553. }
  554. if (likelySubtags != nullptr) {
  555. /* Always use the language tag from the
  556. maximal string, since it may be more
  557. specific than the one provided. */
  558. createTagStringWithAlternates(
  559. nullptr,
  560. 0,
  561. nullptr,
  562. 0,
  563. nullptr,
  564. 0,
  565. variants,
  566. variantsLength,
  567. likelySubtags,
  568. sink,
  569. err);
  570. return true;
  571. }
  572. }
  573. /**
  574. * Try the language with just the script.
  575. **/
  576. if (scriptLength > 0) {
  577. const char* likelySubtags = nullptr;
  578. icu::CharString tagBuffer;
  579. {
  580. icu::CharStringByteSink sink(&tagBuffer);
  581. createTagString(
  582. lang,
  583. langLength,
  584. script,
  585. scriptLength,
  586. nullptr,
  587. 0,
  588. nullptr,
  589. 0,
  590. sink,
  591. err);
  592. }
  593. if(U_FAILURE(*err)) {
  594. goto error;
  595. }
  596. likelySubtags =
  597. findLikelySubtags(
  598. tagBuffer.data(),
  599. likelySubtagsBuffer,
  600. sizeof(likelySubtagsBuffer),
  601. err);
  602. if(U_FAILURE(*err)) {
  603. goto error;
  604. }
  605. if (likelySubtags != nullptr) {
  606. /* Always use the language tag from the
  607. maximal string, since it may be more
  608. specific than the one provided. */
  609. createTagStringWithAlternates(
  610. nullptr,
  611. 0,
  612. nullptr,
  613. 0,
  614. region,
  615. regionLength,
  616. variants,
  617. variantsLength,
  618. likelySubtags,
  619. sink,
  620. err);
  621. return true;
  622. }
  623. }
  624. /**
  625. * Try the language with just the region.
  626. **/
  627. if (regionLength > 0) {
  628. const char* likelySubtags = nullptr;
  629. icu::CharString tagBuffer;
  630. {
  631. icu::CharStringByteSink sink(&tagBuffer);
  632. createTagString(
  633. lang,
  634. langLength,
  635. nullptr,
  636. 0,
  637. region,
  638. regionLength,
  639. nullptr,
  640. 0,
  641. sink,
  642. err);
  643. }
  644. if(U_FAILURE(*err)) {
  645. goto error;
  646. }
  647. likelySubtags =
  648. findLikelySubtags(
  649. tagBuffer.data(),
  650. likelySubtagsBuffer,
  651. sizeof(likelySubtagsBuffer),
  652. err);
  653. if(U_FAILURE(*err)) {
  654. goto error;
  655. }
  656. if (likelySubtags != nullptr) {
  657. /* Always use the language tag from the
  658. maximal string, since it may be more
  659. specific than the one provided. */
  660. createTagStringWithAlternates(
  661. nullptr,
  662. 0,
  663. script,
  664. scriptLength,
  665. nullptr,
  666. 0,
  667. variants,
  668. variantsLength,
  669. likelySubtags,
  670. sink,
  671. err);
  672. return true;
  673. }
  674. }
  675. /**
  676. * Finally, try just the language.
  677. **/
  678. {
  679. const char* likelySubtags = nullptr;
  680. icu::CharString tagBuffer;
  681. {
  682. icu::CharStringByteSink sink(&tagBuffer);
  683. createTagString(
  684. lang,
  685. langLength,
  686. nullptr,
  687. 0,
  688. nullptr,
  689. 0,
  690. nullptr,
  691. 0,
  692. sink,
  693. err);
  694. }
  695. if(U_FAILURE(*err)) {
  696. goto error;
  697. }
  698. likelySubtags =
  699. findLikelySubtags(
  700. tagBuffer.data(),
  701. likelySubtagsBuffer,
  702. sizeof(likelySubtagsBuffer),
  703. err);
  704. if(U_FAILURE(*err)) {
  705. goto error;
  706. }
  707. if (likelySubtags != nullptr) {
  708. /* Always use the language tag from the
  709. maximal string, since it may be more
  710. specific than the one provided. */
  711. createTagStringWithAlternates(
  712. nullptr,
  713. 0,
  714. script,
  715. scriptLength,
  716. region,
  717. regionLength,
  718. variants,
  719. variantsLength,
  720. likelySubtags,
  721. sink,
  722. err);
  723. return true;
  724. }
  725. }
  726. return false;
  727. error:
  728. if (!U_FAILURE(*err)) {
  729. *err = U_ILLEGAL_ARGUMENT_ERROR;
  730. }
  731. return false;
  732. }
  /**
   * Check that no subtag in the trailing (variant) part of a locale ID is
   * longer than 8 characters. Subtags are delimited by '-' or '_', and
   * scanning stops at '@' or after trailingLength characters.
   *
   * On a violation the macro jumps to a label named "error", which must
   * exist in the function that uses it.
   *
   * The length is checked on every character of a subtag, so an over-long
   * subtag is rejected no matter whether a separator, '@' or the end of the
   * string follows it.
   *
   * @param trailing The text to check.
   * @param trailingLength The number of characters of trailing to look at.
   **/
  #define CHECK_TRAILING_VARIANT_SIZE(trailing, trailingLength) UPRV_BLOCK_MACRO_BEGIN { \
      int32_t count = 0; \
      int32_t i; \
      for (i = 0; i < (trailingLength); i++) { \
          if ((trailing)[i] == '-' || (trailing)[i] == '_') { \
              count = 0; \
          } else if ((trailing)[i] == '@') { \
              break; \
          } else if (count >= 8) { \
              goto error; \
          } else { \
              count++; \
          } \
      } \
  } UPRV_BLOCK_MACRO_END
  751. static UBool
  752. _uloc_addLikelySubtags(const char* localeID,
  753. icu::ByteSink& sink,
  754. UErrorCode* err) {
  755. char lang[ULOC_LANG_CAPACITY];
  756. int32_t langLength = sizeof(lang);
  757. char script[ULOC_SCRIPT_CAPACITY];
  758. int32_t scriptLength = sizeof(script);
  759. char region[ULOC_COUNTRY_CAPACITY];
  760. int32_t regionLength = sizeof(region);
  761. const char* trailing = "";
  762. int32_t trailingLength = 0;
  763. int32_t trailingIndex = 0;
  764. UBool success = false;
  765. if(U_FAILURE(*err)) {
  766. goto error;
  767. }
  768. if (localeID == nullptr) {
  769. goto error;
  770. }
  771. trailingIndex = parseTagString(
  772. localeID,
  773. lang,
  774. &langLength,
  775. script,
  776. &scriptLength,
  777. region,
  778. &regionLength,
  779. err);
  780. if(U_FAILURE(*err)) {
  781. /* Overflow indicates an illegal argument error */
  782. if (*err == U_BUFFER_OVERFLOW_ERROR) {
  783. *err = U_ILLEGAL_ARGUMENT_ERROR;
  784. }
  785. goto error;
  786. }
  787. /* Find the length of the trailing portion. */
  788. while (_isIDSeparator(localeID[trailingIndex])) {
  789. trailingIndex++;
  790. }
  791. trailing = &localeID[trailingIndex];
  792. trailingLength = (int32_t)uprv_strlen(trailing);
  793. CHECK_TRAILING_VARIANT_SIZE(trailing, trailingLength);
  794. success =
  795. createLikelySubtagsString(
  796. lang,
  797. langLength,
  798. script,
  799. scriptLength,
  800. region,
  801. regionLength,
  802. trailing,
  803. trailingLength,
  804. sink,
  805. err);
  806. if (!success) {
  807. const int32_t localIDLength = (int32_t)uprv_strlen(localeID);
  808. /*
  809. * If we get here, we need to return localeID.
  810. */
  811. sink.Append(localeID, localIDLength);
  812. }
  813. return success;
  814. error:
  815. if (!U_FAILURE(*err)) {
  816. *err = U_ILLEGAL_ARGUMENT_ERROR;
  817. }
  818. return false;
  819. }
  820. // Add likely subtags to the sink
  821. // return true if the value in the sink is produced by a match during the lookup
  822. // return false if the value in the sink is the same as input because there are
  823. // no match after the lookup.
  824. static UBool _ulocimp_addLikelySubtags(const char*, icu::ByteSink&, UErrorCode*);
  825. static void
  826. _uloc_minimizeSubtags(const char* localeID,
  827. icu::ByteSink& sink,
  828. UErrorCode* err) {
  829. icu::CharString maximizedTagBuffer;
  830. char lang[ULOC_LANG_CAPACITY];
  831. int32_t langLength = sizeof(lang);
  832. char script[ULOC_SCRIPT_CAPACITY];
  833. int32_t scriptLength = sizeof(script);
  834. char region[ULOC_COUNTRY_CAPACITY];
  835. int32_t regionLength = sizeof(region);
  836. const char* trailing = "";
  837. int32_t trailingLength = 0;
  838. int32_t trailingIndex = 0;
  839. UBool successGetMax = false;
  840. if(U_FAILURE(*err)) {
  841. goto error;
  842. }
  843. else if (localeID == nullptr) {
  844. goto error;
  845. }
  846. trailingIndex =
  847. parseTagString(
  848. localeID,
  849. lang,
  850. &langLength,
  851. script,
  852. &scriptLength,
  853. region,
  854. &regionLength,
  855. err);
  856. if(U_FAILURE(*err)) {
  857. /* Overflow indicates an illegal argument error */
  858. if (*err == U_BUFFER_OVERFLOW_ERROR) {
  859. *err = U_ILLEGAL_ARGUMENT_ERROR;
  860. }
  861. goto error;
  862. }
  863. /* Find the spot where the variants or the keywords begin, if any. */
  864. while (_isIDSeparator(localeID[trailingIndex])) {
  865. trailingIndex++;
  866. }
  867. trailing = &localeID[trailingIndex];
  868. trailingLength = (int32_t)uprv_strlen(trailing);
  869. CHECK_TRAILING_VARIANT_SIZE(trailing, trailingLength);
  870. {
  871. icu::CharString base;
  872. {
  873. icu::CharStringByteSink baseSink(&base);
  874. createTagString(
  875. lang,
  876. langLength,
  877. script,
  878. scriptLength,
  879. region,
  880. regionLength,
  881. nullptr,
  882. 0,
  883. baseSink,
  884. err);
  885. }
  886. /**
  887. * First, we need to first get the maximization
  888. * from AddLikelySubtags.
  889. **/
  890. {
  891. icu::CharStringByteSink maxSink(&maximizedTagBuffer);
  892. successGetMax = _ulocimp_addLikelySubtags(base.data(), maxSink, err);
  893. }
  894. }
  895. if(U_FAILURE(*err)) {
  896. goto error;
  897. }
  898. if (!successGetMax) {
  899. /**
  900. * If we got here, return the locale ID parameter unchanged.
  901. **/
  902. const int32_t localeIDLength = (int32_t)uprv_strlen(localeID);
  903. sink.Append(localeID, localeIDLength);
  904. return;
  905. }
  906. // In the following, the lang, script, region are referring to those in
  907. // the maximizedTagBuffer, not the one in the localeID.
  908. langLength = sizeof(lang);
  909. scriptLength = sizeof(script);
  910. regionLength = sizeof(region);
  911. parseTagString(
  912. maximizedTagBuffer.data(),
  913. lang,
  914. &langLength,
  915. script,
  916. &scriptLength,
  917. region,
  918. &regionLength,
  919. err);
  920. if(U_FAILURE(*err)) {
  921. goto error;
  922. }
  923. /**
  924. * Start first with just the language.
  925. **/
  926. {
  927. icu::CharString tagBuffer;
  928. {
  929. icu::CharStringByteSink tagSink(&tagBuffer);
  930. createLikelySubtagsString(
  931. lang,
  932. langLength,
  933. nullptr,
  934. 0,
  935. nullptr,
  936. 0,
  937. nullptr,
  938. 0,
  939. tagSink,
  940. err);
  941. }
  942. if(U_FAILURE(*err)) {
  943. goto error;
  944. }
  945. else if (!tagBuffer.isEmpty() &&
  946. uprv_strnicmp(
  947. maximizedTagBuffer.data(),
  948. tagBuffer.data(),
  949. tagBuffer.length()) == 0) {
  950. createTagString(
  951. lang,
  952. langLength,
  953. nullptr,
  954. 0,
  955. nullptr,
  956. 0,
  957. trailing,
  958. trailingLength,
  959. sink,
  960. err);
  961. return;
  962. }
  963. }
  964. /**
  965. * Next, try the language and region.
  966. **/
  967. if (regionLength > 0) {
  968. icu::CharString tagBuffer;
  969. {
  970. icu::CharStringByteSink tagSink(&tagBuffer);
  971. createLikelySubtagsString(
  972. lang,
  973. langLength,
  974. nullptr,
  975. 0,
  976. region,
  977. regionLength,
  978. nullptr,
  979. 0,
  980. tagSink,
  981. err);
  982. }
  983. if(U_FAILURE(*err)) {
  984. goto error;
  985. }
  986. else if (!tagBuffer.isEmpty() &&
  987. uprv_strnicmp(
  988. maximizedTagBuffer.data(),
  989. tagBuffer.data(),
  990. tagBuffer.length()) == 0) {
  991. createTagString(
  992. lang,
  993. langLength,
  994. nullptr,
  995. 0,
  996. region,
  997. regionLength,
  998. trailing,
  999. trailingLength,
  1000. sink,
  1001. err);
  1002. return;
  1003. }
  1004. }
  1005. /**
  1006. * Finally, try the language and script. This is our last chance,
  1007. * since trying with all three subtags would only yield the
  1008. * maximal version that we already have.
  1009. **/
  1010. if (scriptLength > 0) {
  1011. icu::CharString tagBuffer;
  1012. {
  1013. icu::CharStringByteSink tagSink(&tagBuffer);
  1014. createLikelySubtagsString(
  1015. lang,
  1016. langLength,
  1017. script,
  1018. scriptLength,
  1019. nullptr,
  1020. 0,
  1021. nullptr,
  1022. 0,
  1023. tagSink,
  1024. err);
  1025. }
  1026. if(U_FAILURE(*err)) {
  1027. goto error;
  1028. }
  1029. else if (!tagBuffer.isEmpty() &&
  1030. uprv_strnicmp(
  1031. maximizedTagBuffer.data(),
  1032. tagBuffer.data(),
  1033. tagBuffer.length()) == 0) {
  1034. createTagString(
  1035. lang,
  1036. langLength,
  1037. script,
  1038. scriptLength,
  1039. nullptr,
  1040. 0,
  1041. trailing,
  1042. trailingLength,
  1043. sink,
  1044. err);
  1045. return;
  1046. }
  1047. }
  1048. {
  1049. /**
  1050. * If we got here, return the max + trail.
  1051. **/
  1052. createTagString(
  1053. lang,
  1054. langLength,
  1055. script,
  1056. scriptLength,
  1057. region,
  1058. regionLength,
  1059. trailing,
  1060. trailingLength,
  1061. sink,
  1062. err);
  1063. return;
  1064. }
  1065. error:
  1066. if (!U_FAILURE(*err)) {
  1067. *err = U_ILLEGAL_ARGUMENT_ERROR;
  1068. }
  1069. }
  1070. static int32_t
  1071. do_canonicalize(const char* localeID,
  1072. char* buffer,
  1073. int32_t bufferCapacity,
  1074. UErrorCode* err)
  1075. {
  1076. int32_t canonicalizedSize = uloc_canonicalize(
  1077. localeID,
  1078. buffer,
  1079. bufferCapacity,
  1080. err);
  1081. if (*err == U_STRING_NOT_TERMINATED_WARNING ||
  1082. *err == U_BUFFER_OVERFLOW_ERROR) {
  1083. return canonicalizedSize;
  1084. }
  1085. else if (U_FAILURE(*err)) {
  1086. return -1;
  1087. }
  1088. else {
  1089. return canonicalizedSize;
  1090. }
  1091. }
  1092. U_CAPI int32_t U_EXPORT2
  1093. uloc_addLikelySubtags(const char* localeID,
  1094. char* maximizedLocaleID,
  1095. int32_t maximizedLocaleIDCapacity,
  1096. UErrorCode* status) {
  1097. if (U_FAILURE(*status)) {
  1098. return 0;
  1099. }
  1100. icu::CheckedArrayByteSink sink(
  1101. maximizedLocaleID, maximizedLocaleIDCapacity);
  1102. ulocimp_addLikelySubtags(localeID, sink, status);
  1103. int32_t reslen = sink.NumberOfBytesAppended();
  1104. if (U_FAILURE(*status)) {
  1105. return sink.Overflowed() ? reslen : -1;
  1106. }
  1107. if (sink.Overflowed()) {
  1108. *status = U_BUFFER_OVERFLOW_ERROR;
  1109. } else {
  1110. u_terminateChars(
  1111. maximizedLocaleID, maximizedLocaleIDCapacity, reslen, status);
  1112. }
  1113. return reslen;
  1114. }
  1115. static UBool
  1116. _ulocimp_addLikelySubtags(const char* localeID,
  1117. icu::ByteSink& sink,
  1118. UErrorCode* status) {
  1119. PreflightingLocaleIDBuffer localeBuffer;
  1120. do {
  1121. localeBuffer.requestedCapacity = do_canonicalize(localeID, localeBuffer.getBuffer(),
  1122. localeBuffer.getCapacity(), status);
  1123. } while (localeBuffer.needToTryAgain(status));
  1124. if (U_SUCCESS(*status)) {
  1125. return _uloc_addLikelySubtags(localeBuffer.getBuffer(), sink, status);
  1126. } else {
  1127. return false;
  1128. }
  1129. }
  1130. U_CAPI void U_EXPORT2
  1131. ulocimp_addLikelySubtags(const char* localeID,
  1132. icu::ByteSink& sink,
  1133. UErrorCode* status) {
  1134. _ulocimp_addLikelySubtags(localeID, sink, status);
  1135. }
  1136. U_CAPI int32_t U_EXPORT2
  1137. uloc_minimizeSubtags(const char* localeID,
  1138. char* minimizedLocaleID,
  1139. int32_t minimizedLocaleIDCapacity,
  1140. UErrorCode* status) {
  1141. if (U_FAILURE(*status)) {
  1142. return 0;
  1143. }
  1144. icu::CheckedArrayByteSink sink(
  1145. minimizedLocaleID, minimizedLocaleIDCapacity);
  1146. ulocimp_minimizeSubtags(localeID, sink, status);
  1147. int32_t reslen = sink.NumberOfBytesAppended();
  1148. if (U_FAILURE(*status)) {
  1149. return sink.Overflowed() ? reslen : -1;
  1150. }
  1151. if (sink.Overflowed()) {
  1152. *status = U_BUFFER_OVERFLOW_ERROR;
  1153. } else {
  1154. u_terminateChars(
  1155. minimizedLocaleID, minimizedLocaleIDCapacity, reslen, status);
  1156. }
  1157. return reslen;
  1158. }
  1159. U_CAPI void U_EXPORT2
  1160. ulocimp_minimizeSubtags(const char* localeID,
  1161. icu::ByteSink& sink,
  1162. UErrorCode* status) {
  1163. PreflightingLocaleIDBuffer localeBuffer;
  1164. do {
  1165. localeBuffer.requestedCapacity = do_canonicalize(localeID, localeBuffer.getBuffer(),
  1166. localeBuffer.getCapacity(), status);
  1167. } while (localeBuffer.needToTryAgain(status));
  1168. _uloc_minimizeSubtags(localeBuffer.getBuffer(), sink, status);
  1169. }
  1170. // Pairs of (language subtag, + or -) for finding out fast if common languages
  1171. // are LTR (minus) or RTL (plus).
  1172. static const char LANG_DIR_STRING[] =
  1173. "root-en-es-pt-zh-ja-ko-de-fr-it-ar+he+fa+ru-nl-pl-th-tr-";
  1174. // Implemented here because this calls ulocimp_addLikelySubtags().
  1175. U_CAPI UBool U_EXPORT2
  1176. uloc_isRightToLeft(const char *locale) {
  1177. UErrorCode errorCode = U_ZERO_ERROR;
  1178. char script[8];
  1179. int32_t scriptLength = uloc_getScript(locale, script, UPRV_LENGTHOF(script), &errorCode);
  1180. if (U_FAILURE(errorCode) || errorCode == U_STRING_NOT_TERMINATED_WARNING ||
  1181. scriptLength == 0) {
  1182. // Fastpath: We know the likely scripts and their writing direction
  1183. // for some common languages.
  1184. errorCode = U_ZERO_ERROR;
  1185. char lang[8];
  1186. int32_t langLength = uloc_getLanguage(locale, lang, UPRV_LENGTHOF(lang), &errorCode);
  1187. if (U_FAILURE(errorCode) || errorCode == U_STRING_NOT_TERMINATED_WARNING) {
  1188. return false;
  1189. }
  1190. if (langLength > 0) {
  1191. const char* langPtr = uprv_strstr(LANG_DIR_STRING, lang);
  1192. if (langPtr != nullptr) {
  1193. switch (langPtr[langLength]) {
  1194. case '-': return false;
  1195. case '+': return true;
  1196. default: break; // partial match of a longer code
  1197. }
  1198. }
  1199. }
  1200. // Otherwise, find the likely script.
  1201. errorCode = U_ZERO_ERROR;
  1202. icu::CharString likely;
  1203. {
  1204. icu::CharStringByteSink sink(&likely);
  1205. ulocimp_addLikelySubtags(locale, sink, &errorCode);
  1206. }
  1207. if (U_FAILURE(errorCode) || errorCode == U_STRING_NOT_TERMINATED_WARNING) {
  1208. return false;
  1209. }
  1210. scriptLength = uloc_getScript(likely.data(), script, UPRV_LENGTHOF(script), &errorCode);
  1211. if (U_FAILURE(errorCode) || errorCode == U_STRING_NOT_TERMINATED_WARNING ||
  1212. scriptLength == 0) {
  1213. return false;
  1214. }
  1215. }
  1216. UScriptCode scriptCode = (UScriptCode)u_getPropertyValueEnum(UCHAR_SCRIPT, script);
  1217. return uscript_isRightToLeft(scriptCode);
  1218. }
  1219. U_NAMESPACE_BEGIN
  1220. UBool
  1221. Locale::isRightToLeft() const {
  1222. return uloc_isRightToLeft(getBaseName());
  1223. }
  1224. U_NAMESPACE_END
  1225. // The following must at least allow for rg key value (6) plus terminator (1).
  1226. #define ULOC_RG_BUFLEN 8
  1227. U_CAPI int32_t U_EXPORT2
  1228. ulocimp_getRegionForSupplementalData(const char *localeID, UBool inferRegion,
  1229. char *region, int32_t regionCapacity, UErrorCode* status) {
  1230. if (U_FAILURE(*status)) {
  1231. return 0;
  1232. }
  1233. char rgBuf[ULOC_RG_BUFLEN];
  1234. UErrorCode rgStatus = U_ZERO_ERROR;
  1235. // First check for rg keyword value
  1236. int32_t rgLen = uloc_getKeywordValue(localeID, "rg", rgBuf, ULOC_RG_BUFLEN, &rgStatus);
  1237. if (U_FAILURE(rgStatus) || rgLen != 6) {
  1238. rgLen = 0;
  1239. } else {
  1240. // rgBuf guaranteed to be zero terminated here, with text len 6
  1241. char *rgPtr = rgBuf;
  1242. for (; *rgPtr!= 0; rgPtr++) {
  1243. *rgPtr = uprv_toupper(*rgPtr);
  1244. }
  1245. rgLen = (uprv_strcmp(rgBuf+2, "ZZZZ") == 0)? 2: 0;
  1246. }
  1247. if (rgLen == 0) {
  1248. // No valid rg keyword value, try for unicode_region_subtag
  1249. rgLen = uloc_getCountry(localeID, rgBuf, ULOC_RG_BUFLEN, status);
  1250. if (U_FAILURE(*status)) {
  1251. rgLen = 0;
  1252. } else if (rgLen == 0 && inferRegion) {
  1253. // no unicode_region_subtag but inferRegion true, try likely subtags
  1254. rgStatus = U_ZERO_ERROR;
  1255. icu::CharString locBuf;
  1256. {
  1257. icu::CharStringByteSink sink(&locBuf);
  1258. ulocimp_addLikelySubtags(localeID, sink, &rgStatus);
  1259. }
  1260. if (U_SUCCESS(rgStatus)) {
  1261. rgLen = uloc_getCountry(locBuf.data(), rgBuf, ULOC_RG_BUFLEN, status);
  1262. if (U_FAILURE(*status)) {
  1263. rgLen = 0;
  1264. }
  1265. }
  1266. }
  1267. }
  1268. rgBuf[rgLen] = 0;
  1269. uprv_strncpy(region, rgBuf, regionCapacity);
  1270. return u_terminateChars(region, regionCapacity, rgLen, status);
  1271. }