ucnv_u7.cpp 55 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. **********************************************************************
  5. * Copyright (C) 2002-2016, International Business Machines
  6. * Corporation and others. All Rights Reserved.
  7. **********************************************************************
  8. * file name: ucnv_u7.c
  9. * encoding: UTF-8
  10. * tab size: 8 (not used)
  11. * indentation:4
  12. *
  13. * created on: 2002jul01
  14. * created by: Markus W. Scherer
  15. *
  16. * UTF-7 converter implementation. Used to be in ucnv_utf.c.
  17. */
  18. #include "unicode/utypes.h"
  19. #if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
  20. #include "cmemory.h"
  21. #include "unicode/ucnv.h"
  22. #include "ucnv_bld.h"
  23. #include "ucnv_cnv.h"
  24. #include "uassert.h"
  25. /* UTF-7 -------------------------------------------------------------------- */
  26. /*
  27. * UTF-7 is a stateful encoding of Unicode.
  28. * It is defined in RFC 2152. (http://www.ietf.org/rfc/rfc2152.txt)
  29. * It was intended for use in Internet email systems, using in its bytewise
  30. * encoding only a subset of 7-bit US-ASCII.
  31. * UTF-7 is deprecated in favor of UTF-8/16/32 and SCSU, but still
  32. * occasionally used.
  33. *
  34. * For converting Unicode to UTF-7, the RFC allows to encode some US-ASCII
  35. * characters directly or in base64. Especially, the characters in set O
  36. * as defined in the RFC (see below) may be encoded directly but are not
  37. * allowed in, e.g., email headers.
  38. * By default, the ICU UTF-7 converter encodes set O directly.
  39. * By choosing the option "version=1", set O will be escaped instead.
  40. * For example:
  41. * utf7Converter=ucnv_open("UTF-7,version=1");
  42. *
  43. * For details about email headers see RFC 2047.
  44. */
  45. /*
  46. * Tests for US-ASCII characters belonging to character classes
  47. * defined in UTF-7.
  48. *
  49. * Set D (directly encoded characters) consists of the following
  50. * characters: the upper and lower case letters A through Z
  51. * and a through z, the 10 digits 0-9, and the following nine special
  52. * characters (note that "+" and "=" are omitted):
  53. * '(),-./:?
  54. *
  55. * Set O (optional direct characters) consists of the following
  56. * characters (note that "\" and "~" are omitted):
  57. * !"#$%&*;<=>@[]^_`{|}
  58. *
  59. * According to the rules in RFC 2152, the byte values for the following
  60. * US-ASCII characters are not used in UTF-7 and are therefore illegal:
  61. * - all C0 control codes except for CR LF TAB
  62. * - BACKSLASH
  63. * - TILDE
  64. * - DEL
  65. * - all codes beyond US-ASCII, i.e. all >127
  66. */
  67. #define inSetD(c) \
  68. ((uint8_t)((c)-97)<26 || (uint8_t)((c)-65)<26 || /* letters */ \
  69. (uint8_t)((c)-48)<10 || /* digits */ \
  70. (uint8_t)((c)-39)<3 || /* '() */ \
  71. (uint8_t)((c)-44)<4 || /* ,-./ */ \
  72. (c)==58 || (c)==63 /* :? */ \
  73. )
  74. #define inSetO(c) \
  75. ((uint8_t)((c)-33)<6 || /* !"#$%& */ \
  76. (uint8_t)((c)-59)<4 || /* ;<=> */ \
  77. (uint8_t)((c)-93)<4 || /* ]^_` */ \
  78. (uint8_t)((c)-123)<3 || /* {|} */ \
  79. (c)==42 || (c)==64 || (c)==91 /* *@[ */ \
  80. )
  81. #define isCRLFTAB(c) ((c)==13 || (c)==10 || (c)==9)
  82. #define isCRLFSPTAB(c) ((c)==32 || (c)==13 || (c)==10 || (c)==9)
  83. #define PLUS 43
  84. #define MINUS 45
  85. #define BACKSLASH 92
  86. #define TILDE 126
  87. /* legal byte values: all US-ASCII graphic characters from space to before tilde, and CR LF TAB */
  88. #define isLegalUTF7(c) (((uint8_t)((c)-32)<94 && (c)!=BACKSLASH) || isCRLFTAB(c))
  89. /* encode directly sets D and O and CR LF SP TAB */
  90. static const UBool encodeDirectlyMaximum[128]={
  91. /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
  92. 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
  93. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  94. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
  95. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  96. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  97. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
  98. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  99. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0
  100. };
  101. /* encode directly set D and CR LF SP TAB but not set O */
  102. static const UBool encodeDirectlyRestricted[128]={
  103. /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
  104. 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
  105. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  106. 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1,
  107. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
  108. 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  109. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
  110. 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  111. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0
  112. };
  113. static const uint8_t
  114. toBase64[64]={
  115. /* A-Z */
  116. 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,
  117. 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
  118. /* a-z */
  119. 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
  120. 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122,
  121. /* 0-9 */
  122. 48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
  123. /* +/ */
  124. 43, 47
  125. };
  126. static const int8_t
  127. fromBase64[128]={
  128. /* C0 controls, -1 for legal ones (CR LF TAB), -3 for illegal ones */
  129. -3, -3, -3, -3, -3, -3, -3, -3, -3, -1, -1, -3, -3, -1, -3, -3,
  130. -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3,
  131. /* general punctuation with + and / and a special value (-2) for - */
  132. -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -2, -1, 63,
  133. /* digits */
  134. 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1,
  135. /* A-Z */
  136. -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
  137. 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -3, -1, -1, -1,
  138. /* a-z */
  139. -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
  140. 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -3, -3
  141. };
  142. /*
  143. * converter status values:
  144. *
  145. * toUnicodeStatus:
  146. * 24 inDirectMode (boolean)
  147. * 23..16 base64Counter (-1..7)
  148. * 15..0 bits (up to 14 bits incoming base64)
  149. *
  150. * fromUnicodeStatus:
  151. * 31..28 version (0: set O direct 1: set O escaped)
  152. * 24 inDirectMode (boolean)
  153. * 23..16 base64Counter (0..2)
  154. * 7..0 bits (6 bits outgoing base64)
  155. *
  156. */
  157. U_CDECL_BEGIN
  158. static void U_CALLCONV
  159. _UTF7Reset(UConverter *cnv, UConverterResetChoice choice) {
  160. if(choice<=UCNV_RESET_TO_UNICODE) {
  161. /* reset toUnicode */
  162. cnv->toUnicodeStatus=0x1000000; /* inDirectMode=true */
  163. cnv->toULength=0;
  164. }
  165. if(choice!=UCNV_RESET_TO_UNICODE) {
  166. /* reset fromUnicode */
  167. cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=true */
  168. }
  169. }
  170. static void U_CALLCONV
  171. _UTF7Open(UConverter *cnv,
  172. UConverterLoadArgs *pArgs,
  173. UErrorCode *pErrorCode) {
  174. (void)pArgs;
  175. if(UCNV_GET_VERSION(cnv)<=1) {
  176. /* TODO(markus): Should just use cnv->options rather than copying the version number. */
  177. cnv->fromUnicodeStatus=UCNV_GET_VERSION(cnv)<<28;
  178. _UTF7Reset(cnv, UCNV_RESET_BOTH);
  179. } else {
  180. *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
  181. }
  182. }
/*
 * Converts UTF-7 bytes from pArgs->source into UTF-16 code units in pArgs->target.
 * If pArgs->offsets is not nullptr, one source index is written per output unit.
 *
 * The function is a two-state machine (Direct Mode and Unicode/base64 Mode) whose
 * state is loaded from and stored back into cnv->toUnicodeStatus, so that input
 * may be split across calls at any byte.
 *
 * Errors reported through *pErrorCode:
 * - U_ILLEGAL_CHAR_FOUND for an illegal byte or a malformed base64 sequence;
 *   the offending bytes are left in cnv->toUBytes with their count in cnv->toULength.
 * - U_BUFFER_OVERFLOW_ERROR when the target is full while source bytes remain.
 *
 * On return, pArgs->source, pArgs->target and pArgs->offsets are advanced.
 */
static void U_CALLCONV
_UTF7ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
                          UErrorCode *pErrorCode) {
    UConverter *cnv;
    const uint8_t *source, *sourceLimit;
    char16_t *target;
    const char16_t *targetLimit;
    int32_t *offsets;

    uint8_t *bytes;
    uint8_t byteIndex;

    int32_t length, targetCapacity;

    /* UTF-7 state */
    uint16_t bits;
    int8_t base64Counter;
    UBool inDirectMode;

    int8_t base64Value;

    int32_t sourceIndex, nextSourceIndex;

    uint8_t b;
    /* set up the local pointers */
    cnv=pArgs->converter;

    source=(const uint8_t *)pArgs->source;
    sourceLimit=(const uint8_t *)pArgs->sourceLimit;
    target=pArgs->target;
    targetLimit=pArgs->targetLimit;
    offsets=pArgs->offsets;
    /* get the state machine state */
    {
        /* layout: bit 24 inDirectMode, bits 23..16 base64Counter, bits 15..0 pending bits */
        uint32_t status=cnv->toUnicodeStatus;
        inDirectMode=(UBool)((status>>24)&1);
        base64Counter=(int8_t)(status>>16);
        bits=(uint16_t)status;
    }
    /* bytes of the current (possibly erroneous) sequence are collected in the converter */
    bytes=cnv->toUBytes;
    byteIndex=cnv->toULength;

    /* sourceIndex=-1 if the current character began in the previous buffer */
    sourceIndex=byteIndex==0 ? 0 : -1;
    nextSourceIndex=0;

    if(inDirectMode) {
directMode:
        /*
         * In Direct Mode, most US-ASCII characters are encoded directly, i.e.,
         * with their US-ASCII byte values.
         * Backslash and Tilde and most control characters are not allowed in UTF-7.
         * A plus sign starts Unicode (or "escape") Mode.
         *
         * In Direct Mode, only the sourceIndex is used.
         */
        byteIndex=0;
        /*
         * Each direct byte yields exactly one char16_t, so the loop count is
         * the smaller of the remaining source length and target capacity.
         */
        length=(int32_t)(sourceLimit-source);
        targetCapacity=(int32_t)(targetLimit-target);
        if(length>targetCapacity) {
            length=targetCapacity;
        }
        while(length>0) {
            b=*source++;
            if(!isLegalUTF7(b)) {
                /* illegal */
                bytes[0]=b;
                byteIndex=1;
                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                break;
            } else if(b!=PLUS) {
                /* write directly encoded character */
                *target++=b;
                if(offsets!=nullptr) {
                    *offsets++=sourceIndex++;
                }
            } else /* PLUS */ {
                /* switch to Unicode mode */
                nextSourceIndex=++sourceIndex;
                inDirectMode=false;
                byteIndex=0;
                bits=0;
                /* -1 marks "immediately after the plus sign" */
                base64Counter=-1;
                goto unicodeMode;
            }
            --length;
        }
        if(source<sourceLimit && target>=targetLimit) {
            /* target is full */
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
        }
    } else {
unicodeMode:
        /*
         * In Unicode (or "escape") Mode, UTF-16BE is base64-encoded.
         * The base64 sequence ends with any character that is not in the base64 alphabet.
         * A terminating minus sign is consumed.
         *
         * In Unicode Mode, the sourceIndex has the index to the start of the current
         * base64 bytes, while nextSourceIndex is precisely parallel to source,
         * keeping the index to the following byte.
         * Note that in 2 out of 3 cases, UChars overlap within a base64 byte.
         */
        while(source<sourceLimit) {
            if(target<targetLimit) {
                bytes[byteIndex++]=b=*source++;
                ++nextSourceIndex;
                base64Value = -3; /* initialize as illegal */
                /* the b>=126 test short-circuits, so fromBase64[] is only read for b<126 */
                if(b>=126 || (base64Value=fromBase64[b])==-3 || base64Value==-1) {
                    /* either
                     * base64Value==-1 for any legal character except base64 and minus sign, or
                     * base64Value==-3 for illegal characters:
                     * 1. In either case, leave Unicode mode.
                     * 2.1. If we ended with an incomplete char16_t or none after the +, then
                     *      generate an error for the preceding erroneous sequence and deal with
                     *      the current (possibly illegal) character next time through.
                     * 2.2. Else the current char comes after a complete char16_t, which was already
                     *      pushed to the output buf, so:
                     * 2.2.1. If the current char is legal, just save it for processing next time.
                     *        It may be for example, a plus which we need to deal with in direct mode.
                     * 2.2.2. Else if the current char is illegal, we might as well deal with it here.
                     */
                    inDirectMode=true;
                    if(base64Counter==-1) {
                        /* illegal: + immediately followed by something other than base64 or minus sign */
                        /* include the plus sign in the reported sequence, but not the subsequent char */
                        --source;
                        bytes[0]=PLUS;
                        byteIndex=1;
                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                        break;
                    } else if(bits!=0) {
                        /* bits are illegally left over, a char16_t is incomplete */
                        /* don't include current char (legal or illegal) in error seq */
                        --source;
                        --byteIndex;
                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                        break;
                    } else {
                        /* previous char16_t was complete */
                        if(base64Value==-3) {
                            /* current character is illegal, deal with it here */
                            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                            break;
                        } else {
                            /* un-read the current character in case it is a plus sign */
                            --source;
                            sourceIndex=nextSourceIndex-1;
                            goto directMode;
                        }
                    }
                } else if(base64Value>=0) {
                    /* collect base64 bytes into UChars */
                    /*
                     * base64Counter cycles through 8 base64 characters (48 bits),
                     * which carry exactly three 16-bit code units.
                     */
                    switch(base64Counter) {
                    case -1: /* -1 is immediately after the + */
                    case 0:
                        /* first 6 bits of a new group */
                        bits=base64Value;
                        base64Counter=1;
                        break;
                    case 1:
                    case 3:
                    case 4:
                    case 6:
                        /* accumulate 6 more bits; no code unit is complete yet */
                        bits=(uint16_t)((bits<<6)|base64Value);
                        ++base64Counter;
                        break;
                    case 2:
                        /* 12 pending bits + top 4 bits of this value; keep the low 2 bits */
                        *target++=(char16_t)((bits<<4)|(base64Value>>2));
                        if(offsets!=nullptr) {
                            *offsets++=sourceIndex;
                            sourceIndex=nextSourceIndex-1;
                        }
                        bytes[0]=b; /* keep this byte in case an error occurs */
                        byteIndex=1;
                        bits=(uint16_t)(base64Value&3);
                        base64Counter=3;
                        break;
                    case 5:
                        /* 14 pending bits + top 2 bits of this value; keep the low 4 bits */
                        *target++=(char16_t)((bits<<2)|(base64Value>>4));
                        if(offsets!=nullptr) {
                            *offsets++=sourceIndex;
                            sourceIndex=nextSourceIndex-1;
                        }
                        bytes[0]=b; /* keep this byte in case an error occurs */
                        byteIndex=1;
                        bits=(uint16_t)(base64Value&15);
                        base64Counter=6;
                        break;
                    case 7:
                        /* 10 pending bits + all 6 bits of this value; nothing is left over */
                        *target++=(char16_t)((bits<<6)|base64Value);
                        if(offsets!=nullptr) {
                            *offsets++=sourceIndex;
                            sourceIndex=nextSourceIndex;
                        }
                        byteIndex=0;
                        bits=0;
                        base64Counter=0;
                        break;
                    default:
                        /* will never occur */
                        break;
                    }
                } else /*base64Value==-2*/ {
                    /* minus sign terminates the base64 sequence */
                    inDirectMode=true;
                    if(base64Counter==-1) {
                        /* +- i.e. a minus immediately following a plus */
                        *target++=PLUS;
                        if(offsets!=nullptr) {
                            *offsets++=sourceIndex-1;
                        }
                    } else {
                        /* absorb the minus and leave the Unicode Mode */
                        if(bits!=0) {
                            /* bits are illegally left over, a char16_t is incomplete */
                            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                            break;
                        }
                    }
                    sourceIndex=nextSourceIndex;
                    goto directMode;
                }
            } else {
                /* target is full */
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                break;
            }
        }
    }
    if(U_SUCCESS(*pErrorCode) && pArgs->flush && source==sourceLimit && bits==0) {
        /*
         * if we are in Unicode mode, then the byteIndex might not be 0,
         * but that is ok if bits==0
         * -> we set byteIndex=0 at the end of the stream to avoid a truncated error
         * (not true for IMAP-mailbox-name where we must end in direct mode)
         */
        byteIndex=0;
    }

    /* set the converter state back into UConverter */
    /* base64Counter may be -1, so it is narrowed to uint8_t before being shifted into bits 23..16 */
    cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
    cnv->toULength=byteIndex;

    /* write back the updated pointers */
    pArgs->source=(const char *)source;
    pArgs->target=target;
    pArgs->offsets=offsets;
}
  420. static void U_CALLCONV
  421. _UTF7FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
  422. UErrorCode *pErrorCode) {
  423. UConverter *cnv;
  424. const char16_t *source, *sourceLimit;
  425. uint8_t *target, *targetLimit;
  426. int32_t *offsets;
  427. int32_t length, targetCapacity, sourceIndex;
  428. char16_t c;
  429. /* UTF-7 state */
  430. const UBool *encodeDirectly;
  431. uint8_t bits;
  432. int8_t base64Counter;
  433. UBool inDirectMode;
  434. /* set up the local pointers */
  435. cnv=pArgs->converter;
  436. /* set up the local pointers */
  437. source=pArgs->source;
  438. sourceLimit=pArgs->sourceLimit;
  439. target=(uint8_t *)pArgs->target;
  440. targetLimit=(uint8_t *)pArgs->targetLimit;
  441. offsets=pArgs->offsets;
  442. /* get the state machine state */
  443. {
  444. uint32_t status=cnv->fromUnicodeStatus;
  445. encodeDirectly= status<0x10000000 ? encodeDirectlyMaximum : encodeDirectlyRestricted;
  446. inDirectMode=(UBool)((status>>24)&1);
  447. base64Counter=(int8_t)(status>>16);
  448. bits=(uint8_t)status;
  449. U_ASSERT(bits<=UPRV_LENGTHOF(toBase64));
  450. }
  451. /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */
  452. sourceIndex=0;
  453. if(inDirectMode) {
  454. directMode:
  455. length=(int32_t)(sourceLimit-source);
  456. targetCapacity=(int32_t)(targetLimit-target);
  457. if(length>targetCapacity) {
  458. length=targetCapacity;
  459. }
  460. while(length>0) {
  461. c=*source++;
  462. /* currently always encode CR LF SP TAB directly */
  463. if(c<=127 && encodeDirectly[c]) {
  464. /* encode directly */
  465. *target++=(uint8_t)c;
  466. if(offsets!=nullptr) {
  467. *offsets++=sourceIndex++;
  468. }
  469. } else if(c==PLUS) {
  470. /* output +- for + */
  471. *target++=PLUS;
  472. if(target<targetLimit) {
  473. *target++=MINUS;
  474. if(offsets!=nullptr) {
  475. *offsets++=sourceIndex;
  476. *offsets++=sourceIndex++;
  477. }
  478. /* realign length and targetCapacity */
  479. goto directMode;
  480. } else {
  481. if(offsets!=nullptr) {
  482. *offsets++=sourceIndex++;
  483. }
  484. cnv->charErrorBuffer[0]=MINUS;
  485. cnv->charErrorBufferLength=1;
  486. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  487. break;
  488. }
  489. } else {
  490. /* un-read this character and switch to Unicode Mode */
  491. --source;
  492. *target++=PLUS;
  493. if(offsets!=nullptr) {
  494. *offsets++=sourceIndex;
  495. }
  496. inDirectMode=false;
  497. base64Counter=0;
  498. goto unicodeMode;
  499. }
  500. --length;
  501. }
  502. if(source<sourceLimit && target>=targetLimit) {
  503. /* target is full */
  504. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  505. }
  506. } else {
  507. unicodeMode:
  508. while(source<sourceLimit) {
  509. if(target<targetLimit) {
  510. c=*source++;
  511. if(c<=127 && encodeDirectly[c]) {
  512. /* encode directly */
  513. inDirectMode=true;
  514. /* trick: back out this character to make this easier */
  515. --source;
  516. /* terminate the base64 sequence */
  517. if(base64Counter!=0) {
  518. /* write remaining bits for the previous character */
  519. *target++=toBase64[bits];
  520. if(offsets!=nullptr) {
  521. *offsets++=sourceIndex-1;
  522. }
  523. }
  524. if(fromBase64[c]!=-1) {
  525. /* need to terminate with a minus */
  526. if(target<targetLimit) {
  527. *target++=MINUS;
  528. if(offsets!=nullptr) {
  529. *offsets++=sourceIndex-1;
  530. }
  531. } else {
  532. cnv->charErrorBuffer[0]=MINUS;
  533. cnv->charErrorBufferLength=1;
  534. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  535. break;
  536. }
  537. }
  538. goto directMode;
  539. } else {
  540. /*
  541. * base64 this character:
  542. * Output 2 or 3 base64 bytes for the remaining bits of the previous character
  543. * and the bits of this character, each implicitly in UTF-16BE.
  544. *
  545. * Here, bits is an 8-bit variable because only 6 bits need to be kept from one
  546. * character to the next. The actual 2 or 4 bits are shifted to the left edge
  547. * of the 6-bits field 5..0 to make the termination of the base64 sequence easier.
  548. */
  549. switch(base64Counter) {
  550. case 0:
  551. *target++=toBase64[c>>10];
  552. if(target<targetLimit) {
  553. *target++=toBase64[(c>>4)&0x3f];
  554. if(offsets!=nullptr) {
  555. *offsets++=sourceIndex;
  556. *offsets++=sourceIndex++;
  557. }
  558. } else {
  559. if(offsets!=nullptr) {
  560. *offsets++=sourceIndex++;
  561. }
  562. cnv->charErrorBuffer[0]=toBase64[(c>>4)&0x3f];
  563. cnv->charErrorBufferLength=1;
  564. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  565. }
  566. bits=(uint8_t)((c&15)<<2);
  567. base64Counter=1;
  568. break;
  569. case 1:
  570. *target++=toBase64[bits|(c>>14)];
  571. if(target<targetLimit) {
  572. *target++=toBase64[(c>>8)&0x3f];
  573. if(target<targetLimit) {
  574. *target++=toBase64[(c>>2)&0x3f];
  575. if(offsets!=nullptr) {
  576. *offsets++=sourceIndex;
  577. *offsets++=sourceIndex;
  578. *offsets++=sourceIndex++;
  579. }
  580. } else {
  581. if(offsets!=nullptr) {
  582. *offsets++=sourceIndex;
  583. *offsets++=sourceIndex++;
  584. }
  585. cnv->charErrorBuffer[0]=toBase64[(c>>2)&0x3f];
  586. cnv->charErrorBufferLength=1;
  587. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  588. }
  589. } else {
  590. if(offsets!=nullptr) {
  591. *offsets++=sourceIndex++;
  592. }
  593. cnv->charErrorBuffer[0]=toBase64[(c>>8)&0x3f];
  594. cnv->charErrorBuffer[1]=toBase64[(c>>2)&0x3f];
  595. cnv->charErrorBufferLength=2;
  596. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  597. }
  598. bits=(uint8_t)((c&3)<<4);
  599. base64Counter=2;
  600. break;
  601. case 2:
  602. *target++=toBase64[bits|(c>>12)];
  603. if(target<targetLimit) {
  604. *target++=toBase64[(c>>6)&0x3f];
  605. if(target<targetLimit) {
  606. *target++=toBase64[c&0x3f];
  607. if(offsets!=nullptr) {
  608. *offsets++=sourceIndex;
  609. *offsets++=sourceIndex;
  610. *offsets++=sourceIndex++;
  611. }
  612. } else {
  613. if(offsets!=nullptr) {
  614. *offsets++=sourceIndex;
  615. *offsets++=sourceIndex++;
  616. }
  617. cnv->charErrorBuffer[0]=toBase64[c&0x3f];
  618. cnv->charErrorBufferLength=1;
  619. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  620. }
  621. } else {
  622. if(offsets!=nullptr) {
  623. *offsets++=sourceIndex++;
  624. }
  625. cnv->charErrorBuffer[0]=toBase64[(c>>6)&0x3f];
  626. cnv->charErrorBuffer[1]=toBase64[c&0x3f];
  627. cnv->charErrorBufferLength=2;
  628. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  629. }
  630. bits=0;
  631. base64Counter=0;
  632. break;
  633. default:
  634. /* will never occur */
  635. break;
  636. }
  637. }
  638. } else {
  639. /* target is full */
  640. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  641. break;
  642. }
  643. }
  644. }
  645. if(pArgs->flush && source>=sourceLimit) {
  646. /* flush remaining bits to the target */
  647. if(!inDirectMode) {
  648. if (base64Counter!=0) {
  649. if(target<targetLimit) {
  650. *target++=toBase64[bits];
  651. if(offsets!=nullptr) {
  652. *offsets++=sourceIndex-1;
  653. }
  654. } else {
  655. cnv->charErrorBuffer[cnv->charErrorBufferLength++]=toBase64[bits];
  656. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  657. }
  658. }
  659. /* Add final MINUS to terminate unicodeMode */
  660. if(target<targetLimit) {
  661. *target++=MINUS;
  662. if(offsets!=nullptr) {
  663. *offsets++=sourceIndex-1;
  664. }
  665. } else {
  666. cnv->charErrorBuffer[cnv->charErrorBufferLength++]=MINUS;
  667. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  668. }
  669. }
  670. /* reset the state for the next conversion */
  671. cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=true */
  672. } else {
  673. /* set the converter state back into UConverter */
  674. cnv->fromUnicodeStatus=
  675. (cnv->fromUnicodeStatus&0xf0000000)| /* keep version*/
  676. ((uint32_t)inDirectMode<<24)|((uint32_t)base64Counter<<16)|(uint32_t)bits;
  677. }
  678. /* write back the updated pointers */
  679. pArgs->source=source;
  680. pArgs->target=(char *)target;
  681. pArgs->offsets=offsets;
  682. }
  683. static const char * U_CALLCONV
  684. _UTF7GetName(const UConverter *cnv) {
  685. switch(cnv->fromUnicodeStatus>>28) {
  686. case 1:
  687. return "UTF-7,version=1";
  688. default:
  689. return "UTF-7";
  690. }
  691. }
  692. U_CDECL_END
/*
 * Implementation table for the UTF-7 converter.
 * The initializers are positional. NOTE(review): the meaning of each slot is
 * defined by UConverterImpl, which is declared outside this file — confirm the
 * field order there before adding or reordering entries.
 * The same WithOffsets function is listed twice for each direction; both
 * functions test offsets!=nullptr themselves before writing offsets.
 */
static const UConverterImpl _UTF7Impl={
    UCNV_UTF7,

    nullptr,
    nullptr,

    _UTF7Open,
    nullptr,
    _UTF7Reset,

    _UTF7ToUnicodeWithOffsets,
    _UTF7ToUnicodeWithOffsets,
    _UTF7FromUnicodeWithOffsets,
    _UTF7FromUnicodeWithOffsets,
    nullptr,

    nullptr,
    _UTF7GetName,
    nullptr, /* we don't need writeSub() because we never call a callback at fromUnicode() */
    nullptr,
    ucnv_getCompleteUnicodeSet,

    nullptr,
    nullptr
};
/*
 * Static description of the UTF-7 converter (name "UTF-7", type UCNV_UTF7).
 * The initializers are positional. NOTE(review): the meaning of the unnamed
 * numeric fields (for example the pair "1, 4") is defined by
 * UConverterStaticData, declared outside this file — confirm there.
 */
static const UConverterStaticData _UTF7StaticData={
    sizeof(UConverterStaticData),
    "UTF-7",
    0, /* TODO CCSID for UTF-7 */
    UCNV_IBM, UCNV_UTF7,
    1, 4,
    { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
    false, false,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
/*
 * Shared-data object for the UTF-7 converter, combining the static data and the
 * implementation table defined in this file. It is not declared static, so it is
 * visible to other translation units.
 */
const UConverterSharedData _UTF7Data=
    UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF7StaticData, &_UTF7Impl);
  727. /* IMAP mailbox name encoding ----------------------------------------------- */
  728. /*
  729. * RFC 2060: INTERNET MESSAGE ACCESS PROTOCOL - VERSION 4rev1
  730. * http://www.ietf.org/rfc/rfc2060.txt
  731. *
  732. * 5.1.3. Mailbox International Naming Convention
  733. *
  734. * By convention, international mailbox names are specified using a
  735. * modified version of the UTF-7 encoding described in [UTF-7]. The
  736. * purpose of these modifications is to correct the following problems
  737. * with UTF-7:
  738. *
  739. * 1) UTF-7 uses the "+" character for shifting; this conflicts with
  740. * the common use of "+" in mailbox names, in particular USENET
  741. * newsgroup names.
  742. *
  743. * 2) UTF-7's encoding is BASE64 which uses the "/" character; this
  744. * conflicts with the use of "/" as a popular hierarchy delimiter.
  745. *
  746. * 3) UTF-7 prohibits the unencoded usage of "\"; this conflicts with
  747. * the use of "\" as a popular hierarchy delimiter.
  748. *
  749. * 4) UTF-7 prohibits the unencoded usage of "~"; this conflicts with
  750. * the use of "~" in some servers as a home directory indicator.
  751. *
  752. * 5) UTF-7 permits multiple alternate forms to represent the same
  753. * string; in particular, printable US-ASCII characters can be
  754. * represented in encoded form.
  755. *
  756. * In modified UTF-7, printable US-ASCII characters except for "&"
  757. * represent themselves; that is, characters with octet values 0x20-0x25
  758. * and 0x27-0x7e. The character "&" (0x26) is represented by the two-
  759. * octet sequence "&-".
  760. *
  761. * All other characters (octet values 0x00-0x1f, 0x7f-0xff, and all
  762. * Unicode 16-bit octets) are represented in modified BASE64, with a
  763. * further modification from [UTF-7] that "," is used instead of "/".
  764. * Modified BASE64 MUST NOT be used to represent any printing US-ASCII
  765. * character which can represent itself.
  766. *
  767. * "&" is used to shift to modified BASE64 and "-" to shift back to US-
  768. * ASCII. All names start in US-ASCII, and MUST end in US-ASCII (that
  769. * is, a name that ends with a Unicode 16-bit octet MUST end with a "-
  770. * ").
  771. *
  772. * For example, here is a mailbox name which mixes English, Japanese,
  773. * and Chinese text: ~peter/mail/&ZeVnLIqe-/&U,BTFw-
  774. */
/*
 * Tests for US-ASCII characters belonging to character classes
 * defined in UTF-7.
 *
 * Set D (directly encoded characters) consists of the following
 * characters: the upper and lower case letters A through Z
 * and a through z, the 10 digits 0-9, and the following nine special
 * characters (note that "+" and "=" are omitted):
 *     '(),-./:?
 *
 * Set O (optional direct characters) consists of the following
 * characters (note that "\" and "~" are omitted):
 *     !"#$%&*;<=>@[]^_`{|}
 *
 * According to the rules in RFC 2152, the byte values for the following
 * US-ASCII characters are not used in UTF-7 and are therefore illegal:
 * - all C0 control codes except for CR LF TAB
 * - BACKSLASH
 * - TILDE
 * - DEL
 * - all codes beyond US-ASCII, i.e. all >127
 *
 * Note: the sets above describe standard UTF-7. The IMAP mailbox name
 * variant handled by the macros and functions below is different:
 * every byte in 0x20..0x7e is legal (including BACKSLASH and TILDE),
 * every byte outside that range is illegal (including CR LF TAB),
 * and every legal character except "&" is encoded directly.
 */
  797. /* uses '&' not '+' to start a base64 sequence */
  798. #define AMPERSAND 0x26
  799. #define COMMA 0x2c
  800. #define SLASH 0x2f
  801. /* legal byte values: all US-ASCII graphic characters 0x20..0x7e */
  802. #define isLegalIMAP(c) (0x20<=(c) && (c)<=0x7e)
  803. /* direct-encode all of printable ASCII 0x20..0x7e except '&' 0x26 */
  804. #define inSetDIMAP(c) (isLegalIMAP(c) && c!=AMPERSAND)
  805. #define TO_BASE64_IMAP(n) ((n)<63 ? toBase64[n] : COMMA)
  806. #define FROM_BASE64_IMAP(c) ((c)==COMMA ? 63 : (c)==SLASH ? -1 : fromBase64[c])
  807. /*
  808. * converter status values:
  809. *
  810. * toUnicodeStatus:
  811. * 24 inDirectMode (boolean)
  812. * 23..16 base64Counter (-1..7)
  813. * 15..0 bits (up to 14 bits incoming base64)
  814. *
  815. * fromUnicodeStatus:
  816. * 24 inDirectMode (boolean)
  817. * 23..16 base64Counter (0..2)
  818. * 7..0 bits (6 bits outgoing base64)
  819. *
  820. * ignore bits 31..25
  821. */
  822. U_CDECL_BEGIN
  823. static void U_CALLCONV
  824. _IMAPToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
  825. UErrorCode *pErrorCode) {
  826. UConverter *cnv;
  827. const uint8_t *source, *sourceLimit;
  828. char16_t *target;
  829. const char16_t *targetLimit;
  830. int32_t *offsets;
  831. uint8_t *bytes;
  832. uint8_t byteIndex;
  833. int32_t length, targetCapacity;
  834. /* UTF-7 state */
  835. uint16_t bits;
  836. int8_t base64Counter;
  837. UBool inDirectMode;
  838. int8_t base64Value;
  839. int32_t sourceIndex, nextSourceIndex;
  840. char16_t c;
  841. uint8_t b;
  842. /* set up the local pointers */
  843. cnv=pArgs->converter;
  844. source=(const uint8_t *)pArgs->source;
  845. sourceLimit=(const uint8_t *)pArgs->sourceLimit;
  846. target=pArgs->target;
  847. targetLimit=pArgs->targetLimit;
  848. offsets=pArgs->offsets;
  849. /* get the state machine state */
  850. {
  851. uint32_t status=cnv->toUnicodeStatus;
  852. inDirectMode=(UBool)((status>>24)&1);
  853. base64Counter=(int8_t)(status>>16);
  854. bits=(uint16_t)status;
  855. }
  856. bytes=cnv->toUBytes;
  857. byteIndex=cnv->toULength;
  858. /* sourceIndex=-1 if the current character began in the previous buffer */
  859. sourceIndex=byteIndex==0 ? 0 : -1;
  860. nextSourceIndex=0;
  861. if(inDirectMode) {
  862. directMode:
  863. /*
  864. * In Direct Mode, US-ASCII characters are encoded directly, i.e.,
  865. * with their US-ASCII byte values.
  866. * An ampersand starts Unicode (or "escape") Mode.
  867. *
  868. * In Direct Mode, only the sourceIndex is used.
  869. */
  870. byteIndex=0;
  871. length=(int32_t)(sourceLimit-source);
  872. targetCapacity=(int32_t)(targetLimit-target);
  873. if(length>targetCapacity) {
  874. length=targetCapacity;
  875. }
  876. while(length>0) {
  877. b=*source++;
  878. if(!isLegalIMAP(b)) {
  879. /* illegal */
  880. bytes[0]=b;
  881. byteIndex=1;
  882. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  883. break;
  884. } else if(b!=AMPERSAND) {
  885. /* write directly encoded character */
  886. *target++=b;
  887. if(offsets!=nullptr) {
  888. *offsets++=sourceIndex++;
  889. }
  890. } else /* AMPERSAND */ {
  891. /* switch to Unicode mode */
  892. nextSourceIndex=++sourceIndex;
  893. inDirectMode=false;
  894. byteIndex=0;
  895. bits=0;
  896. base64Counter=-1;
  897. goto unicodeMode;
  898. }
  899. --length;
  900. }
  901. if(source<sourceLimit && target>=targetLimit) {
  902. /* target is full */
  903. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  904. }
  905. } else {
  906. unicodeMode:
  907. /*
  908. * In Unicode (or "escape") Mode, UTF-16BE is base64-encoded.
  909. * The base64 sequence ends with any character that is not in the base64 alphabet.
  910. * A terminating minus sign is consumed.
  911. * US-ASCII must not be base64-ed.
  912. *
  913. * In Unicode Mode, the sourceIndex has the index to the start of the current
  914. * base64 bytes, while nextSourceIndex is precisely parallel to source,
  915. * keeping the index to the following byte.
  916. * Note that in 2 out of 3 cases, UChars overlap within a base64 byte.
  917. */
  918. while(source<sourceLimit) {
  919. if(target<targetLimit) {
  920. bytes[byteIndex++]=b=*source++;
  921. ++nextSourceIndex;
  922. if(b>0x7e) {
  923. /* illegal - test other illegal US-ASCII values by base64Value==-3 */
  924. inDirectMode=true;
  925. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  926. break;
  927. } else if((base64Value=FROM_BASE64_IMAP(b))>=0) {
  928. /* collect base64 bytes into UChars */
  929. switch(base64Counter) {
  930. case -1: /* -1 is immediately after the & */
  931. case 0:
  932. bits=base64Value;
  933. base64Counter=1;
  934. break;
  935. case 1:
  936. case 3:
  937. case 4:
  938. case 6:
  939. bits=(uint16_t)((bits<<6)|base64Value);
  940. ++base64Counter;
  941. break;
  942. case 2:
  943. c=(char16_t)((bits<<4)|(base64Value>>2));
  944. if(isLegalIMAP(c)) {
  945. /* illegal */
  946. inDirectMode=true;
  947. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  948. goto endloop;
  949. }
  950. *target++=c;
  951. if(offsets!=nullptr) {
  952. *offsets++=sourceIndex;
  953. sourceIndex=nextSourceIndex-1;
  954. }
  955. bytes[0]=b; /* keep this byte in case an error occurs */
  956. byteIndex=1;
  957. bits=(uint16_t)(base64Value&3);
  958. base64Counter=3;
  959. break;
  960. case 5:
  961. c=(char16_t)((bits<<2)|(base64Value>>4));
  962. if(isLegalIMAP(c)) {
  963. /* illegal */
  964. inDirectMode=true;
  965. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  966. goto endloop;
  967. }
  968. *target++=c;
  969. if(offsets!=nullptr) {
  970. *offsets++=sourceIndex;
  971. sourceIndex=nextSourceIndex-1;
  972. }
  973. bytes[0]=b; /* keep this byte in case an error occurs */
  974. byteIndex=1;
  975. bits=(uint16_t)(base64Value&15);
  976. base64Counter=6;
  977. break;
  978. case 7:
  979. c=(char16_t)((bits<<6)|base64Value);
  980. if(isLegalIMAP(c)) {
  981. /* illegal */
  982. inDirectMode=true;
  983. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  984. goto endloop;
  985. }
  986. *target++=c;
  987. if(offsets!=nullptr) {
  988. *offsets++=sourceIndex;
  989. sourceIndex=nextSourceIndex;
  990. }
  991. byteIndex=0;
  992. bits=0;
  993. base64Counter=0;
  994. break;
  995. default:
  996. /* will never occur */
  997. break;
  998. }
  999. } else if(base64Value==-2) {
  1000. /* minus sign terminates the base64 sequence */
  1001. inDirectMode=true;
  1002. if(base64Counter==-1) {
  1003. /* &- i.e. a minus immediately following an ampersand */
  1004. *target++=AMPERSAND;
  1005. if(offsets!=nullptr) {
  1006. *offsets++=sourceIndex-1;
  1007. }
  1008. } else {
  1009. /* absorb the minus and leave the Unicode Mode */
  1010. if(bits!=0 || (base64Counter!=0 && base64Counter!=3 && base64Counter!=6)) {
  1011. /* bits are illegally left over, a char16_t is incomplete */
  1012. /* base64Counter other than 0, 3, 6 means non-minimal zero-padding, also illegal */
  1013. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  1014. break;
  1015. }
  1016. }
  1017. sourceIndex=nextSourceIndex;
  1018. goto directMode;
  1019. } else {
  1020. if(base64Counter==-1) {
  1021. /* illegal: & immediately followed by something other than base64 or minus sign */
  1022. /* include the ampersand in the reported sequence */
  1023. --sourceIndex;
  1024. bytes[0]=AMPERSAND;
  1025. bytes[1]=b;
  1026. byteIndex=2;
  1027. }
  1028. /* base64Value==-1 for characters that are illegal only in Unicode mode */
  1029. /* base64Value==-3 for illegal characters */
  1030. /* illegal */
  1031. inDirectMode=true;
  1032. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  1033. break;
  1034. }
  1035. } else {
  1036. /* target is full */
  1037. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  1038. break;
  1039. }
  1040. }
  1041. }
  1042. endloop:
  1043. /*
  1044. * the end of the input stream and detection of truncated input
  1045. * are handled by the framework, but here we must check if we are in Unicode
  1046. * mode and byteIndex==0 because we must end in direct mode
  1047. *
  1048. * conditions:
  1049. * successful
  1050. * in Unicode mode and byteIndex==0
  1051. * end of input and no truncated input
  1052. */
  1053. if( U_SUCCESS(*pErrorCode) &&
  1054. !inDirectMode && byteIndex==0 &&
  1055. pArgs->flush && source>=sourceLimit
  1056. ) {
  1057. if(base64Counter==-1) {
  1058. /* & at the very end of the input */
  1059. /* make the ampersand the reported sequence */
  1060. bytes[0]=AMPERSAND;
  1061. byteIndex=1;
  1062. }
  1063. /* else if(base64Counter!=-1) byteIndex remains 0 because there is no particular byte sequence */
  1064. inDirectMode=true; /* avoid looping */
  1065. *pErrorCode=U_TRUNCATED_CHAR_FOUND;
  1066. }
  1067. /* set the converter state back into UConverter */
  1068. cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
  1069. cnv->toULength=byteIndex;
  1070. /* write back the updated pointers */
  1071. pArgs->source=(const char *)source;
  1072. pArgs->target=target;
  1073. pArgs->offsets=offsets;
  1074. }
  1075. static void U_CALLCONV
  1076. _IMAPFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
  1077. UErrorCode *pErrorCode) {
  1078. UConverter *cnv;
  1079. const char16_t *source, *sourceLimit;
  1080. uint8_t *target, *targetLimit;
  1081. int32_t *offsets;
  1082. int32_t length, targetCapacity, sourceIndex;
  1083. char16_t c;
  1084. uint8_t b;
  1085. /* UTF-7 state */
  1086. uint8_t bits;
  1087. int8_t base64Counter;
  1088. UBool inDirectMode;
  1089. /* set up the local pointers */
  1090. cnv=pArgs->converter;
  1091. /* set up the local pointers */
  1092. source=pArgs->source;
  1093. sourceLimit=pArgs->sourceLimit;
  1094. target=(uint8_t *)pArgs->target;
  1095. targetLimit=(uint8_t *)pArgs->targetLimit;
  1096. offsets=pArgs->offsets;
  1097. /* get the state machine state */
  1098. {
  1099. uint32_t status=cnv->fromUnicodeStatus;
  1100. inDirectMode=(UBool)((status>>24)&1);
  1101. base64Counter=(int8_t)(status>>16);
  1102. bits=(uint8_t)status;
  1103. }
  1104. /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */
  1105. sourceIndex=0;
  1106. if(inDirectMode) {
  1107. directMode:
  1108. length=(int32_t)(sourceLimit-source);
  1109. targetCapacity=(int32_t)(targetLimit-target);
  1110. if(length>targetCapacity) {
  1111. length=targetCapacity;
  1112. }
  1113. while(length>0) {
  1114. c=*source++;
  1115. /* encode 0x20..0x7e except '&' directly */
  1116. if(inSetDIMAP(c)) {
  1117. /* encode directly */
  1118. *target++=(uint8_t)c;
  1119. if(offsets!=nullptr) {
  1120. *offsets++=sourceIndex++;
  1121. }
  1122. } else if(c==AMPERSAND) {
  1123. /* output &- for & */
  1124. *target++=AMPERSAND;
  1125. if(target<targetLimit) {
  1126. *target++=MINUS;
  1127. if(offsets!=nullptr) {
  1128. *offsets++=sourceIndex;
  1129. *offsets++=sourceIndex++;
  1130. }
  1131. /* realign length and targetCapacity */
  1132. goto directMode;
  1133. } else {
  1134. if(offsets!=nullptr) {
  1135. *offsets++=sourceIndex++;
  1136. }
  1137. cnv->charErrorBuffer[0]=MINUS;
  1138. cnv->charErrorBufferLength=1;
  1139. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  1140. break;
  1141. }
  1142. } else {
  1143. /* un-read this character and switch to Unicode Mode */
  1144. --source;
  1145. *target++=AMPERSAND;
  1146. if(offsets!=nullptr) {
  1147. *offsets++=sourceIndex;
  1148. }
  1149. inDirectMode=false;
  1150. base64Counter=0;
  1151. goto unicodeMode;
  1152. }
  1153. --length;
  1154. }
  1155. if(source<sourceLimit && target>=targetLimit) {
  1156. /* target is full */
  1157. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  1158. }
  1159. } else {
  1160. unicodeMode:
  1161. while(source<sourceLimit) {
  1162. if(target<targetLimit) {
  1163. c=*source++;
  1164. if(isLegalIMAP(c)) {
  1165. /* encode directly */
  1166. inDirectMode=true;
  1167. /* trick: back out this character to make this easier */
  1168. --source;
  1169. /* terminate the base64 sequence */
  1170. if(base64Counter!=0) {
  1171. /* write remaining bits for the previous character */
  1172. *target++=TO_BASE64_IMAP(bits);
  1173. if(offsets!=nullptr) {
  1174. *offsets++=sourceIndex-1;
  1175. }
  1176. }
  1177. /* need to terminate with a minus */
  1178. if(target<targetLimit) {
  1179. *target++=MINUS;
  1180. if(offsets!=nullptr) {
  1181. *offsets++=sourceIndex-1;
  1182. }
  1183. } else {
  1184. cnv->charErrorBuffer[0]=MINUS;
  1185. cnv->charErrorBufferLength=1;
  1186. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  1187. break;
  1188. }
  1189. goto directMode;
  1190. } else {
  1191. /*
  1192. * base64 this character:
  1193. * Output 2 or 3 base64 bytes for the remaining bits of the previous character
  1194. * and the bits of this character, each implicitly in UTF-16BE.
  1195. *
  1196. * Here, bits is an 8-bit variable because only 6 bits need to be kept from one
  1197. * character to the next. The actual 2 or 4 bits are shifted to the left edge
  1198. * of the 6-bits field 5..0 to make the termination of the base64 sequence easier.
  1199. */
  1200. switch(base64Counter) {
  1201. case 0:
  1202. b=(uint8_t)(c>>10);
  1203. *target++=TO_BASE64_IMAP(b);
  1204. if(target<targetLimit) {
  1205. b=(uint8_t)((c>>4)&0x3f);
  1206. *target++=TO_BASE64_IMAP(b);
  1207. if(offsets!=nullptr) {
  1208. *offsets++=sourceIndex;
  1209. *offsets++=sourceIndex++;
  1210. }
  1211. } else {
  1212. if(offsets!=nullptr) {
  1213. *offsets++=sourceIndex++;
  1214. }
  1215. b=(uint8_t)((c>>4)&0x3f);
  1216. cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
  1217. cnv->charErrorBufferLength=1;
  1218. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  1219. }
  1220. bits=(uint8_t)((c&15)<<2);
  1221. base64Counter=1;
  1222. break;
  1223. case 1:
  1224. b=(uint8_t)(bits|(c>>14));
  1225. *target++=TO_BASE64_IMAP(b);
  1226. if(target<targetLimit) {
  1227. b=(uint8_t)((c>>8)&0x3f);
  1228. *target++=TO_BASE64_IMAP(b);
  1229. if(target<targetLimit) {
  1230. b=(uint8_t)((c>>2)&0x3f);
  1231. *target++=TO_BASE64_IMAP(b);
  1232. if(offsets!=nullptr) {
  1233. *offsets++=sourceIndex;
  1234. *offsets++=sourceIndex;
  1235. *offsets++=sourceIndex++;
  1236. }
  1237. } else {
  1238. if(offsets!=nullptr) {
  1239. *offsets++=sourceIndex;
  1240. *offsets++=sourceIndex++;
  1241. }
  1242. b=(uint8_t)((c>>2)&0x3f);
  1243. cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
  1244. cnv->charErrorBufferLength=1;
  1245. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  1246. }
  1247. } else {
  1248. if(offsets!=nullptr) {
  1249. *offsets++=sourceIndex++;
  1250. }
  1251. b=(uint8_t)((c>>8)&0x3f);
  1252. cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
  1253. b=(uint8_t)((c>>2)&0x3f);
  1254. cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b);
  1255. cnv->charErrorBufferLength=2;
  1256. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  1257. }
  1258. bits=(uint8_t)((c&3)<<4);
  1259. base64Counter=2;
  1260. break;
  1261. case 2:
  1262. b=(uint8_t)(bits|(c>>12));
  1263. *target++=TO_BASE64_IMAP(b);
  1264. if(target<targetLimit) {
  1265. b=(uint8_t)((c>>6)&0x3f);
  1266. *target++=TO_BASE64_IMAP(b);
  1267. if(target<targetLimit) {
  1268. b=(uint8_t)(c&0x3f);
  1269. *target++=TO_BASE64_IMAP(b);
  1270. if(offsets!=nullptr) {
  1271. *offsets++=sourceIndex;
  1272. *offsets++=sourceIndex;
  1273. *offsets++=sourceIndex++;
  1274. }
  1275. } else {
  1276. if(offsets!=nullptr) {
  1277. *offsets++=sourceIndex;
  1278. *offsets++=sourceIndex++;
  1279. }
  1280. b=(uint8_t)(c&0x3f);
  1281. cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
  1282. cnv->charErrorBufferLength=1;
  1283. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  1284. }
  1285. } else {
  1286. if(offsets!=nullptr) {
  1287. *offsets++=sourceIndex++;
  1288. }
  1289. b=(uint8_t)((c>>6)&0x3f);
  1290. cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
  1291. b=(uint8_t)(c&0x3f);
  1292. cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b);
  1293. cnv->charErrorBufferLength=2;
  1294. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  1295. }
  1296. bits=0;
  1297. base64Counter=0;
  1298. break;
  1299. default:
  1300. /* will never occur */
  1301. break;
  1302. }
  1303. }
  1304. } else {
  1305. /* target is full */
  1306. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  1307. break;
  1308. }
  1309. }
  1310. }
  1311. if(pArgs->flush && source>=sourceLimit) {
  1312. /* flush remaining bits to the target */
  1313. if(!inDirectMode) {
  1314. if(base64Counter!=0) {
  1315. if(target<targetLimit) {
  1316. *target++=TO_BASE64_IMAP(bits);
  1317. if(offsets!=nullptr) {
  1318. *offsets++=sourceIndex-1;
  1319. }
  1320. } else {
  1321. cnv->charErrorBuffer[cnv->charErrorBufferLength++]=TO_BASE64_IMAP(bits);
  1322. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  1323. }
  1324. }
  1325. /* need to terminate with a minus */
  1326. if(target<targetLimit) {
  1327. *target++=MINUS;
  1328. if(offsets!=nullptr) {
  1329. *offsets++=sourceIndex-1;
  1330. }
  1331. } else {
  1332. cnv->charErrorBuffer[cnv->charErrorBufferLength++]=MINUS;
  1333. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  1334. }
  1335. }
  1336. /* reset the state for the next conversion */
  1337. cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=true */
  1338. } else {
  1339. /* set the converter state back into UConverter */
  1340. cnv->fromUnicodeStatus=
  1341. (cnv->fromUnicodeStatus&0xf0000000)| /* keep version*/
  1342. ((uint32_t)inDirectMode<<24)|((uint32_t)base64Counter<<16)|(uint32_t)bits;
  1343. }
  1344. /* write back the updated pointers */
  1345. pArgs->source=source;
  1346. pArgs->target=(char *)target;
  1347. pArgs->offsets=offsets;
  1348. }
  1349. U_CDECL_END
  1350. static const UConverterImpl _IMAPImpl={
  1351. UCNV_IMAP_MAILBOX,
  1352. nullptr,
  1353. nullptr,
  1354. _UTF7Open,
  1355. nullptr,
  1356. _UTF7Reset,
  1357. _IMAPToUnicodeWithOffsets,
  1358. _IMAPToUnicodeWithOffsets,
  1359. _IMAPFromUnicodeWithOffsets,
  1360. _IMAPFromUnicodeWithOffsets,
  1361. nullptr,
  1362. nullptr,
  1363. nullptr,
  1364. nullptr, /* we don't need writeSub() because we never call a callback at fromUnicode() */
  1365. nullptr,
  1366. ucnv_getCompleteUnicodeSet,
  1367. nullptr,
  1368. nullptr
  1369. };
  1370. static const UConverterStaticData _IMAPStaticData={
  1371. sizeof(UConverterStaticData),
  1372. "IMAP-mailbox-name",
  1373. 0, /* TODO CCSID for IMAP-mailbox-name */
  1374. UCNV_IBM, UCNV_IMAP_MAILBOX,
  1375. 1, 4,
  1376. { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
  1377. false, false,
  1378. 0,
  1379. 0,
  1380. { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
  1381. };
  1382. const UConverterSharedData _IMAPData=
  1383. UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_IMAPStaticData, &_IMAPImpl);
  1384. #endif