read_unicode_ut.cpp 6.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307
  1. /*
  2. * read_unicode_ut.cpp --
  3. *
  4. * Copyright (c) 2019 YANDEX LLC
  5. * Author: Karina Usmanova <usmanova.karin@yandex.ru>
  6. *
  7. * This file is part of Pire, the Perl Incompatible
  8. * Regular Expressions library.
  9. *
  10. * Pire is free software: you can redistribute it and/or modify
  11. * it under the terms of the GNU Lesser Public License as published by
  12. * the Free Software Foundation, either version 3 of the License, or
  13. * (at your option) any later version.
  14. *
  15. * Pire is distributed in the hope that it will be useful,
  16. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  18. * GNU Lesser Public License for more details.
  19. * You should have received a copy of the GNU Lesser Public License
  20. * along with Pire. If not, see <http://www.gnu.org/licenses>.
  21. */
  22. #include <pire.h>
  23. #include "stub/cppunit.h"
  24. #include "common.h"
  25. Y_UNIT_TEST_SUITE(ReadUnicodeTest) {
  26. ystring CreateStringWithZeroSymbol(const char* str, size_t pos) {
  27. ystring result = str;
  28. Y_ASSERT(pos < result.size());
  29. result[pos] = '\0';
  30. return result;
  31. }
  32. Y_UNIT_TEST(ZeroSymbol)
  33. {
  34. REGEXP("\\x{0}") {
  35. ACCEPTS(CreateStringWithZeroSymbol("a", 0));
  36. ACCEPTS(CreateStringWithZeroSymbol("some text", 3));
  37. DENIES("string without zero");
  38. }
  39. REGEXP("the\\x00middle") {
  40. ACCEPTS(CreateStringWithZeroSymbol("in the middle", 6));
  41. DENIES(CreateStringWithZeroSymbol("in the middle", 5));
  42. DENIES("in the middle");
  43. }
  44. }
  45. Y_UNIT_TEST(SymbolsByCodes)
  46. {
  47. REGEXP("\\x{41}") {
  48. ACCEPTS("A");
  49. ACCEPTS("tAst string");
  50. DENIES("test string");
  51. }
  52. REGEXP("\\x26abc") {
  53. ACCEPTS("&abc;");
  54. DENIES("test &ab");
  55. DENIES("without");
  56. }
  57. }
  58. Y_UNIT_TEST(ErrorsWhileCompiling)
  59. {
  60. UNIT_ASSERT(HasError("\\x"));
  61. UNIT_ASSERT(HasError("\\x0"));
  62. UNIT_ASSERT(HasError("\\xfu"));
  63. UNIT_ASSERT(HasError("\\xs1"));
  64. UNIT_ASSERT(HasError("\\x 0"));
  65. UNIT_ASSERT(HasError("\\x0 "));
  66. UNIT_ASSERT(HasError("\\x{2A1"));
  67. UNIT_ASSERT(HasError("\\x{"));
  68. UNIT_ASSERT(HasError("\\x}"));
  69. UNIT_ASSERT(HasError("\\x2}"));
  70. UNIT_ASSERT(HasError("\\x{{3}"));
  71. UNIT_ASSERT(HasError("\\x{2a{5}"));
  72. UNIT_ASSERT(HasError("\\x{}"));
  73. UNIT_ASSERT(HasError("\\x{+3}"));
  74. UNIT_ASSERT(HasError("\\x{-3}"));
  75. UNIT_ASSERT(HasError("\\x{ 2F}"));
  76. UNIT_ASSERT(HasError("\\x{2A F}"));
  77. UNIT_ASSERT(HasError("\\x{2Arft}"));
  78. UNIT_ASSERT(HasError("\\x{110000}"));
  79. UNIT_ASSERT(!HasError("\\x{fB1}"));
  80. UNIT_ASSERT(!HasError("\\x00"));
  81. UNIT_ASSERT(!HasError("\\x{10FFFF}"));
  82. }
  83. Y_UNIT_TEST(OneCharacterRange)
  84. {
  85. SCANNER("[\\x{61}]") {
  86. ACCEPTS("a");
  87. ACCEPTS("bac");
  88. DENIES("test");
  89. }
  90. SCANNER("[\\x3f]") {
  91. ACCEPTS("?");
  92. ACCEPTS("test?");
  93. DENIES("test");
  94. }
  95. }
  96. Y_UNIT_TEST(CharacterRange) {
  97. REGEXP("[\\x{61}\\x62\\x{3f}\\x26]") {
  98. ACCEPTS("a");
  99. ACCEPTS("b");
  100. ACCEPTS("?");
  101. ACCEPTS("acd");
  102. ACCEPTS("bcd");
  103. ACCEPTS("cd?");
  104. ACCEPTS("ab?");
  105. DENIES("cd");
  106. }
  107. REGEXP("[\\x{61}-\\x{63}]") {
  108. ACCEPTS("a");
  109. ACCEPTS("b");
  110. ACCEPTS("c");
  111. ACCEPTS("qwertya");
  112. DENIES("d");
  113. }
  114. REGEXP("[\\x61-\\x61]") {
  115. ACCEPTS("a");
  116. ACCEPTS("qwertya");
  117. DENIES("b");
  118. }
  119. REGEXP("[\\x26\\x{61}-\\x{62}\\x{3f}]") {
  120. ACCEPTS("&");
  121. ACCEPTS("a");
  122. ACCEPTS("b");
  123. ACCEPTS("?");
  124. ACCEPTS("ade");
  125. ACCEPTS("ab?");
  126. DENIES("d");
  127. }
  128. REGEXP("[\\x{41}-\\x{42}\\x{61}-\\x{62}]") {
  129. ACCEPTS("a");
  130. ACCEPTS("b");
  131. ACCEPTS("A");
  132. ACCEPTS("B");
  133. DENIES("c");
  134. DENIES("C");
  135. }
  136. REGEXP("[\\x{41}-\\x{42}][\\x{61}-\\x{62}]") {
  137. ACCEPTS("Aa");
  138. ACCEPTS("Ab");
  139. ACCEPTS("Ba");
  140. ACCEPTS("Bb");
  141. DENIES("a");
  142. DENIES("b");
  143. DENIES("A");
  144. DENIES("B");
  145. DENIES("ab");
  146. DENIES("AB");
  147. DENIES("Ca");
  148. }
  149. }
  150. Y_UNIT_TEST(RangeExcludeCharacters) {
  151. REGEXP("[^\\x{61}]") {
  152. ACCEPTS("b");
  153. ACCEPTS("c");
  154. ACCEPTS("aba");
  155. DENIES("a");
  156. DENIES("aaa");
  157. }
  158. REGEXP("[^\\x{61}-\\x{7a}]") {
  159. ACCEPTS("A");
  160. ACCEPTS("123");
  161. ACCEPTS("acb1");
  162. DENIES("a");
  163. DENIES("abcxyz");
  164. }
  165. }
  166. Y_UNIT_TEST(MixedRange) {
  167. REGEXP("[\\x{61}B]") {
  168. ACCEPTS("a");
  169. ACCEPTS("B");
  170. ACCEPTS("atestB");
  171. DENIES("test");
  172. }
  173. REGEXP("[^\\x{61}A]") {
  174. ACCEPTS("b");
  175. ACCEPTS("B");
  176. ACCEPTS("atestB");
  177. DENIES("a");
  178. DENIES("A");
  179. DENIES("aaAA");
  180. }
  181. REGEXP("[0-9][\\x{61}-\\x{62}A-B]") {
  182. ACCEPTS("0a");
  183. ACCEPTS("1A");
  184. ACCEPTS("5b");
  185. ACCEPTS("9B");
  186. ACCEPTS("1atestB");
  187. ACCEPTS("2Atest");
  188. DENIES("aB");
  189. DENIES("testb");
  190. DENIES("test");
  191. }
  192. REGEXP("[\\x{61}-c]") {
  193. ACCEPTS("a");
  194. ACCEPTS("b");
  195. ACCEPTS("c");
  196. ACCEPTS("testb");
  197. DENIES("d");
  198. }
  199. REGEXP("[^a-\\x{7a}]") {
  200. ACCEPTS("A");
  201. ACCEPTS("123");
  202. ACCEPTS("acb1");
  203. DENIES("a");
  204. DENIES("abcxyz");
  205. }
  206. REGEXP("[\\x{41}-Ba-\\x{62}]") {
  207. ACCEPTS("a");
  208. ACCEPTS("b");
  209. ACCEPTS("A");
  210. ACCEPTS("B");
  211. DENIES("c");
  212. DENIES("C");
  213. }
  214. }
  215. Y_UNIT_TEST(CompilingRange)
  216. {
  217. UNIT_ASSERT(HasError("[\\x41"));
  218. UNIT_ASSERT(HasError("[\\xfq]"));
  219. UNIT_ASSERT(HasError("[\\x{01}-]"));
  220. UNIT_ASSERT(!HasError("[\\x{10FFFF}]"));
  221. UNIT_ASSERT(!HasError("[\\x{00}]"));
  222. UNIT_ASSERT(!HasError("[\\x{abc}-\\x{FFF}]"));
  223. UNIT_ASSERT(!HasError("[^\\xFF]"));
  224. UNIT_ASSERT(!HasError("[^\\x{FF}-\\x{FF0}]"));
  225. UNIT_ASSERT(!HasError("[-\\x{01}]"));
  226. }
  227. Y_UNIT_TEST(UnicodeRepetition)
  228. {
  229. REGEXP("^\\x{78}{3,6}$") {
  230. DENIES ("xx");
  231. ACCEPTS("xxx");
  232. ACCEPTS("xxxx");
  233. ACCEPTS("xxxxx");
  234. ACCEPTS("xxxxxx");
  235. DENIES ("xxxxxxx");
  236. }
  237. REGEXP("^x{3,}$") {
  238. DENIES ("xx");
  239. ACCEPTS("xxx");
  240. ACCEPTS("xxxx");
  241. ACCEPTS("xxxxxxxxxxx");
  242. ACCEPTS("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx");
  243. }
  244. REGEXP("^\\x{78}{3}$") {
  245. DENIES ("x");
  246. DENIES ("xx");
  247. ACCEPTS("xxx");
  248. DENIES ("xxxx");
  249. DENIES ("xxxxx");
  250. DENIES ("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx");
  251. }
  252. REGEXP("^([\\x{78}-\\x{79}]){2}$") {
  253. DENIES("x");
  254. DENIES("y");
  255. ACCEPTS("xx");
  256. ACCEPTS("xy");
  257. ACCEPTS("yx");
  258. ACCEPTS("yy");
  259. DENIES("xxy");
  260. DENIES("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx");
  261. }
  262. }
  263. Y_UNIT_TEST(AnyUnicodeCodepointIsAllowed)
  264. {
  265. REGEXP("[\\x{0}-\\x{77}\\x{79}-\\x{10ffff}]") {
  266. ACCEPTS("w");
  267. DENIES ("x");
  268. ACCEPTS("y");
  269. }
  270. }
  271. }