transmogrify.h 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739
  1. #if STRINGLIB_IS_UNICODE
  2. # error "transmogrify.h only compatible with byte-wise strings"
  3. #endif
  4. /* the more complicated methods. parts of these should be pulled out into the
  5. shared code in bytes_methods.c to cut down on duplicate code bloat. */
  6. /*[clinic input]
  7. class B "PyObject *" "&PyType_Type"
  8. [clinic start generated code]*/
  9. /*[clinic end generated code: output=da39a3ee5e6b4b0d input=2935558188d97c76]*/
  10. #include "clinic/transmogrify.h.h"
  11. static inline PyObject *
  12. return_self(PyObject *self)
  13. {
  14. #if !STRINGLIB_MUTABLE
  15. if (STRINGLIB_CHECK_EXACT(self)) {
  16. return Py_NewRef(self);
  17. }
  18. #endif
  19. return STRINGLIB_NEW(STRINGLIB_STR(self), STRINGLIB_LEN(self));
  20. }
  21. /*[clinic input]
  22. B.expandtabs as stringlib_expandtabs
  23. tabsize: int = 8
  24. Return a copy where all tab characters are expanded using spaces.
  25. If tabsize is not given, a tab size of 8 characters is assumed.
  26. [clinic start generated code]*/
  27. static PyObject *
  28. stringlib_expandtabs_impl(PyObject *self, int tabsize)
  29. /*[clinic end generated code: output=069cb7fae72e4c2b input=3c6d3b12aa3ccbea]*/
  30. {
  31. const char *e, *p;
  32. char *q;
  33. Py_ssize_t i, j;
  34. PyObject *u;
  35. /* First pass: determine size of output string */
  36. i = j = 0;
  37. e = STRINGLIB_STR(self) + STRINGLIB_LEN(self);
  38. for (p = STRINGLIB_STR(self); p < e; p++) {
  39. if (*p == '\t') {
  40. if (tabsize > 0) {
  41. Py_ssize_t incr = tabsize - (j % tabsize);
  42. if (j > PY_SSIZE_T_MAX - incr)
  43. goto overflow;
  44. j += incr;
  45. }
  46. }
  47. else {
  48. if (j > PY_SSIZE_T_MAX - 1)
  49. goto overflow;
  50. j++;
  51. if (*p == '\n' || *p == '\r') {
  52. if (i > PY_SSIZE_T_MAX - j)
  53. goto overflow;
  54. i += j;
  55. j = 0;
  56. }
  57. }
  58. }
  59. if (i > PY_SSIZE_T_MAX - j)
  60. goto overflow;
  61. /* Second pass: create output string and fill it */
  62. u = STRINGLIB_NEW(NULL, i + j);
  63. if (!u)
  64. return NULL;
  65. j = 0;
  66. q = STRINGLIB_STR(u);
  67. for (p = STRINGLIB_STR(self); p < e; p++) {
  68. if (*p == '\t') {
  69. if (tabsize > 0) {
  70. i = tabsize - (j % tabsize);
  71. j += i;
  72. while (i--)
  73. *q++ = ' ';
  74. }
  75. }
  76. else {
  77. j++;
  78. *q++ = *p;
  79. if (*p == '\n' || *p == '\r')
  80. j = 0;
  81. }
  82. }
  83. return u;
  84. overflow:
  85. PyErr_SetString(PyExc_OverflowError, "result too long");
  86. return NULL;
  87. }
  88. static inline PyObject *
  89. pad(PyObject *self, Py_ssize_t left, Py_ssize_t right, char fill)
  90. {
  91. PyObject *u;
  92. if (left < 0)
  93. left = 0;
  94. if (right < 0)
  95. right = 0;
  96. if (left == 0 && right == 0) {
  97. return return_self(self);
  98. }
  99. u = STRINGLIB_NEW(NULL, left + STRINGLIB_LEN(self) + right);
  100. if (u) {
  101. if (left)
  102. memset(STRINGLIB_STR(u), fill, left);
  103. memcpy(STRINGLIB_STR(u) + left,
  104. STRINGLIB_STR(self),
  105. STRINGLIB_LEN(self));
  106. if (right)
  107. memset(STRINGLIB_STR(u) + left + STRINGLIB_LEN(self),
  108. fill, right);
  109. }
  110. return u;
  111. }
  112. /*[clinic input]
  113. B.ljust as stringlib_ljust
  114. width: Py_ssize_t
  115. fillchar: char = b' '
  116. /
  117. Return a left-justified string of length width.
  118. Padding is done using the specified fill character.
  119. [clinic start generated code]*/
  120. static PyObject *
  121. stringlib_ljust_impl(PyObject *self, Py_ssize_t width, char fillchar)
  122. /*[clinic end generated code: output=c79ca173c5ff8337 input=eff2d014bc7d80df]*/
  123. {
  124. if (STRINGLIB_LEN(self) >= width) {
  125. return return_self(self);
  126. }
  127. return pad(self, 0, width - STRINGLIB_LEN(self), fillchar);
  128. }
  129. /*[clinic input]
  130. B.rjust as stringlib_rjust
  131. width: Py_ssize_t
  132. fillchar: char = b' '
  133. /
  134. Return a right-justified string of length width.
  135. Padding is done using the specified fill character.
  136. [clinic start generated code]*/
  137. static PyObject *
  138. stringlib_rjust_impl(PyObject *self, Py_ssize_t width, char fillchar)
  139. /*[clinic end generated code: output=7df5d728a5439570 input=218b0bd31308955d]*/
  140. {
  141. if (STRINGLIB_LEN(self) >= width) {
  142. return return_self(self);
  143. }
  144. return pad(self, width - STRINGLIB_LEN(self), 0, fillchar);
  145. }
  146. /*[clinic input]
  147. B.center as stringlib_center
  148. width: Py_ssize_t
  149. fillchar: char = b' '
  150. /
  151. Return a centered string of length width.
  152. Padding is done using the specified fill character.
  153. [clinic start generated code]*/
  154. static PyObject *
  155. stringlib_center_impl(PyObject *self, Py_ssize_t width, char fillchar)
  156. /*[clinic end generated code: output=d8da2e055288b4c2 input=3776fd278765d89b]*/
  157. {
  158. Py_ssize_t marg, left;
  159. if (STRINGLIB_LEN(self) >= width) {
  160. return return_self(self);
  161. }
  162. marg = width - STRINGLIB_LEN(self);
  163. left = marg / 2 + (marg & width & 1);
  164. return pad(self, left, marg - left, fillchar);
  165. }
  166. /*[clinic input]
  167. B.zfill as stringlib_zfill
  168. width: Py_ssize_t
  169. /
  170. Pad a numeric string with zeros on the left, to fill a field of the given width.
  171. The original string is never truncated.
  172. [clinic start generated code]*/
  173. static PyObject *
  174. stringlib_zfill_impl(PyObject *self, Py_ssize_t width)
  175. /*[clinic end generated code: output=0b3c684a7f1b2319 input=2da6d7b8e9bcb19a]*/
  176. {
  177. Py_ssize_t fill;
  178. PyObject *s;
  179. char *p;
  180. if (STRINGLIB_LEN(self) >= width) {
  181. return return_self(self);
  182. }
  183. fill = width - STRINGLIB_LEN(self);
  184. s = pad(self, fill, 0, '0');
  185. if (s == NULL)
  186. return NULL;
  187. p = STRINGLIB_STR(s);
  188. if (p[fill] == '+' || p[fill] == '-') {
  189. /* move sign to beginning of string */
  190. p[0] = p[fill];
  191. p[fill] = '0';
  192. }
  193. return s;
  194. }
  195. /* find and count characters and substrings */
  196. #define findchar(target, target_len, c) \
  197. ((char *)memchr((const void *)(target), c, target_len))
  198. static Py_ssize_t
  199. countchar(const char *target, Py_ssize_t target_len, char c,
  200. Py_ssize_t maxcount)
  201. {
  202. Py_ssize_t count = 0;
  203. const char *start = target;
  204. const char *end = target + target_len;
  205. while ((start = findchar(start, end - start, c)) != NULL) {
  206. count++;
  207. if (count >= maxcount)
  208. break;
  209. start += 1;
  210. }
  211. return count;
  212. }
  213. /* Algorithms for different cases of string replacement */
  214. /* len(self)>=1, from="", len(to)>=1, maxcount>=1 */
  215. static PyObject *
  216. stringlib_replace_interleave(PyObject *self,
  217. const char *to_s, Py_ssize_t to_len,
  218. Py_ssize_t maxcount)
  219. {
  220. const char *self_s;
  221. char *result_s;
  222. Py_ssize_t self_len, result_len;
  223. Py_ssize_t count, i;
  224. PyObject *result;
  225. self_len = STRINGLIB_LEN(self);
  226. /* 1 at the end plus 1 after every character;
  227. count = min(maxcount, self_len + 1) */
  228. if (maxcount <= self_len) {
  229. count = maxcount;
  230. }
  231. else {
  232. /* Can't overflow: self_len + 1 <= maxcount <= PY_SSIZE_T_MAX. */
  233. count = self_len + 1;
  234. }
  235. /* Check for overflow */
  236. /* result_len = count * to_len + self_len; */
  237. assert(count > 0);
  238. if (to_len > (PY_SSIZE_T_MAX - self_len) / count) {
  239. PyErr_SetString(PyExc_OverflowError,
  240. "replace bytes is too long");
  241. return NULL;
  242. }
  243. result_len = count * to_len + self_len;
  244. result = STRINGLIB_NEW(NULL, result_len);
  245. if (result == NULL) {
  246. return NULL;
  247. }
  248. self_s = STRINGLIB_STR(self);
  249. result_s = STRINGLIB_STR(result);
  250. if (to_len > 1) {
  251. /* Lay the first one down (guaranteed this will occur) */
  252. memcpy(result_s, to_s, to_len);
  253. result_s += to_len;
  254. count -= 1;
  255. for (i = 0; i < count; i++) {
  256. *result_s++ = *self_s++;
  257. memcpy(result_s, to_s, to_len);
  258. result_s += to_len;
  259. }
  260. }
  261. else {
  262. result_s[0] = to_s[0];
  263. result_s += to_len;
  264. count -= 1;
  265. for (i = 0; i < count; i++) {
  266. *result_s++ = *self_s++;
  267. result_s[0] = to_s[0];
  268. result_s += to_len;
  269. }
  270. }
  271. /* Copy the rest of the original string */
  272. memcpy(result_s, self_s, self_len - i);
  273. return result;
  274. }
  275. /* Special case for deleting a single character */
  276. /* len(self)>=1, len(from)==1, to="", maxcount>=1 */
  277. static PyObject *
  278. stringlib_replace_delete_single_character(PyObject *self,
  279. char from_c, Py_ssize_t maxcount)
  280. {
  281. const char *self_s, *start, *next, *end;
  282. char *result_s;
  283. Py_ssize_t self_len, result_len;
  284. Py_ssize_t count;
  285. PyObject *result;
  286. self_len = STRINGLIB_LEN(self);
  287. self_s = STRINGLIB_STR(self);
  288. count = countchar(self_s, self_len, from_c, maxcount);
  289. if (count == 0) {
  290. return return_self(self);
  291. }
  292. result_len = self_len - count; /* from_len == 1 */
  293. assert(result_len>=0);
  294. result = STRINGLIB_NEW(NULL, result_len);
  295. if (result == NULL) {
  296. return NULL;
  297. }
  298. result_s = STRINGLIB_STR(result);
  299. start = self_s;
  300. end = self_s + self_len;
  301. while (count-- > 0) {
  302. next = findchar(start, end - start, from_c);
  303. if (next == NULL)
  304. break;
  305. memcpy(result_s, start, next - start);
  306. result_s += (next - start);
  307. start = next + 1;
  308. }
  309. memcpy(result_s, start, end - start);
  310. return result;
  311. }
  312. /* len(self)>=1, len(from)>=2, to="", maxcount>=1 */
  313. static PyObject *
  314. stringlib_replace_delete_substring(PyObject *self,
  315. const char *from_s, Py_ssize_t from_len,
  316. Py_ssize_t maxcount)
  317. {
  318. const char *self_s, *start, *next, *end;
  319. char *result_s;
  320. Py_ssize_t self_len, result_len;
  321. Py_ssize_t count, offset;
  322. PyObject *result;
  323. self_len = STRINGLIB_LEN(self);
  324. self_s = STRINGLIB_STR(self);
  325. count = stringlib_count(self_s, self_len,
  326. from_s, from_len,
  327. maxcount);
  328. if (count == 0) {
  329. /* no matches */
  330. return return_self(self);
  331. }
  332. result_len = self_len - (count * from_len);
  333. assert (result_len>=0);
  334. result = STRINGLIB_NEW(NULL, result_len);
  335. if (result == NULL) {
  336. return NULL;
  337. }
  338. result_s = STRINGLIB_STR(result);
  339. start = self_s;
  340. end = self_s + self_len;
  341. while (count-- > 0) {
  342. offset = stringlib_find(start, end - start,
  343. from_s, from_len,
  344. 0);
  345. if (offset == -1)
  346. break;
  347. next = start + offset;
  348. memcpy(result_s, start, next - start);
  349. result_s += (next - start);
  350. start = next + from_len;
  351. }
  352. memcpy(result_s, start, end - start);
  353. return result;
  354. }
  355. /* len(self)>=1, len(from)==len(to)==1, maxcount>=1 */
  356. static PyObject *
  357. stringlib_replace_single_character_in_place(PyObject *self,
  358. char from_c, char to_c,
  359. Py_ssize_t maxcount)
  360. {
  361. const char *self_s, *end;
  362. char *result_s, *start, *next;
  363. Py_ssize_t self_len;
  364. PyObject *result;
  365. /* The result string will be the same size */
  366. self_s = STRINGLIB_STR(self);
  367. self_len = STRINGLIB_LEN(self);
  368. next = findchar(self_s, self_len, from_c);
  369. if (next == NULL) {
  370. /* No matches; return the original bytes */
  371. return return_self(self);
  372. }
  373. /* Need to make a new bytes */
  374. result = STRINGLIB_NEW(NULL, self_len);
  375. if (result == NULL) {
  376. return NULL;
  377. }
  378. result_s = STRINGLIB_STR(result);
  379. memcpy(result_s, self_s, self_len);
  380. /* change everything in-place, starting with this one */
  381. start = result_s + (next - self_s);
  382. *start = to_c;
  383. start++;
  384. end = result_s + self_len;
  385. while (--maxcount > 0) {
  386. next = findchar(start, end - start, from_c);
  387. if (next == NULL)
  388. break;
  389. *next = to_c;
  390. start = next + 1;
  391. }
  392. return result;
  393. }
  394. /* len(self)>=1, len(from)==len(to)>=2, maxcount>=1 */
  395. static PyObject *
  396. stringlib_replace_substring_in_place(PyObject *self,
  397. const char *from_s, Py_ssize_t from_len,
  398. const char *to_s, Py_ssize_t to_len,
  399. Py_ssize_t maxcount)
  400. {
  401. const char *self_s, *end;
  402. char *result_s, *start;
  403. Py_ssize_t self_len, offset;
  404. PyObject *result;
  405. /* The result bytes will be the same size */
  406. self_s = STRINGLIB_STR(self);
  407. self_len = STRINGLIB_LEN(self);
  408. offset = stringlib_find(self_s, self_len,
  409. from_s, from_len,
  410. 0);
  411. if (offset == -1) {
  412. /* No matches; return the original bytes */
  413. return return_self(self);
  414. }
  415. /* Need to make a new bytes */
  416. result = STRINGLIB_NEW(NULL, self_len);
  417. if (result == NULL) {
  418. return NULL;
  419. }
  420. result_s = STRINGLIB_STR(result);
  421. memcpy(result_s, self_s, self_len);
  422. /* change everything in-place, starting with this one */
  423. start = result_s + offset;
  424. memcpy(start, to_s, from_len);
  425. start += from_len;
  426. end = result_s + self_len;
  427. while ( --maxcount > 0) {
  428. offset = stringlib_find(start, end - start,
  429. from_s, from_len,
  430. 0);
  431. if (offset == -1)
  432. break;
  433. memcpy(start + offset, to_s, from_len);
  434. start += offset + from_len;
  435. }
  436. return result;
  437. }
  438. /* len(self)>=1, len(from)==1, len(to)>=2, maxcount>=1 */
  439. static PyObject *
  440. stringlib_replace_single_character(PyObject *self,
  441. char from_c,
  442. const char *to_s, Py_ssize_t to_len,
  443. Py_ssize_t maxcount)
  444. {
  445. const char *self_s, *start, *next, *end;
  446. char *result_s;
  447. Py_ssize_t self_len, result_len;
  448. Py_ssize_t count;
  449. PyObject *result;
  450. self_s = STRINGLIB_STR(self);
  451. self_len = STRINGLIB_LEN(self);
  452. count = countchar(self_s, self_len, from_c, maxcount);
  453. if (count == 0) {
  454. /* no matches, return unchanged */
  455. return return_self(self);
  456. }
  457. /* use the difference between current and new, hence the "-1" */
  458. /* result_len = self_len + count * (to_len-1) */
  459. assert(count > 0);
  460. if (to_len - 1 > (PY_SSIZE_T_MAX - self_len) / count) {
  461. PyErr_SetString(PyExc_OverflowError, "replace bytes is too long");
  462. return NULL;
  463. }
  464. result_len = self_len + count * (to_len - 1);
  465. result = STRINGLIB_NEW(NULL, result_len);
  466. if (result == NULL) {
  467. return NULL;
  468. }
  469. result_s = STRINGLIB_STR(result);
  470. start = self_s;
  471. end = self_s + self_len;
  472. while (count-- > 0) {
  473. next = findchar(start, end - start, from_c);
  474. if (next == NULL)
  475. break;
  476. if (next == start) {
  477. /* replace with the 'to' */
  478. memcpy(result_s, to_s, to_len);
  479. result_s += to_len;
  480. start += 1;
  481. } else {
  482. /* copy the unchanged old then the 'to' */
  483. memcpy(result_s, start, next - start);
  484. result_s += (next - start);
  485. memcpy(result_s, to_s, to_len);
  486. result_s += to_len;
  487. start = next + 1;
  488. }
  489. }
  490. /* Copy the remainder of the remaining bytes */
  491. memcpy(result_s, start, end - start);
  492. return result;
  493. }
  494. /* len(self)>=1, len(from)>=2, len(to)>=2, maxcount>=1 */
  495. static PyObject *
  496. stringlib_replace_substring(PyObject *self,
  497. const char *from_s, Py_ssize_t from_len,
  498. const char *to_s, Py_ssize_t to_len,
  499. Py_ssize_t maxcount)
  500. {
  501. const char *self_s, *start, *next, *end;
  502. char *result_s;
  503. Py_ssize_t self_len, result_len;
  504. Py_ssize_t count, offset;
  505. PyObject *result;
  506. self_s = STRINGLIB_STR(self);
  507. self_len = STRINGLIB_LEN(self);
  508. count = stringlib_count(self_s, self_len,
  509. from_s, from_len,
  510. maxcount);
  511. if (count == 0) {
  512. /* no matches, return unchanged */
  513. return return_self(self);
  514. }
  515. /* Check for overflow */
  516. /* result_len = self_len + count * (to_len-from_len) */
  517. assert(count > 0);
  518. if (to_len - from_len > (PY_SSIZE_T_MAX - self_len) / count) {
  519. PyErr_SetString(PyExc_OverflowError, "replace bytes is too long");
  520. return NULL;
  521. }
  522. result_len = self_len + count * (to_len - from_len);
  523. result = STRINGLIB_NEW(NULL, result_len);
  524. if (result == NULL) {
  525. return NULL;
  526. }
  527. result_s = STRINGLIB_STR(result);
  528. start = self_s;
  529. end = self_s + self_len;
  530. while (count-- > 0) {
  531. offset = stringlib_find(start, end - start,
  532. from_s, from_len,
  533. 0);
  534. if (offset == -1)
  535. break;
  536. next = start + offset;
  537. if (next == start) {
  538. /* replace with the 'to' */
  539. memcpy(result_s, to_s, to_len);
  540. result_s += to_len;
  541. start += from_len;
  542. } else {
  543. /* copy the unchanged old then the 'to' */
  544. memcpy(result_s, start, next - start);
  545. result_s += (next - start);
  546. memcpy(result_s, to_s, to_len);
  547. result_s += to_len;
  548. start = next + from_len;
  549. }
  550. }
  551. /* Copy the remainder of the remaining bytes */
  552. memcpy(result_s, start, end - start);
  553. return result;
  554. }
  555. static PyObject *
  556. stringlib_replace(PyObject *self,
  557. const char *from_s, Py_ssize_t from_len,
  558. const char *to_s, Py_ssize_t to_len,
  559. Py_ssize_t maxcount)
  560. {
  561. if (STRINGLIB_LEN(self) < from_len) {
  562. /* nothing to do; return the original bytes */
  563. return return_self(self);
  564. }
  565. if (maxcount < 0) {
  566. maxcount = PY_SSIZE_T_MAX;
  567. } else if (maxcount == 0) {
  568. /* nothing to do; return the original bytes */
  569. return return_self(self);
  570. }
  571. /* Handle zero-length special cases */
  572. if (from_len == 0) {
  573. if (to_len == 0) {
  574. /* nothing to do; return the original bytes */
  575. return return_self(self);
  576. }
  577. /* insert the 'to' bytes everywhere. */
  578. /* >>> b"Python".replace(b"", b".") */
  579. /* b'.P.y.t.h.o.n.' */
  580. return stringlib_replace_interleave(self, to_s, to_len, maxcount);
  581. }
  582. if (to_len == 0) {
  583. /* delete all occurrences of 'from' bytes */
  584. if (from_len == 1) {
  585. return stringlib_replace_delete_single_character(
  586. self, from_s[0], maxcount);
  587. } else {
  588. return stringlib_replace_delete_substring(
  589. self, from_s, from_len, maxcount);
  590. }
  591. }
  592. /* Handle special case where both bytes have the same length */
  593. if (from_len == to_len) {
  594. if (from_len == 1) {
  595. return stringlib_replace_single_character_in_place(
  596. self, from_s[0], to_s[0], maxcount);
  597. } else {
  598. return stringlib_replace_substring_in_place(
  599. self, from_s, from_len, to_s, to_len, maxcount);
  600. }
  601. }
  602. /* Otherwise use the more generic algorithms */
  603. if (from_len == 1) {
  604. return stringlib_replace_single_character(
  605. self, from_s[0], to_s, to_len, maxcount);
  606. } else {
  607. /* len('from')>=2, len('to')>=1 */
  608. return stringlib_replace_substring(
  609. self, from_s, from_len, to_s, to_len, maxcount);
  610. }
  611. }
  612. #undef findchar