split.h 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390
  1. /* stringlib: split implementation */
  2. #ifndef STRINGLIB_FASTSEARCH_H
  3. #error must include "stringlib/fastsearch.h" before including this module
  4. #endif
  5. /* Overallocate the initial list to reduce the number of reallocs for small
  6. split sizes. Eg, "A A A A A A A A A A".split() (10 elements) has three
  7. resizes, to sizes 4, 8, then 16. Most observed string splits are for human
  8. text (roughly 11 words per line) and field delimited data (usually 1-10
  9. fields). For large strings the split algorithms are bandwidth limited
  10. so increasing the preallocation likely will not improve things.*/
  11. #define MAX_PREALLOC 12
  12. /* 5 splits gives 6 elements */
  13. #define PREALLOC_SIZE(maxsplit) \
  14. (maxsplit >= MAX_PREALLOC ? MAX_PREALLOC : maxsplit+1)
  15. #define SPLIT_APPEND(data, left, right) \
  16. sub = STRINGLIB_NEW((data) + (left), \
  17. (right) - (left)); \
  18. if (sub == NULL) \
  19. goto onError; \
  20. if (PyList_Append(list, sub)) { \
  21. Py_DECREF(sub); \
  22. goto onError; \
  23. } \
  24. else \
  25. Py_DECREF(sub);
  26. #define SPLIT_ADD(data, left, right) { \
  27. sub = STRINGLIB_NEW((data) + (left), \
  28. (right) - (left)); \
  29. if (sub == NULL) \
  30. goto onError; \
  31. if (count < MAX_PREALLOC) { \
  32. PyList_SET_ITEM(list, count, sub); \
  33. } else { \
  34. if (PyList_Append(list, sub)) { \
  35. Py_DECREF(sub); \
  36. goto onError; \
  37. } \
  38. else \
  39. Py_DECREF(sub); \
  40. } \
  41. count++; }
  42. /* Always force the list to the expected size. */
  43. #define FIX_PREALLOC_SIZE(list) Py_SET_SIZE(list, count)
  44. Py_LOCAL_INLINE(PyObject *)
  45. STRINGLIB(split_whitespace)(PyObject* str_obj,
  46. const STRINGLIB_CHAR* str, Py_ssize_t str_len,
  47. Py_ssize_t maxcount)
  48. {
  49. Py_ssize_t i, j, count=0;
  50. PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
  51. PyObject *sub;
  52. if (list == NULL)
  53. return NULL;
  54. i = j = 0;
  55. while (maxcount-- > 0) {
  56. while (i < str_len && STRINGLIB_ISSPACE(str[i]))
  57. i++;
  58. if (i == str_len) break;
  59. j = i; i++;
  60. while (i < str_len && !STRINGLIB_ISSPACE(str[i]))
  61. i++;
  62. #if !STRINGLIB_MUTABLE
  63. if (j == 0 && i == str_len && STRINGLIB_CHECK_EXACT(str_obj)) {
  64. /* No whitespace in str_obj, so just use it as list[0] */
  65. Py_INCREF(str_obj);
  66. PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
  67. count++;
  68. break;
  69. }
  70. #endif
  71. SPLIT_ADD(str, j, i);
  72. }
  73. if (i < str_len) {
  74. /* Only occurs when maxcount was reached */
  75. /* Skip any remaining whitespace and copy to end of string */
  76. while (i < str_len && STRINGLIB_ISSPACE(str[i]))
  77. i++;
  78. if (i != str_len)
  79. SPLIT_ADD(str, i, str_len);
  80. }
  81. FIX_PREALLOC_SIZE(list);
  82. return list;
  83. onError:
  84. Py_DECREF(list);
  85. return NULL;
  86. }
  87. Py_LOCAL_INLINE(PyObject *)
  88. STRINGLIB(split_char)(PyObject* str_obj,
  89. const STRINGLIB_CHAR* str, Py_ssize_t str_len,
  90. const STRINGLIB_CHAR ch,
  91. Py_ssize_t maxcount)
  92. {
  93. Py_ssize_t i, j, count=0;
  94. PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
  95. PyObject *sub;
  96. if (list == NULL)
  97. return NULL;
  98. i = j = 0;
  99. while ((j < str_len) && (maxcount-- > 0)) {
  100. for(; j < str_len; j++) {
  101. /* I found that using memchr makes no difference */
  102. if (str[j] == ch) {
  103. SPLIT_ADD(str, i, j);
  104. i = j = j + 1;
  105. break;
  106. }
  107. }
  108. }
  109. #if !STRINGLIB_MUTABLE
  110. if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
  111. /* ch not in str_obj, so just use str_obj as list[0] */
  112. Py_INCREF(str_obj);
  113. PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
  114. count++;
  115. } else
  116. #endif
  117. if (i <= str_len) {
  118. SPLIT_ADD(str, i, str_len);
  119. }
  120. FIX_PREALLOC_SIZE(list);
  121. return list;
  122. onError:
  123. Py_DECREF(list);
  124. return NULL;
  125. }
  126. Py_LOCAL_INLINE(PyObject *)
  127. STRINGLIB(split)(PyObject* str_obj,
  128. const STRINGLIB_CHAR* str, Py_ssize_t str_len,
  129. const STRINGLIB_CHAR* sep, Py_ssize_t sep_len,
  130. Py_ssize_t maxcount)
  131. {
  132. Py_ssize_t i, j, pos, count=0;
  133. PyObject *list, *sub;
  134. if (sep_len == 0) {
  135. PyErr_SetString(PyExc_ValueError, "empty separator");
  136. return NULL;
  137. }
  138. else if (sep_len == 1)
  139. return STRINGLIB(split_char)(str_obj, str, str_len, sep[0], maxcount);
  140. list = PyList_New(PREALLOC_SIZE(maxcount));
  141. if (list == NULL)
  142. return NULL;
  143. i = j = 0;
  144. while (maxcount-- > 0) {
  145. pos = FASTSEARCH(str+i, str_len-i, sep, sep_len, -1, FAST_SEARCH);
  146. if (pos < 0)
  147. break;
  148. j = i + pos;
  149. SPLIT_ADD(str, i, j);
  150. i = j + sep_len;
  151. }
  152. #if !STRINGLIB_MUTABLE
  153. if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
  154. /* No match in str_obj, so just use it as list[0] */
  155. Py_INCREF(str_obj);
  156. PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
  157. count++;
  158. } else
  159. #endif
  160. {
  161. SPLIT_ADD(str, i, str_len);
  162. }
  163. FIX_PREALLOC_SIZE(list);
  164. return list;
  165. onError:
  166. Py_DECREF(list);
  167. return NULL;
  168. }
  169. Py_LOCAL_INLINE(PyObject *)
  170. STRINGLIB(rsplit_whitespace)(PyObject* str_obj,
  171. const STRINGLIB_CHAR* str, Py_ssize_t str_len,
  172. Py_ssize_t maxcount)
  173. {
  174. Py_ssize_t i, j, count=0;
  175. PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
  176. PyObject *sub;
  177. if (list == NULL)
  178. return NULL;
  179. i = j = str_len - 1;
  180. while (maxcount-- > 0) {
  181. while (i >= 0 && STRINGLIB_ISSPACE(str[i]))
  182. i--;
  183. if (i < 0) break;
  184. j = i; i--;
  185. while (i >= 0 && !STRINGLIB_ISSPACE(str[i]))
  186. i--;
  187. #if !STRINGLIB_MUTABLE
  188. if (j == str_len - 1 && i < 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
  189. /* No whitespace in str_obj, so just use it as list[0] */
  190. Py_INCREF(str_obj);
  191. PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
  192. count++;
  193. break;
  194. }
  195. #endif
  196. SPLIT_ADD(str, i + 1, j + 1);
  197. }
  198. if (i >= 0) {
  199. /* Only occurs when maxcount was reached */
  200. /* Skip any remaining whitespace and copy to beginning of string */
  201. while (i >= 0 && STRINGLIB_ISSPACE(str[i]))
  202. i--;
  203. if (i >= 0)
  204. SPLIT_ADD(str, 0, i + 1);
  205. }
  206. FIX_PREALLOC_SIZE(list);
  207. if (PyList_Reverse(list) < 0)
  208. goto onError;
  209. return list;
  210. onError:
  211. Py_DECREF(list);
  212. return NULL;
  213. }
  214. Py_LOCAL_INLINE(PyObject *)
  215. STRINGLIB(rsplit_char)(PyObject* str_obj,
  216. const STRINGLIB_CHAR* str, Py_ssize_t str_len,
  217. const STRINGLIB_CHAR ch,
  218. Py_ssize_t maxcount)
  219. {
  220. Py_ssize_t i, j, count=0;
  221. PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
  222. PyObject *sub;
  223. if (list == NULL)
  224. return NULL;
  225. i = j = str_len - 1;
  226. while ((i >= 0) && (maxcount-- > 0)) {
  227. for(; i >= 0; i--) {
  228. if (str[i] == ch) {
  229. SPLIT_ADD(str, i + 1, j + 1);
  230. j = i = i - 1;
  231. break;
  232. }
  233. }
  234. }
  235. #if !STRINGLIB_MUTABLE
  236. if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
  237. /* ch not in str_obj, so just use str_obj as list[0] */
  238. Py_INCREF(str_obj);
  239. PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
  240. count++;
  241. } else
  242. #endif
  243. if (j >= -1) {
  244. SPLIT_ADD(str, 0, j + 1);
  245. }
  246. FIX_PREALLOC_SIZE(list);
  247. if (PyList_Reverse(list) < 0)
  248. goto onError;
  249. return list;
  250. onError:
  251. Py_DECREF(list);
  252. return NULL;
  253. }
  254. Py_LOCAL_INLINE(PyObject *)
  255. STRINGLIB(rsplit)(PyObject* str_obj,
  256. const STRINGLIB_CHAR* str, Py_ssize_t str_len,
  257. const STRINGLIB_CHAR* sep, Py_ssize_t sep_len,
  258. Py_ssize_t maxcount)
  259. {
  260. Py_ssize_t j, pos, count=0;
  261. PyObject *list, *sub;
  262. if (sep_len == 0) {
  263. PyErr_SetString(PyExc_ValueError, "empty separator");
  264. return NULL;
  265. }
  266. else if (sep_len == 1)
  267. return STRINGLIB(rsplit_char)(str_obj, str, str_len, sep[0], maxcount);
  268. list = PyList_New(PREALLOC_SIZE(maxcount));
  269. if (list == NULL)
  270. return NULL;
  271. j = str_len;
  272. while (maxcount-- > 0) {
  273. pos = FASTSEARCH(str, j, sep, sep_len, -1, FAST_RSEARCH);
  274. if (pos < 0)
  275. break;
  276. SPLIT_ADD(str, pos + sep_len, j);
  277. j = pos;
  278. }
  279. #if !STRINGLIB_MUTABLE
  280. if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
  281. /* No match in str_obj, so just use it as list[0] */
  282. Py_INCREF(str_obj);
  283. PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
  284. count++;
  285. } else
  286. #endif
  287. {
  288. SPLIT_ADD(str, 0, j);
  289. }
  290. FIX_PREALLOC_SIZE(list);
  291. if (PyList_Reverse(list) < 0)
  292. goto onError;
  293. return list;
  294. onError:
  295. Py_DECREF(list);
  296. return NULL;
  297. }
  298. Py_LOCAL_INLINE(PyObject *)
  299. STRINGLIB(splitlines)(PyObject* str_obj,
  300. const STRINGLIB_CHAR* str, Py_ssize_t str_len,
  301. int keepends)
  302. {
  303. /* This does not use the preallocated list because splitlines is
  304. usually run with hundreds of newlines. The overhead of
  305. switching between PyList_SET_ITEM and append causes about a
  306. 2-3% slowdown for that common case. A smarter implementation
  307. could move the if check out, so the SET_ITEMs are done first
  308. and the appends only done when the prealloc buffer is full.
  309. That's too much work for little gain.*/
  310. Py_ssize_t i;
  311. Py_ssize_t j;
  312. PyObject *list = PyList_New(0);
  313. PyObject *sub;
  314. if (list == NULL)
  315. return NULL;
  316. for (i = j = 0; i < str_len; ) {
  317. Py_ssize_t eol;
  318. /* Find a line and append it */
  319. while (i < str_len && !STRINGLIB_ISLINEBREAK(str[i]))
  320. i++;
  321. /* Skip the line break reading CRLF as one line break */
  322. eol = i;
  323. if (i < str_len) {
  324. if (str[i] == '\r' && i + 1 < str_len && str[i+1] == '\n')
  325. i += 2;
  326. else
  327. i++;
  328. if (keepends)
  329. eol = i;
  330. }
  331. #if !STRINGLIB_MUTABLE
  332. if (j == 0 && eol == str_len && STRINGLIB_CHECK_EXACT(str_obj)) {
  333. /* No linebreak in str_obj, so just use it as list[0] */
  334. if (PyList_Append(list, str_obj))
  335. goto onError;
  336. break;
  337. }
  338. #endif
  339. SPLIT_APPEND(str, j, eol);
  340. j = i;
  341. }
  342. return list;
  343. onError:
  344. Py_DECREF(list);
  345. return NULL;
  346. }