lorem.c 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285
  1. /*
  2. * Copyright (c) Meta Platforms, Inc. and affiliates.
  3. * All rights reserved.
  4. *
  5. * This source code is licensed under both the BSD-style license (found in the
  6. * LICENSE file in the root directory of this source tree) and the GPLv2 (found
  7. * in the COPYING file in the root directory of this source tree).
  8. * You may select, at your option, one of the above-listed licenses.
  9. */
  10. /* Implementation notes:
  11. *
  12. * This is a very simple lorem ipsum generator
  13. * which features a static list of words
  14. * and prints them one after another randomly
  15. * with a fake sentence / paragraph structure.
  16. *
  17. * The goal is to generate a printable text
  18. * that can be used to fake a text compression scenario.
  19. * The resulting compression / ratio curve of the lorem ipsum generator
  20. * is more satisfying than the previous statistical generator,
  21. * which was initially designed for entropy compression,
  22. * and lacks a regularity more representative of text.
  23. *
  24. * The compression ratio achievable on the generated lorem ipsum
  25. * is still a bit too good, presumably because the dictionary is a bit too
  26. * small. It would be possible to create some more complex scheme, notably by
  27. * enlarging the dictionary with a word generator, and adding grammatical rules
  28. * (composition) and syntax rules. But that's probably overkill for the intended
  29. * goal.
  30. */
  31. #include "lorem.h"
  32. #include <assert.h>
  33. #include <limits.h> /* INT_MAX */
  34. #include <string.h> /* memcpy */
  35. #define WORD_MAX_SIZE 20
  36. /* Define the word pool */
  37. static const char* kWords[] = {
  38. "lorem", "ipsum", "dolor", "sit", "amet",
  39. "consectetur", "adipiscing", "elit", "sed", "do",
  40. "eiusmod", "tempor", "incididunt", "ut", "labore",
  41. "et", "dolore", "magna", "aliqua", "dis",
  42. "lectus", "vestibulum", "mattis", "ullamcorper", "velit",
  43. "commodo", "a", "lacus", "arcu", "magnis",
  44. "parturient", "montes", "nascetur", "ridiculus", "mus",
  45. "mauris", "nulla", "malesuada", "pellentesque", "eget",
  46. "gravida", "in", "dictum", "non", "erat",
  47. "nam", "voluptat", "maecenas", "blandit", "aliquam",
  48. "etiam", "enim", "lobortis", "scelerisque", "fermentum",
  49. "dui", "faucibus", "ornare", "at", "elementum",
  50. "eu", "facilisis", "odio", "morbi", "quis",
  51. "eros", "donec", "ac", "orci", "purus",
  52. "turpis", "cursus", "leo", "vel", "porta",
  53. "consequat", "interdum", "varius", "vulputate", "aliquet",
  54. "pharetra", "nunc", "auctor", "urna", "id",
  55. "metus", "viverra", "nibh", "cras", "mi",
  56. "unde", "omnis", "iste", "natus", "error",
  57. "perspiciatis", "voluptatem", "accusantium", "doloremque", "laudantium",
  58. "totam", "rem", "aperiam", "eaque", "ipsa",
  59. "quae", "ab", "illo", "inventore", "veritatis",
  60. "quasi", "architecto", "beatae", "vitae", "dicta",
  61. "sunt", "explicabo", "nemo", "ipsam", "quia",
  62. "voluptas", "aspernatur", "aut", "odit", "fugit",
  63. "consequuntur", "magni", "dolores", "eos", "qui",
  64. "ratione", "sequi", "nesciunt", "neque", "porro",
  65. "quisquam", "est", "dolorem", "adipisci", "numquam",
  66. "eius", "modi", "tempora", "incidunt", "magnam",
  67. "quaerat", "ad", "minima", "veniam", "nostrum",
  68. "ullam", "corporis", "suscipit", "laboriosam", "nisi",
  69. "aliquid", "ex", "ea", "commodi", "consequatur",
  70. "autem", "eum", "iure", "voluptate", "esse",
  71. "quam", "nihil", "molestiae", "illum", "fugiat",
  72. "quo", "pariatur", "vero", "accusamus", "iusto",
  73. "dignissimos", "ducimus", "blanditiis", "praesentium", "voluptatum",
  74. "deleniti", "atque", "corrupti", "quos", "quas",
  75. "molestias", "excepturi", "sint", "occaecati", "cupiditate",
  76. "provident", "similique", "culpa", "officia", "deserunt",
  77. "mollitia", "animi", "laborum", "dolorum", "fuga",
  78. "harum", "quidem", "rerum", "facilis", "expedita",
  79. "distinctio", "libero", "tempore", "cum", "soluta",
  80. "nobis", "eligendi", "optio", "cumque", "impedit",
  81. "minus", "quod", "maxime", "placeat", "facere",
  82. "possimus", "assumenda", "repellendus", "temporibus", "quibusdam",
  83. "officiis", "debitis", "saepe", "eveniet", "voluptates",
  84. "repudiandae", "recusandae", "itaque", "earum", "hic",
  85. "tenetur", "sapiente", "delectus", "reiciendis", "cillum",
  86. "maiores", "alias", "perferendis", "doloribus", "asperiores",
  87. "repellat", "minim", "nostrud", "exercitation", "ullamco",
  88. "laboris", "aliquip", "duis", "aute", "irure",
  89. };
  90. static const unsigned kNbWords = sizeof(kWords) / sizeof(kWords[0]);
  91. /* simple 1-dimension distribution, based on word's length, favors small words
  92. */
  93. static const int kWeights[] = { 0, 8, 6, 4, 3, 2 };
  94. static const size_t kNbWeights = sizeof(kWeights) / sizeof(kWeights[0]);
  95. #define DISTRIB_SIZE_MAX 650
  96. static int g_distrib[DISTRIB_SIZE_MAX] = { 0 };
  97. static unsigned g_distribCount = 0;
  98. static void countFreqs(
  99. const char* words[],
  100. size_t nbWords,
  101. const int* weights,
  102. size_t nbWeights)
  103. {
  104. unsigned total = 0;
  105. size_t w;
  106. for (w = 0; w < nbWords; w++) {
  107. size_t len = strlen(words[w]);
  108. int lmax;
  109. if (len >= nbWeights)
  110. len = nbWeights - 1;
  111. lmax = weights[len];
  112. total += (unsigned)lmax;
  113. }
  114. g_distribCount = total;
  115. assert(g_distribCount <= DISTRIB_SIZE_MAX);
  116. }
  117. static void init_word_distrib(
  118. const char* words[],
  119. size_t nbWords,
  120. const int* weights,
  121. size_t nbWeights)
  122. {
  123. size_t w, d = 0;
  124. countFreqs(words, nbWords, weights, nbWeights);
  125. for (w = 0; w < nbWords; w++) {
  126. size_t len = strlen(words[w]);
  127. int l, lmax;
  128. if (len >= nbWeights)
  129. len = nbWeights - 1;
  130. lmax = weights[len];
  131. for (l = 0; l < lmax; l++) {
  132. g_distrib[d++] = (int)w;
  133. }
  134. }
  135. }
  136. /* Note: this unit only works when invoked sequentially.
  137. * No concurrent access is allowed */
  138. static char* g_ptr = NULL;
  139. static size_t g_nbChars = 0;
  140. static size_t g_maxChars = 10000000;
  141. static unsigned g_randRoot = 0;
  142. #define RDG_rotl32(x, r) ((x << r) | (x >> (32 - r)))
  143. static unsigned LOREM_rand(unsigned range)
  144. {
  145. static const unsigned prime1 = 2654435761U;
  146. static const unsigned prime2 = 2246822519U;
  147. unsigned rand32 = g_randRoot;
  148. rand32 *= prime1;
  149. rand32 ^= prime2;
  150. rand32 = RDG_rotl32(rand32, 13);
  151. g_randRoot = rand32;
  152. return (unsigned)(((unsigned long long)rand32 * range) >> 32);
  153. }
  154. static void writeLastCharacters(void)
  155. {
  156. size_t lastChars = g_maxChars - g_nbChars;
  157. assert(g_maxChars >= g_nbChars);
  158. if (lastChars == 0)
  159. return;
  160. g_ptr[g_nbChars++] = '.';
  161. if (lastChars > 2) {
  162. memset(g_ptr + g_nbChars, ' ', lastChars - 2);
  163. }
  164. if (lastChars > 1) {
  165. g_ptr[g_maxChars - 1] = '\n';
  166. }
  167. g_nbChars = g_maxChars;
  168. }
  169. static void generateWord(const char* word, const char* separator, int upCase)
  170. {
  171. size_t const len = strlen(word) + strlen(separator);
  172. if (g_nbChars + len > g_maxChars) {
  173. writeLastCharacters();
  174. return;
  175. }
  176. memcpy(g_ptr + g_nbChars, word, strlen(word));
  177. if (upCase) {
  178. static const char toUp = 'A' - 'a';
  179. g_ptr[g_nbChars] = (char)(g_ptr[g_nbChars] + toUp);
  180. }
  181. g_nbChars += strlen(word);
  182. memcpy(g_ptr + g_nbChars, separator, strlen(separator));
  183. g_nbChars += strlen(separator);
  184. }
  185. static int about(unsigned target)
  186. {
  187. return (int)(LOREM_rand(target) + LOREM_rand(target) + 1);
  188. }
  189. /* Function to generate a random sentence */
  190. static void generateSentence(int nbWords)
  191. {
  192. int commaPos = about(9);
  193. int comma2 = commaPos + about(7);
  194. int qmark = (LOREM_rand(11) == 7);
  195. const char* endSep = qmark ? "? " : ". ";
  196. int i;
  197. for (i = 0; i < nbWords; i++) {
  198. int const wordID = g_distrib[LOREM_rand(g_distribCount)];
  199. const char* const word = kWords[wordID];
  200. const char* sep = " ";
  201. if (i == commaPos)
  202. sep = ", ";
  203. if (i == comma2)
  204. sep = ", ";
  205. if (i == nbWords - 1)
  206. sep = endSep;
  207. generateWord(word, sep, i == 0);
  208. }
  209. }
  210. static void generateParagraph(int nbSentences)
  211. {
  212. int i;
  213. for (i = 0; i < nbSentences; i++) {
  214. int wordsPerSentence = about(11);
  215. generateSentence(wordsPerSentence);
  216. }
  217. if (g_nbChars < g_maxChars) {
  218. g_ptr[g_nbChars++] = '\n';
  219. }
  220. if (g_nbChars < g_maxChars) {
  221. g_ptr[g_nbChars++] = '\n';
  222. }
  223. }
  224. /* It's "common" for lorem ipsum generators to start with the same first
  225. * pre-defined sentence */
  226. static void generateFirstSentence(void)
  227. {
  228. int i;
  229. for (i = 0; i < 18; i++) {
  230. const char* word = kWords[i];
  231. const char* separator = " ";
  232. if (i == 4)
  233. separator = ", ";
  234. if (i == 7)
  235. separator = ", ";
  236. generateWord(word, separator, i == 0);
  237. }
  238. generateWord(kWords[18], ". ", 0);
  239. }
  240. size_t
  241. LOREM_genBlock(void* buffer, size_t size, unsigned seed, int first, int fill)
  242. {
  243. g_ptr = (char*)buffer;
  244. assert(size < INT_MAX);
  245. g_maxChars = size;
  246. g_nbChars = 0;
  247. g_randRoot = seed;
  248. if (g_distribCount == 0) {
  249. init_word_distrib(kWords, kNbWords, kWeights, kNbWeights);
  250. }
  251. if (first) {
  252. generateFirstSentence();
  253. }
  254. while (g_nbChars < g_maxChars) {
  255. int sentencePerParagraph = about(7);
  256. generateParagraph(sentencePerParagraph);
  257. if (!fill)
  258. break; /* only generate one paragraph in not-fill mode */
  259. }
  260. g_ptr = NULL;
  261. return g_nbChars;
  262. }
  263. void LOREM_genBuffer(void* buffer, size_t size, unsigned seed)
  264. {
  265. LOREM_genBlock(buffer, size, seed, 1, 1);
  266. }