123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285 |
- /*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under both the BSD-style license (found in the
- * LICENSE file in the root directory of this source tree) and the GPLv2 (found
- * in the COPYING file in the root directory of this source tree).
- * You may select, at your option, one of the above-listed licenses.
- */
- /* Implementation notes:
- *
- * This is a very simple lorem ipsum generator
- * which features a static list of words
- * and prints them one after another randomly
- * with a fake sentence / paragraph structure.
- *
- * The goal is to generate a printable text
- * that can be used to fake a text compression scenario.
- * The resulting compression / ratio curve of the lorem ipsum generator
- * is more satisfying than the previous statistical generator,
- * which was initially designed for entropy compression,
- * and lacks a regularity more representative of text.
- *
- * The compression ratio achievable on the generated lorem ipsum
- * is still a bit too good, presumably because the dictionary is a bit too
- * small. It would be possible to create some more complex scheme, notably by
- * enlarging the dictionary with a word generator, and adding grammatical rules
- * (composition) and syntax rules. But that's probably overkill for the intended
- * goal.
- */
- #include "lorem.h"
- #include <assert.h>
- #include <limits.h> /* INT_MAX */
- #include <string.h> /* memcpy */
/* Upper bound on the length of one dictionary word.
 * NOTE(review): not referenced by the code visible in this file. */
#define WORD_MAX_SIZE 20

/* Word pool.
 * Entries 0..18, read in order, form the fixed opening sentence emitted by
 * generateFirstSentence(), so the order of these first words matters. */
static const char* kWords[] = {
    "lorem", "ipsum", "dolor", "sit", "amet",
    "consectetur", "adipiscing", "elit", "sed", "do",
    "eiusmod", "tempor", "incididunt", "ut", "labore",
    "et", "dolore", "magna", "aliqua", "dis",
    "lectus", "vestibulum", "mattis", "ullamcorper", "velit",
    "commodo", "a", "lacus", "arcu", "magnis",
    "parturient", "montes", "nascetur", "ridiculus", "mus",
    "mauris", "nulla", "malesuada", "pellentesque", "eget",
    "gravida", "in", "dictum", "non", "erat",
    "nam", "voluptat", "maecenas", "blandit", "aliquam",
    "etiam", "enim", "lobortis", "scelerisque", "fermentum",
    "dui", "faucibus", "ornare", "at", "elementum",
    "eu", "facilisis", "odio", "morbi", "quis",
    "eros", "donec", "ac", "orci", "purus",
    "turpis", "cursus", "leo", "vel", "porta",
    "consequat", "interdum", "varius", "vulputate", "aliquet",
    "pharetra", "nunc", "auctor", "urna", "id",
    "metus", "viverra", "nibh", "cras", "mi",
    "unde", "omnis", "iste", "natus", "error",
    "perspiciatis", "voluptatem", "accusantium", "doloremque", "laudantium",
    "totam", "rem", "aperiam", "eaque", "ipsa",
    "quae", "ab", "illo", "inventore", "veritatis",
    "quasi", "architecto", "beatae", "vitae", "dicta",
    "sunt", "explicabo", "nemo", "ipsam", "quia",
    "voluptas", "aspernatur", "aut", "odit", "fugit",
    "consequuntur", "magni", "dolores", "eos", "qui",
    "ratione", "sequi", "nesciunt", "neque", "porro",
    "quisquam", "est", "dolorem", "adipisci", "numquam",
    "eius", "modi", "tempora", "incidunt", "magnam",
    "quaerat", "ad", "minima", "veniam", "nostrum",
    "ullam", "corporis", "suscipit", "laboriosam", "nisi",
    "aliquid", "ex", "ea", "commodi", "consequatur",
    "autem", "eum", "iure", "voluptate", "esse",
    "quam", "nihil", "molestiae", "illum", "fugiat",
    "quo", "pariatur", "vero", "accusamus", "iusto",
    "dignissimos", "ducimus", "blanditiis", "praesentium", "voluptatum",
    "deleniti", "atque", "corrupti", "quos", "quas",
    "molestias", "excepturi", "sint", "occaecati", "cupiditate",
    "provident", "similique", "culpa", "officia", "deserunt",
    "mollitia", "animi", "laborum", "dolorum", "fuga",
    "harum", "quidem", "rerum", "facilis", "expedita",
    "distinctio", "libero", "tempore", "cum", "soluta",
    "nobis", "eligendi", "optio", "cumque", "impedit",
    "minus", "quod", "maxime", "placeat", "facere",
    "possimus", "assumenda", "repellendus", "temporibus", "quibusdam",
    "officiis", "debitis", "saepe", "eveniet", "voluptates",
    "repudiandae", "recusandae", "itaque", "earum", "hic",
    "tenetur", "sapiente", "delectus", "reiciendis", "cillum",
    "maiores", "alias", "perferendis", "doloribus", "asperiores",
    "repellat", "minim", "nostrud", "exercitation", "ullamco",
    "laboris", "aliquip", "duis", "aute", "irure",
};

/* Number of entries in kWords */
static const unsigned kNbWords = sizeof kWords / sizeof kWords[0];
/* Simple one-dimensional distribution: a word's weight is looked up by its
 * length, and every length past the end of the table shares the last entry.
 * Short words get the larger weights, so they are drawn more often. */
static const int kWeights[] = { 0, 8, 6, 4, 3, 2 };
static const size_t kNbWeights = sizeof(kWeights) / sizeof(kWeights[0]);

/* Capacity of the flattened distribution table */
#define DISTRIB_SIZE_MAX 650
/* g_distrib stores word indices, each repeated as many times as its weight.
 * g_distribCount is the number of meaningful entries in g_distrib. */
static int g_distrib[DISTRIB_SIZE_MAX] = { 0 };
static unsigned g_distribCount = 0;

/* Weight of a single word: indexed by its length, clamped to the last entry
 * of `weights` for longer words. `nbWeights` must be at least 1. */
static int wordWeight(const char* word, const int* weights, size_t nbWeights)
{
    size_t const length = strlen(word);
    size_t const slot   = (length < nbWeights) ? length : nbWeights - 1;
    return weights[slot];
}

/* Sums the weights of all words into g_distribCount
 * and checks (debug builds only) that the total fits into g_distrib. */
static void countFreqs(
        const char* words[],
        size_t nbWords,
        const int* weights,
        size_t nbWeights)
{
    unsigned sum = 0;
    size_t n;
    for (n = 0; n < nbWords; n++) {
        sum += (unsigned)wordWeight(words[n], weights, nbWeights);
    }
    g_distribCount = sum;
    assert(g_distribCount <= DISTRIB_SIZE_MAX);
}

/* Builds the selection table: word index n occupies `weight(n)` consecutive
 * slots of g_distrib, so a uniform pick among the first g_distribCount slots
 * selects words proportionally to their weight. */
static void init_word_distrib(
        const char* words[],
        size_t nbWords,
        const int* weights,
        size_t nbWeights)
{
    size_t slot = 0;
    size_t n;
    countFreqs(words, nbWords, weights, nbWeights);
    for (n = 0; n < nbWords; n++) {
        int copies;
        for (copies = wordWeight(words[n], weights, nbWeights); copies > 0;
             copies--) {
            g_distrib[slot] = (int)n;
            slot++;
        }
    }
}
/* Note: this unit only works when invoked sequentially.
 * No concurrent access is allowed */
static char* g_ptr        = NULL;     /* output buffer currently being filled */
static size_t g_nbChars   = 0;        /* nb of characters written so far */
static size_t g_maxChars  = 10000000; /* nb of characters the buffer can hold */
static unsigned g_randRoot = 0;       /* state of the pseudo-random generator */

/* 32-bit left rotation, valid for 0 < r < 32.
 * Every parameter use is parenthesized so that compound expressions
 * (e.g. `a | b`, `n + 1`) expand with the intended precedence.
 * Note: `x` is still evaluated twice, so pass side-effect-free arguments. */
#define RDG_rotl32(x, r) (((x) << (r)) | ((x) >> (32 - (r))))

/* LOREM_rand():
 * Advances the generator state (multiply, xor, rotate) and scales the new
 * 32-bit state down to the requested range.
 * @range : exclusive upper bound of the result
 * @return : a value in [0, range-1]; always 0 when range is 0 or 1.
 * Assumes `unsigned` is 32 bits wide (required by the rotation and by the
 * `>> 32` scaling step). */
static unsigned LOREM_rand(unsigned range)
{
    static const unsigned prime1 = 2654435761U;
    static const unsigned prime2 = 2246822519U;
    unsigned rand32              = g_randRoot;
    rand32 *= prime1;
    rand32 ^= prime2;
    rand32     = RDG_rotl32(rand32, 13);
    g_randRoot = rand32;
    /* multiply-then-shift maps the 32-bit state onto [0, range) without a
     * division */
    return (unsigned)(((unsigned long long)rand32 * range) >> 32);
}
- static void writeLastCharacters(void)
- {
- size_t lastChars = g_maxChars - g_nbChars;
- assert(g_maxChars >= g_nbChars);
- if (lastChars == 0)
- return;
- g_ptr[g_nbChars++] = '.';
- if (lastChars > 2) {
- memset(g_ptr + g_nbChars, ' ', lastChars - 2);
- }
- if (lastChars > 1) {
- g_ptr[g_maxChars - 1] = '\n';
- }
- g_nbChars = g_maxChars;
- }
- static void generateWord(const char* word, const char* separator, int upCase)
- {
- size_t const len = strlen(word) + strlen(separator);
- if (g_nbChars + len > g_maxChars) {
- writeLastCharacters();
- return;
- }
- memcpy(g_ptr + g_nbChars, word, strlen(word));
- if (upCase) {
- static const char toUp = 'A' - 'a';
- g_ptr[g_nbChars] = (char)(g_ptr[g_nbChars] + toUp);
- }
- g_nbChars += strlen(word);
- memcpy(g_ptr + g_nbChars, separator, strlen(separator));
- g_nbChars += strlen(separator);
- }
/* about():
 * Draws two values with LOREM_rand(target) and returns their sum plus one,
 * so for target >= 1 the result lies in [1, 2*target - 1]. */
static int about(unsigned target)
{
    unsigned const firstDraw  = LOREM_rand(target);
    unsigned const secondDraw = LOREM_rand(target);
    return (int)(firstDraw + secondDraw + 1);
}
- /* Function to generate a random sentence */
- static void generateSentence(int nbWords)
- {
- int commaPos = about(9);
- int comma2 = commaPos + about(7);
- int qmark = (LOREM_rand(11) == 7);
- const char* endSep = qmark ? "? " : ". ";
- int i;
- for (i = 0; i < nbWords; i++) {
- int const wordID = g_distrib[LOREM_rand(g_distribCount)];
- const char* const word = kWords[wordID];
- const char* sep = " ";
- if (i == commaPos)
- sep = ", ";
- if (i == comma2)
- sep = ", ";
- if (i == nbWords - 1)
- sep = endSep;
- generateWord(word, sep, i == 0);
- }
- }
- static void generateParagraph(int nbSentences)
- {
- int i;
- for (i = 0; i < nbSentences; i++) {
- int wordsPerSentence = about(11);
- generateSentence(wordsPerSentence);
- }
- if (g_nbChars < g_maxChars) {
- g_ptr[g_nbChars++] = '\n';
- }
- if (g_nbChars < g_maxChars) {
- g_ptr[g_nbChars++] = '\n';
- }
- }
- /* It's "common" for lorem ipsum generators to start with the same first
- * pre-defined sentence */
- static void generateFirstSentence(void)
- {
- int i;
- for (i = 0; i < 18; i++) {
- const char* word = kWords[i];
- const char* separator = " ";
- if (i == 4)
- separator = ", ";
- if (i == 7)
- separator = ", ";
- generateWord(word, separator, i == 0);
- }
- generateWord(kWords[18], ". ", 0);
- }
- size_t
- LOREM_genBlock(void* buffer, size_t size, unsigned seed, int first, int fill)
- {
- g_ptr = (char*)buffer;
- assert(size < INT_MAX);
- g_maxChars = size;
- g_nbChars = 0;
- g_randRoot = seed;
- if (g_distribCount == 0) {
- init_word_distrib(kWords, kNbWords, kWeights, kNbWeights);
- }
- if (first) {
- generateFirstSentence();
- }
- while (g_nbChars < g_maxChars) {
- int sentencePerParagraph = about(7);
- generateParagraph(sentencePerParagraph);
- if (!fill)
- break; /* only generate one paragraph in not-fill mode */
- }
- g_ptr = NULL;
- return g_nbChars;
- }
/* LOREM_genBuffer():
 * Convenience wrapper around LOREM_genBlock(): fills the whole `buffer`
 * (`size` bytes), starting with the fixed opening sentence.
 * The number of bytes written is not reported. */
void LOREM_genBuffer(void* buffer, size_t size, unsigned seed)
{
    int const startWithFirstSentence = 1;
    int const fillEntireBuffer       = 1;
    (void)LOREM_genBlock(
            buffer, size, seed, startWithFirstSentence, fillEntireBuffer);
}
|