mirror of
				https://github.com/facebook/zstd.git
				synced 2025-10-31 08:37:43 +02:00 
			
		
		
		
	increase vocabulary size
Makes compression slightly less effective, and hence a bit more comparable with real text (though the output is still too easy to compress). Level 6 is now stronger than level 4, by a hair. However, there is still a ratio dip at level 5.
This commit is contained in:
		
							
								
								
									
										7
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										7
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							| @@ -39,12 +39,15 @@ buck-out/ | ||||
| build-* | ||||
| *.gcda | ||||
|  | ||||
| # IDE | ||||
| .clang_complete | ||||
| compile_flags.txt | ||||
| .clang-format | ||||
|  | ||||
| # Other files | ||||
| .directory | ||||
| _codelite/ | ||||
| _zstdbench/ | ||||
| .clang_complete | ||||
| compile_flags.txt | ||||
| *.idea | ||||
| *.swp | ||||
| .DS_Store | ||||
|   | ||||
							
								
								
									
										257
									
								
								programs/lorem.c
									
									
									
									
									
								
							
							
						
						
									
										257
									
								
								programs/lorem.c
									
									
									
									
									
								
							| @@ -8,7 +8,6 @@ | ||||
|  * You may select, at your option, one of the above-listed licenses. | ||||
|  */ | ||||
|  | ||||
|  | ||||
| /* Implementation notes: | ||||
|  * | ||||
|  * This is a very simple lorem ipsum generator | ||||
| @@ -32,83 +31,97 @@ | ||||
|  */ | ||||
|  | ||||
| #include "lorem.h" | ||||
| #include <string.h>  /* memcpy */ | ||||
| #include <limits.h>  /* INT_MAX */ | ||||
| #include <assert.h> | ||||
| #include <limits.h> /* INT_MAX */ | ||||
| #include <string.h> /* memcpy */ | ||||
|  | ||||
| #define WORD_MAX_SIZE 20 | ||||
|  | ||||
| /* Define the word pool */ | ||||
| static const char *words[] = { | ||||
|     "lorem",       "ipsum",      "dolor",      "sit",          "amet", | ||||
|     "consectetur", "adipiscing", "elit",       "sed",          "do", | ||||
|     "eiusmod",     "tempor",     "incididunt", "ut",           "labore", | ||||
|     "et",          "dolore",     "magna",      "aliqua",       "dis", | ||||
|     "lectus",      "vestibulum", "mattis",     "ullamcorper",  "velit", | ||||
|     "commodo",     "a",          "lacus",      "arcu",         "magnis", | ||||
|     "parturient",  "montes",     "nascetur",   "ridiculus",    "mus", | ||||
|     "mauris",      "nulla",      "malesuada",  "pellentesque", "eget", | ||||
|     "gravida",     "in",         "dictum",     "non",          "erat", | ||||
|     "nam",         "voluptat",   "maecenas",   "blandit",      "aliquam", | ||||
|     "etiam",       "enim",       "lobortis",   "scelerisque",  "fermentum", | ||||
|     "dui",         "faucibus",   "ornare",     "at",           "elementum", | ||||
|     "eu",          "facilisis",  "odio",       "morbi",        "quis", | ||||
|     "eros",        "donec",      "ac",         "orci",         "purus", | ||||
|     "turpis",      "cursus",     "leo",        "vel",          "porta"}; | ||||
| static const char* words[] = { | ||||
|     "lorem",        "ipsum",      "dolor",       "sit",          "amet", | ||||
|     "consectetur",  "adipiscing", "elit",        "sed",          "do", | ||||
|     "eiusmod",      "tempor",     "incididunt",  "ut",           "labore", | ||||
|     "et",           "dolore",     "magna",       "aliqua",       "dis", | ||||
|     "lectus",       "vestibulum", "mattis",      "ullamcorper",  "velit", | ||||
|     "commodo",      "a",          "lacus",       "arcu",         "magnis", | ||||
|     "parturient",   "montes",     "nascetur",    "ridiculus",    "mus", | ||||
|     "mauris",       "nulla",      "malesuada",   "pellentesque", "eget", | ||||
|     "gravida",      "in",         "dictum",      "non",          "erat", | ||||
|     "nam",          "voluptat",   "maecenas",    "blandit",      "aliquam", | ||||
|     "etiam",        "enim",       "lobortis",    "scelerisque",  "fermentum", | ||||
|     "dui",          "faucibus",   "ornare",      "at",           "elementum", | ||||
|     "eu",           "facilisis",  "odio",        "morbi",        "quis", | ||||
|     "eros",         "donec",      "ac",          "orci",         "purus", | ||||
|     "turpis",       "cursus",     "leo",         "vel",          "porta", | ||||
|     "consequat",    "interdum",   "varius",      "vulputate",    "aliquet", | ||||
|     "pharetra",     "nunc",       "auctor",      "urna",         "id", | ||||
|     "metus",        "viverra",    "nibh",        "cras",         "mi", | ||||
|     "unde",         "omnis",      "iste",        "natus",        "error", | ||||
|     "perspiciatis", "voluptatem", "accusantium", "doloremque",   "laudantium", | ||||
|     "totam",        "rem",        "aperiam",     "eaque",        "ipsa", | ||||
|     "quae",         "ab",         "illo",        "inventore",    "veritatis", | ||||
|     "quasi",        "architecto", "beatae",      "vitae",        "dicta", | ||||
|     "sunt",         "explicabo",  "nemo",        "ipsam",        "quia", | ||||
|     "voluptas",     "aspernatur", "aut",         "odit",         "fugit" | ||||
| }; | ||||
|  | ||||
| /* simple distribution that favors small words : | ||||
| /* simple 1-dimension distribution that favors small words : | ||||
|  * 1 letter : weight 3 | ||||
|  * 2-3 letters : weight 2 | ||||
|  * 4+ letters : weight 1 | ||||
|  * This is expected to be a bit more difficult to compress */ | ||||
|  */ | ||||
| static const int distrib[] = { | ||||
|     0, 1, 2, 3, 3, 4, 5, 6, 7, 8, | ||||
|     8,9, 9, 10, 11, 12, 13, 13, 14, 15, | ||||
|     15, 16, 17, 18, 19, 19, 20, 21, 22, 23, | ||||
|     24, 25, 26, 26, 26, 27, 28, 29, 30, 31, | ||||
|     32, 33, 34, 34, 35, 36, 37, 38, 39, 40, | ||||
|     41, 41, 42, 43, 43, 44, 45, 45, 46, 47, | ||||
|     48, 49, 50, 51, 52, 53, 54, 55, 55, 56, | ||||
|     57, 58, 58, 59, 60, 60, 61, 62, 63, 64, | ||||
|     65, 66, 67, 67, 68, 69, 70, 71, 72, 72, | ||||
|     73, 73, 74 }; | ||||
|     0,   1,   2,   3,   3,   4,   5,   6,   7,   8,   8,   9,   9,   10,  11, | ||||
|     12,  13,  13,  14,  15,  15,  16,  17,  18,  19,  19,  20,  21,  22,  23, | ||||
|     24,  25,  26,  26,  26,  27,  28,  29,  30,  31,  32,  33,  34,  34,  35, | ||||
|     36,  37,  38,  39,  40,  41,  41,  42,  43,  43,  44,  45,  45,  46,  47, | ||||
|     48,  49,  50,  51,  52,  53,  54,  55,  55,  56,  57,  58,  58,  59,  60, | ||||
|     60,  61,  62,  63,  64,  65,  66,  67,  67,  68,  69,  70,  71,  72,  72, | ||||
|     73,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,  84,  84,  85, | ||||
|     86,  87,  88,  89,  89,  90,  91,  92,  93,  94,  95,  96,  97,  98,  99, | ||||
|     100, 101, 101, 102, 103, 104, 105, 106, 106, 107, 108, 109, 110, 111, 112, | ||||
|     113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, | ||||
| }; | ||||
| static const unsigned distribCount = sizeof(distrib) / sizeof(distrib[0]); | ||||
|  | ||||
| /* Note: this unit only works when invoked sequentially. | ||||
|  * No concurrent access is allowed */ | ||||
| static char *g_ptr = NULL; | ||||
| static size_t g_nbChars = 0; | ||||
| static size_t g_maxChars = 10000000; | ||||
| static char* g_ptr         = NULL; | ||||
| static size_t g_nbChars    = 0; | ||||
| static size_t g_maxChars   = 10000000; | ||||
| static unsigned g_randRoot = 0; | ||||
|  | ||||
| #define RDG_rotl32(x, r) ((x << r) | (x >> (32 - r))) | ||||
| static unsigned LOREM_rand(unsigned range) { | ||||
|   static const unsigned prime1 = 2654435761U; | ||||
|   static const unsigned prime2 = 2246822519U; | ||||
|   unsigned rand32 = g_randRoot; | ||||
|   rand32 *= prime1; | ||||
|   rand32 ^= prime2; | ||||
|   rand32 = RDG_rotl32(rand32, 13); | ||||
|   g_randRoot = rand32; | ||||
|   return (unsigned)(((unsigned long long)rand32 * range) >> 32); | ||||
| static unsigned LOREM_rand(unsigned range) | ||||
| { | ||||
|     static const unsigned prime1 = 2654435761U; | ||||
|     static const unsigned prime2 = 2246822519U; | ||||
|     unsigned rand32              = g_randRoot; | ||||
|     rand32 *= prime1; | ||||
|     rand32 ^= prime2; | ||||
|     rand32     = RDG_rotl32(rand32, 13); | ||||
|     g_randRoot = rand32; | ||||
|     return (unsigned)(((unsigned long long)rand32 * range) >> 32); | ||||
| } | ||||
|  | ||||
| static void writeLastCharacters(void) { | ||||
|   size_t lastChars = g_maxChars - g_nbChars; | ||||
|   assert(g_maxChars >= g_nbChars); | ||||
|   if (lastChars == 0) | ||||
|     return; | ||||
|   g_ptr[g_nbChars++] = '.'; | ||||
|   if (lastChars > 2) { | ||||
|     memset(g_ptr + g_nbChars, ' ', lastChars - 2); | ||||
|   } | ||||
|   if (lastChars > 1) { | ||||
|     g_ptr[g_maxChars-1] = '\n'; | ||||
|   } | ||||
|   g_nbChars = g_maxChars; | ||||
| static void writeLastCharacters(void) | ||||
| { | ||||
|     size_t lastChars = g_maxChars - g_nbChars; | ||||
|     assert(g_maxChars >= g_nbChars); | ||||
|     if (lastChars == 0) | ||||
|         return; | ||||
|     g_ptr[g_nbChars++] = '.'; | ||||
|     if (lastChars > 2) { | ||||
|         memset(g_ptr + g_nbChars, ' ', lastChars - 2); | ||||
|     } | ||||
|     if (lastChars > 1) { | ||||
|         g_ptr[g_maxChars - 1] = '\n'; | ||||
|     } | ||||
|     g_nbChars = g_maxChars; | ||||
| } | ||||
|  | ||||
| static void generateWord(const char *word, const char *separator, int upCase) | ||||
| static void generateWord(const char* word, const char* separator, int upCase) | ||||
| { | ||||
|     size_t const len = strlen(word) + strlen(separator); | ||||
|     if (g_nbChars + len > g_maxChars) { | ||||
| @@ -118,90 +131,92 @@ static void generateWord(const char *word, const char *separator, int upCase) | ||||
|     memcpy(g_ptr + g_nbChars, word, strlen(word)); | ||||
|     if (upCase) { | ||||
|         static const char toUp = 'A' - 'a'; | ||||
|         g_ptr[g_nbChars] = (char)(g_ptr[g_nbChars] + toUp); | ||||
|         g_ptr[g_nbChars]       = (char)(g_ptr[g_nbChars] + toUp); | ||||
|     } | ||||
|     g_nbChars += strlen(word); | ||||
|     memcpy(g_ptr + g_nbChars, separator, strlen(separator)); | ||||
|     g_nbChars += strlen(separator); | ||||
| } | ||||
|  | ||||
| static int about(unsigned target) { | ||||
|   return (int)(LOREM_rand(target) + LOREM_rand(target) + 1); | ||||
| static int about(unsigned target) | ||||
| { | ||||
|     return (int)(LOREM_rand(target) + LOREM_rand(target) + 1); | ||||
| } | ||||
|  | ||||
| /* Function to generate a random sentence */ | ||||
| static void generateSentence(int nbWords) { | ||||
|   int commaPos = about(9); | ||||
|   int comma2 = commaPos + about(7); | ||||
|   int i; | ||||
|   for (i = 0; i < nbWords; i++) { | ||||
|     int const wordID = distrib[LOREM_rand(distribCount)]; | ||||
|     const char *const word = words[wordID]; | ||||
|     const char* sep = " "; | ||||
|     if (i == commaPos) | ||||
|       sep = ", "; | ||||
|     if (i == comma2) | ||||
|       sep = ", "; | ||||
|     if (i == nbWords - 1) | ||||
|       sep = ". "; | ||||
|     generateWord(word, sep, i==0); | ||||
|   } | ||||
| static void generateSentence(int nbWords) | ||||
| { | ||||
|     int commaPos = about(9); | ||||
|     int comma2   = commaPos + about(7); | ||||
|     int i; | ||||
|     for (i = 0; i < nbWords; i++) { | ||||
|         int const wordID       = distrib[LOREM_rand(distribCount)]; | ||||
|         const char* const word = words[wordID]; | ||||
|         const char* sep        = " "; | ||||
|         if (i == commaPos) | ||||
|             sep = ", "; | ||||
|         if (i == comma2) | ||||
|             sep = ", "; | ||||
|         if (i == nbWords - 1) | ||||
|             sep = ". "; | ||||
|         generateWord(word, sep, i == 0); | ||||
|     } | ||||
| } | ||||
|  | ||||
| static void generateParagraph(int nbSentences) { | ||||
|   int i; | ||||
|   for (i = 0; i < nbSentences; i++) { | ||||
|     int wordsPerSentence = about(8); | ||||
|     generateSentence(wordsPerSentence); | ||||
|   } | ||||
|   if (g_nbChars < g_maxChars) { | ||||
|     g_ptr[g_nbChars++] = '\n'; | ||||
|   } | ||||
|   if (g_nbChars < g_maxChars) { | ||||
|     g_ptr[g_nbChars++] = '\n'; | ||||
|   } | ||||
| static void generateParagraph(int nbSentences) | ||||
| { | ||||
|     int i; | ||||
|     for (i = 0; i < nbSentences; i++) { | ||||
|         int wordsPerSentence = about(8); | ||||
|         generateSentence(wordsPerSentence); | ||||
|     } | ||||
|     if (g_nbChars < g_maxChars) { | ||||
|         g_ptr[g_nbChars++] = '\n'; | ||||
|     } | ||||
|     if (g_nbChars < g_maxChars) { | ||||
|         g_ptr[g_nbChars++] = '\n'; | ||||
|     } | ||||
| } | ||||
|  | ||||
| /* It's "common" for lorem ipsum generators to start with the same first | ||||
|  * pre-defined sentence */ | ||||
| static void generateFirstSentence(void) { | ||||
|   int i; | ||||
|   for (i = 0; i < 18; i++) { | ||||
|     const char *word = words[i]; | ||||
|     const char *separator = " "; | ||||
|     if (i == 4) | ||||
|       separator = ", "; | ||||
|     if (i == 7) | ||||
|       separator = ", "; | ||||
|     generateWord(word, separator, i==0); | ||||
|   } | ||||
|   generateWord(words[18], ". ", 0); | ||||
| static void generateFirstSentence(void) | ||||
| { | ||||
|     int i; | ||||
|     for (i = 0; i < 18; i++) { | ||||
|         const char* word      = words[i]; | ||||
|         const char* separator = " "; | ||||
|         if (i == 4) | ||||
|             separator = ", "; | ||||
|         if (i == 7) | ||||
|             separator = ", "; | ||||
|         generateWord(word, separator, i == 0); | ||||
|     } | ||||
|     generateWord(words[18], ". ", 0); | ||||
| } | ||||
|  | ||||
| size_t LOREM_genBlock(void* buffer, size_t size, | ||||
|                       unsigned seed, | ||||
|                       int first, int fill) | ||||
| size_t | ||||
| LOREM_genBlock(void* buffer, size_t size, unsigned seed, int first, int fill) | ||||
| { | ||||
|   g_ptr = (char*)buffer; | ||||
|   assert(size < INT_MAX); | ||||
|   g_maxChars = size; | ||||
|   g_nbChars = 0; | ||||
|   g_randRoot = seed; | ||||
|   if (first) { | ||||
|     generateFirstSentence(); | ||||
|   } | ||||
|   while (g_nbChars < g_maxChars) { | ||||
|     int sentencePerParagraph = about(7); | ||||
|     generateParagraph(sentencePerParagraph); | ||||
|     if (!fill) | ||||
|       break; /* only generate one paragraph in not-fill mode */ | ||||
|   } | ||||
|   g_ptr = NULL; | ||||
|   return g_nbChars; | ||||
|     g_ptr = (char*)buffer; | ||||
|     assert(size < INT_MAX); | ||||
|     g_maxChars = size; | ||||
|     g_nbChars  = 0; | ||||
|     g_randRoot = seed; | ||||
|     if (first) { | ||||
|         generateFirstSentence(); | ||||
|     } | ||||
|     while (g_nbChars < g_maxChars) { | ||||
|         int sentencePerParagraph = about(7); | ||||
|         generateParagraph(sentencePerParagraph); | ||||
|         if (!fill) | ||||
|             break; /* only generate one paragraph in not-fill mode */ | ||||
|     } | ||||
|     g_ptr = NULL; | ||||
|     return g_nbChars; | ||||
| } | ||||
|  | ||||
| void LOREM_genBuffer(void* buffer, size_t size, unsigned seed) | ||||
| { | ||||
|   LOREM_genBlock(buffer, size, seed, 1, 1); | ||||
|     LOREM_genBlock(buffer, size, seed, 1, 1); | ||||
| } | ||||
|  | ||||
|   | ||||
		Reference in New Issue
	
	Block a user