/* * Copyright (c) 2017-2020, Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the * LICENSE file in the root directory of this source tree) and the GPLv2 (found * in the COPYING file in the root directory of this source tree). * You may select, at your option, one of the above-listed licenses. */ #include #include #include #include #include #include #include "util.h" #include "timefn.h" /* UTIL_clockSpanMicro, SEC_TO_MICRO, UTIL_TIME_INITIALIZER */ #include "zstd.h" #include "zstd_internal.h" #include "mem.h" #define ZDICT_STATIC_LINKING_ONLY #include "zdict.h" /* Direct access to internal compression functions is required */ #include "zstd_compress.c" #define XXH_STATIC_LINKING_ONLY #include "xxhash.h" /* XXH64 */ #ifndef MIN #define MIN(a, b) ((a) < (b) ? (a) : (b)) #endif #ifndef MAX_PATH #ifdef PATH_MAX #define MAX_PATH PATH_MAX #else #define MAX_PATH 256 #endif #endif /*-************************************ * DISPLAY Macros **************************************/ #define DISPLAY(...) fprintf(stderr, __VA_ARGS__) #define DISPLAYLEVEL(l, ...) if (g_displayLevel>=l) { DISPLAY(__VA_ARGS__); } static U32 g_displayLevel = 2; #define DISPLAYUPDATE(...) \ do { \ if ((UTIL_clockSpanMicro(g_displayClock) > g_refreshRate) || \ (g_displayLevel >= 4)) { \ g_displayClock = UTIL_getTime(); \ DISPLAY(__VA_ARGS__); \ if (g_displayLevel >= 4) fflush(stderr); \ } \ } while (0) static const U64 g_refreshRate = SEC_TO_MICRO / 6; static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER; #define CHECKERR(code) \ do { \ if (ZSTD_isError(code)) { \ DISPLAY("Error occurred while generating data: %s\n", \ ZSTD_getErrorName(code)); \ exit(1); \ } \ } while (0) /*-******************************************************* * Random function *********************************************************/ static U32 RAND(U32* src) { #define RAND_rotl32(x,r) ((x << r) | (x >> (32 - r))) static const U32 prime1 = 2654435761U; static const U32 prime2 = 2246822519U; U32 rand32 = *src; rand32 *= prime1; rand32 += prime2; rand32 = RAND_rotl32(rand32, 13); *src = rand32; return RAND_rotl32(rand32, 27); #undef RAND_rotl32 } #define DISTSIZE (8192) /* Write `size` bytes into `ptr`, all of which are less than or equal to `maxSymb` */ static void RAND_bufferMaxSymb(U32* seed, void* ptr, size_t size, int maxSymb) { size_t i; BYTE* op = ptr; for (i = 0; i < size; i++) { op[i] = (BYTE) (RAND(seed) % (maxSymb + 1)); } } /* Write `size` random bytes into `ptr` */ static void RAND_buffer(U32* seed, void* ptr, size_t size) { size_t i; BYTE* op = ptr; for (i = 0; i + 4 <= size; i += 4) { MEM_writeLE32(op + i, RAND(seed)); } for (; i < size; i++) { op[i] = RAND(seed) & 0xff; } } /* Write `size` bytes into `ptr` following the distribution `dist` */ static void RAND_bufferDist(U32* seed, BYTE* dist, void* ptr, size_t size) { size_t i; BYTE* op = ptr; for (i = 0; i < size; i++) { op[i] = dist[RAND(seed) % DISTSIZE]; } } /* Generate a random distribution where the frequency of each symbol follows a * geometric distribution defined by `weight` * `dist` should have size at least `DISTSIZE` */ static void RAND_genDist(U32* seed, BYTE* dist, double weight) { size_t i = 0; size_t statesLeft = DISTSIZE; BYTE symb = (BYTE) (RAND(seed) % 256); BYTE step = (BYTE) ((RAND(seed) % 256) | 1); /* force it to be odd so it's relatively prime to 256 */ while (i < DISTSIZE) { size_t states = ((size_t)(weight * statesLeft)) + 1; size_t j; for (j = 0; j < states && i < DISTSIZE; j++, i++) { dist[i] = symb; } symb += step; statesLeft -= states; } } /* Generates a random number in the range [min, max) */ static inline U32 RAND_range(U32* seed, U32 min, U32 max) { return (RAND(seed) % (max-min)) + min; } #define ROUND(x) ((U32)(x + 0.5)) /* Generates a random number in an exponential distribution with mean `mean` */ static double RAND_exp(U32* seed, double mean) { double const u = RAND(seed) / (double) UINT_MAX; return log(1-u) * (-mean); } /*-******************************************************* * Constants and Structs *********************************************************/ const char *BLOCK_TYPES[] = {"raw", "rle", "compressed"}; #define MAX_DECOMPRESSED_SIZE_LOG 20 #define MAX_DECOMPRESSED_SIZE (1ULL << MAX_DECOMPRESSED_SIZE_LOG) #define MAX_WINDOW_LOG 22 /* Recommended support is 8MB, so limit to 4MB + mantissa */ #define MIN_SEQ_LEN (3) #define MAX_NB_SEQ ((ZSTD_BLOCKSIZE_MAX + MIN_SEQ_LEN - 1) / MIN_SEQ_LEN) BYTE CONTENT_BUFFER[MAX_DECOMPRESSED_SIZE]; BYTE FRAME_BUFFER[MAX_DECOMPRESSED_SIZE * 2]; BYTE LITERAL_BUFFER[ZSTD_BLOCKSIZE_MAX]; seqDef SEQUENCE_BUFFER[MAX_NB_SEQ]; BYTE SEQUENCE_LITERAL_BUFFER[ZSTD_BLOCKSIZE_MAX]; /* storeSeq expects a place to copy literals to */ BYTE SEQUENCE_LLCODE[ZSTD_BLOCKSIZE_MAX]; BYTE SEQUENCE_MLCODE[ZSTD_BLOCKSIZE_MAX]; BYTE SEQUENCE_OFCODE[ZSTD_BLOCKSIZE_MAX]; unsigned WKSP[HUF_WORKSPACE_SIZE_U32]; typedef struct { size_t contentSize; /* 0 means unknown (unless contentSize == windowSize == 0) */ unsigned windowSize; /* contentSize >= windowSize means single segment */ } frameHeader_t; /* For repeat modes */ typedef struct { U32 rep[ZSTD_REP_NUM]; int hufInit; /* the distribution used in the previous block for repeat mode */ BYTE hufDist[DISTSIZE]; HUF_CElt hufTable [256]; int fseInit; FSE_CTable offcodeCTable [FSE_CTABLE_SIZE_U32(OffFSELog, MaxOff)]; FSE_CTable matchlengthCTable[FSE_CTABLE_SIZE_U32(MLFSELog, MaxML)]; FSE_CTable litlengthCTable [FSE_CTABLE_SIZE_U32(LLFSELog, MaxLL)]; /* Symbols that were present in the previous distribution, for use with * set_repeat */ BYTE litlengthSymbolSet[36]; BYTE offsetSymbolSet[29]; BYTE matchlengthSymbolSet[53]; } cblockStats_t; typedef struct { void* data; void* dataStart; void* dataEnd; void* src; void* srcStart; void* srcEnd; frameHeader_t header; cblockStats_t stats; cblockStats_t oldStats; /* so they can be rolled back if uncompressible */ } frame_t; typedef struct { int useDict; U32 dictID; size_t dictContentSize; BYTE* dictContent; } dictInfo; typedef enum { gt_frame = 0, /* generate frames */ gt_block, /* generate compressed blocks without block/frame headers */ } genType_e; /*-******************************************************* * Global variables (set from command line) *********************************************************/ U32 g_maxDecompressedSizeLog = MAX_DECOMPRESSED_SIZE_LOG; /* <= 20 */ U32 g_maxBlockSize = ZSTD_BLOCKSIZE_MAX; /* <= 128 KB */ /*-******************************************************* * Generator Functions *********************************************************/ struct { int contentSize; /* force the content size to be present */ } opts; /* advanced options on generation */ /* Generate and write a random frame header */ static void writeFrameHeader(U32* seed, frame_t* frame, dictInfo info) { BYTE* const op = frame->data; size_t pos = 0; frameHeader_t fh; BYTE windowByte = 0; int singleSegment = 0; int contentSizeFlag = 0; int fcsCode = 0; memset(&fh, 0, sizeof(fh)); /* generate window size */ { /* Follow window algorithm from specification */ int const exponent = RAND(seed) % (MAX_WINDOW_LOG - 10); int const mantissa = RAND(seed) % 8; windowByte = (BYTE) ((exponent << 3) | mantissa); fh.windowSize = (1U << (exponent + 10)); fh.windowSize += fh.windowSize / 8 * mantissa; } { /* Generate random content size */ size_t highBit; if (RAND(seed) & 7 && g_maxDecompressedSizeLog > 7) { /* do content of at least 128 bytes */ highBit = 1ULL << RAND_range(seed, 7, g_maxDecompressedSizeLog); } else if (RAND(seed) & 3) { /* do small content */ highBit = 1ULL << RAND_range(seed, 0, MIN(7, 1U << g_maxDecompressedSizeLog)); } else { /* 0 size frame */ highBit = 0; } fh.contentSize = highBit ? highBit + (RAND(seed) % highBit) : 0; /* provide size sometimes */ contentSizeFlag = opts.contentSize | (RAND(seed) & 1); if (contentSizeFlag && (fh.contentSize == 0 || !(RAND(seed) & 7))) { /* do single segment sometimes */ fh.windowSize = (U32) fh.contentSize; singleSegment = 1; } } if (contentSizeFlag) { /* Determine how large fcs field has to be */ int minFcsCode = (fh.contentSize >= 256) + (fh.contentSize >= 65536 + 256) + (fh.contentSize > 0xFFFFFFFFU); if (!singleSegment && !minFcsCode) { minFcsCode = 1; } fcsCode = minFcsCode + (RAND(seed) % (4 - minFcsCode)); if (fcsCode == 1 && fh.contentSize < 256) fcsCode++; } /* write out the header */ MEM_writeLE32(op + pos, ZSTD_MAGICNUMBER); pos += 4; { /* * fcsCode: 2-bit flag specifying how many bytes used to represent Frame_Content_Size (bits 7-6) * singleSegment: 1-bit flag describing if data must be regenerated within a single continuous memory segment. (bit 5) * contentChecksumFlag: 1-bit flag that is set if frame includes checksum at the end -- set to 1 below (bit 2) * dictBits: 2-bit flag describing how many bytes Dictionary_ID uses -- set to 3 (bits 1-0) * For more information: https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#frame_header */ int const dictBits = info.useDict ? 3 : 0; BYTE const frameHeaderDescriptor = (BYTE) ((fcsCode << 6) | (singleSegment << 5) | (1 << 2) | dictBits); op[pos++] = frameHeaderDescriptor; } if (!singleSegment) { op[pos++] = windowByte; } if (info.useDict) { MEM_writeLE32(op + pos, (U32) info.dictID); pos += 4; } if (contentSizeFlag) { switch (fcsCode) { default: /* Impossible */ case 0: op[pos++] = (BYTE) fh.contentSize; break; case 1: MEM_writeLE16(op + pos, (U16) (fh.contentSize - 256)); pos += 2; break; case 2: MEM_writeLE32(op + pos, (U32) fh.contentSize); pos += 4; break; case 3: MEM_writeLE64(op + pos, (U64) fh.contentSize); pos += 8; break; } } DISPLAYLEVEL(3, " frame content size:\t%u\n", (unsigned)fh.contentSize); DISPLAYLEVEL(3, " frame window size:\t%u\n", fh.windowSize); DISPLAYLEVEL(3, " content size flag:\t%d\n", contentSizeFlag); DISPLAYLEVEL(3, " single segment flag:\t%d\n", singleSegment); frame->data = op + pos; frame->header = fh; } /* Write a literal block in either raw or RLE form, return the literals size */ static size_t writeLiteralsBlockSimple(U32* seed, frame_t* frame, size_t contentSize) { BYTE* op = (BYTE*)frame->data; int const type = RAND(seed) % 2; int const sizeFormatDesc = RAND(seed) % 8; size_t litSize; size_t maxLitSize = MIN(contentSize, g_maxBlockSize); if (sizeFormatDesc == 0) { /* Size_FormatDesc = ?0 */ maxLitSize = MIN(maxLitSize, 31); } else if (sizeFormatDesc <= 4) { /* Size_FormatDesc = 01 */ maxLitSize = MIN(maxLitSize, 4095); } else { /* Size_Format = 11 */ maxLitSize = MIN(maxLitSize, 1048575); } litSize = RAND(seed) % (maxLitSize + 1); if (frame->src == frame->srcStart && litSize == 0) { litSize = 1; /* no empty literals if there's nothing preceding this block */ } if (litSize + 3 > contentSize) { litSize = contentSize; /* no matches shorter than 3 are allowed */ } /* use smallest size format that fits */ if (litSize < 32) { op[0] = (type | (0 << 2) | (litSize << 3)) & 0xff; op += 1; } else if (litSize < 4096) { op[0] = (type | (1 << 2) | (litSize << 4)) & 0xff; op[1] = (litSize >> 4) & 0xff; op += 2; } else { op[0] = (type | (3 << 2) | (litSize << 4)) & 0xff; op[1] = (litSize >> 4) & 0xff; op[2] = (litSize >> 12) & 0xff; op += 3; } if (type == 0) { /* Raw literals */ DISPLAYLEVEL(4, " raw literals\n"); RAND_buffer(seed, LITERAL_BUFFER, litSize); memcpy(op, LITERAL_BUFFER, litSize); op += litSize; } else { /* RLE literals */ BYTE const symb = (BYTE) (RAND(seed) % 256); DISPLAYLEVEL(4, " rle literals: 0x%02x\n", (unsigned)symb); memset(LITERAL_BUFFER, symb, litSize); op[0] = symb; op++; } frame->data = op; return litSize; } /* Generate a Huffman header for the given source */ static size_t writeHufHeader(U32* seed, HUF_CElt* hufTable, void* dst, size_t dstSize, const void* src, size_t srcSize) { BYTE* const ostart = (BYTE*)dst; BYTE* op = ostart; unsigned huffLog = 11; unsigned maxSymbolValue = 255; unsigned count[HUF_SYMBOLVALUE_MAX+1]; /* Scan input and build symbol stats */ { size_t const largest = HIST_count_wksp (count, &maxSymbolValue, (const BYTE*)src, srcSize, WKSP, sizeof(WKSP)); assert(!HIST_isError(largest)); if (largest == srcSize) { *ostart = ((const BYTE*)src)[0]; return 0; } /* single symbol, rle */ if (largest <= (srcSize >> 7)+1) return 0; /* Fast heuristic : not compressible enough */ } /* Build Huffman Tree */ /* Max Huffman log is 11, min is highbit(maxSymbolValue)+1 */ huffLog = RAND_range(seed, ZSTD_highbit32(maxSymbolValue)+1, huffLog+1); DISPLAYLEVEL(6, " huffman log: %u\n", huffLog); { size_t const maxBits = HUF_buildCTable_wksp (hufTable, count, maxSymbolValue, huffLog, WKSP, sizeof(WKSP)); CHECKERR(maxBits); huffLog = (U32)maxBits; } /* Write table description header */ { size_t const hSize = HUF_writeCTable (op, dstSize, hufTable, maxSymbolValue, huffLog); if (hSize + 12 >= srcSize) return 0; /* not useful to try compression */ op += hSize; } return op - ostart; } /* Write a Huffman coded literals block and return the literals size */ static size_t writeLiteralsBlockCompressed(U32* seed, frame_t* frame, size_t contentSize) { BYTE* origop = (BYTE*)frame->data; BYTE* opend = (BYTE*)frame->dataEnd; BYTE* op; BYTE* const ostart = origop; int const sizeFormat = RAND(seed) % 4; size_t litSize; size_t hufHeaderSize = 0; size_t compressedSize = 0; size_t maxLitSize = MIN(contentSize-3, g_maxBlockSize); symbolEncodingType_e hType; if (contentSize < 64) { /* make sure we get reasonably-sized literals for compression */ return ERROR(GENERIC); } DISPLAYLEVEL(4, " compressed literals\n"); switch (sizeFormat) { case 0: /* fall through, size is the same as case 1 */ case 1: maxLitSize = MIN(maxLitSize, 1023); origop += 3; break; case 2: maxLitSize = MIN(maxLitSize, 16383); origop += 4; break; case 3: maxLitSize = MIN(maxLitSize, 262143); origop += 5; break; default:; /* impossible */ } do { op = origop; do { litSize = RAND(seed) % (maxLitSize + 1); } while (litSize < 32); /* avoid small literal sizes */ if (litSize + 3 > contentSize) { litSize = contentSize; /* no matches shorter than 3 are allowed */ } /* most of the time generate a new distribution */ if ((RAND(seed) & 3) || !frame->stats.hufInit) { do { if (RAND(seed) & 3) { /* add 10 to ensure some compressibility */ double const weight = ((RAND(seed) % 90) + 10) / 100.0; DISPLAYLEVEL(5, " distribution weight: %d%%\n", (int)(weight * 100)); RAND_genDist(seed, frame->stats.hufDist, weight); } else { /* sometimes do restricted range literals to force * non-huffman headers */ DISPLAYLEVEL(5, " small range literals\n"); RAND_bufferMaxSymb(seed, frame->stats.hufDist, DISTSIZE, 15); } RAND_bufferDist(seed, frame->stats.hufDist, LITERAL_BUFFER, litSize); /* generate the header from the distribution instead of the * actual data to avoid bugs with symbols that were in the * distribution but never showed up in the output */ hufHeaderSize = writeHufHeader( seed, frame->stats.hufTable, op, opend - op, frame->stats.hufDist, DISTSIZE); CHECKERR(hufHeaderSize); /* repeat until a valid header is written */ } while (hufHeaderSize == 0); op += hufHeaderSize; hType = set_compressed; frame->stats.hufInit = 1; } else { /* repeat the distribution/table from last time */ DISPLAYLEVEL(5, " huffman repeat stats\n"); RAND_bufferDist(seed, frame->stats.hufDist, LITERAL_BUFFER, litSize); hufHeaderSize = 0; hType = set_repeat; } do { compressedSize = sizeFormat == 0 ? HUF_compress1X_usingCTable( op, opend - op, LITERAL_BUFFER, litSize, frame->stats.hufTable) : HUF_compress4X_usingCTable( op, opend - op, LITERAL_BUFFER, litSize, frame->stats.hufTable); CHECKERR(compressedSize); /* this only occurs when it could not compress or similar */ } while (compressedSize <= 0); op += compressedSize; compressedSize += hufHeaderSize; DISPLAYLEVEL(5, " regenerated size: %u\n", (unsigned)litSize); DISPLAYLEVEL(5, " compressed size: %u\n", (unsigned)compressedSize); if (compressedSize >= litSize) { DISPLAYLEVEL(5, " trying again\n"); /* if we have to try again, reset the stats so we don't accidentally * try to repeat a distribution we just made */ frame->stats = frame->oldStats; } else { break; } } while (1); /* write header */ switch (sizeFormat) { case 0: /* fall through, size is the same as case 1 */ case 1: { U32 const header = hType | (sizeFormat << 2) | ((U32)litSize << 4) | ((U32)compressedSize << 14); MEM_writeLE24(ostart, header); break; } case 2: { U32 const header = hType | (sizeFormat << 2) | ((U32)litSize << 4) | ((U32)compressedSize << 18); MEM_writeLE32(ostart, header); break; } case 3: { U32 const header = hType | (sizeFormat << 2) | ((U32)litSize << 4) | ((U32)compressedSize << 22); MEM_writeLE32(ostart, header); ostart[4] = (BYTE)(compressedSize >> 10); break; } default:; /* impossible */ } frame->data = op; return litSize; } static size_t writeLiteralsBlock(U32* seed, frame_t* frame, size_t contentSize) { /* only do compressed for larger segments to avoid compressibility issues */ if (RAND(seed) & 7 && contentSize >= 64) { return writeLiteralsBlockCompressed(seed, frame, contentSize); } else { return writeLiteralsBlockSimple(seed, frame, contentSize); } } static inline void initSeqStore(seqStore_t *seqStore) { seqStore->maxNbSeq = MAX_NB_SEQ; seqStore->maxNbLit = ZSTD_BLOCKSIZE_MAX; seqStore->sequencesStart = SEQUENCE_BUFFER; seqStore->litStart = SEQUENCE_LITERAL_BUFFER; seqStore->llCode = SEQUENCE_LLCODE; seqStore->mlCode = SEQUENCE_MLCODE; seqStore->ofCode = SEQUENCE_OFCODE; ZSTD_resetSeqStore(seqStore); } /* Randomly generate sequence commands */ static U32 generateSequences(U32* seed, frame_t* frame, seqStore_t* seqStore, size_t contentSize, size_t literalsSize, dictInfo info) { /* The total length of all the matches */ size_t const remainingMatch = contentSize - literalsSize; size_t excessMatch = 0; U32 numSequences = 0; U32 i; const BYTE* literals = LITERAL_BUFFER; BYTE* srcPtr = frame->src; if (literalsSize != contentSize) { /* each match must be at least MIN_SEQ_LEN, so this is the maximum * number of sequences we can have */ U32 const maxSequences = (U32)remainingMatch / MIN_SEQ_LEN; numSequences = (RAND(seed) % maxSequences) + 1; /* the extra match lengths we have to allocate to each sequence */ excessMatch = remainingMatch - numSequences * MIN_SEQ_LEN; } DISPLAYLEVEL(5, " total match lengths: %u\n", (unsigned)remainingMatch); for (i = 0; i < numSequences; i++) { /* Generate match and literal lengths by exponential distribution to * ensure nice numbers */ U32 matchLen = MIN_SEQ_LEN + ROUND(RAND_exp(seed, excessMatch / (double)(numSequences - i))); U32 literalLen = (RAND(seed) & 7) ? ROUND(RAND_exp(seed, literalsSize / (double)(numSequences - i))) : 0; /* actual offset, code to send, and point to copy up to when shifting * codes in the repeat offsets history */ U32 offset, offsetCode, repIndex; /* bounds checks */ matchLen = (U32) MIN(matchLen, excessMatch + MIN_SEQ_LEN); literalLen = MIN(literalLen, (U32) literalsSize); if (i == 0 && srcPtr == frame->srcStart && literalLen == 0) literalLen = 1; if (i + 1 == numSequences) matchLen = MIN_SEQ_LEN + (U32) excessMatch; memcpy(srcPtr, literals, literalLen); srcPtr += literalLen; do { if (RAND(seed) & 7) { /* do a normal offset */ U32 const dataDecompressed = (U32)((BYTE*)srcPtr-(BYTE*)frame->srcStart); offset = (RAND(seed) % MIN(frame->header.windowSize, (size_t)((BYTE*)srcPtr - (BYTE*)frame->srcStart))) + 1; if (info.useDict && (RAND(seed) & 1) && i + 1 != numSequences && dataDecompressed < frame->header.windowSize) { /* need to occasionally generate offsets that go past the start */ /* including i+1 != numSequences because the last sequences has to adhere to predetermined contentSize */ U32 lenPastStart = (RAND(seed) % info.dictContentSize) + 1; offset = (U32)((BYTE*)srcPtr - (BYTE*)frame->srcStart)+lenPastStart; if (offset > frame->header.windowSize) { if (lenPastStart < MIN_SEQ_LEN) { /* when offset > windowSize, matchLen bound by end of dictionary (lenPastStart) */ /* this also means that lenPastStart must be greater than MIN_SEQ_LEN */ /* make sure lenPastStart does not go past dictionary start though */ lenPastStart = MIN(lenPastStart+MIN_SEQ_LEN, (U32)info.dictContentSize); offset = (U32)((BYTE*)srcPtr - (BYTE*)frame->srcStart) + lenPastStart; } { U32 const matchLenBound = MIN(frame->header.windowSize, lenPastStart); matchLen = MIN(matchLen, matchLenBound); } } } offsetCode = offset + ZSTD_REP_MOVE; repIndex = 2; } else { /* do a repeat offset */ offsetCode = RAND(seed) % 3; if (literalLen > 0) { offset = frame->stats.rep[offsetCode]; repIndex = offsetCode; } else { /* special case */ offset = offsetCode == 2 ? frame->stats.rep[0] - 1 : frame->stats.rep[offsetCode + 1]; repIndex = MIN(2, offsetCode + 1); } } } while (((!info.useDict) && (offset > (size_t)((BYTE*)srcPtr - (BYTE*)frame->srcStart))) || offset == 0); { size_t j; BYTE* const dictEnd = info.dictContent + info.dictContentSize; for (j = 0; j < matchLen; j++) { if ((U32)((BYTE*)srcPtr - (BYTE*)frame->srcStart) < offset) { /* copy from dictionary instead of literals */ size_t const dictOffset = offset - (srcPtr - (BYTE*)frame->srcStart); *srcPtr = *(dictEnd - dictOffset); } else { *srcPtr = *(srcPtr-offset); } srcPtr++; } } { int r; for (r = repIndex; r > 0; r--) { frame->stats.rep[r] = frame->stats.rep[r - 1]; } frame->stats.rep[0] = offset; } DISPLAYLEVEL(6, " LL: %5u OF: %5u ML: %5u", (unsigned)literalLen, (unsigned)offset, (unsigned)matchLen); DISPLAYLEVEL(7, " srcPos: %8u seqNb: %3u", (unsigned)((BYTE*)srcPtr - (BYTE*)frame->srcStart), (unsigned)i); DISPLAYLEVEL(6, "\n"); if (offsetCode < 3) { DISPLAYLEVEL(7, " repeat offset: %d\n", (int)repIndex); } /* use libzstd sequence handling */ ZSTD_storeSeq(seqStore, literalLen, literals, literals + literalLen, offsetCode, matchLen - MINMATCH); literalsSize -= literalLen; excessMatch -= (matchLen - MIN_SEQ_LEN); literals += literalLen; } memcpy(srcPtr, literals, literalsSize); srcPtr += literalsSize; DISPLAYLEVEL(6, " excess literals: %5u", (unsigned)literalsSize); DISPLAYLEVEL(7, " srcPos: %8u", (unsigned)((BYTE*)srcPtr - (BYTE*)frame->srcStart)); DISPLAYLEVEL(6, "\n"); return numSequences; } static void initSymbolSet(const BYTE* symbols, size_t len, BYTE* set, BYTE maxSymbolValue) { size_t i; memset(set, 0, (size_t)maxSymbolValue+1); for (i = 0; i < len; i++) { set[symbols[i]] = 1; } } static int isSymbolSubset(const BYTE* symbols, size_t len, const BYTE* set, BYTE maxSymbolValue) { size_t i; for (i = 0; i < len; i++) { if (symbols[i] > maxSymbolValue || !set[symbols[i]]) { return 0; } } return 1; } static size_t writeSequences(U32* seed, frame_t* frame, seqStore_t* seqStorePtr, size_t nbSeq) { /* This code is mostly copied from ZSTD_compressSequences in zstd_compress.c */ unsigned count[MaxSeq+1]; S16 norm[MaxSeq+1]; FSE_CTable* CTable_LitLength = frame->stats.litlengthCTable; FSE_CTable* CTable_OffsetBits = frame->stats.offcodeCTable; FSE_CTable* CTable_MatchLength = frame->stats.matchlengthCTable; U32 LLtype, Offtype, MLtype; /* compressed, raw or rle */ const seqDef* const sequences = seqStorePtr->sequencesStart; const BYTE* const ofCodeTable = seqStorePtr->ofCode; const BYTE* const llCodeTable = seqStorePtr->llCode; const BYTE* const mlCodeTable = seqStorePtr->mlCode; BYTE* const oend = (BYTE*)frame->dataEnd; BYTE* op = (BYTE*)frame->data; BYTE* seqHead; BYTE scratchBuffer[FSE_BUILD_CTABLE_WORKSPACE_SIZE(MaxSeq, MaxFSELog)]; /* literals compressing block removed so that can be done separately */ /* Sequences Header */ if ((oend-op) < 3 /*max nbSeq Size*/ + 1 /*seqHead */) return ERROR(dstSize_tooSmall); if (nbSeq < 0x7F) *op++ = (BYTE)nbSeq; else if (nbSeq < LONGNBSEQ) op[0] = (BYTE)((nbSeq>>8) + 0x80), op[1] = (BYTE)nbSeq, op+=2; else op[0]=0xFF, MEM_writeLE16(op+1, (U16)(nbSeq - LONGNBSEQ)), op+=3; if (nbSeq==0) { frame->data = op; return 0; } /* seqHead : flags for FSE encoding type */ seqHead = op++; /* convert length/distances into codes */ ZSTD_seqToCodes(seqStorePtr); /* CTable for Literal Lengths */ { unsigned max = MaxLL; size_t const mostFrequent = HIST_countFast_wksp(count, &max, llCodeTable, nbSeq, WKSP, sizeof(WKSP)); /* cannot fail */ assert(!HIST_isError(mostFrequent)); if (frame->stats.fseInit && !(RAND(seed) & 3) && isSymbolSubset(llCodeTable, nbSeq, frame->stats.litlengthSymbolSet, 35)) { /* maybe do repeat mode if we're allowed to */ LLtype = set_repeat; } else if (mostFrequent == nbSeq) { /* do RLE if we have the chance */ *op++ = llCodeTable[0]; FSE_buildCTable_rle(CTable_LitLength, (BYTE)max); LLtype = set_rle; } else if (!(RAND(seed) & 3)) { /* maybe use the default distribution */ CHECKERR(FSE_buildCTable_wksp(CTable_LitLength, LL_defaultNorm, MaxLL, LL_defaultNormLog, scratchBuffer, sizeof(scratchBuffer))); LLtype = set_basic; } else { /* fall back on a full table */ size_t nbSeq_1 = nbSeq; const U32 tableLog = FSE_optimalTableLog(LLFSELog, nbSeq, max); if (count[llCodeTable[nbSeq-1]]>1) { count[llCodeTable[nbSeq-1]]--; nbSeq_1--; } FSE_normalizeCount(norm, tableLog, count, nbSeq_1, max, nbSeq >= 2048); { size_t const NCountSize = FSE_writeNCount(op, oend-op, norm, max, tableLog); /* overflow protected */ if (FSE_isError(NCountSize)) return ERROR(GENERIC); op += NCountSize; } CHECKERR(FSE_buildCTable_wksp(CTable_LitLength, norm, max, tableLog, scratchBuffer, sizeof(scratchBuffer))); LLtype = set_compressed; } } /* CTable for Offsets */ /* see Literal Lengths for descriptions of mode choices */ { unsigned max = MaxOff; size_t const mostFrequent = HIST_countFast_wksp(count, &max, ofCodeTable, nbSeq, WKSP, sizeof(WKSP)); /* cannot fail */ assert(!HIST_isError(mostFrequent)); if (frame->stats.fseInit && !(RAND(seed) & 3) && isSymbolSubset(ofCodeTable, nbSeq, frame->stats.offsetSymbolSet, 28)) { Offtype = set_repeat; } else if (mostFrequent == nbSeq) { *op++ = ofCodeTable[0]; FSE_buildCTable_rle(CTable_OffsetBits, (BYTE)max); Offtype = set_rle; } else if (!(RAND(seed) & 3)) { FSE_buildCTable_wksp(CTable_OffsetBits, OF_defaultNorm, DefaultMaxOff, OF_defaultNormLog, scratchBuffer, sizeof(scratchBuffer)); Offtype = set_basic; } else { size_t nbSeq_1 = nbSeq; const U32 tableLog = FSE_optimalTableLog(OffFSELog, nbSeq, max); if (count[ofCodeTable[nbSeq-1]]>1) { count[ofCodeTable[nbSeq-1]]--; nbSeq_1--; } FSE_normalizeCount(norm, tableLog, count, nbSeq_1, max, nbSeq >= 2048); { size_t const NCountSize = FSE_writeNCount(op, oend-op, norm, max, tableLog); /* overflow protected */ if (FSE_isError(NCountSize)) return ERROR(GENERIC); op += NCountSize; } FSE_buildCTable_wksp(CTable_OffsetBits, norm, max, tableLog, scratchBuffer, sizeof(scratchBuffer)); Offtype = set_compressed; } } /* CTable for MatchLengths */ /* see Literal Lengths for descriptions of mode choices */ { unsigned max = MaxML; size_t const mostFrequent = HIST_countFast_wksp(count, &max, mlCodeTable, nbSeq, WKSP, sizeof(WKSP)); /* cannot fail */ assert(!HIST_isError(mostFrequent)); if (frame->stats.fseInit && !(RAND(seed) & 3) && isSymbolSubset(mlCodeTable, nbSeq, frame->stats.matchlengthSymbolSet, 52)) { MLtype = set_repeat; } else if (mostFrequent == nbSeq) { *op++ = *mlCodeTable; FSE_buildCTable_rle(CTable_MatchLength, (BYTE)max); MLtype = set_rle; } else if (!(RAND(seed) & 3)) { /* sometimes do default distribution */ FSE_buildCTable_wksp(CTable_MatchLength, ML_defaultNorm, MaxML, ML_defaultNormLog, scratchBuffer, sizeof(scratchBuffer)); MLtype = set_basic; } else { /* fall back on table */ size_t nbSeq_1 = nbSeq; const U32 tableLog = FSE_optimalTableLog(MLFSELog, nbSeq, max); if (count[mlCodeTable[nbSeq-1]]>1) { count[mlCodeTable[nbSeq-1]]--; nbSeq_1--; } FSE_normalizeCount(norm, tableLog, count, nbSeq_1, max, nbSeq >= 2048); { size_t const NCountSize = FSE_writeNCount(op, oend-op, norm, max, tableLog); /* overflow protected */ if (FSE_isError(NCountSize)) return ERROR(GENERIC); op += NCountSize; } FSE_buildCTable_wksp(CTable_MatchLength, norm, max, tableLog, scratchBuffer, sizeof(scratchBuffer)); MLtype = set_compressed; } } frame->stats.fseInit = 1; initSymbolSet(llCodeTable, nbSeq, frame->stats.litlengthSymbolSet, 35); initSymbolSet(ofCodeTable, nbSeq, frame->stats.offsetSymbolSet, 28); initSymbolSet(mlCodeTable, nbSeq, frame->stats.matchlengthSymbolSet, 52); DISPLAYLEVEL(5, " LL type: %d OF type: %d ML type: %d\n", (unsigned)LLtype, (unsigned)Offtype, (unsigned)MLtype); *seqHead = (BYTE)((LLtype<<6) + (Offtype<<4) + (MLtype<<2)); /* Encoding Sequences */ { BIT_CStream_t blockStream; FSE_CState_t stateMatchLength; FSE_CState_t stateOffsetBits; FSE_CState_t stateLitLength; RETURN_ERROR_IF( ERR_isError(BIT_initCStream(&blockStream, op, oend-op)), dstSize_tooSmall, "not enough space remaining"); /* first symbols */ FSE_initCState2(&stateMatchLength, CTable_MatchLength, mlCodeTable[nbSeq-1]); FSE_initCState2(&stateOffsetBits, CTable_OffsetBits, ofCodeTable[nbSeq-1]); FSE_initCState2(&stateLitLength, CTable_LitLength, llCodeTable[nbSeq-1]); BIT_addBits(&blockStream, sequences[nbSeq-1].litLength, LL_bits[llCodeTable[nbSeq-1]]); if (MEM_32bits()) BIT_flushBits(&blockStream); BIT_addBits(&blockStream, sequences[nbSeq-1].matchLength, ML_bits[mlCodeTable[nbSeq-1]]); if (MEM_32bits()) BIT_flushBits(&blockStream); BIT_addBits(&blockStream, sequences[nbSeq-1].offset, ofCodeTable[nbSeq-1]); BIT_flushBits(&blockStream); { size_t n; for (n=nbSeq-2 ; n= 64-7-(LLFSELog+MLFSELog+OffFSELog))) BIT_flushBits(&blockStream); /* (7)*/ BIT_addBits(&blockStream, sequences[n].litLength, llBits); if (MEM_32bits() && ((llBits+mlBits)>24)) BIT_flushBits(&blockStream); BIT_addBits(&blockStream, sequences[n].matchLength, mlBits); if (MEM_32bits()) BIT_flushBits(&blockStream); /* (7)*/ BIT_addBits(&blockStream, sequences[n].offset, ofBits); /* 31 */ BIT_flushBits(&blockStream); /* (7)*/ } } FSE_flushCState(&blockStream, &stateMatchLength); FSE_flushCState(&blockStream, &stateOffsetBits); FSE_flushCState(&blockStream, &stateLitLength); { size_t const streamSize = BIT_closeCStream(&blockStream); if (streamSize==0) return ERROR(dstSize_tooSmall); /* not enough space */ op += streamSize; } } frame->data = op; return 0; } static size_t writeSequencesBlock(U32* seed, frame_t* frame, size_t contentSize, size_t literalsSize, dictInfo info) { seqStore_t seqStore; size_t numSequences; initSeqStore(&seqStore); /* randomly generate sequences */ numSequences = generateSequences(seed, frame, &seqStore, contentSize, literalsSize, info); /* write them out to the frame data */ CHECKERR(writeSequences(seed, frame, &seqStore, numSequences)); return numSequences; } static size_t writeCompressedBlock(U32* seed, frame_t* frame, size_t contentSize, dictInfo info) { BYTE* const blockStart = (BYTE*)frame->data; size_t literalsSize; size_t nbSeq; DISPLAYLEVEL(4, " compressed block:\n"); literalsSize = writeLiteralsBlock(seed, frame, contentSize); DISPLAYLEVEL(4, " literals size: %u\n", (unsigned)literalsSize); nbSeq = writeSequencesBlock(seed, frame, contentSize, literalsSize, info); DISPLAYLEVEL(4, " number of sequences: %u\n", (unsigned)nbSeq); return (BYTE*)frame->data - blockStart; } static void writeBlock(U32* seed, frame_t* frame, size_t contentSize, int lastBlock, dictInfo info) { int const blockTypeDesc = RAND(seed) % 8; size_t blockSize; int blockType; BYTE *const header = (BYTE*)frame->data; BYTE *op = header + 3; DISPLAYLEVEL(4, " block:\n"); DISPLAYLEVEL(4, " block content size: %u\n", (unsigned)contentSize); DISPLAYLEVEL(4, " last block: %s\n", lastBlock ? "yes" : "no"); if (blockTypeDesc == 0) { /* Raw data frame */ RAND_buffer(seed, frame->src, contentSize); memcpy(op, frame->src, contentSize); op += contentSize; blockType = 0; blockSize = contentSize; } else if (blockTypeDesc == 1 && frame->header.contentSize > 0) { /* RLE (Don't create RLE block if frame content is 0 since block size of 1 may exceed max block size)*/ BYTE const symbol = RAND(seed) & 0xff; op[0] = symbol; memset(frame->src, symbol, contentSize); op++; blockType = 1; blockSize = contentSize; } else { /* compressed, most common */ size_t compressedSize; blockType = 2; frame->oldStats = frame->stats; frame->data = op; compressedSize = writeCompressedBlock(seed, frame, contentSize, info); if (compressedSize >= contentSize) { /* compressed block must be strictly smaller than uncompressed one */ blockType = 0; memcpy(op, frame->src, contentSize); op += contentSize; blockSize = contentSize; /* fall back on raw block if data doesn't compress */ frame->stats = frame->oldStats; /* don't update the stats */ } else { op += compressedSize; blockSize = compressedSize; } } frame->src = (BYTE*)frame->src + contentSize; DISPLAYLEVEL(4, " block type: %s\n", BLOCK_TYPES[blockType]); DISPLAYLEVEL(4, " block size field: %u\n", (unsigned)blockSize); header[0] = (BYTE) ((lastBlock | (blockType << 1) | (blockSize << 3)) & 0xff); MEM_writeLE16(header + 1, (U16) (blockSize >> 5)); frame->data = op; } static void writeBlocks(U32* seed, frame_t* frame, dictInfo info) { size_t contentLeft = frame->header.contentSize; size_t const maxBlockSize = MIN(g_maxBlockSize, frame->header.windowSize); while (1) { /* 1 in 4 chance of ending frame */ int const lastBlock = contentLeft > maxBlockSize ? 0 : !(RAND(seed) & 3); size_t blockContentSize; if (lastBlock) { blockContentSize = contentLeft; } else { if (contentLeft > 0 && (RAND(seed) & 7)) { /* some variable size block */ blockContentSize = RAND(seed) % (MIN(maxBlockSize, contentLeft)+1); } else if (contentLeft > maxBlockSize && (RAND(seed) & 1)) { /* some full size block */ blockContentSize = maxBlockSize; } else { /* some empty block */ blockContentSize = 0; } } writeBlock(seed, frame, blockContentSize, lastBlock, info); contentLeft -= blockContentSize; if (lastBlock) break; } } static void writeChecksum(frame_t* frame) { /* write checksum so implementations can verify their output */ U64 digest = XXH64(frame->srcStart, (BYTE*)frame->src-(BYTE*)frame->srcStart, 0); DISPLAYLEVEL(3, " checksum: %08x\n", (unsigned)digest); MEM_writeLE32(frame->data, (U32)digest); frame->data = (BYTE*)frame->data + 4; } static void outputBuffer(const void* buf, size_t size, const char* const path) { /* write data out to file */ const BYTE* ip = (const BYTE*)buf; FILE* out; if (path) { out = fopen(path, "wb"); } else { out = stdout; } if (!out) { fprintf(stderr, "Failed to open file at %s: ", path); perror(NULL); exit(1); } { size_t fsize = size; size_t written = 0; while (written < fsize) { written += fwrite(ip + written, 1, fsize - written, out); if (ferror(out)) { fprintf(stderr, "Failed to write to file at %s: ", path); perror(NULL); exit(1); } } } if (path) { fclose(out); } } static void initFrame(frame_t* fr) { memset(fr, 0, sizeof(*fr)); fr->data = fr->dataStart = FRAME_BUFFER; fr->dataEnd = FRAME_BUFFER + sizeof(FRAME_BUFFER); fr->src = fr->srcStart = CONTENT_BUFFER; fr->srcEnd = CONTENT_BUFFER + sizeof(CONTENT_BUFFER); /* init repeat codes */ fr->stats.rep[0] = 1; fr->stats.rep[1] = 4; fr->stats.rep[2] = 8; } /** * Generated a single zstd compressed block with no block/frame header. * Returns the final seed. */ static U32 generateCompressedBlock(U32 seed, frame_t* frame, dictInfo info) { size_t blockContentSize; int blockWritten = 0; BYTE* op; DISPLAYLEVEL(4, "block seed: %u\n", (unsigned)seed); initFrame(frame); op = (BYTE*)frame->data; while (!blockWritten) { size_t cSize; /* generate window size */ { int const exponent = RAND(&seed) % (MAX_WINDOW_LOG - 10); int const mantissa = RAND(&seed) % 8; frame->header.windowSize = (1U << (exponent + 10)); frame->header.windowSize += (frame->header.windowSize / 8) * mantissa; } /* generate content size */ { size_t const maxBlockSize = MIN(g_maxBlockSize, frame->header.windowSize); if (RAND(&seed) & 15) { /* some full size blocks */ blockContentSize = maxBlockSize; } else if (RAND(&seed) & 7 && g_maxBlockSize >= (1U << 7)) { /* some small blocks <= 128 bytes*/ blockContentSize = RAND(&seed) % (1U << 7); } else { /* some variable size blocks */ blockContentSize = RAND(&seed) % maxBlockSize; } } /* try generating a compressed block */ frame->oldStats = frame->stats; frame->data = op; cSize = writeCompressedBlock(&seed, frame, blockContentSize, info); if (cSize >= blockContentSize) { /* compressed size must be strictly smaller than decompressed size : https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#blocks */ /* data doesn't compress -- try again */ frame->stats = frame->oldStats; /* don't update the stats */ DISPLAYLEVEL(5, " can't compress block : try again \n"); } else { blockWritten = 1; DISPLAYLEVEL(4, " block size: %u \n", (unsigned)cSize); frame->src = (BYTE*)frame->src + blockContentSize; } } return seed; } /* Return the final seed */ static U32 generateFrame(U32 seed, frame_t* fr, dictInfo info) { /* generate a complete frame */ DISPLAYLEVEL(3, "frame seed: %u\n", (unsigned)seed); initFrame(fr); writeFrameHeader(&seed, fr, info); writeBlocks(&seed, fr, info); writeChecksum(fr); return seed; } /*_******************************************************* * Dictionary Helper Functions *********************************************************/ /* returns 0 if successful, otherwise returns 1 upon error */ static int genRandomDict(U32 dictID, U32 seed, size_t dictSize, BYTE* fullDict) { /* allocate space for samples */ int ret = 0; unsigned const numSamples = 4; size_t sampleSizes[4]; BYTE* const samples = malloc(5000*sizeof(BYTE)); if (samples == NULL) { DISPLAY("Error: could not allocate space for samples\n"); return 1; } /* generate samples */ { unsigned literalValue = 1; unsigned samplesPos = 0; size_t currSize = 1; while (literalValue <= 4) { sampleSizes[literalValue - 1] = currSize; { size_t k; for (k = 0; k < currSize; k++) { *(samples + (samplesPos++)) = (BYTE)literalValue; } } literalValue++; currSize *= 16; } } { size_t dictWriteSize = 0; ZDICT_params_t zdictParams; size_t const headerSize = MAX(dictSize/4, 256); size_t const dictContentSize = dictSize - headerSize; BYTE* const dictContent = fullDict + headerSize; if (dictContentSize < ZDICT_CONTENTSIZE_MIN || dictSize < ZDICT_DICTSIZE_MIN) { DISPLAY("Error: dictionary size is too small\n"); ret = 1; goto exitGenRandomDict; } /* init dictionary params */ memset(&zdictParams, 0, sizeof(zdictParams)); zdictParams.dictID = dictID; zdictParams.notificationLevel = 1; /* fill in dictionary content */ RAND_buffer(&seed, (void*)dictContent, dictContentSize); /* finalize dictionary with random samples */ dictWriteSize = ZDICT_finalizeDictionary(fullDict, dictSize, dictContent, dictContentSize, samples, sampleSizes, numSamples, zdictParams); if (ZDICT_isError(dictWriteSize)) { DISPLAY("Could not finalize dictionary: %s\n", ZDICT_getErrorName(dictWriteSize)); ret = 1; } } exitGenRandomDict: free(samples); return ret; } static dictInfo initDictInfo(int useDict, size_t dictContentSize, BYTE* dictContent, U32 dictID){ /* allocate space statically */ dictInfo dictOp; memset(&dictOp, 0, sizeof(dictOp)); dictOp.useDict = useDict; dictOp.dictContentSize = dictContentSize; dictOp.dictContent = dictContent; dictOp.dictID = dictID; return dictOp; } /*-******************************************************* * Test Mode *********************************************************/ BYTE DECOMPRESSED_BUFFER[MAX_DECOMPRESSED_SIZE]; static size_t testDecodeSimple(frame_t* fr) { /* test decoding the generated data with the simple API */ size_t const ret = ZSTD_decompress(DECOMPRESSED_BUFFER, MAX_DECOMPRESSED_SIZE, fr->dataStart, (BYTE*)fr->data - (BYTE*)fr->dataStart); if (ZSTD_isError(ret)) return ret; if (memcmp(DECOMPRESSED_BUFFER, fr->srcStart, (BYTE*)fr->src - (BYTE*)fr->srcStart) != 0) { return ERROR(corruption_detected); } return ret; } static size_t testDecodeStreaming(frame_t* fr) { /* test decoding the generated data with the streaming API */ ZSTD_DStream* zd = ZSTD_createDStream(); ZSTD_inBuffer in; ZSTD_outBuffer out; size_t ret; if (!zd) return ERROR(memory_allocation); in.src = fr->dataStart; in.pos = 0; in.size = (BYTE*)fr->data - (BYTE*)fr->dataStart; out.dst = DECOMPRESSED_BUFFER; out.pos = 0; out.size = ZSTD_DStreamOutSize(); ZSTD_initDStream(zd); while (1) { ret = ZSTD_decompressStream(zd, &out, &in); if (ZSTD_isError(ret)) goto cleanup; /* error */ if (ret == 0) break; /* frame is done */ /* force decoding to be done in chunks */ out.size += MIN(ZSTD_DStreamOutSize(), MAX_DECOMPRESSED_SIZE - out.size); } ret = out.pos; if (memcmp(out.dst, fr->srcStart, out.pos) != 0) { return ERROR(corruption_detected); } cleanup: ZSTD_freeDStream(zd); return ret; } static size_t testDecodeWithDict(U32 seed, genType_e genType) { /* create variables */ size_t const dictSize = RAND(&seed) % (10 << 20) + ZDICT_DICTSIZE_MIN + ZDICT_CONTENTSIZE_MIN; U32 const dictID = RAND(&seed); size_t errorDetected = 0; BYTE* const fullDict = malloc(dictSize); if (fullDict == NULL) { return ERROR(GENERIC); } /* generate random dictionary */ if (genRandomDict(dictID, seed, dictSize, fullDict)) { /* return 0 on success */ errorDetected = ERROR(GENERIC); goto dictTestCleanup; } { frame_t fr; dictInfo info; ZSTD_DCtx* const dctx = ZSTD_createDCtx(); size_t ret; /* get dict info */ { size_t const headerSize = MAX(dictSize/4, 256); size_t const dictContentSize = dictSize-headerSize; BYTE* const dictContent = fullDict+headerSize; info = initDictInfo(1, dictContentSize, dictContent, dictID); } /* manually decompress and check difference */ if (genType == gt_frame) { /* Test frame */ generateFrame(seed, &fr, info); ret = ZSTD_decompress_usingDict(dctx, DECOMPRESSED_BUFFER, MAX_DECOMPRESSED_SIZE, fr.dataStart, (BYTE*)fr.data - (BYTE*)fr.dataStart, fullDict, dictSize); } else { /* Test block */ generateCompressedBlock(seed, &fr, info); ret = ZSTD_decompressBegin_usingDict(dctx, fullDict, dictSize); if (ZSTD_isError(ret)) { errorDetected = ret; ZSTD_freeDCtx(dctx); goto dictTestCleanup; } ret = ZSTD_decompressBlock(dctx, DECOMPRESSED_BUFFER, MAX_DECOMPRESSED_SIZE, fr.dataStart, (BYTE*)fr.data - (BYTE*)fr.dataStart); } ZSTD_freeDCtx(dctx); if (ZSTD_isError(ret)) { errorDetected = ret; goto dictTestCleanup; } if (memcmp(DECOMPRESSED_BUFFER, fr.srcStart, (BYTE*)fr.src - (BYTE*)fr.srcStart) != 0) { errorDetected = ERROR(corruption_detected); goto dictTestCleanup; } } dictTestCleanup: free(fullDict); return errorDetected; } static size_t testDecodeRawBlock(frame_t* fr) { ZSTD_DCtx* dctx = ZSTD_createDCtx(); size_t ret = ZSTD_decompressBegin(dctx); if (ZSTD_isError(ret)) return ret; ret = ZSTD_decompressBlock( dctx, DECOMPRESSED_BUFFER, MAX_DECOMPRESSED_SIZE, fr->dataStart, (BYTE*)fr->data - (BYTE*)fr->dataStart); ZSTD_freeDCtx(dctx); if (ZSTD_isError(ret)) return ret; if (memcmp(DECOMPRESSED_BUFFER, fr->srcStart, (BYTE*)fr->src - (BYTE*)fr->srcStart) != 0) { return ERROR(corruption_detected); } return ret; } static int runBlockTest(U32* seed) { frame_t fr; U32 const seedCopy = *seed; { dictInfo const info = initDictInfo(0, 0, NULL, 0); *seed = generateCompressedBlock(*seed, &fr, info); } { size_t const r = testDecodeRawBlock(&fr); if (ZSTD_isError(r)) { DISPLAY("Error in block mode on test seed %u: %s\n", (unsigned)seedCopy, ZSTD_getErrorName(r)); return 1; } } { size_t const r = testDecodeWithDict(*seed, gt_block); if (ZSTD_isError(r)) { DISPLAY("Error in block mode with dictionary on test seed %u: %s\n", (unsigned)seedCopy, ZSTD_getErrorName(r)); return 1; } } return 0; } static int runFrameTest(U32* seed) { frame_t fr; U32 const seedCopy = *seed; { dictInfo const info = initDictInfo(0, 0, NULL, 0); *seed = generateFrame(*seed, &fr, info); } { size_t const r = testDecodeSimple(&fr); if (ZSTD_isError(r)) { DISPLAY("Error in simple mode on test seed %u: %s\n", (unsigned)seedCopy, ZSTD_getErrorName(r)); return 1; } } { size_t const r = testDecodeStreaming(&fr); if (ZSTD_isError(r)) { DISPLAY("Error in streaming mode on test seed %u: %s\n", (unsigned)seedCopy, ZSTD_getErrorName(r)); return 1; } } { size_t const r = testDecodeWithDict(*seed, gt_frame); /* avoid big dictionaries */ if (ZSTD_isError(r)) { DISPLAY("Error in dictionary mode on test seed %u: %s\n", (unsigned)seedCopy, ZSTD_getErrorName(r)); return 1; } } return 0; } static int runTestMode(U32 seed, unsigned numFiles, unsigned const testDurationS, genType_e genType) { unsigned fnum; UTIL_time_t const startClock = UTIL_getTime(); U64 const maxClockSpan = testDurationS * SEC_TO_MICRO; if (numFiles == 0 && !testDurationS) numFiles = 1; DISPLAY("seed: %u\n", (unsigned)seed); for (fnum = 0; fnum < numFiles || UTIL_clockSpanMicro(startClock) < maxClockSpan; fnum++) { if (fnum < numFiles) DISPLAYUPDATE("\r%u/%u ", fnum, numFiles); else DISPLAYUPDATE("\r%u ", fnum); { int const ret = (genType == gt_frame) ? runFrameTest(&seed) : runBlockTest(&seed); if (ret) return ret; } } DISPLAY("\r%u tests completed: ", fnum); DISPLAY("OK\n"); return 0; } /*-******************************************************* * File I/O *********************************************************/ static int generateFile(U32 seed, const char* const path, const char* const origPath, genType_e genType) { frame_t fr; DISPLAY("seed: %u\n", (unsigned)seed); { dictInfo const info = initDictInfo(0, 0, NULL, 0); if (genType == gt_frame) { generateFrame(seed, &fr, info); } else { generateCompressedBlock(seed, &fr, info); } } outputBuffer(fr.dataStart, (BYTE*)fr.data - (BYTE*)fr.dataStart, path); if (origPath) { outputBuffer(fr.srcStart, (BYTE*)fr.src - (BYTE*)fr.srcStart, origPath); } return 0; } static int generateCorpus(U32 seed, unsigned numFiles, const char* const path, const char* const origPath, genType_e genType) { char outPath[MAX_PATH]; unsigned fnum; DISPLAY("seed: %u\n", (unsigned)seed); for (fnum = 0; fnum < numFiles; fnum++) { frame_t fr; DISPLAYUPDATE("\r%u/%u ", fnum, numFiles); { dictInfo const info = initDictInfo(0, 0, NULL, 0); if (genType == gt_frame) { seed = generateFrame(seed, &fr, info); } else { seed = generateCompressedBlock(seed, &fr, info); } } if (snprintf(outPath, MAX_PATH, "%s/z%06u.zst", path, fnum) + 1 > MAX_PATH) { DISPLAY("Error: path too long\n"); return 1; } outputBuffer(fr.dataStart, (BYTE*)fr.data - (BYTE*)fr.dataStart, outPath); if (origPath) { if (snprintf(outPath, MAX_PATH, "%s/z%06u", origPath, fnum) + 1 > MAX_PATH) { DISPLAY("Error: path too long\n"); return 1; } outputBuffer(fr.srcStart, (BYTE*)fr.src - (BYTE*)fr.srcStart, outPath); } } DISPLAY("\r%u/%u \n", fnum, numFiles); return 0; } static int generateCorpusWithDict(U32 seed, unsigned numFiles, const char* const path, const char* const origPath, const size_t dictSize, genType_e genType) { char outPath[MAX_PATH]; BYTE* fullDict; U32 const dictID = RAND(&seed); int errorDetected = 0; if (snprintf(outPath, MAX_PATH, "%s/dictionary", path) + 1 > MAX_PATH) { DISPLAY("Error: path too long\n"); return 1; } /* allocate space for the dictionary */ fullDict = malloc(dictSize); if (fullDict == NULL) { DISPLAY("Error: could not allocate space for full dictionary.\n"); return 1; } /* randomly generate the dictionary */ { int const ret = genRandomDict(dictID, seed, dictSize, fullDict); if (ret != 0) { errorDetected = ret; goto dictCleanup; } } /* write out dictionary */ if (numFiles != 0) { if (snprintf(outPath, MAX_PATH, "%s/dictionary", path) + 1 > MAX_PATH) { DISPLAY("Error: dictionary path too long\n"); errorDetected = 1; goto dictCleanup; } outputBuffer(fullDict, dictSize, outPath); } else { outputBuffer(fullDict, dictSize, "dictionary"); } /* generate random compressed/decompressed files */ { unsigned fnum; for (fnum = 0; fnum < MAX(numFiles, 1); fnum++) { frame_t fr; DISPLAYUPDATE("\r%u/%u ", fnum, numFiles); { size_t const headerSize = MAX(dictSize/4, 256); size_t const dictContentSize = dictSize-headerSize; BYTE* const dictContent = fullDict+headerSize; dictInfo const info = initDictInfo(1, dictContentSize, dictContent, dictID); if (genType == gt_frame) { seed = generateFrame(seed, &fr, info); } else { seed = generateCompressedBlock(seed, &fr, info); } } if (numFiles != 0) { if (snprintf(outPath, MAX_PATH, "%s/z%06u.zst", path, fnum) + 1 > MAX_PATH) { DISPLAY("Error: path too long\n"); errorDetected = 1; goto dictCleanup; } outputBuffer(fr.dataStart, (BYTE*)fr.data - (BYTE*)fr.dataStart, outPath); if (origPath) { if (snprintf(outPath, MAX_PATH, "%s/z%06u", origPath, fnum) + 1 > MAX_PATH) { DISPLAY("Error: path too long\n"); errorDetected = 1; goto dictCleanup; } outputBuffer(fr.srcStart, (BYTE*)fr.src - (BYTE*)fr.srcStart, outPath); } } else { outputBuffer(fr.dataStart, (BYTE*)fr.data - (BYTE*)fr.dataStart, path); if (origPath) { outputBuffer(fr.srcStart, (BYTE*)fr.src - (BYTE*)fr.srcStart, origPath); } } } } dictCleanup: free(fullDict); return errorDetected; } /*_******************************************************* * Command line *********************************************************/ static U32 makeSeed(void) { U32 t = (U32) time(NULL); return XXH32(&t, sizeof(t), 0) % 65536; } static unsigned readInt(const char** argument) { unsigned val = 0; while ((**argument>='0') && (**argument<='9')) { val *= 10; val += **argument - '0'; (*argument)++; } return val; } static void usage(const char* programName) { DISPLAY( "Usage :\n"); DISPLAY( " %s [args]\n", programName); DISPLAY( "\n"); DISPLAY( "Arguments :\n"); DISPLAY( " -p : select output path (default:stdout)\n"); DISPLAY( " in multiple files mode this should be a directory\n"); DISPLAY( " -o : select path to output original file (default:no output)\n"); DISPLAY( " in multiple files mode this should be a directory\n"); DISPLAY( " -s# : select seed (default:random based on time)\n"); DISPLAY( " -n# : number of files to generate (default:1)\n"); DISPLAY( " -t : activate test mode (test files against libzstd instead of outputting them)\n"); DISPLAY( " -T# : length of time to run tests for\n"); DISPLAY( " -v : increase verbosity level (default:0, max:7)\n"); DISPLAY( " -h/H : display help/long help and exit\n"); } static void advancedUsage(const char* programName) { usage(programName); DISPLAY( "\n"); DISPLAY( "Advanced arguments :\n"); DISPLAY( " --content-size : always include the content size in the frame header\n"); DISPLAY( " --use-dict=# : include a dictionary used to decompress the corpus\n"); DISPLAY( " --gen-blocks : generate raw compressed blocks without block/frame headers\n"); DISPLAY( " --max-block-size-log=# : max block size log, must be in range [2, 17]\n"); DISPLAY( " --max-content-size-log=# : max content size log, must be <= 20\n"); DISPLAY( " (this is ignored with gen-blocks)\n"); } /*! readU32FromChar() : @return : unsigned integer value read from input in `char` format allows and interprets K, KB, KiB, M, MB and MiB suffix. Will also modify `*stringPtr`, advancing it to position where it stopped reading. Note : function result can overflow if digit string > MAX_UINT */ static unsigned readU32FromChar(const char** stringPtr) { unsigned result = 0; while ((**stringPtr >='0') && (**stringPtr <='9')) result *= 10, result += **stringPtr - '0', (*stringPtr)++ ; if ((**stringPtr=='K') || (**stringPtr=='M')) { result <<= 10; if (**stringPtr=='M') result <<= 10; (*stringPtr)++ ; if (**stringPtr=='i') (*stringPtr)++; if (**stringPtr=='B') (*stringPtr)++; } return result; } /** longCommandWArg() : * check if *stringPtr is the same as longCommand. * If yes, @return 1 and advances *stringPtr to the position which immediately follows longCommand. * @return 0 and doesn't modify *stringPtr otherwise. */ static unsigned longCommandWArg(const char** stringPtr, const char* longCommand) { size_t const comSize = strlen(longCommand); int const result = !strncmp(*stringPtr, longCommand, comSize); if (result) *stringPtr += comSize; return result; } int main(int argc, char** argv) { U32 seed = 0; int seedset = 0; unsigned numFiles = 0; unsigned testDuration = 0; int testMode = 0; const char* path = NULL; const char* origPath = NULL; int useDict = 0; unsigned dictSize = (10 << 10); /* 10 kB default */ genType_e genType = gt_frame; int argNb; /* Check command line */ for (argNb=1; argNb