Rizin
unix-like reverse engineering framework and cli tools
gznorm.c
Go to the documentation of this file.
1 /* gznorm.c -- normalize a gzip stream
2  * Copyright (C) 2018 Mark Adler
3  * For conditions of distribution and use, see copyright notice in zlib.h
4  * Version 1.0 7 Oct 2018 Mark Adler */
5 
6 // gznorm takes a gzip stream, potentially containing multiple members, and
7 // converts it to a gzip stream with a single member. In addition the gzip
8 // header is normalized, removing the file name and time stamp, and setting the
9 // other header contents (XFL, OS) to fixed values. gznorm does not recompress
10 // the data, so it is fast, but no advantage is gained from the history that
11 // could be available across member boundaries.
12 
13 #include <stdio.h> // fread, fwrite, putc, fflush, ferror, fprintf,
14  // vsnprintf, stdout, stderr, NULL, FILE
15 #include <stdlib.h> // malloc, free
16 #include <string.h> // strerror
17 #include <errno.h> // errno
18 #include <stdarg.h> // va_list, va_start, va_end
19 #include "zlib.h" // inflateInit2, inflate, inflateReset, inflateEnd,
20  // z_stream, z_off_t, crc32_combine, Z_NULL, Z_BLOCK,
21  // Z_OK, Z_STREAM_END, Z_BUF_ERROR, Z_DATA_ERROR,
22  // Z_MEM_ERROR
23 
24 #if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(__CYGWIN__)
25 # include <fcntl.h>
26 # include <io.h>
27 # define SET_BINARY_MODE(file) setmode(fileno(file), O_BINARY)
28 #else
29 # define SET_BINARY_MODE(file)
30 #endif
31 
32 #define local static
33 
34 // printf to an allocated string. Return the string, or NULL if the printf or
35 // allocation fails.
36 local char *aprintf(char *fmt, ...) {
37  // Get the length of the result of the printf.
38  va_list args;
39  va_start(args, fmt);
40  int len = vsnprintf(NULL, 0, fmt, args);
41  va_end(args);
42  if (len < 0)
43  return NULL;
44 
45  // Allocate the required space and printf to it.
46  char *str = malloc(len + 1);
47  if (str == NULL)
48  return NULL;
49  va_start(args, fmt);
50  vsnprintf(str, len + 1, fmt, args);
51  va_end(args);
52  return str;
53 }
54 
// Return from the enclosing function with an error: put an allocated error
// message (or NULL if it could not be allocated) in *err and return 1. This
// macro expects a z_stream named strm and a char ** named err to be in scope.
//
// The message is formatted before the stream is ended, so that arguments that
// refer to the stream (such as strm.msg) are read while it is still intact.
// Doing an inflateEnd() on an already ended state, or one with state set to
// Z_NULL, is permitted.
#define BYE(...) \
    do { \
        char *bye_msg_ = aprintf(__VA_ARGS__); \
        inflateEnd(&strm); \
        *err = bye_msg_; \
        return 1; \
    } while (0)
64 
65 // Chunk size for buffered reads and for decompression. Twice this many bytes
66 // will be allocated on the stack by gzip_normalize(). Must fit in an unsigned.
67 #define CHUNK 16384
68 
69 // Read a gzip stream from in and write an equivalent normalized gzip stream to
70 // out. If given no input, an empty gzip stream will be written. If successful,
71 // 0 is returned, and *err is set to NULL. On error, 1 is returned, where the
72 // details of the error are returned in *err, a pointer to an allocated string.
73 //
74 // The input may be a stream with multiple gzip members, which is converted to
75 // a single gzip member on the output. Each gzip member is decompressed at the
76 // level of deflate blocks. This enables clearing the last-block bit, shifting
77 // the compressed data to concatenate to the previous member's compressed data,
78 // which can end at an arbitrary bit boundary, and identifying stored blocks in
79 // order to resynchronize those to byte boundaries. The deflate compressed data
80 // is terminated with a 10-bit empty fixed block. If any members on the input
81 // end with a 10-bit empty fixed block, then that block is excised from the
82 // stream. This avoids appending empty fixed blocks for every normalization,
83 // and assures that gzip_normalize applied a second time will not change the
84 // input. The pad bits after stored block headers and after the final deflate
85 // block are all forced to zeros.
86 local int gzip_normalize(FILE *in, FILE *out, char **err) {
87  // initialize the inflate engine to process a gzip member
88  z_stream strm;
89  strm.zalloc = Z_NULL;
90  strm.zfree = Z_NULL;
91  strm.opaque = Z_NULL;
92  strm.avail_in = 0;
94  if (inflateInit2(&strm, 15 + 16) != Z_OK)
95  BYE("out of memory");
96 
97  // State while processing the input gzip stream.
98  enum { // BETWEEN -> HEAD -> BLOCK -> TAIL -> BETWEEN -> ...
99  BETWEEN, // between gzip members (must end in this state)
100  HEAD, // reading a gzip header
101  BLOCK, // reading deflate blocks
102  TAIL // reading a gzip trailer
103  } state = BETWEEN; // current component being processed
104  unsigned long crc = 0; // accumulated CRC of uncompressed data
105  unsigned long len = 0; // accumulated length of uncompressed data
106  unsigned long buf = 0; // deflate stream bit buffer of num bits
107  int num = 0; // number of bits in buf (at bottom)
108 
109  // Write a canonical gzip header (no mod time, file name, comment, extra
110  // block, or extra flags, and OS is marked as unknown).
111  fwrite("\x1f\x8b\x08\0\0\0\0\0\0\xff", 1, 10, out);
112 
113  // Process the gzip stream from in until reaching the end of the input,
114  // encountering invalid input, or experiencing an i/o error.
115  int more; // true if not at the end of the input
116  do {
117  // State inside this loop.
118  unsigned char *put; // next input buffer location to process
119  int prev; // number of bits from previous block in
120  // the bit buffer, or -1 if not at the
121  // start of a block
122  unsigned long long memb; // uncompressed length of member
123  size_t tail; // number of trailer bytes read (0..8)
124  unsigned long part; // accumulated trailer component
125 
126  // Get the next chunk of input from in.
127  unsigned char dat[CHUNK];
128  strm.avail_in = fread(dat, 1, CHUNK, in);
129  if (strm.avail_in == 0)
130  break;
131  more = strm.avail_in == CHUNK;
132  strm.next_in = put = dat;
133 
134  // Run that chunk of input through the inflate engine to exhaustion.
135  do {
136  // At this point it is assured that strm.avail_in > 0.
137 
138  // Inflate until the end of a gzip component (header, deflate
139  // block, trailer) is reached, or until all of the chunk is
140  // consumed. The resulting decompressed data is discarded, though
141  // the total size of the decompressed data in each member is
142  // tracked, for the calculation of the total CRC.
143  do {
144  // inflate and handle any errors
145  unsigned char scrap[CHUNK];
146  strm.avail_out = CHUNK;
147  strm.next_out = scrap;
148  int ret = inflate(&strm, Z_BLOCK);
149  if (ret == Z_MEM_ERROR)
150  BYE("out of memory");
151  if (ret == Z_DATA_ERROR)
152  BYE("input invalid: %s", strm.msg);
153  if (ret != Z_OK && ret != Z_BUF_ERROR && ret != Z_STREAM_END)
154  BYE("internal error");
155 
156  // Update the number of uncompressed bytes generated in this
157  // member. The actual count (not modulo 2^32) is required to
158  // correctly compute the total CRC.
159  unsigned got = CHUNK - strm.avail_out;
160  memb += got;
161  if (memb < got)
162  BYE("overflow error");
163 
164  // Continue to process this chunk until it is consumed, or
165  // until the end of a component (header, deflate block, or
166  // trailer) is reached.
167  } while (strm.avail_out == 0 && (strm.data_type & 0x80) == 0);
168 
169  // Since strm.avail_in was > 0 for the inflate call, some input was
170  // just consumed. It is therefore assured that put < strm.next_in.
171 
172  // Disposition the consumed component or part of a component.
173  switch (state) {
174  case BETWEEN:
175  state = HEAD;
176  // Fall through to HEAD when some or all of the header is
177  // processed.
178 
179  case HEAD:
180  // Discard the header.
181  if (strm.data_type & 0x80) {
182  // End of header reached -- deflate blocks follow.
183  put = strm.next_in;
184  prev = num;
185  memb = 0;
186  state = BLOCK;
187  }
188  break;
189 
190  case BLOCK:
191  // Copy the deflate stream to the output, but with the
192  // last-block-bit cleared. Re-synchronize stored block
193  // headers to the output byte boundaries. The bytes at
194  // put..strm.next_in-1 is the compressed data that has been
195  // processed and is ready to be copied to the output.
196 
197  // At this point, it is assured that new compressed data is
198  // available, i.e., put < strm.next_in. If prev is -1, then
199  // that compressed data starts in the middle of a deflate
200  // block. If prev is not -1, then the bits in the bit
201  // buffer, possibly combined with the bits in *put, contain
202  // the three-bit header of the new deflate block. In that
203  // case, prev is the number of bits from the previous block
204  // that remain in the bit buffer. Since num is the number
205  // of bits in the bit buffer, we have that num - prev is
206  // the number of bits from the new block currently in the
207  // bit buffer.
208 
209  // If strm.data_type & 0xc0 is 0x80, then the last byte of
210  // the available compressed data includes the last bits of
211  // the end of a deflate block. In that case, that last byte
212  // also has strm.data_type & 0x1f bits of the next deflate
213  // block, in the range 0..7. If strm.data_type & 0xc0 is
214  // 0xc0, then the last byte of the compressed data is the
215  // end of the deflate stream, followed by strm.data_type &
216  // 0x1f pad bits, also in the range 0..7.
217 
218  // Set bits to the number of bits not yet consumed from the
219  // last byte. If we are at the end of the block, bits is
220  // either the number of bits in the last byte belonging to
221  // the next block, or the number of pad bits after the
222  // final block. In either of those cases, bits is in the
223  // range 0..7.
224  ; // (required due to C syntax oddity)
225  int bits = strm.data_type & 0x1f;
226 
227  if (prev != -1) {
228  // We are at the start of a new block. Clear the last
229  // block bit, and check for special cases. If it is a
230  // stored block, then emit the header and pad to the
231  // next byte boundary. If it is a final, empty fixed
232  // block, then excise it.
233 
234  // Some or all of the three header bits for this block
235  // may already be in the bit buffer. Load any remaining
236  // header bits into the bit buffer.
237  if (num - prev < 3) {
238  buf += (unsigned long)*put++ << num;
239  num += 8;
240  }
241 
242  // Set last to have a 1 in the position of the last
243  // block bit in the bit buffer.
244  unsigned long last = (unsigned long)1 << prev;
245 
246  if (((buf >> prev) & 7) == 3) {
247  // This is a final fixed block. Load at least ten
248  // bits from this block, including the header, into
249  // the bit buffer. We already have at least three,
250  // so at most one more byte needs to be loaded.
251  if (num - prev < 10) {
252  if (put == strm.next_in)
253  // Need to go get and process more input.
254  // We'll end up back here to finish this.
255  break;
256  buf += (unsigned long)*put++ << num;
257  num += 8;
258  }
259  if (((buf >> prev) & 0x3ff) == 3) {
260  // That final fixed block is empty. Delete it
261  // to avoid adding an empty block every time a
262  // gzip stream is normalized.
263  num = prev;
264  buf &= last - 1; // zero the pad bits
265  }
266  }
267  else if (((buf >> prev) & 6) == 0) {
268  // This is a stored block. Flush to the next
269  // byte boundary after the three-bit header.
270  num = (prev + 10) & ~7;
271  buf &= last - 1; // zero the pad bits
272  }
273 
274  // Clear the last block bit.
275  buf &= ~last;
276 
277  // Write out complete bytes in the bit buffer.
278  while (num >= 8) {
279  putc(buf, out);
280  buf >>= 8;
281  num -= 8;
282  }
283 
284  // If no more bytes left to process, then we have
285  // consumed the byte that had bits from the next block.
286  if (put == strm.next_in)
287  bits = 0;
288  }
289 
290  // We are done handling the deflate block header. Now copy
291  // all or almost all of the remaining compressed data that
292  // has been processed so far. Don't copy one byte at the
293  // end if it contains bits from the next deflate block or
294  // pad bits at the end of a deflate block.
295 
296  // mix is 1 if we are at the end of a deflate block, and if
297  // some of the bits in the last byte follow this block. mix
298  // is 0 if we are in the middle of a deflate block, if the
299  // deflate block ended on a byte boundary, or if all of the
300  // compressed data processed so far has been consumed.
301  int mix = (strm.data_type & 0x80) && bits;
302 
303  // Copy all of the processed compressed data to the output,
304  // except for the last byte if it contains bits from the
305  // next deflate block or pad bits at the end of the deflate
306  // stream. Copy the data after shifting in num bits from
307  // buf in front of it, leaving num bits from the end of the
308  // compressed data in buf when done.
309  unsigned char *end = strm.next_in - mix;
310  if (put < end) {
311  if (num)
312  // Insert num bits from buf before the data being
313  // copied.
314  do {
315  buf += (unsigned)(*put++) << num;
316  putc(buf, out);
317  buf >>= 8;
318  } while (put < end);
319  else {
320  // No shifting needed -- write directly.
321  fwrite(put, 1, end - put, out);
322  put = end;
323  }
324  }
325 
326  // Process the last processed byte if it wasn't written.
327  if (mix) {
328  // Load the last byte into the bit buffer.
329  buf += (unsigned)(*put++) << num;
330  num += 8;
331 
332  if (strm.data_type & 0x40) {
333  // We are at the end of the deflate stream and
334  // there are bits pad bits. Discard the pad bits
335  // and write a byte to the output, if available.
336  // Leave the num bits left over in buf to prepend
337  // to the next deflate stream.
338  num -= bits;
339  if (num >= 8) {
340  putc(buf, out);
341  num -= 8;
342  buf >>= 8;
343  }
344 
345  // Force the pad bits in the bit buffer to zeros.
346  buf &= ((unsigned long)1 << num) - 1;
347 
348  // Don't need to set prev here since going to TAIL.
349  }
350  else
351  // At the end of an internal deflate block. Leave
352  // the last byte in the bit buffer to examine on
353  // the next entry to BLOCK, when more bits from the
354  // next block will be available.
355  prev = num - bits; // number of bits in buffer
356  // from current block
357  }
358 
359  // Don't have a byte left over, so we are in the middle of
360  // a deflate block, or the deflate block ended on a byte
361  // boundary. Set prev appropriately for the next entry into
362  // BLOCK.
363  else if (strm.data_type & 0x80)
364  // The block ended on a byte boundary, so no header
365  // bits are in the bit buffer.
366  prev = num;
367  else
368  // In the middle of a deflate block, so no header here.
369  prev = -1;
370 
371  // Check for the end of the deflate stream.
372  if ((strm.data_type & 0xc0) == 0xc0) {
373  // That ends the deflate stream on the input side, the
374  // pad bits were discarded, and any remaining bits from
375  // the last block in the stream are saved in the bit
376  // buffer to prepend to the next stream. Process the
377  // gzip trailer next.
378  tail = 0;
379  part = 0;
380  state = TAIL;
381  }
382  break;
383 
384  case TAIL:
385  // Accumulate available trailer bytes to update the total
386  // CRC and the total uncompressed length.
387  do {
388  part = (part >> 8) + ((unsigned long)(*put++) << 24);
389  tail++;
390  if (tail == 4) {
391  // Update the total CRC.
392  z_off_t len2 = memb;
393  if (len2 < 0 || (unsigned long long)len2 != memb)
394  BYE("overflow error");
395  crc = crc ? crc32_combine(crc, part, len2) : part;
396  part = 0;
397  }
398  else if (tail == 8) {
399  // Update the total uncompressed length. (It's ok
400  // if this sum is done modulo 2^32.)
401  len += part;
402 
403  // At the end of a member. Set up to inflate an
404  // immediately following gzip member. (If we made
405  // it this far, then the trailer was valid.)
406  if (inflateReset(&strm) != Z_OK)
407  BYE("internal error");
408  state = BETWEEN;
409  break;
410  }
411  } while (put < strm.next_in);
412  break;
413  }
414 
415  // Process the input buffer until completely consumed.
416  } while (strm.avail_in > 0);
417 
418  // Process input until end of file, invalid input, or i/o error.
419  } while (more);
420 
421  // Done with the inflate engine.
422  inflateEnd(&strm);
423 
424  // Verify the validity of the input.
425  if (state != BETWEEN)
426  BYE("input invalid: incomplete gzip stream");
427 
428  // Write the remaining deflate stream bits, followed by a terminating
429  // deflate fixed block.
430  buf += (unsigned long)3 << num;
431  putc(buf, out);
432  putc(buf >> 8, out);
433  if (num > 6)
434  putc(0, out);
435 
436  // Write the gzip trailer, which is the CRC and the uncompressed length
437  // modulo 2^32, both in little-endian order.
438  putc(crc, out);
439  putc(crc >> 8, out);
440  putc(crc >> 16, out);
441  putc(crc >> 24, out);
442  putc(len, out);
443  putc(len >> 8, out);
444  putc(len >> 16, out);
445  putc(len >> 24, out);
446  fflush(out);
447 
448  // Check for any i/o errors.
449  if (ferror(in) || ferror(out))
450  BYE("i/o error: %s", strerror(errno));
451 
452  // All good!
453  *err = NULL;
454  return 0;
455 }
456 
// Normalize the gzip stream on stdin, writing the result to stdout. The exit
// status is 0 on success. On failure it is 1, and a message is written to
// stderr.
int main(void) {
    // Avoid end-of-line conversions on evil operating systems.
    SET_BINARY_MODE(stdin);
    SET_BINARY_MODE(stdout);

    // Normalize from stdin to stdout, returning 1 on error, 0 if ok. The
    // error message can be NULL even on failure, if allocating it failed, so
    // never hand a NULL to the %s conversion.
    char *err = NULL;
    int ret = gzip_normalize(stdin, stdout, &err);
    if (ret)
        fprintf(stderr, "gznorm error: %s\n",
                err != NULL ? err : "(no details available)");
    free(err);
    return ret;
}
size_t len
Definition: 6502dis.c:15
static bool err
Definition: armass.c:435
int bits(struct state *s, int need)
Definition: blast.c:72
const lzma_allocator const uint8_t * in
Definition: block.h:527
const lzma_allocator const uint8_t size_t uint8_t * out
Definition: block.h:528
#define NULL
Definition: cris-opc.c:27
static static sync static getppid static getegid const char static filename char static len const char char static bufsiz static mask static vfork const void static prot static getpgrp const char static swapflags long
Definition: sflib.h:79
static lzma_stream strm
Definition: full_flush.c:20
#define local
Definition: gznorm.c:32
#define CHUNK
Definition: gznorm.c:67
char * aprintf(char *fmt,...)
Definition: gznorm.c:36
#define BYE(...)
Definition: gznorm.c:58
#define SET_BINARY_MODE(file)
Definition: gznorm.c:29
int gzip_normalize(FILE *in, FILE *out, char **err)
Definition: gznorm.c:86
int main(void)
Definition: gznorm.c:458
RZ_API void Ht_() free(HtName_(Ht) *ht)
Definition: ht_inc.c:130
int ZEXPORT inflate(z_streamp strm, int flush)
Definition: inflate.c:623
int ZEXPORT inflateReset(z_streamp strm)
Definition: inflate.c:145
int ZEXPORT inflateEnd(z_streamp strm)
Definition: inflate.c:1301
@ HEAD
Definition: inflate.h:21
voidpf void * buf
Definition: ioapi.h:138
vsnprintf
Definition: kernel.h:366
void * malloc(size_t size)
Definition: malloc.c:123
static static fork const void static count static fd const char const char static newpath char char char static envp time_t static t const char static mode static whence const char static dir time_t static t unsigned static seconds const char struct utimbuf static buf static inc static sig const char static mode static oldfd struct tms static buf static getgid static geteuid const char static filename static arg static mask struct ustat static ubuf static getppid static setsid static egid sigset_t static set struct timeval struct timezone static tz fd_set fd_set fd_set struct timeval static timeout const char char static bufsiz const char static swapflags void static offset const char static length static mode static who const char struct statfs static buf unsigned unsigned num
Definition: sflib.h:126
static void struct sockaddr socklen_t static fromlen static backlog static fork char char char static envp int struct rusage static rusage struct utsname static buf struct sembuf unsigned
Definition: sflib.h:97
int args
Definition: mipsasm.c:18
string FILE
Definition: benchmark.py:21
uint8_t * next_out
Definition: base.h:490
size_t avail_out
Definition: base.h:491
const uint8_t * next_in
Definition: base.h:486
size_t avail_in
Definition: base.h:487
Definition: dis.h:43
#define z_off_t
Definition: zconf.h:504
uLong ZEXPORT crc32_combine(uLong crc1, uLong crc2, z_off_t len2)
Definition: crc32.c:1084
#define Z_BUF_ERROR
Definition: zlib.h:184
#define inflateInit2(strm, windowBits)
Definition: zlib.h:1817
#define Z_BLOCK
Definition: zlib.h:173
#define Z_STREAM_END
Definition: zlib.h:178
#define Z_OK
Definition: zlib.h:177
#define Z_DATA_ERROR
Definition: zlib.h:182
#define Z_NULL
Definition: zlib.h:212
#define Z_MEM_ERROR
Definition: zlib.h:183