Rizin
unix-like reverse engineering framework and cli tools
zran.c
Go to the documentation of this file.
1 /* zran.c -- example of zlib/gzip stream indexing and random access
2  * Copyright (C) 2005, 2012, 2018 Mark Adler
3  * For conditions of distribution and use, see copyright notice in zlib.h
4  * Version 1.2 14 Oct 2018 Mark Adler */
5 
6 /* Version History:
7  1.0 29 May 2005 First version
8  1.1 29 Sep 2012 Fix memory reallocation error
9  1.2 14 Oct 2018 Handle gzip streams with multiple members
10  Add a header file to facilitate usage in applications
11  */
12 
13 /* Illustrate the use of Z_BLOCK, inflatePrime(), and inflateSetDictionary()
14  for random access of a compressed file. A file containing a zlib or gzip
15  stream is provided on the command line. The compressed stream is decoded in
16  its entirety, and an index built with access points about every SPAN bytes
17  in the uncompressed output. The compressed file is left open, and can then
18  be read randomly, having to decompress on the average SPAN/2 uncompressed
19  bytes before getting to the desired block of data.
20 
21  An access point can be created at the start of any deflate block, by saving
22  the starting file offset and bit of that block, and the 32K bytes of
23  uncompressed data that precede that block. Also the uncompressed offset of
24  that block is saved to provide a referece for locating a desired starting
25  point in the uncompressed stream. deflate_index_build() works by
26  decompressing the input zlib or gzip stream a block at a time, and at the
27  end of each block deciding if enough uncompressed data has gone by to
28  justify the creation of a new access point. If so, that point is saved in a
29  data structure that grows as needed to accommodate the points.
30 
31  To use the index, an offset in the uncompressed data is provided, for which
32  the latest access point at or preceding that offset is located in the index.
33  The input file is positioned to the specified location in the index, and if
34  necessary the first few bits of the compressed data is read from the file.
35  inflate is initialized with those bits and the 32K of uncompressed data, and
36  the decompression then proceeds until the desired offset in the file is
37  reached. Then the decompression continues to read the desired uncompressed
38  data from the file.
39 
40  Another approach would be to generate the index on demand. In that case,
41  requests for random access reads from the compressed data would try to use
42  the index, but if a read far enough past the end of the index is required,
43  then further index entries would be generated and added.
44 
45  There is some fair bit of overhead to starting inflation for the random
46  access, mainly copying the 32K byte dictionary. So if small pieces of the
47  file are being accessed, it would make sense to implement a cache to hold
48  some lookahead and avoid many calls to deflate_index_extract() for small
49  lengths.
50 
51  Another way to build an index would be to use inflateCopy(). That would
52  not be constrained to have access points at block boundaries, but requires
53  more memory per access point, and also cannot be saved to file due to the
54  use of pointers in the state. The approach here allows for storage of the
55  index in a file.
56  */
57 
58 #include <stdio.h>
59 #include <stdlib.h>
60 #include <string.h>
61 #include "zlib.h"
62 #include "zran.h"
63 
64 #define WINSIZE 32768U /* sliding window size */
65 #define CHUNK 16384 /* file input buffer size */
66 
67 /* Access point entry. */
68 struct point {
69  off_t out; /* corresponding offset in uncompressed data */
70  off_t in; /* offset in input file of first full byte */
71  int bits; /* number of bits (1-7) from byte at in-1, or 0 */
72  unsigned char window[WINSIZE]; /* preceding 32K of uncompressed data */
73 };
74 
75 /* See comments in zran.h. */
76 void deflate_index_free(struct deflate_index *index)
77 {
78  if (index != NULL) {
79  free(index->list);
80  free(index);
81  }
82 }
83 
84 /* Add an entry to the access point list. If out of memory, deallocate the
85  existing list and return NULL. index->gzip is the allocated size of the
86  index in point entries, until it is time for deflate_index_build() to
87  return, at which point gzip is set to indicate a gzip file or not.
88  */
89 static struct deflate_index *addpoint(struct deflate_index *index, int bits,
90  off_t in, off_t out, unsigned left,
91  unsigned char *window)
92 {
93  struct point *next;
94 
95  /* if list is empty, create it (start with eight points) */
96  if (index == NULL) {
97  index = malloc(sizeof(struct deflate_index));
98  if (index == NULL) return NULL;
99  index->list = malloc(sizeof(struct point) << 3);
100  if (index->list == NULL) {
101  free(index);
102  return NULL;
103  }
104  index->gzip = 8;
105  index->have = 0;
106  }
107 
108  /* if list is full, make it bigger */
109  else if (index->have == index->gzip) {
110  index->gzip <<= 1;
111  next = realloc(index->list, sizeof(struct point) * index->gzip);
112  if (next == NULL) {
113  deflate_index_free(index);
114  return NULL;
115  }
116  index->list = next;
117  }
118 
119  /* fill in entry and increment how many we have */
120  next = (struct point *)(index->list) + index->have;
121  next->bits = bits;
122  next->in = in;
123  next->out = out;
124  if (left)
125  memcpy(next->window, window + WINSIZE - left, left);
126  if (left < WINSIZE)
127  memcpy(next->window + left, window, WINSIZE - left);
128  index->have++;
129 
130  /* return list, possibly reallocated */
131  return index;
132 }
133 
134 /* See comments in zran.h. */
135 int deflate_index_build(FILE *in, off_t span, struct deflate_index **built)
136 {
137  int ret;
138  int gzip = 0; /* true if reading a gzip file */
139  off_t totin, totout; /* our own total counters to avoid 4GB limit */
140  off_t last; /* totout value of last access point */
141  struct deflate_index *index; /* access points being generated */
142  z_stream strm;
143  unsigned char input[CHUNK];
144  unsigned char window[WINSIZE];
145 
146  /* initialize inflate */
147  strm.zalloc = Z_NULL;
148  strm.zfree = Z_NULL;
149  strm.opaque = Z_NULL;
150  strm.avail_in = 0;
151  strm.next_in = Z_NULL;
152  ret = inflateInit2(&strm, 47); /* automatic zlib or gzip decoding */
153  if (ret != Z_OK)
154  return ret;
155 
156  /* inflate the input, maintain a sliding window, and build an index -- this
157  also validates the integrity of the compressed data using the check
158  information in the gzip or zlib stream */
159  totin = totout = last = 0;
160  index = NULL; /* will be allocated by first addpoint() */
161  strm.avail_out = 0;
162  do {
163  /* get some compressed data from input file */
164  strm.avail_in = fread(input, 1, CHUNK, in);
165  if (ferror(in)) {
166  ret = Z_ERRNO;
167  goto deflate_index_build_error;
168  }
169  if (strm.avail_in == 0) {
170  ret = Z_DATA_ERROR;
171  goto deflate_index_build_error;
172  }
173  strm.next_in = input;
174 
175  /* check for a gzip stream */
176  if (totin == 0 && strm.avail_in >= 3 &&
177  input[0] == 31 && input[1] == 139 && input[2] == 8)
178  gzip = 1;
179 
180  /* process all of that, or until end of stream */
181  do {
182  /* reset sliding window if necessary */
183  if (strm.avail_out == 0) {
185  strm.next_out = window;
186  }
187 
188  /* inflate until out of input, output, or at end of block --
189  update the total input and output counters */
190  totin += strm.avail_in;
191  totout += strm.avail_out;
192  ret = inflate(&strm, Z_BLOCK); /* return at end of block */
193  totin -= strm.avail_in;
194  totout -= strm.avail_out;
195  if (ret == Z_NEED_DICT)
196  ret = Z_DATA_ERROR;
197  if (ret == Z_MEM_ERROR || ret == Z_DATA_ERROR)
198  goto deflate_index_build_error;
199  if (ret == Z_STREAM_END) {
200  if (gzip &&
201  (strm.avail_in || ungetc(getc(in), in) != EOF)) {
202  ret = inflateReset(&strm);
203  if (ret != Z_OK)
204  goto deflate_index_build_error;
205  continue;
206  }
207  break;
208  }
209 
210  /* if at end of block, consider adding an index entry (note that if
211  data_type indicates an end-of-block, then all of the
212  uncompressed data from that block has been delivered, and none
213  of the compressed data after that block has been consumed,
214  except for up to seven bits) -- the totout == 0 provides an
215  entry point after the zlib or gzip header, and assures that the
216  index always has at least one access point; we avoid creating an
217  access point after the last block by checking bit 6 of data_type
218  */
219  if ((strm.data_type & 128) && !(strm.data_type & 64) &&
220  (totout == 0 || totout - last > span)) {
221  index = addpoint(index, strm.data_type & 7, totin,
222  totout, strm.avail_out, window);
223  if (index == NULL) {
224  ret = Z_MEM_ERROR;
225  goto deflate_index_build_error;
226  }
227  last = totout;
228  }
229  } while (strm.avail_in != 0);
230  } while (ret != Z_STREAM_END);
231 
232  /* clean up and return index (release unused entries in list) */
233  (void)inflateEnd(&strm);
234  index->list = realloc(index->list, sizeof(struct point) * index->have);
235  index->gzip = gzip;
236  index->length = totout;
237  *built = index;
238  return index->have;
239 
240  /* return error */
241  deflate_index_build_error:
242  (void)inflateEnd(&strm);
243  deflate_index_free(index);
244  return ret;
245 }
246 
247 /* See comments in zran.h. */
249  unsigned char *buf, int len)
250 {
251  int ret, skip;
252  z_stream strm;
253  struct point *here;
254  unsigned char input[CHUNK];
255  unsigned char discard[WINSIZE];
256 
257  /* proceed only if something reasonable to do */
258  if (len < 0)
259  return 0;
260 
261  /* find where in stream to start */
262  here = index->list;
263  ret = index->have;
264  while (--ret && here[1].out <= offset)
265  here++;
266 
267  /* initialize file and inflate state to start there */
268  strm.zalloc = Z_NULL;
269  strm.zfree = Z_NULL;
270  strm.opaque = Z_NULL;
271  strm.avail_in = 0;
272  strm.next_in = Z_NULL;
273  ret = inflateInit2(&strm, -15); /* raw inflate */
274  if (ret != Z_OK)
275  return ret;
276  ret = fseeko(in, here->in - (here->bits ? 1 : 0), SEEK_SET);
277  if (ret == -1)
278  goto deflate_index_extract_ret;
279  if (here->bits) {
280  ret = getc(in);
281  if (ret == -1) {
282  ret = ferror(in) ? Z_ERRNO : Z_DATA_ERROR;
283  goto deflate_index_extract_ret;
284  }
285  (void)inflatePrime(&strm, here->bits, ret >> (8 - here->bits));
286  }
287  (void)inflateSetDictionary(&strm, here->window, WINSIZE);
288 
289  /* skip uncompressed bytes until offset reached, then satisfy request */
290  offset -= here->out;
291  strm.avail_in = 0;
292  skip = 1; /* while skipping to offset */
293  do {
294  /* define where to put uncompressed data, and how much */
295  if (offset > WINSIZE) { /* skip WINSIZE bytes */
297  strm.next_out = discard;
298  offset -= WINSIZE;
299  }
300  else if (offset > 0) { /* last skip */
302  strm.next_out = discard;
303  offset = 0;
304  }
305  else if (skip) { /* at offset now */
306  strm.avail_out = len;
307  strm.next_out = buf;
308  skip = 0; /* only do this once */
309  }
310 
311  /* uncompress until avail_out filled, or end of stream */
312  do {
313  if (strm.avail_in == 0) {
314  strm.avail_in = fread(input, 1, CHUNK, in);
315  if (ferror(in)) {
316  ret = Z_ERRNO;
317  goto deflate_index_extract_ret;
318  }
319  if (strm.avail_in == 0) {
320  ret = Z_DATA_ERROR;
321  goto deflate_index_extract_ret;
322  }
323  strm.next_in = input;
324  }
325  ret = inflate(&strm, Z_NO_FLUSH); /* normal inflate */
326  if (ret == Z_NEED_DICT)
327  ret = Z_DATA_ERROR;
328  if (ret == Z_MEM_ERROR || ret == Z_DATA_ERROR)
329  goto deflate_index_extract_ret;
330  if (ret == Z_STREAM_END) {
331  /* the raw deflate stream has ended */
332  if (index->gzip == 0)
333  /* this is a zlib stream that has ended -- done */
334  break;
335 
336  /* near the end of a gzip member, which might be followed by
337  another gzip member -- skip the gzip trailer and see if
338  there is more input after it */
339  if (strm.avail_in < 8) {
340  fseeko(in, 8 - strm.avail_in, SEEK_CUR);
341  strm.avail_in = 0;
342  }
343  else {
344  strm.avail_in -= 8;
345  strm.next_in += 8;
346  }
347  if (strm.avail_in == 0 && ungetc(getc(in), in) == EOF)
348  /* the input ended after the gzip trailer -- done */
349  break;
350 
351  /* there is more input, so another gzip member should follow --
352  validate and skip the gzip header */
353  ret = inflateReset2(&strm, 31);
354  if (ret != Z_OK)
355  goto deflate_index_extract_ret;
356  do {
357  if (strm.avail_in == 0) {
358  strm.avail_in = fread(input, 1, CHUNK, in);
359  if (ferror(in)) {
360  ret = Z_ERRNO;
361  goto deflate_index_extract_ret;
362  }
363  if (strm.avail_in == 0) {
364  ret = Z_DATA_ERROR;
365  goto deflate_index_extract_ret;
366  }
367  strm.next_in = input;
368  }
369  ret = inflate(&strm, Z_BLOCK);
370  if (ret == Z_MEM_ERROR || ret == Z_DATA_ERROR)
371  goto deflate_index_extract_ret;
372  } while ((strm.data_type & 128) == 0);
373 
374  /* set up to continue decompression of the raw deflate stream
375  that follows the gzip header */
376  ret = inflateReset2(&strm, -15);
377  if (ret != Z_OK)
378  goto deflate_index_extract_ret;
379  }
380 
381  /* continue to process the available input before reading more */
382  } while (strm.avail_out != 0);
383 
384  if (ret == Z_STREAM_END)
385  /* reached the end of the compressed data -- return the data that
386  was available, possibly less than requested */
387  break;
388 
389  /* do until offset reached and requested data read */
390  } while (skip);
391 
392  /* compute the number of uncompressed bytes read after the offset */
393  ret = skip ? 0 : len - strm.avail_out;
394 
395  /* clean up and return the bytes read, or the negative error */
396  deflate_index_extract_ret:
397  (void)inflateEnd(&strm);
398  return ret;
399 }
400 
401 #ifdef TEST
402 
403 #define SPAN 1048576L /* desired distance between access points */
404 #define LEN 16384 /* number of bytes to extract */
405 
406 /* Demonstrate the use of deflate_index_build() and deflate_index_extract() by
407  processing the file provided on the command line, and extracting LEN bytes
408  from 2/3rds of the way through the uncompressed output, writing that to
409  stdout. An offset can be provided as the second argument, in which case the
410  data is extracted from there instead. */
411 int main(int argc, char **argv)
412 {
413  int len;
414  off_t offset = -1;
415  FILE *in;
416  struct deflate_index *index = NULL;
417  unsigned char buf[LEN];
418 
419  /* open input file */
420  if (argc < 2 || argc > 3) {
421  fprintf(stderr, "usage: zran file.gz [offset]\n");
422  return 1;
423  }
424  in = fopen(argv[1], "rb");
425  if (in == NULL) {
426  fprintf(stderr, "zran: could not open %s for reading\n", argv[1]);
427  return 1;
428  }
429 
430  /* get optional offset */
431  if (argc == 3) {
432  char *end;
433  offset = strtoll(argv[2], &end, 10);
434  if (*end || offset < 0) {
435  fprintf(stderr, "zran: %s is not a valid offset\n", argv[2]);
436  return 1;
437  }
438  }
439 
440  /* build index */
441  len = deflate_index_build(in, SPAN, &index);
442  if (len < 0) {
443  fclose(in);
444  switch (len) {
445  case Z_MEM_ERROR:
446  fprintf(stderr, "zran: out of memory\n");
447  break;
448  case Z_DATA_ERROR:
449  fprintf(stderr, "zran: compressed data error in %s\n", argv[1]);
450  break;
451  case Z_ERRNO:
452  fprintf(stderr, "zran: read error on %s\n", argv[1]);
453  break;
454  default:
455  fprintf(stderr, "zran: error %d while building index\n", len);
456  }
457  return 1;
458  }
459  fprintf(stderr, "zran: built index with %d access points\n", len);
460 
461  /* use index by reading some bytes from an arbitrary offset */
462  if (offset == -1)
463  offset = (index->length << 1) / 3;
464  len = deflate_index_extract(in, index, offset, buf, LEN);
465  if (len < 0)
466  fprintf(stderr, "zran: extraction failed: %s error\n",
467  len == Z_MEM_ERROR ? "out of memory" : "input corrupted");
468  else {
469  fwrite(buf, 1, len, stdout);
470  fprintf(stderr, "zran: extracted %d bytes at %llu\n", len, offset);
471  }
472 
473  /* clean up and exit */
474  deflate_index_free(index);
475  fclose(in);
476  return 0;
477 }
478 
479 #endif
size_t len
Definition: 6502dis.c:15
int bits(struct state *s, int need)
Definition: blast.c:72
const lzma_allocator const uint8_t * in
Definition: block.h:527
const lzma_allocator const uint8_t size_t uint8_t * out
Definition: block.h:528
#define fseeko(s, o, w)
Definition: compat.h:121
#define NULL
Definition: cris-opc.c:27
static lzma_stream strm
Definition: full_flush.c:20
void skip(file *in, unsigned n)
Definition: gzappend.c:202
RZ_API void Ht_() free(HtName_(Ht) *ht)
Definition: ht_inc.c:130
@ LEN
Definition: inflate9.h:16
int ZEXPORT inflateSetDictionary(z_streamp strm, const Bytef *dictionary, uInt dictLength)
Definition: inflate.c:1338
int ZEXPORT inflatePrime(z_streamp strm, int bits, int value)
Definition: inflate.c:248
int ZEXPORT inflate(z_streamp strm, int flush)
Definition: inflate.c:623
int ZEXPORT inflateReset(z_streamp strm)
Definition: inflate.c:145
int ZEXPORT inflateEnd(z_streamp strm)
Definition: inflate.c:1301
int ZEXPORT inflateReset2(z_streamp strm, int windowBits)
Definition: inflate.c:158
voidpf uLong offset
Definition: ioapi.h:144
voidpf void * buf
Definition: ioapi.h:138
memcpy(mem, inblock.get(), min(CONTAINING_RECORD(inblock.get(), MEMBLOCK, data) ->size, size))
void * realloc(void *ptr, size_t size)
Definition: malloc.c:144
void * malloc(size_t size)
Definition: malloc.c:123
static static fork const void static count static fd const char const char static newpath char char argv
Definition: sflib.h:40
static void struct sockaddr socklen_t static fromlen static backlog static fork char char char static envp int struct rusage static rusage struct utsname static buf struct sembuf unsigned
Definition: sflib.h:97
string FILE
Definition: benchmark.py:21
int main(int argc, char **argv)
Definition: rz-bb.c:29
int off_t
Definition: sftypes.h:41
int gzip
Definition: zran.h:12
int have
Definition: zran.h:11
void * list
Definition: zran.h:15
off_t length
Definition: zran.h:14
uint8_t * next_out
Definition: base.h:490
size_t avail_out
Definition: base.h:491
const uint8_t * next_in
Definition: base.h:486
size_t avail_in
Definition: base.h:487
Definition: zran.c:68
off_t in
Definition: zran.c:70
unsigned char window[WINSIZE]
Definition: zran.c:72
int bits
Definition: zran.c:71
off_t out
Definition: zran.c:69
struct _window window
#define SEEK_SET
Definition: zip.c:88
#define SEEK_CUR
Definition: zip.c:80
static bool input(void *ud, zip_uint8_t *data, zip_uint64_t length)
#define Z_NEED_DICT
Definition: zlib.h:179
#define Z_ERRNO
Definition: zlib.h:180
#define inflateInit2(strm, windowBits)
Definition: zlib.h:1817
#define Z_BLOCK
Definition: zlib.h:173
#define Z_STREAM_END
Definition: zlib.h:178
#define Z_OK
Definition: zlib.h:177
#define Z_DATA_ERROR
Definition: zlib.h:182
#define Z_NO_FLUSH
Definition: zlib.h:168
#define Z_NULL
Definition: zlib.h:212
#define Z_MEM_ERROR
Definition: zlib.h:183
#define CHUNK
Definition: zran.c:65
void deflate_index_free(struct deflate_index *index)
Definition: zran.c:76
int deflate_index_extract(FILE *in, struct deflate_index *index, off_t offset, unsigned char *buf, int len)
Definition: zran.c:248
static struct deflate_index * addpoint(struct deflate_index *index, int bits, off_t in, off_t out, unsigned left, unsigned char *window)
Definition: zran.c:89
int deflate_index_build(FILE *in, off_t span, struct deflate_index **built)
Definition: zran.c:135
#define WINSIZE
Definition: zran.c:64