Rizin
unix-like reverse engineering framework and cli tools
lz_encoder_mf.c
Go to the documentation of this file.
1 //
6 // Authors: Igor Pavlov
7 // Lasse Collin
8 //
9 // This file has been put into the public domain.
10 // You can do whatever you want with this file.
11 //
13 
14 #include "lz_encoder.h"
15 #include "lz_encoder_hash.h"
16 #include "memcmplen.h"
17 
18 
22 extern uint32_t
23 lzma_mf_find(lzma_mf *mf, uint32_t *count_ptr, lzma_match *matches)
24 {
25  // Call the match finder. It returns the number of length-distance
26  // pairs found.
27  // FIXME: Minimum count is zero, what _exactly_ is the maximum?
28  const uint32_t count = mf->find(mf, matches);
29 
30  // Length of the longest match; assume that no matches were found
31  // and thus the maximum length is zero.
32  uint32_t len_best = 0;
33 
34  if (count > 0) {
35 #ifndef NDEBUG
36  // Validate the matches.
37  for (uint32_t i = 0; i < count; ++i) {
38  assert(matches[i].len <= mf->nice_len);
39  assert(matches[i].dist < mf->read_pos);
40  assert(memcmp(mf_ptr(mf) - 1,
41  mf_ptr(mf) - matches[i].dist - 2,
42  matches[i].len) == 0);
43  }
44 #endif
45 
46  // The last used element in the array contains
47  // the longest match.
48  len_best = matches[count - 1].len;
49 
50  // If a match of maximum search length was found, try to
51  // extend the match to maximum possible length.
52  if (len_best == mf->nice_len) {
53  // The limit for the match length is either the
54  // maximum match length supported by the LZ-based
55  // encoder or the number of bytes left in the
56  // dictionary, whichever is smaller.
57  uint32_t limit = mf_avail(mf) + 1;
58  if (limit > mf->match_len_max)
59  limit = mf->match_len_max;
60 
61  // Pointer to the byte we just ran through
62  // the match finder.
63  const uint8_t *p1 = mf_ptr(mf) - 1;
64 
65  // Pointer to the beginning of the match. We need -1
66  // here because the match distances are zero based.
67  const uint8_t *p2 = p1 - matches[count - 1].dist - 1;
68 
69  len_best = lzma_memcmplen(p1, p2, len_best, limit);
70  }
71  }
72 
73  *count_ptr = count;
74 
75  // Finally update the read position to indicate that match finder was
76  // run for this dictionary offset.
77  ++mf->read_ahead;
78 
79  return len_best;
80 }
81 
82 
86 #define EMPTY_HASH_VALUE 0
87 
88 
91 #define MUST_NORMALIZE_POS UINT32_MAX
92 
93 
107 static void
109 {
111 
112  // In future we may not want to touch the lowest bits, because there
113  // may be match finders that use larger resolution than one byte.
114  const uint32_t subvalue
116  // & ~((UINT32_C(1) << 10) - 1);
117 
118  for (uint32_t i = 0; i < mf->hash_count; ++i) {
119  // If the distance is greater than the dictionary size,
120  // we can simply mark the hash element as empty.
121  if (mf->hash[i] <= subvalue)
122  mf->hash[i] = EMPTY_HASH_VALUE;
123  else
124  mf->hash[i] -= subvalue;
125  }
126 
127  for (uint32_t i = 0; i < mf->sons_count; ++i) {
128  // Do the same for mf->son.
129  //
130  // NOTE: There may be uninitialized elements in mf->son.
131  // Valgrind may complain that the "if" below depends on
132  // an uninitialized value. In this case it is safe to ignore
133  // the warning. See also the comments in lz_encoder_init()
134  // in lz_encoder.c.
135  if (mf->son[i] <= subvalue)
136  mf->son[i] = EMPTY_HASH_VALUE;
137  else
138  mf->son[i] -= subvalue;
139  }
140 
141  // Update offset to match the new locations.
142  mf->offset -= subvalue;
143 
144  return;
145 }
146 
147 
149 static void
151 {
152  if (++mf->cyclic_pos == mf->cyclic_size)
153  mf->cyclic_pos = 0;
154 
155  ++mf->read_pos;
156  assert(mf->read_pos <= mf->write_pos);
157 
158  if (unlikely(mf->read_pos + mf->offset == UINT32_MAX))
159  normalize(mf);
160 }
161 
162 
177 static void
179 {
180  ++mf->read_pos;
181  assert(mf->read_pos <= mf->write_pos);
182  ++mf->pending;
183 }
184 
185 
191 #define header(is_bt, len_min, ret_op) \
192  uint32_t len_limit = mf_avail(mf); \
193  if (mf->nice_len <= len_limit) { \
194  len_limit = mf->nice_len; \
195  } else if (len_limit < (len_min) \
196  || (is_bt && mf->action == LZMA_SYNC_FLUSH)) { \
197  assert(mf->action != LZMA_RUN); \
198  move_pending(mf); \
199  ret_op; \
200  } \
201  const uint8_t *cur = mf_ptr(mf); \
202  const uint32_t pos = mf->read_pos + mf->offset
203 
204 
207 #define header_find(is_bt, len_min) \
208  header(is_bt, len_min, return 0); \
209  uint32_t matches_count = 0
210 
211 
214 #define header_skip(is_bt, len_min) \
215  header(is_bt, len_min, continue)
216 
217 
221 #define call_find(func, len_best) \
222 do { \
223  matches_count = func(len_limit, pos, cur, cur_match, mf->depth, \
224  mf->son, mf->cyclic_pos, mf->cyclic_size, \
225  matches + matches_count, len_best) \
226  - matches; \
227  move_pos(mf); \
228  return matches_count; \
229 } while (0)
230 
231 
233 // Hash Chain //
235 
236 #if defined(HAVE_MF_HC3) || defined(HAVE_MF_HC4)
249 static lzma_match *
250 hc_find_func(
251  const uint32_t len_limit,
252  const uint32_t pos,
253  const uint8_t *const cur,
254  uint32_t cur_match,
255  uint32_t depth,
256  uint32_t *const son,
257  const uint32_t cyclic_pos,
258  const uint32_t cyclic_size,
259  lzma_match *matches,
260  uint32_t len_best)
261 {
262  son[cyclic_pos] = cur_match;
263 
264  while (true) {
265  const uint32_t delta = pos - cur_match;
266  if (depth-- == 0 || delta >= cyclic_size)
267  return matches;
268 
269  const uint8_t *const pb = cur - delta;
270  cur_match = son[cyclic_pos - delta
271  + (delta > cyclic_pos ? cyclic_size : 0)];
272 
273  if (pb[len_best] == cur[len_best] && pb[0] == cur[0]) {
274  uint32_t len = lzma_memcmplen(pb, cur, 1, len_limit);
275 
276  if (len_best < len) {
277  len_best = len;
278  matches->len = len;
279  matches->dist = delta - 1;
280  ++matches;
281 
282  if (len == len_limit)
283  return matches;
284  }
285  }
286  }
287 }
288 
289 
290 #define hc_find(len_best) \
291  call_find(hc_find_func, len_best)
292 
293 
294 #define hc_skip() \
295 do { \
296  mf->son[mf->cyclic_pos] = cur_match; \
297  move_pos(mf); \
298 } while (0)
299 
300 #endif
301 
302 
303 #ifdef HAVE_MF_HC3
304 extern uint32_t
305 lzma_mf_hc3_find(lzma_mf *mf, lzma_match *matches)
306 {
307  header_find(false, 3);
308 
309  hash_3_calc();
310 
311  const uint32_t delta2 = pos - mf->hash[hash_2_value];
312  const uint32_t cur_match = mf->hash[FIX_3_HASH_SIZE + hash_value];
313 
314  mf->hash[hash_2_value] = pos;
315  mf->hash[FIX_3_HASH_SIZE + hash_value] = pos;
316 
317  uint32_t len_best = 2;
318 
319  if (delta2 < mf->cyclic_size && *(cur - delta2) == *cur) {
320  len_best = lzma_memcmplen(cur - delta2, cur,
321  len_best, len_limit);
322 
323  matches[0].len = len_best;
324  matches[0].dist = delta2 - 1;
325  matches_count = 1;
326 
327  if (len_best == len_limit) {
328  hc_skip();
329  return 1; // matches_count
330  }
331  }
332 
333  hc_find(len_best);
334 }
335 
336 
337 extern void
338 lzma_mf_hc3_skip(lzma_mf *mf, uint32_t amount)
339 {
340  do {
341  if (mf_avail(mf) < 3) {
342  move_pending(mf);
343  continue;
344  }
345 
346  const uint8_t *cur = mf_ptr(mf);
347  const uint32_t pos = mf->read_pos + mf->offset;
348 
349  hash_3_calc();
350 
351  const uint32_t cur_match
352  = mf->hash[FIX_3_HASH_SIZE + hash_value];
353 
354  mf->hash[hash_2_value] = pos;
355  mf->hash[FIX_3_HASH_SIZE + hash_value] = pos;
356 
357  hc_skip();
358 
359  } while (--amount != 0);
360 }
361 #endif
362 
363 
364 #ifdef HAVE_MF_HC4
365 extern uint32_t
366 lzma_mf_hc4_find(lzma_mf *mf, lzma_match *matches)
367 {
368  header_find(false, 4);
369 
370  hash_4_calc();
371 
372  uint32_t delta2 = pos - mf->hash[hash_2_value];
373  const uint32_t delta3
374  = pos - mf->hash[FIX_3_HASH_SIZE + hash_3_value];
375  const uint32_t cur_match = mf->hash[FIX_4_HASH_SIZE + hash_value];
376 
377  mf->hash[hash_2_value ] = pos;
378  mf->hash[FIX_3_HASH_SIZE + hash_3_value] = pos;
379  mf->hash[FIX_4_HASH_SIZE + hash_value] = pos;
380 
381  uint32_t len_best = 1;
382 
383  if (delta2 < mf->cyclic_size && *(cur - delta2) == *cur) {
384  len_best = 2;
385  matches[0].len = 2;
386  matches[0].dist = delta2 - 1;
387  matches_count = 1;
388  }
389 
390  if (delta2 != delta3 && delta3 < mf->cyclic_size
391  && *(cur - delta3) == *cur) {
392  len_best = 3;
393  matches[matches_count++].dist = delta3 - 1;
394  delta2 = delta3;
395  }
396 
397  if (matches_count != 0) {
398  len_best = lzma_memcmplen(cur - delta2, cur,
399  len_best, len_limit);
400 
401  matches[matches_count - 1].len = len_best;
402 
403  if (len_best == len_limit) {
404  hc_skip();
405  return matches_count;
406  }
407  }
408 
409  if (len_best < 3)
410  len_best = 3;
411 
412  hc_find(len_best);
413 }
414 
415 
416 extern void
417 lzma_mf_hc4_skip(lzma_mf *mf, uint32_t amount)
418 {
419  do {
420  if (mf_avail(mf) < 4) {
421  move_pending(mf);
422  continue;
423  }
424 
425  const uint8_t *cur = mf_ptr(mf);
426  const uint32_t pos = mf->read_pos + mf->offset;
427 
428  hash_4_calc();
429 
430  const uint32_t cur_match
431  = mf->hash[FIX_4_HASH_SIZE + hash_value];
432 
433  mf->hash[hash_2_value] = pos;
434  mf->hash[FIX_3_HASH_SIZE + hash_3_value] = pos;
435  mf->hash[FIX_4_HASH_SIZE + hash_value] = pos;
436 
437  hc_skip();
438 
439  } while (--amount != 0);
440 }
441 #endif
442 
443 
445 // Binary Tree //
447 
448 #if defined(HAVE_MF_BT2) || defined(HAVE_MF_BT3) || defined(HAVE_MF_BT4)
449 static lzma_match *
450 bt_find_func(
451  const uint32_t len_limit,
452  const uint32_t pos,
453  const uint8_t *const cur,
454  uint32_t cur_match,
455  uint32_t depth,
456  uint32_t *const son,
457  const uint32_t cyclic_pos,
458  const uint32_t cyclic_size,
459  lzma_match *matches,
460  uint32_t len_best)
461 {
462  uint32_t *ptr0 = son + (cyclic_pos << 1) + 1;
463  uint32_t *ptr1 = son + (cyclic_pos << 1);
464 
465  uint32_t len0 = 0;
466  uint32_t len1 = 0;
467 
468  while (true) {
469  const uint32_t delta = pos - cur_match;
470  if (depth-- == 0 || delta >= cyclic_size) {
471  *ptr0 = EMPTY_HASH_VALUE;
472  *ptr1 = EMPTY_HASH_VALUE;
473  return matches;
474  }
475 
476  uint32_t *const pair = son + ((cyclic_pos - delta
477  + (delta > cyclic_pos ? cyclic_size : 0))
478  << 1);
479 
480  const uint8_t *const pb = cur - delta;
481  uint32_t len = my_min(len0, len1);
482 
483  if (pb[len] == cur[len]) {
484  len = lzma_memcmplen(pb, cur, len + 1, len_limit);
485 
486  if (len_best < len) {
487  len_best = len;
488  matches->len = len;
489  matches->dist = delta - 1;
490  ++matches;
491 
492  if (len == len_limit) {
493  *ptr1 = pair[0];
494  *ptr0 = pair[1];
495  return matches;
496  }
497  }
498  }
499 
500  if (pb[len] < cur[len]) {
501  *ptr1 = cur_match;
502  ptr1 = pair + 1;
503  cur_match = *ptr1;
504  len1 = len;
505  } else {
506  *ptr0 = cur_match;
507  ptr0 = pair;
508  cur_match = *ptr0;
509  len0 = len;
510  }
511  }
512 }
513 
514 
515 static void
516 bt_skip_func(
517  const uint32_t len_limit,
518  const uint32_t pos,
519  const uint8_t *const cur,
520  uint32_t cur_match,
521  uint32_t depth,
522  uint32_t *const son,
523  const uint32_t cyclic_pos,
524  const uint32_t cyclic_size)
525 {
526  uint32_t *ptr0 = son + (cyclic_pos << 1) + 1;
527  uint32_t *ptr1 = son + (cyclic_pos << 1);
528 
529  uint32_t len0 = 0;
530  uint32_t len1 = 0;
531 
532  while (true) {
533  const uint32_t delta = pos - cur_match;
534  if (depth-- == 0 || delta >= cyclic_size) {
535  *ptr0 = EMPTY_HASH_VALUE;
536  *ptr1 = EMPTY_HASH_VALUE;
537  return;
538  }
539 
540  uint32_t *pair = son + ((cyclic_pos - delta
541  + (delta > cyclic_pos ? cyclic_size : 0))
542  << 1);
543  const uint8_t *pb = cur - delta;
544  uint32_t len = my_min(len0, len1);
545 
546  if (pb[len] == cur[len]) {
547  len = lzma_memcmplen(pb, cur, len + 1, len_limit);
548 
549  if (len == len_limit) {
550  *ptr1 = pair[0];
551  *ptr0 = pair[1];
552  return;
553  }
554  }
555 
556  if (pb[len] < cur[len]) {
557  *ptr1 = cur_match;
558  ptr1 = pair + 1;
559  cur_match = *ptr1;
560  len1 = len;
561  } else {
562  *ptr0 = cur_match;
563  ptr0 = pair;
564  cur_match = *ptr0;
565  len0 = len;
566  }
567  }
568 }
569 
570 
571 #define bt_find(len_best) \
572  call_find(bt_find_func, len_best)
573 
574 #define bt_skip() \
575 do { \
576  bt_skip_func(len_limit, pos, cur, cur_match, mf->depth, \
577  mf->son, mf->cyclic_pos, \
578  mf->cyclic_size); \
579  move_pos(mf); \
580 } while (0)
581 
582 #endif
583 
584 
585 #ifdef HAVE_MF_BT2
586 extern uint32_t
587 lzma_mf_bt2_find(lzma_mf *mf, lzma_match *matches)
588 {
589  header_find(true, 2);
590 
591  hash_2_calc();
592 
593  const uint32_t cur_match = mf->hash[hash_value];
594  mf->hash[hash_value] = pos;
595 
596  bt_find(1);
597 }
598 
599 
600 extern void
601 lzma_mf_bt2_skip(lzma_mf *mf, uint32_t amount)
602 {
603  do {
604  header_skip(true, 2);
605 
606  hash_2_calc();
607 
608  const uint32_t cur_match = mf->hash[hash_value];
609  mf->hash[hash_value] = pos;
610 
611  bt_skip();
612 
613  } while (--amount != 0);
614 }
615 #endif
616 
617 
618 #ifdef HAVE_MF_BT3
619 extern uint32_t
620 lzma_mf_bt3_find(lzma_mf *mf, lzma_match *matches)
621 {
622  header_find(true, 3);
623 
624  hash_3_calc();
625 
626  const uint32_t delta2 = pos - mf->hash[hash_2_value];
627  const uint32_t cur_match = mf->hash[FIX_3_HASH_SIZE + hash_value];
628 
629  mf->hash[hash_2_value] = pos;
630  mf->hash[FIX_3_HASH_SIZE + hash_value] = pos;
631 
632  uint32_t len_best = 2;
633 
634  if (delta2 < mf->cyclic_size && *(cur - delta2) == *cur) {
635  len_best = lzma_memcmplen(
636  cur, cur - delta2, len_best, len_limit);
637 
638  matches[0].len = len_best;
639  matches[0].dist = delta2 - 1;
640  matches_count = 1;
641 
642  if (len_best == len_limit) {
643  bt_skip();
644  return 1; // matches_count
645  }
646  }
647 
648  bt_find(len_best);
649 }
650 
651 
652 extern void
653 lzma_mf_bt3_skip(lzma_mf *mf, uint32_t amount)
654 {
655  do {
656  header_skip(true, 3);
657 
658  hash_3_calc();
659 
660  const uint32_t cur_match
661  = mf->hash[FIX_3_HASH_SIZE + hash_value];
662 
663  mf->hash[hash_2_value] = pos;
664  mf->hash[FIX_3_HASH_SIZE + hash_value] = pos;
665 
666  bt_skip();
667 
668  } while (--amount != 0);
669 }
670 #endif
671 
672 
673 #ifdef HAVE_MF_BT4
674 extern uint32_t
675 lzma_mf_bt4_find(lzma_mf *mf, lzma_match *matches)
676 {
677  header_find(true, 4);
678 
679  hash_4_calc();
680 
681  uint32_t delta2 = pos - mf->hash[hash_2_value];
682  const uint32_t delta3
683  = pos - mf->hash[FIX_3_HASH_SIZE + hash_3_value];
684  const uint32_t cur_match = mf->hash[FIX_4_HASH_SIZE + hash_value];
685 
686  mf->hash[hash_2_value] = pos;
687  mf->hash[FIX_3_HASH_SIZE + hash_3_value] = pos;
688  mf->hash[FIX_4_HASH_SIZE + hash_value] = pos;
689 
690  uint32_t len_best = 1;
691 
692  if (delta2 < mf->cyclic_size && *(cur - delta2) == *cur) {
693  len_best = 2;
694  matches[0].len = 2;
695  matches[0].dist = delta2 - 1;
696  matches_count = 1;
697  }
698 
699  if (delta2 != delta3 && delta3 < mf->cyclic_size
700  && *(cur - delta3) == *cur) {
701  len_best = 3;
702  matches[matches_count++].dist = delta3 - 1;
703  delta2 = delta3;
704  }
705 
706  if (matches_count != 0) {
707  len_best = lzma_memcmplen(
708  cur, cur - delta2, len_best, len_limit);
709 
710  matches[matches_count - 1].len = len_best;
711 
712  if (len_best == len_limit) {
713  bt_skip();
714  return matches_count;
715  }
716  }
717 
718  if (len_best < 3)
719  len_best = 3;
720 
721  bt_find(len_best);
722 }
723 
724 
725 extern void
726 lzma_mf_bt4_skip(lzma_mf *mf, uint32_t amount)
727 {
728  do {
729  header_skip(true, 4);
730 
731  hash_4_calc();
732 
733  const uint32_t cur_match
734  = mf->hash[FIX_4_HASH_SIZE + hash_value];
735 
736  mf->hash[hash_2_value] = pos;
737  mf->hash[FIX_3_HASH_SIZE + hash_3_value] = pos;
738  mf->hash[FIX_4_HASH_SIZE + hash_value] = pos;
739 
740  bt_skip();
741 
742  } while (--amount != 0);
743 }
744 #endif
size_t len
Definition: 6502dis.c:15
lzma_index ** i
Definition: index.h:629
static static sync static getppid static getegid const char static filename char static len const char char static bufsiz static mask static vfork const void static prot static getpgrp const char static swapflags static arg static fd static protocol static who struct sockaddr static addrlen static backlog struct timeval struct timezone static tz const struct iovec static count static mode const void const struct sockaddr static tolen const char static pathname void count
Definition: sflib.h:98
#define unlikely(expr)
Definition: lz4.c:177
LZ in window and match finder API.
void lzma_mf_hc4_skip(lzma_mf *dict, uint32_t amount)
uint32_t lzma_mf_hc3_find(lzma_mf *dict, lzma_match *matches)
uint32_t lzma_mf_bt2_find(lzma_mf *dict, lzma_match *matches)
static uint32_t mf_avail(const lzma_mf *mf)
Get the number of bytes that haven't been ran through the match finder yet.
Definition: lz_encoder.h:229
uint32_t lzma_mf_bt3_find(lzma_mf *dict, lzma_match *matches)
static const uint8_t * mf_ptr(const lzma_mf *mf)
Get pointer to the first byte not ran through the match finder.
Definition: lz_encoder.h:221
uint32_t lzma_mf_bt4_find(lzma_mf *dict, lzma_match *matches)
void lzma_mf_bt2_skip(lzma_mf *dict, uint32_t amount)
void lzma_mf_bt3_skip(lzma_mf *dict, uint32_t amount)
void lzma_mf_hc3_skip(lzma_mf *dict, uint32_t amount)
void lzma_mf_bt4_skip(lzma_mf *dict, uint32_t amount)
uint32_t lzma_mf_hc4_find(lzma_mf *dict, lzma_match *matches)
Hash macros for match finders.
#define hash_4_calc()
#define hash_2_calc()
#define FIX_3_HASH_SIZE
#define hash_3_calc()
#define FIX_4_HASH_SIZE
static void move_pending(lzma_mf *mf)
uint32_t lzma_mf_find(lzma_mf *mf, uint32_t *count_ptr, lzma_match *matches)
Find matches starting from the current byte.
Definition: lz_encoder_mf.c:23
static void normalize(lzma_mf *mf)
Normalizes hash values.
static void move_pos(lzma_mf *mf)
Mark the current byte as processed from point of view of the match finder.
#define MUST_NORMALIZE_POS
Definition: lz_encoder_mf.c:91
#define header_find(is_bt, len_min)
#define EMPTY_HASH_VALUE
Definition: lz_encoder_mf.c:86
#define header_skip(is_bt, len_min)
Optimized comparison of two buffers.
assert(limit<=UINT32_MAX/2)
static uint32_t const uint8_t uint32_t uint32_t limit
Definition: memcmplen.h:45
unsigned int uint32_t
Definition: sftypes.h:29
unsigned char uint8_t
Definition: sftypes.h:31
#define UINT32_MAX
uint32_t len
Definition: lz_encoder.h:23
uint32_t dist
Definition: lz_encoder.h:24
uint32_t hash_count
Number of elements in hash[].
Definition: lz_encoder.h:122
uint32_t read_pos
Definition: lz_encoder.h:63
uint32_t offset
Definition: lz_encoder.h:58
uint32_t cyclic_size
Definition: lz_encoder.h:102
uint32_t * son
Definition: lz_encoder.h:100
uint32_t match_len_max
Definition: lz_encoder.h:114
uint32_t cyclic_pos
Definition: lz_encoder.h:101
uint32_t(* find)(lzma_mf *mf, lzma_match *matches)
Definition: lz_encoder.h:92
uint32_t nice_len
Maximum length of a match that the match finder will try to find.
Definition: lz_encoder.h:109
uint32_t * hash
Definition: lz_encoder.h:99
uint32_t read_ahead
Definition: lz_encoder.h:67
uint32_t sons_count
Number of elements in son[].
Definition: lz_encoder.h:125
uint32_t pending
Definition: lz_encoder.h:84
uint32_t write_pos
Definition: lz_encoder.h:80
int pos
Definition: main.c:11
#define my_min(x, y)
Definition: sysdefs.h:185
static st64 delta
Definition: vmenus.c:2425