Rizin
unix-like reverse engineering framework and cli tools
lz_encoder_mf.c File Reference

Match finders. More...

#include "lz_encoder.h"
#include "lz_encoder_hash.h"
#include "memcmplen.h"

Go to the source code of this file.

Macros

#define EMPTY_HASH_VALUE   0
 
#define MUST_NORMALIZE_POS   UINT32_MAX
 
#define header(is_bt, len_min, ret_op)
 
#define header_find(is_bt, len_min)
 
#define header_skip(is_bt, len_min)    header(is_bt, len_min, continue)
 
#define call_find(func, len_best)
 

Functions

uint32_t lzma_mf_find (lzma_mf *mf, uint32_t *count_ptr, lzma_match *matches)
 Find matches starting from the current byte. More...
 
static void normalize (lzma_mf *mf)
 Normalizes hash values. More...
 
static void move_pos (lzma_mf *mf)
 Mark the current byte as processed from point of view of the match finder. More...
 
static void move_pending (lzma_mf *mf)
 

Detailed Description

Match finders.

Definition in file lz_encoder_mf.c.

Macro Definition Documentation

◆ call_find

#define call_find (   func,
  len_best 
)
Value:
do { \
matches_count = func(len_limit, pos, cur, cur_match, mf->depth, \
mf->son, mf->cyclic_pos, mf->cyclic_size, \
matches + matches_count, len_best) \
- matches; \
move_pos(mf); \
return matches_count; \
} while (0)
int pos
Definition: main.c:11

Calls hc_find_func() or bt_find_func() and calculates the total number of matches found. Updates the dictionary position and returns the number of matches found.

Definition at line 221 of file lz_encoder_mf.c.

◆ EMPTY_HASH_VALUE

#define EMPTY_HASH_VALUE   0

Hash value to indicate unused element in the hash. Since we start the positions from dict_size + 1, zero is always too far to qualify as usable match position.

Definition at line 86 of file lz_encoder_mf.c.

◆ header

#define header (   is_bt,
  len_min,
  ret_op 
)
Value:
uint32_t len_limit = mf_avail(mf); \
if (mf->nice_len <= len_limit) { \
len_limit = mf->nice_len; \
} else if (len_limit < (len_min) \
|| (is_bt && mf->action == LZMA_SYNC_FLUSH)) { \
assert(mf->action != LZMA_RUN); \
move_pending(mf); \
ret_op; \
} \
const uint8_t *cur = mf_ptr(mf); \
const uint32_t pos = mf->read_pos + mf->offset
static uint32_t mf_avail(const lzma_mf *mf)
Get the number of bytes that haven't been ran through the match finder yet.
Definition: lz_encoder.h:229
static const uint8_t * mf_ptr(const lzma_mf *mf)
Get pointer to the first byte not ran through the match finder.
Definition: lz_encoder.h:221
unsigned int uint32_t
Definition: sftypes.h:29
unsigned char uint8_t
Definition: sftypes.h:31
@ LZMA_SYNC_FLUSH
Make all the input available at output.
Definition: base.h:265
@ LZMA_RUN
Continue coding.
Definition: base.h:251

Calculate len_limit and determine if there is enough input to run the actual match finder code. Sets up "cur" and "pos". This macro is used by all find functions and binary tree skip functions. Hash chain skip function doesn't need len_limit so a simpler code is used in them.

Definition at line 191 of file lz_encoder_mf.c.

◆ header_find

#define header_find (   is_bt,
  len_min 
)
Value:
header(is_bt, len_min, return 0); \
uint32_t matches_count = 0
#define header(is_bt, len_min, ret_op)

Header for find functions. "return 0" indicates that zero matches were found.

Definition at line 207 of file lz_encoder_mf.c.

◆ header_skip

#define header_skip (   is_bt,
  len_min 
)     header(is_bt, len_min, continue)

Header for a loop in a skip function. "continue" tells to skip the rest of the code in the loop.

Definition at line 214 of file lz_encoder_mf.c.

◆ MUST_NORMALIZE_POS

#define MUST_NORMALIZE_POS   UINT32_MAX

Normalization must be done when lzma_mf.offset + lzma_mf.read_pos reaches MUST_NORMALIZE_POS.

Definition at line 91 of file lz_encoder_mf.c.

Function Documentation

◆ lzma_mf_find()

uint32_t lzma_mf_find ( lzma_mf mf,
uint32_t count_ptr,
lzma_match matches 
)

Find matches starting from the current byte.

Returns
The length of the longest match found

Definition at line 23 of file lz_encoder_mf.c.

24 {
25  // Call the match finder. It returns the number of length-distance
26  // pairs found.
27  // FIXME: Minimum count is zero, what _exactly_ is the maximum?
28  const uint32_t count = mf->find(mf, matches);
29 
30  // Length of the longest match; assume that no matches were found
31  // and thus the maximum length is zero.
32  uint32_t len_best = 0;
33 
34  if (count > 0) {
35 #ifndef NDEBUG
36  // Validate the matches.
37  for (uint32_t i = 0; i < count; ++i) {
38  assert(matches[i].len <= mf->nice_len);
39  assert(matches[i].dist < mf->read_pos);
40  assert(memcmp(mf_ptr(mf) - 1,
41  mf_ptr(mf) - matches[i].dist - 2,
42  matches[i].len) == 0);
43  }
44 #endif
45 
46  // The last used element in the array contains
47  // the longest match.
48  len_best = matches[count - 1].len;
49 
50  // If a match of maximum search length was found, try to
51  // extend the match to maximum possible length.
52  if (len_best == mf->nice_len) {
53  // The limit for the match length is either the
54  // maximum match length supported by the LZ-based
55  // encoder or the number of bytes left in the
56  // dictionary, whichever is smaller.
57  uint32_t limit = mf_avail(mf) + 1;
58  if (limit > mf->match_len_max)
59  limit = mf->match_len_max;
60 
61  // Pointer to the byte we just ran through
62  // the match finder.
63  const uint8_t *p1 = mf_ptr(mf) - 1;
64 
65  // Pointer to the beginning of the match. We need -1
66  // here because the match distances are zero based.
67  const uint8_t *p2 = p1 - matches[count - 1].dist - 1;
68 
69  len_best = lzma_memcmplen(p1, p2, len_best, limit);
70  }
71  }
72 
73  *count_ptr = count;
74 
75  // Finally update the read position to indicate that match finder was
76  // run for this dictionary offset.
77  ++mf->read_ahead;
78 
79  return len_best;
80 }
size_t len
Definition: 6502dis.c:15
lzma_index ** i
Definition: index.h:629
static static sync static getppid static getegid const char static filename char static len const char char static bufsiz static mask static vfork const void static prot static getpgrp const char static swapflags static arg static fd static protocol static who struct sockaddr static addrlen static backlog struct timeval struct timezone static tz const struct iovec static count static mode const void const struct sockaddr static tolen const char static pathname void count
Definition: sflib.h:98
assert(limit<=UINT32_MAX/2)
static uint32_t const uint8_t uint32_t uint32_t limit
Definition: memcmplen.h:45
uint32_t len
Definition: lz_encoder.h:23
uint32_t dist
Definition: lz_encoder.h:24
uint32_t match_len_max
Definition: lz_encoder.h:114
uint32_t(* find)(lzma_mf *mf, lzma_match *matches)
Definition: lz_encoder.h:92
uint32_t nice_len
Maximum length of a match that the match finder will try to find.
Definition: lz_encoder.h:109
uint32_t read_ahead
Definition: lz_encoder.h:67

References assert(), count, lzma_match::dist, lzma_mf_s::find, i, len, lzma_match::len, limit, lzma_mf_s::match_len_max, mf_avail(), mf_ptr(), lzma_mf_s::nice_len, and lzma_mf_s::read_ahead.

◆ move_pending()

static void move_pending ( lzma_mf mf)
static

When flushing, we cannot run the match finder unless there is nice_len bytes available in the dictionary. Instead, we skip running the match finder (indicating that no match was found), and count how many bytes we have ignored this way.

When new data is given after the flushing was completed, the match finder is restarted by rewinding mf->read_pos backwards by mf->pending. Then the missed bytes are added to the hash using the match finder's skip function (with small amount of input, it may start using mf->pending again if flushing).

Due to this rewinding, we don't touch cyclic_pos or test for normalization. It will be done when the match finder's skip function catches up after a flush.

Definition at line 178 of file lz_encoder_mf.c.

179 {
180  ++mf->read_pos;
181  assert(mf->read_pos <= mf->write_pos);
182  ++mf->pending;
183 }
uint32_t read_pos
Definition: lz_encoder.h:63
uint32_t pending
Definition: lz_encoder.h:84
uint32_t write_pos
Definition: lz_encoder.h:80

References assert(), lzma_mf_s::pending, lzma_mf_s::read_pos, and lzma_mf_s::write_pos.

◆ move_pos()

static void move_pos ( lzma_mf mf)
static

Mark the current byte as processed from point of view of the match finder.

Definition at line 150 of file lz_encoder_mf.c.

151 {
152  if (++mf->cyclic_pos == mf->cyclic_size)
153  mf->cyclic_pos = 0;
154 
155  ++mf->read_pos;
156  assert(mf->read_pos <= mf->write_pos);
157 
158  if (unlikely(mf->read_pos + mf->offset == UINT32_MAX))
159  normalize(mf);
160 }
#define unlikely(expr)
Definition: lz4.c:177
static void normalize(lzma_mf *mf)
Normalizes hash values.
#define UINT32_MAX
uint32_t offset
Definition: lz_encoder.h:58
uint32_t cyclic_size
Definition: lz_encoder.h:102
uint32_t cyclic_pos
Definition: lz_encoder.h:101

References assert(), lzma_mf_s::cyclic_pos, lzma_mf_s::cyclic_size, normalize(), lzma_mf_s::offset, lzma_mf_s::read_pos, UINT32_MAX, unlikely, and lzma_mf_s::write_pos.

◆ normalize()

static void normalize ( lzma_mf mf)
static

Normalizes hash values.

The hash arrays store positions of match candidates. The positions are relative to an arbitrary offset that is not the same as the absolute offset in the input stream. The relative position of the current byte is lzma_mf.offset + lzma_mf.read_pos. The distances of the matches are the differences of the current read position and the position found from the hash.

To prevent integer overflows of the offsets stored in the hash arrays, we need to "normalize" the stored values now and then. During the normalization, we drop values that indicate distance greater than the dictionary size, thus making space for new values.

Definition at line 108 of file lz_encoder_mf.c.

109 {
111 
112  // In future we may not want to touch the lowest bits, because there
113  // may be match finders that use larger resolution than one byte.
114  const uint32_t subvalue
116  // & ~((UINT32_C(1) << 10) - 1);
117 
118  for (uint32_t i = 0; i < mf->hash_count; ++i) {
119  // If the distance is greater than the dictionary size,
120  // we can simply mark the hash element as empty.
121  if (mf->hash[i] <= subvalue)
122  mf->hash[i] = EMPTY_HASH_VALUE;
123  else
124  mf->hash[i] -= subvalue;
125  }
126 
127  for (uint32_t i = 0; i < mf->sons_count; ++i) {
128  // Do the same for mf->son.
129  //
130  // NOTE: There may be uninitialized elements in mf->son.
131  // Valgrind may complain that the "if" below depends on
132  // an uninitialized value. In this case it is safe to ignore
133  // the warning. See also the comments in lz_encoder_init()
134  // in lz_encoder.c.
135  if (mf->son[i] <= subvalue)
136  mf->son[i] = EMPTY_HASH_VALUE;
137  else
138  mf->son[i] -= subvalue;
139  }
140 
141  // Update offset to match the new locations.
142  mf->offset -= subvalue;
143 
144  return;
145 }
#define MUST_NORMALIZE_POS
Definition: lz_encoder_mf.c:91
#define EMPTY_HASH_VALUE
Definition: lz_encoder_mf.c:86
uint32_t hash_count
Number of elements in hash[].
Definition: lz_encoder.h:122
uint32_t * son
Definition: lz_encoder.h:100
uint32_t * hash
Definition: lz_encoder.h:99
uint32_t sons_count
Number of elements in son[].
Definition: lz_encoder.h:125

References assert(), lzma_mf_s::cyclic_size, EMPTY_HASH_VALUE, lzma_mf_s::hash, lzma_mf_s::hash_count, i, MUST_NORMALIZE_POS, lzma_mf_s::offset, lzma_mf_s::read_pos, lzma_mf_s::son, and lzma_mf_s::sons_count.

Referenced by move_pos(), and test_mc::run_mc().