Match finders. More...

#include "lz_encoder.h"
#include "lz_encoder_hash.h"
#include "memcmplen.h"

Macros
#define	EMPTY_HASH_VALUE 0

#define	MUST_NORMALIZE_POS UINT32_MAX

#define	header(is_bt, len_min, ret_op)

#define	header_find(is_bt, len_min)

#define	header_skip(is_bt, len_min) header(is_bt, len_min, continue)

#define	call_find(func, len_best)

Functions
uint32_t	lzma_mf_find (lzma_mf *mf, uint32_t *count_ptr, lzma_match *matches)
	Find matches starting from the current byte. More...

static void	normalize (lzma_mf *mf)
	Normalizes hash values. More...

static void	move_pos (lzma_mf *mf)
	Mark the current byte as processed from point of view of the match finder. More...

static void	move_pending (lzma_mf *mf)

Detailed Description

Match finders.

Definition in file lz_encoder_mf.c.

Macro Definition Documentation

◆ call_find

#define call_find	(	func,
		len_best
	)

Value:

do { \
    matches_count = func(len_limit, pos, cur, cur_match, mf->depth, \
                mf->son, mf->cyclic_pos, mf->cyclic_size, \
                matches + matches_count, len_best) \
            - matches; /* func's result minus the array start gives the new total; presumably func returns a pointer past the last stored match */ \
    move_pos(mf); /* mark the current byte as processed */ \
    return matches_count; /* NOTE: this returns from the function that invoked the macro */ \
} while (0)

Calls hc_find_func() or bt_find_func() and calculates the total number of matches found. Updates the dictionary position and returns the number of matches found.

Definition at line 221 of file lz_encoder_mf.c.

◆ EMPTY_HASH_VALUE

#define EMPTY_HASH_VALUE 0

Hash value to indicate unused element in the hash. Since we start the positions from dict_size + 1, zero is always too far to qualify as a usable match position.

Definition at line 86 of file lz_encoder_mf.c.

◆ header

#define header	(	is_bt,
		len_min,
		ret_op
	)

Value:

    uint32_t len_limit = mf_avail(mf); /* start from what mf_avail() reports */ \
    if (mf->nice_len <= len_limit) { \
        len_limit = mf->nice_len; /* never search beyond nice_len */ \
    } else if (len_limit < (len_min) /* fewer than len_min bytes to work with... */ \
            || (is_bt && mf->action == LZMA_SYNC_FLUSH)) { /* ...or is_bt set during a sync flush */ \
        assert(mf->action != LZMA_RUN); /* this branch is not expected while action is LZMA_RUN */ \
        move_pending(mf); /* count the byte as pending instead of running the finder */ \
        ret_op; /* caller-supplied statement, e.g. "return 0" or "continue" */ \
    } \
    const uint8_t *cur = mf_ptr(mf); /* "cur" and "pos" are declared in the invoking scope */ \
    const uint32_t pos = mf->read_pos + mf->offset

Calculate len_limit and determine if there is enough input to run the actual match finder code. Sets up "cur" and "pos". This macro is used by all find functions and binary tree skip functions. The hash chain skip functions don't need len_limit, so simpler code is used in them.

Definition at line 191 of file lz_encoder_mf.c.

◆ header_find

#define header_find	(	is_bt,
		len_min
	)

Value:

header(is_bt, len_min, return 0); /* not enough input: leave the find function with zero matches */ \
uint32_t matches_count = 0 /* number of matches stored so far; updated by call_find() */

header

#define header(is_bt, len_min, ret_op)

Definition: lz_encoder_mf.c:191

Header for find functions. "return 0" indicates that zero matches were found.

Definition at line 207 of file lz_encoder_mf.c.

◆ header_skip

#define header_skip	(	is_bt,
		len_min
	)	header(is_bt, len_min, continue)

Header for a loop in a skip function. "continue" tells to skip the rest of the code in the loop.

Definition at line 214 of file lz_encoder_mf.c.

◆ MUST_NORMALIZE_POS

#define MUST_NORMALIZE_POS UINT32_MAX

Normalization must be done when lzma_mf.offset + lzma_mf.read_pos reaches MUST_NORMALIZE_POS.

Definition at line 91 of file lz_encoder_mf.c.

Function Documentation

◆ lzma_mf_find()

uint32_t lzma_mf_find	(	lzma_mf *	mf,
		uint32_t *	count_ptr,
		lzma_match *	matches
	)

Find matches starting from the current byte.

Returns: The length of the longest match found

Definition at line 23 of file lz_encoder_mf.c.

 {
     // Call the match finder. It returns the number of length-distance
     // pairs found.
     // FIXME: Minimum count is zero, what _exactly_ is the maximum?
     const uint32_t count = mf->find(mf, matches);
  
     // Length of the longest match; assume that no matches were found
     // and thus the maximum length is zero.
     uint32_t len_best = 0;
  
     if (count > 0) {
 #ifndef NDEBUG
         // Validate the matches.
         for (uint32_t i = 0; i < count; ++i) {
             assert(matches[i].len <= mf->nice_len);
             assert(matches[i].dist < mf->read_pos);
             assert(memcmp(mf_ptr(mf) - 1,
                 mf_ptr(mf) - matches[i].dist - 2,
                 matches[i].len) == 0);
         }
 #endif
  
         // The last used element in the array contains
         // the longest match.
         len_best = matches[count - 1].len;
  
         // If a match of maximum search length was found, try to
         // extend the match to maximum possible length.
         if (len_best == mf->nice_len) {
             // The limit for the match length is either the
             // maximum match length supported by the LZ-based
             // encoder or the number of bytes left in the
             // dictionary, whichever is smaller.
             uint32_t limit = mf_avail(mf) + 1;
             if (limit > mf->match_len_max)
                 limit = mf->match_len_max;
  
             // Pointer to the byte we just ran through
             // the match finder.
             const uint8_t *p1 = mf_ptr(mf) - 1;
  
             // Pointer to the beginning of the match. We need -1
             // here because the match distances are zero based.
             const uint8_t *p2 = p1 - matches[count - 1].dist - 1;
  
             len_best = lzma_memcmplen(p1, p2, len_best, limit);
         }
     }
  
     *count_ptr = count;
  
     // Finally update the read position to indicate that match finder was
     // run for this dictionary offset.
     ++mf->read_ahead;
  
     return len_best;
 }

References assert(), count, lzma_match::dist, lzma_mf_s::find, i, len, lzma_match::len, limit, lzma_mf_s::match_len_max, mf_avail(), mf_ptr(), lzma_mf_s::nice_len, and lzma_mf_s::read_ahead.

◆ move_pending()

static void move_pending ( lzma_mf * mf )

static

When flushing, we cannot run the match finder unless there are nice_len bytes available in the dictionary. Instead, we skip running the match finder (indicating that no match was found), and count how many bytes we have ignored this way.

When new data is given after the flushing was completed, the match finder is restarted by rewinding mf->read_pos backwards by mf->pending. Then the missed bytes are added to the hash using the match finder's skip function (with small amount of input, it may start using mf->pending again if flushing).

Due to this rewinding, we don't touch cyclic_pos or test for normalization. It will be done when the match finder's skip function catches up after a flush.

Definition at line 178 of file lz_encoder_mf.c.

 {
     ++mf->read_pos;
     assert(mf->read_pos <= mf->write_pos);
     ++mf->pending;
 }

References assert(), lzma_mf_s::pending, lzma_mf_s::read_pos, and lzma_mf_s::write_pos.

◆ move_pos()

static void move_pos ( lzma_mf * mf )

static

Mark the current byte as processed from point of view of the match finder.

Definition at line 150 of file lz_encoder_mf.c.

 {
     if (++mf->cyclic_pos == mf->cyclic_size)
         mf->cyclic_pos = 0;
  
     ++mf->read_pos;
     assert(mf->read_pos <= mf->write_pos);
  
     if (unlikely(mf->read_pos + mf->offset == UINT32_MAX))
         normalize(mf);
 }

References assert(), lzma_mf_s::cyclic_pos, lzma_mf_s::cyclic_size, normalize(), lzma_mf_s::offset, lzma_mf_s::read_pos, UINT32_MAX, unlikely, and lzma_mf_s::write_pos.

◆ normalize()

static void normalize ( lzma_mf * mf )

static

Normalizes hash values.

The hash arrays store positions of match candidates. The positions are relative to an arbitrary offset that is not the same as the absolute offset in the input stream. The relative position of the current byte is lzma_mf.offset + lzma_mf.read_pos. The distances of the matches are the differences of the current read position and the position found from the hash.

To prevent integer overflows of the offsets stored in the hash arrays, we need to "normalize" the stored values now and then. During the normalization, we drop values that indicate distance greater than the dictionary size, thus making space for new values.

Definition at line 108 of file lz_encoder_mf.c.

 {
     assert(mf->read_pos + mf->offset == MUST_NORMALIZE_POS);
  
     // In future we may not want to touch the lowest bits, because there
     // may be match finders that use larger resolution than one byte.
     const uint32_t subvalue
             = (MUST_NORMALIZE_POS - mf->cyclic_size);
                 // & ~((UINT32_C(1) << 10) - 1);
  
     for (uint32_t i = 0; i < mf->hash_count; ++i) {
         // If the distance is greater than the dictionary size,
         // we can simply mark the hash element as empty.
         if (mf->hash[i] <= subvalue)
             mf->hash[i] = EMPTY_HASH_VALUE;
         else
             mf->hash[i] -= subvalue;
     }
  
     for (uint32_t i = 0; i < mf->sons_count; ++i) {
         // Do the same for mf->son.
         //
         // NOTE: There may be uninitialized elements in mf->son.
         // Valgrind may complain that the "if" below depends on
         // an uninitialized value. In this case it is safe to ignore
         // the warning. See also the comments in lz_encoder_init()
         // in lz_encoder.c.
         if (mf->son[i] <= subvalue)
             mf->son[i] = EMPTY_HASH_VALUE;
         else
             mf->son[i] -= subvalue;
     }
  
     // Update offset to match the new locations.
     mf->offset -= subvalue;
  
     return;
 }

References assert(), lzma_mf_s::cyclic_size, EMPTY_HASH_VALUE, lzma_mf_s::hash, lzma_mf_s::hash_count, i, MUST_NORMALIZE_POS, lzma_mf_s::offset, lzma_mf_s::read_pos, lzma_mf_s::son, and lzma_mf_s::sons_count.

Referenced by move_pos(), and test_mc::run_mc().

Macros

Functions

Detailed Description

Macro Definition Documentation

◆ call_find

◆ EMPTY_HASH_VALUE

◆ header

◆ header_find

◆ header_skip

◆ MUST_NORMALIZE_POS

Function Documentation

◆ lzma_mf_find()

◆ move_pending()

◆ move_pos()

◆ normalize()