rz-bindgen/doxygen/lexer_8c_source.html

 #include <stdio.h>

 #include "./lexer.h"

 #include "./subtree.h"

 #include "./length.h"

 #include "./unicode.h"


 // Report a lexing event to the logger, if one is installed.
 //
 // `message` must be a string literal: it is concatenated with the format
 // suffix at compile time. `character` is printed as a quoted character when
 // it lies in the printable ASCII range [32, 127) and as a decimal number
 // otherwise. Note that `character` is evaluated more than once, so it must
 // not have side effects.
 //
 // The expansion expects a variable named `self` in scope that provides
 // `logger` and `debug_buffer`.
 //
 // The body is wrapped in `do { ... } while (0)` so that `LOG(...);` is a
 // single statement and can be used safely in an unbraced `if`/`else`.
 #define LOG(message, character)                   \
   do {                                            \
     if (self->logger.log) {                       \
       snprintf(                                   \
         self->debug_buffer,                       \
         TREE_SITTER_SERIALIZATION_BUFFER_SIZE,    \
         32 <= (character) && (character) < 127 ?  \
           message " character:'%c'" :             \
           message " character:%d",                \
         (character)                               \
       );                                          \
       self->logger.log(                           \
         self->logger.payload,                     \
         TSLogTypeLex,                             \
         self->debug_buffer                        \
       );                                          \
     }                                             \
   } while (0)


 // Code point U+FEFF. `ts_lexer_start` skips it when it is the lookahead
 // character at byte offset 0 of the input.
 static const int32_t BYTE_ORDER_MARK = 0xFEFF;


 // The single range installed when the caller supplies no included ranges.
 // It begins at byte 0 / point (0, 0) and runs to the largest representable
 // byte offset and point.
 static const TSRange DEFAULT_RANGE = {
   .start_byte = 0,
   .end_byte = UINT32_MAX,
   .start_point = { .row = 0, .column = 0 },
   .end_point = { .row = UINT32_MAX, .column = UINT32_MAX },
 };


 // Report whether the lexer is in the EOF state. EOF is encoded by
 // `current_included_range_index` being equal to `included_range_count`,
 // i.e. every included range has been consumed.
 static bool ts_lexer__eof(const TSLexer *_self) {
   const Lexer *lexer = (const Lexer *)_self;
   return lexer->included_range_count == lexer->current_included_range_index;
 }


 // Forget the cached chunk of source text (pointer, size and start offset).
 // Called when the lexer's position has changed and the cached text should
 // no longer be used.
 static void ts_lexer__clear_chunk(Lexer *self) {
   self->chunk_start = 0;
   self->chunk_size = 0;
   self->chunk = NULL;
 }


 // Ask the input callback for the source text located at the current
 // position and cache it as the current chunk. When the callback reports
 // zero bytes, the lexer is put into the EOF state and the chunk pointer is
 // dropped.
 static void ts_lexer__get_chunk(Lexer *self) {
   const uint32_t offset = self->current_position.bytes;

   self->chunk_start = offset;
   self->chunk = self->input.read(
     self->input.payload,
     offset,
     self->current_position.extent,
     &self->chunk_size
   );

   if (self->chunk_size == 0) {
     // Nothing left to read: mark every included range as consumed.
     self->chunk = NULL;
     self->current_included_range_index = self->included_range_count;
   }
 }


 // Decode the character at the current position into `data.lookahead` and
 // record its encoded length in `lookahead_size`.
 //
 // This assumes the lexer has already retrieved a chunk that spans the
 // current position. When no bytes remain in the chunk, the lookahead is set
 // to '\0' with a size of 1. A character that fails to decode is kept as
 // TS_DECODE_ERROR with a size of 1 so the lexer still makes progress.
 static void ts_lexer__get_lookahead(Lexer *self) {
   uint32_t position_in_chunk = self->current_position.bytes - self->chunk_start;
   uint32_t size = self->chunk_size - position_in_chunk;

   if (size == 0) {
     self->lookahead_size = 1;
     self->data.lookahead = '\0';
     return;
   }

   const uint8_t *chunk = (const uint8_t *)self->chunk + position_in_chunk;
   UnicodeDecodeFunction decode = self->input.encoding == TSInputEncodingUTF8
     ? ts_decode_utf8
     : ts_decode_utf16;

   self->lookahead_size = decode(chunk, size, &self->data.lookahead);

   // If this chunk ended in the middle of a multi-byte character,
   // try again with a fresh chunk that starts at the current position.
   if (self->data.lookahead == TS_DECODE_ERROR && size < 4) {
     ts_lexer__get_chunk(self);

     // The re-read may come back empty, in which case the chunk pointer is
     // NULL and the lexer is now in the EOF state. Apply the same rule as
     // the empty-chunk guard above instead of handing the decoder a
     // zero-length (NULL) buffer.
     if (self->chunk_size == 0) {
       self->lookahead_size = 1;
       self->data.lookahead = '\0';
       return;
     }

     chunk = (const uint8_t *)self->chunk;
     size = self->chunk_size;
     self->lookahead_size = decode(chunk, size, &self->data.lookahead);
   }

   // Step over an undecodable sequence one byte at a time.
   if (self->data.lookahead == TS_DECODE_ERROR) {
     self->lookahead_size = 1;
   }
 }


 // Move the lexer to `position`, or to the first valid position after it.
 //
 // If `position` falls in a gap before an included range, the lexer snaps
 // forward to the start of that range. If it lies at or beyond the end of
 // every included range, the lexer enters the EOF state, positioned at the
 // end of the last range. The cached lookahead is invalidated in both cases:
 // size 0 means "not decoded yet", while EOF uses '\0' with size 1.
 static void ts_lexer_goto(Lexer *self, Length position) {
   self->current_position = position;
   bool found_included_range = false;

   // Move to the first valid position at or after the given position.
   for (unsigned i = 0; i < self->included_range_count; i++) {
     TSRange *included_range = &self->included_ranges[i];
     if (included_range->end_byte > position.bytes) {
       if (included_range->start_byte >= position.bytes) {
         self->current_position = (Length) {
           .bytes = included_range->start_byte,
           .extent = included_range->start_point,
         };
       }

       self->current_included_range_index = i;
       found_included_range = true;
       break;
     }
   }

   if (found_included_range) {
     // If the position the lexer actually ended up at is outside of the
     // current chunk of text, then clear out the current chunk of text.
     // This must test the effective position rather than the requested one:
     // snapping forward to a range start can leave the requested position
     // inside the chunk while the lexer itself is now outside of it, and a
     // stale chunk would break the chunk-relative arithmetic done later.
     const uint32_t effective_byte = self->current_position.bytes;
     if (self->chunk && (
       effective_byte < self->chunk_start ||
       effective_byte >= self->chunk_start + self->chunk_size
     )) {
       ts_lexer__clear_chunk(self);
     }

     self->lookahead_size = 0;
     self->data.lookahead = '\0';
   }

   // If the given position is beyond any of included ranges, move to the EOF
   // state - past the end of the included ranges.
   else {
     self->current_included_range_index = self->included_range_count;
     TSRange *last_included_range = &self->included_ranges[self->included_range_count - 1];
     self->current_position = (Length) {
       .bytes = last_included_range->end_byte,
       .extent = last_included_range->end_point,
     };
     ts_lexer__clear_chunk(self);
     self->lookahead_size = 1;
     self->data.lookahead = '\0';
   }
 }


 // Step past the current lookahead character and decode the next one.
 // This is the unlogged core of advancing; it is intended to be called only
 // from functions that take care of logging themselves.
 //
 // When `skip` is true the token start is moved up to the new position.
 static void ts_lexer__do_advance(Lexer *self, bool skip) {
   // Consume the lookahead. Columns are counted in bytes; a newline starts
   // a new row at column 0.
   if (self->lookahead_size) {
     self->current_position.bytes += self->lookahead_size;
     if (self->data.lookahead != '\n') {
       self->current_position.extent.column += self->lookahead_size;
     } else {
       self->current_position.extent.row++;
       self->current_position.extent.column = 0;
     }
   }

   // On reaching the end of the current included range, hop to the start of
   // the next one, or note that no range is left.
   bool inside_range =
     self->current_included_range_index < self->included_range_count;
   if (inside_range) {
     const TSRange *range =
       &self->included_ranges[self->current_included_range_index];
     if (self->current_position.bytes == range->end_byte) {
       self->current_included_range_index++;
       inside_range =
         self->current_included_range_index < self->included_range_count;
       if (inside_range) {
         const TSRange *following = range + 1;
         self->current_position = (Length) {
           following->start_byte,
           following->start_point,
         };
       }
     }
   }

   if (skip) self->token_start_position = self->current_position;

   // Out of ranges: present EOF as a '\0' lookahead of size 1.
   if (!inside_range) {
     ts_lexer__clear_chunk(self);
     self->data.lookahead = '\0';
     self->lookahead_size = 1;
     return;
   }

   // Fetch more text if the new position is past the cached chunk, then
   // decode the next character.
   if (self->current_position.bytes >= self->chunk_start + self->chunk_size) {
     ts_lexer__get_chunk(self);
   }
   ts_lexer__get_lookahead(self);
 }


 // `advance` callback exposed through TSLexer: log the character being
 // consumed (or skipped), then move to the next character, retrieving a new
 // chunk of source code if needed. Does nothing when no chunk is loaded.
 static void ts_lexer__advance(TSLexer *_self, bool skip) {
   Lexer *self = (Lexer *)_self;
   if (self->chunk == NULL) return;

   if (!skip) {
     LOG("consume", self->data.lookahead);
   } else {
     LOG("skip", self->data.lookahead);
   }

   ts_lexer__do_advance(self, skip);
 }


 // `mark_end` callback exposed through TSLexer: record the current position
 // as the end of the token. It may be called repeatedly as longer matches
 // are found; each call overwrites the previous end.
 static void ts_lexer__mark_end(TSLexer *_self) {
   Lexer *self = (Lexer *)_self;

   // When the lexer sits exactly at the start of an included range other
   // than the first, the token is considered to end at the *end* of the
   // previous included range rather than here.
   if (!ts_lexer__eof(&self->data) && self->current_included_range_index > 0) {
     const TSRange *range =
       &self->included_ranges[self->current_included_range_index];
     if (range->start_byte == self->current_position.bytes) {
       const TSRange *before = range - 1;
       self->token_end_position = (Length) {
         before->end_byte,
         before->end_point,
       };
       return;
     }
   }

   self->token_end_position = self->current_position;
 }


 // `get_column` callback exposed through TSLexer: count the characters
 // between the start of the current line and the current position.
 //
 // The stored column is a byte count, so the lexer rewinds to the start of
 // the line (current byte offset minus the byte column) and re-advances
 // character by character up to the original byte offset, counting steps.
 // Sets `did_get_column` as a side effect.
 //
 // Returns 0 without touching the position when the lexer is at EOF.
 static uint32_t ts_lexer__get_column(TSLexer *_self) {
   Lexer *self = (Lexer *)_self;

   self->did_get_column = true;

   // In the EOF state no chunk is loaded (pointer NULL, size 0). Rewinding
   // and decoding there would compute an underflowed chunk-relative size and
   // read through an invalid pointer, so stop before modifying anything.
   if (ts_lexer__eof(_self)) return 0;

   uint32_t goal_byte = self->current_position.bytes;

   self->current_position.bytes -= self->current_position.extent.column;
   self->current_position.extent.column = 0;

   // The start of the line may precede the cached chunk.
   if (self->current_position.bytes < self->chunk_start) {
     ts_lexer__get_chunk(self);
   }

   uint32_t result = 0;
   ts_lexer__get_lookahead(self);
   while (self->current_position.bytes < goal_byte && !ts_lexer__eof(_self) && self->chunk) {
     ts_lexer__do_advance(self, false);
     result++;
   }

   return result;
 }


 // `is_at_included_range_start` callback exposed through TSLexer: report
 // whether the lexer sits exactly at the first byte of its current included
 // range. Always false once every range has been consumed. This is exposed
 // as an API because some languages' external scanners need to perform
 // custom actions at these boundaries.
 static bool ts_lexer__is_at_included_range_start(const TSLexer *_self) {
   const Lexer *self = (const Lexer *)_self;

   if (self->current_included_range_index >= self->included_range_count) {
     return false;
   }

   TSRange *range = &self->included_ranges[self->current_included_range_index];
   return range->start_byte == self->current_position.bytes;
 }


 // Put `self` into its initial state: no input chunk, no logger, position
 // zero, and the default included range installed.
 void ts_lexer_init(Lexer *self) {
   *self = (Lexer) {
     // The lexer's methods are stored as struct fields so that generated
     // parsers can call them without needing to be linked against this
     // library.
     .data = {
       .lookahead = 0,
       .result_symbol = 0,
       .advance = ts_lexer__advance,
       .mark_end = ts_lexer__mark_end,
       .get_column = ts_lexer__get_column,
       .is_at_included_range_start = ts_lexer__is_at_included_range_start,
       .eof = ts_lexer__eof,
     },
     .current_position = {0, {0, 0}},
     .chunk = NULL,
     .chunk_start = 0,
     .chunk_size = 0,
     .logger = { .log = NULL, .payload = NULL },
     .included_ranges = NULL,
     .included_range_count = 0,
     .current_included_range_index = 0,
   };

   // Passing no ranges installs the default range.
   ts_lexer_set_included_ranges(self, NULL, 0);
 }


 // Release the memory owned by the lexer (its included-ranges array).
 void ts_lexer_delete(Lexer *self) {
   ts_free(self->included_ranges);
   // Drop the dangling pointer so that a repeated delete does not free the
   // same allocation twice.
   self->included_ranges = NULL;
 }


 // Install a new input source. The cached chunk is discarded and the lexer
 // is re-seated at its current position.
 void ts_lexer_set_input(Lexer *self, TSInput input) {
   self->input = input;

   // Forget the cached text (same effect as ts_lexer__clear_chunk).
   self->chunk = NULL;
   self->chunk_size = 0;
   self->chunk_start = 0;

   ts_lexer_goto(self, self->current_position);
 }


 // Move the lexer to `position`. Nothing happens when the lexer's current
 // byte offset already equals the requested one.
 void ts_lexer_reset(Lexer *self, Length position) {
   if (self->current_position.bytes == position.bytes) return;
   ts_lexer_goto(self, position);
 }


 // Prepare to lex a new token at the current position: reset the per-token
 // state and, unless at EOF, make sure a chunk and a decoded lookahead are
 // available. A byte order mark at byte offset 0 is skipped.
 void ts_lexer_start(Lexer *self) {
   self->did_get_column = false;
   self->data.result_symbol = 0;
   self->token_end_position = LENGTH_UNDEFINED;
   self->token_start_position = self->current_position;

   if (ts_lexer__eof(&self->data)) return;

   if (self->chunk_size == 0) ts_lexer__get_chunk(self);
   if (self->lookahead_size == 0) ts_lexer__get_lookahead(self);

   bool at_input_start = self->current_position.bytes == 0;
   if (at_input_start && self->data.lookahead == BYTE_ORDER_MARK) {
     ts_lexer__advance(&self->data, true);
   }
 }


 // Finish lexing a token. If no end was marked, the current position becomes
 // the token end. `*lookahead_end_byte` is raised (never lowered) to the
 // byte offset just past the bytes that were examined.
 void ts_lexer_finish(Lexer *self, uint32_t *lookahead_end_byte) {
   if (length_is_undefined(self->token_end_position)) {
     ts_lexer__mark_end(&self->data);
   }

   // The lookahead character itself accounts for one byte past the current
   // position. In order to determine that a byte sequence is invalid UTF8 or
   // UTF16, the decoder may also have looked at the following byte, so an
   // invalid character extends the examined region by one more byte.
   uint32_t examined_bytes = self->data.lookahead == TS_DECODE_ERROR ? 2 : 1;
   uint32_t examined_end = self->current_position.bytes + examined_bytes;

   if (*lookahead_end_byte < examined_end) {
     *lookahead_end_byte = examined_end;
   }
 }


 // Consume characters one at a time until the lexer no longer has a chunk
 // of source text loaded.
 void ts_lexer_advance_to_end(Lexer *self) {
   while (self->chunk != NULL) {
     ts_lexer__advance(&self->data, false);
   }
 }


 // Public wrapper around the `mark_end` callback: record the current
 // position as the end of the token being lexed.
 void ts_lexer_mark_end(Lexer *self) {
   TSLexer *base = &self->data;
   ts_lexer__mark_end(base);
 }


 // Replace the lexer's included ranges with a copy of `ranges`.
 //
 // Passing NULL or a `count` of 0 installs the default range. Otherwise
 // every range must end at or after its own start and must start at or
 // after the end of the preceding range; if that does not hold, nothing is
 // changed and false is returned. On success the lexer is re-seated at its
 // current position within the new ranges and true is returned.
 bool ts_lexer_set_included_ranges(
   Lexer *self,
   const TSRange *ranges,
   uint32_t count
 ) {
   if (!ranges || count == 0) {
     ranges = &DEFAULT_RANGE;
     count = 1;
   } else {
     // Validate ordering before touching any lexer state.
     uint32_t earliest_start = 0;
     const TSRange *stop = ranges + count;
     for (const TSRange *candidate = ranges; candidate != stop; candidate++) {
       if (candidate->start_byte < earliest_start) return false;
       if (candidate->end_byte < candidate->start_byte) return false;
       earliest_start = candidate->end_byte;
     }
   }

   size_t size = count * sizeof(TSRange);
   self->included_ranges = ts_realloc(self->included_ranges, size);
   memcpy(self->included_ranges, ranges, size);
   self->included_range_count = count;

   ts_lexer_goto(self, self->current_position);
   return true;
 }


 // Return the lexer's own array of included ranges (not a copy) and store
 // the number of entries in `*count`.
 TSRange *ts_lexer_included_ranges(const Lexer *self, uint32_t *count) {
   TSRange *ranges = self->included_ranges;
   *count = self->included_range_count;
   return ranges;
 }


 #undef LOG

ts_realloc
#define ts_realloc
Definition: alloc.h:27

ts_free
#define ts_free
Definition: alloc.h:30

i
lzma_index ** i
Definition: index.h:629

TSInputEncodingUTF8
@ TSInputEncodingUTF8
Definition: api.h:45

NULL
#define NULL
Definition: cris-opc.c:27

count
static static sync static getppid static getegid const char static filename char static len const char char static bufsiz static mask static vfork const void static prot static getpgrp const char static swapflags static arg static fd static protocol static who struct sockaddr static addrlen static backlog struct timeval struct timezone static tz const struct iovec static count static mode const void const struct sockaddr static tolen const char static pathname void count
Definition: sflib.h:98

decode
int(* decode)(const ut8 *, ebc_command_t *cmd)
Definition: ebc_disas.c:88

skip
void skip(file *in, unsigned n)
Definition: gzappend.c:202

size
voidpf void uLong size
Definition: ioapi.h:138

length.h

LENGTH_UNDEFINED
static const Length LENGTH_UNDEFINED
Definition: length.h:14

length_is_undefined
static bool length_is_undefined(Length length)
Definition: length.h:17

ts_lexer__get_column
static uint32_t ts_lexer__get_column(TSLexer *_self)
Definition: lexer.c:239

ts_lexer_reset
void ts_lexer_reset(Lexer *self, Length position)
Definition: lexer.c:316

ts_lexer_finish
void ts_lexer_finish(Lexer *self, uint32_t *lookahead_end_byte)
Definition: lexer.c:337

ts_lexer__eof
static bool ts_lexer__eof(const TSLexer *_self)
Definition: lexer.c:42

ts_lexer__is_at_included_range_start
static bool ts_lexer__is_at_included_range_start(const TSLexer *_self)
Definition: lexer.c:265

ts_lexer_mark_end
void ts_lexer_mark_end(Lexer *self)
Definition: lexer.c:363

ts_lexer__get_lookahead
static void ts_lexer__get_lookahead(Lexer *self)
Definition: lexer.c:74

ts_lexer_set_input
void ts_lexer_set_input(Lexer *self, TSInput input)
Definition: lexer.c:308

ts_lexer__mark_end
static void ts_lexer__mark_end(TSLexer *_self)
Definition: lexer.c:215

ts_lexer_delete
void ts_lexer_delete(Lexer *self)
Definition: lexer.c:304

BYTE_ORDER_MARK
static const int32_t BYTE_ORDER_MARK
Definition: lexer.c:24

ts_lexer_advance_to_end
void ts_lexer_advance_to_end(Lexer *self)
Definition: lexer.c:357

ts_lexer_start
void ts_lexer_start(Lexer *self)
Definition: lexer.c:322

LOG
#define LOG(message, character)
Definition: lexer.c:7

ts_lexer_init
void ts_lexer_init(Lexer *self)
Definition: lexer.c:275

ts_lexer__clear_chunk
static void ts_lexer__clear_chunk(Lexer *self)
Definition: lexer.c:49

ts_lexer__get_chunk
static void ts_lexer__get_chunk(Lexer *self)
Definition: lexer.c:57

ts_lexer_goto
static void ts_lexer_goto(Lexer *self, Length position)
Definition: lexer.c:105

ts_lexer__advance
static void ts_lexer__advance(TSLexer *_self, bool skip)
Definition: lexer.c:200

ts_lexer_included_ranges
TSRange * ts_lexer_included_ranges(const Lexer *self, uint32_t *count)
Definition: lexer.c:395

ts_lexer__do_advance
static void ts_lexer__do_advance(Lexer *self, bool skip)
Definition: lexer.c:156

DEFAULT_RANGE
static const TSRange DEFAULT_RANGE
Definition: lexer.c:26

ts_lexer_set_included_ranges
bool ts_lexer_set_included_ranges(Lexer *self, const TSRange *ranges, uint32_t count)
Definition: lexer.c:367

lexer.h

memcpy
memcpy(mem, inblock.get(), min(CONTAINING_RECORD(inblock.get(), MEMBLOCK, data) ->size, size))

capstone.range
range
Definition: __init__.py:6

int32_t
int int32_t
Definition: sftypes.h:33

uint32_t
unsigned int uint32_t
Definition: sftypes.h:29

uint8_t
unsigned char uint8_t
Definition: sftypes.h:31

UINT32_MAX
#define UINT32_MAX
Definition: stdint-msvc2008.h:142

Length
Definition: length.h:9

Length::bytes
uint32_t bytes
Definition: length.h:10

Lexer
Definition: lexer.h:13

TSInput
Definition: api.h:67

TSLexer
Definition: parser.h:43

TSPoint::row
uint32_t row
Definition: api.h:56

TSRange
Definition: api.h:60

TSRange::end_point
TSPoint end_point
Definition: api.h:62

TSRange::start_byte
uint32_t start_byte
Definition: api.h:63

TSRange::start_point
TSPoint start_point
Definition: api.h:61

TSRange::end_byte
uint32_t end_byte
Definition: api.h:64

chunk
Definition: malloc.c:21

subtree.h

unicode.h

TS_DECODE_ERROR
static const int32_t TS_DECODE_ERROR
Definition: unicode.h:16

ts_decode_utf16
static uint32_t ts_decode_utf16(const uint8_t *string, uint32_t length, int32_t *code_point)
Definition: unicode.h:36

ts_decode_utf8
static uint32_t ts_decode_utf8(const uint8_t *string, uint32_t length, int32_t *code_point)
Definition: unicode.h:26

UnicodeDecodeFunction
uint32_t(* UnicodeDecodeFunction)(const uint8_t *string, uint32_t length, int32_t *code_point)
Definition: unicode.h:20

input
static bool input(void *ud, zip_uint8_t *data, zip_uint64_t length)
Definition: zip_algorithm_bzip2.c:190