Rizin
unix-like reverse engineering framework and cli tools
lexer.c File Reference
#include <stdio.h>
#include "./lexer.h"
#include "./subtree.h"
#include "./length.h"
#include "./unicode.h"

Go to the source code of this file.

Macros

#define LOG(message, character)
 

Functions

static bool ts_lexer__eof (const TSLexer *_self)
 
static void ts_lexer__clear_chunk (Lexer *self)
 
static void ts_lexer__get_chunk (Lexer *self)
 
static void ts_lexer__get_lookahead (Lexer *self)
 
static void ts_lexer_goto (Lexer *self, Length position)
 
static void ts_lexer__do_advance (Lexer *self, bool skip)
 
static void ts_lexer__advance (TSLexer *_self, bool skip)
 
static void ts_lexer__mark_end (TSLexer *_self)
 
static uint32_t ts_lexer__get_column (TSLexer *_self)
 
static bool ts_lexer__is_at_included_range_start (const TSLexer *_self)
 
void ts_lexer_init (Lexer *self)
 
void ts_lexer_delete (Lexer *self)
 
void ts_lexer_set_input (Lexer *self, TSInput input)
 
void ts_lexer_reset (Lexer *self, Length position)
 
void ts_lexer_start (Lexer *self)
 
void ts_lexer_finish (Lexer *self, uint32_t *lookahead_end_byte)
 
void ts_lexer_advance_to_end (Lexer *self)
 
void ts_lexer_mark_end (Lexer *self)
 
bool ts_lexer_set_included_ranges (Lexer *self, const TSRange *ranges, uint32_t count)
 
TSRange * ts_lexer_included_ranges (const Lexer *self, uint32_t *count)
 

Variables

static const int32_t BYTE_ORDER_MARK = 0xFEFF
 
static const TSRange DEFAULT_RANGE
 

Macro Definition Documentation

◆ LOG

#define LOG (   message,
  character 
)
Value:
if (self->logger.log) { \
snprintf( \
self->debug_buffer, \
TREE_SITTER_SERIALIZATION_BUFFER_SIZE, \
32 <= character && character < 127 ? \
message " character:'%c'" : \
message " character:%d", \
character \
); \
self->logger.log( \
self->logger.payload, \
TSLogTypeLex, \
self->debug_buffer \
); \
}
@ TSLogTypeLex
Definition: api.h:75
#define TREE_SITTER_SERIALIZATION_BUFFER_SIZE
Definition: parser.h:14
char * message
Definition: main.c:12

Definition at line 7 of file lexer.c.

Function Documentation

◆ ts_lexer__advance()

static void ts_lexer__advance ( TSLexer * _self,
bool  skip 
)
static

Definition at line 200 of file lexer.c.

200  {
201  Lexer *self = (Lexer *)_self;
202  if (!self->chunk) return;
203 
204  if (skip) {
205  LOG("skip", self->data.lookahead);
206  } else {
207  LOG("consume", self->data.lookahead);
208  }
209 
210  ts_lexer__do_advance(self, skip);
211 }
void skip(file *in, unsigned n)
Definition: gzappend.c:202
#define LOG(message, character)
Definition: lexer.c:7
static void ts_lexer__do_advance(Lexer *self, bool skip)
Definition: lexer.c:156
Definition: lexer.h:13

References LOG, skip(), and ts_lexer__do_advance().

Referenced by ts_lexer_advance_to_end(), ts_lexer_init(), and ts_lexer_start().

◆ ts_lexer__clear_chunk()

static void ts_lexer__clear_chunk ( Lexer * self)
static

Definition at line 49 of file lexer.c.

49  {
50  self->chunk = NULL;
51  self->chunk_size = 0;
52  self->chunk_start = 0;
53 }
#define NULL
Definition: cris-opc.c:27

References NULL.

Referenced by ts_lexer__do_advance(), ts_lexer_goto(), and ts_lexer_set_input().

◆ ts_lexer__do_advance()

static void ts_lexer__do_advance ( Lexer * self,
bool  skip 
)
static

Definition at line 156 of file lexer.c.

156  {
157  if (self->lookahead_size) {
158  self->current_position.bytes += self->lookahead_size;
159  if (self->data.lookahead == '\n') {
160  self->current_position.extent.row++;
161  self->current_position.extent.column = 0;
162  } else {
163  self->current_position.extent.column += self->lookahead_size;
164  }
165  }
166 
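167  const TSRange *current_range = NULL;
168  if (self->current_included_range_index < self->included_range_count) {
169  current_range = &self->included_ranges[self->current_included_range_index];
170  if (self->current_position.bytes == current_range->end_byte) {
171  self->current_included_range_index++;
172  if (self->current_included_range_index < self->included_range_count) {
173  current_range++;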
174  self->current_position = (Length) {
175  current_range->start_byte,
176  current_range->start_point,
177  };
178  } else {
179  current_range = NULL;
180  }
181  }
182  }
183 
184  if (skip) self->token_start_position = self->current_position;
185 
186  if (current_range) {
187  if (self->current_position.bytes >= self->chunk_start + self->chunk_size) {
188  ts_lexer__get_chunk(self);
189  }
190  ts_lexer__get_lookahead(self);
191  } else {
192  ts_lexer__clear_chunk(self);
193  self->data.lookahead = '\0';
194  self->lookahead_size = 1;
195  }
196 }
static void ts_lexer__get_lookahead(Lexer *self)
Definition: lexer.c:74
static void ts_lexer__clear_chunk(Lexer *self)
Definition: lexer.c:49
static void ts_lexer__get_chunk(Lexer *self)
Definition: lexer.c:57
Definition: length.h:9
uint32_t bytes
Definition: length.h:10
TSLexer data
Definition: lexer.h:14
uint32_t lookahead_size
Definition: lexer.h:28
uint32_t current_included_range_index
Definition: lexer.h:25
uint32_t chunk_size
Definition: lexer.h:27
uint32_t included_range_count
Definition: lexer.h:24
Length current_position
Definition: lexer.h:15
uint32_t chunk_start
Definition: lexer.h:26
int32_t lookahead
Definition: parser.h:44
Definition: api.h:60
uint32_t start_byte
Definition: api.h:63
TSPoint start_point
Definition: api.h:61
uint32_t end_byte
Definition: api.h:64

References TSRange::end_byte, NULL, skip(), TSRange::start_byte, TSRange::start_point, ts_lexer__clear_chunk(), ts_lexer__get_chunk(), and ts_lexer__get_lookahead().

Referenced by ts_lexer__advance(), and ts_lexer__get_column().

◆ ts_lexer__eof()

static bool ts_lexer__eof ( const TSLexer * _self)
static

Definition at line 42 of file lexer.c.

42  {
43  Lexer *self = (Lexer *)_self;
44  return self->current_included_range_index == self->included_range_count;
45 }

Referenced by ts_lexer__get_column(), ts_lexer__mark_end(), ts_lexer_init(), and ts_lexer_start().

◆ ts_lexer__get_chunk()

static void ts_lexer__get_chunk ( Lexer * self)
static

Definition at line 57 of file lexer.c.

57  {
58  self->chunk_start = self->current_position.bytes;
59  self->chunk = self->input.read(
60  self->input.payload,
61  self->current_position.bytes,
62  self->current_position.extent,
63  &self->chunk_size
64  );
65  if (!self->chunk_size) {
66  self->current_included_range_index = self->included_range_count;
67  self->chunk = NULL;
68  }
69 }
TSPoint extent
Definition: length.h:11
TSInput input
Definition: lexer.h:21
void * payload
Definition: api.h:68

References NULL.

Referenced by ts_lexer__do_advance(), ts_lexer__get_column(), ts_lexer__get_lookahead(), and ts_lexer_start().

◆ ts_lexer__get_column()

static uint32_t ts_lexer__get_column ( TSLexer * _self)
static

Definition at line 239 of file lexer.c.

239  {
240  Lexer *self = (Lexer *)_self;
241 
242  uint32_t goal_byte = self->current_position.bytes;
243 
244  self->did_get_column = true;
245  self->current_position.bytes -= self->current_position.extent.column;
246  self->current_position.extent.column = 0;
247 
248  if (self->current_position.bytes < self->chunk_start) {
249  ts_lexer__get_chunk(self);
250  }
251 
252  uint32_t result = 0;
253  ts_lexer__get_lookahead(self);
254  while (self->current_position.bytes < goal_byte && !ts_lexer__eof(_self) && self->chunk) {
255  ts_lexer__do_advance(self, false);
256  result++;
257  }
258 
259  return result;
260 }
static bool ts_lexer__eof(const TSLexer *_self)
Definition: lexer.c:42
unsigned int uint32_t
Definition: sftypes.h:29

References ts_lexer__do_advance(), ts_lexer__eof(), ts_lexer__get_chunk(), and ts_lexer__get_lookahead().

Referenced by ts_lexer_init().

◆ ts_lexer__get_lookahead()

static void ts_lexer__get_lookahead ( Lexer * self)
static

Definition at line 74 of file lexer.c.

74  {
75  uint32_t position_in_chunk = self->current_position.bytes - self->chunk_start;
76  uint32_t size = self->chunk_size - position_in_chunk;
77 
78  if (size == 0) {
79  self->lookahead_size = 1;
80  self->data.lookahead = '\0';
81  return;
82  }
83 
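84  const uint8_t *chunk = (const uint8_t *)self->chunk + position_in_chunk;
85  UnicodeDecodeFunction decode = self->input.encoding == TSInputEncodingUTF8
86  ? ts_decode_utf8
87  : ts_decode_utf16;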
88 
89  self->lookahead_size = decode(chunk, size, &self->data.lookahead);
90 
91  // If this chunk ended in the middle of a multi-byte character,
92  // try again with a fresh chunk.
93  if (self->data.lookahead == TS_DECODE_ERROR && size < 4) {
94  ts_lexer__get_chunk(self);
95  chunk = (const uint8_t *)self->chunk;
96  size = self->chunk_size;
97  self->lookahead_size = decode(chunk, size, &self->data.lookahead);
98  }
99 
100  if (self->data.lookahead == TS_DECODE_ERROR) {
101  self->lookahead_size = 1;
102  }
103 }
@ TSInputEncodingUTF8
Definition: api.h:45
int(* decode)(const ut8 *, ebc_command_t *cmd)
Definition: ebc_disas.c:88
voidpf void uLong size
Definition: ioapi.h:138
unsigned char uint8_t
Definition: sftypes.h:31
const char * chunk
Definition: lexer.h:20
TSInputEncoding encoding
Definition: api.h:70
Definition: malloc.c:21
static const int32_t TS_DECODE_ERROR
Definition: unicode.h:16
static uint32_t ts_decode_utf16(const uint8_t *string, uint32_t length, int32_t *code_point)
Definition: unicode.h:36
static uint32_t ts_decode_utf8(const uint8_t *string, uint32_t length, int32_t *code_point)
Definition: unicode.h:26
uint32_t(* UnicodeDecodeFunction)(const uint8_t *string, uint32_t length, int32_t *code_point)
Definition: unicode.h:20

References TS_DECODE_ERROR, ts_decode_utf16(), ts_decode_utf8(), ts_lexer__get_chunk(), and TSInputEncodingUTF8.

Referenced by ts_lexer__do_advance(), ts_lexer__get_column(), and ts_lexer_start().

◆ ts_lexer__is_at_included_range_start()

static bool ts_lexer__is_at_included_range_start ( const TSLexer * _self)
static

Definition at line 265 of file lexer.c.

265  {
266  const Lexer *self = (const Lexer *)_self;
267  if (self->current_included_range_index < self->included_range_count) {
268  TSRange *current_range = &self->included_ranges[self->current_included_range_index];
269  return self->current_position.bytes == current_range->start_byte;
270  } else {
271  return false;
272  }
273 }

References TSRange::start_byte.

Referenced by ts_lexer_init().

◆ ts_lexer__mark_end()

static void ts_lexer__mark_end ( TSLexer * _self)
static

Definition at line 215 of file lexer.c.

215  {
216  Lexer *self = (Lexer *)_self;
217  if (!ts_lexer__eof(&self->data)) {
218  // If the lexer is right at the beginning of included range,
219  // then the token should be considered to end at the *end* of the
220  // previous included range, rather than here.
221  TSRange *current_included_range = &self->included_ranges[
222  self->current_included_range_index
223  ];
224  if (
225  self->current_included_range_index > 0 &&
226  self->current_position.bytes == current_included_range->start_byte
227  ) {
228  TSRange *previous_included_range = current_included_range - 1;
229  self->token_end_position = (Length) {
230  previous_included_range->end_byte,
231  previous_included_range->end_point,
232  };
233  return;
234  }
235  }
236  self->token_end_position = self->current_position;
237 }
TSPoint end_point
Definition: api.h:62

References TSRange::end_byte, TSRange::end_point, TSRange::start_byte, and ts_lexer__eof().

Referenced by ts_lexer_finish(), ts_lexer_init(), and ts_lexer_mark_end().

◆ ts_lexer_advance_to_end()

void ts_lexer_advance_to_end ( Lexer * self)

Definition at line 357 of file lexer.c.

357  {
358  while (self->chunk) {
359  ts_lexer__advance(&self->data, false);
360  }
361 }
static void ts_lexer__advance(TSLexer *_self, bool skip)
Definition: lexer.c:200

References ts_lexer__advance().

Referenced by parser__halt_parse().

◆ ts_lexer_delete()

void ts_lexer_delete ( Lexer * self)

Definition at line 304 of file lexer.c.

304  {
305  ts_free(self->included_ranges);
306 }
#define ts_free
Definition: alloc.h:30
TSRange * included_ranges
Definition: lexer.h:19

References ts_free.

Referenced by ts_parser_delete().

◆ ts_lexer_finish()

void ts_lexer_finish ( Lexer * self,
uint32_t * lookahead_end_byte
)

Definition at line 337 of file lexer.c.

337  {
338  if (length_is_undefined(self->token_end_position)) {
339  ts_lexer__mark_end(&self->data);
340  }
341 
342  uint32_t current_lookahead_end_byte = self->current_position.bytes + 1;
343 
344  // In order to determine that a byte sequence is invalid UTF8 or UTF16,
345  // the character decoding algorithm may have looked at the following byte.
346  // Therefore, the next byte *after* the current (invalid) character
347  // affects the interpretation of the current character.
348  if (self->data.lookahead == TS_DECODE_ERROR) {
349  current_lookahead_end_byte++;
350  }
351 
352  if (current_lookahead_end_byte > *lookahead_end_byte) {
353  *lookahead_end_byte = current_lookahead_end_byte;
354  }
355 }
static bool length_is_undefined(Length length)
Definition: length.h:17
static void ts_lexer__mark_end(TSLexer *_self)
Definition: lexer.c:215
Length token_end_position
Definition: lexer.h:17

References length_is_undefined(), TS_DECODE_ERROR, and ts_lexer__mark_end().

Referenced by ts_parser__lex().

◆ ts_lexer_goto()

static void ts_lexer_goto ( Lexer * self,
Length  position 
)
static

Definition at line 105 of file lexer.c.

105  {
106  self->current_position = position;
107  bool found_included_range = false;
108 
109  // Move to the first valid position at or after the given position.
110  for (unsigned i = 0; i < self->included_range_count; i++) {
111  TSRange *included_range = &self->included_ranges[i];
112  if (included_range->end_byte > position.bytes) {
113  if (included_range->start_byte >= position.bytes) {
114  self->current_position = (Length) {
115  .bytes = included_range->start_byte,
116  .extent = included_range->start_point,
117  };
118  }
119 
120  self->current_included_range_index = i;
121  found_included_range = true;
122  break;
123  }
124  }
125 
126  if (found_included_range) {
127  // If the current position is outside of the current chunk of text,
128  // then clear out the current chunk of text.
129  if (self->chunk && (
130  position.bytes < self->chunk_start ||
131  position.bytes >= self->chunk_start + self->chunk_size
132  )) {
133  ts_lexer__clear_chunk(self);
134  }
135 
136  self->lookahead_size = 0;
137  self->data.lookahead = '\0';
138  }
139 
140  // If the given position is beyond any of included ranges, move to the EOF
141  // state - past the end of the included ranges.
142  else {
143  self->current_included_range_index = self->included_range_count;
144  TSRange *last_included_range = &self->included_ranges[self->included_range_count - 1];
145  self->current_position = (Length) {
146  .bytes = last_included_range->end_byte,
147  .extent = last_included_range->end_point,
148  };
149  ts_lexer__clear_chunk(self);
150  self->lookahead_size = 1;
151  self->data.lookahead = '\0';
152  }
153 }
lzma_index ** i
Definition: index.h:629

References Length::bytes, TSRange::end_byte, TSRange::end_point, i, TSRange::start_byte, TSRange::start_point, and ts_lexer__clear_chunk().

Referenced by ts_lexer_reset(), ts_lexer_set_included_ranges(), and ts_lexer_set_input().

◆ ts_lexer_included_ranges()

TSRange* ts_lexer_included_ranges ( const Lexer * self,
uint32_t * count
)

Definition at line 395 of file lexer.c.

395  {
396  *count = self->included_range_count;
397  return self->included_ranges;
398 }
static static sync static getppid static getegid const char static filename char static len const char char static bufsiz static mask static vfork const void static prot static getpgrp const char static swapflags static arg static fd static protocol static who struct sockaddr static addrlen static backlog struct timeval struct timezone static tz const struct iovec static count static mode const void const struct sockaddr static tolen const char static pathname void count
Definition: sflib.h:98

References count.

Referenced by ts_parser_included_ranges().

◆ ts_lexer_init()

void ts_lexer_init ( Lexer * self)

Definition at line 275 of file lexer.c.

275  {
276  *self = (Lexer) {
277  .data = {
278  // The lexer's methods are stored as struct fields so that generated
279  // parsers can call them without needing to be linked against this
280  // library.
281  .advance = ts_lexer__advance,
282  .mark_end = ts_lexer__mark_end,
283  .get_column = ts_lexer__get_column,
284  .is_at_included_range_start = ts_lexer__is_at_included_range_start,
285  .eof = ts_lexer__eof,
286  .lookahead = 0,
287  .result_symbol = 0,
288  },
289  .chunk = NULL,
290  .chunk_size = 0,
291  .chunk_start = 0,
292  .current_position = {0, {0, 0}},
293  .logger = {
294  .payload = NULL,
295  .log = NULL
296  },
297  .included_ranges = NULL,
298  .included_range_count = 0,
299  .current_included_range_index = 0,
300  };
301  ts_lexer_set_included_ranges(self, NULL, 0);
302 }
static uint32_t ts_lexer__get_column(TSLexer *_self)
Definition: lexer.c:239
static bool ts_lexer__is_at_included_range_start(const TSLexer *_self)
Definition: lexer.c:265
bool ts_lexer_set_included_ranges(Lexer *self, const TSRange *ranges, uint32_t count)
Definition: lexer.c:367

References NULL, ts_lexer__advance(), ts_lexer__eof(), ts_lexer__get_column(), ts_lexer__is_at_included_range_start(), ts_lexer__mark_end(), and ts_lexer_set_included_ranges().

Referenced by parser_init(), and ts_parser_new().

◆ ts_lexer_mark_end()

void ts_lexer_mark_end ( Lexer * self)

Definition at line 363 of file lexer.c.

363  {
364  ts_lexer__mark_end(&self->data);
365 }

References ts_lexer__mark_end().

Referenced by ts_parser__handle_error().

◆ ts_lexer_reset()

void ts_lexer_reset ( Lexer * self,
Length  position 
)

Definition at line 316 of file lexer.c.

316  {
317  if (position.bytes != self->current_position.bytes) {
318  ts_lexer_goto(self, position);
319  }
320 }
static void ts_lexer_goto(Lexer *self, Length position)
Definition: lexer.c:105

References Length::bytes, and ts_lexer_goto().

Referenced by parser__lex(), ts_parser__handle_error(), ts_parser__lex(), and ts_parser_reset().

◆ ts_lexer_set_included_ranges()

bool ts_lexer_set_included_ranges ( Lexer * self,
const TSRange * ranges,
uint32_t  count 
)

Definition at line 367 of file lexer.c.

371  {
372  if (count == 0 || !ranges) {
373  ranges = &DEFAULT_RANGE;
374  count = 1;
375  } else {
376  uint32_t previous_byte = 0;
377  for (unsigned i = 0; i < count; i++) {
378  const TSRange *range = &ranges[i];
379  if (
380  range->start_byte < previous_byte ||
381  range->end_byte < range->start_byte
382  ) return false;
383  previous_byte = range->end_byte;
384  }
385  }
386 
387  size_t size = count * sizeof(TSRange);
388  self->included_ranges = ts_realloc(self->included_ranges, size);
389  memcpy(self->included_ranges, ranges, size);
390  self->included_range_count = count;
391  ts_lexer_goto(self, self->current_position);
392  return true;
393 }
#define ts_realloc
Definition: alloc.h:27
static const TSRange DEFAULT_RANGE
Definition: lexer.c:26
memcpy(mem, inblock.get(), min(CONTAINING_RECORD(inblock.get(), MEMBLOCK, data) ->size, size))

References count, DEFAULT_RANGE, i, memcpy(), capstone::range, ts_lexer_goto(), and ts_realloc.

Referenced by ts_lexer_init(), and ts_parser_set_included_ranges().

◆ ts_lexer_set_input()

void ts_lexer_set_input ( Lexer * self,
TSInput  input 
)

Definition at line 308 of file lexer.c.

308  {
309  self->input = input;
310  ts_lexer__clear_chunk(self);
311  ts_lexer_goto(self, self->current_position);
312 }
static bool input(void *ud, zip_uint8_t *data, zip_uint64_t length)

References input(), ts_lexer__clear_chunk(), and ts_lexer_goto().

Referenced by parser__start(), and ts_parser_parse().

◆ ts_lexer_start()

void ts_lexer_start ( Lexer * self)

Definition at line 322 of file lexer.c.

322  {
323  self->token_start_position = self->current_position;
324  self->token_end_position = LENGTH_UNDEFINED;
325  self->data.result_symbol = 0;
326  self->did_get_column = false;
327  if (!ts_lexer__eof(&self->data)) {
328  if (!self->chunk_size) ts_lexer__get_chunk(self);
329  if (!self->lookahead_size) ts_lexer__get_lookahead(self);
330  if (
331  self->current_position.bytes == 0 &&
332  self->data.lookahead == BYTE_ORDER_MARK
333  ) ts_lexer__advance(&self->data, true);
334  }
335 }
static const Length LENGTH_UNDEFINED
Definition: length.h:14
static const int32_t BYTE_ORDER_MARK
Definition: lexer.c:24

References BYTE_ORDER_MARK, LENGTH_UNDEFINED, ts_lexer__advance(), ts_lexer__eof(), ts_lexer__get_chunk(), and ts_lexer__get_lookahead().

Referenced by parser__lex(), and ts_parser__lex().

Variable Documentation

◆ BYTE_ORDER_MARK

const int32_t BYTE_ORDER_MARK = 0xFEFF
static

Definition at line 24 of file lexer.c.

Referenced by ts_lexer_start().

◆ DEFAULT_RANGE

const TSRange DEFAULT_RANGE
static
Initial value:
= {
.start_point = {
.row = 0,
.column = 0,
},
.end_point = {
.row = UINT32_MAX,
.column = UINT32_MAX,
},
.start_byte = 0,
.end_byte = UINT32_MAX
}
#define UINT32_MAX

Definition at line 26 of file lexer.c.

Referenced by ts_lexer_set_included_ranges().