Rizin
unix-like reverse engineering framework and cli tools
lexer.c
Go to the documentation of this file.
1 #include <stdio.h>
2 #include "./lexer.h"
3 #include "./subtree.h"
4 #include "./length.h"
5 #include "./unicode.h"
6 
7 #define LOG(message, character) \
8  if (self->logger.log) { \
9  snprintf( \
10  self->debug_buffer, \
11  TREE_SITTER_SERIALIZATION_BUFFER_SIZE, \
12  32 <= character && character < 127 ? \
13  message " character:'%c'" : \
14  message " character:%d", \
15  character \
16  ); \
17  self->logger.log( \
18  self->logger.payload, \
19  TSLogTypeLex, \
20  self->debug_buffer \
21  ); \
22  }
23 
24 static const int32_t BYTE_ORDER_MARK = 0xFEFF;
25 
26 static const TSRange DEFAULT_RANGE = {
27  .start_point = {
28  .row = 0,
29  .column = 0,
30  },
31  .end_point = {
32  .row = UINT32_MAX,
33  .column = UINT32_MAX,
34  },
35  .start_byte = 0,
36  .end_byte = UINT32_MAX
37 };
38 
39 // Check if the lexer has reached EOF. This state is stored
40 // by setting the lexer's `current_included_range_index` such that
41 // it has consumed all of its available ranges.
42 static bool ts_lexer__eof(const TSLexer *_self) {
43  Lexer *self = (Lexer *)_self;
44  return self->current_included_range_index == self->included_range_count;
45 }
46 
47 // Clear the currently stored chunk of source code, because the lexer's
48 // position has changed.
49 static void ts_lexer__clear_chunk(Lexer *self) {
50  self->chunk = NULL;
51  self->chunk_size = 0;
52  self->chunk_start = 0;
53 }
54 
55 // Call the lexer's input callback to obtain a new chunk of source code
56 // for the current position.
57 static void ts_lexer__get_chunk(Lexer *self) {
58  self->chunk_start = self->current_position.bytes;
59  self->chunk = self->input.read(
60  self->input.payload,
61  self->current_position.bytes,
62  self->current_position.extent,
63  &self->chunk_size
64  );
65  if (!self->chunk_size) {
66  self->current_included_range_index = self->included_range_count;
67  self->chunk = NULL;
68  }
69 }
70 
71 // Decode the next unicode character in the current chunk of source code.
72 // This assumes that the lexer has already retrieved a chunk of source
73 // code that spans the current position.
74 static void ts_lexer__get_lookahead(Lexer *self) {
75  uint32_t position_in_chunk = self->current_position.bytes - self->chunk_start;
76  uint32_t size = self->chunk_size - position_in_chunk;
77 
78  if (size == 0) {
79  self->lookahead_size = 1;
80  self->data.lookahead = '\0';
81  return;
82  }
83 
84  const uint8_t *chunk = (const uint8_t *)self->chunk + position_in_chunk;
85  UnicodeDecodeFunction decode = self->input.encoding == TSInputEncodingUTF8
88 
89  self->lookahead_size = decode(chunk, size, &self->data.lookahead);
90 
91  // If this chunk ended in the middle of a multi-byte character,
92  // try again with a fresh chunk.
93  if (self->data.lookahead == TS_DECODE_ERROR && size < 4) {
94  ts_lexer__get_chunk(self);
95  chunk = (const uint8_t *)self->chunk;
96  size = self->chunk_size;
97  self->lookahead_size = decode(chunk, size, &self->data.lookahead);
98  }
99 
100  if (self->data.lookahead == TS_DECODE_ERROR) {
101  self->lookahead_size = 1;
102  }
103 }
104 
105 static void ts_lexer_goto(Lexer *self, Length position) {
106  self->current_position = position;
107  bool found_included_range = false;
108 
109  // Move to the first valid position at or after the given position.
110  for (unsigned i = 0; i < self->included_range_count; i++) {
111  TSRange *included_range = &self->included_ranges[i];
112  if (included_range->end_byte > position.bytes) {
113  if (included_range->start_byte >= position.bytes) {
114  self->current_position = (Length) {
115  .bytes = included_range->start_byte,
116  .extent = included_range->start_point,
117  };
118  }
119 
120  self->current_included_range_index = i;
121  found_included_range = true;
122  break;
123  }
124  }
125 
126  if (found_included_range) {
127  // If the current position is outside of the current chunk of text,
128  // then clear out the current chunk of text.
129  if (self->chunk && (
130  position.bytes < self->chunk_start ||
131  position.bytes >= self->chunk_start + self->chunk_size
132  )) {
133  ts_lexer__clear_chunk(self);
134  }
135 
136  self->lookahead_size = 0;
137  self->data.lookahead = '\0';
138  }
139 
140  // If the given position is beyond any of included ranges, move to the EOF
141  // state - past the end of the included ranges.
142  else {
143  self->current_included_range_index = self->included_range_count;
144  TSRange *last_included_range = &self->included_ranges[self->included_range_count - 1];
145  self->current_position = (Length) {
146  .bytes = last_included_range->end_byte,
147  .extent = last_included_range->end_point,
148  };
149  ts_lexer__clear_chunk(self);
150  self->lookahead_size = 1;
151  self->data.lookahead = '\0';
152  }
153 }
154 
155 // Intended to be called only from functions that control logging.
156 static void ts_lexer__do_advance(Lexer *self, bool skip) {
157  if (self->lookahead_size) {
158  self->current_position.bytes += self->lookahead_size;
159  if (self->data.lookahead == '\n') {
160  self->current_position.extent.row++;
161  self->current_position.extent.column = 0;
162  } else {
163  self->current_position.extent.column += self->lookahead_size;
164  }
165  }
166 
167  const TSRange *current_range = NULL;
168  if (self->current_included_range_index < self->included_range_count) {
169  current_range = &self->included_ranges[self->current_included_range_index];
170  if (self->current_position.bytes == current_range->end_byte) {
171  self->current_included_range_index++;
172  if (self->current_included_range_index < self->included_range_count) {
173  current_range++;
174  self->current_position = (Length) {
175  current_range->start_byte,
176  current_range->start_point,
177  };
178  } else {
179  current_range = NULL;
180  }
181  }
182  }
183 
184  if (skip) self->token_start_position = self->current_position;
185 
186  if (current_range) {
187  if (self->current_position.bytes >= self->chunk_start + self->chunk_size) {
188  ts_lexer__get_chunk(self);
189  }
191  } else {
192  ts_lexer__clear_chunk(self);
193  self->data.lookahead = '\0';
194  self->lookahead_size = 1;
195  }
196 }
197 
198 // Advance to the next character in the source code, retrieving a new
199 // chunk of source code if needed.
200 static void ts_lexer__advance(TSLexer *_self, bool skip) {
201  Lexer *self = (Lexer *)_self;
202  if (!self->chunk) return;
203 
204  if (skip) {
205  LOG("skip", self->data.lookahead);
206  } else {
207  LOG("consume", self->data.lookahead);
208  }
209 
210  ts_lexer__do_advance(self, skip);
211 }
212 
213 // Mark that a token match has completed. This can be called multiple
214 // times if a longer match is found later.
215 static void ts_lexer__mark_end(TSLexer *_self) {
216  Lexer *self = (Lexer *)_self;
217  if (!ts_lexer__eof(&self->data)) {
218  // If the lexer is right at the beginning of included range,
219  // then the token should be considered to end at the *end* of the
220  // previous included range, rather than here.
221  TSRange *current_included_range = &self->included_ranges[
222  self->current_included_range_index
223  ];
224  if (
225  self->current_included_range_index > 0 &&
226  self->current_position.bytes == current_included_range->start_byte
227  ) {
228  TSRange *previous_included_range = current_included_range - 1;
229  self->token_end_position = (Length) {
230  previous_included_range->end_byte,
231  previous_included_range->end_point,
232  };
233  return;
234  }
235  }
236  self->token_end_position = self->current_position;
237 }
238 
240  Lexer *self = (Lexer *)_self;
241 
242  uint32_t goal_byte = self->current_position.bytes;
243 
244  self->did_get_column = true;
245  self->current_position.bytes -= self->current_position.extent.column;
246  self->current_position.extent.column = 0;
247 
248  if (self->current_position.bytes < self->chunk_start) {
249  ts_lexer__get_chunk(self);
250  }
251 
252  uint32_t result = 0;
254  while (self->current_position.bytes < goal_byte && !ts_lexer__eof(_self) && self->chunk) {
255  ts_lexer__do_advance(self, false);
256  result++;
257  }
258 
259  return result;
260 }
261 
262 // Is the lexer at a boundary between two disjoint included ranges of
263 // source code? This is exposed as an API because some languages' external
264 // scanners need to perform custom actions at these boundaries.
266  const Lexer *self = (const Lexer *)_self;
267  if (self->current_included_range_index < self->included_range_count) {
268  TSRange *current_range = &self->included_ranges[self->current_included_range_index];
269  return self->current_position.bytes == current_range->start_byte;
270  } else {
271  return false;
272  }
273 }
274 
275 void ts_lexer_init(Lexer *self) {
276  *self = (Lexer) {
277  .data = {
278  // The lexer's methods are stored as struct fields so that generated
279  // parsers can call them without needing to be linked against this
280  // library.
281  .advance = ts_lexer__advance,
282  .mark_end = ts_lexer__mark_end,
283  .get_column = ts_lexer__get_column,
284  .is_at_included_range_start = ts_lexer__is_at_included_range_start,
285  .eof = ts_lexer__eof,
286  .lookahead = 0,
287  .result_symbol = 0,
288  },
289  .chunk = NULL,
290  .chunk_size = 0,
291  .chunk_start = 0,
292  .current_position = {0, {0, 0}},
293  .logger = {
294  .payload = NULL,
295  .log = NULL
296  },
297  .included_ranges = NULL,
298  .included_range_count = 0,
299  .current_included_range_index = 0,
300  };
302 }
303 
304 void ts_lexer_delete(Lexer *self) {
305  ts_free(self->included_ranges);
306 }
307 
309  self->input = input;
310  ts_lexer__clear_chunk(self);
311  ts_lexer_goto(self, self->current_position);
312 }
313 
314 // Move the lexer to the given position. This doesn't do any work
315 // if the parser is already at the given position.
316 void ts_lexer_reset(Lexer *self, Length position) {
317  if (position.bytes != self->current_position.bytes) {
318  ts_lexer_goto(self, position);
319  }
320 }
321 
322 void ts_lexer_start(Lexer *self) {
323  self->token_start_position = self->current_position;
324  self->token_end_position = LENGTH_UNDEFINED;
325  self->data.result_symbol = 0;
326  self->did_get_column = false;
327  if (!ts_lexer__eof(&self->data)) {
328  if (!self->chunk_size) ts_lexer__get_chunk(self);
329  if (!self->lookahead_size) ts_lexer__get_lookahead(self);
330  if (
331  self->current_position.bytes == 0 &&
332  self->data.lookahead == BYTE_ORDER_MARK
333  ) ts_lexer__advance(&self->data, true);
334  }
335 }
336 
337 void ts_lexer_finish(Lexer *self, uint32_t *lookahead_end_byte) {
338  if (length_is_undefined(self->token_end_position)) {
339  ts_lexer__mark_end(&self->data);
340  }
341 
342  uint32_t current_lookahead_end_byte = self->current_position.bytes + 1;
343 
344  // In order to determine that a byte sequence is invalid UTF8 or UTF16,
345  // the character decoding algorithm may have looked at the following byte.
346  // Therefore, the next byte *after* the current (invalid) character
347  // affects the interpretation of the current character.
348  if (self->data.lookahead == TS_DECODE_ERROR) {
349  current_lookahead_end_byte++;
350  }
351 
352  if (current_lookahead_end_byte > *lookahead_end_byte) {
353  *lookahead_end_byte = current_lookahead_end_byte;
354  }
355 }
356 
358  while (self->chunk) {
359  ts_lexer__advance(&self->data, false);
360  }
361 }
362 
364  ts_lexer__mark_end(&self->data);
365 }
366 
368  Lexer *self,
369  const TSRange *ranges,
371 ) {
372  if (count == 0 || !ranges) {
373  ranges = &DEFAULT_RANGE;
374  count = 1;
375  } else {
376  uint32_t previous_byte = 0;
377  for (unsigned i = 0; i < count; i++) {
378  const TSRange *range = &ranges[i];
379  if (
380  range->start_byte < previous_byte ||
381  range->end_byte < range->start_byte
382  ) return false;
383  previous_byte = range->end_byte;
384  }
385  }
386 
387  size_t size = count * sizeof(TSRange);
388  self->included_ranges = ts_realloc(self->included_ranges, size);
389  memcpy(self->included_ranges, ranges, size);
390  self->included_range_count = count;
391  ts_lexer_goto(self, self->current_position);
392  return true;
393 }
394 
396  *count = self->included_range_count;
397  return self->included_ranges;
398 }
399 
400 #undef LOG
#define ts_realloc
Definition: alloc.h:27
#define ts_free
Definition: alloc.h:30
lzma_index ** i
Definition: index.h:629
@ TSInputEncodingUTF8
Definition: api.h:45
#define NULL
Definition: cris-opc.c:27
static static sync static getppid static getegid const char static filename char static len const char char static bufsiz static mask static vfork const void static prot static getpgrp const char static swapflags static arg static fd static protocol static who struct sockaddr static addrlen static backlog struct timeval struct timezone static tz const struct iovec static count static mode const void const struct sockaddr static tolen const char static pathname void count
Definition: sflib.h:98
int(* decode)(const ut8 *, ebc_command_t *cmd)
Definition: ebc_disas.c:88
void skip(file *in, unsigned n)
Definition: gzappend.c:202
voidpf void uLong size
Definition: ioapi.h:138
static const Length LENGTH_UNDEFINED
Definition: length.h:14
static bool length_is_undefined(Length length)
Definition: length.h:17
static uint32_t ts_lexer__get_column(TSLexer *_self)
Definition: lexer.c:239
void ts_lexer_reset(Lexer *self, Length position)
Definition: lexer.c:316
void ts_lexer_finish(Lexer *self, uint32_t *lookahead_end_byte)
Definition: lexer.c:337
static bool ts_lexer__eof(const TSLexer *_self)
Definition: lexer.c:42
static bool ts_lexer__is_at_included_range_start(const TSLexer *_self)
Definition: lexer.c:265
void ts_lexer_mark_end(Lexer *self)
Definition: lexer.c:363
static void ts_lexer__get_lookahead(Lexer *self)
Definition: lexer.c:74
void ts_lexer_set_input(Lexer *self, TSInput input)
Definition: lexer.c:308
static void ts_lexer__mark_end(TSLexer *_self)
Definition: lexer.c:215
void ts_lexer_delete(Lexer *self)
Definition: lexer.c:304
static const int32_t BYTE_ORDER_MARK
Definition: lexer.c:24
void ts_lexer_advance_to_end(Lexer *self)
Definition: lexer.c:357
void ts_lexer_start(Lexer *self)
Definition: lexer.c:322
#define LOG(message, character)
Definition: lexer.c:7
void ts_lexer_init(Lexer *self)
Definition: lexer.c:275
static void ts_lexer__clear_chunk(Lexer *self)
Definition: lexer.c:49
static void ts_lexer__get_chunk(Lexer *self)
Definition: lexer.c:57
static void ts_lexer_goto(Lexer *self, Length position)
Definition: lexer.c:105
static void ts_lexer__advance(TSLexer *_self, bool skip)
Definition: lexer.c:200
TSRange * ts_lexer_included_ranges(const Lexer *self, uint32_t *count)
Definition: lexer.c:395
static void ts_lexer__do_advance(Lexer *self, bool skip)
Definition: lexer.c:156
static const TSRange DEFAULT_RANGE
Definition: lexer.c:26
bool ts_lexer_set_included_ranges(Lexer *self, const TSRange *ranges, uint32_t count)
Definition: lexer.c:367
memcpy(mem, inblock.get(), min(CONTAINING_RECORD(inblock.get(), MEMBLOCK, data) ->size, size))
int int32_t
Definition: sftypes.h:33
unsigned int uint32_t
Definition: sftypes.h:29
unsigned char uint8_t
Definition: sftypes.h:31
#define UINT32_MAX
Definition: length.h:9
uint32_t bytes
Definition: length.h:10
Definition: lexer.h:13
Definition: api.h:67
Definition: parser.h:43
uint32_t row
Definition: api.h:56
Definition: api.h:60
TSPoint end_point
Definition: api.h:62
uint32_t start_byte
Definition: api.h:63
TSPoint start_point
Definition: api.h:61
uint32_t end_byte
Definition: api.h:64
Definition: malloc.c:21
static const int32_t TS_DECODE_ERROR
Definition: unicode.h:16
static uint32_t ts_decode_utf16(const uint8_t *string, uint32_t length, int32_t *code_point)
Definition: unicode.h:36
static uint32_t ts_decode_utf8(const uint8_t *string, uint32_t length, int32_t *code_point)
Definition: unicode.h:26
uint32_t(* UnicodeDecodeFunction)(const uint8_t *string, uint32_t length, int32_t *code_point)
Definition: unicode.h:20
static bool input(void *ud, zip_uint8_t *data, zip_uint64_t length)