Rizin
unix-like reverse engineering framework and cli tools
c_cpp_parser.c
Go to the documentation of this file.
1 // SPDX-FileCopyrightText: 2021 Anton Kochkov <anton.kochkov@gmail.com>
2 // SPDX-License-Identifier: LGPL-3.0-only
3 
4 #include <stdio.h>
5 #include <rz_types.h>
6 #include <rz_list.h>
7 #include <rz_util/rz_file.h>
8 #include <rz_type.h>
9 #include <tree_sitter/api.h>
10 
11 #include <types_parser.h>
12 
// Stores the values of ts_node_start_byte(node) and ts_node_end_byte(node)
// into `start` and `end`. Both outputs must be assignable lvalues and `node`
// is evaluated twice, so do not pass an expression with side effects.
#define TS_START_END(node, start, end) \
	do { \
		(start) = ts_node_start_byte(node); \
		(end) = ts_node_end_byte(node); \
	} while (0)
18 
19 static char *ts_node_sub_string(TSNode node, const char *cstr) {
20  ut32 start, end;
21  TS_START_END(node, start, end);
22  return rz_str_newf("%.*s", end - start, cstr + start);
23 }
24 
25 // Declare the `tree_sitter_c` function, which is
26 // implemented by the `tree-sitter-c` library.
28 
29 // Declare the `tree_sitter_cpp` function, which is
30 // implemented by the `tree-sitter-cpp` library.
31 // TSLanguage *tree_sitter_cpp();
32 
33 CParserState *c_parser_state_new(HtPP *base_types, HtPP *callable_types) {
35  if (!base_types) {
36  state->types = ht_pp_new0();
37  } else {
38  state->types = base_types;
39  }
40  if (!callable_types) {
41  state->callables = ht_pp_new0();
42  } else {
43  state->callables = callable_types;
44  }
45  // Forward definitions require to have a special hashtable
46  state->forward = ht_pp_new0();
47  // Initializing error/warning/debug messages buffers
48  state->errors = rz_strbuf_new("");
49  state->warnings = rz_strbuf_new("");
50  state->debug = rz_strbuf_new("");
51  state->verbose = false;
52  return state;
53 }
54 
56  ht_pp_free(state->forward);
57  ht_pp_free(state->types);
58  ht_pp_free(state->callables);
59  rz_strbuf_free(state->debug);
60  rz_strbuf_free(state->warnings);
61  rz_strbuf_free(state->errors);
62  free(state);
63  return;
64 }
65 
67  ht_pp_free(state->forward);
68  rz_strbuf_free(state->debug);
69  rz_strbuf_free(state->warnings);
70  rz_strbuf_free(state->errors);
71  free(state);
72  return;
73 }
74 
76  rz_strbuf_free(state->debug);
77  rz_strbuf_free(state->warnings);
78  rz_strbuf_free(state->errors);
79  // Initializing error/warning/debug messages buffers
80  state->errors = rz_strbuf_new("");
81  state->warnings = rz_strbuf_new("");
82  state->debug = rz_strbuf_new("");
83  return;
84 }
85 
88 };
89 
98  if (!parser) {
99  return NULL;
100  }
101  parser->state = c_parser_state_new(NULL, NULL);
102  return parser;
103 }
104 
117  if (!parser) {
118  return NULL;
119  }
120  parser->state = c_parser_state_new(types, callables);
121  return parser;
122 }
123 
128  // We do not destroy HT by default since it might be used after
130  free(parser);
131 }
132 
137  c_parser_state_free(parser->state);
138  free(parser);
139 }
140 
141 static int type_parse_string(CParserState *state, const char *code, char **error_msg) {
142  // Create a parser.
144  // Set the parser's language (C in this case)
146 
147  TSTree *tree = ts_parser_parse_string(parser, NULL, code, strlen(code));
148 
149  // Get the root node of the syntax tree.
150  TSNode root_node = ts_tree_root_node(tree);
151  int root_node_child_count = ts_node_named_child_count(root_node);
152  if (!root_node_child_count) {
153  parser_warning(state, "Root node is empty!\n");
154  ts_tree_delete(tree);
156  return 0;
157  }
158 
159  // Some debugging
160  if (state->verbose) {
161  parser_debug(state, "root_node (%d children): %s\n", root_node_child_count, ts_node_type(root_node));
162  // Print the syntax tree as an S-expression.
163  char *string = ts_node_string(root_node);
164  parser_debug(state, "Syntax tree: %s\n", string);
165  free(string);
166  }
167 
168  // At first step we should handle defines
169  // #define
170  // #if / #ifdef
171  // #else
172  // #endif
173  // After that, we should process include files and #error/#warning/#pragma
174  // Temporarily we could just run preprocessing step using tccpp code
175  //
176  // And only after that - run the normal C/C++ syntax parsing
177 
178  // Filter types function prototypes and start parsing
179  int i = 0, result = 0;
180  for (i = 0; i < root_node_child_count; i++) {
181  TSNode child = ts_node_named_child(root_node, i);
182  // We skip ";" or "," - empty expressions
183  char *node_code = ts_node_sub_string(child, code);
184  if (!strcmp(node_code, ";") || !strcmp(node_code, ",")) {
185  free(node_code);
186  continue;
187  }
188  free(node_code);
189  parser_debug(state, "Processing %d child...\n", i);
190  result += parse_type_nodes_save(state, child, code);
191  }
192 
193  // If there were errors during the parser then the result is different from 0
194  if (result) {
195  char *error_msgs = rz_strbuf_drain_nofree(state->errors);
196  RZ_LOG_DEBUG("Errors:\n");
197  RZ_LOG_DEBUG("%s", error_msgs);
198  char *warning_msgs = rz_strbuf_drain_nofree(state->warnings);
199  RZ_LOG_DEBUG("Warnings:\n");
200  RZ_LOG_DEBUG("%s", warning_msgs);
201  if (error_msg) {
202  *error_msg = strdup(error_msgs);
203  }
204  free(error_msgs);
205  free(warning_msgs);
206  }
207  if (state->verbose) {
208  char *debug_msgs = rz_strbuf_drain_nofree(state->debug);
209  RZ_LOG_DEBUG("%s", debug_msgs);
210  free(debug_msgs);
211  }
212 
213  // After everything parsed, we should preserve the base type database
214  // And the state of the parser - anonymous structs, forward declarations, etc
215  ts_tree_delete(tree);
217  return result;
218 }
219 
227 RZ_API int rz_type_parse_string_stateless(RzTypeParser *parser, const char *code, char **error_msg) {
228  return type_parse_string(parser->state, code, error_msg);
229 }
230 
239 RZ_API int rz_type_parse_file_stateless(RzTypeParser *parser, const char *path, const char *dir, char **error_msg) {
240  size_t read_bytes = 0;
241  char *source_code = rz_file_slurp(path, &read_bytes);
242  if (!source_code || !read_bytes) {
243  free(source_code);
244  return -1;
245  }
246  RZ_LOG_DEBUG("File size is %" PFMT64d " bytes, read %zu bytes\n", rz_file_size(path), read_bytes);
247  int result = rz_type_parse_string_stateless(parser, source_code, error_msg);
248  free(source_code);
249  return result;
250 }
251 
260 RZ_API int rz_type_parse_file(RzTypeDB *typedb, const char *path, const char *dir, char **error_msg) {
261  size_t read_bytes = 0;
262  char *source_code = rz_file_slurp(path, &read_bytes);
263  if (!source_code || !read_bytes) {
264  free(source_code);
265  return -1;
266  }
267  RZ_LOG_DEBUG("File size is %" PFMT64d " bytes, read %zu bytes\n", rz_file_size(path), read_bytes);
268  int result = rz_type_parse_string(typedb, source_code, error_msg);
269  free(source_code);
270  return result;
271 }
272 
280 RZ_API int rz_type_parse_string(RzTypeDB *typedb, const char *code, char **error_msg) {
281  bool verbose = true;
282  // Create new C parser state
283  CParserState *state = c_parser_state_new(typedb->types, typedb->callables);
284  if (!state) {
285  eprintf("CParserState initialization error!\n");
286  return -1;
287  }
288  state->verbose = verbose;
289  return type_parse_string(state, code, error_msg);
290 }
291 
298  rz_type_parser_free(typedb->parser);
299  typedb->parser = rz_type_parser_new();
300 }
301 
311  if (error_msg) {
312  *error_msg = NULL;
313  }
314  // Create a parser.
315  TSParser *tsparser = ts_parser_new();
316  // Set the parser's language (C in this case)
318 
319  // Note, that the original C grammar doesn't have support for alternate roots,
320  // see:
321  // - https://github.com/tree-sitter/tree-sitter-c/issues/65
322  // - https://github.com/tree-sitter/tree-sitter/issues/1105
323  // Thus, we use our own patched C grammar that has an additional rule
324  // for type descriptor, but we use the `__TYPE_EXPRESSION` prefix for every
325  // such type descriptor expression.
326  char *patched_code = rz_str_newf("__TYPE_EXPRESSION %s", code);
327 
328  TSTree *tree = ts_parser_parse_string(tsparser, NULL, patched_code, strlen(patched_code));
329 
330  // Get the root node of the syntax tree.
331  TSNode root_node = ts_tree_root_node(tree);
332  int root_node_child_count = ts_node_named_child_count(root_node);
333  if (!root_node_child_count) {
334  parser_warning(parser->state, "Root node is empty!\n");
335  ts_tree_delete(tree);
336  ts_parser_delete(tsparser);
337  free(patched_code);
338  return NULL;
339  }
340 
341  // Some debugging
342  if (parser->state->verbose) {
343  parser_debug(parser->state, "code: \"%s\"\n", code);
344  parser_debug(parser->state, "patched code: \"%s\"\n", patched_code);
345  parser_debug(parser->state, "root_node (%d children): %s\n", root_node_child_count, ts_node_type(root_node));
346  // Print the syntax tree as an S-expression.
347  char *string = ts_node_string(root_node);
348  parser_debug(parser->state, "Syntax tree: %s\n", string);
349  free(string);
350  }
351 
352  // At first step we should handle defines
353  // #define
354  // #if / #ifdef
355  // #else
356  // #endif
357  // After that, we should process include files and #error/#warning/#pragma
358  // Temporarily we could just run preprocessing step using tccpp code
359  //
360  // And only after that - run the normal C/C++ syntax parsing
361 
362  // Filter types function prototypes and start parsing
363  int i = 0, result = 0;
364  ParserTypePair *tpair = NULL;
365  for (i = 0; i < root_node_child_count; i++) {
366  parser_debug(parser->state, "Processing %d child...\n", i);
367  TSNode child = ts_node_named_child(root_node, i);
368  if (!parse_type_descriptor_single(parser->state, child, patched_code, &tpair)) {
369  break;
370  }
371  }
372 
373  // If there were errors during the parser then the result is different from 0
374  if (result || !tpair) {
375  char *error_msgs = rz_strbuf_drain_nofree(parser->state->errors);
376  RZ_LOG_DEBUG("Errors:\n");
377  RZ_LOG_DEBUG("%s", error_msgs);
378  char *warning_msgs = rz_strbuf_drain_nofree(parser->state->warnings);
379  RZ_LOG_DEBUG("Warnings:\n");
380  RZ_LOG_DEBUG("%s", warning_msgs);
381  if (error_msg) {
382  *error_msg = strdup(error_msgs);
383  }
384  free(error_msgs);
385  free(warning_msgs);
386  }
387  if (parser->state->verbose) {
388  char *debug_msgs = rz_strbuf_drain_nofree(parser->state->debug);
389  RZ_LOG_DEBUG("%s", debug_msgs);
390  free(debug_msgs);
391  }
392 
393  // After everything parsed, we should preserve the base type database
394  // Also we don't free the parser state, just reset the buffers for new use
396  ts_tree_delete(tree);
397  ts_parser_delete(tsparser);
398  free(patched_code);
399  RzType *ret = tpair ? tpair->type : NULL;
400  free(tpair);
401  return ret;
402 }
403 
412  if (error_msg) {
413  *error_msg = NULL;
414  }
415  // Create a parser.
416  TSParser *tsparser = ts_parser_new();
417  // Set the parser's language (C in this case)
419 
420  TSTree *tree = ts_parser_parse_string(tsparser, NULL, code, strlen(code));
421 
422  // Get the root node of the syntax tree.
423  TSNode root_node = ts_tree_root_node(tree);
424  int root_node_child_count = ts_node_named_child_count(root_node);
425  if (!root_node_child_count) {
426  parser_warning(parser->state, "Root node is empty!\n");
427  ts_tree_delete(tree);
428  ts_parser_delete(tsparser);
429  return NULL;
430  }
431 
432  // Some debugging
433  if (parser->state->verbose) {
434  parser_debug(parser->state, "code: \"%s\"\n", code);
435  parser_debug(parser->state, "root_node (%d children): %s\n", root_node_child_count, ts_node_type(root_node));
436  // Print the syntax tree as an S-expression.
437  char *string = ts_node_string(root_node);
438  parser_debug(parser->state, "Syntax tree: %s\n", string);
439  free(string);
440  }
441 
442  // At first step we should handle defines
443  // #define
444  // #if / #ifdef
445  // #else
446  // #endif
447  // After that, we should process include files and #error/#warning/#pragma
448  // Temporarily we could just run preprocessing step using tccpp code
449  //
450  // And only after that - run the normal C/C++ syntax parsing
451 
452  // Filter types function prototypes and start parsing
453  int i = 0, result = 0;
454  ParserTypePair *tpair = NULL;
455  for (i = 0; i < root_node_child_count; i++) {
456  parser_debug(parser->state, "Processing %d child...\n", i);
457  TSNode child = ts_node_named_child(root_node, i);
458  if (!parse_declaration_node(parser->state, child, code, &tpair)) {
459  break;
460  }
461  }
462 
463  // If there were errors during the parser then the result is different from 0
464  if (result || !tpair) {
465  char *error_msgs = rz_strbuf_drain_nofree(parser->state->errors);
466  RZ_LOG_DEBUG("Errors:\n");
467  RZ_LOG_DEBUG("%s", error_msgs);
468  char *warning_msgs = rz_strbuf_drain_nofree(parser->state->warnings);
469  RZ_LOG_DEBUG("Warnings:\n");
470  RZ_LOG_DEBUG("%s", warning_msgs);
471  if (error_msg) {
472  *error_msg = strdup(error_msgs);
473  }
474  free(error_msgs);
475  free(warning_msgs);
476  }
477  if (parser->state->verbose) {
478  char *debug_msgs = rz_strbuf_drain_nofree(parser->state->debug);
479  RZ_LOG_DEBUG("%s", debug_msgs);
480  free(debug_msgs);
481  }
482 
483  // After everything parsed, we should preserve the base type database
484  // Also we don't free the parser state, just reset the buffers for new use
486  ts_tree_delete(tree);
487  ts_parser_delete(tsparser);
488  return tpair ? tpair->type : NULL;
489 }
lzma_index ** i
Definition: index.h:629
const char * ts_node_type(TSNode)
Definition: node.c:420
void ts_parser_delete(TSParser *parser)
Definition: parser.c:1725
TSNode ts_node_named_child(TSNode, uint32_t)
Definition: node.c:496
char * ts_node_string(TSNode)
Definition: node.c:426
uint32_t ts_node_named_child_count(TSNode)
Definition: node.c:611
void ts_tree_delete(TSTree *self)
Definition: tree.c:26
TSNode ts_tree_root_node(const TSTree *self)
Definition: tree.c:36
bool ts_parser_set_language(TSParser *self, const TSLanguage *language)
Definition: parser.c:1754
TSParser * ts_parser_new(void)
Definition: parser.c:1704
TSTree * ts_parser_parse_string(TSParser *self, const TSTree *old_tree, const char *string, uint32_t length)
Definition: parser.c:1945
RZ_API RZ_OWN RzTypeParser * rz_type_parser_init(HtPP *types, HtPP *callables)
Creates a new instance of the C type parser.
Definition: c_cpp_parser.c:115
RZ_API int rz_type_parse_file(RzTypeDB *typedb, const char *path, const char *dir, char **error_msg)
Parses the C types file creating the new parser state.
Definition: c_cpp_parser.c:260
TSLanguage * tree_sitter_c()
Definition: parser.c:79645
RZ_API RZ_OWN RzType * rz_type_parse_string_declaration_single(RzTypeParser *parser, const char *code, char **error_msg)
Parses the single C type declaration.
Definition: c_cpp_parser.c:411
#define TS_START_END(node, start, end)
Definition: c_cpp_parser.c:13
RZ_API RZ_OWN RzTypeParser * rz_type_parser_new()
Creates a new instance of the C type parser.
Definition: c_cpp_parser.c:96
RZ_API void rz_type_parser_free(RZ_NONNULL RzTypeParser *parser)
Frees the instance of the C type parser without destroying hashtables.
Definition: c_cpp_parser.c:127
void c_parser_state_free_keep_ht(CParserState *state)
Definition: c_cpp_parser.c:66
RZ_API RZ_OWN RzType * rz_type_parse_string_single(RzTypeParser *parser, const char *code, char **error_msg)
Parses the single C type definition.
Definition: c_cpp_parser.c:309
RZ_API int rz_type_parse_string_stateless(RzTypeParser *parser, const char *code, char **error_msg)
Parses the C type string reusing the existing parser state.
Definition: c_cpp_parser.c:227
static int type_parse_string(CParserState *state, const char *code, char **error_msg)
Definition: c_cpp_parser.c:141
void c_parser_state_reset_keep_ht(CParserState *state)
Definition: c_cpp_parser.c:75
CParserState * c_parser_state_new(HtPP *base_types, HtPP *callable_types)
Definition: c_cpp_parser.c:33
RZ_API int rz_type_parse_file_stateless(RzTypeParser *parser, const char *path, const char *dir, char **error_msg)
Parses the C types file reusing the existing parser state.
Definition: c_cpp_parser.c:239
static char * ts_node_sub_string(TSNode node, const char *cstr)
Definition: c_cpp_parser.c:19
RZ_API void rz_type_parse_reset(RzTypeDB *typedb)
Reset the C parser state.
Definition: c_cpp_parser.c:297
void c_parser_state_free(CParserState *state)
Definition: c_cpp_parser.c:55
RZ_API int rz_type_parse_string(RzTypeDB *typedb, const char *code, char **error_msg)
Parses the C type string creating the new parser state.
Definition: c_cpp_parser.c:280
RZ_API void rz_type_parser_free_purge(RZ_NONNULL RzTypeParser *parser)
Frees the instance of the C type parser and destroy the hashtables.
Definition: c_cpp_parser.c:136
#define RZ_API
#define NULL
Definition: cris-opc.c:27
static static fork const void static count static fd const char const char static newpath const char static path const char path
Definition: sflib.h:35
static static sync static getppid static getegid const char static filename char static len const char char static bufsiz static mask static vfork const void static prot static getpgrp const char static swapflags static arg static fd static protocol static who struct sockaddr static addrlen static backlog struct timeval struct timezone static tz const struct iovec static count static mode const void const struct sockaddr static tolen const char static pathname void static offset struct stat static buf void long static basep static whence static length const void static len static semflg const void static shmflg const struct timespec struct timespec static rem const char static group const void start
Definition: sflib.h:133
uint32_t ut32
RZ_API void Ht_() free(HtName_(Ht) *ht)
Definition: ht_inc.c:130
return strdup("=SP r13\n" "=LR r14\n" "=PC r15\n" "=A0 r0\n" "=A1 r1\n" "=A2 r2\n" "=A3 r3\n" "=ZF zf\n" "=SF nf\n" "=OF vf\n" "=CF cf\n" "=SN or0\n" "gpr lr .32 56 0\n" "gpr pc .32 60 0\n" "gpr cpsr .32 64 0 ____tfiae_________________qvczn\n" "gpr or0 .32 68 0\n" "gpr tf .1 64.5 0 thumb\n" "gpr ef .1 64.9 0 endian\n" "gpr jf .1 64.24 0 java\n" "gpr qf .1 64.27 0 sticky_overflow\n" "gpr vf .1 64.28 0 overflow\n" "gpr cf .1 64.29 0 carry\n" "gpr zf .1 64.30 0 zero\n" "gpr nf .1 64.31 0 negative\n" "gpr itc .4 64.10 0 if_then_count\n" "gpr gef .4 64.16 0 great_or_equal\n" "gpr r0 .32 0 0\n" "gpr r1 .32 4 0\n" "gpr r2 .32 8 0\n" "gpr r3 .32 12 0\n" "gpr r4 .32 16 0\n" "gpr r5 .32 20 0\n" "gpr r6 .32 24 0\n" "gpr r7 .32 28 0\n" "gpr r8 .32 32 0\n" "gpr r9 .32 36 0\n" "gpr r10 .32 40 0\n" "gpr r11 .32 44 0\n" "gpr r12 .32 48 0\n" "gpr r13 .32 52 0\n" "gpr r14 .32 56 0\n" "gpr r15 .32 60 0\n" "gpr r16 .32 64 0\n" "gpr r17 .32 68 0\n")
insn_type_descr_t types[]
Definition: or1k_disas.c:7
#define eprintf(x, y...)
Definition: rlcc.c:7
#define rz_return_val_if_fail(expr, val)
Definition: rz_assert.h:108
RZ_API RZ_OWN char * rz_file_slurp(const char *str, RZ_NULLABLE size_t *usz)
Definition: file.c:454
RZ_API ut64 rz_file_size(const char *str)
Definition: file.c:205
#define RZ_LOG_DEBUG(fmtstr,...)
Definition: rz_log.h:49
RZ_API char * rz_str_newf(const char *fmt,...) RZ_PRINTF_CHECK(1
RZ_API RZ_OWN char * rz_strbuf_drain_nofree(RzStrBuf *sb)
Definition: strbuf.c:349
RZ_API RzStrBuf * rz_strbuf_new(const char *s)
Definition: strbuf.c:8
RZ_API void rz_strbuf_free(RzStrBuf *sb)
Definition: strbuf.c:358
#define PFMT64d
Definition: rz_types.h:394
#define RZ_OWN
Definition: rz_types.h:62
#define RZ_NEW0(x)
Definition: rz_types.h:284
#define RZ_NONNULL
Definition: rz_types.h:64
RzType * type
Definition: types_parser.h:24
Definition: api.h:92
Definition: tree.h:15
Definition: inftree9.h:24
RzTypeParser * parser
Definition: rz_type.h:37
HtPP * callables
Definition: rz_type.h:35
HtPP * types
Definition: rz_type.h:33
CParserState * state
Definition: c_cpp_parser.c:87
Definition: dis.h:43
int parse_declaration_node(CParserState *state, TSNode node, const char *text, ParserTypePair **tpair)
int parse_type_nodes_save(CParserState *state, TSNode node, const char *text)
int parse_type_descriptor_single(CParserState *state, TSNode node, const char *text, ParserTypePair **tpair)
void parser_debug(CParserState *state, const char *fmt,...)
Definition: types_parser.c:37
void parser_warning(CParserState *state, const char *fmt,...)
Definition: types_parser.c:55
static int verbose
Definition: z80asm.c:73