Rizin
unix-like reverse engineering framework and cli tools
utf8.c File Reference
#include <rz_types.h>
#include <rz_util.h>
#include <rz_windows.h>

Go to the source code of this file.

Macros

#define UTF_LAST_BLOCK   (281)
 
#define UTF_BLOCKS_COUNT   RZ_ARRAY_SIZE(utf_blocks)
 
#define UTF_NONPRINTABLE_RANGES_COUNT   RZ_ARRAY_SIZE(nonprintable_ranges)
 

Functions

RZ_API const char * rz_utf_block_name (int idx)
 
RZ_API int rz_utf8_decode (const ut8 *ptr, int ptrlen, RzRune *ch)
 
RZ_API int rz_mutf8_decode (const ut8 *ptr, int ptrlen, RzRune *ch)
 
RZ_API int rz_utf8_encode (ut8 *ptr, const RzRune ch)
 
RZ_API int rz_utf8_encode_str (const RzRune *str, ut8 *dst, const int dst_length)
 
RZ_API int rz_utf8_size (const ut8 *ptr)
 
RZ_API int rz_utf8_strlen (const ut8 *str)
 
RZ_API bool rz_rune_is_printable (const RzRune c)
 Returns true when the RzRune is a printable symbol. More...
 
RZ_API int rz_utf_block_idx (RzRune ch)
 
RZ_API int * rz_utf_block_list (const ut8 *str, int len, int **freq_list)
 
RZ_API RzStrEnc rz_utf_bom_encoding (const ut8 *ptr, int ptrlen)
 

Variables

struct {
   ut32   from
 
   ut32   to
 
} nonprintable_ranges []
 
const RUtfBlock utf_blocks []
 

Macro Definition Documentation

◆ UTF_BLOCKS_COUNT

#define UTF_BLOCKS_COUNT   RZ_ARRAY_SIZE(utf_blocks)

Definition at line 11 of file utf8.c.

◆ UTF_LAST_BLOCK

#define UTF_LAST_BLOCK   (281)

Definition at line 10 of file utf8.c.

◆ UTF_NONPRINTABLE_RANGES_COUNT

#define UTF_NONPRINTABLE_RANGES_COUNT   RZ_ARRAY_SIZE(nonprintable_ranges)

Definition at line 12 of file utf8.c.

Function Documentation

◆ rz_mutf8_decode()

RZ_API int rz_mutf8_decode ( const ut8 * ptr,
int  ptrlen,
RzRune * ch 
)

Definition at line 524 of file utf8.c.

524  {
525  if (ptrlen > 1 && ptr[0] == 0xc0 && ptr[1] == 0x80) {
526  if (ch) {
527  *ch = 0;
528  }
529  return 2;
530  }
531  return rz_utf8_decode(ptr, ptrlen, ch);
532 }
RZ_API int rz_utf8_decode(const ut8 *ptr, int ptrlen, RzRune *ch)
Definition: utf8.c:492

References rz_utf8_decode().

Referenced by escape_utf8_for_json().

◆ rz_rune_is_printable()

RZ_API bool rz_rune_is_printable ( const RzRune  c)

Returns true when the RzRune is a printable symbol.

Parameters
c: RzRune value to test
Returns
true if the rune is printable, otherwise false

Definition at line 606 of file utf8.c.

606  {
607  // RzRunes are most commonly single byte... We can early out with this common case.
608  if (c < 0x34F) {
609  /*
610  manually copied from top, please update if this ever changes
611  { 0x0000, 0x001F }, { 0x007F, 0x009F }, { 0x034F, 0x034F },
612  could do a linear search, but that's a lot slower than a few compare
613  */
614  return !(c <= 0x1F || (c >= 0x7F && c <= 0x9F));
615  }
616 
617  const int last = UTF_NONPRINTABLE_RANGES_COUNT;
618 
619  int low = 0;
620  int hi = last - 1;
621 
622  do {
623  int mid = (low + hi) >> 1;
624  if (c >= nonprintable_ranges[mid].from && c <= nonprintable_ranges[mid].to) {
625  return false;
626  }
627  if (mid < last && c > nonprintable_ranges[mid].to) {
628  low = mid + 1;
629  }
630  if (mid < last && c < nonprintable_ranges[mid].from) {
631  hi = mid - 1;
632  }
633  } while (low <= hi);
634 
635  return true;
636 }
hi(addr) 0x03
#define c(i)
Definition: sha256.c:43
const struct @335 nonprintable_ranges[]
ut32 to
Definition: utf8.c:14
#define UTF_NONPRINTABLE_RANGES_COUNT
Definition: utf8.c:12
ut32 from
Definition: utf8.c:14

References c, from, hi(), nonprintable_ranges, to, and UTF_NONPRINTABLE_RANGES_COUNT.

Referenced by escape_utf8_for_json(), process_one_string(), rz_scan_strings_raw(), and rz_str_stringify_raw_buffer().

◆ rz_utf8_decode()

RZ_API int rz_utf8_decode ( const ut8 * ptr,
int  ptrlen,
RzRune * ch 
)

Definition at line 492 of file utf8.c.

492  {
493  if (ptrlen < 1) {
494  return 0;
495  }
496  if (ptr[0] < 0x80) {
497  if (ch) {
498  *ch = (ut32)ptr[0];
499  }
500  return 1;
501  } else if (ptrlen > 1 && (ptr[0] & 0xe0) == 0xc0 && (ptr[1] & 0xc0) == 0x80) {
502  RzRune rune = (ptr[0] & 0x1f) << 6 | (ptr[1] & 0x3f);
503  if (ch) {
504  *ch = rune;
505  }
506  return rune < 0x80 ? 0 : 2;
507  } else if (ptrlen > 2 && (ptr[0] & 0xf0) == 0xe0 && (ptr[1] & 0xc0) == 0x80 && (ptr[2] & 0xc0) == 0x80) {
508  RzRune rune = (ptr[0] & 0xf) << 12 | (ptr[1] & 0x3f) << 6 | (ptr[2] & 0x3f);
509  if (ch) {
510  *ch = rune;
511  }
512  return rune < 0x800 ? 0 : 3;
513  } else if (ptrlen > 3 && (ptr[0] & 0xf8) == 0xf0 && (ptr[1] & 0xc0) == 0x80 && (ptr[2] & 0xc0) == 0x80 && (ptr[3] & 0xc0) == 0x80) {
514  RzRune rune = (ptr[0] & 7) << 18 | (ptr[1] & 0x3f) << 12 | (ptr[2] & 0x3f) << 6 | (ptr[3] & 0x3f);
515  if (ch) {
516  *ch = rune;
517  }
518  return rune < 0x10000 ? 0 : 4;
519  }
520  return 0;
521 }
uint32_t ut32
ut32 RzRune
Definition: rz_utf8.h:13

Referenced by calculate_utf8_string_info(), can_be_utf16_le(), can_be_utf32_le(), ds_esc_str(), escape_utf8_for_json(), process_one_string(), rz_mutf8_decode(), rz_scan_strings_raw(), rz_search_keyword_new_wide(), rz_str_escape_utf(), rz_str_is_printable(), rz_str_is_printable_incl_newlines(), rz_str_is_printable_limited(), rz_str_is_utf8(), rz_str_stringify_raw_buffer(), rz_utf_block_list(), and sanitize_cab_filename().

◆ rz_utf8_encode()

RZ_API int rz_utf8_encode ( ut8 * ptr,
const RzRune  ch 
)

Definition at line 535 of file utf8.c.

535  {
536  if (ch < 0x80) {
537  ptr[0] = (ut8)ch;
538  return 1;
539  } else if (ch < 0x800) {
540  ptr[0] = 0xc0 | (ch >> 6);
541  ptr[1] = 0x80 | (ch & 0x3f);
542  return 2;
543  } else if (ch < 0x10000) {
544  ptr[0] = 0xe0 | (ch >> 12);
545  ptr[1] = 0x80 | ((ch >> 6) & 0x3f);
546  ptr[2] = 0x80 | (ch & 0x3f);
547  return 3;
548  } else if (ch < 0x200000) {
549  ptr[0] = 0xf0 | (ch >> 18);
550  ptr[1] = 0x80 | ((ch >> 12) & 0x3f);
551  ptr[2] = 0x80 | ((ch >> 6) & 0x3f);
552  ptr[3] = 0x80 | (ch & 0x3f);
553  return 4;
554  }
555  return 0;
556 }
#define ut8
Definition: dcpu16.h:8

References ut8.

Referenced by process_one_string(), rz_str_escape_utf(), rz_str_stringify_raw_buffer(), rz_utf8_encode_str(), sanitize_cab_filename(), and unescape_string().

◆ rz_utf8_encode_str()

RZ_API int rz_utf8_encode_str ( const RzRune * str,
ut8 * dst,
const int  dst_length 
)

Definition at line 559 of file utf8.c.

559  {
560  if (!str || !dst) {
561  return -1;
562  }
563 
564  int pos = 0;
565  for (size_t i = 0; i < sizeof(str) - 1 && str[i] && pos < dst_length - 1; i++) {
566  pos += rz_utf8_encode(&dst[pos], str[i]);
567  }
568 
569  dst[pos++] = '\0';
570  return pos;
571 }
lzma_index ** i
Definition: index.h:629
char * dst
Definition: lz4.h:724
int pos
Definition: main.c:11
RZ_API int rz_utf8_encode(ut8 *ptr, const RzRune ch)
Definition: utf8.c:535

References dst, i, pos, rz_utf8_encode(), and cmd_descs_generate::str.

Referenced by rz_core_analysis_hasrefs_to_depth().

◆ rz_utf8_size()

RZ_API int rz_utf8_size ( const ut8 * ptr)

Definition at line 574 of file utf8.c.

574  {
575  const int utf8_size[] = {
576  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
577  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
578  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
579  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
580  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xC0-0xCF
581  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xD0-0xDF
582  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xE0-0xEF
583  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 0xF0-0xFF
584  };
585  return (ptr[0] & 0x80) ? utf8_size[ptr[0] ^ 0x80] : 1;
586 }

◆ rz_utf8_strlen()

RZ_API int rz_utf8_strlen ( const ut8 * str)

Definition at line 588 of file utf8.c.

588  {
589  int len = 0;
590 
591  for (int i = 0; str[i]; i++) {
592  if ((str[i] & 0xc0) != 0x80) {
593  len++;
594  }
595  }
596 
597  return len;
598 }
size_t len
Definition: 6502dis.c:15

References i, len, and cmd_descs_generate::str.

Referenced by ds_print_relocs().

◆ rz_utf_block_idx()

RZ_API int rz_utf_block_idx ( RzRune  ch)

Definition at line 733 of file utf8.c.

733  {
734  const int last = UTF_BLOCKS_COUNT;
735  int low = 0, hi = last - 1, mid = 0;
736 
737  do {
738  mid = (low + hi) >> 1;
739  if (ch >= utf_blocks[mid].from && ch <= utf_blocks[mid].to) {
740  return mid;
741  }
742  if (mid < last && ch > utf_blocks[mid].to) {
743  low = mid + 1;
744  }
745  if (mid < last && ch < utf_blocks[mid].from) {
746  hi = mid - 1;
747  }
748  } while (low <= hi);
749 
750  return UTF_BLOCKS_COUNT - 1; /* index for "No_Block" */
751 }
#define UTF_BLOCKS_COUNT
Definition: utf8.c:11
const RUtfBlock utf_blocks[]
Definition: utf8.c:200

References from, hi(), to, utf_blocks, and UTF_BLOCKS_COUNT.

Referenced by rz_utf_block_list().

◆ rz_utf_block_list()

RZ_API int* rz_utf_block_list ( const ut8 * str,
int  len,
int **  freq_list 
)

Definition at line 754 of file utf8.c.

754  {
755  if (!str) {
756  return NULL;
757  }
758  if (len < 0) {
759  len = strlen((const char *)str);
760  }
761  int block_freq[UTF_BLOCKS_COUNT] = { 0 };
762  int *list = RZ_NEWS0(int, len + 1);
763  if (!list) {
764  return NULL;
765  }
766  int *freq_list_ptr = NULL;
767  if (freq_list) {
768  *freq_list = RZ_NEWS0(int, len + 1);
769  if (!*freq_list) {
770  free(list);
771  return NULL;
772  }
773  freq_list_ptr = *freq_list;
774  }
775  int *list_ptr = list;
776  const ut8 *str_ptr = str;
777  const ut8 *str_end = str + len;
778  RzRune ch = 0;
779  while (str_ptr < str_end) {
780  int block_idx;
781  int ch_bytes = rz_utf8_decode(str_ptr, str_end - str_ptr, &ch);
782  if (!ch_bytes) {
783  block_idx = UTF_BLOCKS_COUNT - 1;
784  ch_bytes = 1;
785  } else {
786  block_idx = rz_utf_block_idx(ch);
787  }
788  if (!block_freq[block_idx]) {
789  *list_ptr = block_idx;
790  list_ptr++;
791  }
792  block_freq[block_idx]++;
793  str_ptr += ch_bytes;
794  }
795  *list_ptr = -1;
796  if (freq_list_ptr) {
797  for (list_ptr = list; *list_ptr != -1; list_ptr++) {
798  *freq_list_ptr = block_freq[*list_ptr];
799  freq_list_ptr++;
800  }
801  *freq_list_ptr = -1;
802  }
803  for (list_ptr = list; *list_ptr != -1; list_ptr++) {
804  block_freq[*list_ptr] = 0;
805  }
806  return list;
807 }
#define NULL
Definition: cris-opc.c:27
RZ_API void Ht_() free(HtName_(Ht) *ht)
Definition: ht_inc.c:130
uint8_t ut8
Definition: lh5801.h:11
static void list(RzEgg *egg)
Definition: rz-gg.c:52
#define RZ_NEWS0(x, y)
Definition: rz_types.h:282
RZ_API int rz_utf_block_idx(RzRune ch)
Definition: utf8.c:733

References free(), len, list(), NULL, RZ_NEWS0, rz_utf8_decode(), rz_utf_block_idx(), cmd_descs_generate::str, and UTF_BLOCKS_COUNT.

Referenced by reduce_false_positives(), and strings_print().

◆ rz_utf_block_name()

RZ_API const char* rz_utf_block_name ( int  idx)

Definition at line 484 of file utf8.c.

484  {
485  if (idx < 0 || idx >= UTF_LAST_BLOCK) {
486  return NULL;
487  }
488  return utf_blocks[idx].name;
489 }
int idx
Definition: setup.py:197
const char * name
Definition: rz_utf8.h:9
#define UTF_LAST_BLOCK
Definition: utf8.c:10

References setup::idx, RUtfBlock::name, NULL, utf_blocks, and UTF_LAST_BLOCK.

Referenced by strings_print().

◆ rz_utf_bom_encoding()

RZ_API RzStrEnc rz_utf_bom_encoding ( const ut8 * ptr,
int  ptrlen 
)

Definition at line 809 of file utf8.c.

809  {
810  if (ptrlen > 3) {
811  if (ptr[0] == 0xff && ptr[1] == 0xfe && !ptr[2] && !ptr[3]) {
812  return RZ_STRING_ENC_UTF32LE;
813  }
814  if (!ptr[0] && !ptr[1] && ptr[2] == 0xfe && ptr[3] == 0xff) {
815  return RZ_STRING_ENC_UTF32BE;
816  }
817  }
818  if (ptrlen > 2) {
819  if (ptr[0] == 0xef && ptr[1] == 0xbb && ptr[2] == 0xbf) {
820  return RZ_STRING_ENC_UTF8;
821  }
822  }
823  if (ptrlen > 1) {
824  if (ptr[0] == 0xff && ptr[1] == 0xfe) {
825  return RZ_STRING_ENC_UTF16LE;
826  }
827  if (ptr[0] == 0xfe && ptr[1] == 0xff) {
828  return RZ_STRING_ENC_UTF16BE;
829  }
830  }
831  return RZ_STRING_ENC_GUESS;
832 }
@ RZ_STRING_ENC_UTF32LE
Definition: rz_str.h:24
@ RZ_STRING_ENC_UTF32BE
Definition: rz_str.h:26
@ RZ_STRING_ENC_UTF8
Definition: rz_str.h:21
@ RZ_STRING_ENC_GUESS
Definition: rz_str.h:33
@ RZ_STRING_ENC_UTF16LE
Definition: rz_str.h:23
@ RZ_STRING_ENC_UTF16BE
Definition: rz_str.h:25

References RZ_STRING_ENC_GUESS, RZ_STRING_ENC_UTF16BE, RZ_STRING_ENC_UTF16LE, RZ_STRING_ENC_UTF32BE, RZ_STRING_ENC_UTF32LE, and RZ_STRING_ENC_UTF8.

Referenced by ds_esc_str(), ds_print_ptr(), and rz_str_guess_encoding_from_buffer().

Variable Documentation

◆ from

ut32 from

Definition at line 14 of file utf8.c.

Referenced by rz_rune_is_printable(), and rz_utf_block_idx().

◆ nonprintable_ranges

const { ... } nonprintable_ranges[]

Referenced by rz_rune_is_printable().

◆ to

ut32 to

Definition at line 14 of file utf8.c.

Referenced by rz_rune_is_printable(), and rz_utf_block_idx().

◆ utf_blocks

const RUtfBlock utf_blocks[]

Definition at line 200 of file utf8.c.

Referenced by rz_utf_block_idx(), and rz_utf_block_name().