Rizin
unix-like reverse engineering framework and cli tools
utf8.c
Go to the documentation of this file.
1 // SPDX-FileCopyrightText: 2014-2018 LemonBoy <thatlemon@gmail.com>
2 // SPDX-FileCopyrightText: 2014-2018 kazarmy <kazarmy@gmail.com>
3 // SPDX-FileCopyrightText: 2014-2018 pancake <pancake@nopcode.org>
4 // SPDX-License-Identifier: LGPL-3.0-only
5 
6 #include <rz_types.h>
7 #include <rz_util.h>
8 #include <rz_windows.h>
9 
/* Exclusive upper bound on the index accepted by rz_utf_block_name(). Hard-coded; the
 * utf_blocks table in this file currently lists 281 entries, so keep the two in sync. */
10 #define UTF_LAST_BLOCK (281)
/* Number of entries in utf_blocks; the last index (count - 1) is the "No_Block" entry. */
11 #define UTF_BLOCKS_COUNT RZ_ARRAY_SIZE(utf_blocks)
/* Number of entries in nonprintable_ranges; used to bound the search in this file. */
12 #define UTF_NONPRINTABLE_RANGES_COUNT RZ_ARRAY_SIZE(nonprintable_ranges)
13 
/*
 * Inclusive [from, to] code point ranges that the printable check in this file
 * (rz_rune_is_printable) reports as NOT printable.
 * Entries are listed in ascending order; that check binary-searches the table, so any
 * new range must be inserted at its sorted position. The first ranges are also
 * hand-copied into the fast path of that check and must be updated together.
 */
14 const struct { ut32 from, to; } nonprintable_ranges[] = {
15  { 0x0000, 0x001F }, { 0x007F, 0x009F }, { 0x034F, 0x034F },
16  { 0x0378, 0x0379 }, { 0x037F, 0x0383 }, { 0x038B, 0x038B },
17  { 0x038D, 0x038D }, { 0x03A2, 0x03A2 }, { 0x0528, 0x0530 },
18  { 0x0557, 0x0558 }, { 0x0560, 0x0560 }, { 0x0588, 0x0588 },
19  { 0x058B, 0x058E }, { 0x0590, 0x0590 }, { 0x05C8, 0x05CF },
20  { 0x05EB, 0x05EF }, { 0x05F5, 0x0605 }, { 0x061C, 0x061D },
21  { 0x06DD, 0x06DD }, { 0x070E, 0x070F }, { 0x074B, 0x074C },
22  { 0x07B2, 0x07BF }, { 0x07FB, 0x07FF }, { 0x082E, 0x082F },
23  { 0x083F, 0x083F }, { 0x085C, 0x085D }, { 0x085F, 0x089F },
24  { 0x08A1, 0x08A1 }, { 0x08AD, 0x08E3 }, { 0x08FF, 0x08FF },
25  { 0x0978, 0x0978 }, { 0x0980, 0x0980 }, { 0x0984, 0x0984 },
26  { 0x098D, 0x098E }, { 0x0991, 0x0992 }, { 0x09A9, 0x09A9 },
27  { 0x09B1, 0x09B1 }, { 0x09B3, 0x09B5 }, { 0x09BA, 0x09BB },
28  { 0x09C5, 0x09C6 }, { 0x09C9, 0x09CA }, { 0x09CF, 0x09D6 },
29  { 0x09D8, 0x09DB }, { 0x09DE, 0x09DE }, { 0x09E4, 0x09E5 },
30  { 0x09FC, 0x0A00 }, { 0x0A04, 0x0A04 }, { 0x0A0B, 0x0A0E },
31  { 0x0A11, 0x0A12 }, { 0x0A29, 0x0A29 }, { 0x0A31, 0x0A31 },
32  { 0x0A34, 0x0A34 }, { 0x0A37, 0x0A37 }, { 0x0A3A, 0x0A3B },
33  { 0x0A3D, 0x0A3D }, { 0x0A43, 0x0A46 }, { 0x0A49, 0x0A4A },
34  { 0x0A4E, 0x0A50 }, { 0x0A52, 0x0A58 }, { 0x0A5D, 0x0A5D },
35  { 0x0A5F, 0x0A65 }, { 0x0A76, 0x0A80 }, { 0x0A84, 0x0A84 },
36  { 0x0A8E, 0x0A8E }, { 0x0A92, 0x0A92 }, { 0x0AA9, 0x0AA9 },
37  { 0x0AB1, 0x0AB1 }, { 0x0AB4, 0x0AB4 }, { 0x0ABA, 0x0ABB },
38  { 0x0AC6, 0x0AC6 }, { 0x0ACA, 0x0ACA }, { 0x0ACE, 0x0ACF },
39  { 0x0AD1, 0x0ADF }, { 0x0AE4, 0x0AE5 }, { 0x0AF2, 0x0B00 },
40  { 0x0B04, 0x0B04 }, { 0x0B0D, 0x0B0E }, { 0x0B11, 0x0B12 },
41  { 0x0B29, 0x0B29 }, { 0x0B31, 0x0B31 }, { 0x0B34, 0x0B34 },
42  { 0x0B3A, 0x0B3B }, { 0x0B45, 0x0B46 }, { 0x0B49, 0x0B4A },
43  { 0x0B4E, 0x0B55 }, { 0x0B58, 0x0B5B }, { 0x0B5E, 0x0B5E },
44  { 0x0B64, 0x0B65 }, { 0x0B78, 0x0B81 }, { 0x0B84, 0x0B84 },
45  { 0x0B8B, 0x0B8D }, { 0x0B91, 0x0B91 }, { 0x0B96, 0x0B98 },
46  { 0x0B9B, 0x0B9B }, { 0x0B9D, 0x0B9D }, { 0x0BA0, 0x0BA2 },
47  { 0x0BA5, 0x0BA7 }, { 0x0BAB, 0x0BAD }, { 0x0BBA, 0x0BBD },
48  { 0x0BC3, 0x0BC5 }, { 0x0BC9, 0x0BC9 }, { 0x0BCE, 0x0BCF },
49  { 0x0BD1, 0x0BD6 }, { 0x0BD8, 0x0BE5 }, { 0x0BFB, 0x0C00 },
50  { 0x0C04, 0x0C04 }, { 0x0C0D, 0x0C0D }, { 0x0C11, 0x0C11 },
51  { 0x0C29, 0x0C29 }, { 0x0C34, 0x0C34 }, { 0x0C3A, 0x0C3C },
52  { 0x0C45, 0x0C45 }, { 0x0C49, 0x0C49 }, { 0x0C4E, 0x0C54 },
53  { 0x0C57, 0x0C57 }, { 0x0C5A, 0x0C5F }, { 0x0C64, 0x0C65 },
54  { 0x0C70, 0x0C77 }, { 0x0C80, 0x0C81 }, { 0x0C84, 0x0C84 },
55  { 0x0C8D, 0x0C8D }, { 0x0C91, 0x0C91 }, { 0x0CA9, 0x0CA9 },
56  { 0x0CB4, 0x0CB4 }, { 0x0CBA, 0x0CBB }, { 0x0CC5, 0x0CC5 },
57  { 0x0CC9, 0x0CC9 }, { 0x0CCE, 0x0CD4 }, { 0x0CD7, 0x0CDD },
58  { 0x0CDF, 0x0CDF }, { 0x0CE4, 0x0CE5 }, { 0x0CF0, 0x0CF0 },
59  { 0x0CF3, 0x0D01 }, { 0x0D04, 0x0D04 }, { 0x0D0D, 0x0D0D },
60  { 0x0D11, 0x0D11 }, { 0x0D3B, 0x0D3C }, { 0x0D45, 0x0D45 },
61  { 0x0D49, 0x0D49 }, { 0x0D4F, 0x0D56 }, { 0x0D58, 0x0D5F },
62  { 0x0D64, 0x0D65 }, { 0x0D76, 0x0D78 }, { 0x0D80, 0x0D81 },
63  { 0x0D84, 0x0D84 }, { 0x0D97, 0x0D99 }, { 0x0DB2, 0x0DB2 },
64  { 0x0DBC, 0x0DBC }, { 0x0DBE, 0x0DBF }, { 0x0DC7, 0x0DC9 },
65  { 0x0DCB, 0x0DCE }, { 0x0DD5, 0x0DD5 }, { 0x0DD7, 0x0DD7 },
66  { 0x0DE0, 0x0DF1 }, { 0x0DF5, 0x0E00 }, { 0x0E3B, 0x0E3E },
67  { 0x0E5C, 0x0E80 }, { 0x0E83, 0x0E83 }, { 0x0E85, 0x0E86 },
68  { 0x0E89, 0x0E89 }, { 0x0E8B, 0x0E8C }, { 0x0E8E, 0x0E93 },
69  { 0x0E98, 0x0E98 }, { 0x0EA0, 0x0EA0 }, { 0x0EA4, 0x0EA4 },
70  { 0x0EA6, 0x0EA6 }, { 0x0EA8, 0x0EA9 }, { 0x0EAC, 0x0EAC },
71  { 0x0EBA, 0x0EBA }, { 0x0EBE, 0x0EBF }, { 0x0EC5, 0x0EC5 },
72  { 0x0EC7, 0x0EC7 }, { 0x0ECE, 0x0ECF }, { 0x0EDA, 0x0EDB },
73  { 0x0EE0, 0x0EFF }, { 0x0F48, 0x0F48 }, { 0x0F6D, 0x0F70 },
74  { 0x0F98, 0x0F98 }, { 0x0FBD, 0x0FBD }, { 0x0FCD, 0x0FCD },
75  { 0x0FDB, 0x0FFF }, { 0x10C6, 0x10C6 }, { 0x10C8, 0x10CC },
76  { 0x10CE, 0x10CF }, { 0x115F, 0x1160 }, { 0x1249, 0x1249 },
77  { 0x124E, 0x124F }, { 0x1257, 0x1257 }, { 0x1259, 0x1259 },
78  { 0x125E, 0x125F }, { 0x1289, 0x1289 }, { 0x128E, 0x128F },
79  { 0x12B1, 0x12B1 }, { 0x12B6, 0x12B7 }, { 0x12BF, 0x12BF },
80  { 0x12C1, 0x12C1 }, { 0x12C6, 0x12C7 }, { 0x12D7, 0x12D7 },
81  { 0x1311, 0x1311 }, { 0x1316, 0x1317 }, { 0x135B, 0x135C },
82  { 0x137D, 0x137F }, { 0x139A, 0x139F }, { 0x13F5, 0x13FF },
83  { 0x169D, 0x169F }, { 0x16F1, 0x16FF }, { 0x170D, 0x170D },
84  { 0x1715, 0x171F }, { 0x1737, 0x173F }, { 0x1754, 0x175F },
85  { 0x176D, 0x176D }, { 0x1771, 0x1771 }, { 0x1774, 0x177F },
86  { 0x17B4, 0x17B5 }, { 0x17DE, 0x17DF }, { 0x17EA, 0x17EF },
87  { 0x17FA, 0x17FF }, { 0x180B, 0x180D }, { 0x180F, 0x180F },
88  { 0x181A, 0x181F }, { 0x1878, 0x187F }, { 0x18AB, 0x18AF },
89  { 0x18F6, 0x18FF }, { 0x191D, 0x191F }, { 0x192C, 0x192F },
90  { 0x193C, 0x193F }, { 0x1941, 0x1943 }, { 0x196E, 0x196F },
91  { 0x1975, 0x197F }, { 0x19AC, 0x19AF }, { 0x19CA, 0x19CF },
92  { 0x19DB, 0x19DD }, { 0x1A1C, 0x1A1D }, { 0x1A5F, 0x1A5F },
93  { 0x1A7D, 0x1A7E }, { 0x1A8A, 0x1A8F }, { 0x1A9A, 0x1A9F },
94  { 0x1AAE, 0x1AFF }, { 0x1B4C, 0x1B4F }, { 0x1B7D, 0x1B7F },
95  { 0x1BF4, 0x1BFB }, { 0x1C38, 0x1C3A }, { 0x1C4A, 0x1C4C },
96  { 0x1C80, 0x1CBF }, { 0x1CC8, 0x1CCF }, { 0x1CF7, 0x1CFF },
97  { 0x1DE7, 0x1DFB }, { 0x1F16, 0x1F17 }, { 0x1F1E, 0x1F1F },
98  { 0x1F46, 0x1F47 }, { 0x1F4E, 0x1F4F }, { 0x1F58, 0x1F58 },
99  { 0x1F5A, 0x1F5A }, { 0x1F5C, 0x1F5C }, { 0x1F5E, 0x1F5E },
100  { 0x1F7E, 0x1F7F }, { 0x1FB5, 0x1FB5 }, { 0x1FC5, 0x1FC5 },
101  { 0x1FD4, 0x1FD5 }, { 0x1FDC, 0x1FDC }, { 0x1FF0, 0x1FF1 },
102  { 0x1FF5, 0x1FF5 }, { 0x1FFF, 0x1FFF }, { 0x200B, 0x200F },
103  { 0x202A, 0x202E }, { 0x2060, 0x206F }, { 0x2072, 0x2073 },
104  { 0x208F, 0x208F }, { 0x209D, 0x209F }, { 0x20BB, 0x20CF },
105  { 0x20F1, 0x20FF }, { 0x218A, 0x218F }, { 0x23F4, 0x23FF },
106  { 0x2427, 0x243F }, { 0x244B, 0x245F }, { 0x2700, 0x2700 },
107  { 0x2B4D, 0x2B4F }, { 0x2B5A, 0x2BFF }, { 0x2C2F, 0x2C2F },
108  { 0x2C5F, 0x2C5F }, { 0x2CF4, 0x2CF8 }, { 0x2D26, 0x2D26 },
109  { 0x2D28, 0x2D2C }, { 0x2D2E, 0x2D2F }, { 0x2D68, 0x2D6E },
110  { 0x2D71, 0x2D7E }, { 0x2D97, 0x2D9F }, { 0x2DA7, 0x2DA7 },
111  { 0x2DAF, 0x2DAF }, { 0x2DB7, 0x2DB7 }, { 0x2DBF, 0x2DBF },
112  { 0x2DC7, 0x2DC7 }, { 0x2DCF, 0x2DCF }, { 0x2DD7, 0x2DD7 },
113  { 0x2DDF, 0x2DDF }, { 0x2E3C, 0x2E7F }, { 0x2E9A, 0x2E9A },
114  { 0x2EF4, 0x2EFF }, { 0x2FD6, 0x2FEF }, { 0x2FFC, 0x2FFF },
115  { 0x3040, 0x3040 }, { 0x3097, 0x3098 }, { 0x3100, 0x3104 },
116  { 0x312E, 0x3130 }, { 0x3164, 0x3164 }, { 0x318F, 0x318F },
117  { 0x31BB, 0x31BF }, { 0x31E4, 0x31EF }, { 0x321F, 0x321F },
118  { 0x32FF, 0x32FF }, { 0x4DB6, 0x4DBF }, { 0x9FCD, 0x9FFF },
119  { 0xA48D, 0xA48F }, { 0xA4C7, 0xA4CF }, { 0xA62C, 0xA63F },
120  { 0xA698, 0xA69E }, { 0xA6F8, 0xA6FF }, { 0xA78F, 0xA78F },
121  { 0xA794, 0xA79F }, { 0xA7AB, 0xA7F7 }, { 0xA82C, 0xA82F },
122  { 0xA83A, 0xA83F }, { 0xA878, 0xA87F }, { 0xA8C5, 0xA8CD },
123  { 0xA8DA, 0xA8DF }, { 0xA8FC, 0xA8FF }, { 0xA954, 0xA95E },
124  { 0xA97D, 0xA97F }, { 0xA9CE, 0xA9CE }, { 0xA9DA, 0xA9DD },
125  { 0xA9E0, 0xA9FF }, { 0xAA37, 0xAA3F }, { 0xAA4E, 0xAA4F },
126  { 0xAA5A, 0xAA5B }, { 0xAA7C, 0xAA7F }, { 0xAAC3, 0xAADA },
127  { 0xAAF7, 0xAB00 }, { 0xAB07, 0xAB08 }, { 0xAB0F, 0xAB10 },
128  { 0xAB17, 0xAB1F }, { 0xAB27, 0xAB27 }, { 0xAB2F, 0xABBF },
129  { 0xABEE, 0xABEF }, { 0xABFA, 0xABFF }, { 0xD7A4, 0xD7AF },
130  { 0xD7C7, 0xD7CA }, { 0xD7FC, 0xDFFF }, { 0xFA6E, 0xFA6F },
131  { 0xFADA, 0xFAFF }, { 0xFB07, 0xFB12 }, { 0xFB18, 0xFB1C },
132  { 0xFB37, 0xFB37 }, { 0xFB3D, 0xFB3D }, { 0xFB3F, 0xFB3F },
133  { 0xFB42, 0xFB42 }, { 0xFB45, 0xFB45 }, { 0xFBC2, 0xFBD2 },
134  { 0xFD40, 0xFD4F }, { 0xFD90, 0xFD91 }, { 0xFDC8, 0xFDEF },
135  { 0xFDFE, 0xFE0F }, { 0xFE1A, 0xFE1F }, { 0xFE27, 0xFE2F },
136  { 0xFE53, 0xFE53 }, { 0xFE67, 0xFE67 }, { 0xFE6C, 0xFE6F },
137  { 0xFE75, 0xFE75 }, { 0xFEFD, 0xFEFF }, { 0xFF00, 0xFF00 },
138  { 0xFFA0, 0xFFA0 }, { 0xFFBF, 0xFFC1 }, { 0xFFC8, 0xFFC9 },
139  { 0xFFD0, 0xFFD1 }, { 0xFFD8, 0xFFD9 }, { 0xFFDD, 0xFFDF },
140  { 0xFFE7, 0xFFE7 }, { 0xFFEF, 0xFFFB }, { 0xFFFE, 0xFFFF },
141  { 0x1000C, 0x1000C }, { 0x10027, 0x10027 }, { 0x1003B, 0x1003B },
142  { 0x1003E, 0x1003E }, { 0x1004E, 0x1004F }, { 0x1005E, 0x1007F },
143  { 0x100FB, 0x100FF }, { 0x10103, 0x10106 }, { 0x10134, 0x10136 },
144  { 0x1018B, 0x1018F }, { 0x1019C, 0x101CF }, { 0x101FE, 0x1027F },
145  { 0x1029D, 0x1029F }, { 0x102D1, 0x102FF }, { 0x1031F, 0x1031F },
146  { 0x10324, 0x1032F }, { 0x1034B, 0x1037F }, { 0x1039E, 0x1039E },
147  { 0x103C4, 0x103C7 }, { 0x103D6, 0x103FF }, { 0x1049E, 0x1049F },
148  { 0x104AA, 0x107FF }, { 0x10806, 0x10807 }, { 0x10809, 0x10809 },
149  { 0x10836, 0x10836 }, { 0x10839, 0x1083B }, { 0x1083D, 0x1083E },
150  { 0x10856, 0x10856 }, { 0x10860, 0x108FF }, { 0x1091C, 0x1091E },
151  { 0x1093A, 0x1093E }, { 0x10940, 0x1097F }, { 0x109B8, 0x109BD },
152  { 0x109C0, 0x109FF }, { 0x10A04, 0x10A04 }, { 0x10A07, 0x10A0B },
153  { 0x10A14, 0x10A14 }, { 0x10A18, 0x10A18 }, { 0x10A34, 0x10A37 },
154  { 0x10A3B, 0x10A3E }, { 0x10A48, 0x10A4F }, { 0x10A59, 0x10A5F },
155  { 0x10A80, 0x10AFF }, { 0x10B36, 0x10B38 }, { 0x10B56, 0x10B57 },
156  { 0x10B73, 0x10B77 }, { 0x10B80, 0x10BFF }, { 0x10C49, 0x10E5F },
157  { 0x10E7F, 0x10FFF }, { 0x1104E, 0x11051 }, { 0x11070, 0x1107F },
158  { 0x110BD, 0x110BD }, { 0x110C2, 0x110CF }, { 0x110E9, 0x110EF },
159  { 0x110FA, 0x110FF }, { 0x11135, 0x11135 }, { 0x11144, 0x1117F },
160  { 0x111C9, 0x111CF }, { 0x111DA, 0x1167F }, { 0x116B8, 0x116BF },
161  { 0x116CA, 0x11FFF }, { 0x1236F, 0x123FF }, { 0x12463, 0x1246F },
162  { 0x12474, 0x12FFF }, { 0x1342F, 0x167FF }, { 0x16A39, 0x16EFF },
163  { 0x16F45, 0x16F4F }, { 0x16F7F, 0x16F8E }, { 0x16FA0, 0x1AFFF },
164  { 0x1B002, 0x1CFFF }, { 0x1D0F6, 0x1D0FF }, { 0x1D127, 0x1D128 },
165  { 0x1D173, 0x1D17A }, { 0x1D1DE, 0x1D1FF }, { 0x1D246, 0x1D2FF },
166  { 0x1D357, 0x1D35F }, { 0x1D372, 0x1D3FF }, { 0x1D455, 0x1D455 },
167  { 0x1D49D, 0x1D49D }, { 0x1D4A0, 0x1D4A1 }, { 0x1D4A3, 0x1D4A4 },
168  { 0x1D4A7, 0x1D4A8 }, { 0x1D4AD, 0x1D4AD }, { 0x1D4BA, 0x1D4BA },
169  { 0x1D4BC, 0x1D4BC }, { 0x1D4C4, 0x1D4C4 }, { 0x1D506, 0x1D506 },
170  { 0x1D50B, 0x1D50C }, { 0x1D515, 0x1D515 }, { 0x1D51D, 0x1D51D },
171  { 0x1D53A, 0x1D53A }, { 0x1D53F, 0x1D53F }, { 0x1D545, 0x1D545 },
172  { 0x1D547, 0x1D549 }, { 0x1D551, 0x1D551 }, { 0x1D6A6, 0x1D6A7 },
173  { 0x1D7CC, 0x1D7CD }, { 0x1D800, 0x1EDFF }, { 0x1EE04, 0x1EE04 },
174  { 0x1EE20, 0x1EE20 }, { 0x1EE23, 0x1EE23 }, { 0x1EE25, 0x1EE26 },
175  { 0x1EE28, 0x1EE28 }, { 0x1EE33, 0x1EE33 }, { 0x1EE38, 0x1EE38 },
176  { 0x1EE3A, 0x1EE3A }, { 0x1EE3C, 0x1EE41 }, { 0x1EE43, 0x1EE46 },
177  { 0x1EE48, 0x1EE48 }, { 0x1EE4A, 0x1EE4A }, { 0x1EE4C, 0x1EE4C },
178  { 0x1EE50, 0x1EE50 }, { 0x1EE53, 0x1EE53 }, { 0x1EE55, 0x1EE56 },
179  { 0x1EE58, 0x1EE58 }, { 0x1EE5A, 0x1EE5A }, { 0x1EE5C, 0x1EE5C },
180  { 0x1EE5E, 0x1EE5E }, { 0x1EE60, 0x1EE60 }, { 0x1EE63, 0x1EE63 },
181  { 0x1EE65, 0x1EE66 }, { 0x1EE6B, 0x1EE6B }, { 0x1EE73, 0x1EE73 },
182  { 0x1EE78, 0x1EE78 }, { 0x1EE7D, 0x1EE7D }, { 0x1EE7F, 0x1EE7F },
183  { 0x1EE8A, 0x1EE8A }, { 0x1EE9C, 0x1EEA0 }, { 0x1EEA4, 0x1EEA4 },
184  { 0x1EEAA, 0x1EEAA }, { 0x1EEBC, 0x1EEEF }, { 0x1EEF2, 0x1EFFF },
185  { 0x1F02C, 0x1F02F }, { 0x1F094, 0x1F09F }, { 0x1F0AF, 0x1F0B0 },
186  { 0x1F0BF, 0x1F0C0 }, { 0x1F0D0, 0x1F0D0 }, { 0x1F0E0, 0x1F0FF },
187  { 0x1F10B, 0x1F10F }, { 0x1F12F, 0x1F12F }, { 0x1F16C, 0x1F16F },
188  { 0x1F19B, 0x1F1E5 }, { 0x1F203, 0x1F20F }, { 0x1F23B, 0x1F23F },
189  { 0x1F249, 0x1F24F }, { 0x1F252, 0x1F2FF }, { 0x1F321, 0x1F32F },
190  { 0x1F336, 0x1F336 }, { 0x1F37D, 0x1F37F }, { 0x1F394, 0x1F39F },
191  { 0x1F3C5, 0x1F3C5 }, { 0x1F3CB, 0x1F3DF }, { 0x1F3F1, 0x1F3FF },
192  { 0x1F43F, 0x1F43F }, { 0x1F441, 0x1F441 }, { 0x1F4F8, 0x1F4F8 },
193  { 0x1F4FD, 0x1F4FF }, { 0x1F53E, 0x1F53F }, { 0x1F544, 0x1F54F },
194  { 0x1F568, 0x1F5FA }, { 0x1F641, 0x1F644 }, { 0x1F650, 0x1F67F },
195  { 0x1F6C6, 0x1F6FF }, { 0x1F774, 0x1FFFF }, { 0x2A6D7, 0x2A6FF },
196  { 0x2B735, 0x2B73F }, { 0x2B81E, 0x2F7FF }, { 0x2FA1E, 0xF0000 },
197  { 0xFFFFE, 0xFFFFF }, { 0x10FFFE, 0x10FFFF }, { 0x110000, 0xFFFFFFFF }
198 };
199 
201  { 0x0000, 0x007F, "Basic Latin" },
202  { 0x0080, 0x00FF, "Latin-1 Supplement" },
203  { 0x0100, 0x017F, "Latin Extended-A" },
204  { 0x0180, 0x024F, "Latin Extended-B" },
205  { 0x0250, 0x02AF, "IPA Extensions" },
206  { 0x02B0, 0x02FF, "Spacing Modifier Letters" },
207  { 0x0300, 0x036F, "Combining Diacritical Marks" },
208  { 0x0370, 0x03FF, "Greek and Coptic" },
209  { 0x0400, 0x04FF, "Cyrillic" },
210  { 0x0500, 0x052F, "Cyrillic Supplement" },
211  { 0x0530, 0x058F, "Armenian" },
212  { 0x0590, 0x05FF, "Hebrew" },
213  { 0x0600, 0x06FF, "Arabic" },
214  { 0x0700, 0x074F, "Syriac" },
215  { 0x0750, 0x077F, "Arabic Supplement" },
216  { 0x0780, 0x07BF, "Thaana" },
217  { 0x07C0, 0x07FF, "NKo" },
218  { 0x0800, 0x083F, "Samaritan" },
219  { 0x0840, 0x085F, "Mandaic" },
220  { 0x0860, 0x086F, "Syriac Supplement" },
221  { 0x08A0, 0x08FF, "Arabic Extended-A" },
222  { 0x0900, 0x097F, "Devanagari" },
223  { 0x0980, 0x09FF, "Bengali" },
224  { 0x0A00, 0x0A7F, "Gurmukhi" },
225  { 0x0A80, 0x0AFF, "Gujarati" },
226  { 0x0B00, 0x0B7F, "Oriya" },
227  { 0x0B80, 0x0BFF, "Tamil" },
228  { 0x0C00, 0x0C7F, "Telugu" },
229  { 0x0C80, 0x0CFF, "Kannada" },
230  { 0x0D00, 0x0D7F, "Malayalam" },
231  { 0x0D80, 0x0DFF, "Sinhala" },
232  { 0x0E00, 0x0E7F, "Thai" },
233  { 0x0E80, 0x0EFF, "Lao" },
234  { 0x0F00, 0x0FFF, "Tibetan" },
235  { 0x1000, 0x109F, "Myanmar" },
236  { 0x10A0, 0x10FF, "Georgian" },
237  { 0x1100, 0x11FF, "Hangul Jamo" },
238  { 0x1200, 0x137F, "Ethiopic" },
239  { 0x1380, 0x139F, "Ethiopic Supplement" },
240  { 0x13A0, 0x13FF, "Cherokee" },
241  { 0x1400, 0x167F, "Unified Canadian Aboriginal Syllabics" },
242  { 0x1680, 0x169F, "Ogham" },
243  { 0x16A0, 0x16FF, "Runic" },
244  { 0x1700, 0x171F, "Tagalog" },
245  { 0x1720, 0x173F, "Hanunoo" },
246  { 0x1740, 0x175F, "Buhid" },
247  { 0x1760, 0x177F, "Tagbanwa" },
248  { 0x1780, 0x17FF, "Khmer" },
249  { 0x1800, 0x18AF, "Mongolian" },
250  { 0x18B0, 0x18FF, "Unified Canadian Aboriginal Syllabics Extended" },
251  { 0x1900, 0x194F, "Limbu" },
252  { 0x1950, 0x197F, "Tai Le" },
253  { 0x1980, 0x19DF, "New Tai Lue" },
254  { 0x19E0, 0x19FF, "Khmer Symbols" },
255  { 0x1A00, 0x1A1F, "Buginese" },
256  { 0x1A20, 0x1AAF, "Tai Tham" },
257  { 0x1AB0, 0x1AFF, "Combining Diacritical Marks Extended" },
258  { 0x1B00, 0x1B7F, "Balinese" },
259  { 0x1B80, 0x1BBF, "Sundanese" },
260  { 0x1BC0, 0x1BFF, "Batak" },
261  { 0x1C00, 0x1C4F, "Lepcha" },
262  { 0x1C50, 0x1C7F, "Ol Chiki" },
263  { 0x1C80, 0x1C8F, "Cyrillic Extended-C" },
264  { 0x1CC0, 0x1CCF, "Sundanese Supplement" },
265  { 0x1CD0, 0x1CFF, "Vedic Extensions" },
266  { 0x1D00, 0x1D7F, "Phonetic Extensions" },
267  { 0x1D80, 0x1DBF, "Phonetic Extensions Supplement" },
268  { 0x1DC0, 0x1DFF, "Combining Diacritical Marks Supplement" },
269  { 0x1E00, 0x1EFF, "Latin Extended Additional" },
270  { 0x1F00, 0x1FFF, "Greek Extended" },
271  { 0x2000, 0x206F, "General Punctuation" },
272  { 0x2070, 0x209F, "Superscripts and Subscripts" },
273  { 0x20A0, 0x20CF, "Currency Symbols" },
274  { 0x20D0, 0x20FF, "Combining Diacritical Marks for Symbols" },
275  { 0x2100, 0x214F, "Letterlike Symbols" },
276  { 0x2150, 0x218F, "Number Forms" },
277  { 0x2190, 0x21FF, "Arrows" },
278  { 0x2200, 0x22FF, "Mathematical Operators" },
279  { 0x2300, 0x23FF, "Miscellaneous Technical" },
280  { 0x2400, 0x243F, "Control Pictures" },
281  { 0x2440, 0x245F, "Optical Character Recognition" },
282  { 0x2460, 0x24FF, "Enclosed Alphanumerics" },
283  { 0x2500, 0x257F, "Box Drawing" },
284  { 0x2580, 0x259F, "Block Elements" },
285  { 0x25A0, 0x25FF, "Geometric Shapes" },
286  { 0x2600, 0x26FF, "Miscellaneous Symbols" },
287  { 0x2700, 0x27BF, "Dingbats" },
288  { 0x27C0, 0x27EF, "Miscellaneous Mathematical Symbols-A" },
289  { 0x27F0, 0x27FF, "Supplemental Arrows-A" },
290  { 0x2800, 0x28FF, "Braille Patterns" },
291  { 0x2900, 0x297F, "Supplemental Arrows-B" },
292  { 0x2980, 0x29FF, "Miscellaneous Mathematical Symbols-B" },
293  { 0x2A00, 0x2AFF, "Supplemental Mathematical Operators" },
294  { 0x2B00, 0x2BFF, "Miscellaneous Symbols and Arrows" },
295  { 0x2C00, 0x2C5F, "Glagolitic" },
296  { 0x2C60, 0x2C7F, "Latin Extended-C" },
297  { 0x2C80, 0x2CFF, "Coptic" },
298  { 0x2D00, 0x2D2F, "Georgian Supplement" },
299  { 0x2D30, 0x2D7F, "Tifinagh" },
300  { 0x2D80, 0x2DDF, "Ethiopic Extended" },
301  { 0x2DE0, 0x2DFF, "Cyrillic Extended-A" },
302  { 0x2E00, 0x2E7F, "Supplemental Punctuation" },
303  { 0x2E80, 0x2EFF, "CJK Radicals Supplement" },
304  { 0x2F00, 0x2FDF, "Kangxi Radicals" },
305  { 0x2FF0, 0x2FFF, "Ideographic Description Characters" },
306  { 0x3000, 0x303F, "CJK Symbols and Punctuation" },
307  { 0x3040, 0x309F, "Hiragana" },
308  { 0x30A0, 0x30FF, "Katakana" },
309  { 0x3100, 0x312F, "Bopomofo" },
310  { 0x3130, 0x318F, "Hangul Compatibility Jamo" },
311  { 0x3190, 0x319F, "Kanbun" },
312  { 0x31A0, 0x31BF, "Bopomofo Extended" },
313  { 0x31C0, 0x31EF, "CJK Strokes" },
314  { 0x31F0, 0x31FF, "Katakana Phonetic Extensions" },
315  { 0x3200, 0x32FF, "Enclosed CJK Letters and Months" },
316  { 0x3300, 0x33FF, "CJK Compatibility" },
317  { 0x3400, 0x4DBF, "CJK Unified Ideographs Extension A" },
318  { 0x4DC0, 0x4DFF, "Yijing Hexagram Symbols" },
319  { 0x4E00, 0x9FFF, "CJK Unified Ideographs" },
320  { 0xA000, 0xA48F, "Yi Syllables" },
321  { 0xA490, 0xA4CF, "Yi Radicals" },
322  { 0xA4D0, 0xA4FF, "Lisu" },
323  { 0xA500, 0xA63F, "Vai" },
324  { 0xA640, 0xA69F, "Cyrillic Extended-B" },
325  { 0xA6A0, 0xA6FF, "Bamum" },
326  { 0xA700, 0xA71F, "Modifier Tone Letters" },
327  { 0xA720, 0xA7FF, "Latin Extended-D" },
328  { 0xA800, 0xA82F, "Syloti Nagri" },
329  { 0xA830, 0xA83F, "Common Indic Number Forms" },
330  { 0xA840, 0xA87F, "Phags-pa" },
331  { 0xA880, 0xA8DF, "Saurashtra" },
332  { 0xA8E0, 0xA8FF, "Devanagari Extended" },
333  { 0xA900, 0xA92F, "Kayah Li" },
334  { 0xA930, 0xA95F, "Rejang" },
335  { 0xA960, 0xA97F, "Hangul Jamo Extended-A" },
336  { 0xA980, 0xA9DF, "Javanese" },
337  { 0xA9E0, 0xA9FF, "Myanmar Extended-B" },
338  { 0xAA00, 0xAA5F, "Cham" },
339  { 0xAA60, 0xAA7F, "Myanmar Extended-A" },
340  { 0xAA80, 0xAADF, "Tai Viet" },
341  { 0xAAE0, 0xAAFF, "Meetei Mayek Extensions" },
342  { 0xAB00, 0xAB2F, "Ethiopic Extended-A" },
343  { 0xAB30, 0xAB6F, "Latin Extended-E" },
344  { 0xAB70, 0xABBF, "Cherokee Supplement" },
345  { 0xABC0, 0xABFF, "Meetei Mayek" },
346  { 0xAC00, 0xD7AF, "Hangul Syllables" },
347  { 0xD7B0, 0xD7FF, "Hangul Jamo Extended-B" },
348  { 0xD800, 0xDB7F, "High Surrogates" },
349  { 0xDB80, 0xDBFF, "High Private Use Surrogates" },
350  { 0xDC00, 0xDFFF, "Low Surrogates" },
351  { 0xE000, 0xF8FF, "Private Use Area" },
352  { 0xF900, 0xFAFF, "CJK Compatibility Ideographs" },
353  { 0xFB00, 0xFB4F, "Alphabetic Presentation Forms" },
354  { 0xFB50, 0xFDFF, "Arabic Presentation Forms-A" },
355  { 0xFE00, 0xFE0F, "Variation Selectors" },
356  { 0xFE10, 0xFE1F, "Vertical Forms" },
357  { 0xFE20, 0xFE2F, "Combining Half Marks" },
358  { 0xFE30, 0xFE4F, "CJK Compatibility Forms" },
359  { 0xFE50, 0xFE6F, "Small Form Variants" },
360  { 0xFE70, 0xFEFF, "Arabic Presentation Forms-B" },
361  { 0xFF00, 0xFFEF, "Halfwidth and Fullwidth Forms" },
362  { 0xFFF0, 0xFFFF, "Specials" },
363  { 0x10000, 0x1007F, "Linear B Syllabary" },
364  { 0x10080, 0x100FF, "Linear B Ideograms" },
365  { 0x10100, 0x1013F, "Aegean Numbers" },
366  { 0x10140, 0x1018F, "Ancient Greek Numbers" },
367  { 0x10190, 0x101CF, "Ancient Symbols" },
368  { 0x101D0, 0x101FF, "Phaistos Disc" },
369  { 0x10280, 0x1029F, "Lycian" },
370  { 0x102A0, 0x102DF, "Carian" },
371  { 0x102E0, 0x102FF, "Coptic Epact Numbers" },
372  { 0x10300, 0x1032F, "Old Italic" },
373  { 0x10330, 0x1034F, "Gothic" },
374  { 0x10350, 0x1037F, "Old Permic" },
375  { 0x10380, 0x1039F, "Ugaritic" },
376  { 0x103A0, 0x103DF, "Old Persian" },
377  { 0x10400, 0x1044F, "Deseret" },
378  { 0x10450, 0x1047F, "Shavian" },
379  { 0x10480, 0x104AF, "Osmanya" },
380  { 0x104B0, 0x104FF, "Osage" },
381  { 0x10500, 0x1052F, "Elbasan" },
382  { 0x10530, 0x1056F, "Caucasian Albanian" },
383  { 0x10600, 0x1077F, "Linear A" },
384  { 0x10800, 0x1083F, "Cypriot Syllabary" },
385  { 0x10840, 0x1085F, "Imperial Aramaic" },
386  { 0x10860, 0x1087F, "Palmyrene" },
387  { 0x10880, 0x108AF, "Nabataean" },
388  { 0x108E0, 0x108FF, "Hatran" },
389  { 0x10900, 0x1091F, "Phoenician" },
390  { 0x10920, 0x1093F, "Lydian" },
391  { 0x10980, 0x1099F, "Meroitic Hieroglyphs" },
392  { 0x109A0, 0x109FF, "Meroitic Cursive" },
393  { 0x10A00, 0x10A5F, "Kharoshthi" },
394  { 0x10A60, 0x10A7F, "Old South Arabian" },
395  { 0x10A80, 0x10A9F, "Old North Arabian" },
396  { 0x10AC0, 0x10AFF, "Manichaean" },
397  { 0x10B00, 0x10B3F, "Avestan" },
398  { 0x10B40, 0x10B5F, "Inscriptional Parthian" },
399  { 0x10B60, 0x10B7F, "Inscriptional Pahlavi" },
400  { 0x10B80, 0x10BAF, "Psalter Pahlavi" },
401  { 0x10C00, 0x10C4F, "Old Turkic" },
402  { 0x10C80, 0x10CFF, "Old Hungarian" },
403  { 0x10E60, 0x10E7F, "Rumi Numeral Symbols" },
404  { 0x11000, 0x1107F, "Brahmi" },
405  { 0x11080, 0x110CF, "Kaithi" },
406  { 0x110D0, 0x110FF, "Sora Sompeng" },
407  { 0x11100, 0x1114F, "Chakma" },
408  { 0x11150, 0x1117F, "Mahajani" },
409  { 0x11180, 0x111DF, "Sharada" },
410  { 0x111E0, 0x111FF, "Sinhala Archaic Numbers" },
411  { 0x11200, 0x1124F, "Khojki" },
412  { 0x11280, 0x112AF, "Multani" },
413  { 0x112B0, 0x112FF, "Khudawadi" },
414  { 0x11300, 0x1137F, "Grantha" },
415  { 0x11400, 0x1147F, "Newa" },
416  { 0x11480, 0x114DF, "Tirhuta" },
417  { 0x11580, 0x115FF, "Siddham" },
418  { 0x11600, 0x1165F, "Modi" },
419  { 0x11660, 0x1167F, "Mongolian Supplement" },
420  { 0x11680, 0x116CF, "Takri" },
421  { 0x11700, 0x1173F, "Ahom" },
422  { 0x118A0, 0x118FF, "Warang Citi" },
423  { 0x11A00, 0x11A4F, "Zanabazar Square" },
424  { 0x11A50, 0x11AAF, "Soyombo" },
425  { 0x11AC0, 0x11AFF, "Pau Cin Hau" },
426  { 0x11C00, 0x11C6F, "Bhaiksuki" },
427  { 0x11C70, 0x11CBF, "Marchen" },
428  { 0x11D00, 0x11D5F, "Masaram Gondi" },
429  { 0x12000, 0x123FF, "Cuneiform" },
430  { 0x12400, 0x1247F, "Cuneiform Numbers and Punctuation" },
431  { 0x12480, 0x1254F, "Early Dynastic Cuneiform" },
432  { 0x13000, 0x1342F, "Egyptian Hieroglyphs" },
433  { 0x14400, 0x1467F, "Anatolian Hieroglyphs" },
434  { 0x16800, 0x16A3F, "Bamum Supplement" },
435  { 0x16A40, 0x16A6F, "Mro" },
436  { 0x16AD0, 0x16AFF, "Bassa Vah" },
437  { 0x16B00, 0x16B8F, "Pahawh Hmong" },
438  { 0x16F00, 0x16F9F, "Miao" },
439  { 0x16FE0, 0x16FFF, "Ideographic Symbols and Punctuation" },
440  { 0x17000, 0x187FF, "Tangut" },
441  { 0x18800, 0x18AFF, "Tangut Components" },
442  { 0x1B000, 0x1B0FF, "Kana Supplement" },
443  { 0x1B100, 0x1B12F, "Kana Extended-A" },
444  { 0x1B170, 0x1B2FF, "Nushu" },
445  { 0x1BC00, 0x1BC9F, "Duployan" },
446  { 0x1BCA0, 0x1BCAF, "Shorthand Format Controls" },
447  { 0x1D000, 0x1D0FF, "Byzantine Musical Symbols" },
448  { 0x1D100, 0x1D1FF, "Musical Symbols" },
449  { 0x1D200, 0x1D24F, "Ancient Greek Musical Notation" },
450  { 0x1D300, 0x1D35F, "Tai Xuan Jing Symbols" },
451  { 0x1D360, 0x1D37F, "Counting Rod Numerals" },
452  { 0x1D400, 0x1D7FF, "Mathematical Alphanumeric Symbols" },
453  { 0x1D800, 0x1DAAF, "Sutton SignWriting" },
454  { 0x1E000, 0x1E02F, "Glagolitic Supplement" },
455  { 0x1E800, 0x1E8DF, "Mende Kikakui" },
456  { 0x1E900, 0x1E95F, "Adlam" },
457  { 0x1EE00, 0x1EEFF, "Arabic Mathematical Alphabetic Symbols" },
458  { 0x1F000, 0x1F02F, "Mahjong Tiles" },
459  { 0x1F030, 0x1F09F, "Domino Tiles" },
460  { 0x1F0A0, 0x1F0FF, "Playing Cards" },
461  { 0x1F100, 0x1F1FF, "Enclosed Alphanumeric Supplement" },
462  { 0x1F200, 0x1F2FF, "Enclosed Ideographic Supplement" },
463  { 0x1F300, 0x1F5FF, "Miscellaneous Symbols and Pictographs" },
464  { 0x1F600, 0x1F64F, "Emoticons" },
465  { 0x1F650, 0x1F67F, "Ornamental Dingbats" },
466  { 0x1F680, 0x1F6FF, "Transport and Map Symbols" },
467  { 0x1F700, 0x1F77F, "Alchemical Symbols" },
468  { 0x1F780, 0x1F7FF, "Geometric Shapes Extended" },
469  { 0x1F800, 0x1F8FF, "Supplemental Arrows-C" },
470  { 0x1F900, 0x1F9FF, "Supplemental Symbols and Pictographs" },
471  { 0x20000, 0x2A6DF, "CJK Unified Ideographs Extension B" },
472  { 0x2A700, 0x2B73F, "CJK Unified Ideographs Extension C" },
473  { 0x2B740, 0x2B81F, "CJK Unified Ideographs Extension D" },
474  { 0x2B820, 0x2CEAF, "CJK Unified Ideographs Extension E" },
475  { 0x2CEB0, 0x2EBEF, "CJK Unified Ideographs Extension F" },
476  { 0x2F800, 0x2FA1F, "CJK Compatibility Ideographs Supplement" },
477  { 0xE0000, 0xE007F, "Tags" },
478  { 0xE0100, 0xE01EF, "Variation Selectors Supplement" },
479  { 0xF0000, 0xFFFFF, "Supplementary Private Use Area-A" },
480  { 0x100000, 0x10FFFF, "Supplementary Private Use Area-B" },
481  { 0x110000, 0xFFFFFFFF, "No_Block" }
482 };
483 
484 RZ_API const char *rz_utf_block_name(int idx) {
485  if (idx < 0 || idx >= UTF_LAST_BLOCK) {
486  return NULL;
487  }
488  return utf_blocks[idx].name;
489 }
490 
491 /* Convert an UTF-8 buf into a unicode RzRune */
492 RZ_API int rz_utf8_decode(const ut8 *ptr, int ptrlen, RzRune *ch) {
493  if (ptrlen < 1) {
494  return 0;
495  }
496  if (ptr[0] < 0x80) {
497  if (ch) {
498  *ch = (ut32)ptr[0];
499  }
500  return 1;
501  } else if (ptrlen > 1 && (ptr[0] & 0xe0) == 0xc0 && (ptr[1] & 0xc0) == 0x80) {
502  RzRune rune = (ptr[0] & 0x1f) << 6 | (ptr[1] & 0x3f);
503  if (ch) {
504  *ch = rune;
505  }
506  return rune < 0x80 ? 0 : 2;
507  } else if (ptrlen > 2 && (ptr[0] & 0xf0) == 0xe0 && (ptr[1] & 0xc0) == 0x80 && (ptr[2] & 0xc0) == 0x80) {
508  RzRune rune = (ptr[0] & 0xf) << 12 | (ptr[1] & 0x3f) << 6 | (ptr[2] & 0x3f);
509  if (ch) {
510  *ch = rune;
511  }
512  return rune < 0x800 ? 0 : 3;
513  } else if (ptrlen > 3 && (ptr[0] & 0xf8) == 0xf0 && (ptr[1] & 0xc0) == 0x80 && (ptr[2] & 0xc0) == 0x80 && (ptr[3] & 0xc0) == 0x80) {
514  RzRune rune = (ptr[0] & 7) << 18 | (ptr[1] & 0x3f) << 12 | (ptr[2] & 0x3f) << 6 | (ptr[3] & 0x3f);
515  if (ch) {
516  *ch = rune;
517  }
518  return rune < 0x10000 ? 0 : 4;
519  }
520  return 0;
521 }
522 
523 /* Convert an MUTF-8 buf into a unicode RzRune */
524 RZ_API int rz_mutf8_decode(const ut8 *ptr, int ptrlen, RzRune *ch) {
525  if (ptrlen > 1 && ptr[0] == 0xc0 && ptr[1] == 0x80) {
526  if (ch) {
527  *ch = 0;
528  }
529  return 2;
530  }
531  return rz_utf8_decode(ptr, ptrlen, ch);
532 }
533 
534 /* Convert a unicode RzRune into an UTF-8 buf */
535 RZ_API int rz_utf8_encode(ut8 *ptr, const RzRune ch) {
536  if (ch < 0x80) {
537  ptr[0] = (ut8)ch;
538  return 1;
539  } else if (ch < 0x800) {
540  ptr[0] = 0xc0 | (ch >> 6);
541  ptr[1] = 0x80 | (ch & 0x3f);
542  return 2;
543  } else if (ch < 0x10000) {
544  ptr[0] = 0xe0 | (ch >> 12);
545  ptr[1] = 0x80 | ((ch >> 6) & 0x3f);
546  ptr[2] = 0x80 | (ch & 0x3f);
547  return 3;
548  } else if (ch < 0x200000) {
549  ptr[0] = 0xf0 | (ch >> 18);
550  ptr[1] = 0x80 | ((ch >> 12) & 0x3f);
551  ptr[2] = 0x80 | ((ch >> 6) & 0x3f);
552  ptr[3] = 0x80 | (ch & 0x3f);
553  return 4;
554  }
555  return 0;
556 }
557 
558 /* Convert a unicode RzRune string into an utf-8 one */
559 RZ_API int rz_utf8_encode_str(const RzRune *str, ut8 *dst, const int dst_length) {
560  if (!str || !dst) {
561  return -1;
562  }
563 
564  int pos = 0;
565  for (size_t i = 0; i < sizeof(str) - 1 && str[i] && pos < dst_length - 1; i++) {
566  pos += rz_utf8_encode(&dst[pos], str[i]);
567  }
568 
569  dst[pos++] = '\0';
570  return pos;
571 }
572 
573 /* Returns the size in bytes of the utf-8 encoded char */
574 RZ_API int rz_utf8_size(const ut8 *ptr) {
575  const int utf8_size[] = {
576  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
577  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
578  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
579  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
580  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xC0-0xCF
581  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xD0-0xDF
582  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xE0-0xEF
583  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 0xF0-0xFF
584  };
585  return (ptr[0] & 0x80) ? utf8_size[ptr[0] ^ 0x80] : 1;
586 }
587 
// NOTE(review): the signature line of this function (listing line 588) is not present in
// this extract, so its exact name and return type cannot be confirmed from here.
// The visible body walks `str` up to its terminating 0 byte and counts every byte that is
// not a UTF-8 continuation byte (10xxxxxx), i.e. one per encoded character start.
589  int len = 0;
590 
591  for (int i = 0; str[i]; i++) {
592  if ((str[i] & 0xc0) != 0x80) {
593  len++;
594  }
595  }
596 
597  return len;
598 }
599 
607  // RzRunes are most commonly single byte... We can early out with this common case.
608  if (c < 0x34F) {
609  /*
610  manually copied from top, please update if this ever changes
611  { 0x0000, 0x001F }, { 0x007F, 0x009F }, { 0x034F, 0x034F },
612  could do a linear search, but that's a lot slower than a few compare
613  */
614  return !(c <= 0x1F || (c >= 0x7F && c <= 0x9F));
615  }
616 
617  const int last = UTF_NONPRINTABLE_RANGES_COUNT;
618 
619  int low = 0;
620  int hi = last - 1;
621 
622  do {
623  int mid = (low + hi) >> 1;
624  if (c >= nonprintable_ranges[mid].from && c <= nonprintable_ranges[mid].to) {
625  return false;
626  }
627  if (mid < last && c > nonprintable_ranges[mid].to) {
628  low = mid + 1;
629  }
630  if (mid < last && c < nonprintable_ranges[mid].from) {
631  hi = mid - 1;
632  }
633  } while (low <= hi);
634 
635  return true;
636 }
637 
638 #if __WINDOWS__
639 RZ_API char *rz_utf16_to_utf8_l(const wchar_t *wc, int len) {
640  // -1 is allowed on purpose.
641  if (!wc || !len || len < -1) {
642  return NULL;
643  }
644  char *rutf8 = NULL;
645  int csize;
646 
647  if ((csize = WideCharToMultiByte(CP_UTF8, 0, wc, len, NULL, 0, NULL, NULL))) {
648  ++csize;
649  if ((rutf8 = malloc(csize))) {
650  WideCharToMultiByte(CP_UTF8, 0, wc, len, rutf8, csize, NULL, NULL);
651  if (len != -1) {
652  rutf8[csize - 1] = '\0';
653  }
654  }
655  }
656  return rutf8;
657 }
658 
659 RZ_API wchar_t *rz_utf8_to_utf16_l(const char *cstring, int len) {
660  // -1 is allowed on purpose.
661  if (!cstring || !len || len < -1) {
662  return NULL;
663  }
664  wchar_t *rutf16 = NULL;
665  int wcsize;
666 
667  if ((wcsize = MultiByteToWideChar(CP_UTF8, 0, cstring, len, NULL, 0))) {
668  ++wcsize;
669  if ((rutf16 = (wchar_t *)calloc(wcsize, sizeof(wchar_t)))) {
670  MultiByteToWideChar(CP_UTF8, 0, cstring, len, rutf16, wcsize);
671  if (len != -1) {
672  rutf16[wcsize - 1] = L'\0';
673  }
674  }
675  }
676  return rutf16;
677 }
678 
679 RZ_API char *rz_utf8_to_acp_l(const char *str, int len) {
680  // -1 is allowed on purpose.
681  if (!str || !len || len < -1) {
682  return NULL;
683  }
684  char *acp = NULL;
685  int wcsize = 0, csize = 0;
686  if ((wcsize = MultiByteToWideChar(CP_UTF8, 0, str, len, NULL, 0))) {
687  wchar_t *rutf16 = NULL;
688  ++wcsize;
689  if ((rutf16 = (wchar_t *)calloc(wcsize, sizeof(wchar_t)))) {
690  MultiByteToWideChar(CP_UTF8, 0, str, len, rutf16, wcsize);
691  if (len != -1) {
692  rutf16[wcsize - 1] = L'\0';
693  }
694  if ((csize = WideCharToMultiByte(CP_ACP, 0, rutf16, wcsize, NULL, 0, NULL, NULL))) {
695  ++csize;
696  if ((acp = malloc(csize))) {
697  WideCharToMultiByte(CP_ACP, 0, rutf16, wcsize, acp, csize, NULL, NULL);
698  if (len != -1) {
699  acp[csize - 1] = '\0';
700  }
701  }
702  }
703  free(rutf16);
704  }
705  }
706  return acp;
707 }
708 
709 RZ_API char *rz_acp_to_utf8_l(const char *str, int len) {
710  // -1 is allowed on purpose.
711  if (!str || !len || len < -1) {
712  return NULL;
713  }
714  int wcsize = 0;
715  if ((wcsize = MultiByteToWideChar(CP_ACP, 0, str, len, NULL, 0))) {
716  wchar_t *rutf16 = NULL;
717  ++wcsize;
718  if ((rutf16 = (wchar_t *)calloc(wcsize, sizeof(wchar_t)))) {
719  MultiByteToWideChar(CP_ACP, 0, str, len, rutf16, wcsize);
720  if (len != -1) {
721  rutf16[wcsize - 1] = L'\0';
722  }
723  char *ret = rz_utf16_to_utf8_l(rutf16, wcsize);
724  free(rutf16);
725  return ret;
726  }
727  }
728  return NULL;
729 }
730 
731 #endif // __WINDOWS__
732 
734  const int last = UTF_BLOCKS_COUNT;
735  int low = 0, hi = last - 1, mid = 0;
736 
737  do {
738  mid = (low + hi) >> 1;
739  if (ch >= utf_blocks[mid].from && ch <= utf_blocks[mid].to) {
740  return mid;
741  }
742  if (mid < last && ch > utf_blocks[mid].to) {
743  low = mid + 1;
744  }
745  if (mid < last && ch < utf_blocks[mid].from) {
746  hi = mid - 1;
747  }
748  } while (low <= hi);
749 
750  return UTF_BLOCKS_COUNT - 1; /* index for "No_Block" */
751 }
752 
753 /* str must be UTF8-encoded */
754 RZ_API int *rz_utf_block_list(const ut8 *str, int len, int **freq_list) {
755  if (!str) {
756  return NULL;
757  }
758  if (len < 0) {
759  len = strlen((const char *)str);
760  }
761  int block_freq[UTF_BLOCKS_COUNT] = { 0 };
762  int *list = RZ_NEWS0(int, len + 1);
763  if (!list) {
764  return NULL;
765  }
766  int *freq_list_ptr = NULL;
767  if (freq_list) {
768  *freq_list = RZ_NEWS0(int, len + 1);
769  if (!*freq_list) {
770  free(list);
771  return NULL;
772  }
773  freq_list_ptr = *freq_list;
774  }
775  int *list_ptr = list;
776  const ut8 *str_ptr = str;
777  const ut8 *str_end = str + len;
778  RzRune ch = 0;
779  while (str_ptr < str_end) {
780  int block_idx;
781  int ch_bytes = rz_utf8_decode(str_ptr, str_end - str_ptr, &ch);
782  if (!ch_bytes) {
783  block_idx = UTF_BLOCKS_COUNT - 1;
784  ch_bytes = 1;
785  } else {
786  block_idx = rz_utf_block_idx(ch);
787  }
788  if (!block_freq[block_idx]) {
789  *list_ptr = block_idx;
790  list_ptr++;
791  }
792  block_freq[block_idx]++;
793  str_ptr += ch_bytes;
794  }
795  *list_ptr = -1;
796  if (freq_list_ptr) {
797  for (list_ptr = list; *list_ptr != -1; list_ptr++) {
798  *freq_list_ptr = block_freq[*list_ptr];
799  freq_list_ptr++;
800  }
801  *freq_list_ptr = -1;
802  }
803  for (list_ptr = list; *list_ptr != -1; list_ptr++) {
804  block_freq[*list_ptr] = 0;
805  }
806  return list;
807 }
808 
809 RZ_API RzStrEnc rz_utf_bom_encoding(const ut8 *ptr, int ptrlen) {
810  if (ptrlen > 3) {
811  if (ptr[0] == 0xff && ptr[1] == 0xfe && !ptr[2] && !ptr[3]) {
812  return RZ_STRING_ENC_UTF32LE;
813  }
814  if (!ptr[0] && !ptr[1] && ptr[2] == 0xfe && ptr[3] == 0xff) {
815  return RZ_STRING_ENC_UTF32BE;
816  }
817  }
818  if (ptrlen > 2) {
819  if (ptr[0] == 0xef && ptr[1] == 0xbb && ptr[2] == 0xbf) {
820  return RZ_STRING_ENC_UTF8;
821  }
822  }
823  if (ptrlen > 1) {
824  if (ptr[0] == 0xff && ptr[1] == 0xfe) {
825  return RZ_STRING_ENC_UTF16LE;
826  }
827  if (ptr[0] == 0xfe && ptr[1] == 0xff) {
828  return RZ_STRING_ENC_UTF16BE;
829  }
830  }
831  return RZ_STRING_ENC_GUESS;
832 }
size_t len
Definition: 6502dis.c:15
lzma_index ** i
Definition: index.h:629
#define RZ_API
#define NULL
Definition: cris-opc.c:27
#define ut8
Definition: dcpu16.h:8
uint32_t ut32
hi(addr) 0x03
RZ_API void Ht_() free(HtName_(Ht) *ht)
Definition: ht_inc.c:130
uint8_t ut8
Definition: lh5801.h:11
static void list(RzEgg *egg)
Definition: rz-gg.c:52
void * malloc(size_t size)
Definition: malloc.c:123
void * calloc(size_t number, size_t size)
Definition: malloc.c:102
char * dst
Definition: lz4.h:724
int idx
Definition: setup.py:197
RzStrEnc
Definition: rz_str.h:19
@ RZ_STRING_ENC_UTF32LE
Definition: rz_str.h:24
@ RZ_STRING_ENC_UTF32BE
Definition: rz_str.h:26
@ RZ_STRING_ENC_UTF8
Definition: rz_str.h:21
@ RZ_STRING_ENC_GUESS
Definition: rz_str.h:33
@ RZ_STRING_ENC_UTF16LE
Definition: rz_str.h:23
@ RZ_STRING_ENC_UTF16BE
Definition: rz_str.h:25
#define RZ_NEWS0(x, y)
Definition: rz_types.h:282
ut32 RzRune
Definition: rz_utf8.h:13
#define c(i)
Definition: sha256.c:43
const char * name
Definition: rz_utf8.h:9
int pos
Definition: main.c:11
RZ_API int rz_utf8_encode_str(const RzRune *str, ut8 *dst, const int dst_length)
Definition: utf8.c:559
RZ_API int * rz_utf_block_list(const ut8 *str, int len, int **freq_list)
Definition: utf8.c:754
const struct @335 nonprintable_ranges[]
ut32 to
Definition: utf8.c:14
RZ_API int rz_utf_block_idx(RzRune ch)
Definition: utf8.c:733
RZ_API int rz_utf8_size(const ut8 *ptr)
Definition: utf8.c:574
RZ_API const char * rz_utf_block_name(int idx)
Definition: utf8.c:484
#define UTF_NONPRINTABLE_RANGES_COUNT
Definition: utf8.c:12
RZ_API RzStrEnc rz_utf_bom_encoding(const ut8 *ptr, int ptrlen)
Definition: utf8.c:809
#define UTF_LAST_BLOCK
Definition: utf8.c:10
#define UTF_BLOCKS_COUNT
Definition: utf8.c:11
const RUtfBlock utf_blocks[]
Definition: utf8.c:200
ut32 from
Definition: utf8.c:14
RZ_API int rz_utf8_decode(const ut8 *ptr, int ptrlen, RzRune *ch)
Definition: utf8.c:492
RZ_API bool rz_rune_is_printable(const RzRune c)
Returns true when the RzRune is a printable symbol.
Definition: utf8.c:606
RZ_API int rz_mutf8_decode(const ut8 *ptr, int ptrlen, RzRune *ch)
Definition: utf8.c:524
RZ_API int rz_utf8_encode(ut8 *ptr, const RzRune ch)
Definition: utf8.c:535
RZ_API int rz_utf8_strlen(const ut8 *str)
Definition: utf8.c:588
#define L
Definition: zip_err_str.c:7