Rizin
unix-like reverse engineering framework and cli tools
zip_utf-8.c
Go to the documentation of this file.
1 /*
2  zip_utf-8.c -- UTF-8 support functions for libzip
3  Copyright (C) 2011-2021 Dieter Baron and Thomas Klausner
4 
5  This file is part of libzip, a library to manipulate ZIP archives.
6  The authors can be contacted at <info@libzip.org>
7 
8  Redistribution and use in source and binary forms, with or without
9  modification, are permitted provided that the following conditions
10  are met:
11  1. Redistributions of source code must retain the above copyright
12  notice, this list of conditions and the following disclaimer.
13  2. Redistributions in binary form must reproduce the above copyright
14  notice, this list of conditions and the following disclaimer in
15  the documentation and/or other materials provided with the
16  distribution.
17  3. The names of the authors may not be used to endorse or promote
18  products derived from this software without specific prior
19  written permission.
20 
21  THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS
22  OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
23  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY
25  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
27  GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
29  IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
30  OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
31  IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 */
33 
34 
35 #include "zipint.h"
36 
37 #include <stdlib.h>
38 
39 
/*
 * Lookup table from a code page 437 byte to a 16-bit Unicode code point.
 *
 * The table has one entry per possible byte value (256 entries) and is
 * indexed directly with the CP437 byte; _zip_cp437_to_utf8() feeds the
 * looked-up value to _zip_unicode_to_utf8() as the code point to encode.
 *
 * Entries 0x20-0x7E hold their own index (plain ASCII). Entry 0x00 is
 * 0x0000; 0x01-0x1F and 0x7F map to symbol code points rather than to the
 * ASCII control characters. Each row below covers 16 consecutive byte
 * values, as labelled by the comment above it.
 */
40 static const zip_uint16_t _cp437_to_unicode[256] = {
41  /* 0x00 - 0x0F */
42  0x0000, 0x263A, 0x263B, 0x2665, 0x2666, 0x2663, 0x2660, 0x2022, 0x25D8, 0x25CB, 0x25D9, 0x2642, 0x2640, 0x266A, 0x266B, 0x263C,
43 
44  /* 0x10 - 0x1F */
45  0x25BA, 0x25C4, 0x2195, 0x203C, 0x00B6, 0x00A7, 0x25AC, 0x21A8, 0x2191, 0x2193, 0x2192, 0x2190, 0x221F, 0x2194, 0x25B2, 0x25BC,
46 
47  /* 0x20 - 0x2F */
48  0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F,
49 
50  /* 0x30 - 0x3F */
51  0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F,
52 
53  /* 0x40 - 0x4F */
54  0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F,
55 
56  /* 0x50 - 0x5F */
57  0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005A, 0x005B, 0x005C, 0x005D, 0x005E, 0x005F,
58 
59  /* 0x60 - 0x6F */
60  0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F,
61 
62  /* 0x70 - 0x7F */
63  0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D, 0x007E, 0x2302,
64 
65  /* 0x80 - 0x8F */
66  0x00C7, 0x00FC, 0x00E9, 0x00E2, 0x00E4, 0x00E0, 0x00E5, 0x00E7, 0x00EA, 0x00EB, 0x00E8, 0x00EF, 0x00EE, 0x00EC, 0x00C4, 0x00C5,
67 
68  /* 0x90 - 0x9F */
69  0x00C9, 0x00E6, 0x00C6, 0x00F4, 0x00F6, 0x00F2, 0x00FB, 0x00F9, 0x00FF, 0x00D6, 0x00DC, 0x00A2, 0x00A3, 0x00A5, 0x20A7, 0x0192,
70 
71  /* 0xA0 - 0xAF */
72  0x00E1, 0x00ED, 0x00F3, 0x00FA, 0x00F1, 0x00D1, 0x00AA, 0x00BA, 0x00BF, 0x2310, 0x00AC, 0x00BD, 0x00BC, 0x00A1, 0x00AB, 0x00BB,
73 
74  /* 0xB0 - 0xBF */
75  0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, 0x2562, 0x2556, 0x2555, 0x2563, 0x2551, 0x2557, 0x255D, 0x255C, 0x255B, 0x2510,
76 
77  /* 0xC0 - 0xCF */
78  0x2514, 0x2534, 0x252C, 0x251C, 0x2500, 0x253C, 0x255E, 0x255F, 0x255A, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256C, 0x2567,
79 
80  /* 0xD0 - 0xDF */
81  0x2568, 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256B, 0x256A, 0x2518, 0x250C, 0x2588, 0x2584, 0x258C, 0x2590, 0x2580,
82 
83  /* 0xE0 - 0xEF */
84  0x03B1, 0x00DF, 0x0393, 0x03C0, 0x03A3, 0x03C3, 0x00B5, 0x03C4, 0x03A6, 0x0398, 0x03A9, 0x03B4, 0x221E, 0x03C6, 0x03B5, 0x2229,
85 
86  /* 0xF0 - 0xFF */
87  0x2261, 0x00B1, 0x2265, 0x2264, 0x2320, 0x2321, 0x00F7, 0x2248, 0x00B0, 0x2219, 0x00B7, 0x221A, 0x207F, 0x00B2, 0x25A0, 0x00A0};
88 
/*
 * Bit patterns for classifying and building UTF-8 bytes.
 *
 * Each MASK/MATCH pair is used as `(byte & MASK) == MATCH`.
 *   UTF_8_LEN_n_*    -- lead byte of an n-byte sequence; the MATCH value is
 *                       also the fixed high bits OR-ed into the lead byte
 *                       when a code point is encoded.
 *   UTF_8_CONTINUE_* -- MATCH is the fixed high bits OR-ed into every byte
 *                       after the lead byte when encoding.
 *
 * NOTE(review): the lines that use UTF_8_LEN_2_MASK and UTF_8_CONTINUE_MASK
 * are not present in this listing; presumably they test the 2-byte lead
 * byte and the continuation bytes respectively -- confirm against upstream.
 */
89 #define UTF_8_LEN_2_MASK 0xe0
90 #define UTF_8_LEN_2_MATCH 0xc0
91 #define UTF_8_LEN_3_MASK 0xf0
92 #define UTF_8_LEN_3_MATCH 0xe0
93 #define UTF_8_LEN_4_MASK 0xf8
94 #define UTF_8_LEN_4_MATCH 0xf0
95 #define UTF_8_CONTINUE_MASK 0xc0
96 #define UTF_8_CONTINUE_MATCH 0x80
97 
98 
/*
 * Body of _zip_guess_encoding(): classify the bytes of a zip string as
 * ASCII, guessed UTF-8 or CP437, remember the result on the string, and
 * optionally check it against the encoding the caller expected.
 *
 * NOTE(review): this listing skips its own lines 99-101, 118-119 and 136,
 * so the function header, presumably the declaration of `enc`, the first
 * branch of the lead-byte test and the condition inside the inner loop are
 * not visible. The symbol index at the end of this page gives the signature
 *   zip_encoding_type_t _zip_guess_encoding(zip_string_t *str,
 *                                           zip_encoding_type_t expected_encoding)
 * Confirm the missing lines against the upstream source before relying on
 * the notes below.
 *
 * Returns the detected encoding, or ZIP_ENCODING_ERROR when it conflicts
 * with expected_encoding (see the end of the function).
 */
102  const zip_uint8_t *name;
103  zip_uint32_t i, j, ulen;
104 
    /* A NULL string is reported as ASCII. */
105  if (str == NULL)
106  return ZIP_ENCODING_ASCII;
107 
108  name = str->raw;
109 
    /* An encoding already stored on the string (anything other than
     * ZIP_ENCODING_UNKNOWN) is reused; otherwise the bytes are scanned. */
110  if (str->encoding != ZIP_ENCODING_UNKNOWN)
111  enc = str->encoding;
112  else {
113  enc = ZIP_ENCODING_ASCII;
114  for (i = 0; i < str->length; i++) {
    /* Bytes 32..127 plus CR, LF and TAB need no further inspection. */
115  if ((name[i] > 31 && name[i] < 128) || name[i] == '\r' || name[i] == '\n' || name[i] == '\t')
116  continue;
117 
    /* NOTE(review): listing lines 118-119 are missing here; presumably they
     * set enc to ZIP_ENCODING_UTF8_GUESSED and open the if/else chain with
     * the UTF_8_LEN_2 test -- confirm.
     * ulen counts the bytes that follow the lead byte name[i]: the loop
     * below runs j = 1..ulen and i is then advanced by ulen. */
120  ulen = 1;
121  else if ((name[i] & UTF_8_LEN_3_MASK) == UTF_8_LEN_3_MATCH)
122  ulen = 2;
123  else if ((name[i] & UTF_8_LEN_4_MASK) == UTF_8_LEN_4_MATCH)
124  ulen = 3;
125  else {
    /* No lead-byte pattern matched: classify as CP437 and stop scanning. */
126  enc = ZIP_ENCODING_CP437;
127  break;
128  }
129 
    /* The sequence would extend past the end of the string. */
130  if (i + ulen >= str->length) {
131  enc = ZIP_ENCODING_CP437;
132  break;
133  }
134 
    /* NOTE(review): listing line 136 (the test applied to each following
     * byte) is missing; presumably it checks name[i + j] with
     * UTF_8_CONTINUE_MASK / UTF_8_CONTINUE_MATCH -- confirm. A failing
     * byte classifies the whole string as CP437. */
135  for (j = 1; j <= ulen; j++) {
137  enc = ZIP_ENCODING_CP437;
138  goto done;
139  }
140  }
    /* Skip the bytes just examined; the for loop's i++ steps past the last one. */
141  i += ulen;
142  }
143  }
144 
145 done:
    /* Store the result on the string so later calls take the early path above. */
146  str->encoding = enc;
147 
    /* When the caller states an expectation: a guessed UTF-8 result is
     * promoted to ZIP_ENCODING_UTF8_KNOWN if that is what was expected; any
     * remaining mismatch is an error unless the string is plain ASCII. */
148  if (expected_encoding != ZIP_ENCODING_UNKNOWN) {
149  if (expected_encoding == ZIP_ENCODING_UTF8_KNOWN && enc == ZIP_ENCODING_UTF8_GUESSED)
150  str->encoding = enc = ZIP_ENCODING_UTF8_KNOWN;
151 
152  if (expected_encoding != enc && enc != ZIP_ENCODING_ASCII)
153  return ZIP_ENCODING_ERROR;
154  }
155 
156  return enc;
157 }
158 
159 
/*
 * _zip_unicode_to_utf8_len(): number of bytes (1 to 4) for `codepoint`,
 * using the same thresholds at which _zip_unicode_to_utf8() switches
 * between its 1-, 2-, 3- and 4-byte forms.
 *
 * NOTE(review): listing line 161 (function name and parameter list) is
 * missing; the symbol index at the end of this page gives it as
 *   static zip_uint32_t _zip_unicode_to_utf8_len(zip_uint32_t codepoint)
 */
160 static zip_uint32_t
    /* Below 0x80: one byte. */
162  if (codepoint < 0x0080)
163  return 1;
    /* Below 0x800: two bytes. */
164  if (codepoint < 0x0800)
165  return 2;
    /* Below 0x10000: three bytes. */
166  if (codepoint < 0x10000)
167  return 3;
    /* Everything else is counted as four bytes; no upper bound is checked. */
168  return 4;
169 }
170 
171 
/*
 * _zip_unicode_to_utf8(): write the UTF-8 encoding of `codepoint` to `buf`
 * and return the number of bytes written (1 to 4).
 *
 * Up to buf[3] may be written and no terminating 0 byte is added. The code
 * point is not range-checked; values of 0x10000 and above always take the
 * 4-byte form, with only the low 3 bits of (codepoint >> 18) kept in the
 * lead byte.
 *
 * NOTE(review): listing line 173 (function name and parameter list) is
 * missing; the symbol index at the end of this page gives it as
 *   static zip_uint32_t _zip_unicode_to_utf8(zip_uint32_t codepoint, zip_uint8_t *buf)
 */
172 static zip_uint32_t
    /* 1 byte: the value itself. */
174  if (codepoint < 0x0080) {
175  buf[0] = codepoint & 0xff;
176  return 1;
177  }
    /* 2 bytes: 5 payload bits in the lead byte, 6 in the following byte. */
178  if (codepoint < 0x0800) {
179  buf[0] = (zip_uint8_t)(UTF_8_LEN_2_MATCH | ((codepoint >> 6) & 0x1f));
180  buf[1] = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | (codepoint & 0x3f));
181  return 2;
182  }
    /* 3 bytes: 4 payload bits in the lead byte, 6 in each following byte. */
183  if (codepoint < 0x10000) {
184  buf[0] = (zip_uint8_t)(UTF_8_LEN_3_MATCH | ((codepoint >> 12) & 0x0f));
185  buf[1] = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | ((codepoint >> 6) & 0x3f));
186  buf[2] = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | (codepoint & 0x3f));
187  return 3;
188  }
    /* 4 bytes: 3 payload bits in the lead byte, 6 in each following byte. */
189  buf[0] = (zip_uint8_t)(UTF_8_LEN_4_MATCH | ((codepoint >> 18) & 0x07));
190  buf[1] = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | ((codepoint >> 12) & 0x3f));
191  buf[2] = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | ((codepoint >> 6) & 0x3f));
192  buf[3] = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | (codepoint & 0x3f));
193  return 4;
194 }
195 
196 
/*
 * _zip_cp437_to_utf8(): convert `len` CP437 bytes to a newly malloc()ed,
 * 0-terminated UTF-8 buffer.
 *
 * Returns the new buffer, or NULL when len is 0 or malloc() fails. When
 * utf8_lenp is non-NULL it receives the UTF-8 length without the
 * terminating 0 byte (0 for empty input); it is not written on the
 * visible malloc-failure path.
 * The buffer comes from malloc(); presumably the caller releases it with
 * free() -- confirm against callers.
 *
 * NOTE(review): this listing skips its own lines 198, 201, 211 and 214
 * (function header, a local declaration, the body of the sizing loop and
 * the first statement of the malloc-failure branch). The symbol index at
 * the end of this page gives the signature as
 *   zip_uint8_t *_zip_cp437_to_utf8(const zip_uint8_t *const _cp437buf,
 *                                   zip_uint32_t len, zip_uint32_t *utf8_lenp,
 *                                   zip_error_t *error)
 * Confirm the missing lines against the upstream source.
 */
197 zip_uint8_t *
    /* Drops the const qualifier of the parameter; the visible code only reads
     * through this pointer. */
199  zip_uint8_t *cp437buf = (zip_uint8_t *)_cp437buf;
200  zip_uint8_t *utf8buf;
202 
    /* Empty input: report length 0 and return no buffer. */
203  if (len == 0) {
204  if (utf8_lenp)
205  *utf8_lenp = 0;
206  return NULL;
207  }
208 
    /* Start at 1 to reserve room for the terminating 0 byte written below.
     * NOTE(review): the loop body (listing line 211) is missing; presumably
     * it adds _zip_unicode_to_utf8_len() of each mapped character to
     * buflen -- confirm. */
209  buflen = 1;
210  for (i = 0; i < len; i++)
212 
    /* NOTE(review): listing line 214 is missing; presumably it records
     * ZIP_ER_MEMORY in `error` via zip_error_set() -- confirm. */
213  if ((utf8buf = (zip_uint8_t *)malloc(buflen)) == NULL) {
215  return NULL;
216  }
217 
    /* Second pass: map each byte through the table and append its UTF-8 form. */
218  offset = 0;
219  for (i = 0; i < len; i++)
220  offset += _zip_unicode_to_utf8(_cp437_to_unicode[cp437buf[i]], utf8buf + offset);
221 
    /* Terminate the buffer; the reported length excludes the terminator. */
222  utf8buf[buflen - 1] = 0;
223  if (utf8_lenp)
224  *utf8_lenp = buflen - 1;
225  return utf8buf;
226 }
size_t len
Definition: 6502dis.c:15
lzma_index ** i
Definition: index.h:629
#define NULL
Definition: cris-opc.c:27
struct tab * done
Definition: enough.c:233
voidpf uLong offset
Definition: ioapi.h:144
voidpf void * buf
Definition: ioapi.h:138
ZIP_EXTERN void zip_error_set(zip_error_t *_Nullable, int, int)
Definition: zip_error.c:126
#define ZIP_ER_MEMORY
Definition: zip.h:119
void * malloc(size_t size)
Definition: malloc.c:123
const char * name
Definition: op.c:541
Definition: z80asm.h:102
Definition: zip.h:284
ut64 buflen
Definition: core.c:76
void error(const char *msg)
Definition: untgz.c:593
#define UTF_8_LEN_4_MASK
Definition: zip_utf-8.c:93
#define UTF_8_CONTINUE_MATCH
Definition: zip_utf-8.c:96
#define UTF_8_LEN_4_MATCH
Definition: zip_utf-8.c:94
#define UTF_8_LEN_3_MATCH
Definition: zip_utf-8.c:92
#define UTF_8_LEN_3_MASK
Definition: zip_utf-8.c:91
#define UTF_8_LEN_2_MASK
Definition: zip_utf-8.c:89
static const zip_uint16_t _cp437_to_unicode[256]
Definition: zip_utf-8.c:40
static zip_uint32_t _zip_unicode_to_utf8(zip_uint32_t codepoint, zip_uint8_t *buf)
Definition: zip_utf-8.c:173
zip_uint8_t * _zip_cp437_to_utf8(const zip_uint8_t *const _cp437buf, zip_uint32_t len, zip_uint32_t *utf8_lenp, zip_error_t *error)
Definition: zip_utf-8.c:198
#define UTF_8_CONTINUE_MASK
Definition: zip_utf-8.c:95
#define UTF_8_LEN_2_MATCH
Definition: zip_utf-8.c:90
zip_encoding_type_t _zip_guess_encoding(zip_string_t *str, zip_encoding_type_t expected_encoding)
Definition: zip_utf-8.c:100
static zip_uint32_t _zip_unicode_to_utf8_len(zip_uint32_t codepoint)
Definition: zip_utf-8.c:161
uint32_t zip_uint32_t
Definition: zipconf.h:37
uint8_t zip_uint8_t
Definition: zipconf.h:33
uint16_t zip_uint16_t
Definition: zipconf.h:35
enum zip_encoding_type zip_encoding_type_t
Definition: zipint.h:262
@ ZIP_ENCODING_ERROR
Definition: zipint.h:259
@ ZIP_ENCODING_UTF8_GUESSED
Definition: zipint.h:257
@ ZIP_ENCODING_UTF8_KNOWN
Definition: zipint.h:256
@ ZIP_ENCODING_UNKNOWN
Definition: zipint.h:254
@ ZIP_ENCODING_CP437
Definition: zipint.h:258
@ ZIP_ENCODING_ASCII
Definition: zipint.h:255