Rizin
unix-like reverse engineering framework and cli tools
ascmagic.c
Go to the documentation of this file.
1 /* $OpenBSD: ascmagic.c,v 1.11 2009/10/27 23:59:37 deraadt Exp $ */
2 /*
3  * Copyright (c) Ian F. Darwin 1986-1995.
4  * Software written by Ian F. Darwin and others;
5  * maintained 1995-present by Christos Zoulas and others.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  * notice immediately at the beginning of the file, without modification,
12  * this list of conditions, and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  * notice, this list of conditions and the following disclaimer in the
15  * documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
21  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 /*
30  * ASCII magic -- file types that we know based on keywords
31  * that can appear anywhere in the file.
32  *
33  * Extensively modified by Eric Fischer <enf@pobox.com> in July, 2000,
34  * to handle character codes other than ASCII on a unified basis.
35  *
36  * Joerg Wunsch <joerg@freebsd.org> wrote the original support for 8-bit
37  * international characters, now subsumed into this file.
38  */
39 #include <rz_userconf.h>
40 
41 #if !USE_LIB_MAGIC
42 
43 #include "file.h"
44 #include <stdio.h>
45 #include <string.h>
46 #include <memory.h>
47 #include <ctype.h>
48 #include <stdlib.h>
49 #include "names.h"
50 
51 #define MAXLINELEN 300 /* longest sane line length */
52 #define ISSPC(x) ((x) == ' ' || (x) == '\t' || (x) == '\r' || (x) == '\n' || (x) == 0x85 || (x) == '\f')
53 
54 static int looks_ascii(const ut8 *, size_t, unichar *, size_t *);
55 static int looks_utf8_with_BOM(const ut8 *, size_t, unichar *,
56  size_t *);
57 int file_looks_utf8(const ut8 *, size_t, unichar *, size_t *);
58 static int looks_ucs16(const ut8 *, size_t, unichar *, size_t *);
59 static int looks_latin1(const ut8 *, size_t, unichar *, size_t *);
60 static int looks_extended(const ut8 *, size_t, unichar *, size_t *);
61 static void from_ebcdic(const ut8 *, size_t, ut8 *);
62 static int ascmatch(const ut8 *, const unichar *, size_t);
63 static ut8 *encode_utf8(ut8 *, size_t, unichar *, size_t);
64 
65 int file_ascmagic(RzMagic *ms, const ut8 *buf, size_t nbytes) {
66  return 0;
67  size_t i;
68  ut8 *nbuf = NULL, *utf8_buf = NULL, *utf8_end;
69  unichar *ubuf = NULL;
70  size_t ulen, mlen;
71  const struct names *p;
72  int rv = -1;
73  int mime = ms->flags & RZ_MAGIC_MIME;
74 
75  const char *code = NULL;
76  const char *code_mime = NULL;
77  const char *type = NULL;
78  const char *subtype = NULL;
79  const char *subtype_mime = NULL;
80 
81  int has_escapes = 0;
82  int has_backspace = 0;
83  int seen_cr = 0;
84 
85  int n_crlf = 0;
86  int n_lf = 0;
87  int n_cr = 0;
88  int n_nel = 0;
89 
90  size_t last_line_end = (size_t)-1;
91  int has_long_lines = 0;
92 
93  /*
94  * Undo the NUL-termination kindly provided by process()
95  * but leave at least one byte to look at
96  */
97  while (nbytes > 1 && buf[nbytes - 1] == '\0') {
98  nbytes--;
99  }
100 
101  if (!(nbuf = calloc(1, (nbytes + 1) * sizeof(nbuf[0])))) {
102  goto done;
103  }
104  if (!(ubuf = calloc(1, (nbytes + 1) * sizeof(ubuf[0])))) {
105  goto done;
106  }
107 
108  /*
109  * Then try to determine whether it's any character code we can
110  * identify. Each of these tests, if it succeeds, will leave
111  * the text converted into one-unichar-per-character Unicode in
112  * ubuf, and the number of characters converted in ulen.
113  */
114  if (looks_ascii(buf, nbytes, ubuf, &ulen)) {
115  code = "ASCII";
116  code_mime = "us-ascii";
117  type = "text";
118  } else if (looks_utf8_with_BOM(buf, nbytes, ubuf, &ulen) > 0) {
119  code = "UTF-8 Unicode (with BOM)";
120  code_mime = "utf-8";
121  type = "text";
122  } else if (file_looks_utf8(buf, nbytes, ubuf, &ulen) > 1) {
123  code = "UTF-8 Unicode";
124  code_mime = "utf-8";
125  type = "text";
126  } else if ((i = looks_ucs16(buf, nbytes, ubuf, &ulen)) != 0) {
127  if (i == 1) {
128  code = "Little-endian UTF-16 Unicode";
129  } else {
130  code = "Big-endian UTF-16 Unicode";
131  }
132 
133  type = "character data";
134  code_mime = "utf-16"; /* is this defined? */
135  } else if (looks_latin1(buf, nbytes, ubuf, &ulen)) {
136  if (!memcmp(buf, "\xff\xff\xff\xff", 4)) {
137  // uninitialized memory is not iso-8859!!
138  goto done;
139  }
140  code = "ISO-8859";
141  type = "text";
142  code_mime = "iso-8859-1";
143  } else if (looks_extended(buf, nbytes, ubuf, &ulen)) {
144  code = "Non-ISO extended-ASCII";
145  type = "text";
146  code_mime = "unknown";
147  } else {
148  from_ebcdic(buf, nbytes, nbuf);
149 
150  if (looks_ascii(nbuf, nbytes, ubuf, &ulen)) {
151  code = "EBCDIC";
152  type = "character data";
153  code_mime = "ebcdic";
154  } else if (looks_latin1(nbuf, nbytes, ubuf, &ulen)) {
155  code = "International EBCDIC";
156  type = "character data";
157  code_mime = "ebcdic";
158  } else {
159  rv = 0;
160  goto done; /* doesn't look like text at all */
161  }
162  }
163 
164  if (nbytes <= 1) {
165  rv = 0;
166  goto done;
167  }
168 
169  /* Convert ubuf to UTF-8 and try text soft magic */
170  /* If original was ASCII or UTF-8, could use nbuf instead of
171  re-converting. */
172  /* malloc size is a conservative overestimate; could be
173  re-converting improved, or at least realloced after
174  re-converting conversion. */
175  mlen = ulen * 6;
176  if (!(utf8_buf = malloc(mlen))) {
177  file_oomem(ms, mlen);
178  goto done;
179  }
180  if (!(utf8_end = encode_utf8(utf8_buf, mlen, ubuf, ulen))) {
181  goto done;
182  }
183  if (file_softmagic(ms, utf8_buf, utf8_end - utf8_buf, TEXTTEST) != 0) {
184  rv = 1;
185  goto done;
186  }
187 
188  /* look for tokens from names.h - this is expensive! */
189  if ((ms->flags & RZ_MAGIC_NO_CHECK_TOKENS) != 0) {
190  goto subtype_identified;
191  }
192 
193  i = 0;
194  while (i < ulen) {
195  size_t end;
196 
197  /* skip past any leading space */
198  while (i < ulen && ISSPC(ubuf[i])) {
199  i++;
200  }
201  if (i >= ulen) {
202  break;
203  }
204 
205  /* find the next whitespace */
206  for (end = i + 1; end < nbytes; end++) {
207  if (ISSPC(ubuf[end])) {
208  break;
209  }
210  }
211 
212  /* compare the word thus isolated against the token list */
213  for (p = names; p < names + NNAMES; p++) {
214  if (ascmatch((const ut8 *)p->name, ubuf + i,
215  end - i)) {
216  subtype = types[p->type].human;
217  subtype_mime = types[p->type].mime;
218  goto subtype_identified;
219  }
220  }
221 
222  i = end;
223  }
224 
225 subtype_identified:
226 
227  /* Now try to discover other details about the file. */
228  for (i = 0; i < ulen; i++) {
229  if (ubuf[i] == '\n') {
230  if (seen_cr) {
231  n_crlf++;
232  } else {
233  n_lf++;
234  }
235  last_line_end = i;
236  } else if (seen_cr) {
237  n_cr++;
238  }
239 
240  seen_cr = (ubuf[i] == '\r');
241  if (seen_cr) {
242  last_line_end = i;
243  }
244 
245  if (ubuf[i] == 0x85) { /* X3.64/ECMA-43 "next line" character */
246  n_nel++;
247  last_line_end = i;
248  }
249  /* If this line is _longer_ than MAXLINELEN, remember it. */
250  if (i > last_line_end + MAXLINELEN) {
251  has_long_lines = 1;
252  }
253 
254  if (ubuf[i] == '\033') {
255  has_escapes = 1;
256  }
257  if (ubuf[i] == '\b') {
258  has_backspace = 1;
259  }
260  }
261 
262  /* Beware, if the data has been truncated, the final CR could have
263  been followed by a LF. If we have HOWMANY bytes, it indicates
264  that the data might have been truncated, probably even before
265  this function was called. */
266  if (seen_cr && nbytes < HOWMANY) {
267  n_cr++;
268  }
269 
270  if (mime) {
271  if (mime & RZ_MAGIC_MIME_TYPE) {
272  if (subtype_mime) {
273  if (file_printf(ms, subtype_mime) == -1) {
274  goto done;
275  }
276  } else {
277  if (file_printf(ms, "text/plain") == -1) {
278  goto done;
279  }
280  }
281  }
282 
283  if ((mime == 0 || mime == RZ_MAGIC_MIME) && code_mime) {
284  if ((mime & RZ_MAGIC_MIME_TYPE) &&
285  file_printf(ms, " charset=") == -1) {
286  goto done;
287  }
288  if (file_printf(ms, code_mime) == -1) {
289  goto done;
290  }
291  }
292 
293  if (mime == RZ_MAGIC_MIME_ENCODING) {
294  if (file_printf(ms, "binary") == -1) {
295  rv = 1;
296  goto done;
297  }
298  }
299  } else {
300  if (file_printf(ms, code) == -1) {
301  goto done;
302  }
303 
304  if (subtype) {
305  if (file_printf(ms, " ") == -1) {
306  goto done;
307  }
308  if (file_printf(ms, subtype) == -1) {
309  goto done;
310  }
311  }
312 
313  if (file_printf(ms, " ") == -1) {
314  goto done;
315  }
316  if (file_printf(ms, type) == -1) {
317  goto done;
318  }
319 
320  if (has_long_lines) {
321  if (file_printf(ms, ", with very long lines") == -1) {
322  goto done;
323  }
324  }
325 
326  /*
327  * Only report line terminators if we find one other than LF,
328  * or if we find none at all.
329  */
330  if ((n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0) ||
331  (n_crlf != 0 || n_cr != 0 || n_nel != 0)) {
332  if (file_printf(ms, ", with") == -1) {
333  goto done;
334  }
335 
336  if (n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0) {
337  if (file_printf(ms, " no") == -1) {
338  goto done;
339  }
340  } else {
341  if (n_crlf) {
342  if (file_printf(ms, " CRLF") == -1) {
343  goto done;
344  }
345  if (n_cr || n_lf || n_nel) {
346  if (file_printf(ms, ",") == -1) {
347  goto done;
348  }
349  }
350  }
351  if (n_cr) {
352  if (file_printf(ms, " CR") == -1) {
353  goto done;
354  }
355  if (n_lf || n_nel) {
356  if (file_printf(ms, ",") == -1) {
357  goto done;
358  }
359  }
360  }
361  if (n_lf) {
362  if (file_printf(ms, " LF") == -1) {
363  goto done;
364  }
365  if (n_nel) {
366  if (file_printf(ms, ",") == -1) {
367  goto done;
368  }
369  }
370  }
371  if (n_nel) {
372  if (file_printf(ms, " NEL") == -1) {
373  goto done;
374  }
375  }
376  }
377 
378  if (file_printf(ms, " line terminators") == -1) {
379  goto done;
380  }
381  }
382 
383  if (has_escapes) {
384  if (file_printf(ms, ", with escape sequences") == -1) {
385  goto done;
386  }
387  }
388  if (has_backspace) {
389  if (file_printf(ms, ", with overstriking") == -1) {
390  goto done;
391  }
392  }
393  }
394  rv = 1;
395 done:
396  free(nbuf);
397  free(ubuf);
398  free(utf8_buf);
399  return rv;
400 }
401 
/*
 * Compare the NUL-terminated byte string s with the first ulen
 * characters of us.
 *
 * Returns 1 only when s is exactly ulen characters long and every
 * character equals the corresponding entry of us; otherwise 0.
 */
static int ascmatch(const ut8 *s, const unichar *us, size_t ulen) {
	size_t i;
	for (i = 0; i < ulen; i++) {
		/*
		 * Stop at the terminator of s: a zero in us would otherwise
		 * "match" it and the loop would keep reading past the end of s.
		 */
		if (s[i] == '\0' || s[i] != us[i]) {
			return 0;
		}
	}
	/* all ulen characters matched; s must end here too */
	return s[i] ? 0 : 1;
}
411 
412 /*
413  * This table reflects a particular philosophy about what constitutes
414  * "text," and there is room for disagreement about it.
415  *
416  * Version 3.31 of the file command considered a file to be ASCII if
417  * each of its characters was approved by either the isascii() or
418  * isalpha() function. On most systems, this would mean that any
419  * file consisting only of characters in the range 0x00 ... 0x7F
420  * would be called ASCII text, but many systems might reasonably
421  * consider some characters outside this range to be alphabetic,
422  * so the file command would call such characters ASCII. It might
423  * have been more accurate to call this "considered textual on the
424  * local system" than "ASCII."
425  *
426  * It considered a file to be "International language text" if each
427  * of its characters was either an ASCII printing character (according
428  * to the real ASCII standard, not the above test), a character in
429  * the range 0x80 ... 0xFF, or one of the following control characters:
430  * backspace, tab, line feed, vertical tab, form feed, carriage return,
431  * escape. No attempt was made to determine the language in which files
432  * of this type were written.
433  *
434  *
435  * The table below considers a file to be ASCII if all of its characters
436  * are either ASCII printing characters (again, according to the X3.4
437  * standard, not isascii()) or any of the following controls: bell,
438  * backspace, tab, line feed, form feed, carriage return, esc, nextline.
439  *
440  * I include bell because some programs (particularly shell scripts)
441  * use it literally, even though it is rare in normal text. I exclude
442  * vertical tab because it never seems to be used in real text. I also
443  * include, with hesitation, the X3.64/ECMA-43 control nextline (0x85),
444  * because that's what the dd EBCDIC->ASCII table maps the EBCDIC newline
445  * character to. It might be more appropriate to include it in the 8859
446  * set instead of the ASCII set, but it's got to be included in *something*
447  * we recognize or EBCDIC files aren't going to be considered textual.
448  * Some old Unix source files use SO/SI (^N/^O) to shift between Greek
449  * and Latin characters, so these should possibly be allowed. But they
450  * make a real mess on VT100-style displays if they're not paired properly,
451  * so we are probably better off not calling them text.
452  *
453  * A file is considered to be ISO-8859 text if its characters are all
454  * either ASCII, according to the above definition, or printing characters
455  * from the ISO-8859 8-bit extension, characters 0xA0 ... 0xFF.
456  *
457  * Finally, a file is considered to be international text from some other
458  * character code if its characters are all either ISO-8859 (according to
459  * the above definition) or characters in the range 0x80 ... 0x9F, which
460  * ISO-8859 considers to be control characters but the IBM PC and Macintosh
461  * consider to be printing characters.
462  */
463 
#define F 0 /* character never appears in text */
#define T 1 /* character appears in plain ASCII text */
#define I 2 /* character appears in ISO-8859 text */
#define X 3 /* character appears in non-ISO extended ASCII (Mac, IBM PC) */

/*
 * Classification of every byte value as F, T, I or X. The table is only
 * ever read, so it is const.
 */
static const char text_chars[256] = {
	/*                  BEL BS HT LF    FF CR    */
	F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F, /* 0x0X */
	/*                              ESC          */
	F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F, /* 0x1X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x2X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x3X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x4X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x5X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x6X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, /* 0x7X */
	/*            NEL                            */
	X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X, /* 0x8X */
	X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, /* 0x9X */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xaX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xbX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xcX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xdX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xeX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I /* 0xfX */
};
490 
491 static int looks_ascii(const ut8 *buf, size_t nbytes, unichar *ubuf, size_t *ulen) {
492  size_t i;
493  *ulen = 0;
494  for (i = 0; i < nbytes; i++) {
495  int t = text_chars[buf[i]];
496  if (t != T) {
497  return 0;
498  }
499  ubuf[(*ulen)++] = buf[i];
500  }
501  return 1;
502 }
503 
504 static int looks_latin1(const ut8 *buf, size_t nbytes, unichar *ubuf, size_t *ulen) {
505  size_t i;
506  *ulen = 0;
507 
508  for (i = 0; i < nbytes; i++) {
509  int t = text_chars[buf[i]];
510  if (t != T && t != I) {
511  return 0;
512  }
513  ubuf[(*ulen)++] = buf[i];
514  }
515  return 1;
516 }
517 
518 static int looks_extended(const ut8 *buf, size_t nbytes, unichar *ubuf, size_t *ulen) {
519  size_t i;
520  *ulen = 0;
521  for (i = 0; i < nbytes; i++) {
522  int t = text_chars[buf[i]];
523  if (t != T && t != I && t != X) {
524  return 0;
525  }
526  ubuf[(*ulen)++] = buf[i];
527  }
528  return 1;
529 }
530 
531 /*
532  * Encode Unicode string as UTF-8, returning pointer to character
533  * after end of string, or NULL if an invalid character is found.
534  */
535 static ut8 *
536 encode_utf8(ut8 *buf, size_t len, unichar *ubuf, size_t ulen) {
537  size_t i;
538  ut8 *end = buf + len;
539 
540  for (i = 0; i < ulen; i++) {
541  if (ubuf[i] <= 0x7f) {
542  if (end - buf < 1) {
543  return NULL;
544  }
545  *buf++ = (ut8)ubuf[i];
546  } else if (ubuf[i] <= 0x7ff) {
547  if (end - buf < 2) {
548  return NULL;
549  }
550  *buf++ = (ut8)((ubuf[i] >> 6) + 0xc0);
551  *buf++ = (ut8)((ubuf[i] & 0x3f) + 0x80);
552  } else if (ubuf[i] <= 0xffff) {
553  if (end - buf < 3) {
554  return NULL;
555  }
556  *buf++ = (ut8)((ubuf[i] >> 12) + 0xe0);
557  *buf++ = (ut8)(((ubuf[i] >> 6) & 0x3f) + 0x80);
558  *buf++ = (ut8)((ubuf[i] & 0x3f) + 0x80);
559  } else if (ubuf[i] <= 0x1fffff) {
560  if (end - buf < 4) {
561  return NULL;
562  }
563  *buf++ = (ut8)((ubuf[i] >> 18) + 0xf0);
564  *buf++ = (ut8)(((ubuf[i] >> 12) & 0x3f) + 0x80);
565  *buf++ = (ut8)(((ubuf[i] >> 6) & 0x3f) + 0x80);
566  *buf++ = (ut8)((ubuf[i] & 0x3f) + 0x80);
567  } else if (ubuf[i] <= 0x3ffffff) {
568  if (end - buf < 5) {
569  return NULL;
570  }
571  *buf++ = (ut8)((ubuf[i] >> 24) + 0xf8);
572  *buf++ = (ut8)(((ubuf[i] >> 18) & 0x3f) + 0x80);
573  *buf++ = (ut8)(((ubuf[i] >> 12) & 0x3f) + 0x80);
574  *buf++ = (ut8)(((ubuf[i] >> 6) & 0x3f) + 0x80);
575  *buf++ = (ut8)((ubuf[i] & 0x3f) + 0x80);
576  } else if (ubuf[i] <= 0x7fffffff) {
577  if (end - buf < 6) {
578  return NULL;
579  }
580  *buf++ = (ut8)((ubuf[i] >> 30) + 0xfc);
581  *buf++ = (ut8)(((ubuf[i] >> 24) & 0x3f) + 0x80);
582  *buf++ = (ut8)(((ubuf[i] >> 18) & 0x3f) + 0x80);
583  *buf++ = (ut8)(((ubuf[i] >> 12) & 0x3f) + 0x80);
584  *buf++ = (ut8)(((ubuf[i] >> 6) & 0x3f) + 0x80);
585  *buf++ = (ut8)((ubuf[i] & 0x3f) + 0x80);
586  } else { /* Invalid character */
587  return NULL;
588  }
589  }
590 
591  return buf;
592 }
593 
594 /*
595  * Decide whether some text looks like UTF-8. Returns:
596  *
597  * -1: invalid UTF-8
598  * 0: uses odd control characters, so doesn't look like text
599  * 1: 7-bit text
600  * 2: definitely UTF-8 text (valid high-bit set bytes)
601  *
602  * If ubuf is non-NULL on entry, text is decoded into ubuf, *ulen;
603  * ubuf must be big enough!
604  */
605 int file_looks_utf8(const ut8 *buf, size_t nbytes, unichar *ubuf, size_t *ulen) {
606  size_t i;
607  int n;
608  unichar c;
609  int gotone = 0, ctrl = 0;
610 
611  if (ubuf) {
612  *ulen = 0;
613  }
614 
615  for (i = 0; i < nbytes; i++) {
616  if ((buf[i] & 0x80) == 0) { /* 0xxxxxxx is plain ASCII */
617  /*
618  * Even if the whole file is valid UTF-8 sequences,
619  * still reject it if it uses weird control characters.
620  */
621 
622  if (text_chars[buf[i]] != T) {
623  ctrl = 1;
624  }
625 
626  if (ubuf) {
627  ubuf[(*ulen)++] = buf[i];
628  }
629  } else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */
630  return -1;
631  } else { /* 11xxxxxx begins UTF-8 */
632  int following;
633 
634  if ((buf[i] & 0x20) == 0) { /* 110xxxxx */
635  c = buf[i] & 0x1f;
636  following = 1;
637  } else if ((buf[i] & 0x10) == 0) { /* 1110xxxx */
638  c = buf[i] & 0x0f;
639  following = 2;
640  } else if ((buf[i] & 0x08) == 0) { /* 11110xxx */
641  c = buf[i] & 0x07;
642  following = 3;
643  } else if ((buf[i] & 0x04) == 0) { /* 111110xx */
644  c = buf[i] & 0x03;
645  following = 4;
646  } else if ((buf[i] & 0x02) == 0) { /* 1111110x */
647  c = buf[i] & 0x01;
648  following = 5;
649  } else {
650  return -1;
651  }
652 
653  for (n = 0; n < following; n++) {
654  i++;
655  if (i >= nbytes) {
656  goto done;
657  }
658 
659  if ((buf[i] & 0x80) == 0 || (buf[i] & 0x40)) {
660  return -1;
661  }
662 
663  c = (c << 6) + (buf[i] & 0x3f);
664  }
665 
666  if (ubuf) {
667  ubuf[(*ulen)++] = c;
668  }
669  gotone = 1;
670  }
671  }
672 done:
673  return ctrl ? 0 : (gotone ? 2 : 1);
674 }
675 
676 /*
677  * Decide whether some text looks like UTF-8 with BOM. If there is no
678  * BOM, return -1; otherwise return the result of looks_utf8 on the
679  * rest of the text.
680  */
681 static int looks_utf8_with_BOM(const ut8 *buf, size_t nbytes, unichar *ubuf, size_t *ulen) {
682  if (nbytes > 3 && buf[0] == 0xef && buf[1] == 0xbb && buf[2] == 0xbf) {
683  return file_looks_utf8(buf + 3, nbytes - 3, ubuf, ulen);
684  }
685  return -1;
686 }
687 
688 static int looks_ucs16(const ut8 *buf, size_t nbytes, unichar *ubuf, size_t *ulen) {
689  int bigend;
690  size_t i;
691 
692  if (nbytes < 2) {
693  return 0;
694  }
695 
696  if (buf[0] == 0xff && buf[1] == 0xfe) {
697  bigend = 0;
698  } else if (buf[0] == 0xfe && buf[1] == 0xff) {
699  bigend = 1;
700  } else {
701  return 0;
702  }
703 
704  *ulen = 0;
705 
706  for (i = 2; i + 1 < nbytes; i += 2) {
707  /* XXX fix to properly handle chars > 65536 */
708 
709  if (bigend) {
710  ubuf[(*ulen)++] = buf[i + 1] + 256 * buf[i];
711  } else {
712  ubuf[(*ulen)++] = buf[i] + 256 * buf[i + 1];
713  }
714 
715  if (ubuf[*ulen - 1] == 0xfffe) {
716  return 0;
717  }
718  if (ubuf[*ulen - 1] < 128 && text_chars[(size_t)ubuf[*ulen - 1]] != T) {
719  return 0;
720  }
721  }
722  return 1 + bigend;
723 }
724 
725 #undef F
726 #undef T
727 #undef I
728 #undef X
729 
730 /*
731  * This table maps each EBCDIC character to an (8-bit extended) ASCII
732  * character, as specified in the rationale for the dd(1) command in
733  * draft 11.2 (September, 1991) of the POSIX P1003.2 standard.
734  *
735  * Unfortunately it does not seem to correspond exactly to any of the
736  * five variants of EBCDIC documented in IBM's _Enterprise Systems
737  * Architecture/390: Principles of Operation_, SA22-7201-06, Seventh
738  * Edition, July, 1999, pp. I-1 - I-4.
739  *
740  * Fortunately, though, all versions of EBCDIC, including this one, agree
741  * on most of the printing characters that also appear in (7-bit) ASCII.
742  * Of these, only '|', '!', '~', '^', '[', and ']' are in question at all.
743  *
744  * Fortunately too, there is general agreement that codes 0x00 through
745  * 0x3F represent control characters, 0x41 a nonbreaking space, and the
746  * remainder printing characters.
747  *
748  * This is sufficient to allow us to identify EBCDIC text and to distinguish
749  * between old-style and internationalized examples of text.
750  */
751 
/*
 * EBCDIC byte -> (8-bit extended) ASCII byte. Read-only, and sized
 * explicitly so that every possible ut8 index is covered.
 */
static const ut8 ebcdic_to_ascii[256] = {
	0, 1, 2, 3, 156, 9, 134, 127, 151, 141, 142, 11, 12, 13, 14, 15,
	16, 17, 18, 19, 157, 133, 8, 135, 24, 25, 146, 143, 28, 29, 30, 31,
	128, 129, 130, 131, 132, 10, 23, 27, 136, 137, 138, 139, 140, 5, 6, 7,
	144, 145, 22, 147, 148, 149, 150, 4, 152, 153, 154, 155, 20, 21, 158, 26,
	' ', 160, 161, 162, 163, 164, 165, 166, 167, 168, 213, '.', '<', '(', '+', '|',
	'&', 169, 170, 171, 172, 173, 174, 175, 176, 177, '!', '$', '*', ')', ';', '~',
	'-', '/', 178, 179, 180, 181, 182, 183, 184, 185, 203, ',', '%', '_', '>', '?',
	186, 187, 188, 189, 190, 191, 192, 193, 194, '`', ':', '#', '@', '\'', '=', '"',
	195, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 196, 197, 198, 199, 200, 201,
	202, 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', '^', 204, 205, 206, 207, 208,
	209, 229, 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 210, 211, 212, '[', 214, 215,
	216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, ']', 230, 231,
	'{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 232, 233, 234, 235, 236, 237,
	'}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 238, 239, 240, 241, 242, 243,
	'\\', 159, 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 244, 245, 246, 247, 248, 249,
	'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 250, 251, 252, 253, 254, 255
};
770 
#ifdef notdef
/*
 * The following EBCDIC-to-ASCII table may relate more closely to reality,
 * or at least to modern reality. It comes from
 *
 * http://ftp.s390.ibm.com/products/oe/bpxqp9.html
 *
 * and maps the characters of EBCDIC code page 1047 (the code used for
 * Unix-derived software on IBM's 390 systems) to the corresponding
 * characters from ISO 8859-1.
 *
 * If this table is used instead of the above one, some of the special
 * cases for the NEL character can be taken out of the code.
 *
 * The table is compiled only when "notdef" is defined; nothing in this
 * file references it. Each row below covers 16 consecutive EBCDIC codes,
 * starting at 0x00. Here EBCDIC 0x15 maps to 0x0A (LF) and 0x25 maps to
 * 0x85 (NEL).
 */

static ut8 ebcdic_1047_to_8859[] = {
	0x00, 0x01, 0x02, 0x03, 0x9C, 0x09, 0x86, 0x7F, 0x97, 0x8D, 0x8E, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
	0x10, 0x11, 0x12, 0x13, 0x9D, 0x0A, 0x08, 0x87, 0x18, 0x19, 0x92, 0x8F, 0x1C, 0x1D, 0x1E, 0x1F,
	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x17, 0x1B, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x05, 0x06, 0x07,
	0x90, 0x91, 0x16, 0x93, 0x94, 0x95, 0x96, 0x04, 0x98, 0x99, 0x9A, 0x9B, 0x14, 0x15, 0x9E, 0x1A,
	0x20, 0xA0, 0xE2, 0xE4, 0xE0, 0xE1, 0xE3, 0xE5, 0xE7, 0xF1, 0xA2, 0x2E, 0x3C, 0x28, 0x2B, 0x7C,
	0x26, 0xE9, 0xEA, 0xEB, 0xE8, 0xED, 0xEE, 0xEF, 0xEC, 0xDF, 0x21, 0x24, 0x2A, 0x29, 0x3B, 0x5E,
	0x2D, 0x2F, 0xC2, 0xC4, 0xC0, 0xC1, 0xC3, 0xC5, 0xC7, 0xD1, 0xA6, 0x2C, 0x25, 0x5F, 0x3E, 0x3F,
	0xF8, 0xC9, 0xCA, 0xCB, 0xC8, 0xCD, 0xCE, 0xCF, 0xCC, 0x60, 0x3A, 0x23, 0x40, 0x27, 0x3D, 0x22,
	0xD8, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xAB, 0xBB, 0xF0, 0xFD, 0xFE, 0xB1,
	0xB0, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, 0x72, 0xAA, 0xBA, 0xE6, 0xB8, 0xC6, 0xA4,
	0xB5, 0x7E, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0xA1, 0xBF, 0xD0, 0x5B, 0xDE, 0xAE,
	0xAC, 0xA3, 0xA5, 0xB7, 0xA9, 0xA7, 0xB6, 0xBC, 0xBD, 0xBE, 0xDD, 0xA8, 0xAF, 0x5D, 0xB4, 0xD7,
	0x7B, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0xAD, 0xF4, 0xF6, 0xF2, 0xF3, 0xF5,
	0x7D, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, 0x50, 0x51, 0x52, 0xB9, 0xFB, 0xFC, 0xF9, 0xFA, 0xFF,
	0x5C, 0xF7, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0xB2, 0xD4, 0xD6, 0xD2, 0xD3, 0xD5,
	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0xB3, 0xDB, 0xDC, 0xD9, 0xDA, 0x9F
};
#endif
805 
806 /*
807  * Copy buf[0 ... nbytes-1] into out[], translating EBCDIC to ASCII.
808  */
809 static void from_ebcdic(const ut8 *buf, size_t nbytes, ut8 *out) {
810  size_t i;
811  for (i = 0; i < nbytes; i++) {
812  out[i] = ebcdic_to_ascii[buf[i]];
813  }
814 }
815 #endif
size_t len
Definition: 6502dis.c:15
lzma_index ** i
Definition: index.h:629
#define ISSPC(x)
Definition: ascmagic.c:52
#define T
Definition: ascmagic.c:465
static void from_ebcdic(const ut8 *, size_t, ut8 *)
Definition: ascmagic.c:809
static ut8 * encode_utf8(ut8 *, size_t, unichar *, size_t)
Definition: ascmagic.c:536
#define X
Definition: ascmagic.c:467
static int looks_latin1(const ut8 *, size_t, unichar *, size_t *)
Definition: ascmagic.c:504
static int looks_ucs16(const ut8 *, size_t, unichar *, size_t *)
Definition: ascmagic.c:688
int file_ascmagic(RzMagic *ms, const ut8 *buf, size_t nbytes)
Definition: ascmagic.c:65
#define F
Definition: ascmagic.c:464
static ut8 ebcdic_to_ascii[]
Definition: ascmagic.c:752
int file_looks_utf8(const ut8 *, size_t, unichar *, size_t *)
Definition: ascmagic.c:605
#define I
Definition: ascmagic.c:466
#define MAXLINELEN
Definition: ascmagic.c:51
static int looks_extended(const ut8 *, size_t, unichar *, size_t *)
Definition: ascmagic.c:518
static int looks_utf8_with_BOM(const ut8 *, size_t, unichar *, size_t *)
Definition: ascmagic.c:681
static int looks_ascii(const ut8 *, size_t, unichar *, size_t *)
Definition: ascmagic.c:491
static int ascmatch(const ut8 *, const unichar *, size_t)
Definition: ascmagic.c:402
static char text_chars[256]
Definition: ascmagic.c:469
const lzma_allocator const uint8_t size_t uint8_t * out
Definition: block.h:528
#define NULL
Definition: cris-opc.c:27
static static sync static getppid static getegid const char static filename char static len const char char static bufsiz static mask static vfork const void static prot static getpgrp const char static swapflags static arg static fd static protocol static who struct sockaddr static addrlen static backlog struct timeval struct timezone static tz const struct iovec static count static mode const void const struct sockaddr static tolen const char static pathname void static offset struct stat static buf void nbytes
Definition: sflib.h:113
#define ut8
Definition: dcpu16.h:8
struct tab * done
Definition: enough.c:233
void file_oomem(struct rz_magic_set *, size_t)
int file_printf(struct rz_magic_set *, const char *,...)
int file_softmagic(struct rz_magic_set *, const unsigned char *, size_t, int)
unsigned long unichar
Definition: file.h:51
checking print the parsed form of the magic use in n conjunction with m to debug a new magic file n before installing it n mime
Definition: file_opts.h:30
RZ_API void Ht_() free(HtName_(Ht) *ht)
Definition: ht_inc.c:130
voidpf void * buf
Definition: ioapi.h:138
uint8_t ut8
Definition: lh5801.h:11
void * p
Definition: libc.cpp:67
void * malloc(size_t size)
Definition: malloc.c:123
void * calloc(size_t number, size_t size)
Definition: malloc.c:102
int n
Definition: mipsasm.c:19
int type
Definition: mipsasm.c:17
#define NNAMES
Definition: names.h:180
insn_type_descr_t types[]
Definition: or1k_disas.c:7
static RzSocket * s
Definition: rtr.c:28
int size_t
Definition: sftypes.h:40
#define c(i)
Definition: sha256.c:43
Definition: inftree9.h:24
Definition: names.h:123