Rizin
UNIX-like reverse engineering framework and CLI tools
utf8.h
Go to the documentation of this file.
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 * Copyright (C) 1999-2015, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 *******************************************************************************
10 * file name: utf8.h
11 * encoding: UTF-8
12 * tab size: 8 (not used)
13 * indentation:4
14 *
15 * created on: 1999sep13
16 * created by: Markus W. Scherer
17 */
18 
34 #ifndef __UTF8_H__
35 #define __UTF8_H__
36 
37 #include "unicode/umachine.h"
38 #ifndef __UTF_H__
39 # include "unicode/utf.h"
40 #endif
41 
42 /* internal definitions ----------------------------------------------------- */
43 
/**
 * Number of trail bytes implied by a UTF-8 lead byte.
 *
 * Yields 0 when U8_IS_LEAD(leadByte) is false. Otherwise yields 1, plus one
 * for each of the thresholds 0xe0 and 0xf0 that the byte reaches, i.e. 1..3.
 *
 * @param leadByte The first byte of a sequence; evaluated more than once,
 *                 so it should not have side effects.
 */
55 #define U8_COUNT_TRAIL_BYTES(leadByte) \
56  (U8_IS_LEAD(leadByte) ? \
57  ((uint8_t)(leadByte)>=0xe0)+((uint8_t)(leadByte)>=0xf0)+1 : 0)
58 
/**
 * Number of trail bytes implied by a lead byte, without a U8_IS_LEAD check.
 *
 * Adds up three comparisons (>=0xc2, >=0xe0, >=0xf0), so the result is 0..3.
 * Because there is no upper-bound test, bytes above 0xf4 also yield 3.
 *
 * @param leadByte The first byte of a sequence; evaluated three times.
 */
70 #define U8_COUNT_TRAIL_BYTES_UNSAFE(leadByte) \
71  (((uint8_t)(leadByte)>=0xc2)+((uint8_t)(leadByte)>=0xe0)+((uint8_t)(leadByte)>=0xf0))
72 
/**
 * Strip the length-marker bits from a lead byte, in place.
 *
 * Keeps only the low (6 - countTrailBytes) bits of leadByte by and-assigning
 * the mask (1<<(6-countTrailBytes))-1.
 *
 * @param leadByte        Modifiable lvalue holding the lead byte; overwritten.
 * @param countTrailBytes Number of trail bytes that follow the lead byte.
 */
80 #define U8_MASK_LEAD_BYTE(leadByte, countTrailBytes) ((leadByte)&=(1<<(6-(countTrailBytes)))-1)
81 
/**
 * Lookup table (as a string literal) for validating the first trail byte of
 * a 3-byte sequence; consumed by U8_IS_VALID_LEAD3_AND_T1 and
 * U8_INTERNAL_NEXT_OR_SUB.
 *
 * Index: low 4 bits of the lead byte. Each entry is a bit set that is tested
 * with bit number (t1>>5): bit 4 corresponds to t1 in 0x80..0x9f, bit 5 to
 * t1 in 0xa0..0xbf.
 * Entry 0x0 is 0x20 (only 0xa0..0xbf accepted), entry 0xd is 0x10 (only
 * 0x80..0x9f accepted), every other entry is 0x30 (0x80..0xbf accepted).
 */
90 #define U8_LEAD3_T1_BITS "\x20\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x10\x30\x30"
91 
/**
 * Test whether t1 is an acceptable first trail byte after a 3-byte lead.
 *
 * Looks up U8_LEAD3_T1_BITS[lead&0xf] and tests bit ((uint8_t)t1>>5).
 * The result is zero or a non-zero bit value (not necessarily 1).
 * Only the low 4 bits of lead are used, so the caller is expected to have
 * established that lead is a 3-byte lead already.
 *
 * @param lead Lead byte of the sequence.
 * @param t1   First trail byte.
 */
97 #define U8_IS_VALID_LEAD3_AND_T1(lead, t1) (U8_LEAD3_T1_BITS[(lead)&0xf]&(1<<((uint8_t)(t1)>>5)))
98 
/**
 * Lookup table (as a string literal) for validating the first trail byte of
 * a 4-byte sequence; consumed by U8_IS_VALID_LEAD4_AND_T1 and
 * U8_INTERNAL_NEXT_OR_SUB.
 *
 * Index: high 4 bits of the first trail byte (t1>>4). Each entry is a bit
 * set that is tested with bit number (lead&7).
 * Entry 0x8 is 0x1E (lead&7 in 1..4 accepted), entries 0x9..0xb are 0x0F
 * (lead&7 in 0..3 accepted), every other entry is 0 (t1 rejected).
 */
107 #define U8_LEAD4_T1_BITS "\x00\x00\x00\x00\x00\x00\x00\x00\x1E\x0F\x0F\x0F\x00\x00\x00\x00"
108 
/**
 * Test whether t1 is an acceptable first trail byte after a 4-byte lead.
 *
 * Looks up U8_LEAD4_T1_BITS[(uint8_t)t1>>4] and tests bit (lead&7).
 * The result is zero or a non-zero bit value (not necessarily 1).
 * Only the low 3 bits of lead are used, so the caller is expected to have
 * established that lead is a 4-byte lead already.
 *
 * @param lead Lead byte of the sequence.
 * @param t1   First trail byte.
 */
114 #define U8_IS_VALID_LEAD4_AND_T1(lead, t1) (U8_LEAD4_T1_BITS[(uint8_t)(t1)>>4]&(1<<((lead)&7)))
115 
127 
139 
151 
163 
164 /* single-code point definitions -------------------------------------------- */
165 
/**
 * True when the code unit encodes a code point by itself, i.e. when its
 * 0x80 bit is clear.
 *
 * @param c Code unit (byte) to test.
 */
172 #define U8_IS_SINGLE(c) (((c)&0x80)==0)
173 
/**
 * True when the byte is in the lead-byte range 0xc2..0xf4.
 *
 * Implemented as a single unsigned comparison: subtracting 0xc2 and
 * truncating to uint8_t makes values below 0xc2 wrap around to large
 * numbers, so one "<= 0x32" test covers both bounds.
 *
 * @param c Code unit (byte) to test.
 */
180 #define U8_IS_LEAD(c) ((uint8_t)((c)-0xc2)<=0x32)
181 // 0x32=0xf4-0xc2
182 
/**
 * True when the byte is a trail byte.
 *
 * Reinterprets the byte as int8_t and tests for a value below -0x40; with
 * the usual two's-complement conversion that is exactly the byte range
 * 0x80..0xbf.
 *
 * @param c Code unit (byte) to test.
 */
189 #define U8_IS_TRAIL(c) ((int8_t)(c)<-0x40)
190 
/**
 * Number of UTF-8 code units needed to encode a code point.
 *
 * Result by range of (uint32_t)c:
 *   0..0x7f          -> 1
 *   0x80..0x7ff      -> 2
 *   0x800..0xd7ff    -> 3
 *   0xd800..0xdfff   -> 0 (not encodable)
 *   0xe000..0xffff   -> 3
 *   0x10000..0x10ffff-> 4
 *   above 0x10ffff   -> 0 (not encodable)
 *
 * @param c Code point; evaluated several times.
 */
198 #define U8_LENGTH(c) \
199  ((uint32_t)(c)<=0x7f ? 1 : \
200  ((uint32_t)(c)<=0x7ff ? 2 : \
201  ((uint32_t)(c)<=0xd7ff ? 3 : \
202  ((uint32_t)(c)<=0xdfff || (uint32_t)(c)>0x10ffff ? 0 : \
203  ((uint32_t)(c)<=0xffff ? 3 : 4)\
204  ) \
205  ) \
206  ) \
207  )
208 
/**
 * Maximum number of UTF-8 code units per code point; this is the largest
 * value that U8_LENGTH can yield.
 */
214 #define U8_MAX_LENGTH 4
215 
/**
 * Read the code point that contains the byte at offset i, without checks.
 *
 * Works on a private int32_t copy of i: the copy is first moved back to the
 * start of the sequence with U8_SET_CP_START_UNSAFE and then decoded with
 * U8_NEXT_UNSAFE. The caller's i is therefore not modified.
 *
 * @param s Byte string.
 * @param i Offset of any byte of the code point (read only).
 * @param c Output lvalue that receives the decoded value.
 */
232 #define U8_GET_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
233  int32_t _u8_get_unsafe_index=(int32_t)(i); \
234  U8_SET_CP_START_UNSAFE(s, _u8_get_unsafe_index); \
235  U8_NEXT_UNSAFE(s, _u8_get_unsafe_index, c); \
236 } UPRV_BLOCK_MACRO_END
237 
/**
 * Read the code point that contains the byte at offset i, with checks.
 *
 * Works on a private int32_t copy of i: the copy is first adjusted with
 * U8_SET_CP_START (bounded by start) and then decoded with U8_NEXT (bounded
 * by length). The caller's i is not modified. For ill-formed input c
 * receives whatever U8_NEXT produces, i.e. U_SENTINEL.
 *
 * @param s      Byte string.
 * @param start  Lowest offset the backward search may reach.
 * @param i      Offset of any byte of the code point (read only).
 * @param length Limit offset passed on to U8_NEXT.
 * @param c      Output lvalue that receives the decoded value.
 */
259 #define U8_GET(s, start, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
260  int32_t _u8_get_index=(i); \
261  U8_SET_CP_START(s, start, _u8_get_index); \
262  U8_NEXT(s, _u8_get_index, length, c); \
263 } UPRV_BLOCK_MACRO_END
264 
/**
 * Same as U8_GET, except that decoding is done with U8_NEXT_OR_FFFD, so
 * ill-formed input yields 0xfffd in c instead of U_SENTINEL.
 *
 * The caller's i is not modified; a private int32_t copy is used.
 *
 * @param s      Byte string.
 * @param start  Lowest offset the backward search may reach.
 * @param i      Offset of any byte of the code point (read only).
 * @param length Limit offset passed on to U8_NEXT_OR_FFFD.
 * @param c      Output lvalue that receives the decoded value.
 */
290 #define U8_GET_OR_FFFD(s, start, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
291  int32_t _u8_get_index=(i); \
292  U8_SET_CP_START(s, start, _u8_get_index); \
293  U8_NEXT_OR_FFFD(s, _u8_get_index, length, c); \
294 } UPRV_BLOCK_MACRO_END
295 
296 /* definitions with forward iteration --------------------------------------- */
297 
/**
 * Decode the code point starting at offset i and advance i past it,
 * without validation or bounds checks.
 *
 * The first byte selects the form: below 0x80 it is the result as is;
 * below 0xe0 one more byte is consumed; below 0xf0 two more; otherwise
 * three more. Trail bytes are only masked with 0x3f, never verified, and
 * there is no length parameter, so the input must be well-formed and
 * complete.
 *
 * @param s Byte string.
 * @param i Offset of the first byte of the sequence; advanced by 1..4.
 * @param c Output lvalue that receives the decoded value; also used as
 *          scratch while decoding.
 */
315 #define U8_NEXT_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
316  (c)=(uint8_t)(s)[(i)++]; \
317  if(!U8_IS_SINGLE(c)) { \
318  if((c)<0xe0) { \
319  (c)=(((c)&0x1f)<<6)|((s)[(i)++]&0x3f); \
320  } else if((c)<0xf0) { \
321  /* no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ \
322  (c)=(UChar)(((c)<<12)|(((s)[i]&0x3f)<<6)|((s)[(i)+1]&0x3f)); \
323  (i)+=2; \
324  } else { \
325  (c)=(((c)&7)<<18)|(((s)[i]&0x3f)<<12)|(((s)[(i)+1]&0x3f)<<6)|((s)[(i)+2]&0x3f); \
326  (i)+=3; \
327  } \
328  } \
329 } UPRV_BLOCK_MACRO_END
330 
/**
 * Decode the code point at offset i with validation and advance i.
 *
 * Thin wrapper: expands to U8_INTERNAL_NEXT_OR_SUB with U_SENTINEL as the
 * value stored in c for ill-formed input.
 *
 * @param s      Byte string.
 * @param i      Offset of the first byte; advanced by the macro.
 * @param length Limit offset.
 * @param c      Output lvalue that receives the decoded value.
 */
351 #define U8_NEXT(s, i, length, c) U8_INTERNAL_NEXT_OR_SUB(s, i, length, c, U_SENTINEL)
352 
/**
 * Decode the code point at offset i with validation and advance i.
 *
 * Thin wrapper: expands to U8_INTERNAL_NEXT_OR_SUB with 0xfffd as the value
 * stored in c for ill-formed input.
 *
 * @param s      Byte string.
 * @param i      Offset of the first byte; advanced by the macro.
 * @param length Limit offset.
 * @param c      Output lvalue that receives the decoded value.
 */
377 #define U8_NEXT_OR_FFFD(s, i, length, c) U8_INTERNAL_NEXT_OR_SUB(s, i, length, c, 0xfffd)
378 
/**
 * Shared implementation of U8_NEXT and U8_NEXT_OR_FFFD.
 *
 * Reads the byte at i (post-incrementing i) into c. If it is not a single
 * byte, one short-circuiting && chain fetches, validates and assembles the
 * remaining bytes:
 *  - lead below 0xe0: must be at least 0xc2; one trail byte follows.
 *  - lead 0xe0..0xef: the first trail byte is checked against
 *    U8_LEAD3_T1_BITS; one more trail byte follows.
 *  - lead 0xf0 and above: (lead-0xf0) must be at most 4 and the first trail
 *    byte is checked against U8_LEAD4_T1_BITS; two more trail bytes follow.
 * Every further trail byte must satisfy (byte-0x80)<=0x3f, and before each
 * fetch the chain requires (i)!=(length).
 *
 * If any test fails, c is set to sub and i stays where validation stopped:
 * after the lead byte and after any trail bytes accepted so far.
 *
 * The order of the operands matters: c and __t are modified inside the
 * condition and later operands rely on those values.
 *
 * NOTE(review): the bounds test is an inequality, so a length that i never
 * equals (for example a negative one) disables it; presumably that is how
 * NUL-terminated input is supported - confirm against the API documentation.
 *
 * @param s      Byte string.
 * @param i      Offset of the first byte; advanced by the macro.
 * @param length Limit offset, compared with i using !=.
 * @param c      Output lvalue for the result; also used as scratch.
 * @param sub    Value stored in c for ill-formed input.
 */
380 #define U8_INTERNAL_NEXT_OR_SUB(s, i, length, c, sub) UPRV_BLOCK_MACRO_BEGIN { \
381  (c)=(uint8_t)(s)[(i)++]; \
382  if(!U8_IS_SINGLE(c)) { \
383  uint8_t __t = 0; \
384  if((i)!=(length) && \
385  /* fetch/validate/assemble all but last trail byte */ \
386  ((c)>=0xe0 ? \
387  ((c)<0xf0 ? /* U+0800..U+FFFF except surrogates */ \
388  U8_LEAD3_T1_BITS[(c)&=0xf]&(1<<((__t=(s)[i])>>5)) && \
389  (__t&=0x3f, 1) \
390  : /* U+10000..U+10FFFF */ \
391  ((c)-=0xf0)<=4 && \
392  U8_LEAD4_T1_BITS[(__t=(s)[i])>>4]&(1<<(c)) && \
393  ((c)=((c)<<6)|(__t&0x3f), ++(i)!=(length)) && \
394  (__t=(s)[i]-0x80)<=0x3f) && \
395  /* valid second-to-last trail byte */ \
396  ((c)=((c)<<6)|__t, ++(i)!=(length)) \
397  : /* U+0080..U+07FF */ \
398  (c)>=0xc2 && ((c)&=0x1f, 1)) && \
399  /* last trail byte */ \
400  (__t=(s)[i]-0x80)<=0x3f && \
401  ((c)=((c)<<6)|__t, ++(i), 1)) { \
402  } else { \
403  (c)=(sub); /* ill-formed*/ \
404  } \
405  } \
406 } UPRV_BLOCK_MACRO_END
407 
/**
 * Write the UTF-8 encoding of c at offset i and advance i, without checks.
 *
 * Writes 1 byte for values up to 0x7f, 2 up to 0x7ff, 3 up to 0xffff and 4
 * otherwise. Neither the validity of c nor the space available in s is
 * checked. c is evaluated once (copied into a local uint32_t).
 *
 * @param s Destination byte buffer.
 * @param i Write offset; post-incremented once per byte written.
 * @param c Code point to encode.
 */
421 #define U8_APPEND_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
422  uint32_t __uc=(c); \
423  if(__uc<=0x7f) { \
424  (s)[(i)++]=(uint8_t)__uc; \
425  } else { \
426  if(__uc<=0x7ff) { \
427  (s)[(i)++]=(uint8_t)((__uc>>6)|0xc0); \
428  } else { \
429  if(__uc<=0xffff) { \
430  (s)[(i)++]=(uint8_t)((__uc>>12)|0xe0); \
431  } else { \
432  (s)[(i)++]=(uint8_t)((__uc>>18)|0xf0); \
433  (s)[(i)++]=(uint8_t)(((__uc>>12)&0x3f)|0x80); \
434  } \
435  (s)[(i)++]=(uint8_t)(((__uc>>6)&0x3f)|0x80); \
436  } \
437  (s)[(i)++]=(uint8_t)((__uc&0x3f)|0x80); \
438  } \
439 } UPRV_BLOCK_MACRO_END
440 
/**
 * Write the UTF-8 encoding of c at offset i and advance i, with checks.
 *
 * - Values up to 0x7f are written as one byte; capacity is NOT consulted on
 *   this path, so the caller must guarantee room for one byte at i.
 * - 2-, 3- and 4-byte forms are written only if all their bytes fit below
 *   capacity.
 * - Values 0xd800..0xdfff and values above 0x10ffff match no branch.
 * When nothing is written, isError is set to TRUE and i is left unchanged.
 * The macro never clears isError, so the caller must initialise it.
 *
 * @param s        Destination byte buffer.
 * @param i        Write offset; post-incremented once per byte written.
 * @param capacity Size limit of s, used for multi-byte forms.
 * @param c        Code point to encode; evaluated once.
 * @param isError  Output lvalue; set to TRUE on failure, otherwise untouched.
 */
458 #define U8_APPEND(s, i, capacity, c, isError) UPRV_BLOCK_MACRO_BEGIN { \
459  uint32_t __uc=(c); \
460  if(__uc<=0x7f) { \
461  (s)[(i)++]=(uint8_t)__uc; \
462  } else if(__uc<=0x7ff && (i)+1<(capacity)) { \
463  (s)[(i)++]=(uint8_t)((__uc>>6)|0xc0); \
464  (s)[(i)++]=(uint8_t)((__uc&0x3f)|0x80); \
465  } else if((__uc<=0xd7ff || (0xe000<=__uc && __uc<=0xffff)) && (i)+2<(capacity)) { \
466  (s)[(i)++]=(uint8_t)((__uc>>12)|0xe0); \
467  (s)[(i)++]=(uint8_t)(((__uc>>6)&0x3f)|0x80); \
468  (s)[(i)++]=(uint8_t)((__uc&0x3f)|0x80); \
469  } else if(0xffff<__uc && __uc<=0x10ffff && (i)+3<(capacity)) { \
470  (s)[(i)++]=(uint8_t)((__uc>>18)|0xf0); \
471  (s)[(i)++]=(uint8_t)(((__uc>>12)&0x3f)|0x80); \
472  (s)[(i)++]=(uint8_t)(((__uc>>6)&0x3f)|0x80); \
473  (s)[(i)++]=(uint8_t)((__uc&0x3f)|0x80); \
474  } else { \
475  (isError)=TRUE; \
476  } \
477 } UPRV_BLOCK_MACRO_END
478 
/**
 * Advance i past one code point, without validation or bounds checks.
 *
 * Adds 1 plus U8_COUNT_TRAIL_BYTES_UNSAFE of the byte at i; the trail bytes
 * themselves are never inspected.
 *
 * @param s Byte string.
 * @param i Offset of the first byte of a sequence; advanced by 1..4.
 */
489 #define U8_FWD_1_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
490  (i)+=1+U8_COUNT_TRAIL_BYTES_UNSAFE((s)[i]); \
491 } UPRV_BLOCK_MACRO_END
492 
/**
 * Advance i past one code point, with validation and bounds checks.
 *
 * Always consumes the byte at i. If that byte is a lead byte and input
 * remains ((i)!=(length)), trail bytes are consumed one at a time for as
 * long as they are acceptable:
 *  - 2-byte lead (below 0xe0): one byte satisfying U8_IS_TRAIL.
 *  - 3-byte lead (0xe0..0xef): first trail checked with
 *    U8_IS_VALID_LEAD3_AND_T1, then one U8_IS_TRAIL byte.
 *  - 4-byte lead: first trail checked with U8_IS_VALID_LEAD4_AND_T1, then
 *    two U8_IS_TRAIL bytes.
 * On the first unacceptable byte, or when i reaches length, i stops just
 * after the bytes accepted so far.
 *
 * @param s      Byte string.
 * @param i      Offset to advance.
 * @param length Limit offset, compared with i using !=.
 */
506 #define U8_FWD_1(s, i, length) UPRV_BLOCK_MACRO_BEGIN { \
507  uint8_t __b=(s)[(i)++]; \
508  if(U8_IS_LEAD(__b) && (i)!=(length)) { \
509  uint8_t __t1=(s)[i]; \
510  if((0xe0<=__b && __b<0xf0)) { \
511  if(U8_IS_VALID_LEAD3_AND_T1(__b, __t1) && \
512  ++(i)!=(length) && U8_IS_TRAIL((s)[i])) { \
513  ++(i); \
514  } \
515  } else if(__b<0xe0) { \
516  if(U8_IS_TRAIL(__t1)) { \
517  ++(i); \
518  } \
519  } else /* c>=0xf0 */ { \
520  if(U8_IS_VALID_LEAD4_AND_T1(__b, __t1) && \
521  ++(i)!=(length) && U8_IS_TRAIL((s)[i]) && \
522  ++(i)!=(length) && U8_IS_TRAIL((s)[i])) { \
523  ++(i); \
524  } \
525  } \
526  } \
527 } UPRV_BLOCK_MACRO_END
528 
/**
 * Advance i past n code points by applying U8_FWD_1_UNSAFE n times.
 *
 * n is copied into a local int32_t counter; nothing happens when n<=0.
 * No bounds are checked.
 *
 * @param s Byte string.
 * @param i Offset to advance.
 * @param n Number of code points to skip; evaluated once.
 */
541 #define U8_FWD_N_UNSAFE(s, i, n) UPRV_BLOCK_MACRO_BEGIN { \
542  int32_t __N=(n); \
543  while(__N>0) { \
544  U8_FWD_1_UNSAFE(s, i); \
545  --__N; \
546  } \
547 } UPRV_BLOCK_MACRO_END
548 
/**
 * Advance i past up to n code points using U8_FWD_1.
 *
 * The loop ends early when input is exhausted: it continues only while
 * i<length, or - when length is negative - while the byte at i is not 0.
 *
 * @param s      Byte string.
 * @param i      Offset to advance.
 * @param length Limit offset; a negative value selects the stop-at-zero-byte
 *               test instead.
 * @param n      Maximum number of code points to skip; evaluated once.
 */
564 #define U8_FWD_N(s, i, length, n) UPRV_BLOCK_MACRO_BEGIN { \
565  int32_t __N=(n); \
566  while(__N>0 && ((i)<(length) || ((length)<0 && (s)[i]!=0))) { \
567  U8_FWD_1(s, i, length); \
568  --__N; \
569  } \
570 } UPRV_BLOCK_MACRO_END
571 
/**
 * Move i back to the first byte of the sequence it points into, without
 * bounds checks.
 *
 * Decrements i for as long as the byte at i satisfies U8_IS_TRAIL. There is
 * no lower limit, so the text before i must contain a non-trail byte.
 *
 * @param s Byte string.
 * @param i Offset to adjust; unchanged if it is not on a trail byte.
 */
585 #define U8_SET_CP_START_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
586  while(U8_IS_TRAIL((s)[i])) { --(i); } \
587 } UPRV_BLOCK_MACRO_END
588 
/**
 * Move i back to the start of the code point it points into, with checks.
 *
 * Only acts when the byte at i satisfies U8_IS_TRAIL; in that case i is
 * replaced by the result of utf8_back1SafeBody(s, start, i).
 * NOTE(review): utf8_back1SafeBody is defined outside this header; it
 * presumably validates the preceding bytes and never goes below start -
 * confirm against its implementation.
 *
 * @param s     Byte string.
 * @param start Lower bound passed to utf8_back1SafeBody.
 * @param i     Offset to adjust.
 */
606 #define U8_SET_CP_START(s, start, i) UPRV_BLOCK_MACRO_BEGIN { \
607  if(U8_IS_TRAIL((s)[(i)])) { \
608  (i)=utf8_back1SafeBody(s, start, (i)); \
609  } \
610 } UPRV_BLOCK_MACRO_END
611 
/**
 * Shorten length so that the text [start, length) does not end with an
 * incomplete multi-byte sequence.
 *
 * Inspects at most the last three bytes before length:
 *  - last byte is a single byte: nothing to do;
 *  - last byte is a lead byte: length is reduced by 1;
 *  - last two bytes are a 3- or 4-byte lead followed by a first trail byte
 *    that is valid for it: length is reduced by 2;
 *  - last three bytes are a 4-byte lead, a first trail byte valid for it,
 *    and another trail byte: length is reduced by 3.
 * In every other case length is left unchanged. Bytes below start are never
 * read.
 *
 * The parameter s is parenthesized at every use so that an expression
 * argument (for example p+1) is subscripted as a whole.
 *
 * @param s      Byte string.
 * @param start  Offset of the first byte that may be inspected.
 * @param length Modifiable lvalue holding the limit offset; may be reduced
 *               by 1, 2 or 3.
 */
638 #define U8_TRUNCATE_IF_INCOMPLETE(s, start, length) UPRV_BLOCK_MACRO_BEGIN { \
639  if((length)>(start)) { \
640  uint8_t __b1=(s)[(length)-1]; \
641  if(U8_IS_SINGLE(__b1)) { \
642  /* common ASCII character */ \
643  } else if(U8_IS_LEAD(__b1)) { \
644  --(length); \
645  } else if(U8_IS_TRAIL(__b1) && ((length)-2)>=(start)) { \
646  uint8_t __b2=(s)[(length)-2]; \
647  if(0xe0<=__b2 && __b2<=0xf4) { \
648  if(__b2<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(__b2, __b1) : \
649  U8_IS_VALID_LEAD4_AND_T1(__b2, __b1)) { \
650  (length)-=2; \
651  } \
652  } else if(U8_IS_TRAIL(__b2) && ((length)-3)>=(start)) { \
653  uint8_t __b3=(s)[(length)-3]; \
654  if(0xf0<=__b3 && __b3<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(__b3, __b2)) { \
655  (length)-=3; \
656  } \
657  } \
658  } \
659  } \
660 } UPRV_BLOCK_MACRO_END
661 
662 /* definitions with backward iteration -------------------------------------- */
663 
/**
 * Move i back over one code point and decode it, without validation or
 * bounds checks.
 *
 * Pre-decrements i and reads that byte into c. If it is a trail byte, the
 * loop keeps stepping backwards, or-ing the low 6 bits of each further
 * trail byte into c at a shift that grows by 6 each time, until it meets a
 * byte >= 0xc0. That byte is treated as the lead byte: its marker bits are
 * removed with U8_MASK_LEAD_BYTE (using the number of trail bytes seen) and
 * the remainder is or-ed in at the current shift.
 * There is no lower bound, so a lead byte must exist before i.
 *
 * @param s Byte string.
 * @param i Offset just after the code point; ends on its first byte.
 * @param c Output lvalue that receives the decoded value.
 */
683 #define U8_PREV_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
684  (c)=(uint8_t)(s)[--(i)]; \
685  if(U8_IS_TRAIL(c)) { \
686  uint8_t __b, __count=1, __shift=6; \
687 \
688  /* c is a trail byte */ \
689  (c)&=0x3f; \
690  for(;;) { \
691  __b=(s)[--(i)]; \
692  if(__b>=0xc0) { \
693  U8_MASK_LEAD_BYTE(__b, __count); \
694  (c)|=(UChar32)__b<<__shift; \
695  break; \
696  } else { \
697  (c)|=(UChar32)(__b&0x3f)<<__shift; \
698  ++__count; \
699  __shift+=6; \
700  } \
701  } \
702  } \
703 } UPRV_BLOCK_MACRO_END
704 
/**
 * Move i back over one code point and decode it, with checks.
 *
 * Pre-decrements i and reads that byte into c. A single byte is the result
 * as is; anything else is handed to utf8_prevCharSafeBody together with
 * start and the address of i, and c receives that function's result.
 * NOTE(review): the final argument -1 presumably selects the value returned
 * for ill-formed input - confirm against utf8_prevCharSafeBody.
 *
 * The parameter s is parenthesized before the cast so that the cast applies
 * to the whole argument expression.
 *
 * @param s     Byte string.
 * @param start Lower bound passed to utf8_prevCharSafeBody.
 * @param i     Modifiable lvalue offset just after the code point; its
 *              address is taken, so it must be an int32_t object.
 * @param c     Output lvalue that receives the decoded value.
 */
725 #define U8_PREV(s, start, i, c) UPRV_BLOCK_MACRO_BEGIN { \
726  (c)=(uint8_t)(s)[--(i)]; \
727  if(!U8_IS_SINGLE(c)) { \
728  (c)=utf8_prevCharSafeBody((const uint8_t *)(s), start, &(i), c, -1); \
729  } \
730 } UPRV_BLOCK_MACRO_END
731 
/**
 * Move i back over one code point and decode it, with checks.
 *
 * Identical to U8_PREV except for the final argument given to
 * utf8_prevCharSafeBody, which is -3 here.
 * NOTE(review): judging by the macro name, -3 presumably makes the helper
 * return 0xfffd for ill-formed input - confirm against
 * utf8_prevCharSafeBody.
 *
 * The parameter s is parenthesized before the cast so that the cast applies
 * to the whole argument expression.
 *
 * @param s     Byte string.
 * @param start Lower bound passed to utf8_prevCharSafeBody.
 * @param i     Modifiable lvalue offset just after the code point; its
 *              address is taken, so it must be an int32_t object.
 * @param c     Output lvalue that receives the decoded value.
 */
756 #define U8_PREV_OR_FFFD(s, start, i, c) UPRV_BLOCK_MACRO_BEGIN { \
757  (c)=(uint8_t)(s)[--(i)]; \
758  if(!U8_IS_SINGLE(c)) { \
759  (c)=utf8_prevCharSafeBody((const uint8_t *)(s), start, &(i), c, -3); \
760  } \
761 } UPRV_BLOCK_MACRO_END
762 
/**
 * Move i back over one code point, without validation or bounds checks.
 *
 * Pre-decrements i at least once and keeps decrementing while the byte it
 * lands on satisfies U8_IS_TRAIL. There is no lower limit.
 *
 * @param s Byte string.
 * @param i Offset just after the code point; ends on its first byte.
 */
774 #define U8_BACK_1_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
775  while(U8_IS_TRAIL((s)[--(i)])) {} \
776 } UPRV_BLOCK_MACRO_END
777 
/**
 * Move i back over one code point, with checks.
 *
 * Pre-decrements i once. If the byte there satisfies U8_IS_TRAIL, i is
 * replaced by the result of utf8_back1SafeBody(s, start, i).
 * NOTE(review): utf8_back1SafeBody is defined outside this header; it
 * presumably validates the preceding bytes and never goes below start -
 * confirm against its implementation.
 *
 * @param s     Byte string.
 * @param start Lower bound passed to utf8_back1SafeBody.
 * @param i     Offset to move back; decremented at least once.
 */
790 #define U8_BACK_1(s, start, i) UPRV_BLOCK_MACRO_BEGIN { \
791  if(U8_IS_TRAIL((s)[--(i)])) { \
792  (i)=utf8_back1SafeBody(s, start, (i)); \
793  } \
794 } UPRV_BLOCK_MACRO_END
795 
/**
 * Move i back over n code points by applying U8_BACK_1_UNSAFE n times.
 *
 * n is copied into a local int32_t counter; nothing happens when n<=0.
 * No bounds are checked.
 *
 * @param s Byte string.
 * @param i Offset to move back.
 * @param n Number of code points to skip; evaluated once.
 */
809 #define U8_BACK_N_UNSAFE(s, i, n) UPRV_BLOCK_MACRO_BEGIN { \
810  int32_t __N=(n); \
811  while(__N>0) { \
812  U8_BACK_1_UNSAFE(s, i); \
813  --__N; \
814  } \
815 } UPRV_BLOCK_MACRO_END
816 
/**
 * Move i back over up to n code points using U8_BACK_1.
 *
 * The loop ends early once i is no longer greater than start.
 *
 * @param s     Byte string.
 * @param start Lower bound for i.
 * @param i     Offset to move back.
 * @param n     Maximum number of code points to skip; evaluated once.
 */
831 #define U8_BACK_N(s, start, i, n) UPRV_BLOCK_MACRO_BEGIN { \
832  int32_t __N=(n); \
833  while(__N>0 && (i)>(start)) { \
834  U8_BACK_1(s, start, i); \
835  --__N; \
836  } \
837 } UPRV_BLOCK_MACRO_END
838 
/**
 * Adjust i to a code point boundary at or after its current position,
 * without checks.
 *
 * Steps back over one code point with U8_BACK_1_UNSAFE and then forward
 * over one code point with U8_FWD_1_UNSAFE. Because the first step always
 * reads the byte before i, i must be greater than the start of the text.
 *
 * @param s Byte string.
 * @param i Offset to adjust.
 */
852 #define U8_SET_CP_LIMIT_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
853  U8_BACK_1_UNSAFE(s, i); \
854  U8_FWD_1_UNSAFE(s, i); \
855 } UPRV_BLOCK_MACRO_END
856 
/**
 * Adjust i to a code point boundary at or after its current position,
 * with checks.
 *
 * Acts only when start<i and either i<length or length is negative; in that
 * case it steps back with U8_BACK_1 and then forward with U8_FWD_1.
 * Otherwise i is left unchanged.
 *
 * @param s      Byte string.
 * @param start  Lower bound for the backward step.
 * @param i      Offset to adjust.
 * @param length Limit offset for the forward step; a negative value skips
 *               the i<length requirement.
 */
874 #define U8_SET_CP_LIMIT(s, start, i, length) UPRV_BLOCK_MACRO_BEGIN { \
875  if((start)<(i) && ((i)<(length) || (length)<0)) { \
876  U8_BACK_1(s, start, i); \
877  U8_FWD_1(s, i, length); \
878  } \
879 } UPRV_BLOCK_MACRO_END
880 
881 #endif
lzma_index ** i
Definition: index.h:629
static static sync static getppid static getegid const char static filename char static len const char char static bufsiz static mask static vfork const void static prot static getpgrp const char static swapflags static arg static fd static protocol static who struct sockaddr static addrlen static backlog struct timeval struct timezone static tz const struct iovec static count static mode const void const struct sockaddr static tolen const char static pathname void static offset struct stat static buf void long static basep static whence static length const void static len static semflg const void static shmflg const struct timespec struct timespec static rem const char static group const void start
Definition: sflib.h:133
static static sync static getppid static getegid const char static filename char static len const char char static bufsiz static mask static vfork const void static prot static getpgrp const char static swapflags static arg static fd static protocol static who struct sockaddr static addrlen static backlog struct timeval struct timezone static tz const struct iovec static count static mode const void const struct sockaddr static tolen const char static pathname void static offset struct stat static buf void long static basep static whence static length const void static len static semflg const void static shmflg const struct timespec struct timespec static rem const char static group const void length
Definition: sflib.h:133
static RzSocket * s
Definition: rtr.c:28
int int32_t
Definition: sftypes.h:33
unsigned char uint8_t
Definition: sftypes.h:31
#define c(i)
Definition: sha256.c:43
Basic types and constants for UTF.
int32_t UChar32
Definition: umachine.h:424
int8_t UBool
Definition: umachine.h:260
#define U_STABLE
Definition: umachine.h:111
#define U_EXPORT2
Definition: unicode.h:12
U_STABLE int32_t U_EXPORT2 utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c, UBool *pIsError)
U_STABLE UChar32 U_EXPORT2 utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict)
U_STABLE UChar32 U_EXPORT2 utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, UBool strict)
U_STABLE int32_t U_EXPORT2 utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i)