123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484 |
- // © 2016 and later: Unicode, Inc. and others.
- // License & terms of use: http://www.unicode.org/copyright.html
- /*
- ******************************************************************************
- *
- * Copyright (C) 1999-2016, International Business Machines
- * Corporation and others. All Rights Reserved.
- *
- ******************************************************************************
- * file name: ubidiimp.h
- * encoding: UTF-8
- * tab size: 8 (not used)
- * indentation:4
- *
- * created on: 1999aug06
- * created by: Markus W. Scherer, updated by Matitiahu Allouche
- */
- #ifndef UBIDIIMP_H
- #define UBIDIIMP_H
- #include "unicode/utypes.h"
- #include "unicode/ubidi.h"
- #include "unicode/uchar.h"
- #include "ubidi_props.h"
/* miscellaneous definitions ---------------------------------------------- */

/*
 * ICU-20853=ICU-20935: Solaris #defines CS and ES in sys/regset.h.
 * Those macros are dropped here because CS and ES are used as
 * enum constant names further down in this header.
 */
#if defined(CS)
#   undef CS
#endif
#if defined(ES)
#   undef ES
#endif

/* storage type for one directional property (bidi class) value */
typedef uint8_t DirProp;

/* storage type for a bit set built from DIRPROP_FLAG() values */
typedef uint32_t Flags;
- /* Comparing the description of the BiDi algorithm with this implementation
- is easier with the same names for the BiDi types in the code as there.
- See UCharDirection in uchar.h .
- */
- enum {
- L= U_LEFT_TO_RIGHT, /* 0 */
- R= U_RIGHT_TO_LEFT, /* 1 */
- EN= U_EUROPEAN_NUMBER, /* 2 */
- ES= U_EUROPEAN_NUMBER_SEPARATOR, /* 3 */
- ET= U_EUROPEAN_NUMBER_TERMINATOR, /* 4 */
- AN= U_ARABIC_NUMBER, /* 5 */
- CS= U_COMMON_NUMBER_SEPARATOR, /* 6 */
- B= U_BLOCK_SEPARATOR, /* 7 */
- S= U_SEGMENT_SEPARATOR, /* 8 */
- WS= U_WHITE_SPACE_NEUTRAL, /* 9 */
- ON= U_OTHER_NEUTRAL, /* 10 */
- LRE=U_LEFT_TO_RIGHT_EMBEDDING, /* 11 */
- LRO=U_LEFT_TO_RIGHT_OVERRIDE, /* 12 */
- AL= U_RIGHT_TO_LEFT_ARABIC, /* 13 */
- RLE=U_RIGHT_TO_LEFT_EMBEDDING, /* 14 */
- RLO=U_RIGHT_TO_LEFT_OVERRIDE, /* 15 */
- PDF=U_POP_DIRECTIONAL_FORMAT, /* 16 */
- NSM=U_DIR_NON_SPACING_MARK, /* 17 */
- BN= U_BOUNDARY_NEUTRAL, /* 18 */
- FSI=U_FIRST_STRONG_ISOLATE, /* 19 */
- LRI=U_LEFT_TO_RIGHT_ISOLATE, /* 20 */
- RLI=U_RIGHT_TO_LEFT_ISOLATE, /* 21 */
- PDI=U_POP_DIRECTIONAL_ISOLATE, /* 22 */
- ENL, /* EN after W7 */ /* 23 */
- ENR, /* EN not subject to W7 */ /* 24 */
- dirPropCount
- };
/*
 * Sometimes, bit values are more appropriate
 * to deal with directionality properties.
 * Abbreviations in these macro names refer to names
 * used in the BiDi algorithm.
 */

/* bit mask (unsigned long) in which only bit number dir is set */
#define DIRPROP_FLAG(dir) (1UL<<(dir))

/*
 * prop with bits 5..7 (mask 0xE0) cleared.
 * The replacement list must end right after the closing parenthesis: a run of
 * stray '?' characters after it would be pasted into every expansion and turn
 * any use of the macro into a syntax error.
 * NOTE(review): nothing visible in this header uses this macro, and the
 * meaning of the three cleared bits is not documented here - confirm before
 * relying on it.
 */
#define PURE_DIRPROP(prop) ((prop)&~0xE0)
/* special flag: explicit embedding codes lead to multiple runs */
#define DIRPROP_FLAG_MULTI_RUNS (1UL<<31)

/* masks answering "are there any characters that are LTR / RTL?" */
#define MASK_LTR \
    (DIRPROP_FLAG(L)|DIRPROP_FLAG(EN)|DIRPROP_FLAG(ENL)|DIRPROP_FLAG(ENR)| \
     DIRPROP_FLAG(AN)|DIRPROP_FLAG(LRE)|DIRPROP_FLAG(LRO)|DIRPROP_FLAG(LRI))
#define MASK_RTL \
    (DIRPROP_FLAG(R)|DIRPROP_FLAG(AL)|DIRPROP_FLAG(RLE)|DIRPROP_FLAG(RLO)| \
     DIRPROP_FLAG(RLI))
#define MASK_R_AL \
    (DIRPROP_FLAG(R)|DIRPROP_FLAG(AL))
#define MASK_STRONG_EN_AN \
    (DIRPROP_FLAG(L)|DIRPROP_FLAG(R)|DIRPROP_FLAG(AL)|DIRPROP_FLAG(EN)| \
     DIRPROP_FLAG(AN))

/* the explicit embedding codes */
#define MASK_EXPLICIT \
    (DIRPROP_FLAG(LRE)|DIRPROP_FLAG(LRO)|DIRPROP_FLAG(RLE)|DIRPROP_FLAG(RLO)| \
     DIRPROP_FLAG(PDF))

/* the explicit isolate codes */
#define MASK_ISO \
    (DIRPROP_FLAG(LRI)|DIRPROP_FLAG(RLI)|DIRPROP_FLAG(FSI)|DIRPROP_FLAG(PDI))

/* BN together with the explicit embedding codes */
#define MASK_BN_EXPLICIT \
    (DIRPROP_FLAG(BN)|MASK_EXPLICIT)

/* paragraph and segment separators */
#define MASK_B_S \
    (DIRPROP_FLAG(B)|DIRPROP_FLAG(S))

/* every type that some step counts as White Space or Neutral */
#define MASK_WS \
    (MASK_B_S|DIRPROP_FLAG(WS)|MASK_BN_EXPLICIT|MASK_ISO)

/* types that are neutrals, or that could become neutrals in (Wn) */
#define MASK_POSSIBLE_N \
    (DIRPROP_FLAG(ON)|DIRPROP_FLAG(CS)|DIRPROP_FLAG(ES)|DIRPROP_FLAG(ET)| \
     MASK_WS)

/*
 * Types that the BiDi algorithm (N2) may change to "e",
 * the embedding type (L or R) of the run.
 */
#define MASK_EMBEDDING \
    (DIRPROP_FLAG(NSM)|MASK_POSSIBLE_N)
/*
 * Direction (L or R) that belongs to an embedding level: the low bit of the
 * level. This works because UCharDirection defines L as 0 and R as 1.
 */
#define GET_LR_FROM_LEVEL(level) ((DirProp)(1&(level)))

/*
 * True for level values of 0xfe and above.
 * NOTE(review): presumably these are the UBIDI_DEFAULT_xxx pseudo-levels
 * mentioned at UBiDi.defaultParaLevel - confirm against ubidi.h.
 */
#define IS_DEFAULT_LEVEL(level) (0xfe<=(level))

/*
 * Bit that carries the directional isolate status.
 * Stack entries that correspond to isolate sequences are greater than ISOLATE.
 */
#define ISOLATE 0x0100
- U_CFUNC UBiDiLevel
- ubidi_getParaLevelAtIndex(const UBiDi *pBiDi, int32_t index);
- #define GET_PARALEVEL(ubidi, index) \
- ((UBiDiLevel)(!(ubidi)->defaultParaLevel || (index)<(ubidi)->paras[0].limit ? \
- (ubidi)->paraLevel : ubidi_getParaLevelAtIndex((ubidi), (index))))
/* entry counts of the arrays that are available initially without malloc */
#define SIMPLE_PARAS_COUNT    10    /* paras entries */
#define SIMPLE_ISOLATES_COUNT 5     /* isolate entries */
#define SIMPLE_OPENINGS_COUNT 20    /* isolate run entries for paired brackets */

/* code points U+000D and U+000A */
#define CR 0x000D
#define LF 0x000A
/* Run structure for reordering --------------------------------------------- */

/* single-bit flags: which mark (LRM or RLM) goes before or after */
enum {
    LRM_BEFORE = 1<<0,
    LRM_AFTER  = 1<<1,
    RLM_BEFORE = 1<<2,
    RLM_AFTER  = 1<<3
};
/* one entry of the paras array: a limit and a level, both 32-bit */
typedef struct Para Para;
struct Para {
    int32_t limit;  /* GET_PARALEVEL() compares text indexes against this */
    int32_t level;
};
- enum { /* flags for Opening.flags */
- FOUND_L=DIRPROP_FLAG(L),
- FOUND_R=DIRPROP_FLAG(R)
- };
- typedef struct Opening {
- int32_t position; /* position of opening bracket */
- int32_t match; /* matching char or -position of closing bracket */
- int32_t contextPos; /* position of last strong char found before opening */
- uint16_t flags; /* bits for L or R/AL found within the pair */
- UBiDiDirection contextDir; /* L or R according to last strong char before opening */
- uint8_t filler; /* to complete a nice multiple of 4 chars */
- } Opening;
- typedef struct IsoRun {
- int32_t contextPos; /* position of char determining context */
- uint16_t start; /* index of first opening entry for this run */
- uint16_t limit; /* index after last opening entry for this run */
- UBiDiLevel level; /* level of this run */
- DirProp lastStrong; /* bidi class of last strong char found in this run */
- DirProp lastBase; /* bidi class of last base char found in this run */
- UBiDiDirection contextDir; /* L or R to use as context for following openings */
- } IsoRun;
- typedef struct BracketData {
- UBiDi *pBiDi;
- /* array of opening entries which should be enough in most cases; no malloc() */
- Opening simpleOpenings[SIMPLE_OPENINGS_COUNT];
- Opening *openings; /* pointer to current array of entries */
- int32_t openingsCount; /* number of allocated entries */
- int32_t isoRunLast; /* index of last used entry */
- /* array of nested isolated sequence entries; can never excess UBIDI_MAX_EXPLICIT_LEVEL
- + 1 for index 0, + 1 for before the first isolated sequence */
- IsoRun isoRuns[UBIDI_MAX_EXPLICIT_LEVEL+2];
- UBool isNumbersSpecial; /* reordering mode for NUMBERS_SPECIAL */
- } BracketData;
/*
 * One entry of the isolates stack (see UBiDi.isolates / simpleIsolates).
 * NOTE(review): the field names suggest saved positions and state-machine
 * state of the level resolution - confirm against the code that fills them.
 */
typedef struct Isolate Isolate;
struct Isolate {
    int32_t startON;
    int32_t start1;
    int32_t state;
    int16_t stateImp;
};
/* one directional run */
typedef struct Run {
    /* first character of the run; b31 indicates even/odd level */
    int32_t logicalStart;
    /* last visual position of the run +1 */
    int32_t visualLimit;
    /*
     * if >0, flags for inserting LRM/RLM before/after run,
     * if <0, count of bidi controls within run
     */
    int32_t insertRemove;
} Run;
/*
 * In a Run, logicalStart gets bit 31 set if the run level is odd.
 * The macros below set, test and strip that bit.
 */
#define INDEX_ODD_BIT (1UL<<31)

/*
 * index with bit 31 set when level is odd.
 * The low bit of level is shifted as uint32_t and only then converted to
 * int32_t: shifting a signed 1 into the sign bit is undefined behavior in C.
 */
#define MAKE_INDEX_ODD_PAIR(index, level) ((index)|((int32_t)((uint32_t)((level)&1)<<31)))
/* same as above, but ORs the bit into the lvalue x in place */
#define ADD_ODD_BIT_FROM_LEVEL(x, level) ((x)|=((int32_t)((uint32_t)((level)&1)<<31)))
/* clears bit 31 of the lvalue x in place */
#define REMOVE_ODD_BIT(x) ((x)&=~INDEX_ODD_BIT)
/*
 * x with bit 31 cleared.
 * NOTE(review): INDEX_ODD_BIT has type unsigned long, so the result has that
 * type too. Where unsigned long is wider than 32 bits, a negative int32_t x
 * yields a value with the upper bits set until it is narrowed again; callers
 * should store the result in an int32_t before using it.
 */
#define GET_INDEX(x) ((x)&~INDEX_ODD_BIT)
/* bit 31 of x, as 0 or 1 */
#define GET_ODD_BIT(x) ((uint32_t)(x)>>31)
/* is bit 31 of x set / clear? */
#define IS_ODD_RUN(x) ((UBool)(((x)&INDEX_ODD_BIT)!=0))
#define IS_EVEN_RUN(x) ((UBool)(((x)&INDEX_ODD_BIT)==0))
- U_CFUNC UBool
- ubidi_getRuns(UBiDi *pBiDi, UErrorCode *pErrorCode);
/** BiDi control code points */
enum {
    ZWNJ_CHAR = 0x200c,
    ZWJ_CHAR  = 0x200d,
    LRM_CHAR  = 0x200e,
    RLM_CHAR  = 0x200f,

    LRE_CHAR  = 0x202a,
    RLE_CHAR  = 0x202b,
    PDF_CHAR  = 0x202c,
    LRO_CHAR  = 0x202d,
    RLO_CHAR  = 0x202e,

    LRI_CHAR  = 0x2066,
    RLI_CHAR  = 0x2067,
    FSI_CHAR  = 0x2068,
    PDI_CHAR  = 0x2069
};

/*
 * True when c lies in one of the three ranges listed above:
 * ZWNJ_CHAR..RLM_CHAR (four code points that differ only in the low two bits),
 * LRE_CHAR..RLO_CHAR (five code points) or LRI_CHAR..PDI_CHAR (four).
 * The unsigned subtraction makes values below a range start wrap to large
 * numbers, so one comparison checks both ends. c may be evaluated up to
 * three times.
 */
#define IS_BIDI_CONTROL_CHAR(c) \
    (((uint32_t)(c)&0xfffffffc)==ZWNJ_CHAR || \
     (uint32_t)((c)-LRE_CHAR)<5 || \
     (uint32_t)((c)-LRI_CHAR)<4)
/* InsertPoints structure for noting where to put BiDi marks ---------------- */

/* one noted insertion */
typedef struct Point Point;
struct Point {
    /* position in text */
    int32_t pos;
    /* flag for LRM/RLM, before/after */
    int32_t flag;
};
- typedef struct InsertPoints {
- int32_t capacity; /* number of points allocated */
- int32_t size; /* number of points used */
- int32_t confirmed; /* number of points confirmed */
- UErrorCode errorCode; /* for eventual memory shortage */
- Point *points; /* pointer to array of points */
- } InsertPoints;
- /* UBiDi structure ----------------------------------------------------------- */
- struct UBiDi {
- /* pointer to parent paragraph object (pointer to self if this object is
- * a paragraph object); set to NULL in a newly opened object; set to a
- * real value after a successful execution of ubidi_setPara or ubidi_setLine
- */
- const UBiDi * pParaBiDi;
- /* alias pointer to the current text */
- const UChar *text;
- /* length of the current text */
- int32_t originalLength;
- /* if the UBIDI_OPTION_STREAMING option is set, this is the length
- * of text actually processed by ubidi_setPara, which may be shorter than
- * the original length.
- * Otherwise, it is identical to the original length.
- */
- int32_t length;
- /* if the UBIDI_OPTION_REMOVE_CONTROLS option is set, and/or
- * marks are allowed to be inserted in one of the reordering mode, the
- * length of the result string may be different from the processed length.
- */
- int32_t resultLength;
- /* memory sizes in bytes */
- int32_t dirPropsSize, levelsSize, openingsSize, parasSize, runsSize, isolatesSize;
- /* allocated memory */
- DirProp *dirPropsMemory;
- UBiDiLevel *levelsMemory;
- Opening *openingsMemory;
- Para *parasMemory;
- Run *runsMemory;
- Isolate *isolatesMemory;
- /* indicators for whether memory may be allocated after ubidi_open() */
- UBool mayAllocateText, mayAllocateRuns;
- /* arrays with one value per text-character */
- DirProp *dirProps;
- UBiDiLevel *levels;
- /* are we performing an approximation of the "inverse BiDi" algorithm? */
- UBool isInverse;
- /* are we using the basic algorithm or its variation? */
- UBiDiReorderingMode reorderingMode;
- /* UBIDI_REORDER_xxx values must be ordered so that all the regular
- * logical to visual modes come first, and all inverse BiDi modes
- * come last.
- */
- #define UBIDI_REORDER_LAST_LOGICAL_TO_VISUAL UBIDI_REORDER_NUMBERS_SPECIAL
- /* bitmask for reordering options */
- uint32_t reorderingOptions;
- /* must block separators receive level 0? */
- UBool orderParagraphsLTR;
- /* the paragraph level */
- UBiDiLevel paraLevel;
- /* original paraLevel when contextual */
- /* must be one of UBIDI_DEFAULT_xxx or 0 if not contextual */
- UBiDiLevel defaultParaLevel;
- /* context data */
- const UChar *prologue;
- int32_t proLength;
- const UChar *epilogue;
- int32_t epiLength;
- /* the following is set in ubidi_setPara, used in processPropertySeq */
- const struct ImpTabPair * pImpTabPair; /* pointer to levels state table pair */
- /* the overall paragraph or line directionality - see UBiDiDirection */
- UBiDiDirection direction;
- /* flags is a bit set for which directional properties are in the text */
- Flags flags;
- /* lastArabicPos is index to the last AL in the text, -1 if none */
- int32_t lastArabicPos;
- /* characters after trailingWSStart are WS and are */
- /* implicitly at the paraLevel (rule (L1)) - levels may not reflect that */
- int32_t trailingWSStart;
- /* fields for paragraph handling */
- int32_t paraCount; /* set in getDirProps() */
- /* filled in getDirProps() */
- Para *paras;
- /* for relatively short text, we only need a tiny array of paras (no malloc()) */
- Para simpleParas[SIMPLE_PARAS_COUNT];
- /* fields for line reordering */
- int32_t runCount; /* ==-1: runs not set up yet */
- Run *runs;
- /* for non-mixed text, we only need a tiny array of runs (no malloc()) */
- Run simpleRuns[1];
- /* maximum or current nesting depth of isolate sequences */
- /* Within resolveExplicitLevels() and checkExplicitLevels(), this is the maximal
- nesting encountered.
- Within resolveImplicitLevels(), this is the index of the current isolates
- stack entry. */
- int32_t isolateCount;
- Isolate *isolates;
- /* for simple text, have a small stack (no malloc()) */
- Isolate simpleIsolates[SIMPLE_ISOLATES_COUNT];
- /* for inverse Bidi with insertion of directional marks */
- InsertPoints insertPoints;
- /* for option UBIDI_OPTION_REMOVE_CONTROLS */
- int32_t controlCount;
- /* for Bidi class callback */
- UBiDiClassCallback *fnClassCallback; /* action pointer */
- const void *coClassCallback; /* context pointer */
- };
/* x is non-null and is its own parent, i.e. it is a paragraph object */
#define IS_VALID_PARA(x) \
    ((x) && ((x)->pParaBiDi==(x)))

/*
 * x is non-null and either is a paragraph object itself or has a parent
 * that is one. x is evaluated several times.
 */
#define IS_VALID_PARA_OR_LINE(x) \
    ((x) && ((x)->pParaBiDi==(x) || IS_VALID_PARA((x)->pParaBiDi)))
- typedef union {
- DirProp *dirPropsMemory;
- UBiDiLevel *levelsMemory;
- Opening *openingsMemory;
- Para *parasMemory;
- Run *runsMemory;
- Isolate *isolatesMemory;
- } BidiMemoryForAllocation;
/*
 * Macros for initial checks at function entry.
 * Each one returns retvalue from the calling function when its check fails.
 * Macro parameters are parenthesized wherever they are used as operands so
 * that an argument which is itself an expression keeps its meaning.
 */

/* leave when pErrcode is NULL or *pErrcode already holds a failure */
#define RETURN_IF_NULL_OR_FAILING_ERRCODE(pErrcode, retvalue) UPRV_BLOCK_MACRO_BEGIN { \
    if((pErrcode)==NULL || U_FAILURE(*(pErrcode))) return retvalue; \
} UPRV_BLOCK_MACRO_END

/* leave with U_INVALID_STATE_ERROR in errcode unless bidi is a valid paragraph object */
#define RETURN_IF_NOT_VALID_PARA(bidi, errcode, retvalue) UPRV_BLOCK_MACRO_BEGIN { \
    if(!IS_VALID_PARA(bidi)) { \
        (errcode)=U_INVALID_STATE_ERROR; \
        return retvalue; \
    } \
} UPRV_BLOCK_MACRO_END

/* leave with U_INVALID_STATE_ERROR in errcode unless bidi is a valid paragraph or line object */
#define RETURN_IF_NOT_VALID_PARA_OR_LINE(bidi, errcode, retvalue) UPRV_BLOCK_MACRO_BEGIN { \
    if(!IS_VALID_PARA_OR_LINE(bidi)) { \
        (errcode)=U_INVALID_STATE_ERROR; \
        return retvalue; \
    } \
} UPRV_BLOCK_MACRO_END

/* leave with U_ILLEGAL_ARGUMENT_ERROR in errcode unless start<=arg<limit */
#define RETURN_IF_BAD_RANGE(arg, start, limit, errcode, retvalue) UPRV_BLOCK_MACRO_BEGIN { \
    if((arg)<(start) || (arg)>=(limit)) { \
        (errcode)=U_ILLEGAL_ARGUMENT_ERROR; \
        return retvalue; \
    } \
} UPRV_BLOCK_MACRO_END
/*
 * Entry checks for functions that return void: each macro executes a plain
 * "return;" when its check fails. Macro parameters are parenthesized wherever
 * they are used as operands so that an argument which is itself an
 * expression keeps its meaning.
 */

/* leave when pErrcode is NULL or *pErrcode already holds a failure */
#define RETURN_VOID_IF_NULL_OR_FAILING_ERRCODE(pErrcode) UPRV_BLOCK_MACRO_BEGIN { \
    if((pErrcode)==NULL || U_FAILURE(*(pErrcode))) return; \
} UPRV_BLOCK_MACRO_END

/* leave with U_INVALID_STATE_ERROR in errcode unless bidi is a valid paragraph object */
#define RETURN_VOID_IF_NOT_VALID_PARA(bidi, errcode) UPRV_BLOCK_MACRO_BEGIN { \
    if(!IS_VALID_PARA(bidi)) { \
        (errcode)=U_INVALID_STATE_ERROR; \
        return; \
    } \
} UPRV_BLOCK_MACRO_END

/* leave with U_INVALID_STATE_ERROR in errcode unless bidi is a valid paragraph or line object */
#define RETURN_VOID_IF_NOT_VALID_PARA_OR_LINE(bidi, errcode) UPRV_BLOCK_MACRO_BEGIN { \
    if(!IS_VALID_PARA_OR_LINE(bidi)) { \
        (errcode)=U_INVALID_STATE_ERROR; \
        return; \
    } \
} UPRV_BLOCK_MACRO_END

/* leave with U_ILLEGAL_ARGUMENT_ERROR in errcode unless start<=arg<limit */
#define RETURN_VOID_IF_BAD_RANGE(arg, start, limit, errcode) UPRV_BLOCK_MACRO_BEGIN { \
    if((arg)<(start) || (arg)>=(limit)) { \
        (errcode)=U_ILLEGAL_ARGUMENT_ERROR; \
        return; \
    } \
} UPRV_BLOCK_MACRO_END
- /* helper function to (re)allocate memory if allowed */
- U_CFUNC UBool
- ubidi_getMemory(BidiMemoryForAllocation *pMemory, int32_t *pSize, UBool mayAllocate, int32_t sizeNeeded);
/*
 * Helper macros for each allocated array in UBiDi.
 * Each forwards to ubidi_getMemory() with the array's pointer and size
 * fields and with the mayAllocate... flag stored in the object. For the
 * dirProps and levels arrays length is passed on unscaled; for the runs
 * array it is multiplied by sizeof(Run).
 */
#define getDirPropsMemory(pBiDi, length) \
    ubidi_getMemory( \
        (BidiMemoryForAllocation *)&(pBiDi)->dirPropsMemory, \
        &(pBiDi)->dirPropsSize, \
        (pBiDi)->mayAllocateText, \
        (length))

#define getLevelsMemory(pBiDi, length) \
    ubidi_getMemory( \
        (BidiMemoryForAllocation *)&(pBiDi)->levelsMemory, \
        &(pBiDi)->levelsSize, \
        (pBiDi)->mayAllocateText, \
        (length))

#define getRunsMemory(pBiDi, length) \
    ubidi_getMemory( \
        (BidiMemoryForAllocation *)&(pBiDi)->runsMemory, \
        &(pBiDi)->runsSize, \
        (pBiDi)->mayAllocateRuns, \
        (length)*sizeof(Run))
/*
 * Additional macros used by ubidi_open() - always allow allocation.
 * They forward to ubidi_getMemory() with mayAllocate fixed to true. For the
 * dirProps and levels arrays length is passed on unscaled; for the other
 * arrays it is multiplied by the size of one element.
 */
#define getInitialDirPropsMemory(pBiDi, length) \
    ubidi_getMemory( \
        (BidiMemoryForAllocation *)&(pBiDi)->dirPropsMemory, \
        &(pBiDi)->dirPropsSize, \
        true, \
        (length))

#define getInitialLevelsMemory(pBiDi, length) \
    ubidi_getMemory( \
        (BidiMemoryForAllocation *)&(pBiDi)->levelsMemory, \
        &(pBiDi)->levelsSize, \
        true, \
        (length))

#define getInitialOpeningsMemory(pBiDi, length) \
    ubidi_getMemory( \
        (BidiMemoryForAllocation *)&(pBiDi)->openingsMemory, \
        &(pBiDi)->openingsSize, \
        true, \
        (length)*sizeof(Opening))

#define getInitialParasMemory(pBiDi, length) \
    ubidi_getMemory( \
        (BidiMemoryForAllocation *)&(pBiDi)->parasMemory, \
        &(pBiDi)->parasSize, \
        true, \
        (length)*sizeof(Para))

#define getInitialRunsMemory(pBiDi, length) \
    ubidi_getMemory( \
        (BidiMemoryForAllocation *)&(pBiDi)->runsMemory, \
        &(pBiDi)->runsSize, \
        true, \
        (length)*sizeof(Run))

#define getInitialIsolatesMemory(pBiDi, length) \
    ubidi_getMemory( \
        (BidiMemoryForAllocation *)&(pBiDi)->isolatesMemory, \
        &(pBiDi)->isolatesSize, \
        true, \
        (length)*sizeof(Isolate))
- #endif
|