123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709 |
- // © 2016 and later: Unicode, Inc. and others.
- // License & terms of use: http://www.unicode.org/copyright.html
- /*
- *******************************************************************************
- *
- * Copyright (C) 2002-2011 International Business Machines
- * Corporation and others. All Rights Reserved.
- *
- *******************************************************************************
- * file name: uiter.h
- * encoding: UTF-8
- * tab size: 8 (not used)
- * indentation:4
- *
- * created on: 2002jan18
- * created by: Markus W. Scherer
- */
- #ifndef __UITER_H__
- #define __UITER_H__
- /**
- * \file
- * \brief C API: Unicode Character Iteration
- *
- * @see UCharIterator
- */
- #include "unicode/utypes.h"
- #if U_SHOW_CPLUSPLUS_API
- U_NAMESPACE_BEGIN
- class CharacterIterator;
- class Replaceable;
- U_NAMESPACE_END
- #endif
- U_CDECL_BEGIN
- struct UCharIterator;
- typedef struct UCharIterator UCharIterator; /**< C typedef for struct UCharIterator. @stable ICU 2.1 */
- /**
- * Origin constants for UCharIterator.getIndex() and UCharIterator.move().
- * @see UCharIteratorMove
- * @see UCharIterator
- * @stable ICU 2.1
- */
- typedef enum UCharIteratorOrigin {
- UITER_START, UITER_CURRENT, UITER_LIMIT, UITER_ZERO, UITER_LENGTH
- } UCharIteratorOrigin;
- /** Constants for UCharIterator. @stable ICU 2.6 */
- enum {
- /**
- * Constant value that may be returned by UCharIteratorMove
- * indicating that the final UTF-16 index is not known, but that the move succeeded.
- * This can occur when moving relative to limit or length, or
- * when moving relative to the current index after a setState()
- * when the current UTF-16 index is not known.
- *
- * It would be very inefficient to have to count from the beginning of the text
- * just to get the current/limit/length index after moving relative to it.
- * The actual index can be determined with getIndex(UITER_CURRENT)
- * which will count the UChars if necessary.
- *
- * @stable ICU 2.6
- */
- UITER_UNKNOWN_INDEX=-2
- };
- /**
- * Constant for UCharIterator getState() indicating an error or
- * an unknown state.
- * Returned by uiter_getState()/UCharIteratorGetState
- * when an error occurs.
- * Also, some UCharIterator implementations may not be able to return
- * a valid state for each position. This will be clearly documented
- * for each such iterator (none of the public ones here).
- *
- * @stable ICU 2.6
- */
- #define UITER_NO_STATE ((uint32_t)0xffffffff)
- /**
- * Function type declaration for UCharIterator.getIndex().
- *
- * Gets the current position, or the start or limit of the
- * iteration range.
- *
- * This function may perform slowly for UITER_CURRENT after setState() was called,
- * or for UITER_LENGTH, because an iterator implementation may have to count
- * UChars if the underlying storage is not UTF-16.
- *
- * @param iter the UCharIterator structure ("this pointer")
- * @param origin get the 0, start, limit, length, or current index
- * @return the requested index, or U_SENTINEL in an error condition
- *
- * @see UCharIteratorOrigin
- * @see UCharIterator
- * @stable ICU 2.1
- */
- typedef int32_t U_CALLCONV
- UCharIteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin);
- /**
- * Function type declaration for UCharIterator.move().
- *
- * Use iter->move(iter, index, UITER_ZERO) like CharacterIterator::setIndex(index).
- *
- * Moves the current position relative to the start or limit of the
- * iteration range, or relative to the current position itself.
- * The movement is expressed in numbers of code units forward
- * or backward by specifying a positive or negative delta.
- * Out of bounds movement will be pinned to the start or limit.
- *
- * This function may perform slowly for moving relative to UITER_LENGTH
- * because an iterator implementation may have to count the rest of the
- * UChars if the native storage is not UTF-16.
- *
- * When moving relative to the limit or length, or
- * relative to the current position after setState() was called,
- * move() may return UITER_UNKNOWN_INDEX (-2) to avoid an inefficient
- * determination of the actual UTF-16 index.
- * The actual index can be determined with getIndex(UITER_CURRENT)
- * which will count the UChars if necessary.
- * See UITER_UNKNOWN_INDEX for details.
- *
- * @param iter the UCharIterator structure ("this pointer")
- * @param delta can be positive, zero, or negative
- * @param origin move relative to the 0, start, limit, length, or current index
- * @return the new index, or U_SENTINEL on an error condition,
- * or UITER_UNKNOWN_INDEX when the index is not known.
- *
- * @see UCharIteratorOrigin
- * @see UCharIterator
- * @see UITER_UNKNOWN_INDEX
- * @stable ICU 2.1
- */
- typedef int32_t U_CALLCONV
- UCharIteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin);
- /**
- * Function type declaration for UCharIterator.hasNext().
- *
- * Check if current() and next() can still
- * return another code unit.
- *
- * @param iter the UCharIterator structure ("this pointer")
- * @return boolean value for whether current() and next() can still return another code unit
- *
- * @see UCharIterator
- * @stable ICU 2.1
- */
- typedef UBool U_CALLCONV
- UCharIteratorHasNext(UCharIterator *iter);
- /**
- * Function type declaration for UCharIterator.hasPrevious().
- *
- * Check if previous() can still return another code unit.
- *
- * @param iter the UCharIterator structure ("this pointer")
- * @return boolean value for whether previous() can still return another code unit
- *
- * @see UCharIterator
- * @stable ICU 2.1
- */
- typedef UBool U_CALLCONV
- UCharIteratorHasPrevious(UCharIterator *iter);
-
- /**
- * Function type declaration for UCharIterator.current().
- *
- * Return the code unit at the current position,
- * or U_SENTINEL if there is none (index is at the limit).
- *
- * @param iter the UCharIterator structure ("this pointer")
- * @return the current code unit
- *
- * @see UCharIterator
- * @stable ICU 2.1
- */
- typedef UChar32 U_CALLCONV
- UCharIteratorCurrent(UCharIterator *iter);
- /**
- * Function type declaration for UCharIterator.next().
- *
- * Return the code unit at the current index and increment
- * the index (post-increment, like s[i++]),
- * or return U_SENTINEL if there is none (index is at the limit).
- *
- * @param iter the UCharIterator structure ("this pointer")
- * @return the current code unit (and post-increment the current index)
- *
- * @see UCharIterator
- * @stable ICU 2.1
- */
- typedef UChar32 U_CALLCONV
- UCharIteratorNext(UCharIterator *iter);
- /**
- * Function type declaration for UCharIterator.previous().
- *
- * Decrement the index and return the code unit from there
- * (pre-decrement, like s[--i]),
- * or return U_SENTINEL if there is none (index is at the start).
- *
- * @param iter the UCharIterator structure ("this pointer")
- * @return the previous code unit (after pre-decrementing the current index)
- *
- * @see UCharIterator
- * @stable ICU 2.1
- */
- typedef UChar32 U_CALLCONV
- UCharIteratorPrevious(UCharIterator *iter);
- /**
- * Function type declaration for UCharIterator.reservedFn().
- * Reserved for future use.
- *
- * @param iter the UCharIterator structure ("this pointer")
- * @param something some integer argument
- * @return some integer
- *
- * @see UCharIterator
- * @stable ICU 2.1
- */
- typedef int32_t U_CALLCONV
- UCharIteratorReserved(UCharIterator *iter, int32_t something);
- /**
- * Function type declaration for UCharIterator.getState().
- *
- * Get the "state" of the iterator in the form of a single 32-bit word.
- * It is recommended that the state value be calculated to be as small as
- * is feasible. For strings with limited lengths, fewer than 32 bits may
- * be sufficient.
- *
- * This is used together with setState()/UCharIteratorSetState
- * to save and restore the iterator position more efficiently than with
- * getIndex()/move().
- *
- * The iterator state is defined as a uint32_t value because it is designed
- * for use in ucol_nextSortKeyPart() which provides 32 bits to store the state
- * of the character iterator.
- *
- * With some UCharIterator implementations (e.g., UTF-8),
- * getting and setting the UTF-16 index with existing functions
- * (getIndex(UITER_CURRENT) followed by move(pos, UITER_ZERO)) is possible but
- * relatively slow because the iterator has to "walk" from a known index
- * to the requested one.
- * This takes more time the farther it needs to go.
- *
- * An opaque state value allows an iterator implementation to provide
- * an internal index (UTF-8: the source byte array index) for
- * fast, constant-time restoration.
- *
- * After calling setState(), a getIndex(UITER_CURRENT) may be slow because
- * the UTF-16 index may not be restored as well, but the iterator can deliver
- * the correct text contents and move relative to the current position
- * without performance degradation.
- *
- * Some UCharIterator implementations may not be able to return
- * a valid state for each position, in which case they return UITER_NO_STATE instead.
- * This will be clearly documented for each such iterator (none of the public ones here).
- *
- * @param iter the UCharIterator structure ("this pointer")
- * @return the state word
- *
- * @see UCharIterator
- * @see UCharIteratorSetState
- * @see UITER_NO_STATE
- * @stable ICU 2.6
- */
- typedef uint32_t U_CALLCONV
- UCharIteratorGetState(const UCharIterator *iter);
- /**
- * Function type declaration for UCharIterator.setState().
- *
- * Restore the "state" of the iterator using a state word from a getState() call.
- * The iterator object need not be the same one as for which getState() was called,
- * but it must be of the same type (set up using the same uiter_setXYZ function)
- * and it must iterate over the same string
- * (binary identical regardless of memory address).
- * For more about the state word see UCharIteratorGetState.
- *
- * After calling setState(), a getIndex(UITER_CURRENT) may be slow because
- * the UTF-16 index may not be restored as well, but the iterator can deliver
- * the correct text contents and move relative to the current position
- * without performance degradation.
- *
- * @param iter the UCharIterator structure ("this pointer")
- * @param state the state word from a getState() call
- * on a same-type, same-string iterator
- * @param pErrorCode Must be a valid pointer to an error code value,
- * which must not indicate a failure before the function call.
- *
- * @see UCharIterator
- * @see UCharIteratorGetState
- * @stable ICU 2.6
- */
- typedef void U_CALLCONV
- UCharIteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode);
- /**
- * C API for code unit iteration.
- * This can be used as a C wrapper around
- * CharacterIterator, Replaceable, or implemented using simple strings, etc.
- *
- * There are two roles for using UCharIterator:
- *
- * A "provider" sets the necessary function pointers and controls the "protected"
- * fields of the UCharIterator structure. A "provider" passes a UCharIterator
- * into C APIs that need a UCharIterator as an abstract, flexible string interface.
- *
- * Implementations of such C APIs are "callers" of UCharIterator functions;
- * they only use the "public" function pointers and never access the "protected"
- * fields directly.
- *
- * The current() and next() functions only check the current index against the
- * limit, and previous() only checks the current index against the start,
- * to see if the iterator already reached the end of the iteration range.
- *
- * The assumption - in all iterators - is that the index is moved via the API,
- * which means it won't go out of bounds, or the index is modified by
- * user code that knows enough about the iterator implementation to set valid
- * index values.
- *
- * UCharIterator functions return code unit values 0..0xffff,
- * or U_SENTINEL if the iteration bounds are reached.
- *
- * @stable ICU 2.1
- */
- struct UCharIterator {
- /**
- * (protected) Pointer to string or wrapped object or similar.
- * Not used by caller.
- * @stable ICU 2.1
- */
- const void *context;
- /**
- * (protected) Length of string or similar.
- * Not used by caller.
- * @stable ICU 2.1
- */
- int32_t length;
- /**
- * (protected) Start index or similar.
- * Not used by caller.
- * @stable ICU 2.1
- */
- int32_t start;
- /**
- * (protected) Current index or similar.
- * Not used by caller.
- * @stable ICU 2.1
- */
- int32_t index;
- /**
- * (protected) Limit index or similar.
- * Not used by caller.
- * @stable ICU 2.1
- */
- int32_t limit;
- /**
- * (protected) Used by UTF-8 iterators and possibly others.
- * @stable ICU 2.1
- */
- int32_t reservedField;
- /**
- * (public) Returns the current position or the
- * start or limit index of the iteration range.
- *
- * @see UCharIteratorGetIndex
- * @stable ICU 2.1
- */
- UCharIteratorGetIndex *getIndex;
- /**
- * (public) Moves the current position relative to the start or limit of the
- * iteration range, or relative to the current position itself.
- * The movement is expressed in numbers of code units forward
- * or backward by specifying a positive or negative delta.
- *
- * @see UCharIteratorMove
- * @stable ICU 2.1
- */
- UCharIteratorMove *move;
- /**
- * (public) Check if current() and next() can still
- * return another code unit.
- *
- * @see UCharIteratorHasNext
- * @stable ICU 2.1
- */
- UCharIteratorHasNext *hasNext;
- /**
- * (public) Check if previous() can still return another code unit.
- *
- * @see UCharIteratorHasPrevious
- * @stable ICU 2.1
- */
- UCharIteratorHasPrevious *hasPrevious;
- /**
- * (public) Return the code unit at the current position,
- * or U_SENTINEL if there is none (index is at the limit).
- *
- * @see UCharIteratorCurrent
- * @stable ICU 2.1
- */
- UCharIteratorCurrent *current;
- /**
- * (public) Return the code unit at the current index and increment
- * the index (post-increment, like s[i++]),
- * or return U_SENTINEL if there is none (index is at the limit).
- *
- * @see UCharIteratorNext
- * @stable ICU 2.1
- */
- UCharIteratorNext *next;
- /**
- * (public) Decrement the index and return the code unit from there
- * (pre-decrement, like s[--i]),
- * or return U_SENTINEL if there is none (index is at the start).
- *
- * @see UCharIteratorPrevious
- * @stable ICU 2.1
- */
- UCharIteratorPrevious *previous;
- /**
- * (public) Reserved for future use. Currently NULL.
- *
- * @see UCharIteratorReserved
- * @stable ICU 2.1
- */
- UCharIteratorReserved *reservedFn;
- /**
- * (public) Return the state of the iterator, to be restored later with setState().
- * This function pointer is NULL if the iterator does not implement it.
- *
- * @see UCharIteratorGet
- * @stable ICU 2.6
- */
- UCharIteratorGetState *getState;
- /**
- * (public) Restore the iterator state from the state word from a call
- * to getState().
- * This function pointer is NULL if the iterator does not implement it.
- *
- * @see UCharIteratorSet
- * @stable ICU 2.6
- */
- UCharIteratorSetState *setState;
- };
- /**
- * Helper function for UCharIterator to get the code point
- * at the current index.
- *
- * Return the code point that includes the code unit at the current position,
- * or U_SENTINEL if there is none (index is at the limit).
- * If the current code unit is a lead or trail surrogate,
- * then the following or preceding surrogate is used to form
- * the code point value.
- *
- * @param iter the UCharIterator structure ("this pointer")
- * @return the current code point
- *
- * @see UCharIterator
- * @see U16_GET
- * @see UnicodeString::char32At()
- * @stable ICU 2.1
- */
- U_CAPI UChar32 U_EXPORT2
- uiter_current32(UCharIterator *iter);
- /**
- * Helper function for UCharIterator to get the next code point.
- *
- * Return the code point at the current index and increment
- * the index (post-increment, like s[i++]),
- * or return U_SENTINEL if there is none (index is at the limit).
- *
- * @param iter the UCharIterator structure ("this pointer")
- * @return the current code point (and post-increment the current index)
- *
- * @see UCharIterator
- * @see U16_NEXT
- * @stable ICU 2.1
- */
- U_CAPI UChar32 U_EXPORT2
- uiter_next32(UCharIterator *iter);
- /**
- * Helper function for UCharIterator to get the previous code point.
- *
- * Decrement the index and return the code point from there
- * (pre-decrement, like s[--i]),
- * or return U_SENTINEL if there is none (index is at the start).
- *
- * @param iter the UCharIterator structure ("this pointer")
- * @return the previous code point (after pre-decrementing the current index)
- *
- * @see UCharIterator
- * @see U16_PREV
- * @stable ICU 2.1
- */
- U_CAPI UChar32 U_EXPORT2
- uiter_previous32(UCharIterator *iter);
- /**
- * Get the "state" of the iterator in the form of a single 32-bit word.
- * This is a convenience function that calls iter->getState(iter)
- * if iter->getState is not NULL;
- * if it is NULL or any other error occurs, then UITER_NO_STATE is returned.
- *
- * Some UCharIterator implementations may not be able to return
- * a valid state for each position, in which case they return UITER_NO_STATE instead.
- * This will be clearly documented for each such iterator (none of the public ones here).
- *
- * @param iter the UCharIterator structure ("this pointer")
- * @return the state word
- *
- * @see UCharIterator
- * @see UCharIteratorGetState
- * @see UITER_NO_STATE
- * @stable ICU 2.6
- */
- U_CAPI uint32_t U_EXPORT2
- uiter_getState(const UCharIterator *iter);
- /**
- * Restore the "state" of the iterator using a state word from a getState() call.
- * This is a convenience function that calls iter->setState(iter, state, pErrorCode)
- * if iter->setState is not NULL; if it is NULL, then U_UNSUPPORTED_ERROR is set.
- *
- * @param iter the UCharIterator structure ("this pointer")
- * @param state the state word from a getState() call
- * on a same-type, same-string iterator
- * @param pErrorCode Must be a valid pointer to an error code value,
- * which must not indicate a failure before the function call.
- *
- * @see UCharIterator
- * @see UCharIteratorSetState
- * @stable ICU 2.6
- */
- U_CAPI void U_EXPORT2
- uiter_setState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode);
- /**
- * Set up a UCharIterator to iterate over a string.
- *
- * Sets the UCharIterator function pointers for iteration over the string s
- * with iteration boundaries start=index=0 and length=limit=string length.
- * The "provider" may set the start, index, and limit values at any time
- * within the range 0..length.
- * The length field will be ignored.
- *
- * The string pointer s is set into UCharIterator.context without copying
- * or reallocating the string contents.
- *
- * getState() simply returns the current index.
- * move() will always return the final index.
- *
- * @param iter UCharIterator structure to be set for iteration
- * @param s String to iterate over
- * @param length Length of s, or -1 if NUL-terminated
- *
- * @see UCharIterator
- * @stable ICU 2.1
- */
- U_CAPI void U_EXPORT2
- uiter_setString(UCharIterator *iter, const UChar *s, int32_t length);
- /**
- * Set up a UCharIterator to iterate over a UTF-16BE string
- * (byte vector with a big-endian pair of bytes per UChar).
- *
- * Everything works just like with a normal UChar iterator (uiter_setString),
- * except that UChars are assembled from byte pairs,
- * and that the length argument here indicates an even number of bytes.
- *
- * getState() simply returns the current index.
- * move() will always return the final index.
- *
- * @param iter UCharIterator structure to be set for iteration
- * @param s UTF-16BE string to iterate over
- * @param length Length of s as an even number of bytes, or -1 if NUL-terminated
- * (NUL means pair of 0 bytes at even index from s)
- *
- * @see UCharIterator
- * @see uiter_setString
- * @stable ICU 2.6
- */
- U_CAPI void U_EXPORT2
- uiter_setUTF16BE(UCharIterator *iter, const char *s, int32_t length);
- /**
- * Set up a UCharIterator to iterate over a UTF-8 string.
- *
- * Sets the UCharIterator function pointers for iteration over the UTF-8 string s
- * with UTF-8 iteration boundaries 0 and length.
- * The implementation counts the UTF-16 index on the fly and
- * lazily evaluates the UTF-16 length of the text.
- *
- * The start field is used as the UTF-8 offset, the limit field as the UTF-8 length.
- * When the reservedField is not 0, then it contains a supplementary code point
- * and the UTF-16 index is between the two corresponding surrogates.
- * At that point, the UTF-8 index is behind that code point.
- *
- * The UTF-8 string pointer s is set into UCharIterator.context without copying
- * or reallocating the string contents.
- *
- * getState() returns a state value consisting of
- * - the current UTF-8 source byte index (bits 31..1)
- * - a flag (bit 0) that indicates whether the UChar position is in the middle
- * of a surrogate pair
- * (from a 4-byte UTF-8 sequence for the corresponding supplementary code point)
- *
- * getState() cannot also encode the UTF-16 index in the state value.
- * move(relative to limit or length), or
- * move(relative to current) after setState(), may return UITER_UNKNOWN_INDEX.
- *
- * @param iter UCharIterator structure to be set for iteration
- * @param s UTF-8 string to iterate over
- * @param length Length of s in bytes, or -1 if NUL-terminated
- *
- * @see UCharIterator
- * @stable ICU 2.6
- */
- U_CAPI void U_EXPORT2
- uiter_setUTF8(UCharIterator *iter, const char *s, int32_t length);
- #if U_SHOW_CPLUSPLUS_API
- /**
- * Set up a UCharIterator to wrap around a C++ CharacterIterator.
- *
- * Sets the UCharIterator function pointers for iteration using the
- * CharacterIterator charIter.
- *
- * The CharacterIterator pointer charIter is set into UCharIterator.context
- * without copying or cloning the CharacterIterator object.
- * The other "protected" UCharIterator fields are set to 0 and will be ignored.
- * The iteration index and boundaries are controlled by the CharacterIterator.
- *
- * getState() simply returns the current index.
- * move() will always return the final index.
- *
- * @param iter UCharIterator structure to be set for iteration
- * @param charIter CharacterIterator to wrap
- *
- * @see UCharIterator
- * @stable ICU 2.1
- */
- U_CAPI void U_EXPORT2
- uiter_setCharacterIterator(UCharIterator *iter, icu::CharacterIterator *charIter);
- /**
- * Set up a UCharIterator to iterate over a C++ Replaceable.
- *
- * Sets the UCharIterator function pointers for iteration over the
- * Replaceable rep with iteration boundaries start=index=0 and
- * length=limit=rep->length().
- * The "provider" may set the start, index, and limit values at any time
- * within the range 0..length=rep->length().
- * The length field will be ignored.
- *
- * The Replaceable pointer rep is set into UCharIterator.context without copying
- * or cloning/reallocating the Replaceable object.
- *
- * getState() simply returns the current index.
- * move() will always return the final index.
- *
- * @param iter UCharIterator structure to be set for iteration
- * @param rep Replaceable to iterate over
- *
- * @see UCharIterator
- * @stable ICU 2.1
- */
- U_CAPI void U_EXPORT2
- uiter_setReplaceable(UCharIterator *iter, const icu::Replaceable *rep);
- #endif
- U_CDECL_END
- #endif
|