- # Licensed to the Apache Software Foundation (ASF) under one
- # or more contributor license agreements. See the NOTICE file
- # distributed with this work for additional information
- # regarding copyright ownership. The ASF licenses this file
- # to you under the Apache License, Version 2.0 (the
- # "License"); you may not use this file except in compliance
- # with the License. You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing,
- # software distributed under the License is distributed on an
- # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- # KIND, either express or implied. See the License for the
- # specific language governing permissions and limitations
- # under the License.
- import os
- import warnings
- cdef _sequence_to_array(object sequence, object mask, object size,
- DataType type, CMemoryPool* pool, c_bool from_pandas):
- cdef:
- int64_t c_size
- PyConversionOptions options
- shared_ptr[CChunkedArray] chunked
- if type is not None:
- options.type = type.sp_type
- if size is not None:
- options.size = size
- options.from_pandas = from_pandas
- options.ignore_timezone = os.environ.get('PYARROW_IGNORE_TIMEZONE', False)
- with nogil:
- chunked = GetResultValue(
- ConvertPySequence(sequence, mask, options, pool)
- )
- if chunked.get().num_chunks() == 1:
- return pyarrow_wrap_array(chunked.get().chunk(0))
- else:
- return pyarrow_wrap_chunked_array(chunked)
- cdef inline _is_array_like(obj):
- if isinstance(obj, np.ndarray):
- return True
- return pandas_api._have_pandas_internal() and pandas_api.is_array_like(obj)
- def _ndarray_to_arrow_type(object values, DataType type):
- return pyarrow_wrap_data_type(_ndarray_to_type(values, type))
- cdef shared_ptr[CDataType] _ndarray_to_type(object values,
- DataType type) except *:
- cdef shared_ptr[CDataType] c_type
- dtype = values.dtype
- if type is None and dtype != object:
- with nogil:
- check_status(NumPyDtypeToArrow(dtype, &c_type))
- if type is not None:
- c_type = type.sp_type
- return c_type
- cdef _ndarray_to_array(object values, object mask, DataType type,
- c_bool from_pandas, c_bool safe, CMemoryPool* pool):
- cdef:
- shared_ptr[CChunkedArray] chunked_out
- shared_ptr[CDataType] c_type = _ndarray_to_type(values, type)
- CCastOptions cast_options = CCastOptions(safe)
- with nogil:
- check_status(NdarrayToArrow(pool, values, mask, from_pandas,
- c_type, cast_options, &chunked_out))
- if chunked_out.get().num_chunks() > 1:
- return pyarrow_wrap_chunked_array(chunked_out)
- else:
- return pyarrow_wrap_array(chunked_out.get().chunk(0))
- cdef _codes_to_indices(object codes, object mask, DataType type,
- MemoryPool memory_pool):
- """
- Convert the codes of a pandas Categorical to indices for a pyarrow
- DictionaryArray, taking into account missing values + mask
- """
- if mask is None:
- mask = codes == -1
- else:
- mask = mask | (codes == -1)
- return array(codes, mask=mask, type=type, memory_pool=memory_pool)
- def _handle_arrow_array_protocol(obj, type, mask, size):
- if mask is not None or size is not None:
- raise ValueError(
- "Cannot specify a mask or a size when passing an object that is "
- "converted with the __arrow_array__ protocol.")
- res = obj.__arrow_array__(type=type)
- if not isinstance(res, (Array, ChunkedArray)):
- raise TypeError("The object's __arrow_array__ method does not "
- "return a pyarrow Array or ChunkedArray.")
- return res
- def array(object obj, type=None, mask=None, size=None, from_pandas=None,
- bint safe=True, MemoryPool memory_pool=None):
- """
- Create pyarrow.Array instance from a Python object.
- Parameters
- ----------
- obj : sequence, iterable, ndarray or Series
- If both type and size are specified, may be a single-use iterable. If
- not strongly-typed, Arrow type will be inferred for resulting array.
- type : pyarrow.DataType
- Explicit type to attempt to coerce to, otherwise will be inferred from
- the data.
- mask : array[bool], optional
- Indicate which values are null (True) or not null (False).
- size : int64, optional
- Size of the elements. If the input is larger than size, bail out at
- this length. For iterators, if size is larger than the input iterator
- this will be treated as a "max size", but will involve an initial
- allocation of size followed by a resize to the actual size (so if you
- know the exact size, specifying it correctly will give you better
- performance).
- from_pandas : bool, default None
- Use pandas's semantics for inferring nulls from values in
- ndarray-like data. If passed, the mask takes precedence, but
- if a value is unmasked (not-null) and still null according to
- pandas semantics, then it is null. Defaults to False if not
- passed explicitly by user, or True if a pandas object is
- passed in.
- safe : bool, default True
- Check for overflows or other unsafe conversions.
- memory_pool : pyarrow.MemoryPool, optional
- If not passed, will allocate memory from the currently-set default
- memory pool.
- Returns
- -------
- array : pyarrow.Array or pyarrow.ChunkedArray
- A ChunkedArray instead of an Array is returned if:
- - the object data overflowed binary storage.
- - the object's ``__arrow_array__`` protocol method returned a chunked
- array.
- Notes
- -----
- Localized timestamps will currently be returned as UTC (pandas's native
- representation). Timezone-naive data will be implicitly interpreted as
- UTC.
- Converting to dictionary array will promote to a wider integer type for
- indices if the number of distinct values cannot be represented, even if
- the index type was explicitly set. This means that if there are more than
- 127 values the returned dictionary array's index type will be at least
- pa.int16() even if pa.int8() was passed to the function. Note that an
- explicit index type will not be demoted even if it is wider than required.
- Examples
- --------
- >>> import pandas as pd
- >>> import pyarrow as pa
- >>> pa.array(pd.Series([1, 2]))
- <pyarrow.lib.Int64Array object at 0x7f674e4c0e10>
- [
- 1,
- 2
- ]
- >>> pa.array(["a", "b", "a"], type=pa.dictionary(pa.int8(), pa.string()))
- <pyarrow.lib.DictionaryArray object at 0x7feb288d9040>
- -- dictionary:
- [
- "a",
- "b"
- ]
- -- indices:
- [
- 0,
- 1,
- 0
- ]
- >>> import numpy as np
- >>> pa.array(pd.Series([1, 2]), mask=np.array([0, 1], dtype=bool))
- <pyarrow.lib.Int64Array object at 0x7f9019e11208>
- [
- 1,
- null
- ]
- >>> arr = pa.array(range(1024), type=pa.dictionary(pa.int8(), pa.int64()))
- >>> arr.type.index_type
- DataType(int16)
- """
- cdef:
- CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool)
- bint is_pandas_object = False
- bint c_from_pandas
- type = ensure_type(type, allow_none=True)
- if from_pandas is None:
- c_from_pandas = False
- else:
- c_from_pandas = from_pandas
- if hasattr(obj, '__arrow_array__'):
- return _handle_arrow_array_protocol(obj, type, mask, size)
- elif _is_array_like(obj):
- if mask is not None:
- # out argument unused
- mask = get_values(mask, &is_pandas_object)
- values = get_values(obj, &is_pandas_object)
- if is_pandas_object and from_pandas is None:
- c_from_pandas = True
- if isinstance(values, np.ma.MaskedArray):
- if mask is not None:
- raise ValueError("Cannot pass a numpy masked array and "
- "specify a mask at the same time")
- else:
- # don't use shrunken masks
- mask = None if values.mask is np.ma.nomask else values.mask
- values = values.data
- if mask is not None:
- if mask.dtype != np.bool_:
- raise TypeError("Mask must be boolean dtype")
- if mask.ndim != 1:
- raise ValueError("Mask must be 1D array")
- if len(values) != len(mask):
- raise ValueError(
- "Mask is a different length from sequence being converted")
- if hasattr(values, '__arrow_array__'):
- return _handle_arrow_array_protocol(values, type, mask, size)
- elif pandas_api.is_categorical(values):
- if type is not None:
- if type.id != Type_DICTIONARY:
- return _ndarray_to_array(
- np.asarray(values), mask, type, c_from_pandas, safe,
- pool)
- index_type = type.index_type
- value_type = type.value_type
- if values.ordered != type.ordered:
- warnings.warn(
- "The 'ordered' flag of the passed categorical values "
- "does not match the 'ordered' of the specified type. "
- "Using the flag of the values, but in the future this "
- "mismatch will raise a ValueError.",
- FutureWarning, stacklevel=2)
- else:
- index_type = None
- value_type = None
- indices = _codes_to_indices(
- values.codes, mask, index_type, memory_pool)
- try:
- dictionary = array(
- values.categories.values, type=value_type,
- memory_pool=memory_pool)
- except TypeError:
- # TODO when removing the deprecation warning, this whole
- # try/except can be removed (to bubble the TypeError of
- # the first array(..) call)
- if value_type is not None:
- warnings.warn(
- "The dtype of the 'categories' of the passed "
- "categorical values ({0}) does not match the "
- "specified type ({1}). For now ignoring the specified "
- "type, but in the future this mismatch will raise a "
- "TypeError".format(
- values.categories.dtype, value_type),
- FutureWarning, stacklevel=2)
- dictionary = array(
- values.categories.values, memory_pool=memory_pool)
- else:
- raise
- return DictionaryArray.from_arrays(
- indices, dictionary, ordered=values.ordered, safe=safe)
- else:
- if pandas_api.have_pandas:
- values, type = pandas_api.compat.get_datetimetz_type(
- values, obj.dtype, type)
- return _ndarray_to_array(values, mask, type, c_from_pandas, safe,
- pool)
- else:
- # ConvertPySequence does strict conversion if type is explicitly passed
- return _sequence_to_array(obj, mask, size, type, pool, c_from_pandas)
- def asarray(values, type=None):
- """
- Convert to pyarrow.Array, inferring type if not provided.
- Parameters
- ----------
- values : array-like
- This can be a sequence, numpy.ndarray, pyarrow.Array or
- pyarrow.ChunkedArray. If a ChunkedArray is passed, the output will be
- a ChunkedArray, otherwise the output will be an Array.
- type : string or DataType
- Explicitly construct the array with this type. Attempt to cast if
- indicated type is different.
- Returns
- -------
- arr : Array or ChunkedArray
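- Examples
- --------
- A minimal sketch; the inferred type and the pass-through behavior
- shown here follow the logic described above:
- >>> import pyarrow as pa
- >>> arr = pa.asarray([1, 2, 3])
- >>> arr.type
- DataType(int64)
- >>> pa.asarray(arr) is arr  # Arrays pass through unchanged
- True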
- """
- if isinstance(values, (Array, ChunkedArray)):
- if type is not None and not values.type.equals(type):
- values = values.cast(type)
- return values
- else:
- return array(values, type=type)
- def nulls(size, type=None, MemoryPool memory_pool=None):
- """
- Create a strongly-typed Array instance with all elements null.
- Parameters
- ----------
- size : int
- Array length.
- type : pyarrow.DataType, default None
- Explicit type for the array. By default use NullType.
- memory_pool : MemoryPool, default None
- Arrow MemoryPool to use for allocations. Uses the default memory
- pool if not passed.
- Returns
- -------
- arr : Array
- Examples
- --------
- >>> import pyarrow as pa
- >>> pa.nulls(10)
- <pyarrow.lib.NullArray object at 0x7ffaf04c2e50>
- 10 nulls
- >>> pa.nulls(3, pa.uint32())
- <pyarrow.lib.UInt32Array object at 0x7ffaf04c2e50>
- [
- null,
- null,
- null
- ]
- """
- cdef:
- CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool)
- int64_t length = size
- shared_ptr[CDataType] ty
- shared_ptr[CArray] arr
- type = ensure_type(type, allow_none=True)
- if type is None:
- type = null()
- ty = pyarrow_unwrap_data_type(type)
- with nogil:
- arr = GetResultValue(MakeArrayOfNull(ty, length, pool))
- return pyarrow_wrap_array(arr)
- def repeat(value, size, MemoryPool memory_pool=None):
- """
- Create an Array instance whose slots are the given scalar.
- Parameters
- ----------
- value : Scalar-like object
- Either a pyarrow.Scalar or any python object coercible to a Scalar.
- size : int
- Number of times to repeat the scalar in the output Array.
- memory_pool : MemoryPool, default None
- Arrow MemoryPool to use for allocations. Uses the default memory
- pool if not passed.
- Returns
- -------
- arr : Array
- Examples
- --------
- >>> import pyarrow as pa
- >>> pa.repeat(10, 3)
- <pyarrow.lib.Int64Array object at 0x7ffac03a2750>
- [
- 10,
- 10,
- 10
- ]
- >>> pa.repeat([1, 2], 2)
- <pyarrow.lib.ListArray object at 0x7ffaf04c2e50>
- [
- [
- 1,
- 2
- ],
- [
- 1,
- 2
- ]
- ]
- >>> pa.repeat("string", 3)
- <pyarrow.lib.StringArray object at 0x7ffac03a2750>
- [
- "string",
- "string",
- "string"
- ]
- >>> pa.repeat(pa.scalar({'a': 1, 'b': [1, 2]}), 2)
- <pyarrow.lib.StructArray object at 0x7ffac03a2750>
- -- is_valid: all not null
- -- child 0 type: int64
- [
- 1,
- 1
- ]
- -- child 1 type: list<item: int64>
- [
- [
- 1,
- 2
- ],
- [
- 1,
- 2
- ]
- ]
- """
- cdef:
- CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool)
- int64_t length = size
- shared_ptr[CArray] c_array
- shared_ptr[CScalar] c_scalar
- if not isinstance(value, Scalar):
- value = scalar(value, memory_pool=memory_pool)
- c_scalar = (<Scalar> value).unwrap()
- with nogil:
- c_array = GetResultValue(
- MakeArrayFromScalar(deref(c_scalar), length, pool)
- )
- return pyarrow_wrap_array(c_array)
- def infer_type(values, mask=None, from_pandas=False):
- """
- Attempt to infer the Arrow data type that can hold the values of the
- passed Python sequence in an Array object.
- Parameters
- ----------
- values : array-like
- Sequence to infer type from.
- mask : ndarray (bool type), optional
- Optional exclusion mask where True marks null, False non-null.
- from_pandas : bool, default False
- Use pandas's NA/null sentinel values for type inference.
- Returns
- -------
- type : DataType
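- Examples
- --------
- A short sketch; nulls do not affect the inferred value type:
- >>> import pyarrow as pa
- >>> pa.infer_type([1, 2, None])
- DataType(int64)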
- """
- cdef:
- shared_ptr[CDataType] out
- c_bool use_pandas_sentinels = from_pandas
- if mask is not None and not isinstance(mask, np.ndarray):
- mask = np.array(mask, dtype=bool)
- out = GetResultValue(InferArrowType(values, mask, use_pandas_sentinels))
- return pyarrow_wrap_data_type(out)
- def _normalize_slice(object arrow_obj, slice key):
- """
- Slices with step not equal to 1 (or None) will produce a copy
- rather than a zero-copy view
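- For example (``Array.__getitem__`` routes slices through here):
- >>> arr = pa.array([1, 2, 3, 4])
- >>> arr[::2].to_pylist()  # step != 1: materialized copy via take
- [1, 3]
- >>> arr[1:3].to_pylist()  # step 1: zero-copy slice
- [2, 3]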
- """
- cdef:
- Py_ssize_t start, stop, step
- Py_ssize_t n = len(arrow_obj)
- start = key.start or 0
- if start < 0:
- start += n
- if start < 0:
- start = 0
- elif start >= n:
- start = n
- stop = key.stop if key.stop is not None else n
- if stop < 0:
- stop += n
- if stop < 0:
- stop = 0
- elif stop >= n:
- stop = n
- step = key.step or 1
- if step != 1:
- if step < 0:
- # Negative steps require some special handling
- if key.start is None:
- start = n - 1
- if key.stop is None:
- stop = -1
- indices = np.arange(start, stop, step)
- return arrow_obj.take(indices)
- else:
- length = max(stop - start, 0)
- return arrow_obj.slice(start, length)
- cdef Py_ssize_t _normalize_index(Py_ssize_t index,
- Py_ssize_t length) except -1:
- if index < 0:
- index += length
- if index < 0:
- raise IndexError("index out of bounds")
- elif index >= length:
- raise IndexError("index out of bounds")
- return index
- cdef wrap_datum(const CDatum& datum):
- if datum.kind() == DatumType_ARRAY:
- return pyarrow_wrap_array(MakeArray(datum.array()))
- elif datum.kind() == DatumType_CHUNKED_ARRAY:
- return pyarrow_wrap_chunked_array(datum.chunked_array())
- elif datum.kind() == DatumType_RECORD_BATCH:
- return pyarrow_wrap_batch(datum.record_batch())
- elif datum.kind() == DatumType_TABLE:
- return pyarrow_wrap_table(datum.table())
- elif datum.kind() == DatumType_SCALAR:
- return pyarrow_wrap_scalar(datum.scalar())
- else:
- raise ValueError("Unable to wrap Datum in a Python object")
- cdef _append_array_buffers(const CArrayData* ad, list res):
- """
- Recursively append Buffer wrappers from *ad* and its children.
- """
- cdef size_t i, n
- assert ad != NULL
- n = ad.buffers.size()
- for i in range(n):
- buf = ad.buffers[i]
- res.append(pyarrow_wrap_buffer(buf)
- if buf.get() != NULL else None)
- n = ad.child_data.size()
- for i in range(n):
- _append_array_buffers(ad.child_data[i].get(), res)
- cdef _reduce_array_data(const CArrayData* ad):
- """
- Recursively dissect ArrayData to (picklable) tuples.
- """
- cdef size_t i, n
- assert ad != NULL
- n = ad.buffers.size()
- buffers = []
- for i in range(n):
- buf = ad.buffers[i]
- buffers.append(pyarrow_wrap_buffer(buf)
- if buf.get() != NULL else None)
- children = []
- n = ad.child_data.size()
- for i in range(n):
- children.append(_reduce_array_data(ad.child_data[i].get()))
- if ad.dictionary.get() != NULL:
- dictionary = _reduce_array_data(ad.dictionary.get())
- else:
- dictionary = None
- return pyarrow_wrap_data_type(ad.type), ad.length, ad.null_count, \
- ad.offset, buffers, children, dictionary
- cdef shared_ptr[CArrayData] _reconstruct_array_data(data):
- """
- Reconstruct CArrayData objects from the tuple structure generated
- by _reduce_array_data.
- """
- cdef:
- int64_t length, null_count, offset, i
- DataType dtype
- Buffer buf
- vector[shared_ptr[CBuffer]] c_buffers
- vector[shared_ptr[CArrayData]] c_children
- shared_ptr[CArrayData] c_dictionary
- dtype, length, null_count, offset, buffers, children, dictionary = data
- for i in range(len(buffers)):
- buf = buffers[i]
- if buf is None:
- c_buffers.push_back(shared_ptr[CBuffer]())
- else:
- c_buffers.push_back(buf.buffer)
- for i in range(len(children)):
- c_children.push_back(_reconstruct_array_data(children[i]))
- if dictionary is not None:
- c_dictionary = _reconstruct_array_data(dictionary)
- return CArrayData.MakeWithChildrenAndDictionary(
- dtype.sp_type,
- length,
- c_buffers,
- c_children,
- c_dictionary,
- null_count,
- offset)
- def _restore_array(data):
- """
- Reconstruct an Array from pickled ArrayData.
- """
- cdef shared_ptr[CArrayData] ad = _reconstruct_array_data(data)
- return pyarrow_wrap_array(MakeArray(ad))
- cdef class _PandasConvertible(_Weakrefable):
- def to_pandas(
- self,
- memory_pool=None,
- categories=None,
- bint strings_to_categorical=False,
- bint zero_copy_only=False,
- bint integer_object_nulls=False,
- bint date_as_object=True,
- bint timestamp_as_object=False,
- bint use_threads=True,
- bint deduplicate_objects=True,
- bint ignore_metadata=False,
- bint safe=True,
- bint split_blocks=False,
- bint self_destruct=False,
- types_mapper=None
- ):
- """
- Convert to a pandas-compatible NumPy array or DataFrame, as appropriate
- Parameters
- ----------
- memory_pool : MemoryPool, default None
- Arrow MemoryPool to use for allocations. Uses the default memory
- pool if not passed.
- strings_to_categorical : bool, default False
- Encode string (UTF8) and binary types to pandas.Categorical.
- categories : list, default empty
- List of fields that should be returned as pandas.Categorical. Only
- applies to table-like data structures.
- zero_copy_only : bool, default False
- Raise an ArrowException if this function call would require copying
- the underlying data.
- integer_object_nulls : bool, default False
- Cast integers with nulls to objects
- date_as_object : bool, default True
- Cast dates to objects. If False, convert to datetime64[ns] dtype.
- timestamp_as_object : bool, default False
- Cast non-nanosecond timestamps (np.datetime64) to objects. This is
- useful if you have timestamps that don't fit in the normal date
- range of nanosecond timestamps (1678 CE-2262 CE).
- If False, all timestamps are converted to datetime64[ns] dtype.
- use_threads : bool, default True
- Whether to parallelize the conversion using multiple threads.
- deduplicate_objects : bool, default True
- Do not create multiple copies of Python objects when converting,
- to save on memory use. Conversion will be slower.
- ignore_metadata : bool, default False
- If True, do not use the 'pandas' metadata to reconstruct the
- DataFrame index, if present
- safe : bool, default True
- For certain data types, a cast is needed in order to store the
- data in a pandas DataFrame or Series (e.g. timestamps are always
- stored as nanoseconds in pandas). This option controls whether it
- is a safe cast or not.
- split_blocks : bool, default False
- If True, generate one internal "block" for each column when
- creating a pandas.DataFrame from a RecordBatch or Table. While this
- can temporarily reduce memory, note that various pandas operations
- can trigger "consolidation" which may balloon memory use.
- self_destruct : bool, default False
- EXPERIMENTAL: If True, attempt to deallocate the originating Arrow
- memory while converting the Arrow object to pandas. If you use the
- object after calling to_pandas with this option it will crash your
- program.
- Note that you may not always see memory usage improvements. For
- example, if multiple columns share an underlying allocation,
- memory can't be freed until all columns are converted.
- types_mapper : function, default None
- A function mapping a pyarrow DataType to a pandas ExtensionDtype.
- This can be used to override the default pandas type for conversion
- of built-in pyarrow types or in absence of pandas_metadata in the
- Table schema. The function receives a pyarrow DataType and is
- expected to return a pandas ExtensionDtype or ``None`` if the
- default conversion should be used for that type. If you have
- a dictionary mapping, you can pass ``dict.get`` as function.
- Returns
- -------
- pandas.Series or pandas.DataFrame depending on type of object
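- Examples
- --------
- A small sketch for the Array case; integer nulls become NaN, so the
- resulting Series is float64:
- >>> import pyarrow as pa
- >>> pa.array([1, 2, None]).to_pandas()
- 0    1.0
- 1    2.0
- 2    NaN
- dtype: float64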
- """
- options = dict(
- pool=memory_pool,
- strings_to_categorical=strings_to_categorical,
- zero_copy_only=zero_copy_only,
- integer_object_nulls=integer_object_nulls,
- date_as_object=date_as_object,
- timestamp_as_object=timestamp_as_object,
- use_threads=use_threads,
- deduplicate_objects=deduplicate_objects,
- safe=safe,
- split_blocks=split_blocks,
- self_destruct=self_destruct
- )
- return self._to_pandas(options, categories=categories,
- ignore_metadata=ignore_metadata,
- types_mapper=types_mapper)
- cdef PandasOptions _convert_pandas_options(dict options):
- cdef PandasOptions result
- result.pool = maybe_unbox_memory_pool(options['pool'])
- result.strings_to_categorical = options['strings_to_categorical']
- result.zero_copy_only = options['zero_copy_only']
- result.integer_object_nulls = options['integer_object_nulls']
- result.date_as_object = options['date_as_object']
- result.timestamp_as_object = options['timestamp_as_object']
- result.use_threads = options['use_threads']
- result.deduplicate_objects = options['deduplicate_objects']
- result.safe_cast = options['safe']
- result.split_blocks = options['split_blocks']
- result.self_destruct = options['self_destruct']
- result.ignore_timezone = os.environ.get('PYARROW_IGNORE_TIMEZONE', False)
- return result
- cdef class Array(_PandasConvertible):
- """
- The base class for all Arrow arrays.
- """
- def __init__(self):
- raise TypeError("Do not call {}'s constructor directly, use one of "
- "the `pyarrow.Array.from_*` functions instead."
- .format(self.__class__.__name__))
- cdef void init(self, const shared_ptr[CArray]& sp_array) except *:
- self.sp_array = sp_array
- self.ap = sp_array.get()
- self.type = pyarrow_wrap_data_type(self.sp_array.get().type())
- def _debug_print(self):
- with nogil:
- check_status(DebugPrint(deref(self.ap), 0))
- def diff(self, Array other):
- """
- Compare contents of this array against another one.
- Return string containing the result of arrow::Diff comparing contents
- of this array against the other array.
- """
- cdef c_string result
- with nogil:
- result = self.ap.Diff(deref(other.ap))
- return frombytes(result, safe=True)
- def cast(self, object target_type, safe=True):
- """
- Cast array values to another data type
- See pyarrow.compute.cast for usage
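- Examples
- --------
- A quick sketch:
- >>> import pyarrow as pa
- >>> pa.array([1, 2, 3]).cast(pa.float64()).to_pylist()
- [1.0, 2.0, 3.0]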
- """
- return _pc().cast(self, target_type, safe=safe)
- def view(self, object target_type):
- """
- Return zero-copy "view" of array as another data type.
- The data types must have compatible columnar buffer layouts
- Parameters
- ----------
- target_type : DataType
- Type to construct view as.
- Returns
- -------
- view : Array
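- Examples
- --------
- A sketch reinterpreting int32 storage as uint32 (both are 32-bit
- fixed-width layouts):
- >>> import pyarrow as pa
- >>> pa.array([1, 2], type=pa.int32()).view(pa.uint32()).to_pylist()
- [1, 2]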
- """
- cdef DataType type = ensure_type(target_type)
- cdef shared_ptr[CArray] result
- with nogil:
- result = GetResultValue(self.ap.View(type.sp_type))
- return pyarrow_wrap_array(result)
- def sum(self, **kwargs):
- """
- Sum the values in a numerical array.
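- Examples
- --------
- A minimal sketch:
- >>> import pyarrow as pa
- >>> pa.array([1, 2, 3]).sum().as_py()
- 6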
- """
- options = _pc().ScalarAggregateOptions(**kwargs)
- return _pc().call_function('sum', [self], options)
- def unique(self):
- """
- Compute distinct elements in array.
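- Examples
- --------
- A minimal sketch; order of first appearance is preserved:
- >>> import pyarrow as pa
- >>> pa.array([1, 2, 1]).unique().to_pylist()
- [1, 2]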
- """
- return _pc().call_function('unique', [self])
- def dictionary_encode(self, null_encoding='mask'):
- """
- Compute dictionary-encoded representation of array.
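- Examples
- --------
- A small sketch; repeated values share a single dictionary entry:
- >>> import pyarrow as pa
- >>> encoded = pa.array(["a", "b", "a"]).dictionary_encode()
- >>> encoded.indices.to_pylist()
- [0, 1, 0]
- >>> encoded.dictionary.to_pylist()
- ['a', 'b']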
- """
- options = _pc().DictionaryEncodeOptions(null_encoding)
- return _pc().call_function('dictionary_encode', [self], options)
- def value_counts(self):
- """
- Compute counts of unique elements in array.
- Returns
- -------
- An array of <input type "Values", int64_t "Counts"> structs
- """
- return _pc().call_function('value_counts', [self])
- @staticmethod
- def from_pandas(obj, mask=None, type=None, bint safe=True,
- MemoryPool memory_pool=None):
- """
- Convert pandas.Series to an Arrow Array.
- This method uses pandas's semantics about what values indicate
- nulls. See pyarrow.array for more general conversion from arrays or
- sequences to Arrow arrays.
- Parameters
- ----------
- obj : ndarray, pandas.Series, array-like
- mask : array (boolean), optional
- Indicate which values are null (True) or not null (False).
- type : pyarrow.DataType
- Explicit type to attempt to coerce to, otherwise will be inferred
- from the data.
- safe : bool, default True
- Check for overflows or other unsafe conversions.
- memory_pool : pyarrow.MemoryPool, optional
- If not passed, will allocate memory from the currently-set default
- memory pool.
- Notes
- -----
- Localized timestamps will currently be returned as UTC (pandas's native
- representation). Timezone-naive data will be implicitly interpreted as
- UTC.
- Returns
- -------
- array : pyarrow.Array or pyarrow.ChunkedArray
- ChunkedArray is returned if object data overflows binary buffer.
- """
- return array(obj, mask=mask, type=type, safe=safe, from_pandas=True,
- memory_pool=memory_pool)
- def __reduce__(self):
- return _restore_array, \
- (_reduce_array_data(self.sp_array.get().data().get()),)
- @staticmethod
- def from_buffers(DataType type, length, buffers, null_count=-1, offset=0,
- children=None):
- """
- Construct an Array from a sequence of buffers.
- The concrete type returned depends on the datatype.
- Parameters
- ----------
- type : DataType
- The value type of the array.
- length : int
- The number of values in the array.
- buffers : List[Buffer]
- The buffers backing this array.
- null_count : int, default -1
- The number of null entries in the array. Negative value means that
- the null count is not known.
- offset : int, default 0
- The array's logical offset (in values, not in bytes) from the
- start of each buffer.
- children : List[Array], default None
- Nested type children with length matching type.num_fields.
- Returns
- -------
- array : Array
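- Examples
- --------
- A sketch building an int32 array from a raw (native-endian) data
- buffer; with no nulls the validity bitmap slot is None:
- >>> import numpy as np
- >>> import pyarrow as pa
- >>> data = pa.py_buffer(np.array([1, 2, 3], dtype="int32").tobytes())
- >>> pa.Array.from_buffers(pa.int32(), 3, [None, data]).to_pylist()
- [1, 2, 3]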
- """
- cdef:
- Buffer buf
- Array child
- vector[shared_ptr[CBuffer]] c_buffers
- vector[shared_ptr[CArrayData]] c_child_data
- shared_ptr[CArrayData] array_data
- children = children or []
- if type.num_fields != len(children):
- raise ValueError("Type's expected number of children "
- "({0}) did not match the passed number "
- "({1}).".format(type.num_fields, len(children)))
- if type.num_buffers != len(buffers):
- raise ValueError("Type's expected number of buffers "
- "({0}) did not match the passed number "
- "({1}).".format(type.num_buffers, len(buffers)))
- for buf in buffers:
- # None will produce a null buffer pointer
- c_buffers.push_back(pyarrow_unwrap_buffer(buf))
- for child in children:
- c_child_data.push_back(child.ap.data())
- array_data = CArrayData.MakeWithChildren(type.sp_type, length,
- c_buffers, c_child_data,
- null_count, offset)
- cdef Array result = pyarrow_wrap_array(MakeArray(array_data))
- result.validate()
- return result
- @property
- def null_count(self):
- return self.sp_array.get().null_count()
- @property
- def nbytes(self):
- """
- Total number of bytes consumed by the elements of the array.
- """
- size = 0
- for buf in self.buffers():
- if buf is not None:
- size += buf.size
- return size
- def __sizeof__(self):
- return super(Array, self).__sizeof__() + self.nbytes
- def __iter__(self):
- for i in range(len(self)):
- yield self.getitem(i)
- def __repr__(self):
- type_format = object.__repr__(self)
- return '{0}\n{1}'.format(type_format, str(self))
- def to_string(self, int indent=0, int window=10):
- cdef:
- c_string result
- with nogil:
- check_status(
- PrettyPrint(
- deref(self.ap),
- PrettyPrintOptions(indent, window),
- &result
- )
- )
- return frombytes(result, safe=True)
- def format(self, **kwargs):
- import warnings
- warnings.warn('Array.format is deprecated, use Array.to_string')
- return self.to_string(**kwargs)
- def __str__(self):
- return self.to_string()
- def __eq__(self, other):
- try:
- return self.equals(other)
- except TypeError:
- # This also handles comparing with None
- # as Array.equals(None) raises a TypeError.
- return NotImplemented
- def equals(Array self, Array other not None):
- return self.ap.Equals(deref(other.ap))
- def __len__(self):
- return self.length()
- cdef int64_t length(self):
- if self.sp_array.get():
- return self.sp_array.get().length()
- else:
- return 0
- def is_null(self):
- """
- Return BooleanArray indicating the null values.
- """
- return _pc().is_null(self)
- def is_valid(self):
- """
- Return BooleanArray indicating the non-null values.
- """
- return _pc().is_valid(self)
- def fill_null(self, fill_value):
- """
- See pyarrow.compute.fill_null for usage.
- """
- return _pc().fill_null(self, fill_value)
- def __getitem__(self, key):
- """
- Slice or return value at given index
- Parameters
- ----------
- key : integer or slice
- Slices with step not equal to 1 (or None) will produce a copy
- rather than a zero-copy view
- Returns
- -------
- value : Scalar (index) or Array (slice)
- """
- if PySlice_Check(key):
- return _normalize_slice(self, key)
- return self.getitem(_normalize_index(key, self.length()))
- cdef getitem(self, int64_t i):
- return Scalar.wrap(GetResultValue(self.ap.GetScalar(i)))
- def slice(self, offset=0, length=None):
- """
- Compute zero-copy slice of this array.
- Parameters
- ----------
- offset : int, default 0
- Offset from start of array to slice.
- length : int, default None
- Length of slice (default is until end of Array starting from
- offset).
- Returns
- -------
- sliced : Array
- """
- cdef:
- shared_ptr[CArray] result
- if offset < 0:
- raise IndexError('Offset must be non-negative')
- offset = min(len(self), offset)
- if length is None:
- result = self.ap.Slice(offset)
- else:
- if length < 0:
- raise ValueError('Length must be non-negative')
- result = self.ap.Slice(offset, length)
- return pyarrow_wrap_array(result)
- def take(self, object indices):
- """
- Select values from an array. See pyarrow.compute.take for full usage.
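- Examples
- --------
- A quick sketch; output order follows the indices:
- >>> import pyarrow as pa
- >>> pa.array([1, 2, 3]).take(pa.array([2, 0])).to_pylist()
- [3, 1]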
- """
- return _pc().take(self, indices)
- def filter(self, Array mask, null_selection_behavior='drop'):
- """
- Select values from an array. See pyarrow.compute.filter for full usage.
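- Examples
- --------
- A quick sketch with a boolean mask:
- >>> import pyarrow as pa
- >>> mask = pa.array([True, False, True])
- >>> pa.array([1, 2, 3]).filter(mask).to_pylist()
- [1, 3]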
- """
- return _pc().filter(self, mask, null_selection_behavior)
- def index(self, value, start=None, end=None, *, memory_pool=None):
- """
- Find the first index of a value.
- See pyarrow.compute.index for full usage.
- """
- return _pc().index(self, value, start, end, memory_pool=memory_pool)
- def _to_pandas(self, options, **kwargs):
- return _array_like_to_pandas(self, options)
- def __array__(self, dtype=None):
- values = self.to_numpy(zero_copy_only=False)
- if dtype is None:
- return values
- return values.astype(dtype)
- def to_numpy(self, zero_copy_only=True, writable=False):
- """
- Return a NumPy view or copy of this array (experimental).
- By default, tries to return a view of this array. This is only
- supported for primitive arrays with the same memory layout as NumPy
- (i.e. integers, floating point, ..) and without any nulls.
- Parameters
- ----------
- zero_copy_only : bool, default True
- If True, an exception will be raised if the conversion to a numpy
- array would require copying the underlying data (e.g. in presence
- of nulls, or for non-primitive types).
- writable : bool, default False
- For numpy arrays created with zero copy (view on the Arrow data),
- the resulting array is not writable (Arrow data is immutable).
- By setting this to True, a copy of the array is made to ensure
- it is writable.
- Returns
- -------
- array : numpy.ndarray
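- Examples
- --------
- A sketch; the first call can be zero-copy, the second requires a copy
- because of the null:
- >>> import pyarrow as pa
- >>> pa.array([1, 2, 3]).to_numpy()
- array([1, 2, 3])
- >>> pa.array([1, None]).to_numpy(zero_copy_only=False)
- array([ 1., nan])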
- """
- cdef:
- PyObject* out
- PandasOptions c_options
- object values
- if zero_copy_only and writable:
- raise ValueError(
- "Cannot return a writable array if asking for zero-copy")
- # If there are nulls and the array is a DictionaryArray
- # decoding the dictionary will make sure nulls are correctly handled.
- # Decoding a dictionary implies a copy, so it cannot be done
- # when the user has requested zero-copy.
- c_options.decode_dictionaries = not zero_copy_only
- c_options.zero_copy_only = zero_copy_only
- with nogil:
- check_status(ConvertArrayToPandas(c_options, self.sp_array,
- self, &out))
- # wrap_array_output uses pandas to convert to Categorical, here
- # always convert to numpy array without pandas dependency
- array = PyObject_to_object(out)
- if isinstance(array, dict):
- array = np.take(array['dictionary'], array['indices'])
- if writable and not array.flags.writeable:
- # if the conversion already required a copy, the result is writable
- array = array.copy()
- return array
- def to_pylist(self):
- """
- Convert to a list of native Python objects.
- Returns
- -------
- lst : list
- """
- return [x.as_py() for x in self]
- def tolist(self):
- """
- Alias of to_pylist for compatibility with NumPy.
- """
- return self.to_pylist()
- def validate(self, *, full=False):
- """
- Perform validation checks. An exception is raised if validation fails.
- By default only cheap validation checks are run. Pass `full=True`
- for thorough validation checks (potentially O(n)).
- Parameters
- ----------
- full : bool, default False
- If True, run expensive checks, otherwise cheap checks only.
- Raises
- ------
- ArrowInvalid
- """
- if full:
- with nogil:
- check_status(self.ap.ValidateFull())
- else:
- with nogil:
- check_status(self.ap.Validate())
- @property
- def offset(self):
- """
- A relative position into another array's data.
- The purpose is to enable zero-copy slicing. This value defaults to zero
- but must be taken into account in all operations on the physical
- storage buffers.
- """
- return self.sp_array.get().offset()
- def buffers(self):
- """
- Return a list of Buffer objects pointing to this array's physical
- storage.
- To correctly interpret these buffers, you need to also apply the offset
- multiplied by the size of the stored data type.
- """
- res = []
- _append_array_buffers(self.sp_array.get().data().get(), res)
- return res
- def _export_to_c(self, uintptr_t out_ptr, uintptr_t out_schema_ptr=0):
- """
- Export to a C ArrowArray struct, given its pointer.
- If a C ArrowSchema struct pointer is also given, the array type
- is exported to it at the same time.
- Parameters
- ----------
- out_ptr : int
- The raw pointer to a C ArrowArray struct.
- out_schema_ptr : int (optional)
- The raw pointer to a C ArrowSchema struct.
- Be careful: if you don't pass the ArrowArray struct to a consumer,
- array memory will leak. This is a low-level function intended for
- expert users.
- """
- with nogil:
- check_status(ExportArray(deref(self.sp_array),
- <ArrowArray*> out_ptr,
- <ArrowSchema*> out_schema_ptr))
- @staticmethod
- def _import_from_c(uintptr_t in_ptr, type):
- """
- Import Array from a C ArrowArray struct, given its pointer
- and the imported array type.
- Parameters
- ----------
- in_ptr : int
- The raw pointer to a C ArrowArray struct.
- type : DataType or int
- Either a DataType object, or the raw pointer to a C ArrowSchema
- struct.
- This is a low-level function intended for expert users.
- """
- cdef:
- shared_ptr[CArray] c_array
- c_type = pyarrow_unwrap_data_type(type)
- if c_type == nullptr:
- # Not a DataType object, perhaps a raw ArrowSchema pointer
- type_ptr = <uintptr_t> type
- with nogil:
- c_array = GetResultValue(ImportArray(<ArrowArray*> in_ptr,
- <ArrowSchema*> type_ptr))
- else:
- with nogil:
- c_array = GetResultValue(ImportArray(<ArrowArray*> in_ptr,
- c_type))
- return pyarrow_wrap_array(c_array)
- cdef _array_like_to_pandas(obj, options):
- cdef:
- PyObject* out
- PandasOptions c_options = _convert_pandas_options(options)
- original_type = obj.type
- name = obj._name
- # ARROW-3789(wesm): Convert date/timestamp types to datetime64[ns]
- c_options.coerce_temporal_nanoseconds = True
- if isinstance(obj, Array):
- with nogil:
- check_status(ConvertArrayToPandas(c_options,
- (<Array> obj).sp_array,
- obj, &out))
- elif isinstance(obj, ChunkedArray):
- with nogil:
- check_status(libarrow.ConvertChunkedArrayToPandas(
- c_options,
- (<ChunkedArray> obj).sp_chunked_array,
- obj, &out))
- arr = wrap_array_output(out)
- if (isinstance(original_type, TimestampType) and
- options["timestamp_as_object"]):
- # ARROW-5359 - need to specify object dtype to avoid pandas
- # coercing back to ns resolution
- dtype = "object"
- else:
- dtype = None
- result = pandas_api.series(arr, dtype=dtype, name=name)
- if (isinstance(original_type, TimestampType) and
- original_type.tz is not None and
- # can be object dtype for non-ns and timestamp_as_object=True
- result.dtype.kind == "M"):
- from pyarrow.pandas_compat import make_tz_aware
- result = make_tz_aware(result, original_type.tz)
- return result
- cdef wrap_array_output(PyObject* output):
- cdef object obj = PyObject_to_object(output)
- if isinstance(obj, dict):
- return pandas_api.categorical_type(obj['indices'],
- categories=obj['dictionary'],
- ordered=obj['ordered'],
- fastpath=True)
- else:
- return obj
- cdef class NullArray(Array):
- """
- Concrete class for Arrow arrays of null data type.
- """
- cdef class BooleanArray(Array):
- """
- Concrete class for Arrow arrays of boolean data type.
- """
- @property
- def false_count(self):
- return (<CBooleanArray*> self.ap).false_count()
- @property
- def true_count(self):
- return (<CBooleanArray*> self.ap).true_count()
- cdef class NumericArray(Array):
- """
- A base class for Arrow numeric arrays.
- """
- cdef class IntegerArray(NumericArray):
- """
- A base class for Arrow integer arrays.
- """
- cdef class FloatingPointArray(NumericArray):
- """
- A base class for Arrow floating-point arrays.
- """
- cdef class Int8Array(IntegerArray):
- """
- Concrete class for Arrow arrays of int8 data type.
- """
- cdef class UInt8Array(IntegerArray):
- """
- Concrete class for Arrow arrays of uint8 data type.
- """
- cdef class Int16Array(IntegerArray):
- """
- Concrete class for Arrow arrays of int16 data type.
- """
- cdef class UInt16Array(IntegerArray):
- """
- Concrete class for Arrow arrays of uint16 data type.
- """
- cdef class Int32Array(IntegerArray):
- """
- Concrete class for Arrow arrays of int32 data type.
- """
- cdef class UInt32Array(IntegerArray):
- """
- Concrete class for Arrow arrays of uint32 data type.
- """
- cdef class Int64Array(IntegerArray):
- """
- Concrete class for Arrow arrays of int64 data type.
- """
- cdef class UInt64Array(IntegerArray):
- """
- Concrete class for Arrow arrays of uint64 data type.
- """
- cdef class Date32Array(NumericArray):
- """
- Concrete class for Arrow arrays of date32 data type.
- """
- cdef class Date64Array(NumericArray):
- """
- Concrete class for Arrow arrays of date64 data type.
- """
- cdef class TimestampArray(NumericArray):
- """
- Concrete class for Arrow arrays of timestamp data type.
- """
- cdef class Time32Array(NumericArray):
- """
- Concrete class for Arrow arrays of time32 data type.
- """
- cdef class Time64Array(NumericArray):
- """
- Concrete class for Arrow arrays of time64 data type.
- """
- cdef class DurationArray(NumericArray):
- """
- Concrete class for Arrow arrays of duration data type.
- """
- cdef class HalfFloatArray(FloatingPointArray):
- """
- Concrete class for Arrow arrays of float16 data type.
- """
- cdef class FloatArray(FloatingPointArray):
- """
- Concrete class for Arrow arrays of float32 data type.
- """
- cdef class DoubleArray(FloatingPointArray):
- """
- Concrete class for Arrow arrays of float64 data type.
- """
- cdef class FixedSizeBinaryArray(Array):
- """
- Concrete class for Arrow arrays of a fixed-size binary data type.
- """
- cdef class Decimal128Array(FixedSizeBinaryArray):
- """
- Concrete class for Arrow arrays of decimal128 data type.
- """
- cdef class Decimal256Array(FixedSizeBinaryArray):
- """
- Concrete class for Arrow arrays of decimal256 data type.
- """
- cdef class BaseListArray(Array):
- def flatten(self):
- """
- Unnest this ListArray/LargeListArray by one level.
- The returned Array is logically a concatenation of all the sub-lists
- in this Array.
- Note that this method is different from ``self.values()`` in that
- it takes care of the slicing offset as well as null elements backed
- by non-empty sub-lists.
- Returns
- -------
- result : Array
- """
- return _pc().list_flatten(self)
- def value_parent_indices(self):
- """
- Return an array of the same length as the list child values array,
- where each output value is the index of the parent list array slot
- containing that child value.
- Examples
- --------
- >>> arr = pa.array([[1, 2, 3], [], None, [4]],
- ... type=pa.list_(pa.int32()))
- >>> arr.value_parent_indices()
- <pyarrow.lib.Int32Array object at 0x7efc5db958a0>
- [
- 0,
- 0,
- 0,
- 3
- ]
- """
- return _pc().list_parent_indices(self)
- def value_lengths(self):
- """
- Return an array of integers with values equal to the respective
- length of each list element. Null list values are null in the output.
- Examples
- --------
- >>> arr = pa.array([[1, 2, 3], [], None, [4]],
- ... type=pa.list_(pa.int32()))
- >>> arr.value_lengths()
- <pyarrow.lib.Int32Array object at 0x7efc5db95910>
- [
- 3,
- 0,
- null,
- 1
- ]
- """
- return _pc().list_value_length(self)
- cdef class ListArray(BaseListArray):
- """
- Concrete class for Arrow arrays of a list data type.
- """
- @staticmethod
- def from_arrays(offsets, values, MemoryPool pool=None):
- """
- Construct ListArray from arrays of int32 offsets and values.
- Parameters
- ----------
- offsets : Array (int32 type)
- values : Array (any type)
- Returns
- -------
- list_array : ListArray
- Examples
- --------
- >>> values = pa.array([1, 2, 3, 4])
- >>> offsets = pa.array([0, 2, 4])
- >>> pa.ListArray.from_arrays(offsets, values)
- <pyarrow.lib.ListArray object at 0x7fbde226bf40>
- [
- [
- 1,
- 2
- ],
- [
- 3,
- 4
- ]
- ]
- # nulls in the offsets array become null lists
- >>> offsets = pa.array([0, None, 2, 4])
- >>> pa.ListArray.from_arrays(offsets, values)
- <pyarrow.lib.ListArray object at 0x7fbde226bf40>
- [
- [
- 1,
- 2
- ],
- null,
- [
- 3,
- 4
- ]
- ]
- """
- cdef:
- Array _offsets, _values
- shared_ptr[CArray] out
- cdef CMemoryPool* cpool = maybe_unbox_memory_pool(pool)
- _offsets = asarray(offsets, type='int32')
- _values = asarray(values)
- with nogil:
- out = GetResultValue(
- CListArray.FromArrays(_offsets.ap[0], _values.ap[0], cpool))
- cdef Array result = pyarrow_wrap_array(out)
- result.validate()
- return result
- @property
- def values(self):
- cdef CListArray* arr = <CListArray*> self.ap
- return pyarrow_wrap_array(arr.values())
- @property
- def offsets(self):
- """
- Return the offsets as an int32 array.
- """
- return pyarrow_wrap_array((<CListArray*> self.ap).offsets())
- cdef class LargeListArray(BaseListArray):
- """
- Concrete class for Arrow arrays of a large list data type.
- Identical to ListArray, but 64-bit offsets.
- """
- @staticmethod
- def from_arrays(offsets, values, MemoryPool pool=None):
- """
- Construct LargeListArray from arrays of int64 offsets and values.
- Parameters
- ----------
- offsets : Array (int64 type)
- values : Array (any type)
- Returns
- -------
- list_array : LargeListArray
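- Examples
- --------
- Mirrors ListArray.from_arrays, but with 64-bit offsets:
- >>> import pyarrow as pa
- >>> values = pa.array([1, 2, 3, 4])
- >>> pa.LargeListArray.from_arrays([0, 2, 4], values).to_pylist()
- [[1, 2], [3, 4]]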
- """
- cdef:
- Array _offsets, _values
- shared_ptr[CArray] out
- cdef CMemoryPool* cpool = maybe_unbox_memory_pool(pool)
- _offsets = asarray(offsets, type='int64')
- _values = asarray(values)
- with nogil:
- out = GetResultValue(
- CLargeListArray.FromArrays(_offsets.ap[0], _values.ap[0],
- cpool))
- cdef Array result = pyarrow_wrap_array(out)
- result.validate()
- return result
- @property
- def values(self):
- cdef CLargeListArray* arr = <CLargeListArray*> self.ap
- return pyarrow_wrap_array(arr.values())
- @property
- def offsets(self):
- """
- Return the offsets as an int64 array.
- """
- return pyarrow_wrap_array((<CLargeListArray*> self.ap).offsets())
- cdef class MapArray(Array):
- """
- Concrete class for Arrow arrays of a map data type.
- """
- @staticmethod
- def from_arrays(offsets, keys, items, MemoryPool pool=None):
- """
- Construct MapArray from arrays of int32 offsets and key, item arrays.
- Parameters
- ----------
- offsets : array-like or sequence (int32 type)
- keys : array-like or sequence (any type)
- items : array-like or sequence (any type)
- Returns
- -------
- map_array : MapArray
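- Examples
- --------
- A sketch with two single-entry maps; map entries convert to
- (key, value) tuples in to_pylist:
- >>> import pyarrow as pa
- >>> arr = pa.MapArray.from_arrays([0, 1, 2], ["a", "b"], [4, 5])
- >>> arr.to_pylist()
- [[('a', 4)], [('b', 5)]]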
- """
- cdef:
- Array _offsets, _keys, _items
- shared_ptr[CArray] out
- cdef CMemoryPool* cpool = maybe_unbox_memory_pool(pool)
- _offsets = asarray(offsets, type='int32')
- _keys = asarray(keys)
- _items = asarray(items)
- with nogil:
- out = GetResultValue(
- CMapArray.FromArrays(_offsets.sp_array,
- _keys.sp_array,
- _items.sp_array, cpool))
- cdef Array result = pyarrow_wrap_array(out)
- result.validate()
- return result
- @property
- def keys(self):
- return pyarrow_wrap_array((<CMapArray*> self.ap).keys())
- @property
- def items(self):
- return pyarrow_wrap_array((<CMapArray*> self.ap).items())
- cdef class FixedSizeListArray(Array):
- """
- Concrete class for Arrow arrays of a fixed size list data type.
- """
- @staticmethod
- def from_arrays(values, int32_t list_size):
- """
- Construct FixedSizeListArray from array of values and a list length.
- Parameters
- ----------
- values : Array (any type)
- list_size : int
- The fixed length of the lists.
- Returns
- -------
- FixedSizeListArray
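- Examples
- --------
- A minimal sketch; len(values) must be a multiple of list_size:
- >>> import pyarrow as pa
- >>> pa.FixedSizeListArray.from_arrays([1, 2, 3, 4], 2).to_pylist()
- [[1, 2], [3, 4]]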
- """
- cdef:
- Array _values
- CResult[shared_ptr[CArray]] c_result
- _values = asarray(values)
- with nogil:
- c_result = CFixedSizeListArray.FromArrays(
- _values.sp_array, list_size)
- cdef Array result = pyarrow_wrap_array(GetResultValue(c_result))
- result.validate()
- return result
- @property
- def values(self):
- return self.flatten()
- def flatten(self):
- """
- Unnest this FixedSizeListArray by one level.
- Returns
- -------
- result : Array
- """
- cdef CFixedSizeListArray* arr = <CFixedSizeListArray*> self.ap
- return pyarrow_wrap_array(arr.values())
- cdef class UnionArray(Array):
- """
- Concrete class for Arrow arrays of a Union data type.
- """
- def child(self, int pos):
- import warnings
- warnings.warn("child is deprecated, use field", FutureWarning)
- return self.field(pos)
- def field(self, int pos):
- """
- Return the given child field as an individual array.
- For sparse unions, the returned array has its offset, length,
- and null count adjusted.
- For dense unions, the returned array is unchanged.
- """
- cdef shared_ptr[CArray] result
- result = (<CUnionArray*> self.ap).field(pos)
- if result != NULL:
- return pyarrow_wrap_array(result)
- raise KeyError("UnionArray does not have child {}".format(pos))
- @property
- def type_codes(self):
- """Get the type codes array."""
- buf = pyarrow_wrap_buffer((<CUnionArray*> self.ap).type_codes())
- return Array.from_buffers(int8(), len(self), [None, buf])
- @property
- def offsets(self):
- """
- Get the value offsets array (dense arrays only).
- Does not account for any slice offset.
- """
- if self.type.mode != "dense":
- raise ArrowTypeError("Can only get value offsets for dense arrays")
- cdef CDenseUnionArray* dense = <CDenseUnionArray*> self.ap
- buf = pyarrow_wrap_buffer(dense.value_offsets())
- return Array.from_buffers(int32(), len(self), [None, buf])
- @staticmethod
- def from_dense(Array types, Array value_offsets, list children,
- list field_names=None, list type_codes=None):
- """
- Construct a dense UnionArray from arrays of int8 types, int32 offsets,
- and child arrays.
- Parameters
- ----------
- types : Array (int8 type)
- value_offsets : Array (int32 type)
- children : list
- field_names : list
- type_codes : list
- Returns
- -------
- union_array : UnionArray
- """
- cdef:
- shared_ptr[CArray] out
- vector[shared_ptr[CArray]] c
- Array child
- vector[c_string] c_field_names
- vector[int8_t] c_type_codes
- for child in children:
- c.push_back(child.sp_array)
- if field_names is not None:
- for x in field_names:
- c_field_names.push_back(tobytes(x))
- if type_codes is not None:
- for x in type_codes:
- c_type_codes.push_back(x)
- with nogil:
- out = GetResultValue(CDenseUnionArray.Make(
- deref(types.ap), deref(value_offsets.ap), c, c_field_names,
- c_type_codes))
- cdef Array result = pyarrow_wrap_array(out)
- result.validate()
- return result
-
-     @staticmethod
-     def from_sparse(Array types, list children, list field_names=None,
-                     list type_codes=None):
-         """
-         Construct sparse UnionArray from arrays of int8 types and children
-         arrays.
-
-         Parameters
-         ----------
-         types : Array (int8 type)
-         children : list
-         field_names : list
-         type_codes : list
-
-         Returns
-         -------
-         union_array : UnionArray
-         """
-         cdef:
-             shared_ptr[CArray] out
-             vector[shared_ptr[CArray]] c
-             Array child
-             vector[c_string] c_field_names
-             vector[int8_t] c_type_codes
-
-         for child in children:
-             c.push_back(child.sp_array)
-         if field_names is not None:
-             for x in field_names:
-                 c_field_names.push_back(tobytes(x))
-         if type_codes is not None:
-             for x in type_codes:
-                 c_type_codes.push_back(x)
-
-         with nogil:
-             out = GetResultValue(CSparseUnionArray.Make(
-                 deref(types.ap), c, c_field_names, c_type_codes))
-
-         cdef Array result = pyarrow_wrap_array(out)
-         result.validate()
-         return result
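-
- # A sketch of building a sparse union, assuming pyarrow is imported as
- # `pa`. In a sparse union every child has the full array length and the
- # int8 types array selects which child supplies each slot.
- #
- #   import pyarrow as pa
- #   types = pa.array([0, 1, 0], type=pa.int8())
- #   children = [pa.array([1, 2, 3]), pa.array(['a', 'b', 'c'])]
- #   arr = pa.UnionArray.from_sparse(types, children, ['ints', 'strs'])
- #   arr.to_pylist()  # [1, 'b', 3]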
-
-
- cdef class StringArray(Array):
-     """
-     Concrete class for Arrow arrays of string (or utf8) data type.
-     """
-
-     @staticmethod
-     def from_buffers(int length, Buffer value_offsets, Buffer data,
-                      Buffer null_bitmap=None, int null_count=-1,
-                      int offset=0):
-         """
-         Construct a StringArray from value_offsets and data buffers.
-
-         If there are nulls in the data, also a null_bitmap and the matching
-         null_count must be passed.
-
-         Parameters
-         ----------
-         length : int
-         value_offsets : Buffer
-         data : Buffer
-         null_bitmap : Buffer, optional
-         null_count : int, default -1
-             The number of nulls; -1 means it is computed on demand.
-         offset : int, default 0
-
-         Returns
-         -------
-         string_array : StringArray
-         """
-         return Array.from_buffers(utf8(), length,
-                                   [null_bitmap, value_offsets, data],
-                                   null_count, offset)
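-
- # A minimal sketch of from_buffers, assuming pyarrow is imported as `pa`.
- # The offsets buffer holds length+1 int32 positions into the UTF-8 data
- # buffer; here they carve b'foobar' into 'foo' and 'bar'.
- #
- #   import pyarrow as pa
- #   offsets = pa.py_buffer(b'\x00\x00\x00\x00'
- #                          b'\x03\x00\x00\x00'
- #                          b'\x06\x00\x00\x00')  # [0, 3, 6], little-endian
- #   data = pa.py_buffer(b'foobar')
- #   arr = pa.StringArray.from_buffers(2, offsets, data)
- #   arr.to_pylist()  # ['foo', 'bar']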
-
-
- cdef class LargeStringArray(Array):
-     """
-     Concrete class for Arrow arrays of large string (or large_utf8)
-     data type.
-     """
-
-     @staticmethod
-     def from_buffers(int length, Buffer value_offsets, Buffer data,
-                      Buffer null_bitmap=None, int null_count=-1,
-                      int offset=0):
-         """
-         Construct a LargeStringArray from value_offsets and data buffers.
-
-         If there are nulls in the data, also a null_bitmap and the matching
-         null_count must be passed.
-
-         Parameters
-         ----------
-         length : int
-         value_offsets : Buffer
-             Buffer of int64 offsets (unlike StringArray's int32 offsets).
-         data : Buffer
-         null_bitmap : Buffer, optional
-         null_count : int, default -1
-             The number of nulls; -1 means it is computed on demand.
-         offset : int, default 0
-
-         Returns
-         -------
-         string_array : LargeStringArray
-         """
-         return Array.from_buffers(large_utf8(), length,
-                                   [null_bitmap, value_offsets, data],
-                                   null_count, offset)
-
-
- cdef class BinaryArray(Array):
-     """
-     Concrete class for Arrow arrays of variable-sized binary data type.
-     """
-
-     @property
-     def total_values_length(self):
-         """
-         The number of bytes from beginning to end of the data buffer addressed
-         by the offsets of this BinaryArray.
-         """
-         return (<CBinaryArray*> self.ap).total_values_length()
-
-
- cdef class LargeBinaryArray(Array):
-     """
-     Concrete class for Arrow arrays of large variable-sized binary data type.
-     """
-
-     @property
-     def total_values_length(self):
-         """
-         The number of bytes from beginning to end of the data buffer addressed
-         by the offsets of this LargeBinaryArray.
-         """
-         return (<CLargeBinaryArray*> self.ap).total_values_length()
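-
- # A sketch of total_values_length, assuming pyarrow is imported as `pa`.
- # It reflects only the bytes the current offsets address, so it shrinks
- # when the array is sliced.
- #
- #   import pyarrow as pa
- #   arr = pa.array([b'ab', b'cdef'], type=pa.binary())
- #   arr.total_values_length              # 6
- #   arr.slice(1, 1).total_values_length  # 4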
-
-
- cdef class DictionaryArray(Array):
-     """
-     Concrete class for dictionary-encoded Arrow arrays.
-     """
-
-     def dictionary_encode(self):
-         """Return the array itself, since it is already dictionary-encoded."""
-         return self
-
-     def dictionary_decode(self):
-         """
-         Decode the DictionaryArray to a plain Array, materializing the
-         dictionary value for each index.
-         """
-         return self.dictionary.take(self.indices)
-
-     @property
-     def dictionary(self):
-         """The dictionary (unique values) of this array."""
-         cdef CDictionaryArray* darr = <CDictionaryArray*>(self.ap)
-
-         if self._dictionary is None:
-             self._dictionary = pyarrow_wrap_array(darr.dictionary())
-
-         return self._dictionary
-
-     @property
-     def indices(self):
-         """The integer indices referencing the dictionary values."""
-         cdef CDictionaryArray* darr = <CDictionaryArray*>(self.ap)
-
-         if self._indices is None:
-             self._indices = pyarrow_wrap_array(darr.indices())
-
-         return self._indices
-
-     @staticmethod
-     def from_arrays(indices, dictionary, mask=None, bint ordered=False,
-                     bint from_pandas=False, bint safe=True,
-                     MemoryPool memory_pool=None):
-         """
-         Construct a DictionaryArray from indices and values.
-
-         Parameters
-         ----------
-         indices : pyarrow.Array, numpy.ndarray or pandas.Series, int type
-             Non-negative integers referencing the dictionary values by
-             zero-based index.
-         dictionary : pyarrow.Array, ndarray or pandas.Series
-             The array of values referenced by the indices.
-         mask : ndarray or pandas.Series, bool type
-             True values indicate that indices are actually null.
-         ordered : bool, default False
-             Set to True if the category values are ordered.
-         from_pandas : bool, default False
-             If True, the indices should be treated as though they originated
-             in a pandas.Categorical (null encoded as -1).
-         safe : bool, default True
-             If True, check that the dictionary indices are in range.
-         memory_pool : MemoryPool, default None
-             For memory allocations, if required, otherwise uses default pool.
-
-         Returns
-         -------
-         dict_array : DictionaryArray
-         """
-         cdef:
-             Array _indices, _dictionary
-             shared_ptr[CDataType] c_type
-             shared_ptr[CArray] c_result
-
-         if isinstance(indices, Array):
-             if mask is not None:
-                 raise NotImplementedError(
-                     "mask not implemented with Arrow array inputs yet")
-             _indices = indices
-         else:
-             if from_pandas:
-                 _indices = _codes_to_indices(indices, mask, None, memory_pool)
-             else:
-                 _indices = array(indices, mask=mask, memory_pool=memory_pool)
-
-         if isinstance(dictionary, Array):
-             _dictionary = dictionary
-         else:
-             _dictionary = array(dictionary, memory_pool=memory_pool)
-
-         if not isinstance(_indices, IntegerArray):
-             raise ValueError('Indices must be integer type')
-
-         cdef c_bool c_ordered = ordered
-
-         c_type.reset(new CDictionaryType(_indices.type.sp_type,
-                                          _dictionary.sp_array.get().type(),
-                                          c_ordered))
-
-         if safe:
-             with nogil:
-                 c_result = GetResultValue(
-                     CDictionaryArray.FromArrays(c_type, _indices.sp_array,
-                                                 _dictionary.sp_array))
-         else:
-             c_result.reset(new CDictionaryArray(c_type, _indices.sp_array,
-                                                 _dictionary.sp_array))
-
-         cdef Array result = pyarrow_wrap_array(c_result)
-         result.validate()
-         return result
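-
- # A brief sketch of DictionaryArray.from_arrays, assuming pyarrow is
- # imported as `pa`. Index 1 appears twice, so both slots decode to 'b'.
- #
- #   import pyarrow as pa
- #   indices = pa.array([0, 1, 1, None], type=pa.int8())
- #   dictionary = pa.array(['a', 'b'])
- #   arr = pa.DictionaryArray.from_arrays(indices, dictionary)
- #   arr.dictionary_decode().to_pylist()  # ['a', 'b', 'b', None]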
-
-
- cdef class StructArray(Array):
-     """
-     Concrete class for Arrow arrays of a struct data type.
-     """
-
-     def field(self, index):
-         """
-         Retrieve the child array belonging to field.
-
-         Parameters
-         ----------
-         index : Union[int, str]
-             Index / position or name of the field.
-
-         Returns
-         -------
-         result : Array
-         """
-         cdef:
-             CStructArray* arr = <CStructArray*> self.ap
-             shared_ptr[CArray] child
-
-         if isinstance(index, (bytes, str)):
-             child = arr.GetFieldByName(tobytes(index))
-             if child == nullptr:
-                 raise KeyError(index)
-         elif isinstance(index, int):
-             child = arr.field(
-                 <int>_normalize_index(index, self.ap.num_fields()))
-         else:
-             raise TypeError('Expected integer or string index')
-
-         return pyarrow_wrap_array(child)
-
-     def flatten(self, MemoryPool memory_pool=None):
-         """
-         Return one individual array for each field in the struct.
-
-         Parameters
-         ----------
-         memory_pool : MemoryPool, default None
-             For memory allocations, if required, otherwise use default pool.
-
-         Returns
-         -------
-         result : List[Array]
-         """
-         cdef:
-             vector[shared_ptr[CArray]] arrays
-             CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool)
-             CStructArray* sarr = <CStructArray*> self.ap
-
-         with nogil:
-             arrays = GetResultValue(sarr.Flatten(pool))
-
-         return [pyarrow_wrap_array(arr) for arr in arrays]
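-
-     # field() and flatten() in action (a sketch, assuming pyarrow is
-     # imported as `pa`); flatten() returns one child array per struct field.
-     #
-     #   import pyarrow as pa
-     #   arr = pa.array([{'x': 1, 'y': 'a'}, {'x': 2, 'y': 'b'}])
-     #   arr.field('x').to_pylist()              # [1, 2]
-     #   [a.to_pylist() for a in arr.flatten()]  # [[1, 2], ['a', 'b']]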
-
-     @staticmethod
-     def from_arrays(arrays, names=None, fields=None, mask=None,
-                     memory_pool=None):
-         """
-         Construct StructArray from collection of arrays representing
-         each field in the struct.
-
-         Either field names or field instances must be passed.
-
-         Parameters
-         ----------
-         arrays : sequence of Array
-         names : List[str] (optional)
-             Field names for each struct child.
-         fields : List[Field] (optional)
-             Field instances for each struct child.
-         mask : pyarrow.Array[bool] (optional)
-             Indicate which values are null (True) or not null (False).
-         memory_pool : MemoryPool (optional)
-             For memory allocations, if required, otherwise uses default pool.
-
-         Returns
-         -------
-         result : StructArray
-         """
-         cdef:
-             shared_ptr[CArray] c_array
-             shared_ptr[CBuffer] c_mask
-             vector[shared_ptr[CArray]] c_arrays
-             vector[c_string] c_names
-             vector[shared_ptr[CField]] c_fields
-             CResult[shared_ptr[CArray]] c_result
-             ssize_t num_arrays
-             ssize_t length
-             ssize_t i
-             Field py_field
-             DataType struct_type
-
-         if names is None and fields is None:
-             raise ValueError('Must pass either names or fields')
-         if names is not None and fields is not None:
-             raise ValueError('Must pass either names or fields, not both')
-
-         if mask is None:
-             c_mask = shared_ptr[CBuffer]()
-         elif isinstance(mask, Array):
-             if mask.type.id != Type_BOOL:
-                 raise ValueError('Mask must be a pyarrow.Array of type bool')
-             if mask.null_count != 0:
-                 raise ValueError('Mask must not contain nulls')
-             inverted_mask = _pc().invert(mask, memory_pool=memory_pool)
-             c_mask = pyarrow_unwrap_buffer(inverted_mask.buffers()[1])
-         else:
-             raise ValueError('Mask must be a pyarrow.Array of type bool')
-
-         arrays = [asarray(x) for x in arrays]
-         for arr in arrays:
-             c_array = pyarrow_unwrap_array(arr)
-             if c_array == nullptr:
-                 raise TypeError(f"Expected Array, got {arr.__class__}")
-             c_arrays.push_back(c_array)
-         if names is not None:
-             for name in names:
-                 c_names.push_back(tobytes(name))
-         else:
-             for item in fields:
-                 if isinstance(item, tuple):
-                     py_field = field(*item)
-                 else:
-                     py_field = item
-                 c_fields.push_back(py_field.sp_field)
-
-         if (c_arrays.size() == 0 and c_names.size() == 0 and
-                 c_fields.size() == 0):
-             # The C++ side doesn't allow this
-             return array([], struct([]))
-
-         if names is not None:
-             # XXX Cannot pass "nullptr" for a shared_ptr<T> argument:
-             # https://github.com/cython/cython/issues/3020
-             c_result = CStructArray.MakeFromFieldNames(
-                 c_arrays, c_names, c_mask, -1, 0)
-         else:
-             c_result = CStructArray.MakeFromFields(
-                 c_arrays, c_fields, c_mask, -1, 0)
-         cdef Array result = pyarrow_wrap_array(GetResultValue(c_result))
-         result.validate()
-         return result
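-
- # A sketch of from_arrays with a mask, assuming pyarrow is imported as
- # `pa`. The mask marks the second struct as null.
- #
- #   import pyarrow as pa
- #   arr = pa.StructArray.from_arrays(
- #       [pa.array([1, 2]), pa.array(['a', 'b'])],
- #       names=['x', 'y'],
- #       mask=pa.array([False, True]))
- #   arr.to_pylist()  # [{'x': 1, 'y': 'a'}, None]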
-
-
- cdef class ExtensionArray(Array):
-     """
-     Concrete class for Arrow extension arrays.
-     """
-
-     @property
-     def storage(self):
-         """The underlying storage array."""
-         cdef:
-             CExtensionArray* ext_array = <CExtensionArray*>(self.ap)
-
-         return pyarrow_wrap_array(ext_array.storage())
-
-     @staticmethod
-     def from_storage(BaseExtensionType typ, Array storage):
-         """
-         Construct ExtensionArray from type and storage array.
-
-         Parameters
-         ----------
-         typ : BaseExtensionType
-             The extension type for the result array.
-         storage : Array
-             The underlying storage for the result array; its type must
-             match ``typ.storage_type``.
-
-         Returns
-         -------
-         ext_array : ExtensionArray
-         """
-         cdef:
-             shared_ptr[CExtensionArray] ext_array
-
-         if storage.type != typ.storage_type:
-             raise TypeError("Incompatible storage type {0} "
-                             "for extension type {1}".format(storage.type, typ))
-
-         ext_array = make_shared[CExtensionArray](typ.sp_type, storage.sp_array)
-         cdef Array result = pyarrow_wrap_array(<shared_ptr[CArray]> ext_array)
-         result.validate()
-         return result
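-
-     # A hedged sketch of from_storage using a toy extension type named
-     # UuidType (hypothetical, defined here only for illustration); assumes
-     # pyarrow is imported as `pa` and provides pa.PyExtensionType.
-     #
-     #   import pyarrow as pa
-     #
-     #   class UuidType(pa.PyExtensionType):
-     #       def __init__(self):
-     #           pa.PyExtensionType.__init__(self, pa.binary(16))
-     #
-     #       def __reduce__(self):
-     #           return UuidType, ()
-     #
-     #   storage = pa.array([b'0' * 16, b'1' * 16], pa.binary(16))
-     #   arr = pa.ExtensionArray.from_storage(UuidType(), storage)
-     #   arr.storage.type  # fixed_size_binary[16]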
-
-     def _to_pandas(self, options, **kwargs):
-         pandas_dtype = None
-         try:
-             pandas_dtype = self.type.to_pandas_dtype()
-         except NotImplementedError:
-             pass
-
-         # pandas ExtensionDtype that implements conversion from pyarrow
-         if hasattr(pandas_dtype, '__from_arrow__'):
-             arr = pandas_dtype.__from_arrow__(self)
-             return pandas_api.series(arr)
-
-         # otherwise convert the storage array with the base implementation
-         return Array._to_pandas(self.storage, options, **kwargs)
-
-     def to_numpy(self, **kwargs):
-         """
-         Convert extension array to a numpy ndarray.
-
-         See Also
-         --------
-         Array.to_numpy
-         """
-         return self.storage.to_numpy(**kwargs)
-
-
- cdef dict _array_classes = {
-     _Type_NA: NullArray,
-     _Type_BOOL: BooleanArray,
-     _Type_UINT8: UInt8Array,
-     _Type_UINT16: UInt16Array,
-     _Type_UINT32: UInt32Array,
-     _Type_UINT64: UInt64Array,
-     _Type_INT8: Int8Array,
-     _Type_INT16: Int16Array,
-     _Type_INT32: Int32Array,
-     _Type_INT64: Int64Array,
-     _Type_DATE32: Date32Array,
-     _Type_DATE64: Date64Array,
-     _Type_TIMESTAMP: TimestampArray,
-     _Type_TIME32: Time32Array,
-     _Type_TIME64: Time64Array,
-     _Type_DURATION: DurationArray,
-     _Type_HALF_FLOAT: HalfFloatArray,
-     _Type_FLOAT: FloatArray,
-     _Type_DOUBLE: DoubleArray,
-     _Type_LIST: ListArray,
-     _Type_LARGE_LIST: LargeListArray,
-     _Type_MAP: MapArray,
-     _Type_FIXED_SIZE_LIST: FixedSizeListArray,
-     _Type_SPARSE_UNION: UnionArray,
-     _Type_DENSE_UNION: UnionArray,
-     _Type_BINARY: BinaryArray,
-     _Type_STRING: StringArray,
-     _Type_LARGE_BINARY: LargeBinaryArray,
-     _Type_LARGE_STRING: LargeStringArray,
-     _Type_DICTIONARY: DictionaryArray,
-     _Type_FIXED_SIZE_BINARY: FixedSizeBinaryArray,
-     _Type_DECIMAL128: Decimal128Array,
-     _Type_DECIMAL256: Decimal256Array,
-     _Type_STRUCT: StructArray,
-     _Type_EXTENSION: ExtensionArray,
- }
-
-
- cdef object get_array_class_from_type(
-         const shared_ptr[CDataType]& sp_data_type):
-     cdef CDataType* data_type = sp_data_type.get()
-     if data_type == NULL:
-         raise ValueError('Array data type was NULL')
-
-     if data_type.id() == _Type_EXTENSION:
-         py_ext_data_type = pyarrow_wrap_data_type(sp_data_type)
-         return py_ext_data_type.__arrow_ext_class__()
-     else:
-         return _array_classes[data_type.id()]
-
-
- cdef object get_values(object obj, bint* is_series):
-     if pandas_api.is_series(obj) or pandas_api.is_index(obj):
-         result = pandas_api.get_values(obj)
-         is_series[0] = True
-     elif isinstance(obj, np.ndarray):
-         result = obj
-         is_series[0] = False
-     else:
-         result = pandas_api.series(obj).values
-         is_series[0] = False
-
-     return result
-
-
- def concat_arrays(arrays, MemoryPool memory_pool=None):
-     """
-     Concatenate the given arrays.
-
-     The contents of the input arrays are copied into the returned array.
-
-     Parameters
-     ----------
-     arrays : iterable of pyarrow.Array
-         Arrays to concatenate, must be identically typed.
-     memory_pool : MemoryPool, default None
-         For memory allocations. If None, the default pool is used.
-
-     Raises
-     ------
-     ArrowInvalid
-         If not all of the arrays have the same type.
-     """
-     cdef:
-         vector[shared_ptr[CArray]] c_arrays
-         shared_ptr[CArray] c_concatenated
-         CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool)
-
-     for array in arrays:
-         if not isinstance(array, Array):
-             raise TypeError("Iterable should contain Array objects, "
-                             "got {0} instead".format(type(array)))
-         c_arrays.push_back(pyarrow_unwrap_array(array))
-
-     with nogil:
-         c_concatenated = GetResultValue(Concatenate(c_arrays, pool))
-
-     return pyarrow_wrap_array(c_concatenated)
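-
- # A one-line sketch of concat_arrays, assuming pyarrow is imported as
- # `pa`; the inputs must share a single type or ArrowInvalid is raised.
- #
- #   import pyarrow as pa
- #   pa.concat_arrays([pa.array([1, 2]), pa.array([3])]).to_pylist()
- #   # [1, 2, 3]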
-
-
- def _empty_array(DataType type):
-     """
-     Create empty array of the given type.
-     """
-     if type.id == Type_DICTIONARY:
-         arr = DictionaryArray.from_arrays(
-             _empty_array(type.index_type), _empty_array(type.value_type),
-             ordered=type.ordered)
-     else:
-         arr = array([], type=type)
-     return arr