serialization.pxi 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556
  1. # Licensed to the Apache Software Foundation (ASF) under one
  2. # or more contributor license agreements. See the NOTICE file
  3. # distributed with this work for additional information
  4. # regarding copyright ownership. The ASF licenses this file
  5. # to you under the Apache License, Version 2.0 (the
  6. # "License"); you may not use this file except in compliance
  7. # with the License. You may obtain a copy of the License at
  8. #
  9. # http://www.apache.org/licenses/LICENSE-2.0
  10. #
  11. # Unless required by applicable law or agreed to in writing,
  12. # software distributed under the License is distributed on an
  13. # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  14. # KIND, either express or implied. See the License for the
  15. # specific language governing permissions and limitations
  16. # under the License.
  17. from cpython.ref cimport PyObject
  18. import warnings
  19. def _deprecate_serialization(name):
  20. msg = (
  21. "'pyarrow.{}' is deprecated as of 2.0.0 and will be removed in a "
  22. "future version. Use pickle or the pyarrow IPC functionality instead."
  23. ).format(name)
  24. warnings.warn(msg, FutureWarning, stacklevel=3)
  25. def is_named_tuple(cls):
  26. """
  27. Return True if cls is a namedtuple and False otherwise.
  28. """
  29. b = cls.__bases__
  30. if len(b) != 1 or b[0] != tuple:
  31. return False
  32. f = getattr(cls, "_fields", None)
  33. if not isinstance(f, tuple):
  34. return False
  35. return all(isinstance(n, str) for n in f)
  36. class SerializationCallbackError(ArrowSerializationError):
  37. def __init__(self, message, example_object):
  38. ArrowSerializationError.__init__(self, message)
  39. self.example_object = example_object
  40. class DeserializationCallbackError(ArrowSerializationError):
  41. def __init__(self, message, type_id):
  42. ArrowSerializationError.__init__(self, message)
  43. self.type_id = type_id
  44. cdef class SerializationContext(_Weakrefable):
  45. cdef:
  46. object type_to_type_id
  47. object whitelisted_types
  48. object types_to_pickle
  49. object custom_serializers
  50. object custom_deserializers
  51. object pickle_serializer
  52. object pickle_deserializer
  53. def __init__(self):
  54. # Types with special serialization handlers
  55. self.type_to_type_id = dict()
  56. self.whitelisted_types = dict()
  57. self.types_to_pickle = set()
  58. self.custom_serializers = dict()
  59. self.custom_deserializers = dict()
  60. self.pickle_serializer = pickle.dumps
  61. self.pickle_deserializer = pickle.loads
  62. def set_pickle(self, serializer, deserializer):
  63. """
  64. Set the serializer and deserializer to use for objects that are to be
  65. pickled.
  66. Parameters
  67. ----------
  68. serializer : callable
  69. The serializer to use (e.g., pickle.dumps or cloudpickle.dumps).
  70. deserializer : callable
  71. The deserializer to use (e.g., pickle.dumps or cloudpickle.dumps).
  72. """
  73. self.pickle_serializer = serializer
  74. self.pickle_deserializer = deserializer
  75. def clone(self):
  76. """
  77. Return copy of this SerializationContext.
  78. Returns
  79. -------
  80. clone : SerializationContext
  81. """
  82. result = SerializationContext()
  83. result.type_to_type_id = self.type_to_type_id.copy()
  84. result.whitelisted_types = self.whitelisted_types.copy()
  85. result.types_to_pickle = self.types_to_pickle.copy()
  86. result.custom_serializers = self.custom_serializers.copy()
  87. result.custom_deserializers = self.custom_deserializers.copy()
  88. result.pickle_serializer = self.pickle_serializer
  89. result.pickle_deserializer = self.pickle_deserializer
  90. return result
  91. def register_type(self, type_, type_id, pickle=False,
  92. custom_serializer=None, custom_deserializer=None):
  93. r"""
  94. EXPERIMENTAL: Add type to the list of types we can serialize.
  95. Parameters
  96. ----------
  97. type\_ : type
  98. The type that we can serialize.
  99. type_id : string
  100. A string used to identify the type.
  101. pickle : bool
  102. True if the serialization should be done with pickle.
  103. False if it should be done efficiently with Arrow.
  104. custom_serializer : callable
  105. This argument is optional, but can be provided to
  106. serialize objects of the class in a particular way.
  107. custom_deserializer : callable
  108. This argument is optional, but can be provided to
  109. deserialize objects of the class in a particular way.
  110. """
  111. if not isinstance(type_id, str):
  112. raise TypeError("The type_id argument must be a string. The value "
  113. "passed in has type {}.".format(type(type_id)))
  114. self.type_to_type_id[type_] = type_id
  115. self.whitelisted_types[type_id] = type_
  116. if pickle:
  117. self.types_to_pickle.add(type_id)
  118. if custom_serializer is not None:
  119. self.custom_serializers[type_id] = custom_serializer
  120. self.custom_deserializers[type_id] = custom_deserializer
  121. def _serialize_callback(self, obj):
  122. found = False
  123. for type_ in type(obj).__mro__:
  124. if type_ in self.type_to_type_id:
  125. found = True
  126. break
  127. if not found:
  128. raise SerializationCallbackError(
  129. "pyarrow does not know how to "
  130. "serialize objects of type {}.".format(type(obj)), obj
  131. )
  132. # use the closest match to type(obj)
  133. type_id = self.type_to_type_id[type_]
  134. if type_id in self.types_to_pickle:
  135. serialized_obj = {"data": self.pickle_serializer(obj),
  136. "pickle": True}
  137. elif type_id in self.custom_serializers:
  138. serialized_obj = {"data": self.custom_serializers[type_id](obj)}
  139. else:
  140. if is_named_tuple(type_):
  141. serialized_obj = {}
  142. serialized_obj["_pa_getnewargs_"] = obj.__getnewargs__()
  143. elif hasattr(obj, "__dict__"):
  144. serialized_obj = obj.__dict__
  145. else:
  146. msg = "We do not know how to serialize " \
  147. "the object '{}'".format(obj)
  148. raise SerializationCallbackError(msg, obj)
  149. return dict(serialized_obj, **{"_pytype_": type_id})
  150. def _deserialize_callback(self, serialized_obj):
  151. type_id = serialized_obj["_pytype_"]
  152. if isinstance(type_id, bytes):
  153. # ARROW-4675: Python 2 serialized, read in Python 3
  154. type_id = frombytes(type_id)
  155. if "pickle" in serialized_obj:
  156. # The object was pickled, so unpickle it.
  157. obj = self.pickle_deserializer(serialized_obj["data"])
  158. else:
  159. assert type_id not in self.types_to_pickle
  160. if type_id not in self.whitelisted_types:
  161. msg = "Type ID " + type_id + " not registered in " \
  162. "deserialization callback"
  163. raise DeserializationCallbackError(msg, type_id)
  164. type_ = self.whitelisted_types[type_id]
  165. if type_id in self.custom_deserializers:
  166. obj = self.custom_deserializers[type_id](
  167. serialized_obj["data"])
  168. else:
  169. # In this case, serialized_obj should just be
  170. # the __dict__ field.
  171. if "_pa_getnewargs_" in serialized_obj:
  172. obj = type_.__new__(
  173. type_, *serialized_obj["_pa_getnewargs_"])
  174. else:
  175. obj = type_.__new__(type_)
  176. serialized_obj.pop("_pytype_")
  177. obj.__dict__.update(serialized_obj)
  178. return obj
  179. def serialize(self, obj):
  180. """
  181. Call pyarrow.serialize and pass this SerializationContext.
  182. """
  183. return serialize(obj, context=self)
  184. def serialize_to(self, object value, sink):
  185. """
  186. Call pyarrow.serialize_to and pass this SerializationContext.
  187. """
  188. return serialize_to(value, sink, context=self)
  189. def deserialize(self, what):
  190. """
  191. Call pyarrow.deserialize and pass this SerializationContext.
  192. """
  193. return deserialize(what, context=self)
  194. def deserialize_components(self, what):
  195. """
  196. Call pyarrow.deserialize_components and pass this SerializationContext.
  197. """
  198. return deserialize_components(what, context=self)
# Module-wide context returned by _get_default_context(); it starts empty
# and has the default handlers registered on first use.
_default_serialization_context = SerializationContext()
# Flipped to True by _get_default_context() once the default handlers have
# been registered.
_default_context_initialized = False
  201. def _get_default_context():
  202. global _default_context_initialized
  203. from pyarrow.serialization import _register_default_serialization_handlers
  204. if not _default_context_initialized:
  205. _register_default_serialization_handlers(
  206. _default_serialization_context)
  207. _default_context_initialized = True
  208. return _default_serialization_context
cdef class SerializedPyObject(_Weakrefable):
    """
    Arrow-serialized representation of Python object.
    """
    cdef:
        # Underlying C++ serialized payload.
        CSerializedPyObject data

    cdef readonly:
        # Object handed to DeserializeObject as the base in deserialize().
        object base

    @property
    def total_bytes(self):
        """
        Number of bytes reported after writing the data to a mock stream.
        """
        cdef CMockOutputStream mock_stream
        with nogil:
            check_status(self.data.WriteTo(&mock_stream))

        return mock_stream.GetExtentBytesWritten()

    def write_to(self, sink):
        """
        Write serialized object to a sink.

        Parameters
        ----------
        sink : object
            Destination passed to ``get_writer`` to obtain an output stream.
        """
        cdef shared_ptr[COutputStream] stream
        get_writer(sink, &stream)
        self._write_to(stream.get())

    cdef _write_to(self, COutputStream* stream):
        # Write the payload to the C++ stream with the GIL released.
        with nogil:
            check_status(self.data.WriteTo(stream))

    def deserialize(self, SerializationContext context=None):
        """
        Convert back to Python object.

        Parameters
        ----------
        context : SerializationContext, default None
            Context to use; the default context is used when None.

        Returns
        -------
        object
            The first element of the deserialized result.
        """
        cdef PyObject* result

        if context is None:
            context = _get_default_context()

        with nogil:
            check_status(DeserializeObject(context, self.data,
                                           <PyObject*> self.base, &result))

        # PyObject_to_object is necessary to avoid a memory leak;
        # also unpack the list the object was wrapped in by _serialize
        return PyObject_to_object(result)[0]

    def to_buffer(self, nthreads=1):
        """
        Write serialized data as Buffer.

        Parameters
        ----------
        nthreads : int, default 1
            When greater than 1, passed to ``set_memcopy_threads`` on the
            writer.

        Returns
        -------
        output : Buffer
            Buffer of ``total_bytes`` bytes holding the serialized data.
        """
        cdef Buffer output = allocate_buffer(self.total_bytes)
        sink = FixedSizeBufferWriter(output)
        if nthreads > 1:
            sink.set_memcopy_threads(nthreads)
        self.write_to(sink)
        return output

    @staticmethod
    def from_components(components):
        """
        Reconstruct SerializedPyObject from output of
        SerializedPyObject.to_components.

        Parameters
        ----------
        components : dict
            Must provide the keys 'num_tensors', 'num_ndarrays',
            'num_buffers', 'data' (a list) and 'num_sparse_tensors' (a
            mapping with the keys 'coo', 'csr', 'csc', 'csf' and 'ndim_csf').

        Returns
        -------
        result : SerializedPyObject
        """
        cdef:
            int num_tensors = components['num_tensors']
            int num_ndarrays = components['num_ndarrays']
            int num_buffers = components['num_buffers']
            list buffers = components['data']
            SparseTensorCounts num_sparse_tensors = SparseTensorCounts()
            SerializedPyObject result = SerializedPyObject()

        num_sparse_tensors.coo = components['num_sparse_tensors']['coo']
        num_sparse_tensors.csr = components['num_sparse_tensors']['csr']
        num_sparse_tensors.csc = components['num_sparse_tensors']['csc']
        num_sparse_tensors.csf = components['num_sparse_tensors']['csf']
        num_sparse_tensors.ndim_csf = \
            components['num_sparse_tensors']['ndim_csf']

        with nogil:
            check_status(GetSerializedFromComponents(num_tensors,
                                                     num_sparse_tensors,
                                                     num_ndarrays,
                                                     num_buffers,
                                                     buffers, &result.data))

        return result

    def to_components(self, memory_pool=None):
        """
        Return the decomposed dict representation of the serialized object
        containing a collection of Buffer objects which maximize opportunities
        for zero-copy.

        Parameters
        ----------
        memory_pool : MemoryPool default None
            Pool to use for necessary allocations.

        Returns
        -------
        components : dict
            Representation accepted by SerializedPyObject.from_components.
        """
        cdef PyObject* result
        cdef CMemoryPool* c_pool = maybe_unbox_memory_pool(memory_pool)

        with nogil:
            check_status(self.data.GetComponents(c_pool, &result))

        return PyObject_to_object(result)
  298. def serialize(object value, SerializationContext context=None):
  299. """
  300. DEPRECATED: Serialize a general Python sequence for transient storage
  301. and transport.
  302. .. deprecated:: 2.0
  303. The custom serialization functionality is deprecated in pyarrow 2.0,
  304. and will be removed in a future version. Use the standard library
  305. ``pickle`` or the IPC functionality of pyarrow (see :ref:`ipc` for
  306. more).
  307. Notes
  308. -----
  309. This function produces data that is incompatible with the standard
  310. Arrow IPC binary protocol, i.e. it cannot be used with ipc.open_stream or
  311. ipc.open_file. You can use deserialize, deserialize_from, or
  312. deserialize_components to read it.
  313. Parameters
  314. ----------
  315. value: object
  316. Python object for the sequence that is to be serialized.
  317. context : SerializationContext
  318. Custom serialization and deserialization context, uses a default
  319. context with some standard type handlers if not specified.
  320. Returns
  321. -------
  322. serialized : SerializedPyObject
  323. """
  324. _deprecate_serialization("serialize")
  325. return _serialize(value, context)
def _serialize(object value, SerializationContext context=None):
    """
    Serialize ``value`` without emitting a deprecation warning.

    Parameters
    ----------
    value : object
        Python object to serialize.
    context : SerializationContext, default None
        Context to use; the default context is used when None.

    Returns
    -------
    serialized : SerializedPyObject
    """
    cdef SerializedPyObject serialized = SerializedPyObject()
    # The value is wrapped in a one-element list;
    # SerializedPyObject.deserialize unwraps it again with [0].
    wrapped_value = [value]

    if context is None:
        context = _get_default_context()

    with nogil:
        check_status(SerializeObject(context, wrapped_value, &serialized.data))
    return serialized
  334. def serialize_to(object value, sink, SerializationContext context=None):
  335. """
  336. DEPRECATED: Serialize a Python sequence to a file.
  337. .. deprecated:: 2.0
  338. The custom serialization functionality is deprecated in pyarrow 2.0,
  339. and will be removed in a future version. Use the standard library
  340. ``pickle`` or the IPC functionality of pyarrow (see :ref:`ipc` for
  341. more).
  342. Parameters
  343. ----------
  344. value: object
  345. Python object for the sequence that is to be serialized.
  346. sink: NativeFile or file-like
  347. File the sequence will be written to.
  348. context : SerializationContext
  349. Custom serialization and deserialization context, uses a default
  350. context with some standard type handlers if not specified.
  351. """
  352. _deprecate_serialization("serialize_to")
  353. serialized = _serialize(value, context)
  354. serialized.write_to(sink)
  355. def read_serialized(source, base=None):
  356. """
  357. DEPRECATED: Read serialized Python sequence from file-like object.
  358. .. deprecated:: 2.0
  359. The custom serialization functionality is deprecated in pyarrow 2.0,
  360. and will be removed in a future version. Use the standard library
  361. ``pickle`` or the IPC functionality of pyarrow (see :ref:`ipc` for
  362. more).
  363. Parameters
  364. ----------
  365. source: NativeFile
  366. File to read the sequence from.
  367. base: object
  368. This object will be the base object of all the numpy arrays
  369. contained in the sequence.
  370. Returns
  371. -------
  372. serialized : the serialized data
  373. """
  374. _deprecate_serialization("read_serialized")
  375. return _read_serialized(source, base=base)
def _read_serialized(source, base=None):
    """
    Read a SerializedPyObject from ``source`` without emitting a deprecation
    warning.

    Parameters
    ----------
    source : object
        Passed to ``get_reader`` to obtain a random access file.
    base : object, default None
        Stored on the result as its ``base`` attribute.

    Returns
    -------
    serialized : SerializedPyObject
    """
    cdef shared_ptr[CRandomAccessFile] stream
    get_reader(source, True, &stream)

    cdef SerializedPyObject serialized = SerializedPyObject()
    serialized.base = base
    with nogil:
        check_status(ReadSerializedObject(stream.get(), &serialized.data))

    return serialized
  384. def deserialize_from(source, object base, SerializationContext context=None):
  385. """
  386. DEPRECATED: Deserialize a Python sequence from a file.
  387. .. deprecated:: 2.0
  388. The custom serialization functionality is deprecated in pyarrow 2.0,
  389. and will be removed in a future version. Use the standard library
  390. ``pickle`` or the IPC functionality of pyarrow (see :ref:`ipc` for
  391. more).
  392. This only can interact with data produced by pyarrow.serialize or
  393. pyarrow.serialize_to.
  394. Parameters
  395. ----------
  396. source: NativeFile
  397. File to read the sequence from.
  398. base: object
  399. This object will be the base object of all the numpy arrays
  400. contained in the sequence.
  401. context : SerializationContext
  402. Custom serialization and deserialization context.
  403. Returns
  404. -------
  405. object
  406. Python object for the deserialized sequence.
  407. """
  408. _deprecate_serialization("deserialize_from")
  409. serialized = _read_serialized(source, base=base)
  410. return serialized.deserialize(context)
  411. def deserialize_components(components, SerializationContext context=None):
  412. """
  413. DEPRECATED: Reconstruct Python object from output of
  414. SerializedPyObject.to_components.
  415. .. deprecated:: 2.0
  416. The custom serialization functionality is deprecated in pyarrow 2.0,
  417. and will be removed in a future version. Use the standard library
  418. ``pickle`` or the IPC functionality of pyarrow (see :ref:`ipc` for
  419. more).
  420. Parameters
  421. ----------
  422. components : dict
  423. Output of SerializedPyObject.to_components
  424. context : SerializationContext, default None
  425. Returns
  426. -------
  427. object : the Python object that was originally serialized
  428. """
  429. _deprecate_serialization("deserialize_components")
  430. serialized = SerializedPyObject.from_components(components)
  431. return serialized.deserialize(context)
  432. def deserialize(obj, SerializationContext context=None):
  433. """
  434. DEPRECATED: Deserialize Python object from Buffer or other Python
  435. object supporting the buffer protocol.
  436. .. deprecated:: 2.0
  437. The custom serialization functionality is deprecated in pyarrow 2.0,
  438. and will be removed in a future version. Use the standard library
  439. ``pickle`` or the IPC functionality of pyarrow (see :ref:`ipc` for
  440. more).
  441. This only can interact with data produced by pyarrow.serialize or
  442. pyarrow.serialize_to.
  443. Parameters
  444. ----------
  445. obj : pyarrow.Buffer or Python object supporting buffer protocol
  446. context : SerializationContext
  447. Custom serialization and deserialization context.
  448. Returns
  449. -------
  450. deserialized : object
  451. """
  452. _deprecate_serialization("deserialize")
  453. return _deserialize(obj, context=context)
  454. def _deserialize(obj, SerializationContext context=None):
  455. source = BufferReader(obj)
  456. serialized = _read_serialized(source, base=obj)
  457. return serialized.deserialize(context)