_json.pyx 8.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248
  1. # Licensed to the Apache Software Foundation (ASF) under one
  2. # or more contributor license agreements. See the NOTICE file
  3. # distributed with this work for additional information
  4. # regarding copyright ownership. The ASF licenses this file
  5. # to you under the Apache License, Version 2.0 (the
  6. # "License"); you may not use this file except in compliance
  7. # with the License. You may obtain a copy of the License at
  8. #
  9. # http://www.apache.org/licenses/LICENSE-2.0
  10. #
  11. # Unless required by applicable law or agreed to in writing,
  12. # software distributed under the License is distributed on an
  13. # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  14. # KIND, either express or implied. See the License for the
  15. # specific language governing permissions and limitations
  16. # under the License.
  17. # cython: profile=False
  18. # distutils: language = c++
  19. # cython: language_level = 3
  20. from pyarrow.includes.common cimport *
  21. from pyarrow.includes.libarrow cimport *
  22. from pyarrow.lib cimport (check_status, _Weakrefable, Field, MemoryPool,
  23. ensure_type, maybe_unbox_memory_pool,
  24. get_input_stream, pyarrow_wrap_table,
  25. pyarrow_wrap_data_type, pyarrow_unwrap_data_type,
  26. pyarrow_wrap_schema, pyarrow_unwrap_schema)
  27. cdef class ReadOptions(_Weakrefable):
  28. """
  29. Options for reading JSON files.
  30. Parameters
  31. ----------
  32. use_threads : bool, optional (default True)
  33. Whether to use multiple threads to accelerate reading
  34. block_size : int, optional
  35. How much bytes to process at a time from the input stream.
  36. This will determine multi-threading granularity as well as
  37. the size of individual chunks in the Table.
  38. """
  39. cdef:
  40. CJSONReadOptions options
  41. # Avoid mistakingly creating attributes
  42. __slots__ = ()
  43. def __init__(self, use_threads=None, block_size=None):
  44. self.options = CJSONReadOptions.Defaults()
  45. if use_threads is not None:
  46. self.use_threads = use_threads
  47. if block_size is not None:
  48. self.block_size = block_size
  49. @property
  50. def use_threads(self):
  51. """
  52. Whether to use multiple threads to accelerate reading.
  53. """
  54. return self.options.use_threads
  55. @use_threads.setter
  56. def use_threads(self, value):
  57. self.options.use_threads = value
  58. @property
  59. def block_size(self):
  60. """
  61. How much bytes to process at a time from the input stream.
  62. This will determine multi-threading granularity as well as the size of
  63. individual chunks in the Table.
  64. """
  65. return self.options.block_size
  66. @block_size.setter
  67. def block_size(self, value):
  68. self.options.block_size = value
  69. cdef class ParseOptions(_Weakrefable):
  70. """
  71. Options for parsing JSON files.
  72. Parameters
  73. ----------
  74. explicit_schema: Schema, optional (default None)
  75. Optional explicit schema (no type inference, ignores other fields).
  76. newlines_in_values: bool, optional (default False)
  77. Whether objects may be printed across multiple lines (for example
  78. pretty printed). If false, input must end with an empty line.
  79. unexpected_field_behavior: str, default "infer"
  80. How JSON fields outside of explicit_schema (if given) are treated.
  81. Possible behaviors:
  82. - "ignore": unexpected JSON fields are ignored
  83. - "error": error out on unexpected JSON fields
  84. - "infer": unexpected JSON fields are type-inferred and included in
  85. the output
  86. """
  87. cdef:
  88. CJSONParseOptions options
  89. __slots__ = ()
  90. def __init__(self, explicit_schema=None, newlines_in_values=None,
  91. unexpected_field_behavior=None):
  92. self.options = CJSONParseOptions.Defaults()
  93. if explicit_schema is not None:
  94. self.explicit_schema = explicit_schema
  95. if newlines_in_values is not None:
  96. self.newlines_in_values = newlines_in_values
  97. if unexpected_field_behavior is not None:
  98. self.unexpected_field_behavior = unexpected_field_behavior
  99. @property
  100. def explicit_schema(self):
  101. """
  102. Optional explicit schema (no type inference, ignores other fields)
  103. """
  104. if self.options.explicit_schema.get() == NULL:
  105. return None
  106. else:
  107. return pyarrow_wrap_schema(self.options.explicit_schema)
  108. @explicit_schema.setter
  109. def explicit_schema(self, value):
  110. self.options.explicit_schema = pyarrow_unwrap_schema(value)
  111. @property
  112. def newlines_in_values(self):
  113. """
  114. Whether newline characters are allowed in JSON values.
  115. Setting this to True reduces the performance of multi-threaded
  116. JSON reading.
  117. """
  118. return self.options.newlines_in_values
  119. @newlines_in_values.setter
  120. def newlines_in_values(self, value):
  121. self.options.newlines_in_values = value
  122. @property
  123. def unexpected_field_behavior(self):
  124. """
  125. How JSON fields outside of explicit_schema (if given) are treated.
  126. Possible behaviors:
  127. - "ignore": unexpected JSON fields are ignored
  128. - "error": error out on unexpected JSON fields
  129. - "infer": unexpected JSON fields are type-inferred and included in
  130. the output
  131. Set to "infer" by default.
  132. """
  133. v = self.options.unexpected_field_behavior
  134. if v == CUnexpectedFieldBehavior_Ignore:
  135. return "ignore"
  136. elif v == CUnexpectedFieldBehavior_Error:
  137. return "error"
  138. elif v == CUnexpectedFieldBehavior_InferType:
  139. return "infer"
  140. else:
  141. raise ValueError('Unexpected value for unexpected_field_behavior')
  142. @unexpected_field_behavior.setter
  143. def unexpected_field_behavior(self, value):
  144. cdef CUnexpectedFieldBehavior v
  145. if value == "ignore":
  146. v = CUnexpectedFieldBehavior_Ignore
  147. elif value == "error":
  148. v = CUnexpectedFieldBehavior_Error
  149. elif value == "infer":
  150. v = CUnexpectedFieldBehavior_InferType
  151. else:
  152. raise ValueError(
  153. "Unexpected value `{}` for `unexpected_field_behavior`, pass "
  154. "either `ignore`, `error` or `infer`.".format(value)
  155. )
  156. self.options.unexpected_field_behavior = v
  157. cdef _get_reader(input_file, shared_ptr[CInputStream]* out):
  158. use_memory_map = False
  159. get_input_stream(input_file, use_memory_map, out)
  160. cdef _get_read_options(ReadOptions read_options, CJSONReadOptions* out):
  161. if read_options is None:
  162. out[0] = CJSONReadOptions.Defaults()
  163. else:
  164. out[0] = read_options.options
  165. cdef _get_parse_options(ParseOptions parse_options, CJSONParseOptions* out):
  166. if parse_options is None:
  167. out[0] = CJSONParseOptions.Defaults()
  168. else:
  169. out[0] = parse_options.options
  170. def read_json(input_file, read_options=None, parse_options=None,
  171. MemoryPool memory_pool=None):
  172. """
  173. Read a Table from a stream of JSON data.
  174. Parameters
  175. ----------
  176. input_file: string, path or file-like object
  177. The location of JSON data. Currently only the line-delimited JSON
  178. format is supported.
  179. read_options: pyarrow.json.ReadOptions, optional
  180. Options for the JSON reader (see ReadOptions constructor for defaults)
  181. parse_options: pyarrow.json.ParseOptions, optional
  182. Options for the JSON parser
  183. (see ParseOptions constructor for defaults)
  184. memory_pool: MemoryPool, optional
  185. Pool to allocate Table memory from
  186. Returns
  187. -------
  188. :class:`pyarrow.Table`
  189. Contents of the JSON file as a in-memory table.
  190. """
  191. cdef:
  192. shared_ptr[CInputStream] stream
  193. CJSONReadOptions c_read_options
  194. CJSONParseOptions c_parse_options
  195. shared_ptr[CJSONReader] reader
  196. shared_ptr[CTable] table
  197. _get_reader(input_file, &stream)
  198. _get_read_options(read_options, &c_read_options)
  199. _get_parse_options(parse_options, &c_parse_options)
  200. reader = GetResultValue(
  201. CJSONReader.Make(maybe_unbox_memory_pool(memory_pool),
  202. stream, c_read_options, c_parse_options))
  203. with nogil:
  204. table = GetResultValue(reader.get().Read())
  205. return pyarrow_wrap_table(table)