123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248 |
- # Licensed to the Apache Software Foundation (ASF) under one
- # or more contributor license agreements. See the NOTICE file
- # distributed with this work for additional information
- # regarding copyright ownership. The ASF licenses this file
- # to you under the Apache License, Version 2.0 (the
- # "License"); you may not use this file except in compliance
- # with the License. You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing,
- # software distributed under the License is distributed on an
- # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- # KIND, either express or implied. See the License for the
- # specific language governing permissions and limitations
- # under the License.
- # cython: profile=False
- # distutils: language = c++
- # cython: language_level = 3
- from pyarrow.includes.common cimport *
- from pyarrow.includes.libarrow cimport *
- from pyarrow.lib cimport (check_status, _Weakrefable, Field, MemoryPool,
- ensure_type, maybe_unbox_memory_pool,
- get_input_stream, pyarrow_wrap_table,
- pyarrow_wrap_data_type, pyarrow_unwrap_data_type,
- pyarrow_wrap_schema, pyarrow_unwrap_schema)
cdef class ReadOptions(_Weakrefable):
    """
    Options controlling how JSON input is read.

    Parameters
    ----------
    use_threads : bool, optional (default True)
        Whether to use multiple threads to accelerate reading.
    block_size : int, optional
        How many bytes to process at a time from the input stream.
        This determines the multi-threading granularity as well as
        the size of individual chunks in the Table.
    """

    cdef:
        # Underlying C++ options struct that the properties read and write.
        CJSONReadOptions options

    # Avoid mistakenly creating attributes
    __slots__ = ()

    def __init__(self, use_threads=None, block_size=None):
        # Start from the C++ defaults; an argument left as None keeps
        # its default value.
        self.options = CJSONReadOptions.Defaults()

        if use_threads is not None:
            self.use_threads = use_threads

        if block_size is not None:
            self.block_size = block_size

    @property
    def use_threads(self):
        """
        Whether to use multiple threads to accelerate reading.
        """
        return self.options.use_threads

    @use_threads.setter
    def use_threads(self, value):
        self.options.use_threads = value

    @property
    def block_size(self):
        """
        How many bytes to process at a time from the input stream.

        This determines the multi-threading granularity as well as the
        size of individual chunks in the Table.
        """
        return self.options.block_size

    @block_size.setter
    def block_size(self, value):
        self.options.block_size = value
cdef class ParseOptions(_Weakrefable):
    """
    Options for parsing JSON files.

    Parameters
    ----------
    explicit_schema : Schema, optional (default None)
        Optional explicit schema (no type inference, ignores other fields).
    newlines_in_values : bool, optional (default False)
        Whether objects may be printed across multiple lines (for example
        pretty printed). If false, input must end with an empty line.
    unexpected_field_behavior : str, default "infer"
        How JSON fields outside of explicit_schema (if given) are treated.

        Possible behaviors:

        - "ignore": unexpected JSON fields are ignored
        - "error": error out on unexpected JSON fields
        - "infer": unexpected JSON fields are type-inferred and included in
          the output

    Raises
    ------
    TypeError
        If ``explicit_schema`` is neither a Schema nor None.
    ValueError
        If ``unexpected_field_behavior`` is not one of the accepted strings.
    """

    cdef:
        # Underlying C++ options struct that the properties read and write.
        CJSONParseOptions options

    __slots__ = ()

    def __init__(self, explicit_schema=None, newlines_in_values=None,
                 unexpected_field_behavior=None):
        # Start from the C++ defaults; an argument left as None keeps
        # its default value.
        self.options = CJSONParseOptions.Defaults()
        if explicit_schema is not None:
            self.explicit_schema = explicit_schema
        if newlines_in_values is not None:
            self.newlines_in_values = newlines_in_values
        if unexpected_field_behavior is not None:
            self.unexpected_field_behavior = unexpected_field_behavior

    @property
    def explicit_schema(self):
        """
        Optional explicit schema (no type inference, ignores other fields).

        Returns None when no explicit schema is set.
        """
        if self.options.explicit_schema.get() == NULL:
            return None
        return pyarrow_wrap_schema(self.options.explicit_schema)

    @explicit_schema.setter
    def explicit_schema(self, value):
        # pyarrow_unwrap_schema() hands back a null pointer for any object
        # that is not a Schema.  Without this check a wrongly-typed value
        # would silently drop the explicit schema and fall back to type
        # inference.  None stays accepted as the way to clear the schema.
        # The check runs before the assignment so that a rejected value
        # leaves the current options untouched.
        if value is not None and pyarrow_unwrap_schema(value).get() == NULL:
            raise TypeError(
                "explicit_schema must be a pyarrow.Schema or None, "
                "got `{}`".format(type(value).__name__)
            )
        self.options.explicit_schema = pyarrow_unwrap_schema(value)

    @property
    def newlines_in_values(self):
        """
        Whether newline characters are allowed in JSON values.

        Setting this to True reduces the performance of multi-threaded
        JSON reading.
        """
        return self.options.newlines_in_values

    @newlines_in_values.setter
    def newlines_in_values(self, value):
        self.options.newlines_in_values = value

    @property
    def unexpected_field_behavior(self):
        """
        How JSON fields outside of explicit_schema (if given) are treated.

        Possible behaviors:

        - "ignore": unexpected JSON fields are ignored
        - "error": error out on unexpected JSON fields
        - "infer": unexpected JSON fields are type-inferred and included in
          the output

        Set to "infer" by default.
        """
        behavior = self.options.unexpected_field_behavior
        if behavior == CUnexpectedFieldBehavior_Ignore:
            return "ignore"
        if behavior == CUnexpectedFieldBehavior_Error:
            return "error"
        if behavior == CUnexpectedFieldBehavior_InferType:
            return "infer"
        # Reached only if the C++ struct holds a value that has no string
        # mapping here.
        raise ValueError('Unexpected value for unexpected_field_behavior')

    @unexpected_field_behavior.setter
    def unexpected_field_behavior(self, value):
        cdef CUnexpectedFieldBehavior v

        if value == "ignore":
            v = CUnexpectedFieldBehavior_Ignore
        elif value == "error":
            v = CUnexpectedFieldBehavior_Error
        elif value == "infer":
            v = CUnexpectedFieldBehavior_InferType
        else:
            raise ValueError(
                "Unexpected value `{}` for `unexpected_field_behavior`, pass "
                "either `ignore`, `error` or `infer`.".format(value)
            )

        self.options.unexpected_field_behavior = v
cdef _get_reader(input_file, shared_ptr[CInputStream]* out):
    # Open `input_file` as an input stream and store it in `out`.
    # Memory mapping is always disabled for JSON input.
    get_input_stream(input_file, False, out)
cdef _get_read_options(ReadOptions read_options, CJSONReadOptions* out):
    # Fill `out` with the C++ read options: the caller's when a
    # ReadOptions object was supplied, the library defaults otherwise.
    if read_options is not None:
        out[0] = read_options.options
    else:
        out[0] = CJSONReadOptions.Defaults()
cdef _get_parse_options(ParseOptions parse_options, CJSONParseOptions* out):
    # Fill `out` with the C++ parse options: the caller's when a
    # ParseOptions object was supplied, the library defaults otherwise.
    if parse_options is not None:
        out[0] = parse_options.options
    else:
        out[0] = CJSONParseOptions.Defaults()
def read_json(input_file, read_options=None, parse_options=None,
              MemoryPool memory_pool=None):
    """
    Read a Table from a stream of JSON data.

    Parameters
    ----------
    input_file : string, path or file-like object
        The location of JSON data. Currently only the line-delimited JSON
        format is supported.
    read_options : pyarrow.json.ReadOptions, optional
        Options for the JSON reader (see ReadOptions constructor for
        defaults).
    parse_options : pyarrow.json.ParseOptions, optional
        Options for the JSON parser (see ParseOptions constructor for
        defaults).
    memory_pool : MemoryPool, optional
        Pool to allocate Table memory from.

    Returns
    -------
    :class:`pyarrow.Table`
        Contents of the JSON file as an in-memory table.
    """
    cdef:
        shared_ptr[CInputStream] c_stream
        CJSONReadOptions c_read_opts
        CJSONParseOptions c_parse_opts
        shared_ptr[CJSONReader] c_reader
        shared_ptr[CTable] c_table

    # Translate the Python-level arguments into their C++ counterparts.
    # The input stream is opened first, then the two option structs are
    # filled in (falling back to defaults when None was passed).
    _get_reader(input_file, &c_stream)
    _get_read_options(read_options, &c_read_opts)
    _get_parse_options(parse_options, &c_parse_opts)

    c_reader = GetResultValue(
        CJSONReader.Make(maybe_unbox_memory_pool(memory_pool),
                         c_stream, c_read_opts, c_parse_opts))

    # The read itself runs with the GIL released.
    with nogil:
        c_table = GetResultValue(c_reader.get().Read())

    return pyarrow_wrap_table(c_table)
|