client.py 42 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795
  1. import io
  2. import logging
  3. from datetime import tzinfo
  4. import pytz
  5. from abc import ABC, abstractmethod
  6. from typing import Iterable, Optional, Any, Union, Sequence, Dict, Generator, BinaryIO
  7. from pytz.exceptions import UnknownTimeZoneError
  8. from clickhouse_connect import common
  9. from clickhouse_connect.common import version
  10. from clickhouse_connect.datatypes.registry import get_from_name
  11. from clickhouse_connect.datatypes.base import ClickHouseType
  12. from clickhouse_connect.driver import tzutil
  13. from clickhouse_connect.driver.common import dict_copy, StreamContext, coerce_int, coerce_bool
  14. from clickhouse_connect.driver.constants import CH_VERSION_WITH_PROTOCOL, PROTOCOL_VERSION_WITH_LOW_CARD
  15. from clickhouse_connect.driver.exceptions import ProgrammingError, OperationalError
  16. from clickhouse_connect.driver.external import ExternalData
  17. from clickhouse_connect.driver.insert import InsertContext
  18. from clickhouse_connect.driver.summary import QuerySummary
  19. from clickhouse_connect.driver.models import ColumnDef, SettingDef, SettingStatus
  20. from clickhouse_connect.driver.query import QueryResult, to_arrow, to_arrow_batches, QueryContext, arrow_buffer, \
  21. quote_identifier
# Import-time side effect: overrides the io module's DEFAULT_BUFFER_SIZE with 256 KiB (1024 * 256 bytes)
io.DEFAULT_BUFFER_SIZE = 1024 * 256
# Module-level logger, named after this module
logger = logging.getLogger(__name__)
# Name of the ClickHouse setting read and written by Client._update_arrow_settings
arrow_str_setting = 'output_format_arrow_string_as_string'
  25. # pylint: disable=too-many-public-methods, too-many-instance-attributes
class Client(ABC):
    """
    Base ClickHouse Connect client

    Abstract base class: transport specific subclasses supply the abstract methods (command, raw_query,
    raw_stream, ping, ...), while this class provides the shared query/insert convenience methods built on them.
    """
    # Class-level defaults. NOTE(review): compression, write_compression and max_error_message are not
    # referenced in this section of the file -- presumably they are consumed by subclasses; confirm.
    compression: str = None
    write_compression: str = None
    # Raised to PROTOCOL_VERSION_WITH_LOW_CARD in __init__ only if the server demonstrably honors that version
    protocol_version = 0
    # Setting keys that _validate_setting accepts without consulting the server settings table
    valid_transport_settings = set()
    # Setting keys that _validate_setting silently drops when the server does not know them or marks them readonly
    optional_transport_settings = set()
    # Overridden in __init__ when a database other than '__default__' is supplied
    database = None
    max_error_message = 0
    # Overridden in __init__ once the server timezone has been recognized
    apply_server_timezone = False
    # Overridden in __init__ when show_clickhouse_errors is not None
    show_clickhouse_errors = True
    def __init__(self,
                 database: str,
                 query_limit: int,
                 uri: str,
                 query_retries: int,
                 server_host_name: Optional[str],
                 apply_server_timezone: Optional[Union[str, bool]],
                 show_clickhouse_errors: Optional[bool]):
        """
        Shared initialization of ClickHouse Connect client. This issues several requests to the server
        (version/timezone lookup, system.settings query and, for recent servers, a protocol version probe), so
        the subclass must be able to execute command/query/raw_query before calling it.
        :param database: database name; ignored if empty or equal to '__default__'
        :param query_limit: default LIMIT for queries (coerced to int)
        :param uri: uri for error messages
        :param query_retries: retry count stored on the client (coerced to int)
        :param server_host_name: stored as-is on the client
        :param apply_server_timezone: 'always', a boolean-like value, or None. If None the "DST safe" flag
          returned by tzutil.normalize_timezone for the server timezone is used
        :param show_clickhouse_errors: if not None, overrides the class level default (coerced to bool)
        """
        self.query_limit = coerce_int(query_limit)
        self.query_retries = coerce_int(query_retries)
        if show_clickhouse_errors is not None:
            self.show_clickhouse_errors = coerce_bool(show_clickhouse_errors)
        self.server_host_name = server_host_name
        # Default to UTC until the server timezone has been successfully resolved below
        self.server_tz, dst_safe = pytz.UTC, True
        self.server_version, server_tz = \
            tuple(self.command('SELECT version(), timezone()', use_database=False))
        try:
            server_tz = pytz.timezone(server_tz)
            server_tz, dst_safe = tzutil.normalize_timezone(server_tz)
            if apply_server_timezone is None:
                apply_server_timezone = dst_safe
            self.apply_server_timezone = apply_server_timezone == 'always' or coerce_bool(apply_server_timezone)
            self.server_tz = server_tz
        except UnknownTimeZoneError:
            # server_tz is still the raw name returned by the server here; self.server_tz stays UTC and
            # self.apply_server_timezone keeps its class level default
            logger.warning('Warning, server is using an unrecognized timezone %s, will use UTC default', server_tz)
        if not self.apply_server_timezone and not tzutil.local_tz_dst_safe:
            logger.warning('local timezone %s may return unexpected times due to Daylight Savings Time/' +
                           'Summer Time differences', tzutil.local_tz.tzname(None))
        # Expression used for the readonly column of system.settings; for servers that fail the
        # min_version('19.17') check the value of the common 'readonly' setting is substituted instead
        # (min_version is defined outside this section)
        readonly = 'readonly'
        if not self.min_version('19.17'):
            readonly = common.get_setting('readonly')
        server_settings = self.query(f'SELECT name, value, {readonly} as readonly FROM system.settings LIMIT 10000')
        # Cache of server settings keyed by name, consulted by _validate_setting and _setting_status
        self.server_settings = {row['name']: SettingDef(**row) for row in server_settings.named_results()}
        if database and not database == '__default__':
            self.database = database
        if self.min_version(CH_VERSION_WITH_PROTOCOL):
            # Unfortunately we have to validate that the client protocol version is actually used by ClickHouse
            # since the query parameter could be stripped off (in particular, by CHProxy)
            test_data = self.raw_query('SELECT 1 AS check', fmt='Native', settings={
                'client_protocol_version': PROTOCOL_VERSION_WITH_LOW_CARD
            })
            # Only adopt the newer protocol version if the Native response has the expected bytes at offset 8
            if test_data[8:16] == b'\x01\x01\x05check':
                self.protocol_version = PROTOCOL_VERSION_WITH_LOW_CARD
        self.uri = uri
  89. def _validate_settings(self, settings: Optional[Dict[str, Any]]) -> Dict[str, str]:
  90. """
  91. This strips any ClickHouse settings that are not recognized or are read only.
  92. :param settings: Dictionary of setting name and values
  93. :return: A filtered dictionary of settings with values rendered as strings
  94. """
  95. validated = {}
  96. invalid_action = common.get_setting('invalid_setting_action')
  97. for key, value in settings.items():
  98. str_value = self._validate_setting(key, value, invalid_action)
  99. if str_value is not None:
  100. validated[key] = value
  101. return validated
  102. def _validate_setting(self, key: str, value: Any, invalid_action: str) -> Optional[str]:
  103. if key not in self.valid_transport_settings:
  104. setting_def = self.server_settings.get(key)
  105. if setting_def is None or setting_def.readonly:
  106. if key in self.optional_transport_settings:
  107. return None
  108. if invalid_action == 'send':
  109. logger.warning('Attempting to send unrecognized or readonly setting %s', key)
  110. elif invalid_action == 'drop':
  111. logger.warning('Dropping unrecognized or readonly settings %s', key)
  112. return None
  113. else:
  114. raise ProgrammingError(f'Setting {key} is unknown or readonly') from None
  115. if isinstance(value, bool):
  116. return '1' if value else '0'
  117. return str(value)
  118. def _setting_status(self, key: str) -> SettingStatus:
  119. comp_setting = self.server_settings.get(key)
  120. if not comp_setting:
  121. return SettingStatus(False, False)
  122. return SettingStatus(comp_setting.value != '0', comp_setting.readonly != 1)
  123. def _prep_query(self, context: QueryContext):
  124. if context.is_select and not context.has_limit and self.query_limit:
  125. limit = f'\n LIMIT {self.query_limit}'
  126. if isinstance(context.query, bytes):
  127. return context.final_query + limit.encode()
  128. return context.final_query + limit
  129. return context.final_query
  130. def _check_tz_change(self, new_tz) -> Optional[tzinfo]:
  131. if new_tz:
  132. try:
  133. new_tzinfo = pytz.timezone(new_tz)
  134. if new_tzinfo != self.server_tz:
  135. return new_tzinfo
  136. except UnknownTimeZoneError:
  137. logger.warning('Unrecognized timezone %s received from ClickHouse', new_tz)
  138. return None
    @abstractmethod
    def _query_with_context(self, context: QueryContext):
        """
        Execute the query described by a QueryContext. Implemented by subclasses; the query method returns
        the result of this call directly for non-command queries.
        :param context: Fully populated query context
        """
        pass
    @abstractmethod
    def set_client_setting(self, key, value):
        """
        Set a clickhouse setting for the client after initialization. If a setting is not recognized by ClickHouse,
        or the setting is identified as "read_only", this call will either throw a Programming exception or attempt
        to send the setting anyway based on the common setting 'invalid_setting_action'.
        Abstract: implemented by subclasses.
        :param key: ClickHouse setting name
        :param value: ClickHouse setting value
        """
    @abstractmethod
    def get_client_setting(self, key) -> Optional[str]:
        """
        Look up a setting previously applied to this client. Abstract: implemented by subclasses.
        :param key: The setting key
        :return: The string value of the setting, if it exists, or None
        """
  157. # pylint: disable=too-many-arguments,unused-argument,too-many-locals
  158. def query(self,
  159. query: Optional[str] = None,
  160. parameters: Optional[Union[Sequence, Dict[str, Any]]] = None,
  161. settings: Optional[Dict[str, Any]] = None,
  162. query_formats: Optional[Dict[str, str]] = None,
  163. column_formats: Optional[Dict[str, Union[str, Dict[str, str]]]] = None,
  164. encoding: Optional[str] = None,
  165. use_none: Optional[bool] = None,
  166. column_oriented: Optional[bool] = None,
  167. use_numpy: Optional[bool] = None,
  168. max_str_len: Optional[int] = None,
  169. context: QueryContext = None,
  170. query_tz: Optional[Union[str, tzinfo]] = None,
  171. column_tzs: Optional[Dict[str, Union[str, tzinfo]]] = None,
  172. external_data: Optional[ExternalData] = None) -> QueryResult:
  173. """
  174. Main query method for SELECT, DESCRIBE and other SQL statements that return a result matrix. For
  175. parameters, see the create_query_context method
  176. :return: QueryResult -- data and metadata from response
  177. """
  178. if query and query.lower().strip().startswith('select __connect_version__'):
  179. return QueryResult([[f'ClickHouse Connect v.{version()} ⓒ ClickHouse Inc.']], None,
  180. ('connect_version',), (get_from_name('String'),))
  181. kwargs = locals().copy()
  182. del kwargs['self']
  183. query_context = self.create_query_context(**kwargs)
  184. if query_context.is_command:
  185. response = self.command(query,
  186. parameters=query_context.parameters,
  187. settings=query_context.settings,
  188. external_data=query_context.external_data)
  189. if isinstance(response, QuerySummary):
  190. return response.as_query_result()
  191. return QueryResult([response] if isinstance(response, list) else [[response]])
  192. return self._query_with_context(query_context)
    def query_column_block_stream(self,
                                  query: Optional[str] = None,
                                  parameters: Optional[Union[Sequence, Dict[str, Any]]] = None,
                                  settings: Optional[Dict[str, Any]] = None,
                                  query_formats: Optional[Dict[str, str]] = None,
                                  column_formats: Optional[Dict[str, Union[str, Dict[str, str]]]] = None,
                                  encoding: Optional[str] = None,
                                  use_none: Optional[bool] = None,
                                  context: QueryContext = None,
                                  query_tz: Optional[Union[str, tzinfo]] = None,
                                  column_tzs: Optional[Dict[str, Union[str, tzinfo]]] = None,
                                  external_data: Optional[ExternalData] = None) -> StreamContext:
        """
        Variation of main query method that returns a stream of column oriented blocks. For
        parameters, see the create_query_context method.
        :return: StreamContext -- Iterable stream context that returns column oriented blocks
        """
        # locals() captures exactly the call arguments (including self), so no other local variable may be
        # introduced before this line. NOTE(review): _context_query is defined outside this section --
        # presumably it forwards these values to create_query_context; confirm
        return self._context_query(locals(), use_numpy=False, streaming=True).column_block_stream
    def query_row_block_stream(self,
                               query: Optional[str] = None,
                               parameters: Optional[Union[Sequence, Dict[str, Any]]] = None,
                               settings: Optional[Dict[str, Any]] = None,
                               query_formats: Optional[Dict[str, str]] = None,
                               column_formats: Optional[Dict[str, Union[str, Dict[str, str]]]] = None,
                               encoding: Optional[str] = None,
                               use_none: Optional[bool] = None,
                               context: QueryContext = None,
                               query_tz: Optional[Union[str, tzinfo]] = None,
                               column_tzs: Optional[Dict[str, Union[str, tzinfo]]] = None,
                               external_data: Optional[ExternalData] = None) -> StreamContext:
        """
        Variation of main query method that returns a stream of row oriented blocks. For
        parameters, see the create_query_context method.
        :return: StreamContext -- Iterable stream context that returns blocks of rows
        """
        # locals() captures exactly the call arguments (including self), so no other local variable may be
        # introduced before this line. NOTE(review): _context_query is defined outside this section --
        # presumably it forwards these values to create_query_context; confirm
        return self._context_query(locals(), use_numpy=False, streaming=True).row_block_stream
    def query_rows_stream(self,
                          query: Optional[str] = None,
                          parameters: Optional[Union[Sequence, Dict[str, Any]]] = None,
                          settings: Optional[Dict[str, Any]] = None,
                          query_formats: Optional[Dict[str, str]] = None,
                          column_formats: Optional[Dict[str, Union[str, Dict[str, str]]]] = None,
                          encoding: Optional[str] = None,
                          use_none: Optional[bool] = None,
                          context: QueryContext = None,
                          query_tz: Optional[Union[str, tzinfo]] = None,
                          column_tzs: Optional[Dict[str, Union[str, tzinfo]]] = None,
                          external_data: Optional[ExternalData] = None) -> StreamContext:
        """
        Variation of main query method that returns the rows_stream of the query result. For
        parameters, see the create_query_context method.
        NOTE(review): the previous description was identical to query_row_block_stream; rows_stream presumably
        yields individual rows rather than blocks of rows -- confirm against QueryResult
        :return: StreamContext -- Iterable stream context over the result rows
        """
        # locals() captures exactly the call arguments (including self), so no other local variable may be
        # introduced before this line. NOTE(review): _context_query is defined outside this section --
        # presumably it forwards these values to create_query_context; confirm
        return self._context_query(locals(), use_numpy=False, streaming=True).rows_stream
    @abstractmethod
    def raw_query(self, query: str,
                  parameters: Optional[Union[Sequence, Dict[str, Any]]] = None,
                  settings: Optional[Dict[str, Any]] = None,
                  fmt: str = None,
                  use_database: bool = True,
                  external_data: Optional[ExternalData] = None) -> bytes:
        """
        Query method that simply returns the raw ClickHouse format bytes. Abstract: implemented by subclasses.
        :param query: Query statement/format string
        :param parameters: Optional dictionary used to format the query
        :param settings: Optional dictionary of ClickHouse settings (key/string values)
        :param fmt: ClickHouse output format
        :param use_database: Send the database parameter to ClickHouse so the command will be executed in the
          client database context.
        :param external_data: External data to send with the query
        :return: bytes representing raw ClickHouse return value based on format
        """
    @abstractmethod
    def raw_stream(self, query: str,
                   parameters: Optional[Union[Sequence, Dict[str, Any]]] = None,
                   settings: Optional[Dict[str, Any]] = None,
                   fmt: str = None,
                   use_database: bool = True,
                   external_data: Optional[ExternalData] = None) -> io.IOBase:
        """
        Query method that returns the result as an io.IOBase iterator. Abstract: implemented by subclasses.
        :param query: Query statement/format string
        :param parameters: Optional dictionary used to format the query
        :param settings: Optional dictionary of ClickHouse settings (key/string values)
        :param fmt: ClickHouse output format
        :param use_database: Send the database parameter to ClickHouse so the command will be executed in the
          client database context.
        :param external_data: External data to send with the query
        :return: io.IOBase stream/iterator for the result
        """
  283. # pylint: disable=duplicate-code,too-many-arguments,unused-argument
    def query_np(self,
                 query: Optional[str] = None,
                 parameters: Optional[Union[Sequence, Dict[str, Any]]] = None,
                 settings: Optional[Dict[str, Any]] = None,
                 query_formats: Optional[Dict[str, str]] = None,
                 column_formats: Optional[Dict[str, str]] = None,
                 encoding: Optional[str] = None,
                 use_none: Optional[bool] = None,
                 max_str_len: Optional[int] = None,
                 context: QueryContext = None,
                 external_data: Optional[ExternalData] = None):
        """
        Query method that returns the results as a numpy array. For parameter values, see the
        create_query_context method
        :return: Numpy array representing the result set
        """
        # locals() captures exactly the call arguments (including self), so no other local variable may be
        # introduced before this line. NOTE(review): _context_query is defined outside this section --
        # presumably it forwards these values to create_query_context; confirm
        return self._context_query(locals(), use_numpy=True).np_result
  301. # pylint: disable=duplicate-code,too-many-arguments,unused-argument
    def query_np_stream(self,
                        query: Optional[str] = None,
                        parameters: Optional[Union[Sequence, Dict[str, Any]]] = None,
                        settings: Optional[Dict[str, Any]] = None,
                        query_formats: Optional[Dict[str, str]] = None,
                        column_formats: Optional[Dict[str, str]] = None,
                        encoding: Optional[str] = None,
                        use_none: Optional[bool] = None,
                        max_str_len: Optional[int] = None,
                        context: QueryContext = None,
                        external_data: Optional[ExternalData] = None) -> StreamContext:
        """
        Query method that returns the results as a stream of numpy arrays. For parameter values, see the
        create_query_context method
        :return: StreamContext that yields a numpy array per block representing the result set
        """
        # locals() captures exactly the call arguments (including self), so no other local variable may be
        # introduced before this line. NOTE(review): _context_query is defined outside this section --
        # presumably it forwards these values to create_query_context; confirm
        return self._context_query(locals(), use_numpy=True, streaming=True).np_stream
  319. # pylint: disable=duplicate-code,too-many-arguments,unused-argument
    def query_df(self,
                 query: Optional[str] = None,
                 parameters: Optional[Union[Sequence, Dict[str, Any]]] = None,
                 settings: Optional[Dict[str, Any]] = None,
                 query_formats: Optional[Dict[str, str]] = None,
                 column_formats: Optional[Dict[str, str]] = None,
                 encoding: Optional[str] = None,
                 use_none: Optional[bool] = None,
                 max_str_len: Optional[int] = None,
                 use_na_values: Optional[bool] = None,
                 query_tz: Optional[str] = None,
                 column_tzs: Optional[Dict[str, Union[str, tzinfo]]] = None,
                 context: QueryContext = None,
                 external_data: Optional[ExternalData] = None,
                 use_extended_dtypes: Optional[bool] = None):
        """
        Query method that returns the results as a pandas dataframe. For parameter values, see the
        create_query_context method
        :return: Pandas dataframe representing the result set
        """
        # locals() captures exactly the call arguments (including self), so no other local variable may be
        # introduced before this line. NOTE(review): _context_query is defined outside this section --
        # presumably it forwards these values to create_query_context; confirm
        return self._context_query(locals(), use_numpy=True, as_pandas=True).df_result
  341. # pylint: disable=duplicate-code,too-many-arguments,unused-argument
    def query_df_stream(self,
                        query: Optional[str] = None,
                        parameters: Optional[Union[Sequence, Dict[str, Any]]] = None,
                        settings: Optional[Dict[str, Any]] = None,
                        query_formats: Optional[Dict[str, str]] = None,
                        column_formats: Optional[Dict[str, str]] = None,
                        encoding: Optional[str] = None,
                        use_none: Optional[bool] = None,
                        max_str_len: Optional[int] = None,
                        use_na_values: Optional[bool] = None,
                        query_tz: Optional[str] = None,
                        column_tzs: Optional[Dict[str, Union[str, tzinfo]]] = None,
                        context: QueryContext = None,
                        external_data: Optional[ExternalData] = None,
                        use_extended_dtypes: Optional[bool] = None) -> StreamContext:
        """
        Query method that returns the results as a StreamContext. For parameter values, see the
        create_query_context method
        :return: StreamContext that yields a Pandas dataframe per block representing the result set
        """
        # locals() captures exactly the call arguments (including self), so no other local variable may be
        # introduced before this line. NOTE(review): _context_query is defined outside this section --
        # presumably it forwards these values to create_query_context; confirm
        return self._context_query(locals(), use_numpy=True,
                                   as_pandas=True,
                                   streaming=True).df_stream
  365. def create_query_context(self,
  366. query: Optional[Union[str, bytes]] = None,
  367. parameters: Optional[Union[Sequence, Dict[str, Any]]] = None,
  368. settings: Optional[Dict[str, Any]] = None,
  369. query_formats: Optional[Dict[str, str]] = None,
  370. column_formats: Optional[Dict[str, Union[str, Dict[str, str]]]] = None,
  371. encoding: Optional[str] = None,
  372. use_none: Optional[bool] = None,
  373. column_oriented: Optional[bool] = None,
  374. use_numpy: Optional[bool] = False,
  375. max_str_len: Optional[int] = 0,
  376. context: Optional[QueryContext] = None,
  377. query_tz: Optional[Union[str, tzinfo]] = None,
  378. column_tzs: Optional[Dict[str, Union[str, tzinfo]]] = None,
  379. use_na_values: Optional[bool] = None,
  380. streaming: bool = False,
  381. as_pandas: bool = False,
  382. external_data: Optional[ExternalData] = None,
  383. use_extended_dtypes: Optional[bool] = None) -> QueryContext:
  384. """
  385. Creates or updates a reusable QueryContext object
  386. :param query: Query statement/format string
  387. :param parameters: Optional dictionary used to format the query
  388. :param settings: Optional dictionary of ClickHouse settings (key/string values)
  389. :param query_formats: See QueryContext __init__ docstring
  390. :param column_formats: See QueryContext __init__ docstring
  391. :param encoding: See QueryContext __init__ docstring
  392. :param use_none: Use None for ClickHouse NULL instead of default values. Note that using None in Numpy
  393. arrays will force the numpy array dtype to 'object', which is often inefficient. This effect also
  394. will impact the performance of Pandas dataframes.
  395. :param column_oriented: Deprecated. Controls orientation of the QueryResult result_set property
  396. :param use_numpy: Return QueryResult columns as one-dimensional numpy arrays
  397. :param max_str_len: Limit returned ClickHouse String values to this length, which allows a Numpy
  398. structured array even with ClickHouse variable length String columns. If 0, Numpy arrays for
  399. String columns will always be object arrays
  400. :param context: An existing QueryContext to be updated with any provided parameter values
  401. :param query_tz Either a string or a pytz tzinfo object. (Strings will be converted to tzinfo objects).
  402. Values for any DateTime or DateTime64 column in the query will be converted to Python datetime.datetime
  403. objects with the selected timezone.
  404. :param column_tzs A dictionary of column names to tzinfo objects (or strings that will be converted to
  405. tzinfo objects). The timezone will be applied to datetime objects returned in the query
  406. :param use_na_values: Deprecated alias for use_advanced_dtypes
  407. :param as_pandas Return the result columns as pandas.Series objects
  408. :param streaming Marker used to correctly configure streaming queries
  409. :param external_data ClickHouse "external data" to send with query
  410. :param use_extended_dtypes: Only relevant to Pandas Dataframe queries. Use Pandas "missing types", such as
  411. pandas.NA and pandas.NaT for ClickHouse NULL values, as well as extended Pandas dtypes such as IntegerArray
  412. and StringArray. Defaulted to True for query_df methods
  413. :return: Reusable QueryContext
  414. """
  415. if context:
  416. return context.updated_copy(query=query,
  417. parameters=parameters,
  418. settings=settings,
  419. query_formats=query_formats,
  420. column_formats=column_formats,
  421. encoding=encoding,
  422. server_tz=self.server_tz,
  423. use_none=use_none,
  424. column_oriented=column_oriented,
  425. use_numpy=use_numpy,
  426. max_str_len=max_str_len,
  427. query_tz=query_tz,
  428. column_tzs=column_tzs,
  429. as_pandas=as_pandas,
  430. use_extended_dtypes=use_extended_dtypes,
  431. streaming=streaming,
  432. external_data=external_data)
  433. if use_numpy and max_str_len is None:
  434. max_str_len = 0
  435. if use_extended_dtypes is None:
  436. use_extended_dtypes = use_na_values
  437. if as_pandas and use_extended_dtypes is None:
  438. use_extended_dtypes = True
  439. return QueryContext(query=query,
  440. parameters=parameters,
  441. settings=settings,
  442. query_formats=query_formats,
  443. column_formats=column_formats,
  444. encoding=encoding,
  445. server_tz=self.server_tz,
  446. use_none=use_none,
  447. column_oriented=column_oriented,
  448. use_numpy=use_numpy,
  449. max_str_len=max_str_len,
  450. query_tz=query_tz,
  451. column_tzs=column_tzs,
  452. use_extended_dtypes=use_extended_dtypes,
  453. as_pandas=as_pandas,
  454. streaming=streaming,
  455. apply_server_tz=self.apply_server_timezone,
  456. external_data=external_data)
  457. def query_arrow(self,
  458. query: str,
  459. parameters: Optional[Union[Sequence, Dict[str, Any]]] = None,
  460. settings: Optional[Dict[str, Any]] = None,
  461. use_strings: Optional[bool] = None,
  462. external_data: Optional[ExternalData] = None):
  463. """
  464. Query method using the ClickHouse Arrow format to return a PyArrow table
  465. :param query: Query statement/format string
  466. :param parameters: Optional dictionary used to format the query
  467. :param settings: Optional dictionary of ClickHouse settings (key/string values)
  468. :param use_strings: Convert ClickHouse String type to Arrow string type (instead of binary)
  469. :param external_data ClickHouse "external data" to send with query
  470. :return: PyArrow.Table
  471. """
  472. settings = self._update_arrow_settings(settings, use_strings)
  473. return to_arrow(self.raw_query(query,
  474. parameters,
  475. settings,
  476. fmt='Arrow',
  477. external_data=external_data))
  478. def query_arrow_stream(self,
  479. query: str,
  480. parameters: Optional[Union[Sequence, Dict[str, Any]]] = None,
  481. settings: Optional[Dict[str, Any]] = None,
  482. use_strings: Optional[bool] = None,
  483. external_data: Optional[ExternalData] = None) -> StreamContext:
  484. """
  485. Query method that returns the results as a stream of Arrow tables
  486. :param query: Query statement/format string
  487. :param parameters: Optional dictionary used to format the query
  488. :param settings: Optional dictionary of ClickHouse settings (key/string values)
  489. :param use_strings: Convert ClickHouse String type to Arrow string type (instead of binary)
  490. :param external_data ClickHouse "external data" to send with query
  491. :return: Generator that yields a PyArrow.Table for per block representing the result set
  492. """
  493. settings = self._update_arrow_settings(settings, use_strings)
  494. return to_arrow_batches(self.raw_stream(query,
  495. parameters,
  496. settings,
  497. fmt='ArrowStream',
  498. external_data=external_data))
  499. def _update_arrow_settings(self,
  500. settings: Optional[Dict[str, Any]],
  501. use_strings: Optional[bool]) -> Dict[str, Any]:
  502. settings = dict_copy(settings)
  503. if self.database:
  504. settings['database'] = self.database
  505. str_status = self._setting_status(arrow_str_setting)
  506. if use_strings is None:
  507. if str_status.is_writable and not str_status.is_set:
  508. settings[arrow_str_setting] = '1' # Default to returning strings if possible
  509. elif use_strings != str_status.is_set:
  510. if not str_status.is_writable:
  511. raise OperationalError(f'Cannot change readonly {arrow_str_setting} to {use_strings}')
  512. settings[arrow_str_setting] = '1' if use_strings else '0'
  513. return settings
    @abstractmethod
    def command(self,
                cmd: str,
                parameters: Optional[Union[Sequence, Dict[str, Any]]] = None,
                data: Union[str, bytes] = None,
                settings: Dict[str, Any] = None,
                use_database: bool = True,
                external_data: Optional[ExternalData] = None) -> Union[str, int, Sequence[str], QuerySummary]:
        """
        Client method that returns a single value instead of a result set. Abstract: implemented by subclasses.
        :param cmd: ClickHouse query/command as a python format string
        :param parameters: Optional dictionary of key/values pairs to be formatted
        :param data: Optional 'data' for the command (for INSERT INTO in particular)
        :param settings: Optional dictionary of ClickHouse settings (key/string values)
        :param use_database: Send the database parameter to ClickHouse so the command will be executed in the client
          database context. Otherwise, no database will be specified with the command. This is useful for determining
          the default user database
        :param external_data: ClickHouse "external data" to send with command/query
        :return: Decoded response from ClickHouse as either a string, int, or sequence of strings, or QuerySummary
          if no data returned
        """
    @abstractmethod
    def ping(self) -> bool:
        """
        Validate the connection, does not throw an Exception (see debug logs).
        Abstract: implemented by subclasses.
        :return: True if the ClickHouse server is up and reachable
        """
  541. # pylint: disable=too-many-arguments
  542. def insert(self,
  543. table: Optional[str] = None,
  544. data: Sequence[Sequence[Any]] = None,
  545. column_names: Union[str, Iterable[str]] = '*',
  546. database: Optional[str] = None,
  547. column_types: Sequence[ClickHouseType] = None,
  548. column_type_names: Sequence[str] = None,
  549. column_oriented: bool = False,
  550. settings: Optional[Dict[str, Any]] = None,
  551. context: InsertContext = None) -> QuerySummary:
  552. """
  553. Method to insert multiple rows/data matrix of native Python objects. If context is specified arguments
  554. other than data are ignored
  555. :param table: Target table
  556. :param data: Sequence of sequences of Python data
  557. :param column_names: Ordered list of column names or '*' if column types should be retrieved from the
  558. ClickHouse table definition
  559. :param database: Target database -- will use client default database if not specified.
  560. :param column_types: ClickHouse column types. If set then column data does not need to be retrieved from
  561. the server
  562. :param column_type_names: ClickHouse column type names. If set then column data does not need to be
  563. retrieved from the server
  564. :param column_oriented: If true the data is already "pivoted" in column form
  565. :param settings: Optional dictionary of ClickHouse settings (key/string values)
  566. :param context: Optional reusable insert context to allow repeated inserts into the same table with
  567. different data batches
  568. :return: QuerySummary with summary information, throws exception if insert fails
  569. """
  570. if (context is None or context.empty) and data is None:
  571. raise ProgrammingError('No data specified for insert') from None
  572. if context is None:
  573. context = self.create_insert_context(table,
  574. column_names,
  575. database,
  576. column_types,
  577. column_type_names,
  578. column_oriented,
  579. settings)
  580. if data is not None:
  581. if not context.empty:
  582. raise ProgrammingError('Attempting to insert new data with non-empty insert context') from None
  583. context.data = data
  584. return self.data_insert(context)
  def insert_df(self, table: str = None,
                df=None,
                database: Optional[str] = None,
                settings: Optional[Dict] = None,
                column_names: Optional[Sequence[str]] = None,
                column_types: Sequence[ClickHouseType] = None,
                column_type_names: Sequence[str] = None,
                context: InsertContext = None) -> QuerySummary:
      """
      Insert a pandas DataFrame into ClickHouse. If context is specified arguments other than df are ignored
      :param table: ClickHouse table
      :param df: two-dimensional pandas dataframe
      :param database: Optional ClickHouse database
      :param settings: Optional dictionary of ClickHouse settings (key/string values)
      :param column_names: An optional list of ClickHouse column names. If not set, the DataFrame column names
       will be used
      :param column_types: ClickHouse column types. If set then column data does not need to be retrieved from
       the server
      :param column_type_names: ClickHouse column type names. If set then column data does not need to be
       retrieved from the server
      :param context: Optional reusable insert context to allow repeated inserts into the same table with
       different data batches
      :return: QuerySummary with summary information, throws exception if insert fails
      :raises ProgrammingError: if no context and no DataFrame is supplied, or if column_names is supplied with a
       length different from the number of DataFrame columns
      """
      if context is None:
          if df is None:
              # Without a context the DataFrame is the only source of data and column names.  Report the
              # problem with the same error insert() raises for missing data rather than letting
              # `None.columns` surface as an AttributeError.
              raise ProgrammingError('No data specified for insert') from None
          if column_names is None:
              column_names = df.columns
          elif len(column_names) != len(df.columns):
              raise ProgrammingError('DataFrame column count does not match insert_columns') from None
      # When a context is supplied the column checks are skipped; insert() validates the context/data pairing
      return self.insert(table,
                         df,
                         column_names,
                         database,
                         column_types=column_types,
                         column_type_names=column_type_names,
                         settings=settings, context=context)
  def insert_arrow(self, table: str,
                   arrow_table, database: str = None,
                   settings: Optional[Dict] = None) -> QuerySummary:
      """
      Insert a PyArrow Table into ClickHouse using raw Arrow format
      :param table: ClickHouse table.  Used exactly as given when it already contains a dot or when no database
       is supplied; otherwise it is prefixed with the database name
      :param arrow_table: PyArrow Table object
      :param database: Optional ClickHouse database
      :param settings: Optional dictionary of ClickHouse settings (key/string values)
      :return: QuerySummary with summary information, throws exception if insert fails
      """
      if '.' not in table and database:
          target = f'{database}.{table}'
      else:
          target = table
      # arrow_buffer yields the column names together with the serialized block handed to raw_insert
      names, payload = arrow_buffer(arrow_table)
      return self.raw_insert(target, names, payload, settings, 'Arrow')
  def create_insert_context(self,
                            table: str,
                            column_names: Optional[Union[str, Sequence[str]]] = None,
                            database: Optional[str] = None,
                            column_types: Sequence[ClickHouseType] = None,
                            column_type_names: Sequence[str] = None,
                            column_oriented: bool = False,
                            settings: Optional[Dict[str, Any]] = None,
                            data: Optional[Sequence[Sequence[Any]]] = None) -> InsertContext:
      """
      Builds a reusable insert context to hold state for a duration of an insert
      :param table: Target table.  A name containing a dot is used as given; otherwise it is quoted with
       quote_identifier
      :param column_names: Optional ordered list of column names, or a single column name as a string.  If not
       set (or '*'), the columns found by DESCRIBE TABLE are used in the order returned, and their types
       replace any supplied column_types
      :param database: Target database.  Only applied when the table name contains no dot; if not set the table
       name is left unqualified
      :param column_types: ClickHouse column types. Optional Sequence of ClickHouseType objects. If neither column
       types nor column type names are set, actual column types will be retrieved from the server.
      :param column_type_names: ClickHouse column type names. Specified column types by name string
      :param column_oriented: If true the data is already "pivoted" in column form
      :param settings: Optional dictionary of ClickHouse settings (key/string values)
      :param data: Initial dataset for insert
      :return: Reusable insert context
      :raises ValueError: if no column names result from the arguments
      :raises ProgrammingError: if a column name is not found in the described table, or the number of column
       names differs from the number of column types
      """
      if '.' in table:
          full_table = table
      elif database:
          full_table = f'{quote_identifier(database)}.{quote_identifier(table)}'
      else:
          full_table = quote_identifier(table)

      column_defs = []
      if column_types is None and column_type_names is None:
          # No type information from the caller, so read the table definition from the server
          described = self.query(f'DESCRIBE TABLE {full_table}')
          for row in described.named_results():
              if row['default_type'] in ('ALIAS', 'MATERIALIZED'):
                  continue  # these columns are left out of the insert column list
              column_defs.append(ColumnDef(**row))

      wants_all_columns = column_names is None or (isinstance(column_names, str) and column_names == '*')
      if wants_all_columns:
          column_names = [col.name for col in column_defs]
          column_types = [col.ch_type for col in column_defs]
      elif isinstance(column_names, str):
          column_names = [column_names]
      # Test the length, not truthiness: column_names may be any sized sequence (insert_df passes df.columns)
      if len(column_names) == 0:
          raise ValueError('Column names must be specified for insert')

      if not column_types:
          if column_type_names:
              column_types = [get_from_name(type_name) for type_name in column_type_names]
          else:
              defs_by_name = {col.name: col for col in column_defs}
              try:
                  # ch_type is only read for the columns actually being inserted
                  column_types = [defs_by_name[name].ch_type for name in column_names]
              except KeyError as ex:
                  raise ProgrammingError(f'Unrecognized column {ex} in table {table}') from None
      if len(column_names) != len(column_types):
          raise ProgrammingError('Column names do not match column types') from None
      return InsertContext(full_table,
                           column_names,
                           column_types,
                           column_oriented=column_oriented,
                           settings=settings,
                           data=data)
  def min_version(self, version_str: str) -> bool:
      """
      Determine whether the connected server is at least the submitted version.
      Dot separated segments of the server version that are not numeric are dropped before comparing, which
      handles Altinity Stable versions like 22.8.15.25.altinitystable.  Both versions are zero padded to four
      segments and compared segment by segment from the left.
      :param version_str: A version string consisting of up to 4 integers delimited by dots
      :return: True if the server version is greater than or equal to version_str, False if it is lower or if
       either version cannot be parsed as integers (a warning is logged in that case)
      """
      try:
          server = [int(part) for part in self.server_version.split('.') if part.isnumeric()]
          requested = [int(part) for part in version_str.split('.')]
      except ValueError:
          logger.warning('Server %s or requested version %s does not match format of numbers separated by dots',
                         self.server_version, version_str)
          return False
      server += [0] * (4 - len(server))
      requested += [0] * (4 - len(requested))
      # The first segment that differs decides the outcome; versions with no differing segment meet the minimum
      return next((have > want for have, want in zip(server, requested) if have != want), True)
  @abstractmethod
  def data_insert(self, context: InsertContext) -> QuerySummary:
      """
      Subclass implementation of the data insert.  This is the method insert() delegates to once its
      InsertContext is ready.
      :param context: InsertContext parameter object
      :return: QuerySummary (per the declared return type), throws an exception if the insert fails
      """
  @abstractmethod
  def raw_insert(self, table: str,
                 column_names: Optional[Sequence[str]] = None,
                 insert_block: Union[str, bytes, Generator[bytes, None, None], BinaryIO] = None,
                 settings: Optional[Dict] = None,
                 fmt: Optional[str] = None,
                 compression: Optional[str] = None) -> QuerySummary:
      """
      Insert data that is already formatted (string, bytes, a generator of bytes, or a binary file-like object).
      Abstract here; the behavior is supplied by the subclass.
      :param table: Table name (whether qualified with the database name or not)
      :param column_names: Sequence of column names
      :param insert_block: Binary or string data already in a recognized ClickHouse format
      :param settings: Optional dictionary of ClickHouse settings (key/string values)
      :param fmt: Valid clickhouse format
      :param compression: Recognized ClickHouse `Accept-Encoding` header compression value
      :return: QuerySummary (per the declared return type)
      """
  def close(self):
      """
      Subclass implementation to close the connection to the server/deallocate the client.  The method is not
      abstract and has no body beyond this docstring, so it does nothing unless a subclass overrides it.
      __exit__ calls it when the client is used as a context manager.
      """
  def _context_query(self, lcls: dict, **overrides):
      """
      Build a query context from a dictionary of arguments and run it
      :param lcls: Dictionary of keyword arguments for create_query_context (presumably the calling method's
       locals() -- confirm against callers).  It must contain a 'self' key, which is removed from a copy; the
       dictionary passed in is not modified
      :param overrides: Keyword arguments that replace or extend the entries taken from lcls
      :return: Whatever _query_with_context returns for the created context
      """
      params = dict(lcls)
      del params['self']  # KeyError if the caller's dictionary has no 'self' entry
      params.update(overrides)
      query_context = self.create_query_context(**params)
      return self._query_with_context(query_context)
  def __enter__(self):
      """
      Context manager entry.  Returns this client itself so it can be bound by `with ... as client`
      """
      return self
  def __exit__(self, exc_type, exc_value, exc_traceback):
      """
      Context manager exit.  Calls close(); the exception arguments are not inspected and nothing is returned,
      so an exception raised inside the `with` block continues to propagate
      """
      self.close()