csv_parser.cpp 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449
  1. #include "csv_parser.h"
  2. #include <ydb/public/lib/ydb_cli/common/common.h>
  3. #include <library/cpp/string_utils/csv/csv.h>
  4. namespace NYdb {
  5. namespace NConsoleClient {
  6. namespace {
  7. class TCsvToYdbConverter {
  8. public:
  9. explicit TCsvToYdbConverter(TTypeParser& parser, const std::optional<TString>& nullValue)
  10. : Parser(parser)
  11. , NullValue(nullValue)
  12. {
  13. }
  14. template <class T, std::enable_if_t<std::is_integral_v<T> && std::is_signed_v<T>, std::nullptr_t> = nullptr>
  15. static i64 StringToArithmetic(const TString& token, size_t& cnt) {
  16. return std::stoll(token, &cnt);
  17. }
  18. template <class T, std::enable_if_t<std::is_integral_v<T> && std::is_unsigned_v<T>, std::nullptr_t> = nullptr>
  19. static ui64 StringToArithmetic(const TString& token, size_t& cnt) {
  20. return std::stoull(token, &cnt);
  21. }
  22. template <class T, std::enable_if_t<std::is_same_v<T, float>, std::nullptr_t> = nullptr>
  23. static float StringToArithmetic(const TString& token, size_t& cnt) {
  24. return std::stof(token, &cnt);
  25. }
  26. template <class T, std::enable_if_t<std::is_same_v<T, double>, std::nullptr_t> = nullptr>
  27. static double StringToArithmetic(const TString& token, size_t& cnt) {
  28. return std::stod(token, &cnt);
  29. }
  30. template <class T>
  31. T GetArithmetic(const TString& token) const {
  32. size_t cnt;
  33. try {
  34. auto value = StringToArithmetic<T>(token, cnt);
  35. if (cnt != token.Size() || value < std::numeric_limits<T>::lowest() || value > std::numeric_limits<T>::max()) {
  36. throw yexception();
  37. }
  38. return static_cast<T>(value);
  39. } catch (std::exception& e) {
  40. throw TCsvParseException() << "Expected " << Parser.GetPrimitive() << " value, received: \"" << token << "\".";
  41. }
  42. }
  43. void BuildPrimitive(const TString& token) {
  44. switch (Parser.GetPrimitive()) {
  45. case EPrimitiveType::Int8:
  46. Builder.Int8(GetArithmetic<i8>(token));
  47. break;
  48. case EPrimitiveType::Int16:
  49. Builder.Int16(GetArithmetic<i16>(token));
  50. break;
  51. case EPrimitiveType::Int32:
  52. Builder.Int32(GetArithmetic<i32>(token));
  53. break;
  54. case EPrimitiveType::Int64:
  55. Builder.Int64(GetArithmetic<i64>(token));
  56. break;
  57. case EPrimitiveType::Uint8:
  58. Builder.Uint8(GetArithmetic<ui8>(token));
  59. break;
  60. case EPrimitiveType::Uint16:
  61. Builder.Uint16(GetArithmetic<ui16>(token));
  62. break;
  63. case EPrimitiveType::Uint32:
  64. Builder.Uint32(GetArithmetic<ui32>(token));
  65. break;
  66. case EPrimitiveType::Uint64:
  67. Builder.Uint64(GetArithmetic<ui64>(token));
  68. break;
  69. case EPrimitiveType::Bool:
  70. Builder.Bool(GetBool(token));
  71. break;
  72. case EPrimitiveType::String:
  73. Builder.String(token);
  74. break;
  75. case EPrimitiveType::Utf8:
  76. Builder.Utf8(token);
  77. break;
  78. case EPrimitiveType::Json:
  79. Builder.Json(token);
  80. break;
  81. case EPrimitiveType::JsonDocument:
  82. Builder.JsonDocument(token);
  83. break;
  84. case EPrimitiveType::Yson:
  85. Builder.Yson(token);
  86. break;
  87. case EPrimitiveType::Uuid:
  88. Builder.Uuid(token);
  89. break;
  90. case EPrimitiveType::Float:
  91. Builder.Float(GetArithmetic<float>(token));
  92. break;
  93. case EPrimitiveType::Double:
  94. Builder.Double(GetArithmetic<double>(token));
  95. break;
  96. case EPrimitiveType::DyNumber:
  97. Builder.DyNumber(token);
  98. break;
  99. case EPrimitiveType::Date: {
  100. TInstant date;
  101. if (!TInstant::TryParseIso8601(token, date)) {
  102. date = TInstant::Days(GetArithmetic<ui16>(token));
  103. }
  104. Builder.Date(date);
  105. break;
  106. }
  107. case EPrimitiveType::Datetime: {
  108. TInstant datetime;
  109. if (!TInstant::TryParseIso8601(token, datetime)) {
  110. datetime = TInstant::Seconds(GetArithmetic<ui32>(token));
  111. }
  112. Builder.Datetime(datetime);
  113. break;
  114. }
  115. case EPrimitiveType::Timestamp: {
  116. TInstant timestamp;
  117. if (!TInstant::TryParseIso8601(token, timestamp)) {
  118. timestamp = TInstant::MicroSeconds(GetArithmetic<ui64>(token));
  119. }
  120. Builder.Timestamp(timestamp);
  121. break;
  122. }
  123. case EPrimitiveType::Interval:
  124. Builder.Interval(GetArithmetic<i64>(token));
  125. break;
  126. case EPrimitiveType::Date32: {
  127. TInstant date;
  128. if (TInstant::TryParseIso8601(token, date)) {
  129. Builder.Date32(date.Days());
  130. } else {
  131. Builder.Date32(GetArithmetic<i32>(token));
  132. }
  133. break;
  134. }
  135. case EPrimitiveType::Datetime64: {
  136. TInstant date;
  137. if (TInstant::TryParseIso8601(token, date)) {
  138. Builder.Datetime64(date.Seconds());
  139. } else {
  140. Builder.Datetime64(GetArithmetic<i64>(token));
  141. }
  142. break;
  143. }
  144. case EPrimitiveType::Timestamp64: {
  145. TInstant date;
  146. if (TInstant::TryParseIso8601(token, date)) {
  147. Builder.Timestamp64(date.MicroSeconds());
  148. } else {
  149. Builder.Timestamp64(GetArithmetic<i64>(token));
  150. }
  151. break;
  152. }
  153. case EPrimitiveType::Interval64:
  154. Builder.Interval64(GetArithmetic<i64>(token));
  155. break;
  156. case EPrimitiveType::TzDate:
  157. Builder.TzDate(token);
  158. break;
  159. case EPrimitiveType::TzDatetime:
  160. Builder.TzDatetime(token);
  161. break;
  162. case EPrimitiveType::TzTimestamp:
  163. Builder.TzTimestamp(token);
  164. break;
  165. default:
  166. throw TCsvParseException() << "Unsupported primitive type: " << Parser.GetPrimitive();
  167. }
  168. }
  169. void BuildValue(TStringBuf token) {
  170. switch (Parser.GetKind()) {
  171. case TTypeParser::ETypeKind::Primitive: {
  172. BuildPrimitive(TString(token));
  173. break;
  174. }
  175. case TTypeParser::ETypeKind::Decimal: {
  176. Builder.Decimal(TString(token));
  177. break;
  178. }
  179. case TTypeParser::ETypeKind::Optional: {
  180. Parser.OpenOptional();
  181. if (NullValue && token == NullValue) {
  182. Builder.EmptyOptional(GetType());
  183. } else {
  184. Builder.BeginOptional();
  185. BuildValue(token);
  186. Builder.EndOptional();
  187. }
  188. Parser.CloseOptional();
  189. break;
  190. }
  191. case TTypeParser::ETypeKind::Null: {
  192. EnsureNull(token);
  193. break;
  194. }
  195. case TTypeParser::ETypeKind::Void: {
  196. EnsureNull(token);
  197. break;
  198. }
  199. case TTypeParser::ETypeKind::Tagged: {
  200. Parser.OpenTagged();
  201. Builder.BeginTagged(Parser.GetTag());
  202. BuildValue(token);
  203. Builder.EndTagged();
  204. Parser.CloseTagged();
  205. break;
  206. }
  207. case TTypeParser::ETypeKind::Pg: {
  208. if (NullValue && token == NullValue) {
  209. Builder.Pg(TPgValue(TPgValue::VK_NULL, {}, Parser.GetPg()));
  210. } else {
  211. Builder.Pg(TPgValue(TPgValue::VK_TEXT, TString(token), Parser.GetPg()));
  212. }
  213. break;
  214. }
  215. default:
  216. throw TCsvParseException() << "Unsupported type kind: " << Parser.GetKind();
  217. }
  218. }
  219. void BuildType(TTypeBuilder& typeBuilder) {
  220. switch (Parser.GetKind()) {
  221. case TTypeParser::ETypeKind::Primitive:
  222. typeBuilder.Primitive(Parser.GetPrimitive());
  223. break;
  224. case TTypeParser::ETypeKind::Decimal:
  225. typeBuilder.Decimal(Parser.GetDecimal());
  226. break;
  227. case TTypeParser::ETypeKind::Optional:
  228. Parser.OpenOptional();
  229. typeBuilder.BeginOptional();
  230. BuildType(typeBuilder);
  231. typeBuilder.EndOptional();
  232. Parser.CloseOptional();
  233. break;
  234. case TTypeParser::ETypeKind::Tagged:
  235. Parser.OpenTagged();
  236. typeBuilder.BeginTagged(Parser.GetTag());
  237. BuildType(typeBuilder);
  238. typeBuilder.EndTagged();
  239. Parser.CloseTagged();
  240. break;
  241. case TTypeParser::ETypeKind::Pg:
  242. typeBuilder.Pg(Parser.GetPg());
  243. break;
  244. default:
  245. throw TCsvParseException() << "Unsupported type kind: " << Parser.GetKind();
  246. }
  247. }
  248. TType GetType() {
  249. TTypeBuilder typeBuilder;
  250. BuildType(typeBuilder);
  251. return typeBuilder.Build();
  252. }
  253. bool GetBool(const TString& token) const {
  254. if (token == "true") {
  255. return true;
  256. }
  257. if (token == "false") {
  258. return false;
  259. }
  260. throw TCsvParseException() << "Expected bool value: \"true\" or \"false\", received: \"" << token << "\".";
  261. }
  262. void EnsureNull(TStringBuf token) const {
  263. if (!NullValue) {
  264. throw TCsvParseException() << "Expected null value instead of \"" << token << "\", but null value is not set.";
  265. }
  266. if (token != NullValue) {
  267. throw TCsvParseException() << "Expected null value: \"" << NullValue << "\", received: \"" << token << "\".";
  268. }
  269. }
  270. TValue Convert(TStringBuf token) {
  271. BuildValue(token);
  272. return Builder.Build();
  273. }
  274. private:
  275. TTypeParser& Parser;
  276. const std::optional<TString> NullValue = "";
  277. TValueBuilder Builder;
  278. };
  279. TCsvParseException FormatError(const std::exception& inputError,
  280. const TCsvParser::TParseMetadata& meta,
  281. std::optional<TString> columnName = {}) {
  282. auto outputError = TCsvParseException() << "Error during CSV parsing";
  283. if (meta.Line.has_value()) {
  284. outputError << " in line " << meta.Line.value();
  285. }
  286. if (columnName.has_value()) {
  287. outputError << " in column `" << columnName.value() << "`";
  288. }
  289. if (meta.Filename.has_value()) {
  290. outputError << " in file `" << meta.Filename.value() << "`";
  291. }
  292. outputError << ":\n" << inputError.what();
  293. return outputError;
  294. }
  295. TValue FieldToValue(TTypeParser& parser,
  296. TStringBuf token,
  297. const std::optional<TString>& nullValue,
  298. const TCsvParser::TParseMetadata& meta,
  299. TString columnName) {
  300. try {
  301. TCsvToYdbConverter converter(parser, nullValue);
  302. return converter.Convert(token);
  303. } catch (std::exception& e) {
  304. throw FormatError(e, meta, columnName);
  305. }
  306. }
  307. TStringBuf Consume(NCsvFormat::CsvSplitter& splitter,
  308. const TCsvParser::TParseMetadata& meta,
  309. TString columnName) {
  310. try {
  311. return splitter.Consume();
  312. } catch (std::exception& e) {
  313. throw FormatError(e, meta, columnName);
  314. }
  315. }
  316. }
  317. TCsvParser::TCsvParser(TString&& headerRow, const char delimeter, const std::optional<TString>& nullValue,
  318. const std::map<TString, TType>* paramTypes,
  319. const std::map<TString, TString>* paramSources)
  320. : HeaderRow(std::move(headerRow))
  321. , Delimeter(delimeter)
  322. , NullValue(nullValue)
  323. , ParamTypes(paramTypes)
  324. , ParamSources(paramSources)
  325. {
  326. NCsvFormat::CsvSplitter splitter(HeaderRow, Delimeter);
  327. Header = static_cast<TVector<TString>>(splitter);
  328. }
  329. TCsvParser::TCsvParser(TVector<TString>&& header, const char delimeter, const std::optional<TString>& nullValue,
  330. const std::map<TString, TType>* paramTypes,
  331. const std::map<TString, TString>* paramSources)
  332. : Header(std::move(header))
  333. , Delimeter(delimeter)
  334. , NullValue(nullValue)
  335. , ParamTypes(paramTypes)
  336. , ParamSources(paramSources)
  337. {
  338. }
  339. void TCsvParser::GetParams(TString&& data, TParamsBuilder& builder, const TParseMetadata& meta) const {
  340. NCsvFormat::CsvSplitter splitter(data, Delimeter);
  341. auto headerIt = Header.begin();
  342. do {
  343. if (headerIt == Header.end()) {
  344. throw FormatError(yexception() << "Header contains less fields than data. Header: \"" << HeaderRow << "\", data: \"" << data << "\"", meta);
  345. }
  346. TStringBuf token = Consume(splitter, meta, *headerIt);
  347. TString fullname = "$" + *headerIt;
  348. auto paramIt = ParamTypes->find(fullname);
  349. if (paramIt == ParamTypes->end()) {
  350. ++headerIt;
  351. continue;
  352. }
  353. if (ParamSources) {
  354. auto paramSource = ParamSources->find(fullname);
  355. if (paramSource != ParamSources->end()) {
  356. throw FormatError(yexception() << "Parameter " << fullname << " value found in more than one source: stdin, " << paramSource->second << ".", meta);
  357. }
  358. }
  359. TTypeParser parser(paramIt->second);
  360. builder.AddParam(fullname, FieldToValue(parser, token, NullValue, meta, *headerIt));
  361. ++headerIt;
  362. } while (splitter.Step());
  363. if (headerIt != Header.end()) {
  364. throw FormatError(yexception() << "Header contains more fields than data. Header: \"" << HeaderRow << "\", data: \"" << data << "\"", meta);
  365. }
  366. }
  367. void TCsvParser::GetValue(TString&& data, TValueBuilder& builder, const TType& type, const TParseMetadata& meta) const {
  368. NCsvFormat::CsvSplitter splitter(data, Delimeter);
  369. auto headerIt = Header.cbegin();
  370. std::map<TString, TStringBuf> fields;
  371. do {
  372. if (headerIt == Header.cend()) {
  373. throw FormatError(yexception() << "Header contains less fields than data. Header: \"" << HeaderRow << "\", data: \"" << data << "\"", meta);
  374. }
  375. TStringBuf token = Consume(splitter, meta, *headerIt);
  376. fields[*headerIt] = token;
  377. ++headerIt;
  378. } while (splitter.Step());
  379. if (headerIt != Header.cend()) {
  380. throw FormatError(yexception() << "Header contains more fields than data. Header: \"" << HeaderRow << "\", data: \"" << data << "\"", meta);
  381. }
  382. builder.BeginStruct();
  383. TTypeParser parser(type);
  384. parser.OpenStruct();
  385. while (parser.TryNextMember()) {
  386. TString name = parser.GetMemberName();
  387. if (name == "__ydb_skip_column_name") {
  388. continue;
  389. }
  390. auto fieldIt = fields.find(name);
  391. if (fieldIt == fields.end()) {
  392. throw FormatError(yexception() << "No member \"" << name << "\" in csv string for YDB struct type", meta);
  393. }
  394. builder.AddMember(name, FieldToValue(parser, fieldIt->second, NullValue, meta, name));
  395. }
  396. parser.CloseStruct();
  397. builder.EndStruct();
  398. }
  399. TType TCsvParser::GetColumnsType() const {
  400. TTypeBuilder builder;
  401. builder.BeginStruct();
  402. for (const auto& colName : Header) {
  403. if (ParamTypes->find(colName) != ParamTypes->end()) {
  404. builder.AddMember(colName, ParamTypes->at(colName));
  405. } else {
  406. builder.AddMember("__ydb_skip_column_name", TTypeBuilder().Build());
  407. }
  408. }
  409. builder.EndStruct();
  410. return builder.Build();
  411. }
  412. }
  413. }