yql_csv.cpp 4.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194
  1. #include "yql_csv.h"
  2. #include <util/string/split.h>
  3. #include <util/string/escape.h>
  4. namespace {
  5. const char QUOTE_CH = '"';
  6. const char ESCAPE_CH = '\\';
  7. const char NULL_CH = '\0';
  8. class TCsvLineParser {
  9. public:
  10. TCsvLineParser(const TStringBuf& line, char delimiter)
  11. : Cur_(line.begin())
  12. , End_(line.end())
  13. , Delim_(delimiter)
  14. , Prev_(NULL_CH) /* must differ from ESCAPE_CH */
  15. {
  16. }
  17. bool Next(TStringBuf& token) {
  18. if (Cur_ > End_) return false;
  19. const char* tokenStart = Cur_;
  20. while (Cur_ < End_ && (Prev_ == ESCAPE_CH || *Cur_ != QUOTE_CH) && *Cur_ != Delim_) Prev_ = *Cur_++;
  21. if (Cur_ == End_) {
  22. token = { tokenStart, Cur_ };
  23. Prev_ = NULL_CH;
  24. Cur_++;
  25. return true;
  26. }
  27. if (Prev_ != ESCAPE_CH && *Cur_ == QUOTE_CH) {
  28. Prev_ = *Cur_++;
  29. tokenStart = Cur_;
  30. while (Cur_ < End_) {
  31. // find non escaped quote char
  32. if (*Cur_ == QUOTE_CH && Prev_ != ESCAPE_CH) break;
  33. Prev_ = *Cur_++;
  34. }
  35. if (Cur_ == End_)
  36. ythrow yexception() << "expected closing \"";
  37. token = { tokenStart, Cur_ };
  38. Prev_ = *Cur_++; // skip closing quote char
  39. if (Cur_ != End_ && *Cur_ != Delim_)
  40. ythrow yexception() << "expected end of line or delimiter";
  41. Prev_ = *Cur_++; // move out of buffer or skip delimiter
  42. return true;
  43. } else if (*Cur_ == Delim_) {
  44. token = { tokenStart, Cur_ };
  45. Prev_ = *Cur_++;
  46. return true;
  47. }
  48. return false;
  49. }
  50. private:
  51. const char* Cur_;
  52. const char* End_;
  53. const char Delim_;
  54. char Prev_;
  55. };
  56. } // namspace
  57. namespace NYql {
  58. namespace NUtils {
  59. ///////////////////////////////////////////////////////////////////////////////
  60. // TCsvInputStream
  61. ///////////////////////////////////////////////////////////////////////////////
  62. TCsvInputStream::TCsvInputStream(IInputStream& slave, char delimiter)
  63. : Slave_(slave)
  64. , Delim_(delimiter)
  65. {
  66. }
  67. TVector<TString> TCsvInputStream::ReadLine()
  68. {
  69. TVector<TString> parts;
  70. TString line;
  71. if (Slave_.ReadLine(line)) {
  72. TCsvLineParser lineParser(line, Delim_);
  73. TStringBuf token;
  74. while (lineParser.Next(token)) {
  75. parts.push_back(UnescapeC(token.data(), token.size()));
  76. }
  77. }
  78. return parts;
  79. }
  80. TVector<TString> TCsvInputStream::ReadLineWithEscaping()
  81. {
  82. TVector<TString> parts;
  83. TString line;
  84. if (Slave_.ReadLine(line)) {
  85. TCsvLineParser lineParser(line, Delim_);
  86. TStringBuf token;
  87. while (lineParser.Next(token)) {
  88. parts.push_back(TString(token));
  89. }
  90. }
  91. return parts;
  92. }
  93. ///////////////////////////////////////////////////////////////////////////////
  94. // TCsvInputBuffer
  95. ///////////////////////////////////////////////////////////////////////////////
  96. TCsvInputBuffer::TCsvInputBuffer(const TStringBuf& buffer, char delimiter)
  97. : Buffer_(buffer)
  98. , Delim_(delimiter)
  99. {
  100. }
  101. TVector<TString> TCsvInputBuffer::ReadLine()
  102. {
  103. TVector<TString> parts;
  104. TStringBuf line;
  105. if (Buffer_.ReadLine(line)) {
  106. TCsvLineParser lineParser(line, Delim_);
  107. TStringBuf token;
  108. while (lineParser.Next(token)) {
  109. parts.push_back(UnescapeC(token.data(), token.size()));
  110. }
  111. }
  112. return parts;
  113. }
  114. TVector<TString> TCsvInputBuffer::ReadLineWithEscaping()
  115. {
  116. TVector<TString> parts;
  117. TStringBuf line;
  118. if (Buffer_.ReadLine(line)) {
  119. TCsvLineParser lineParser(line, Delim_);
  120. TStringBuf token;
  121. while (lineParser.Next(token)) {
  122. parts.push_back(TString(token.data(), token.size()));
  123. }
  124. }
  125. return parts;
  126. }
  127. ///////////////////////////////////////////////////////////////////////////////
  128. // TCsvOutputStream
  129. ///////////////////////////////////////////////////////////////////////////////
  130. TCsvOutputStream::TCsvOutputStream(IOutputStream& slave, char delimiter, bool quoteItems)
  131. : Slave_(slave)
  132. , WasNL_(true)
  133. , Delim_(delimiter)
  134. , QuoteItems_(quoteItems)
  135. {
  136. }
  137. void TCsvOutputStream::DoWrite(const void* buf, size_t len)
  138. {
  139. TStringBuf charBuf(reinterpret_cast<const char*>(buf), len);
  140. if (charBuf == TStringBuf("\n")) {
  141. WasNL_ = true;
  142. Slave_.Write(buf, len);
  143. } else {
  144. if (!WasNL_) Slave_.Write(Delim_);
  145. WasNL_ = false;
  146. if (QuoteItems_) {
  147. Slave_.Write(QUOTE_CH);
  148. }
  149. Slave_.Write(EscapeC(charBuf));
  150. if (QuoteItems_) {
  151. Slave_.Write(QUOTE_CH);
  152. }
  153. }
  154. }
  155. void TCsvOutputStream::DoFlush() {
  156. Slave_.Flush();
  157. }
  158. } // namespace NUtils
  159. } // namespace NYql