reader.py 6.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185
  1. # This module contains abstractions for the input stream. You don't have to
  2. # look further; there is no pretty code here.
  3. #
  4. # We define two classes here.
  5. #
  6. # Mark(source, line, column)
  7. # It's just a record and its only use is producing nice error messages.
  8. # Parser does not use it for any other purposes.
  9. #
  10. # Reader(source, data)
  11. # Reader determines the encoding of `data` and converts it to unicode.
  12. # Reader provides the following methods and attributes:
  13. # reader.peek(index=0) - return the character `index` positions after the current one.
  14. # reader.forward(length=1) - move the current position forward by `length` characters.
  15. # reader.index - the number of the current character.
  16. # reader.line, reader.column - the line and the column of the current character.
  17. __all__ = ['Reader', 'ReaderError']
  18. from .error import YAMLError, Mark
  19. import codecs, re
  class ReaderError(YAMLError):
      """Error describing a problem found while reading the input stream.

      The constructor stores its five arguments unchanged as attributes of
      the same names: `name`, `position`, `character`, `encoding` and
      `reason`.
      """

      def __init__(self, name, position, character, encoding, reason):
          self.name, self.position = name, position
          self.character = character
          self.encoding, self.reason = encoding, reason

      def __str__(self):
          """Return a two-line message whose wording depends on `character`.

          A `bytes` character is reported as a codec decoding failure; any
          other value is formatted as the code point of an unacceptable
          character.
          """
          # Both messages end with the same source name and position.
          where = (self.name, self.position)
          if not isinstance(self.character, bytes):
              template = "unacceptable character #x%04x: %s\n" \
                      " in \"%s\", position %d"
              return template % ((self.character, self.reason) + where)
          template = "'%s' codec can't decode byte #x%02x: %s\n" \
                  " in \"%s\", position %d"
          details = (self.encoding, ord(self.character), self.reason)
          return template % (details + where)
  class Reader(object):
      """Decode an input source and expose it as a stream of characters.

      Reader:
       - determines the data encoding and converts the data to `str`,
       - checks that every character is in the allowed range,
       - appends a terminating NUL character to the end.

      Reader accepts
       - a `bytes` object,
       - a `str` object,
       - a file-like object whose `read` method returns `bytes`,
       - a file-like object whose `read` method returns `str`.

      Yeah, it's ugly and slow.
      """

      def __init__(self, stream):
          """Set up the reader state and load the first chunk of `stream`."""
          self.name = None            # label used in marks and error messages
          self.stream = None          # file-like source, None for str/bytes input
          self.stream_pointer = 0     # total len() of everything read from `stream`
          self.eof = True             # no more raw data can be read
          self.buffer = ''            # decoded characters not yet discarded
          self.pointer = 0            # offset of the current character in `buffer`
          self.raw_buffer = None      # data not yet decoded; None when exhausted
          self.raw_decode = None      # codec decode function, None for text input
          self.encoding = None        # name of the detected encoding
          self.index = 0              # number of characters consumed so far
          self.line = 0
          self.column = 0
          if isinstance(stream, str):
              self.name = "<unicode string>"
              self.check_printable(stream)
              self.buffer = stream+'\0'
          elif isinstance(stream, bytes):
              self.name = "<byte string>"
              self.raw_buffer = stream
              self.determine_encoding()
          else:
              self.stream = stream
              self.name = getattr(stream, 'name', "<file>")
              self.eof = False
              self.raw_buffer = None
              self.determine_encoding()

      def peek(self, index=0):
          """Return the character `index` positions after the current one.

          The position is not moved. If the character is not buffered yet,
          the buffer is refilled once and the lookup is retried.
          """
          try:
              return self.buffer[self.pointer+index]
          except IndexError:
              self.update(index+1)
              return self.buffer[self.pointer+index]

      def prefix(self, length=1):
          """Return the next `length` characters without moving the position.

          Fewer characters are returned when the buffer holds fewer than
          `length` characters after refilling.
          """
          if self.pointer+length >= len(self.buffer):
              self.update(length)
          return self.buffer[self.pointer:self.pointer+length]

      def forward(self, length=1):
          """Advance by `length` characters, tracking index, line and column."""
          # One extra character is requested so that the look-ahead after a
          # '\r' below can be answered from the buffer.
          if self.pointer+length+1 >= len(self.buffer):
              self.update(length+1)
          while length:
              ch = self.buffer[self.pointer]
              self.pointer += 1
              self.index += 1
              # A '\r' immediately followed by '\n' is not counted as a line
              # break here; the following '\n' is.
              if ch in '\n\x85\u2028\u2029' \
                      or (ch == '\r' and self.buffer[self.pointer] != '\n'):
                  self.line += 1
                  self.column = 0
              elif ch != '\uFEFF':
                  # '\uFEFF' does not advance the column.
                  self.column += 1
              length -= 1

      def get_mark(self):
          """Return a `Mark` for the current position.

          The buffer and pointer are included only when the reader has no
          underlying stream; otherwise `None` is passed for both.
          """
          if self.stream is None:
              return Mark(self.name, self.index, self.line, self.column,
                      self.buffer, self.pointer)
          else:
              return Mark(self.name, self.index, self.line, self.column,
                      None, None)

      def determine_encoding(self):
          """Select the decoder from the leading bytes and fill the buffer.

          Raw data is read until at least two items are available or the
          stream ends. For `bytes` data a UTF-16 byte order mark selects
          'utf-16-le' or 'utf-16-be'; anything else is decoded as 'utf-8'.
          Non-`bytes` data leaves `raw_decode` and `encoding` as `None`.
          """
          while not self.eof and (self.raw_buffer is None or len(self.raw_buffer) < 2):
              self.update_raw()
          if isinstance(self.raw_buffer, bytes):
              if self.raw_buffer.startswith(codecs.BOM_UTF16_LE):
                  self.raw_decode = codecs.utf_16_le_decode
                  self.encoding = 'utf-16-le'
              elif self.raw_buffer.startswith(codecs.BOM_UTF16_BE):
                  self.raw_decode = codecs.utf_16_be_decode
                  self.encoding = 'utf-16-be'
              else:
                  self.raw_decode = codecs.utf_8_decode
                  self.encoding = 'utf-8'
          self.update(1)

      # Matches any single character outside the ranges accepted by the reader.
      NON_PRINTABLE = re.compile('[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD\U00010000-\U0010ffff]')

      def check_printable(self, data):
          """Raise `ReaderError` for the first character matching NON_PRINTABLE.

          `data` is text about to be appended to the buffer, so the reported
          position is the current index plus the characters still buffered
          plus the offset of the match inside `data`.
          """
          match = self.NON_PRINTABLE.search(data)
          if match:
              character = match.group()
              position = self.index+(len(self.buffer)-self.pointer)+match.start()
              raise ReaderError(self.name, position, ord(character),
                      'unicode', "special characters are not allowed")

      def update(self, length):
          """Try to make at least `length` characters available in the buffer.

          Already consumed characters are dropped from the buffer first. Does
          nothing when there is no raw data left (`raw_buffer` is `None`).
          Once the end of the input is reached a NUL character is appended
          and `raw_buffer` is set to `None`.

          Raises `ReaderError` when the raw data cannot be decoded or when
          the decoded text fails `check_printable`.
          """
          if self.raw_buffer is None:
              return
          self.buffer = self.buffer[self.pointer:]
          self.pointer = 0
          while len(self.buffer) < length:
              if not self.eof:
                  self.update_raw()
              if self.raw_decode is not None:
                  try:
                      data, converted = self.raw_decode(self.raw_buffer,
                              'strict', self.eof)
                  except UnicodeDecodeError as exc:
                      # Slice instead of indexing: indexing `bytes` yields an
                      # int, but ReaderError.__str__ only produces the
                      # "codec can't decode byte" message for a `bytes` value.
                      character = self.raw_buffer[exc.start:exc.start+1]
                      if self.stream is not None:
                          position = self.stream_pointer-len(self.raw_buffer)+exc.start
                      else:
                          position = exc.start
                      raise ReaderError(self.name, position, character,
                              exc.encoding, exc.reason)
              else:
                  # The stream already yields text, nothing to decode.
                  data = self.raw_buffer
                  converted = len(data)
              self.check_printable(data)
              self.buffer += data
              self.raw_buffer = self.raw_buffer[converted:]
              if self.eof:
                  self.buffer += '\0'
                  self.raw_buffer = None
                  break

      def update_raw(self, size=4096):
          """Read up to `size` items from the stream into `raw_buffer`.

          An empty read marks the end of the stream by setting `eof`.
          """
          data = self.stream.read(size)
          if self.raw_buffer is None:
              self.raw_buffer = data
          else:
              self.raw_buffer += data
          self.stream_pointer += len(data)
          if not data:
              self.eof = True