reader.py 6.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191
  1. # SPDX-License-Identifier: MIT
  2. # This module contains abstractions for the input stream. You don't have to
  3. # look any further; there is no pretty code here.
  4. #
  5. # Two classes are used here.
  6. #
  7. # Mark(name, index, line, column, buffer, pointer)
  8. # It's just a record (imported from `error`) and its only use is producing nice error messages.
  9. # Parser does not use it for any other purposes.
  10. #
  11. # Reader(stream)
  12. # Reader determines the encoding of `stream` and converts it to unicode.
  13. # Reader provides the following methods and attributes:
  14. # reader.peek(index=0) - return the character `index` positions after the current one.
  15. # reader.forward(length=1) - move the current position forward by `length` characters.
  16. # reader.index - the number of the current character.
  17. # reader.line, reader.column - the line and the column of the current character.
  18. __all__ = ['Reader', 'ReaderError']
  19. from error import YAMLError, Mark
  20. import codecs, re
  21. class ReaderError(YAMLError):
  22. def __init__(self, name, position, character, encoding, reason):
  23. self.name = name
  24. self.character = character
  25. self.position = position
  26. self.encoding = encoding
  27. self.reason = reason
  28. def __str__(self):
  29. if isinstance(self.character, str):
  30. return "'%s' codec can't decode byte #x%02x: %s\n" \
  31. " in \"%s\", position %d" \
  32. % (self.encoding, ord(self.character), self.reason,
  33. self.name, self.position)
  34. else:
  35. return "unacceptable character #x%04x: %s\n" \
  36. " in \"%s\", position %d" \
  37. % (self.character, self.reason,
  38. self.name, self.position)
  39. class Reader(object):
  40. # Reader:
  41. # - determines the data encoding and converts it to unicode,
  42. # - checks if characters are in allowed range,
  43. # - adds '\0' to the end.
  44. # Reader accepts
  45. # - a `str` object,
  46. # - a `unicode` object,
  47. # - a file-like object with its `read` method returning `str`,
  48. # - a file-like object with its `read` method returning `unicode`.
  49. # Yeah, it's ugly and slow.
  50. def __init__(self, stream):
  51. self.name = None
  52. self.stream = None
  53. self.stream_pointer = 0
  54. self.eof = True
  55. self.buffer = u''
  56. self.pointer = 0
  57. self.raw_buffer = None
  58. self.raw_decode = None
  59. self.encoding = None
  60. self.index = 0
  61. self.line = 0
  62. self.column = 0
  63. if isinstance(stream, unicode):
  64. self.name = "<unicode string>"
  65. self.check_printable(stream)
  66. self.buffer = stream+u'\0'
  67. elif isinstance(stream, str):
  68. self.name = "<string>"
  69. self.raw_buffer = stream
  70. self.determine_encoding()
  71. else:
  72. self.stream = stream
  73. self.name = getattr(stream, 'name', "<file>")
  74. self.eof = False
  75. self.raw_buffer = ''
  76. self.determine_encoding()
  77. def peek(self, index=0):
  78. try:
  79. return self.buffer[self.pointer+index]
  80. except IndexError:
  81. self.update(index+1)
  82. return self.buffer[self.pointer+index]
  83. def prefix(self, length=1):
  84. if self.pointer+length >= len(self.buffer):
  85. self.update(length)
  86. return self.buffer[self.pointer:self.pointer+length]
  87. def forward(self, length=1):
  88. if self.pointer+length+1 >= len(self.buffer):
  89. self.update(length+1)
  90. while length:
  91. ch = self.buffer[self.pointer]
  92. self.pointer += 1
  93. self.index += 1
  94. if ch in u'\n\x85\u2028\u2029' \
  95. or (ch == u'\r' and self.buffer[self.pointer] != u'\n'):
  96. self.line += 1
  97. self.column = 0
  98. elif ch != u'\uFEFF':
  99. self.column += 1
  100. length -= 1
  101. def get_mark(self):
  102. if self.stream is None:
  103. return Mark(self.name, self.index, self.line, self.column,
  104. self.buffer, self.pointer)
  105. else:
  106. return Mark(self.name, self.index, self.line, self.column,
  107. None, None)
  108. def determine_encoding(self):
  109. while not self.eof and len(self.raw_buffer) < 2:
  110. self.update_raw()
  111. if not isinstance(self.raw_buffer, unicode):
  112. if self.raw_buffer.startswith(codecs.BOM_UTF16_LE):
  113. self.raw_decode = codecs.utf_16_le_decode
  114. self.encoding = 'utf-16-le'
  115. elif self.raw_buffer.startswith(codecs.BOM_UTF16_BE):
  116. self.raw_decode = codecs.utf_16_be_decode
  117. self.encoding = 'utf-16-be'
  118. else:
  119. self.raw_decode = codecs.utf_8_decode
  120. self.encoding = 'utf-8'
  121. self.update(1)
  122. NON_PRINTABLE = re.compile(u'[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD]')
  123. def check_printable(self, data):
  124. match = self.NON_PRINTABLE.search(data)
  125. if match:
  126. character = match.group()
  127. position = self.index+(len(self.buffer)-self.pointer)+match.start()
  128. raise ReaderError(self.name, position, ord(character),
  129. 'unicode', "special characters are not allowed")
  130. def update(self, length):
  131. if self.raw_buffer is None:
  132. return
  133. self.buffer = self.buffer[self.pointer:]
  134. self.pointer = 0
  135. while len(self.buffer) < length:
  136. if not self.eof:
  137. self.update_raw()
  138. if self.raw_decode is not None:
  139. try:
  140. data, converted = self.raw_decode(self.raw_buffer,
  141. 'strict', self.eof)
  142. except UnicodeDecodeError, exc:
  143. character = exc.object[exc.start]
  144. if self.stream is not None:
  145. position = self.stream_pointer-len(self.raw_buffer)+exc.start
  146. else:
  147. position = exc.start
  148. raise ReaderError(self.name, position, character,
  149. exc.encoding, exc.reason)
  150. else:
  151. data = self.raw_buffer
  152. converted = len(data)
  153. self.check_printable(data)
  154. self.buffer += data
  155. self.raw_buffer = self.raw_buffer[converted:]
  156. if self.eof:
  157. self.buffer += u'\0'
  158. self.raw_buffer = None
  159. break
  160. def update_raw(self, size=1024):
  161. data = self.stream.read(size)
  162. if data:
  163. self.raw_buffer += data
  164. self.stream_pointer += len(data)
  165. else:
  166. self.eof = True
  167. #try:
  168. # import psyco
  169. # psyco.bind(Reader)
  170. #except ImportError:
  171. # pass