reader.py 7.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193
# This module contains abstractions for the input stream. You don't have to
# look further; there is no pretty code here.
#
# We define two classes here.
#
#   ReaderError(name, position, character, encoding, reason)
# Raised when the input cannot be decoded or contains a character that is
# not allowed.
#
#   Reader(stream)
# Reader determines the encoding of `stream` and converts it to unicode.
# Reader provides the following methods and attributes:
#   reader.peek(index=0) - return the character `index` positions ahead.
#   reader.prefix(length=1) - return the next `length` characters.
#   reader.forward(length=1) - move the current position by `length` characters.
#   reader.get_mark() - return a Mark for the current position.
#   reader.index - the number of the current character.
#   reader.line, reader.column - the line and the column of the current character.
#
# Mark (imported from `error`) is just a record and its only use is producing
# nice error messages. Parser does not use it for any other purposes.
  17. __all__ = ['Reader', 'ReaderError']
  18. from error import YAMLError, Mark
  19. import codecs, re, sys
  20. has_ucs4 = sys.maxunicode > 0xffff
  21. class ReaderError(YAMLError):
  22. def __init__(self, name, position, character, encoding, reason):
  23. self.name = name
  24. self.character = character
  25. self.position = position
  26. self.encoding = encoding
  27. self.reason = reason
  28. def __str__(self):
  29. if isinstance(self.character, str):
  30. return "'%s' codec can't decode byte #x%02x: %s\n" \
  31. " in \"%s\", position %d" \
  32. % (self.encoding, ord(self.character), self.reason,
  33. self.name, self.position)
  34. else:
  35. return "unacceptable character #x%04x: %s\n" \
  36. " in \"%s\", position %d" \
  37. % (self.character, self.reason,
  38. self.name, self.position)
  39. class Reader(object):
  40. # Reader:
  41. # - determines the data encoding and converts it to unicode,
  42. # - checks if characters are in allowed range,
  43. # - adds '\0' to the end.
  44. # Reader accepts
  45. # - a `str` object,
  46. # - a `unicode` object,
  47. # - a file-like object with its `read` method returning `str`,
  48. # - a file-like object with its `read` method returning `unicode`.
  49. # Yeah, it's ugly and slow.
  50. def __init__(self, stream):
  51. self.name = None
  52. self.stream = None
  53. self.stream_pointer = 0
  54. self.eof = True
  55. self.buffer = u''
  56. self.pointer = 0
  57. self.raw_buffer = None
  58. self.raw_decode = None
  59. self.encoding = None
  60. self.index = 0
  61. self.line = 0
  62. self.column = 0
  63. if isinstance(stream, unicode):
  64. self.name = "<unicode string>"
  65. self.check_printable(stream)
  66. self.buffer = stream+u'\0'
  67. elif isinstance(stream, str):
  68. self.name = "<string>"
  69. self.raw_buffer = stream
  70. self.determine_encoding()
  71. else:
  72. self.stream = stream
  73. self.name = getattr(stream, 'name', "<file>")
  74. self.eof = False
  75. self.raw_buffer = ''
  76. self.determine_encoding()
  77. def peek(self, index=0):
  78. try:
  79. return self.buffer[self.pointer+index]
  80. except IndexError:
  81. self.update(index+1)
  82. return self.buffer[self.pointer+index]
  83. def prefix(self, length=1):
  84. if self.pointer+length >= len(self.buffer):
  85. self.update(length)
  86. return self.buffer[self.pointer:self.pointer+length]
  87. def forward(self, length=1):
  88. if self.pointer+length+1 >= len(self.buffer):
  89. self.update(length+1)
  90. while length:
  91. ch = self.buffer[self.pointer]
  92. self.pointer += 1
  93. self.index += 1
  94. if ch in u'\n\x85\u2028\u2029' \
  95. or (ch == u'\r' and self.buffer[self.pointer] != u'\n'):
  96. self.line += 1
  97. self.column = 0
  98. elif ch != u'\uFEFF':
  99. self.column += 1
  100. length -= 1
  101. def get_mark(self):
  102. if self.stream is None:
  103. return Mark(self.name, self.index, self.line, self.column,
  104. self.buffer, self.pointer)
  105. else:
  106. return Mark(self.name, self.index, self.line, self.column,
  107. None, None)
  108. def determine_encoding(self):
  109. while not self.eof and len(self.raw_buffer) < 2:
  110. self.update_raw()
  111. if not isinstance(self.raw_buffer, unicode):
  112. if self.raw_buffer.startswith(codecs.BOM_UTF16_LE):
  113. self.raw_decode = codecs.utf_16_le_decode
  114. self.encoding = 'utf-16-le'
  115. elif self.raw_buffer.startswith(codecs.BOM_UTF16_BE):
  116. self.raw_decode = codecs.utf_16_be_decode
  117. self.encoding = 'utf-16-be'
  118. else:
  119. self.raw_decode = codecs.utf_8_decode
  120. self.encoding = 'utf-8'
  121. self.update(1)
  122. if has_ucs4:
  123. NON_PRINTABLE = u'[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD\U00010000-\U0010ffff]'
  124. elif sys.platform.startswith('java'):
  125. # Jython doesn't support lone surrogates https://bugs.jython.org/issue2048
  126. NON_PRINTABLE = u'[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD]'
  127. else:
  128. # Need to use eval here due to the above Jython issue
  129. NON_PRINTABLE = eval(r"u'[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uFFFD]|(?:^|[^\uD800-\uDBFF])[\uDC00-\uDFFF]|[\uD800-\uDBFF](?:[^\uDC00-\uDFFF]|$)'")
  130. NON_PRINTABLE = re.compile(NON_PRINTABLE)
  131. def check_printable(self, data):
  132. match = self.NON_PRINTABLE.search(data)
  133. if match:
  134. character = match.group()
  135. position = self.index+(len(self.buffer)-self.pointer)+match.start()
  136. raise ReaderError(self.name, position, ord(character),
  137. 'unicode', "special characters are not allowed")
  138. def update(self, length):
  139. if self.raw_buffer is None:
  140. return
  141. self.buffer = self.buffer[self.pointer:]
  142. self.pointer = 0
  143. while len(self.buffer) < length:
  144. if not self.eof:
  145. self.update_raw()
  146. if self.raw_decode is not None:
  147. try:
  148. data, converted = self.raw_decode(self.raw_buffer,
  149. 'strict', self.eof)
  150. except UnicodeDecodeError, exc:
  151. character = exc.object[exc.start]
  152. if self.stream is not None:
  153. position = self.stream_pointer-len(self.raw_buffer)+exc.start
  154. else:
  155. position = exc.start
  156. raise ReaderError(self.name, position, character,
  157. exc.encoding, exc.reason)
  158. else:
  159. data = self.raw_buffer
  160. converted = len(data)
  161. self.check_printable(data)
  162. self.buffer += data
  163. self.raw_buffer = self.raw_buffer[converted:]
  164. if self.eof:
  165. self.buffer += u'\0'
  166. self.raw_buffer = None
  167. break
  168. def update_raw(self, size=1024):
  169. data = self.stream.read(size)
  170. if data:
  171. self.raw_buffer += data
  172. self.stream_pointer += len(data)
  173. else:
  174. self.eof = True