tokenize_open.py 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162
  1. """Backport of tokenize.open from Python 3.5
  2. This is the exact Python 3.5 with the following differences:
  3. - detect_encoding_ex is detect_encoding from Python 3.5 returning also a bool whether a cookie was found
  4. - detect_encoding calls detect_encoding_ex, so that its signature is the same as in Python 3.5
  5. - function read_source_lines was added
  6. """
  7. from codecs import lookup, BOM_UTF8
  8. from io import TextIOWrapper, open as _builtin_open
  9. import re
  10. re_ASCII = 256 # not present in Python 2
  11. cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re_ASCII)
  12. blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re_ASCII)
  13. def _get_normal_name(orig_enc):
  14. """Imitates get_normal_name in tokenizer.c."""
  15. # Only care about the first 12 characters.
  16. enc = orig_enc[:12].lower().replace("_", "-")
  17. if enc == "utf-8" or enc.startswith("utf-8-"):
  18. return "utf-8"
  19. if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
  20. enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
  21. return "iso-8859-1"
  22. return orig_enc
  23. def detect_encoding(readline):
  24. """
  25. The detect_encoding() function is used to detect the encoding that should
  26. be used to decode a Python source file. It requires one argument, readline,
  27. in the same way as the tokenize() generator.
  28. It will call readline a maximum of twice, and return the encoding used
  29. (as a string) and a list of any lines (left as bytes) it has read in.
  30. It detects the encoding from the presence of a utf-8 bom or an encoding
  31. cookie as specified in pep-0263. If both a bom and a cookie are present,
  32. but disagree, a SyntaxError will be raised. If the encoding cookie is an
  33. invalid charset, raise a SyntaxError. Note that if a utf-8 bom is found,
  34. 'utf-8-sig' is returned.
  35. If no encoding is specified, then the default of 'utf-8' will be returned.
  36. """
  37. return detect_encoding_ex(readline)[:2]
def detect_encoding_ex(readline):
    """Like detect_encoding(), but return (encoding, lines, cookie_found)
    where cookie_found tells whether an explicit encoding cookie was seen.

    This is the exact detect_encoding() body from Python 3.5's tokenize,
    extended only with the third element of the return tuple.
    """
    try:
        # If readline is a bound method of a file object, use its name
        # in error messages.
        filename = readline.__self__.name
    except AttributeError:
        filename = None
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        # Return b'' at EOF so the caller can treat exhaustion uniformly.
        try:
            return readline()
        except StopIteration:
            return b''
    def find_cookie(line):
        # Return the declared encoding of `line`, or None if it carries
        # no PEP 263 cookie.  Raises SyntaxError for undecodable lines,
        # unknown charsets, or a cookie that contradicts the UTF-8 BOM.
        try:
            # Decode as UTF-8. Either the line is an encoding declaration,
            # in which case it should be pure ASCII, or it must be UTF-8
            # per default encoding.
            line_string = line.decode('utf-8')
        except UnicodeDecodeError:
            msg = "invalid or missing encoding declaration"
            if filename is not None:
                msg = '{} for {!r}'.format(msg, filename)
            raise SyntaxError(msg)
        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            # Validate the name against the codec registry.
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            if filename is None:
                msg = "unknown encoding: " + encoding
            else:
                msg = "unknown encoding for {!r}: {}".format(filename,
                                                             encoding)
            raise SyntaxError(msg)
        if bom_found:
            if encoding != 'utf-8':
                # This behaviour mimics the Python interpreter
                if filename is None:
                    msg = 'encoding problem: utf-8'
                else:
                    msg = 'encoding problem for {!r}: utf-8'.format(filename)
                raise SyntaxError(msg)
            # BOM plus a matching utf-8 cookie: report the BOM variant.
            encoding += '-sig'
        return encoding
    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, [], False
    encoding = find_cookie(first)
    if encoding:
        return encoding, [first], True
    if not blank_re.match(first):
        # First line is real code: the cookie cannot appear later.
        return default, [first], False
    second = read_or_stop()
    if not second:
        return default, [first], False
    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second], True
    return default, [first, second], False
  105. def open(filename):
  106. """Open a file in read only mode using the encoding detected by
  107. detect_encoding().
  108. """
  109. buffer = _builtin_open(filename, 'rb')
  110. try:
  111. encoding, lines = detect_encoding(buffer.readline)
  112. buffer.seek(0)
  113. text = TextIOWrapper(buffer, encoding, line_buffering=True)
  114. text.mode = 'r'
  115. return text
  116. except:
  117. buffer.close()
  118. raise
  119. def read_source_lines(filename):
  120. buffer = _builtin_open(filename, 'rb')
  121. try:
  122. encoding, lines, cookie_present = detect_encoding_ex(buffer.readline)
  123. buffer.seek(0)
  124. text = TextIOWrapper(buffer, encoding, line_buffering=True)
  125. text.mode = 'r'
  126. except:
  127. buffer.close()
  128. raise
  129. with text:
  130. if cookie_present:
  131. for i in lines:
  132. yield text.readline().replace("coding", "Coding")
  133. # so compile() won't complain about encoding declatation in a Unicode string
  134. # see 2.7/Python/ast.c:228
  135. for line in text:
  136. yield line