# openpy.py
"""
Tools to open .py files as Unicode, using the encoding specified within the
file, as per PEP 263.

Much of the code is taken from the tokenize module in Python 3.2.
"""
  6. from __future__ import absolute_import
  7. import io
  8. from io import TextIOWrapper, BytesIO
  9. import os.path
  10. import re
  11. from .py3compat import unicode_type
  12. cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)", re.UNICODE)
  13. cookie_comment_re = re.compile(r"^\s*#.*coding[:=]\s*([-\w.]+)", re.UNICODE)
  14. try:
  15. # Available in Python 3
  16. from tokenize import detect_encoding
  17. except ImportError:
  18. from codecs import lookup, BOM_UTF8
  19. # Copied from Python 3.2 tokenize
  20. def _get_normal_name(orig_enc):
  21. """Imitates get_normal_name in tokenizer.c."""
  22. # Only care about the first 12 characters.
  23. enc = orig_enc[:12].lower().replace("_", "-")
  24. if enc == "utf-8" or enc.startswith("utf-8-"):
  25. return "utf-8"
  26. if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
  27. enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
  28. return "iso-8859-1"
  29. return orig_enc
  30. # Copied from Python 3.2 tokenize
  31. def detect_encoding(readline):
  32. """
  33. The detect_encoding() function is used to detect the encoding that should
  34. be used to decode a Python source file. It requires one argment, readline,
  35. in the same way as the tokenize() generator.
  36. It will call readline a maximum of twice, and return the encoding used
  37. (as a string) and a list of any lines (left as bytes) it has read in.
  38. It detects the encoding from the presence of a utf-8 bom or an encoding
  39. cookie as specified in pep-0263. If both a bom and a cookie are present,
  40. but disagree, a SyntaxError will be raised. If the encoding cookie is an
  41. invalid charset, raise a SyntaxError. Note that if a utf-8 bom is found,
  42. 'utf-8-sig' is returned.
  43. If no encoding is specified, then the default of 'utf-8' will be returned.
  44. """
  45. bom_found = False
  46. encoding = None
  47. default = 'utf-8'
  48. def read_or_stop():
  49. try:
  50. return readline()
  51. except StopIteration:
  52. return b''
  53. def find_cookie(line):
  54. try:
  55. line_string = line.decode('ascii')
  56. except UnicodeDecodeError:
  57. return None
  58. matches = cookie_re.findall(line_string)
  59. if not matches:
  60. return None
  61. encoding = _get_normal_name(matches[0])
  62. try:
  63. codec = lookup(encoding)
  64. except LookupError:
  65. # This behaviour mimics the Python interpreter
  66. raise SyntaxError("unknown encoding: " + encoding)
  67. if bom_found:
  68. if codec.name != 'utf-8':
  69. # This behaviour mimics the Python interpreter
  70. raise SyntaxError('encoding problem: utf-8')
  71. encoding += '-sig'
  72. return encoding
  73. first = read_or_stop()
  74. if first.startswith(BOM_UTF8):
  75. bom_found = True
  76. first = first[3:]
  77. default = 'utf-8-sig'
  78. if not first:
  79. return default, []
  80. encoding = find_cookie(first)
  81. if encoding:
  82. return encoding, [first]
  83. second = read_or_stop()
  84. if not second:
  85. return default, [first]
  86. encoding = find_cookie(second)
  87. if encoding:
  88. return encoding, [first, second]
  89. return default, [first, second]
  90. try:
  91. # Available in Python 3.2 and above.
  92. from tokenize import open
  93. except ImportError:
  94. # Copied from Python 3.2 tokenize
  95. def open(filename):
  96. """Open a file in read only mode using the encoding detected by
  97. detect_encoding().
  98. """
  99. buffer = io.open(filename, 'rb') # Tweaked to use io.open for Python 2
  100. encoding, lines = detect_encoding(buffer.readline)
  101. buffer.seek(0)
  102. text = TextIOWrapper(buffer, encoding, line_buffering=True)
  103. text.mode = 'r'
  104. return text
  105. def source_to_unicode(txt, errors='replace', skip_encoding_cookie=True):
  106. """Converts a bytes string with python source code to unicode.
  107. Unicode strings are passed through unchanged. Byte strings are checked
  108. for the python source file encoding cookie to determine encoding.
  109. txt can be either a bytes buffer or a string containing the source
  110. code.
  111. """
  112. if isinstance(txt, unicode_type):
  113. return txt
  114. if isinstance(txt, bytes):
  115. buffer = BytesIO(txt)
  116. else:
  117. buffer = txt
  118. try:
  119. encoding, _ = detect_encoding(buffer.readline)
  120. except SyntaxError:
  121. encoding = "ascii"
  122. buffer.seek(0)
  123. text = TextIOWrapper(buffer, encoding, errors=errors, line_buffering=True)
  124. text.mode = 'r'
  125. if skip_encoding_cookie:
  126. return u"".join(strip_encoding_cookie(text))
  127. else:
  128. return text.read()
  129. def strip_encoding_cookie(filelike):
  130. """Generator to pull lines from a text-mode file, skipping the encoding
  131. cookie if it is found in the first two lines.
  132. """
  133. it = iter(filelike)
  134. try:
  135. first = next(it)
  136. if not cookie_comment_re.match(first):
  137. yield first
  138. second = next(it)
  139. if not cookie_comment_re.match(second):
  140. yield second
  141. except StopIteration:
  142. return
  143. for line in it:
  144. yield line
  145. def read_py_file(filename, skip_encoding_cookie=True):
  146. """Read a Python file, using the encoding declared inside the file.
  147. Parameters
  148. ----------
  149. filename : str
  150. The path to the file to read.
  151. skip_encoding_cookie : bool
  152. If True (the default), and the encoding declaration is found in the first
  153. two lines, that line will be excluded from the output - compiling a
  154. unicode string with an encoding declaration is a SyntaxError in Python 2.
  155. Returns
  156. -------
  157. A unicode string containing the contents of the file.
  158. """
  159. with open(filename) as f: # the open function defined in this module.
  160. if skip_encoding_cookie:
  161. return "".join(strip_encoding_cookie(f))
  162. else:
  163. return f.read()
  164. def read_py_url(url, errors='replace', skip_encoding_cookie=True):
  165. """Read a Python file from a URL, using the encoding declared inside the file.
  166. Parameters
  167. ----------
  168. url : str
  169. The URL from which to fetch the file.
  170. errors : str
  171. How to handle decoding errors in the file. Options are the same as for
  172. bytes.decode(), but here 'replace' is the default.
  173. skip_encoding_cookie : bool
  174. If True (the default), and the encoding declaration is found in the first
  175. two lines, that line will be excluded from the output - compiling a
  176. unicode string with an encoding declaration is a SyntaxError in Python 2.
  177. Returns
  178. -------
  179. A unicode string containing the contents of the file.
  180. """
  181. # Deferred import for faster start
  182. try:
  183. from urllib.request import urlopen # Py 3
  184. except ImportError:
  185. from urllib import urlopen
  186. response = urlopen(url)
  187. buffer = io.BytesIO(response.read())
  188. return source_to_unicode(buffer, errors, skip_encoding_cookie)
  189. def _list_readline(x):
  190. """Given a list, returns a readline() function that returns the next element
  191. with each call.
  192. """
  193. x = iter(x)
  194. def readline():
  195. return next(x)
  196. return readline
  197. # Code for going between .py files and cached .pyc files ----------------------
  198. try: # Python 3.2, see PEP 3147
  199. try:
  200. from importlib.util import source_from_cache, cache_from_source
  201. except ImportError :
  202. ## deprecated since 3.4
  203. from imp import source_from_cache, cache_from_source
  204. except ImportError:
  205. # Python <= 3.1: .pyc files go next to .py
  206. def source_from_cache(path):
  207. basename, ext = os.path.splitext(path)
  208. if ext not in ('.pyc', '.pyo'):
  209. raise ValueError('Not a cached Python file extension', ext)
  210. # Should we look for .pyw files?
  211. return basename + '.py'
  212. def cache_from_source(path, debug_override=None):
  213. if debug_override is None:
  214. debug_override = __debug__
  215. basename, ext = os.path.splitext(path)
  216. return basename + '.pyc' if debug_override else '.pyo'