openpy.py 3.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105
  1. """
  2. Tools to open .py files as Unicode, using the encoding specified within the file,
  3. as per PEP 263.
  4. Much of the code is taken from the tokenize module in Python 3.2.
  5. """
  6. import io
  7. from io import TextIOWrapper, BytesIO
  8. from pathlib import Path
  9. import re
  10. from tokenize import open, detect_encoding
  11. cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)", re.UNICODE)
  12. cookie_comment_re = re.compile(r"^\s*#.*coding[:=]\s*([-\w.]+)", re.UNICODE)
  13. def source_to_unicode(txt, errors='replace', skip_encoding_cookie=True):
  14. """Converts a bytes string with python source code to unicode.
  15. Unicode strings are passed through unchanged. Byte strings are checked
  16. for the python source file encoding cookie to determine encoding.
  17. txt can be either a bytes buffer or a string containing the source
  18. code.
  19. """
  20. if isinstance(txt, str):
  21. return txt
  22. if isinstance(txt, bytes):
  23. buffer = BytesIO(txt)
  24. else:
  25. buffer = txt
  26. try:
  27. encoding, _ = detect_encoding(buffer.readline)
  28. except SyntaxError:
  29. encoding = "ascii"
  30. buffer.seek(0)
  31. with TextIOWrapper(buffer, encoding, errors=errors, line_buffering=True) as text:
  32. text.mode = 'r'
  33. if skip_encoding_cookie:
  34. return u"".join(strip_encoding_cookie(text))
  35. else:
  36. return text.read()
  37. def strip_encoding_cookie(filelike):
  38. """Generator to pull lines from a text-mode file, skipping the encoding
  39. cookie if it is found in the first two lines.
  40. """
  41. it = iter(filelike)
  42. try:
  43. first = next(it)
  44. if not cookie_comment_re.match(first):
  45. yield first
  46. second = next(it)
  47. if not cookie_comment_re.match(second):
  48. yield second
  49. except StopIteration:
  50. return
  51. for line in it:
  52. yield line
  53. def read_py_file(filename, skip_encoding_cookie=True):
  54. """Read a Python file, using the encoding declared inside the file.
  55. Parameters
  56. ----------
  57. filename : str
  58. The path to the file to read.
  59. skip_encoding_cookie : bool
  60. If True (the default), and the encoding declaration is found in the first
  61. two lines, that line will be excluded from the output.
  62. Returns
  63. -------
  64. A unicode string containing the contents of the file.
  65. """
  66. filepath = Path(filename)
  67. with open(filepath) as f: # the open function defined in this module.
  68. if skip_encoding_cookie:
  69. return "".join(strip_encoding_cookie(f))
  70. else:
  71. return f.read()
  72. def read_py_url(url, errors='replace', skip_encoding_cookie=True):
  73. """Read a Python file from a URL, using the encoding declared inside the file.
  74. Parameters
  75. ----------
  76. url : str
  77. The URL from which to fetch the file.
  78. errors : str
  79. How to handle decoding errors in the file. Options are the same as for
  80. bytes.decode(), but here 'replace' is the default.
  81. skip_encoding_cookie : bool
  82. If True (the default), and the encoding declaration is found in the first
  83. two lines, that line will be excluded from the output.
  84. Returns
  85. -------
  86. A unicode string containing the contents of the file.
  87. """
  88. # Deferred import for faster start
  89. from urllib.request import urlopen
  90. response = urlopen(url)
  91. buffer = io.BytesIO(response.read())
  92. return source_to_unicode(buffer, errors, skip_encoding_cookie)