SMusatov
/
ydb
зеркало из https://github.com/ydb-platform/ydb.git


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162
							"""Backport of tokenize.open from Python 3.5

This is the exact Python 3.5 with the following differences:
 - detect_encoding_ex is detect_encoding from Python 3.5 returning also a bool whether a cookie was found
 - detect_encoding calls detect_encoding_ex, so that its signature is the same as in Python 3.5
 - function read_source_lines was added
"""

from codecs import lookup, BOM_UTF8
from io import TextIOWrapper, open as _builtin_open
import re

re_ASCII = 256 # not present in Python 2
cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re_ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re_ASCII)


def _get_normal_name(orig_enc):
	"""Imitates get_normal_name in tokenizer.c."""
	# Only care about the first 12 characters.
	enc = orig_enc[:12].lower().replace("_", "-")
	if enc == "utf-8" or enc.startswith("utf-8-"):
		return "utf-8"
	if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
			enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
		return "iso-8859-1"
	return orig_enc


def detect_encoding(readline):
	"""
	The detect_encoding() function is used to detect the encoding that should
	be used to decode a Python source file.  It requires one argument, readline,
	in the same way as the tokenize() generator.
	
	It will call readline a maximum of twice, and return the encoding used
	(as a string) and a list of any lines (left as bytes) it has read in.
	
	It detects the encoding from the presence of a utf-8 bom or an encoding
	cookie as specified in pep-0263.  If both a bom and a cookie are present,
	but disagree, a SyntaxError will be raised.  If the encoding cookie is an
	invalid charset, raise a SyntaxError.  Note that if a utf-8 bom is found,
	'utf-8-sig' is returned.
	
	If no encoding is specified, then the default of 'utf-8' will be returned.
	"""
	
	return detect_encoding_ex(readline)[:2]


def detect_encoding_ex(readline):
	try:
		filename = readline.__self__.name
	except AttributeError:
		filename = None
	bom_found = False
	encoding = None
	default = 'utf-8'
	def read_or_stop():
		try:
			return readline()
		except StopIteration:
			return b''
	
	def find_cookie(line):
		try:
			# Decode as UTF-8. Either the line is an encoding declaration,
			# in which case it should be pure ASCII, or it must be UTF-8
			# per default encoding.
			line_string = line.decode('utf-8')
		except UnicodeDecodeError:
			msg = "invalid or missing encoding declaration"
			if filename is not None:
				msg = '{} for {!r}'.format(msg, filename)
			raise SyntaxError(msg)
		
		match = cookie_re.match(line_string)
		if not match:
			return None
		encoding = _get_normal_name(match.group(1))
		try:
			codec = lookup(encoding)
		except LookupError:
			# This behaviour mimics the Python interpreter
			if filename is None:
				msg = "unknown encoding: " + encoding
			else:
				msg = "unknown encoding for {!r}: {}".format(filename,
						encoding)
			raise SyntaxError(msg)
		
		if bom_found:
			if encoding != 'utf-8':
				# This behaviour mimics the Python interpreter
				if filename is None:
					msg = 'encoding problem: utf-8'
				else:
					msg = 'encoding problem for {!r}: utf-8'.format(filename)
				raise SyntaxError(msg)
			encoding += '-sig'
		return encoding
	
	first = read_or_stop()
	if first.startswith(BOM_UTF8):
		bom_found = True
		first = first[3:]
		default = 'utf-8-sig'
	if not first:
		return default, [], False
	
	encoding = find_cookie(first)
	if encoding:
		return encoding, [first], True
	if not blank_re.match(first):
		return default, [first], False
	
	second = read_or_stop()
	if not second:
		return default, [first], False
	
	encoding = find_cookie(second)
	if encoding:
		return encoding, [first, second], True
	
	return default, [first, second], False


def open(filename):
	"""Open a file in read only mode using the encoding detected by
	detect_encoding().
	"""
	buffer = _builtin_open(filename, 'rb')
	try:
		encoding, lines = detect_encoding(buffer.readline)
		buffer.seek(0)
		text = TextIOWrapper(buffer, encoding, line_buffering=True)
		text.mode = 'r'
		return text
	except:
		buffer.close()
		raise

def read_source_lines(filename):
	buffer = _builtin_open(filename, 'rb')
	try:
		encoding, lines, cookie_present = detect_encoding_ex(buffer.readline)
		buffer.seek(0)
		text = TextIOWrapper(buffer, encoding, line_buffering=True)
		text.mode = 'r'
	except:
		buffer.close()
		raise
	
	with text:
		if cookie_present:
			for i in lines:
				yield text.readline().replace("coding", "Coding")
				# so compile() won't complain about encoding declatation in a Unicode string
				# see 2.7/Python/ast.c:228
		
		for line in text:
			yield line