- """
- Run chardet on a bunch of documents and see that we get the correct encodings.
- :author: Dan Blanchard
- :author: Ian Cordasco
- """
- from __future__ import with_statement
- import textwrap
- from difflib import ndiff
- from io import open
- from os import listdir
- from os.path import dirname, isdir, join, splitext, basename
- try:
- import hypothesis.strategies as st
- from hypothesis import given, assume, settings, Verbosity
- HAVE_HYPOTHESIS = True
- except ImportError:
- HAVE_HYPOTHESIS = False
- import pytest
- import chardet
- import yatest.common

# TODO: Restore Hungarian encodings (iso-8859-2 and windows-1250) after we
#       retrain model.
MISSING_ENCODINGS = {'iso-8859-2', 'iso-8859-6', 'windows-1250',
                     'windows-1254', 'windows-1256'}

EXPECTED_FAILURES = {'iso-8859-7-greek/disabled.gr.xml',
                     'iso-8859-9-turkish/divxplanet.com.xml',
                     'iso-8859-9-turkish/subtitle.srt',
                     'iso-8859-9-turkish/wikitop_tr_ISO-8859-9.txt'}
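# Entries in EXPECTED_FAILURES are '<encoding directory>/<file name>' relative
# to the test_data directory, matching the join(basename(path), file_name)
# lookup in gen_test_params() below.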

def gen_test_params():
    """Yields tuples of paths and encodings to use for test_encoding_detection"""
    base_path = yatest.common.work_path('test_data')
    for encoding in listdir(base_path):
        path = join(base_path, encoding)
        # Skip loose files in the test_data directory; only the per-encoding
        # subdirectories matter
        if not isdir(path):
            continue
        # Remove language suffixes from the encoding if present
        encoding = encoding.lower()
        for postfix in ['-arabic', '-bulgarian', '-cyrillic', '-greek',
                        '-hebrew', '-hungarian', '-turkish']:
            if encoding.endswith(postfix):
                encoding = encoding.rpartition(postfix)[0]
                break
        # Skip directories for encodings we don't handle yet.
        if encoding in MISSING_ENCODINGS:
            continue
        # Test encoding detection for each file we have for this encoding
        for file_name in listdir(path):
            ext = splitext(file_name)[1].lower()
            if ext not in ['.html', '.txt', '.xml', '.srt']:
                continue
            full_path = join(path, file_name)
            test_case = full_path, encoding
            if join(basename(path), file_name) in EXPECTED_FAILURES:
                test_case = pytest.param(*test_case, marks=pytest.mark.xfail)
            yield test_case

def get_test_name(args):
    return join(basename(dirname(args)), basename(args))
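
# pytest calls get_test_name once for each parametrized value to build the test
# id. For a corpus file such as .../test_data/utf-8/sample.txt (a hypothetical
# name) it returns 'utf-8/sample.txt'; for the bare encoding string it simply
# returns the encoding itself.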
@pytest.mark.parametrize('file_name, encoding', gen_test_params(), ids=get_test_name)
def test_encoding_detection(file_name, encoding):
    with open(file_name, 'rb') as f:
        input_bytes = f.read()
    result = chardet.detect(input_bytes)
    try:
        expected_unicode = input_bytes.decode(encoding)
    except LookupError:
        expected_unicode = ''
    try:
        detected_unicode = input_bytes.decode(result['encoding'])
    except (LookupError, UnicodeDecodeError, TypeError):
        detected_unicode = ''
    if result:
        encoding_match = (result['encoding'] or '').lower() == encoding
    else:
        encoding_match = False
    # Only care about mismatches that would actually result in different
    # behavior when decoding
    if not encoding_match and expected_unicode != detected_unicode:
        wrapped_expected = '\n'.join(textwrap.wrap(expected_unicode, 100)) + '\n'
        wrapped_detected = '\n'.join(textwrap.wrap(detected_unicode, 100)) + '\n'
        diff = ''.join(ndiff(wrapped_expected.splitlines(True),
                             wrapped_detected.splitlines(True)))
    else:
        diff = ''
        encoding_match = True
    assert encoding_match, ('Expected %s, but got %s for %s. Character '
                            'differences: \n%s' % (encoding, result,
                                                   file_name, diff))
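
# The tests below are property-based: instead of the on-disk corpus, Hypothesis
# generates random text/encoding pairs. They are only defined when hypothesis
# is importable (see the HAVE_HYPOTHESIS flag set at import time above).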
if HAVE_HYPOTHESIS:
    class JustALengthIssue(Exception):
        pass

    @pytest.mark.xfail
    @given(st.text(min_size=1),
           st.sampled_from(['ascii', 'utf-8', 'utf-16', 'utf-32',
                            'iso-8859-7', 'iso-8859-8', 'windows-1255']),
           st.randoms())
    @settings(max_examples=200)
    def test_never_fails_to_detect_if_there_is_a_valid_encoding(txt, enc, rnd):
        try:
            data = txt.encode(enc)
        except UnicodeEncodeError:
            assume(False)
        detected = chardet.detect(data)['encoding']
        if detected is None:
            with pytest.raises(JustALengthIssue):

                @given(st.text(), random=rnd)
                @settings(verbosity=Verbosity.quiet, max_shrinks=0,
                          max_examples=50)
                def string_poisons_following_text(suffix):
                    try:
                        extended = (txt + suffix).encode(enc)
                    except UnicodeEncodeError:
                        assume(False)
                    result = chardet.detect(extended)
                    if result and result['encoding'] is not None:
                        raise JustALengthIssue()

                # The inner property has to be invoked for pytest.raises to
                # observe JustALengthIssue
                string_poisons_following_text()
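
    # detect_all() returns the candidate encodings ranked by confidence; its
    # top-ranked candidate should agree with the single answer from detect().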

    @given(st.text(min_size=1),
           st.sampled_from(['ascii', 'utf-8', 'utf-16', 'utf-32',
                            'iso-8859-7', 'iso-8859-8', 'windows-1255']),
           st.randoms())
    @settings(max_examples=200)
    def test_detect_all_and_detect_one_should_agree(txt, enc, rnd):
        try:
            data = txt.encode(enc)
        except UnicodeEncodeError:
            assume(False)
        result = chardet.detect(data)
        results = chardet.detect_all(data)
        assert result['encoding'] == results[0]['encoding'], \
            '%s != %s' % (result, results)
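
# Minimal usage sketch of the API under test (illustrative only; the sample
# bytes are an assumption, not part of the corpus):
#
#     import chardet
#     raw = u'\u041f\u0440\u0438\u0432\u0435\u0442'.encode('utf-8')
#     chardet.detect(raw)      # -> {'encoding': ..., 'confidence': ..., 'language': ...}
#     chardet.detect_all(raw)  # -> candidate list ranked by confidence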