test.py

  1. """
  2. Run chardet on a bunch of documents and see that we get the correct encodings.
  3. :author: Dan Blanchard
  4. :author: Ian Cordasco
  5. """
  6. from __future__ import with_statement
  7. import textwrap
  8. from difflib import ndiff
  9. from io import open
  10. from os import listdir
  11. from os.path import dirname, isdir, join, splitext, basename
  12. try:
  13. import hypothesis.strategies as st
  14. from hypothesis import given, assume, settings, Verbosity
  15. HAVE_HYPOTHESIS = True
  16. except ImportError:
  17. HAVE_HYPOTHESIS = False
  18. import pytest
  19. import chardet
  20. import yatest.common
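
# The corpus is laid out as test_data/<encoding>[-<language>]/, with each
# directory named for the ground-truth encoding of the samples inside; the
# optional language suffix is stripped in gen_test_params below.
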
# TODO: Restore Hungarian encodings (iso-8859-2 and windows-1250) after we
#       retrain the model.
MISSING_ENCODINGS = {'iso-8859-2', 'iso-8859-6', 'windows-1250',
                     'windows-1254', 'windows-1256'}
EXPECTED_FAILURES = {'iso-8859-7-greek/disabled.gr.xml',
                     'iso-8859-9-turkish/divxplanet.com.xml',
                     'iso-8859-9-turkish/subtitle.srt',
                     'iso-8859-9-turkish/wikitop_tr_ISO-8859-9.txt'}
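
# NOTE: entries in EXPECTED_FAILURES are '<directory>/<file>' paths relative
# to the test_data root; matching samples are still run but marked xfail, so
# we notice if the detector starts handling them.
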
def gen_test_params():
    """Yields tuples of paths and encodings to use for test_encoding_detection"""
    base_path = yatest.common.work_path('test_data')
    for encoding in listdir(base_path):
        path = join(base_path, encoding)
        # Skip anything in test_data that isn't a directory of samples
        if not isdir(path):
            continue
        # Remove the language suffix from the directory name, if present,
        # to recover the bare encoding name
        encoding = encoding.lower()
        for postfix in ['-arabic', '-bulgarian', '-cyrillic', '-greek',
                        '-hebrew', '-hungarian', '-turkish']:
            if encoding.endswith(postfix):
                encoding = encoding.rpartition(postfix)[0]
                break
        # Skip directories for encodings we don't handle yet.
        if encoding in MISSING_ENCODINGS:
            continue
        # Test encoding detection for each file we have for this encoding
        for file_name in listdir(path):
            ext = splitext(file_name)[1].lower()
            if ext not in ['.html', '.txt', '.xml', '.srt']:
                continue
            full_path = join(path, file_name)
            test_case = full_path, encoding
            if join(basename(path), file_name) in EXPECTED_FAILURES:
                test_case = pytest.param(*test_case, marks=pytest.mark.xfail)
            yield test_case
def get_test_name(args):
    """Turn a full sample path into a readable '<encoding-dir>/<file>' test id"""
    return join(basename(dirname(args)), basename(args))
@pytest.mark.parametrize('file_name, encoding', gen_test_params(),
                         ids=get_test_name)
def test_encoding_detection(file_name, encoding):
    with open(file_name, 'rb') as f:
        input_bytes = f.read()
        result = chardet.detect(input_bytes)
        try:
            expected_unicode = input_bytes.decode(encoding)
        except LookupError:
            expected_unicode = ''
        try:
            detected_unicode = input_bytes.decode(result['encoding'])
        except (LookupError, UnicodeDecodeError, TypeError):
            detected_unicode = ''
    if result:
        encoding_match = (result['encoding'] or '').lower() == encoding
    else:
        encoding_match = False
    # Only care about mismatches that would actually result in different
    # behavior when decoding
    if not encoding_match and expected_unicode != detected_unicode:
        wrapped_expected = '\n'.join(textwrap.wrap(expected_unicode, 100)) + '\n'
        wrapped_detected = '\n'.join(textwrap.wrap(detected_unicode, 100)) + '\n'
        diff = ''.join(ndiff(wrapped_expected.splitlines(True),
                             wrapped_detected.splitlines(True)))
    else:
        diff = ''
        encoding_match = True
    assert encoding_match, ("Expected %s, but got %s for %s. Character "
                            "differences: \n%s" % (encoding,
                                                   result,
                                                   file_name,
                                                   diff))
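
# Note: chardet.detect returns a dict with 'encoding' and 'confidence' keys
# (plus 'language' in recent chardet releases); 'encoding' may be None when
# detection fails, which the TypeError branch above tolerates when decoding.
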
if HAVE_HYPOTHESIS:
    class JustALengthIssue(Exception):
        pass

    @pytest.mark.xfail
    @given(st.text(min_size=1), st.sampled_from(['ascii', 'utf-8', 'utf-16',
                                                 'utf-32', 'iso-8859-7',
                                                 'iso-8859-8', 'windows-1255']),
           st.randoms())
    @settings(max_examples=200)
    def test_never_fails_to_detect_if_there_is_a_valid_encoding(txt, enc, rnd):
        try:
            data = txt.encode(enc)
        except UnicodeEncodeError:
            assume(False)
        detected = chardet.detect(data)['encoding']
        if detected is None:
            with pytest.raises(JustALengthIssue):

                @given(st.text(), random=rnd)
                @settings(verbosity=Verbosity.quiet, max_shrinks=0,
                          max_examples=50)
                def string_poisons_following_text(suffix):
                    try:
                        extended = (txt + suffix).encode(enc)
                    except UnicodeEncodeError:
                        assume(False)
                    result = chardet.detect(extended)
                    if result and result['encoding'] is not None:
                        raise JustALengthIssue()

                # Actually invoke the nested hypothesis test so that
                # pytest.raises can observe JustALengthIssue
                string_poisons_following_text()
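
    # The nested hypothesis test above separates "input merely too short"
    # from a genuine detection failure: if appending random text makes
    # detection succeed, the original miss was just a length issue.
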
    @given(st.text(min_size=1), st.sampled_from(['ascii', 'utf-8', 'utf-16',
                                                 'utf-32', 'iso-8859-7',
                                                 'iso-8859-8', 'windows-1255']),
           st.randoms())
    @settings(max_examples=200)
    def test_detect_all_and_detect_one_should_agree(txt, enc, rnd):
        try:
            data = txt.encode(enc)
        except UnicodeEncodeError:
            assume(False)
        result = chardet.detect(data)
        results = chardet.detect_all(data)
        # The top-ranked detect_all candidate should match detect's answer
        assert result['encoding'] == results[0]['encoding'], (
            '%s != %s' % (result, results))
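
    # detect_all, available in recent chardet releases, returns a list of
    # candidate results with the most confident first; the invariant checked
    # above is that its top candidate agrees with the single answer from
    # detect.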