from functools import lru_cache
from logging import getLogger
from typing import List, Optional

from .constant import (
    COMMON_SAFE_ASCII_CHARACTERS,
    TRACE,
    UNICODE_SECONDARY_RANGE_KEYWORD,
)
from .utils import (
    is_accentuated,
    is_case_variable,
    is_cjk,
    is_emoticon,
    is_hangul,
    is_hiragana,
    is_katakana,
    is_latin,
    is_punctuation,
    is_separator,
    is_symbol,
    is_thai,
    is_unprintable,
    remove_accent,
    unicode_range,
)


class MessDetectorPlugin:
    """
    Base abstract class used for mess detection plugins.
    All detectors MUST extend and implement the given methods.
    """

    def eligible(self, character: str) -> bool:
        """
        Determine if the given character should be fed in.
        """
        raise NotImplementedError  # pragma: nocover

    def feed(self, character: str) -> None:
        """
        The main routine to be executed upon character.
        Insert the logic in which the text would be considered chaotic.
        """
        raise NotImplementedError  # pragma: nocover

    def reset(self) -> None:  # pragma: no cover
        """
        Reset the plugin to its initial state.
        """
        raise NotImplementedError

    @property
    def ratio(self) -> float:
        """
        Compute the chaos ratio based on what your feed() has seen.
        Must NOT be lower than 0.0; there is no upper bound.
        """
        raise NotImplementedError  # pragma: nocover
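

# Illustrative sketch (an addition, not from the original module) of how a
# plugin is driven: each eligible character is fed in order, then `ratio` is
# read once at the end. `UnprintablePlugin` (defined below) serves as the
# example.
#
#     detector = UnprintablePlugin()
#     for character in "Hello \x00world":
#         if detector.eligible(character):
#             detector.feed(character)
#     detector.ratio  # > 0.0 because of the NUL character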


class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
    def __init__(self) -> None:
        self._punctuation_count: int = 0
        self._symbol_count: int = 0
        self._character_count: int = 0

        self._last_printable_char: Optional[str] = None
        self._frenzy_symbol_in_word: bool = False

    def eligible(self, character: str) -> bool:
        return character.isprintable()

    def feed(self, character: str) -> None:
        self._character_count += 1

        if (
            character != self._last_printable_char
            and character not in COMMON_SAFE_ASCII_CHARACTERS
        ):
            if is_punctuation(character):
                self._punctuation_count += 1
            elif (
                character.isdigit() is False
                and is_symbol(character)
                and is_emoticon(character) is False
            ):
                self._symbol_count += 2

        self._last_printable_char = character

    def reset(self) -> None:  # pragma: no cover
        self._punctuation_count = 0
        self._character_count = 0
        self._symbol_count = 0

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        ratio_of_punctuation: float = (
            self._punctuation_count + self._symbol_count
        ) / self._character_count

        return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0


class TooManyAccentuatedPlugin(MessDetectorPlugin):
    def __init__(self) -> None:
        self._character_count: int = 0
        self._accentuated_count: int = 0

    def eligible(self, character: str) -> bool:
        return character.isalpha()

    def feed(self, character: str) -> None:
        self._character_count += 1

        if is_accentuated(character):
            self._accentuated_count += 1

    def reset(self) -> None:  # pragma: no cover
        self._character_count = 0
        self._accentuated_count = 0

    @property
    def ratio(self) -> float:
        if self._character_count < 8:
            return 0.0

        ratio_of_accentuation: float = self._accentuated_count / self._character_count
        return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0


class UnprintablePlugin(MessDetectorPlugin):
    def __init__(self) -> None:
        self._unprintable_count: int = 0
        self._character_count: int = 0

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        if is_unprintable(character):
            self._unprintable_count += 1
        self._character_count += 1

    def reset(self) -> None:  # pragma: no cover
        self._unprintable_count = 0

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        return (self._unprintable_count * 8) / self._character_count
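

# Worked example (added for illustration): with the 8x weighting above, a
# single unprintable character in a 32-character payload already yields a
# ratio of (1 * 8) / 32 = 0.25, so unprintable bytes count heavily against
# a candidate encoding.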


class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
    def __init__(self) -> None:
        self._successive_count: int = 0
        self._character_count: int = 0

        self._last_latin_character: Optional[str] = None

    def eligible(self, character: str) -> bool:
        return character.isalpha() and is_latin(character)

    def feed(self, character: str) -> None:
        self._character_count += 1
        if (
            self._last_latin_character is not None
            and is_accentuated(character)
            and is_accentuated(self._last_latin_character)
        ):
            if character.isupper() and self._last_latin_character.isupper():
                self._successive_count += 1
            # Worse if it's the same character duplicated with a different accent.
            if remove_accent(character) == remove_accent(self._last_latin_character):
                self._successive_count += 1
        self._last_latin_character = character

    def reset(self) -> None:  # pragma: no cover
        self._successive_count = 0
        self._character_count = 0
        self._last_latin_character = None

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        return (self._successive_count * 2) / self._character_count


class SuspiciousRange(MessDetectorPlugin):
    def __init__(self) -> None:
        self._suspicious_successive_range_count: int = 0
        self._character_count: int = 0
        self._last_printable_seen: Optional[str] = None

    def eligible(self, character: str) -> bool:
        return character.isprintable()

    def feed(self, character: str) -> None:
        self._character_count += 1

        if (
            character.isspace()
            or is_punctuation(character)
            or character in COMMON_SAFE_ASCII_CHARACTERS
        ):
            self._last_printable_seen = None
            return

        if self._last_printable_seen is None:
            self._last_printable_seen = character
            return

        unicode_range_a: Optional[str] = unicode_range(self._last_printable_seen)
        unicode_range_b: Optional[str] = unicode_range(character)

        if is_suspiciously_successive_range(unicode_range_a, unicode_range_b):
            self._suspicious_successive_range_count += 1

        self._last_printable_seen = character

    def reset(self) -> None:  # pragma: no cover
        self._character_count = 0
        self._suspicious_successive_range_count = 0
        self._last_printable_seen = None

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        ratio_of_suspicious_range_usage: float = (
            self._suspicious_successive_range_count * 2
        ) / self._character_count

        if ratio_of_suspicious_range_usage < 0.1:
            return 0.0

        return ratio_of_suspicious_range_usage


class SuperWeirdWordPlugin(MessDetectorPlugin):
    def __init__(self) -> None:
        self._word_count: int = 0
        self._bad_word_count: int = 0
        self._foreign_long_count: int = 0

        self._is_current_word_bad: bool = False
        self._foreign_long_watch: bool = False

        self._character_count: int = 0
        self._bad_character_count: int = 0

        self._buffer: str = ""
        self._buffer_accent_count: int = 0

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        if character.isalpha():
            self._buffer += character
            if is_accentuated(character):
                self._buffer_accent_count += 1
            if (
                self._foreign_long_watch is False
                and (is_latin(character) is False or is_accentuated(character))
                and is_cjk(character) is False
                and is_hangul(character) is False
                and is_katakana(character) is False
                and is_hiragana(character) is False
                and is_thai(character) is False
            ):
                self._foreign_long_watch = True
            return

        if not self._buffer:
            return

        if (
            character.isspace() or is_punctuation(character) or is_separator(character)
        ) and self._buffer:
            self._word_count += 1
            buffer_length: int = len(self._buffer)

            self._character_count += buffer_length

            if buffer_length >= 4:
                if self._buffer_accent_count / buffer_length > 0.34:
                    self._is_current_word_bad = True
                # Words/buffers ending with an upper-case accentuated letter are so
                # rare that we consider them all suspicious. Same weight as a
                # foreign_long suspicion.
                if is_accentuated(self._buffer[-1]) and self._buffer[-1].isupper():
                    self._foreign_long_count += 1
                    self._is_current_word_bad = True
            if buffer_length >= 24 and self._foreign_long_watch:
                camel_case_dst = [
                    i for i, c in enumerate(self._buffer) if c.isupper()
                ]
                probable_camel_cased: bool = False

                if camel_case_dst and (len(camel_case_dst) / buffer_length <= 0.3):
                    probable_camel_cased = True

                if not probable_camel_cased:
                    self._foreign_long_count += 1
                    self._is_current_word_bad = True

            if self._is_current_word_bad:
                self._bad_word_count += 1
                self._bad_character_count += len(self._buffer)
                self._is_current_word_bad = False

            self._foreign_long_watch = False
            self._buffer = ""
            self._buffer_accent_count = 0
        elif (
            character not in {"<", ">", "-", "=", "~", "|", "_"}
            and character.isdigit() is False
            and is_symbol(character)
        ):
            self._is_current_word_bad = True
            self._buffer += character

    def reset(self) -> None:  # pragma: no cover
        self._buffer = ""
        self._is_current_word_bad = False
        self._foreign_long_watch = False
        self._bad_word_count = 0
        self._word_count = 0
        self._character_count = 0
        self._bad_character_count = 0
        self._foreign_long_count = 0

    @property
    def ratio(self) -> float:
        if self._word_count <= 10 and self._foreign_long_count == 0:
            return 0.0

        return self._bad_character_count / self._character_count


class CjkInvalidStopPlugin(MessDetectorPlugin):
    """
    GB (Chinese) based encodings often render the stop incorrectly when the
    content does not fit, and this is easily detected: search for an overuse
    of '丅' and '丄'.
    """

    def __init__(self) -> None:
        self._wrong_stop_count: int = 0
        self._cjk_character_count: int = 0

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        if character in {"丅", "丄"}:
            self._wrong_stop_count += 1
            return
        if is_cjk(character):
            self._cjk_character_count += 1

    def reset(self) -> None:  # pragma: no cover
        self._wrong_stop_count = 0
        self._cjk_character_count = 0

    @property
    def ratio(self) -> float:
        if self._cjk_character_count < 16:
            return 0.0
        return self._wrong_stop_count / self._cjk_character_count
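

# Added note for illustration: the wrong-stop characters only matter once a
# meaningful amount of CJK text has been seen. With fewer than 16 CJK
# characters the plugin abstains and reports 0.0; past that, every '丅' or
# '丄' occurrence raises the ratio directly.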


class ArchaicUpperLowerPlugin(MessDetectorPlugin):
    def __init__(self) -> None:
        self._buf: bool = False

        self._character_count_since_last_sep: int = 0

        self._successive_upper_lower_count: int = 0
        self._successive_upper_lower_count_final: int = 0

        self._character_count: int = 0

        self._last_alpha_seen: Optional[str] = None
        self._current_ascii_only: bool = True

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        is_concerned = character.isalpha() and is_case_variable(character)
        chunk_sep = is_concerned is False

        if chunk_sep and self._character_count_since_last_sep > 0:
            if (
                self._character_count_since_last_sep <= 64
                and character.isdigit() is False
                and self._current_ascii_only is False
            ):
                self._successive_upper_lower_count_final += (
                    self._successive_upper_lower_count
                )

            self._successive_upper_lower_count = 0
            self._character_count_since_last_sep = 0
            self._last_alpha_seen = None
            self._buf = False
            self._character_count += 1
            self._current_ascii_only = True

            return

        if self._current_ascii_only is True and character.isascii() is False:
            self._current_ascii_only = False

        if self._last_alpha_seen is not None:
            if (character.isupper() and self._last_alpha_seen.islower()) or (
                character.islower() and self._last_alpha_seen.isupper()
            ):
                if self._buf is True:
                    self._successive_upper_lower_count += 2
                    self._buf = False
                else:
                    self._buf = True
            else:
                self._buf = False

        self._character_count += 1
        self._character_count_since_last_sep += 1
        self._last_alpha_seen = character

    def reset(self) -> None:  # pragma: no cover
        self._character_count = 0
        self._character_count_since_last_sep = 0
        self._successive_upper_lower_count = 0
        self._successive_upper_lower_count_final = 0
        self._last_alpha_seen = None
        self._buf = False
        self._current_ascii_only = True

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        return self._successive_upper_lower_count_final / self._character_count
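

# Added note for illustration: this plugin targets words such as "aBcDéFgHiJ"
# where the case flips on nearly every letter. A flip is only counted once
# confirmed (the second consecutive flip adds 2), and a chunk is only
# committed to the final count when it is at most 64 characters long and
# contains at least one non-ASCII character.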


@lru_cache(maxsize=1024)
def is_suspiciously_successive_range(
    unicode_range_a: Optional[str], unicode_range_b: Optional[str]
) -> bool:
    """
    Determine if two Unicode ranges seen next to each other can be considered suspicious.
    """
    if unicode_range_a is None or unicode_range_b is None:
        return True

    if unicode_range_a == unicode_range_b:
        return False

    if "Latin" in unicode_range_a and "Latin" in unicode_range_b:
        return False

    if "Emoticons" in unicode_range_a or "Emoticons" in unicode_range_b:
        return False

    # Latin characters can be accompanied by a combining diacritical mark,
    # eg. in Vietnamese.
    if ("Latin" in unicode_range_a or "Latin" in unicode_range_b) and (
        "Combining" in unicode_range_a or "Combining" in unicode_range_b
    ):
        return False

    keywords_range_a, keywords_range_b = unicode_range_a.split(
        " "
    ), unicode_range_b.split(" ")

    for el in keywords_range_a:
        if el in UNICODE_SECONDARY_RANGE_KEYWORD:
            continue
        if el in keywords_range_b:
            return False

    # Japanese exception
    range_a_jp_chars, range_b_jp_chars = (
        unicode_range_a
        in (
            "Hiragana",
            "Katakana",
        ),
        unicode_range_b in ("Hiragana", "Katakana"),
    )

    if (range_a_jp_chars or range_b_jp_chars) and (
        "CJK" in unicode_range_a or "CJK" in unicode_range_b
    ):
        return False
    if range_a_jp_chars and range_b_jp_chars:
        return False

    if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b:
        if "CJK" in unicode_range_a or "CJK" in unicode_range_b:
            return False
        if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
            return False

    # Chinese/Japanese use dedicated ranges for punctuation and/or separators.
    if ("CJK" in unicode_range_a or "CJK" in unicode_range_b) or (
        unicode_range_a in ["Katakana", "Hiragana"]
        and unicode_range_b in ["Katakana", "Hiragana"]
    ):
        if "Punctuation" in unicode_range_a or "Punctuation" in unicode_range_b:
            return False
        if "Forms" in unicode_range_a or "Forms" in unicode_range_b:
            return False

    return True
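

# A few hand-checked examples (added for illustration), assuming the range
# names produced by utils.unicode_range:
#
#     is_suspiciously_successive_range("Basic Latin", "Cyrillic")  # True
#     is_suspiciously_successive_range("Hiragana", "Katakana")  # False, Japanese pair
#     is_suspiciously_successive_range("Hangul Syllables", "CJK Unified Ideographs")  # False
#     is_suspiciously_successive_range(None, "Basic Latin")  # True, unknown range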


@lru_cache(maxsize=2048)
def mess_ratio(
    decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
) -> float:
    """
    Compute a mess ratio given a decoded bytes sequence. The maximum threshold stops the computation early.
    """

    detectors: List[MessDetectorPlugin] = [
        md_class() for md_class in MessDetectorPlugin.__subclasses__()
    ]

    length: int = len(decoded_sequence) + 1

    mean_mess_ratio: float = 0.0

    if length < 512:
        intermediary_mean_mess_ratio_calc: int = 32
    elif length <= 1024:
        intermediary_mean_mess_ratio_calc = 64
    else:
        intermediary_mean_mess_ratio_calc = 128

    for index, character in enumerate(decoded_sequence + "\n"):
        for detector in detectors:
            if detector.eligible(character):
                detector.feed(character)

        if (
            index > 0 and index % intermediary_mean_mess_ratio_calc == 0
        ) or index == length - 1:
            mean_mess_ratio = sum(dt.ratio for dt in detectors)

            if mean_mess_ratio >= maximum_threshold:
                break

    if debug:
        logger = getLogger("charset_normalizer")

        logger.log(
            TRACE,
            "Mess-detector extended-analysis start. "
            f"intermediary_mean_mess_ratio_calc={intermediary_mean_mess_ratio_calc} mean_mess_ratio={mean_mess_ratio} "
            f"maximum_threshold={maximum_threshold}",
        )

        if len(decoded_sequence) > 16:
            logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}")
            logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}")

        for dt in detectors:  # pragma: nocover
            logger.log(TRACE, f"{dt.__class__}: {dt.ratio}")

    return round(mean_mess_ratio, 3)
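

# Illustrative usage (an addition; outputs are indicative, not exact):
#
#     mess_ratio("This is an ordinary English sentence.")  # ~ 0.0
#     mess_ratio("ÃŠÃ¹Ã²Ã¨Ã©Ã¹")  # noticeably > 0.0, mojibake-like accent run
#
# A value near 0.0 means the decoded text looks coherent. Once the running
# sum of plugin ratios reaches `maximum_threshold`, the loop above stops
# early and returns that already-disqualifying value.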