import importlib
from codecs import IncrementalDecoder
from collections import Counter
from functools import lru_cache
from typing import Counter as TypeCounter, Dict, List, Optional, Tuple

from .constant import (
    FREQUENCIES,
    KO_NAMES,
    LANGUAGE_SUPPORTED_COUNT,
    TOO_SMALL_SEQUENCE,
    ZH_NAMES,
)
from .md import is_suspiciously_successive_range
from .models import CoherenceMatches
from .utils import (
    is_accentuated,
    is_latin,
    is_multi_byte_encoding,
    is_unicode_range_secondary,
    unicode_range,
)


def encoding_unicode_range(iana_name: str) -> List[str]:
    """
    Return the Unicode ranges encountered in a single-byte code page.
    """
    if is_multi_byte_encoding(iana_name):
        raise IOError("Function not supported on multi-byte code page")

    decoder = importlib.import_module(
        "encodings.{}".format(iana_name)
    ).IncrementalDecoder

    p: IncrementalDecoder = decoder(errors="ignore")
    seen_ranges: Dict[str, int] = {}
    character_count: int = 0

    # Decode the code page one byte at a time (skipping the control area and
    # most of lower ASCII) and tally the Unicode range of each character.
    for i in range(0x40, 0xFF):
        chunk: str = p.decode(bytes([i]))

        if chunk:
            character_range: Optional[str] = unicode_range(chunk)

            if character_range is None:
                continue

            if is_unicode_range_secondary(character_range) is False:
                if character_range not in seen_ranges:
                    seen_ranges[character_range] = 0
                seen_ranges[character_range] += 1
                character_count += 1

    # Keep only the ranges that cover at least 15% of the decoded characters.
    return sorted(
        [
            character_range
            for character_range in seen_ranges
            if seen_ranges[character_range] / character_count >= 0.15
        ]
    )
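
# Usage sketch (illustrative, not part of the original module): cp1251 maps
# its upper half to Cyrillic letters, so the probe is expected to report
# Basic Latin plus Cyrillic. Exact output depends on the installed codecs.
#   >>> encoding_unicode_range("cp1251")
#   ['Basic Latin', 'Cyrillic']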


def unicode_range_languages(primary_range: str) -> List[str]:
    """
    Return the languages inferred to use a given Unicode range.
    """
    languages: List[str] = []

    for language, characters in FREQUENCIES.items():
        for character in characters:
            if unicode_range(character) == primary_range:
                languages.append(language)
                break

    return languages
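
# Usage sketch (illustrative): the returned list depends entirely on the
# FREQUENCIES tables shipped with the package.
#   >>> unicode_range_languages("Cyrillic")
#   ['Russian', 'Ukrainian', ...]  # every language whose table holds a Cyrillic character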


@lru_cache()
def encoding_languages(iana_name: str) -> List[str]:
    """
    Single-byte encoding language association. Some code pages are heavily tied to particular language(s).
    This function does the correspondence.
    """
    unicode_ranges: List[str] = encoding_unicode_range(iana_name)
    primary_range: Optional[str] = None

    # Nearly every code page also covers Basic Latin, so the first non-Latin
    # range is treated as the primary one.
    for specified_range in unicode_ranges:
        if "Latin" not in specified_range:
            primary_range = specified_range
            break

    if primary_range is None:
        return ["Latin Based"]

    return unicode_range_languages(primary_range)
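
# Usage sketch (illustrative): cp1251 exposes Cyrillic as its primary
# non-Latin range and resolves through unicode_range_languages("Cyrillic"),
# while a purely Latin code page such as cp1252 falls back to the generic
# ["Latin Based"] marker.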


@lru_cache()
def mb_encoding_languages(iana_name: str) -> List[str]:
    """
    Multi-byte encoding language association. Some code pages are heavily tied to particular language(s).
    This function does the correspondence.
    """
    if (
        iana_name.startswith("shift_")
        or iana_name.startswith("iso2022_jp")
        or iana_name.startswith("euc_j")
        or iana_name == "cp932"
    ):
        return ["Japanese"]
    if iana_name.startswith("gb") or iana_name in ZH_NAMES:
        return ["Chinese"]
    if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
        return ["Korean"]

    return []
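
# These associations follow directly from the prefix checks above, e.g.:
#   >>> mb_encoding_languages("cp932")
#   ['Japanese']
#   >>> mb_encoding_languages("gb18030")
#   ['Chinese']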


@lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
def get_target_features(language: str) -> Tuple[bool, bool]:
    """
    Determine the main aspects of a supported language: whether it contains accents and whether it is pure Latin.
    """
    target_have_accents: bool = False
    target_pure_latin: bool = True

    for character in FREQUENCIES[language]:
        if not target_have_accents and is_accentuated(character):
            target_have_accents = True
        if target_pure_latin and is_latin(character) is False:
            target_pure_latin = False

    return target_have_accents, target_pure_latin
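
# For example (illustrative; the exact outcome depends on the FREQUENCIES
# tables), "English" carries no accented characters and is pure Latin:
#   >>> get_target_features("English")
#   (False, True)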


def alphabet_languages(
    characters: List[str], ignore_non_latin: bool = False
) -> List[str]:
    """
    Return the languages associated with the given characters.
    """
    languages: List[Tuple[str, float]] = []

    source_have_accents = any(is_accentuated(character) for character in characters)

    for language, language_characters in FREQUENCIES.items():
        target_have_accents, target_pure_latin = get_target_features(language)

        if ignore_non_latin and target_pure_latin is False:
            continue

        # A language with no accented characters cannot have produced an
        # accented source sequence.
        if target_have_accents is False and source_have_accents:
            continue

        character_count: int = len(language_characters)

        character_match_count: int = len(
            [c for c in language_characters if c in characters]
        )

        ratio: float = character_match_count / character_count

        if ratio >= 0.2:
            languages.append((language, ratio))

    languages = sorted(languages, key=lambda x: x[1], reverse=True)

    return [compatible_language[0] for compatible_language in languages]
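
# Usage sketch (illustrative): common Latin letters surface the Latin-script
# candidates, ordered by how much of each language's table they cover.
#   >>> alphabet_languages(["e", "a", "t", "i", "o", "n", "s", "r"])
#   ['English', 'French', ...]  # exact list and order depend on FREQUENCIES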


def characters_popularity_compare(
    language: str, ordered_characters: List[str]
) -> float:
    """
    Determine if an ordered character list (from most frequent to rarest) matches a particular language.
    The result is a ratio between 0. (absolutely no correspondence) and 1. (near perfect fit).
    Beware that this function is not strict on the match in order to ease the detection. (Meaning a close match can score 1.)
    """
    if language not in FREQUENCIES:
        raise ValueError("{} not available".format(language))

    character_approved_count: int = 0
    FREQUENCIES_language_set = set(FREQUENCIES[language])

    ordered_characters_count: int = len(ordered_characters)
    target_language_characters_count: int = len(FREQUENCIES[language])

    large_alphabet: bool = target_language_characters_count > 26

    for character, character_rank in zip(
        ordered_characters, range(0, ordered_characters_count)
    ):
        if character not in FREQUENCIES_language_set:
            continue

        character_rank_in_language: int = FREQUENCIES[language].index(character)
        expected_projection_ratio: float = (
            target_language_characters_count / ordered_characters_count
        )
        character_rank_projection: int = int(character_rank * expected_projection_ratio)

        # Small (Latin-like) alphabet: reject a character whose projected rank
        # strays more than four places from its rank in the target language.
        if (
            large_alphabet is False
            and abs(character_rank_projection - character_rank_in_language) > 4
        ):
            continue

        # Large alphabet: approve directly when the projected rank lands
        # within a third of the alphabet size.
        if (
            large_alphabet is True
            and abs(character_rank_projection - character_rank_in_language)
            < target_language_characters_count / 3
        ):
            character_approved_count += 1
            continue

        characters_before_source: List[str] = FREQUENCIES[language][
            0:character_rank_in_language
        ]
        characters_after_source: List[str] = FREQUENCIES[language][
            character_rank_in_language:
        ]
        characters_before: List[str] = ordered_characters[0:character_rank]
        characters_after: List[str] = ordered_characters[character_rank:]

        before_match_count: int = len(
            set(characters_before) & set(characters_before_source)
        )

        after_match_count: int = len(
            set(characters_after) & set(characters_after_source)
        )

        if len(characters_before_source) == 0 and before_match_count <= 4:
            character_approved_count += 1
            continue

        if len(characters_after_source) == 0 and after_match_count <= 4:
            character_approved_count += 1
            continue

        # Otherwise approve when at least 40% of the neighbors on either side
        # also sit on the same side of this character in the target language.
        if (
            before_match_count / len(characters_before_source) >= 0.4
            or after_match_count / len(characters_after_source) >= 0.4
        ):
            character_approved_count += 1
            continue

    return character_approved_count / len(ordered_characters)
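
# Sanity check (verifiable from the logic above): comparing a language's own
# frequency ordering against itself approves every character.
#   >>> characters_popularity_compare("English", FREQUENCIES["English"])
#   1.0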


def alpha_unicode_split(decoded_sequence: str) -> List[str]:
    """
    Given a decoded text sequence, return a list of str. Unicode range / alphabet separation.
    Ex. a text containing English/Latin with a bit of Hebrew will return two items in the resulting list;
    one containing the Latin letters and the other the Hebrew ones.
    """
    layers: Dict[str, str] = {}

    for character in decoded_sequence:
        if character.isalpha() is False:
            continue

        character_range: Optional[str] = unicode_range(character)

        if character_range is None:
            continue

        layer_target_range: Optional[str] = None

        # Reuse an existing layer when its range plausibly belongs to the same
        # writing system as the current character's range.
        for discovered_range in layers:
            if (
                is_suspiciously_successive_range(discovered_range, character_range)
                is False
            ):
                layer_target_range = discovered_range
                break

        if layer_target_range is None:
            layer_target_range = character_range

        if layer_target_range not in layers:
            layers[layer_target_range] = character.lower()
            continue

        layers[layer_target_range] += character.lower()

    return list(layers.values())
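
# Usage sketch (illustrative): mixed-script input splits per writing system,
# lowercased, with non-alphabetic characters dropped.
#   >>> alpha_unicode_split("Hello שלום")
#   ['hello', 'שלום']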


def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches:
    """
    This function merges results previously given by the function coherence_ratio.
    The return type is the same as coherence_ratio.
    """
    per_language_ratios: Dict[str, List[float]] = {}

    for result in results:
        for sub_result in result:
            language, ratio = sub_result
            if language not in per_language_ratios:
                per_language_ratios[language] = [ratio]
                continue
            per_language_ratios[language].append(ratio)

    merge = [
        (
            language,
            round(
                sum(per_language_ratios[language]) / len(per_language_ratios[language]),
                4,
            ),
        )
        for language in per_language_ratios
    ]

    return sorted(merge, key=lambda x: x[1], reverse=True)
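
# Usage sketch (verifiable from the logic above): ratios reported for the same
# language by different chunks are averaged, then re-sorted best-first.
#   >>> merge_coherence_ratios([[("English", 0.9)], [("English", 0.7), ("French", 0.6)]])
#   [('English', 0.8), ('French', 0.6)]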


def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches:
    """
    We shall NOT return "English—" in CoherenceMatches because it is an alternative
    of "English". This function only keeps the best match and removes the em-dash from it.
    """
    index_results: Dict[str, List[float]] = dict()

    for result in results:
        language, ratio = result
        no_em_name: str = language.replace("—", "")

        if no_em_name not in index_results:
            index_results[no_em_name] = []

        index_results[no_em_name].append(ratio)

    if any(len(index_results[e]) > 1 for e in index_results):
        filtered_results: CoherenceMatches = []

        for language in index_results:
            filtered_results.append((language, max(index_results[language])))

        return filtered_results

    return results
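
# Usage sketch (verifiable from the logic above): an alternative frequency
# table such as "English—" collapses into its base language, keeping the best
# ratio of the two.
#   >>> filter_alt_coherence_matches([("English", 0.6), ("English—", 0.8)])
#   [('English', 0.8)]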


@lru_cache(maxsize=2048)
def coherence_ratio(
    decoded_sequence: str, threshold: float = 0.1, lg_inclusion: Optional[str] = None
) -> CoherenceMatches:
    """
    Detect ANY language that can be identified in the given sequence. The sequence will be analysed by layers.
    A layer = character extraction by alphabets/ranges.
    """
    results: List[Tuple[str, float]] = []
    ignore_non_latin: bool = False

    sufficient_match_count: int = 0

    lg_inclusion_list = lg_inclusion.split(",") if lg_inclusion is not None else []
    if "Latin Based" in lg_inclusion_list:
        ignore_non_latin = True
        lg_inclusion_list.remove("Latin Based")

    for layer in alpha_unicode_split(decoded_sequence):
        sequence_frequencies: TypeCounter[str] = Counter(layer)
        most_common = sequence_frequencies.most_common()

        character_count: int = sum(o for c, o in most_common)

        # Layers with too few alphabetic characters are not significant.
        if character_count <= TOO_SMALL_SEQUENCE:
            continue

        popular_character_ordered: List[str] = [c for c, o in most_common]

        for language in lg_inclusion_list or alphabet_languages(
            popular_character_ordered, ignore_non_latin
        ):
            ratio: float = characters_popularity_compare(
                language, popular_character_ordered
            )

            if ratio < threshold:
                continue
            elif ratio >= 0.8:
                sufficient_match_count += 1

            results.append((language, round(ratio, 4)))

            # Three high-confidence matches are enough to stop early.
            if sufficient_match_count >= 3:
                break

    return sorted(
        filter_alt_coherence_matches(results), key=lambda x: x[1], reverse=True
    )
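
# Usage sketch (illustrative; exact ratios depend on the FREQUENCIES tables):
# a decoded sentence longer than TOO_SMALL_SEQUENCE alphabetic characters is
# scored per script layer, best matches first. Shorter inputs yield [].
#   >>> coherence_ratio("Je suis convaincu que ce texte est suffisamment long pour l'analyse.")
#   [('French', ...)]  # ratio values omitted here on purpose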