#!/usr/bin/env python3
#
# Copyright 2022 Google LLC All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS-IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from collections import defaultdict, Counter
import re
import unicodedata

from gflanguages import LoadLanguages, languages_public_pb2, LoadScripts, LoadRegions
import pytest
import youseedee


LANGUAGES = LoadLanguages()
SCRIPTS = LoadScripts()
REGIONS = LoadRegions()

CLDR_SCRIPT_TO_UCD_SCRIPT = {
    "Bangla": "Bengali",
    "Traditional Han": "Han",
    "Simplified Han": "Han",
    "Korean": "Hangul",
    "Odia": "Oriya",
    "Ol Chiki": "Ol_Chiki",
}

SKIP_EXEMPLARS = {
    "ja_Jpan": "Contains multiple scripts",
    "aii_Cyrl": "Does indeed use Latin glyphs while writing Cyrillic",
    "sel_Cyrl": "Does indeed use Latin glyphs while writing Cyrillic",
    "coo_Latn": "Does indeed use Greek glyphs while writing Latin",
    "hur_Latn": "Does indeed use Greek glyphs while writing Latin",
    "kwk_Latn": "Does indeed use Greek glyphs while writing Latin",
    "thp_Latn": "Does indeed use Greek glyphs while writing Latin",
    "dnj_Latn": "Does use future Unicode 16 Latin glyphs",
    "gov_Latn": "Does use future Unicode 16 Latin glyphs",
}

SKIP_REGION = {
    "cpf_Latn": "French-based creole languages is a group of languages.",
    "gem_Latn": "Germanic languages is a group of languages.",
    "sla_Latn": "Slavic languages is a group of languages.",
    "hmn_Latn": "Homnic languages is a group of languages.",
    "ie_Latn": "Interlingue is an artifical language.",
    "io_Latn": "Ido is an artifical language.",
    "jbo_Latn": "Lobjan is an artifical language.",
    "tlh_Latn": "Klingon is an artifical language.",
}


@pytest.mark.parametrize("lang_code", LANGUAGES)
@pytest.mark.parametrize(
    "exemplar_name", ["base", "auxiliary", "marks", "numerals", "punctuation", "index"]
)
def test_languages_exemplars_canonical_duplicates(lang_code, exemplar_name):
    lang = LANGUAGES[lang_code]
    exemplar = getattr(lang.exemplar_chars, exemplar_name).split()
    normalized = defaultdict(set)

    for g in exemplar:
        if g[0] == "{" and g[-1] == "}":
            g = g.lstrip("{").rstrip("}")
        normalized[unicodedata.normalize("NFC", g)].add(g)

    result = [(len(gs), n) for n, gs in normalized.items()]
    expected = [(1, n) for n, _ in normalized.items()]
    assert result == expected


@pytest.mark.parametrize("lang_code", LANGUAGES)
@pytest.mark.parametrize(
    "exemplar_name", ["base", "auxiliary", "marks", "numerals", "punctuation", "index"]
)
def test_languages_exemplars_duplicates(lang_code, exemplar_name):
    lang = LANGUAGES[lang_code]
    exemplar = getattr(lang.exemplar_chars, exemplar_name).split()
    counter = Counter(exemplar)
    counts = sorted(counter.most_common(), key=lambda pair: exemplar.index(pair[0]))
    assert counts == [(v, 1) for v in exemplar]


@pytest.mark.parametrize("lang_code", LANGUAGES.keys())
@pytest.mark.parametrize(
    "exemplar_name", ["base", "auxiliary", "numerals", "punctuation", "index"]
)
def test_exemplars_bracketed_sequences(lang_code, exemplar_name):
    lang = LANGUAGES[lang_code]
    if lang.script != "Latn":
        return
    exemplar = getattr(lang.exemplar_chars, exemplar_name).split()
    for chars in exemplar:
        if len(chars) > 1:
            assert chars.startswith("{") and chars.endswith("}")
            assert len(chars[1:-1]) > 1


SampleText = languages_public_pb2.SampleTextProto().DESCRIPTOR
ExemplarChars = languages_public_pb2.ExemplarCharsProto().DESCRIPTOR


@pytest.mark.parametrize("lang_code", LANGUAGES.keys())
def test_language_samples(lang_code):
    # Although marked as optional in the protobuf file, all
    # SampleText fields (except note) are required, so make
    # sure they are present.
    lang = LANGUAGES[lang_code]
    if not lang.sample_text.ListFields():
        pytest.skip("No sample text for language " + lang_code)
        return

    for field in SampleText.fields:
        if field.name == "note":
            continue
        assert getattr(lang.sample_text, field.name)


@pytest.mark.parametrize("lang_code", LANGUAGES.keys())
def test_script_is_known(lang_code):
    lang = LANGUAGES[lang_code]
    script = lang.script
    assert script in SCRIPTS, f"{lang_code} used unknown script {lang.script}"


@pytest.mark.parametrize("lang_code", LANGUAGES)
def test_region_is_known(lang_code):
    lang = LANGUAGES[lang_code]
    if lang.id in SKIP_REGION:
        pytest.skip(SKIP_REGION[lang.id])
        return
    regions = lang.region
    for region in regions:
        assert region in REGIONS.keys()


@pytest.mark.parametrize("lang_code", LANGUAGES.keys())
def test_exemplars_are_in_script(lang_code):
    lang = LANGUAGES[lang_code]
    script_name = SCRIPTS[lang.script].name
    script_name = CLDR_SCRIPT_TO_UCD_SCRIPT.get(script_name, script_name)
    if not lang.exemplar_chars.ListFields():
        pytest.skip("No exemplars for language " + lang_code)
        return
    if lang.id in SKIP_EXEMPLARS:
        pytest.skip(SKIP_EXEMPLARS[lang.id])
        return
    out_of_script = {}
    for field in ExemplarChars.fields:
        if field.name == "auxiliary" or field.name == "index":
            continue
        exemplars = getattr(lang.exemplar_chars, field.name)
        group_of_chars = re.findall(r"(\{[^}]+\}|\S+)", exemplars)
        for chars in group_of_chars:
            for char in chars:
                char_script = youseedee.ucd_data(ord(char)).get("Script")
                if char_script == "Common" or char_script == "Inherited":
                    continue
                if char_script is not None and char_script != script_name:
                    out_of_script[chars] = char_script
                    break
    assert not out_of_script, (
        f"{lang_code} exemplars contained out-of-script characters"
        f": {', '.join(out_of_script.keys())}"
        f" from scripts {', '.join(set(out_of_script.values()))}"
    )