from __future__ import annotations

import re
import unicodedata
from datetime import datetime, date
from typing import Dict, Any, List, Optional

# ============================================================
# CONSTANTES E MAPAS
# ============================================================

# Lowercase Portuguese month names mapped to their 1-based month number.
# Each inner tuple lists every accepted spelling of one month, in calendar
# order; "marco" is the accent-less alternative spelling of "março".
PT_MONTHS = {
    spelling: month_number
    for month_number, spellings in enumerate(
        (
            ("janeiro",),
            ("fevereiro",),
            ("março", "marco"),
            ("abril",),
            ("maio",),
            ("junho",),
            ("julho",),
            ("agosto",),
            ("setembro",),
            ("outubro",),
            ("novembro",),
            ("dezembro",),
        ),
        start=1,
    )
    for spelling in spellings
}

# ============================================================
# FUNÇÕES DE PRÉ-PROCESSAMENTO
# ============================================================

def _normalize_text(text: str) -> str:
    """Return raw OCR text as one trimmed line in composed Unicode form.

    Args:
        text: Raw text; ``None`` and the empty string are accepted.

    Returns:
        The text composed to NFC, with every run of whitespace replaced by a
        single space and the ends trimmed. ``""`` when *text* is empty or
        ``None``.
    """
    if not text:
        return ""
    # Text extraction can deliver accents in decomposed form (e.g. "c" followed
    # by U+0327). The patterns in this module use precomposed letters such as
    # "ç" and "ã", so compose first or those patterns silently stop matching.
    text = unicodedata.normalize("NFC", text)
    # ``\s`` already covers \n, \r and \t, so a single pass is sufficient.
    return re.sub(r'\s+', ' ', text).strip()

def _clean_nuit(nuit_str: str) -> Optional[str]:
    """Strip a NUIT candidate down to its digits and validate the result.

    Args:
        nuit_str: Candidate string, possibly containing separators.

    Returns:
        The digit-only NUIT when it has exactly nine digits and starts with
        "400"; ``None`` for empty input or any other shape.
    """
    if not nuit_str:
        return None

    digits_only = re.sub(r'[^0-9]', '', nuit_str)
    # Only nine-digit numbers with the 400 prefix are accepted.
    has_expected_shape = len(digits_only) == 9 and digits_only.startswith('400')
    return digits_only if has_expected_shape else None

def _parse_date_pt(value: str) -> Optional[date]:
    """Parse a date written in a Portuguese layout.

    Two numeric layouts (``dd/mm/yyyy`` and ``dd-mm-yyyy``) are tried against
    the whole trimmed string; after that the long form
    "10 de junho de 2018" is searched for anywhere in the string.

    Args:
        value: Text holding the date.

    Returns:
        The parsed ``date``, or ``None`` when *value* is empty, matches no
        layout, names an unknown month, or is not a valid calendar date.
    """
    if not value:
        return None
    candidate = value.strip().lower()

    # Numeric layouts: the complete string must match one of them.
    for layout in ("%d/%m/%Y", "%d-%m-%Y"):
        try:
            parsed = datetime.strptime(candidate, layout)
        except ValueError:
            continue
        return parsed.date()

    # Long form: "<day> de <month name> de <year>".
    found = re.search(r'(\d{1,2})\s+de\s+([a-zçãéêôíóú]+)\s+de\s+(\d{4})', candidate, re.IGNORECASE)
    if found is None:
        return None

    day_text, month_text, year_text = found.groups()
    month_number = PT_MONTHS.get(month_text.lower())
    if not month_number:
        return None
    try:
        return date(int(year_text), month_number, int(day_text))
    except ValueError:
        # e.g. day 31 in a 30-day month
        return None

def _months_since(d: date) -> int:
    """Count calendar months between *d* and today.

    Only the year and month components are compared; the day of the month is
    ignored.

    Args:
        d: Start date; a falsy value (``None``) yields 0.

    Returns:
        The month difference, never negative (dates in the future give 0).
    """
    if not d:
        return 0

    now = date.today()
    year_gap = now.year - d.year
    elapsed = year_gap * 12 + (now.month - d.month)
    return elapsed if elapsed > 0 else 0

# ============================================================
# REGEX OTIMIZADAS
# ============================================================

# Official issuer: one alternative per accepted issuing body (tax authority,
# ministry of economy, general tax directorate, Republic of Mozambique).
# Accented and unaccented spellings are both accepted.
RE_ISSUER = re.compile(
    r"("
    r"autoridade\s+tribut[aá]ria"
    r"|minist[eé]rio\s+da\s+economia"
    r"|dire[cç][aã]o\s+geral\s+de\s+impostos"
    r"|rep[uú]blica\s+de\s+mo[cç]ambique"
    r")",
    flags=re.IGNORECASE,
)

# Subject: the long "comunicação de atribuição do NUIT" wording is listed
# first so it is preferred over the shorter "atribuição do NUIT".
RE_SUBJECT = re.compile(
    r"("
    r"comunica[cç][aã]o\s+de\s+atribui[cç][aã]o\s+do\s+nuit"
    r"|atribui[cç][aã]o\s+do\s+nuit"
    r")",
    flags=re.IGNORECASE,
)

# NUIT candidates, listed from the most to the least specific wording.
# 1) Full label "número único de identificação tributária" followed by an
#    optional ":" or "-" and nine digits.
RE_NUIT_SPECIFIC = re.compile(
    r'n[uú]mero\s+[uú]nico\s+de\s+identifica[cç][aã]o\s+tribut[aá]ria\s*[:\-]?\s*(\d{9})',
    re.IGNORECASE
)
# 2) Short label "NUIT" followed by an optional ":" or "-" and nine digits.
RE_NUIT_HYPHEN = re.compile(r'nuit\s*[:\-]?\s*(\d{9})', re.IGNORECASE)
# 3) Unlabelled number: exactly nine digits starting with 400, delimited by
#    word boundaries. The previous form (400[89]?\d{5}) accepted eight-digit
#    numbers and rejected nine-digit ones whose fourth digit was not 8 or 9.
RE_NUIT_SIMPLE = re.compile(r'\b(400\d{6})\b')

# Entity name: text after the "identificação da entidade" label, captured
# lazily up to the first newline, period, "Nome Comercial" label or end of text.
RE_ENTITY_NAME = re.compile(
    r'identifica[cç][aã]o\s+da\s+entidade'
    r'\s*[:\-]?\s*'
    r'(.+?)'
    r'(?:\n|\.|Nome\s+Comercial|\Z)',
    flags=re.IGNORECASE | re.DOTALL,
)

# Entity type: text after the "tipo de entidade" label, captured lazily up to
# the first newline, period, the word "Data" (any casing) or end of text.
RE_ENTITY_TYPE = re.compile(
    r'tipo\s+de\s+entidade'
    r'\s*[:\-]?\s*'
    r'(.+?)'
    r'(?:\n|\.|Data|\Z)',
    flags=re.IGNORECASE | re.DOTALL,
)

# Incorporation date: the label followed by either the long form
# "<d> de <month> de <yyyy>" or a numeric dd/mm/yyyy (or dd-mm-yyyy) date.
RE_INCORPORATION_DATE = re.compile(
    r'data\s+da\s+constitui[cç][aã]o\s+da\s+entidade'
    r'\s*[:\-]?\s*'
    r'('
    r'\d{1,2}\s+de\s+[a-zçãéêôíóú]+\s+de\s+\d{4}'
    r'|\d{2}[\/\-]\d{2}[\/\-]\d{4}'
    r')',
    flags=re.IGNORECASE,
)

# Trade name (optional field): text after the "nome comercial" label, captured
# lazily up to the first newline, period or end of text.
RE_TRADE_NAME = re.compile(
    r'nome\s+comercial'
    r'\s*[:\-]?\s*'
    r'(.+?)'
    r'(?:\n|\.|\Z)',
    flags=re.IGNORECASE | re.DOTALL,
)

# ============================================================
# FUNÇÃO PRINCIPAL – analyze_nuit
# ============================================================

# Legal-form abbreviations used when no explicit "tipo de entidade" label is
# found. Lookarounds are used instead of ``\b`` because "S.A." ends in a
# non-word character: ``\b`` after the final period only holds when a word
# character follows, so "S.A." followed by a space or end of text never matched.
_RE_ENTITY_SIGLA = re.compile(
    r'(?<!\w)(LDA|LIMITADA|SA|S\.A\.|SU, LDA)(?!\w)',
    re.IGNORECASE,
)


def _normalize_keeping_lines(raw: str) -> str:
    """Normalize each line of *raw* separately and keep one newline between lines.

    Blank lines are dropped. Returns ``""`` for empty or ``None`` input.
    """
    if not raw:
        return ""
    cleaned = (_normalize_text(line) for line in raw.splitlines())
    return "\n".join(line for line in cleaned if line)


def _capture_field(
    pattern: "re.Pattern[str]",
    text: str,
    *,
    strip_trailing_punct: bool,
) -> Optional[str]:
    """Return the cleaned first capture group of *pattern* in *text*.

    Inner whitespace is collapsed; optionally one trailing ``. , ; :`` is
    removed. ``None`` is returned when there is no match or the capture is
    empty after cleaning.
    """
    match = pattern.search(text)
    if not match:
        return None
    value = re.sub(r'\s+', ' ', match.group(1).strip())
    if strip_trailing_punct:
        value = re.sub(r'[.,;:]\s*$', '', value)
    return value or None


def analyze_nuit(text: str) -> Dict[str, Any]:
    """Extract the fields of a NUIT assignment document from OCR text.

    Args:
        text: Text extracted from the document via OCR (``None`` or empty is
            accepted and yields a result with every field missing).

    Returns:
        A dictionary with the keys ``nuit``, ``company_name``, ``entity_type``,
        ``incorporation_date`` (ISO string), ``incorporation_date_str`` (text
        as found), ``company_age_months``, ``trade_name``, ``issuer_valid``,
        ``subject_valid``, ``extraction_quality`` (one boolean per main
        field), ``alerts`` (human-readable messages for everything that could
        not be extracted or validated) and ``_debug_text_snippet`` (first 300
        characters of the normalized text).
    """
    # Two views of the same input: a single-line one for patterns that only
    # rely on ``\s`` between tokens, and a line-preserving one for the
    # label/value patterns, whose captures are terminated by ``\n``.
    flat_text = _normalize_text(text)
    line_text = _normalize_keeping_lines(text)

    alerts: List[str] = []

    # 1. NUIT: the first pattern that matches wins, most specific first.
    nuit = None
    nuit_raw = None
    for pattern in (RE_NUIT_SPECIFIC, RE_NUIT_HYPHEN, RE_NUIT_SIMPLE):
        match = pattern.search(flat_text)
        if match:
            nuit_raw = match.group(1)
            break
    if nuit_raw:
        nuit = _clean_nuit(nuit_raw)
        if not nuit:
            alerts.append(f"NUIT encontrado mas formato inválido: {nuit_raw}")
    else:
        alerts.append("NUIT não identificado no documento.")

    # 2. Entity name
    company_name = _capture_field(
        RE_ENTITY_NAME, line_text, strip_trailing_punct=True
    )
    if company_name is None:
        alerts.append("Nome da entidade não identificado.")

    # 3. Entity type, falling back to a legal-form abbreviation anywhere in
    #    the text when the labelled field is absent.
    entity_type = _capture_field(
        RE_ENTITY_TYPE, line_text, strip_trailing_punct=True
    )
    if entity_type is None:
        sigla_match = _RE_ENTITY_SIGLA.search(flat_text)
        if sigla_match:
            entity_type = sigla_match.group(1).upper()
        else:
            alerts.append("Tipo de entidade não identificado.")

    # 4. Incorporation date and the company age derived from it
    incorporation_date = None
    incorporation_date_str = None
    company_age_months = None
    match = RE_INCORPORATION_DATE.search(flat_text)
    if match:
        incorporation_date_str = match.group(1).strip()
        incorporation_date = _parse_date_pt(incorporation_date_str)
        if incorporation_date:
            company_age_months = _months_since(incorporation_date)
        else:
            alerts.append(f"Data de constituição encontrada mas não interpretada: {incorporation_date_str}")
    else:
        alerts.append("Data de constituição não identificada.")

    # 5. Trade name (optional: no alert when missing)
    trade_name = _capture_field(
        RE_TRADE_NAME, line_text, strip_trailing_punct=False
    )

    # 6. Issuer and subject checks (the patterns are already case-insensitive)
    issuer_valid = bool(RE_ISSUER.search(flat_text))
    subject_valid = bool(RE_SUBJECT.search(flat_text))
    if not issuer_valid:
        alerts.append("Emissor do documento não reconhecido como entidade fiscal oficial.")
    if not subject_valid:
        alerts.append("Assunto do documento não confirma atribuição de NUIT.")

    # 7. Result dictionary
    features = {
        "nuit": nuit,
        "company_name": company_name,
        "entity_type": entity_type,
        "incorporation_date": incorporation_date.isoformat() if incorporation_date else None,
        "incorporation_date_str": incorporation_date_str,
        "company_age_months": company_age_months,
        "trade_name": trade_name,
        "issuer_valid": issuer_valid,
        "subject_valid": subject_valid,
        "extraction_quality": {
            "nuit_found": nuit is not None,
            "company_name_found": company_name is not None,
            "entity_type_found": entity_type is not None,
            "incorporation_date_found": incorporation_date is not None,
        },
        "alerts": alerts,
        # Optional: excerpt of the normalized text for debugging
        "_debug_text_snippet": flat_text[:300] + "..." if len(flat_text) > 300 else flat_text,
    }
    return features