"""OpenAlex enrichment + BibTeX export for the 35 references.

Reads:  scripts/refs_input.json
Writes: scripts/refs_resolved.json, paper/bib/refs.bib, scripts/refs_report.md

- Auto-accepts if confidence >= 0.85 AND abs(year_diff) <= 1.
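- URL-only refs become @misc entries; refs that cannot be resolved confidently
  also get a placeholder @misc entry so every input ref appears in the .bib.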
- On-disk cache at cache/openalex/ (idempotent re-run).
- BibTeX keys: author-year format (firstauthor_lc + year + firstword_title_lc).
"""
import hashlib
import json
import re
import sys
import time
from pathlib import Path

try:
    import requests
    from rapidfuzz import fuzz
    from tenacity import retry, stop_after_attempt, wait_exponential
except ImportError as e:
    print(f"Missing dependency: {e}. Run: pip install -r requirements.txt")
    sys.exit(1)

# Directory one level above the folder that contains this script; every
# input/output path below is resolved relative to it.
ROOT = Path(__file__).parent.parent
# Parsed references to enrich (read by main()).
INPUT = ROOT / "scripts" / "refs_input.json"
# Per-reference resolution results, including the matched OpenAlex record.
RESOLVED = ROOT / "scripts" / "refs_resolved.json"
# Generated BibTeX database.
BIB = ROOT / "paper" / "bib" / "refs.bib"
# Markdown summary listing the references that were not auto-accepted.
REPORT = ROOT / "scripts" / "refs_report.md"
# One JSON file per (title, year) query is stored here by openalex_search().
CACHE_DIR = ROOT / "cache" / "openalex"

# Endpoint queried by openalex_search().
OPENALEX_BASE = "https://api.openalex.org/works"
# Sent as the `mailto` query parameter with every OpenAlex request.
MAILTO = "joerg.osterrieder@utwente.nl"
# Minimum title-similarity score (0..1) for a match to be auto-accepted.
CONFIDENCE_THRESHOLD = 0.85
# Maximum absolute difference between the guessed and the matched publication
# year for a match to be auto-accepted.
YEAR_TOLERANCE = 1


def cache_path(query: str) -> Path:
    """Return the cache file that belongs to *query*.

    The file name is the SHA-1 hex digest of the UTF-8 encoded query plus a
    ``.json`` suffix, located directly inside ``CACHE_DIR``.
    """
    digest = hashlib.new("sha1", query.encode("utf-8")).hexdigest()
    return CACHE_DIR.joinpath(digest + ".json")


@retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=1, max=30))
def openalex_search(title: str, year: int | None) -> dict | None:
    """Search OpenAlex works by title (and optionally year), with disk caching.

    Args:
        title: Title to look up; only the first 200 characters are sent.
        year: Guessed publication year. When truthy, results are restricted to
            ``year - 1``, ``year`` and ``year + 1``.

    Returns:
        The decoded JSON response, either from the cache file for this
        (title, year) pair or freshly fetched and then cached.

    Raises:
        Any exception from the HTTP request (including non-2xx responses via
        ``raise_for_status``) triggers a retry; once the five attempts are
        exhausted the error surfaces through the ``retry`` decorator.
    """
    # The cache key intentionally uses the raw title so that cache files
    # written by earlier runs remain valid.
    cp = cache_path(f"{title}|{year}")
    if cp.exists():
        try:
            return json.loads(cp.read_text(encoding="utf-8"))
        except json.JSONDecodeError:
            # A truncated/corrupt cache file (e.g. interrupted write) would
            # otherwise fail on every retry; treat it as a miss and refetch.
            pass

    # Filter clauses are joined with "," and alternatives with "|" (see the
    # year clause below), so those characters inside the title would be
    # parsed as filter syntax rather than as search text.
    safe_title = title[:200].replace(",", " ").replace("|", " ")

    # Use title.search filter for more targeted matching (vs general search)
    filters = [f"title.search:{safe_title}"]
    if year:
        filters.append(f"publication_year:{year - 1}|{year}|{year + 1}")
    params = {"filter": ",".join(filters), "per-page": 5, "mailto": MAILTO}
    r = requests.get(OPENALEX_BASE, params=params, timeout=30)
    r.raise_for_status()
    data = r.json()
    cp.parent.mkdir(parents=True, exist_ok=True)
    cp.write_text(json.dumps(data), encoding="utf-8")
    time.sleep(0.1)  # 10 req/s max
    return data


def normalize_name(name: str) -> str:
    """Lower-case *name* and keep only the ASCII letters ``a``-``z``.

    Everything else (spaces, punctuation, digits, accented letters) is dropped.
    """
    return "".join(ch for ch in name.lower() if "a" <= ch <= "z")


def bib_key(first_author: str, year: int, title: str) -> str:
    """Build a BibTeX key of the form ``<author><year><first title word>``.

    Args:
        first_author: Name of the first author; normalised to ASCII lower-case
            letters and cut to 20 characters (``"anon"`` if nothing remains).
        year: Publication year, inserted verbatim.
        title: Work title; its first whitespace-separated word is normalised
            the same way and cut to 15 characters. ``"untitled"`` is used when
            the title is empty, consists only of whitespace, or its first word
            contains no ASCII letters.

    Returns:
        The concatenated key (not yet disambiguated against other keys).
    """
    author_part = normalize_name(first_author)[:20] or "anon"
    # split() on a whitespace-only title yields no words at all, so guard
    # before indexing instead of relying on the title merely being truthy.
    words = title.split() if title else []
    word_part = normalize_name(words[0])[:15] if words else ""
    return f"{author_part}{year}{word_part or 'untitled'}"


def entry_type(work: dict) -> str:
    """Translate the ``type`` value of *work* into a BibTeX entry type.

    Unknown, missing or null types fall back to ``"misc"``.
    """
    kind = work.get("type", "")
    # (accepted type spellings, resulting BibTeX entry type)
    type_table = (
        (("journal-article", "article"), "article"),
        (("book-chapter", "chapter"), "incollection"),
        (("proceedings-article", "conference-paper"), "inproceedings"),
        (("book", "monograph"), "book"),
        (("dissertation", "thesis"), "phdthesis"),
    )
    return next(
        (bib_type for spellings, bib_type in type_table if kind in spellings),
        "misc",
    )


def format_authors(authorships: list) -> str:
    """Format authorship records as a BibTeX ``author`` field value.

    Args:
        authorships: Sequence of dicts, each expected to carry the name under
            ``["author"]["display_name"]``. ``None`` is treated as empty.

    Returns:
        Up to six display names joined with ``" and "``. When more than six
        authorships are present, ``"others"`` is appended as a final name.
        Records without a usable name are skipped; an empty string is
        returned when nothing is left.
    """
    records = authorships or []
    names = []
    for record in records[:6]:
        # "author" or "display_name" may be present but null, so `or` is used
        # rather than a .get() default (which only covers a missing key).
        author = (record or {}).get("author") or {}
        display_name = author.get("display_name") or ""
        if display_name:
            names.append(display_name)
    if len(records) > 6:
        # Joining "others" like a regular name avoids a leading " and " when
        # none of the first six records had a name.
        names.append("others")
    return " and ".join(names)


# Matches &, % or # that are not already preceded by a backslash.
_UNESCAPED_LATEX_SPECIALS = re.compile(r"(?<!\\)([&%#])")


def _escape_bibtex_text(text: str) -> str:
    """Backslash-escape ``&``, ``%`` and ``#`` so LaTeX can typeset *text*."""
    return _UNESCAPED_LATEX_SPECIALS.sub(r"\\\1", text)


def bibtex_from_work(key: str, work: dict, ref: dict) -> str:
    """Render one BibTeX entry from an OpenAlex work record.

    Args:
        key: Citation key to use for the entry.
        work: OpenAlex work dict (reads ``type``, ``title``,
            ``publication_year``, ``authorships``, ``primary_location``,
            ``host_venue``, ``doi`` and ``biblio``).
        ref: Parsed input reference; ``title_guess`` / ``year_guess`` serve as
            fallbacks when the work lacks a title / year.

    Returns:
        The entry text, terminated by ``"}\\n"``. A ``note`` field stating that
        no DOI is available is written when the work has no DOI.
    """
    etype = entry_type(work)
    raw_title = work.get("title") or ref.get("title_guess") or "Untitled"
    # Braces are escaped so they cannot unbalance the field delimiter; the
    # remaining specials would otherwise abort the LaTeX run
    # (e.g. "Journal of Banking & Finance").
    title = _escape_bibtex_text(raw_title.replace("{", "\\{").replace("}", "\\}"))
    year = work.get("publication_year") or ref.get("year_guess")
    authors = format_authors(work.get("authorships") or [])

    # Venue: prefer primary_location.source, fall back to host_venue.
    venue_name = ""
    source_dict = (work.get("primary_location") or {}).get("source") or {}
    if isinstance(source_dict, dict):
        venue_name = source_dict.get("display_name") or ""
    if not venue_name:
        host = work.get("host_venue") or {}
        if isinstance(host, dict):
            venue_name = host.get("display_name") or ""
    venue_name = _escape_bibtex_text(venue_name)

    # Store the bare DOI rather than the resolver URL.
    doi = work.get("doi") or ""
    if doi.startswith("https://doi.org/"):
        doi = doi.replace("https://doi.org/", "")

    # "biblio" may be present but null; `or {}` covers that as well as a
    # missing key.
    biblio = work.get("biblio") or {}
    volume = biblio.get("volume") or ""
    issue = biblio.get("issue") or ""
    pages_first = biblio.get("first_page") or ""
    pages_last = biblio.get("last_page") or ""
    pages = f"{pages_first}--{pages_last}" if pages_first and pages_last else pages_first

    lines = [f"@{etype}{{{key},"]
    lines.append(f"  author = {{{authors}}},")
    lines.append(f"  title  = {{{title}}},")
    if venue_name:
        field = "journal" if etype == "article" else "booktitle"
        lines.append(f"  {field} = {{{venue_name}}},")
    if year:
        lines.append(f"  year   = {{{year}}},")
    if volume:
        lines.append(f"  volume = {{{volume}}},")
    if issue:
        lines.append(f"  number = {{{issue}}},")
    if pages:
        lines.append(f"  pages  = {{{pages}}},")
    if doi:
        lines.append(f"  doi    = {{{doi}}},")
    else:
        lines.append("  note   = {No DOI available},")
    lines.append("}\n")
    return "\n".join(lines)


def misc_bibtex(key: str, ref: dict) -> str:
    """Render an ``@misc`` entry from the parsed (unenriched) reference.

    Used for URL-only references and as a placeholder for references that
    could not be resolved.

    Args:
        key: Citation key to use for the entry.
        ref: Parsed reference. Reads ``raw`` (searched for the first http(s)
            URL), ``title_guess``, ``authors_guess``, ``year_guess`` and
            ``status``; missing or empty values fall back to defaults.

    Returns:
        The entry text, terminated by ``"}\\n"``.
    """

    def _esc(text) -> str:
        # Escape &, % and # (unless already escaped) so LaTeX can typeset
        # the field.
        return re.sub(r"(?<!\\)([&%#])", r"\\\1", str(text))

    url_match = re.search(r"https?://\S+", ref.get("raw") or "")
    # Trailing sentence punctuation is not part of the URL.
    url = url_match.group(0).rstrip(".,;") if url_match else ""
    title = _esc(ref.get("title_guess") or "Web resource")
    authors = _esc(ref.get("authors_guess") or "Anon")
    year = ref.get("year_guess") or "n.d."

    # Placeholders for unresolved references must not claim to be URL-only.
    # Without a status the historical note is kept.
    if ref.get("status", "url_only") == "url_only":
        note = "URL-only reference, classified as @misc"
    else:
        note = "Unresolved reference, placeholder @misc entry"

    lines = [f"@misc{{{key},"]
    lines.append(f"  author = {{{authors}}},")
    lines.append(f"  title  = {{{title}}},")
    lines.append(f"  year   = {{{year}}},")
    if url:
        lines.append(f"  howpublished = {{\\url{{{url}}}}},")
    lines.append("  urldate = {2024-02-20},")
    lines.append(f"  note   = {{{note}}},")
    lines.append("}\n")
    return "\n".join(lines)


def resolve(ref: dict) -> dict:
    """Try to match one parsed reference against OpenAlex.

    Returns a copy of *ref* extended with ``status``, ``confidence`` and
    ``openalex``. ``status`` is one of ``url_only``, ``no_title_parsed``,
    ``error: <message>``, ``no_results``, ``no_match``, ``accepted`` or
    ``flagged``. For the last two, ``year_diff`` is added as well and
    ``openalex`` holds a summary of the best candidate plus the full record
    under ``raw``.
    """

    def unmatched(status: str, confidence: float | None) -> dict:
        # Common shape of every outcome that carries no OpenAlex record.
        return {**ref, "status": status, "confidence": confidence, "openalex": None}

    if ref["is_url_only"]:
        return unmatched("url_only", None)

    title = ref.get("title_guess", "")
    year = ref.get("year_guess")
    if not title:
        return unmatched("no_title_parsed", 0.0)

    try:
        payload = openalex_search(title, year)
    except Exception as exc:
        return unmatched(f"error: {exc}", 0.0)

    candidates = payload.get("results", []) if payload else []
    if not candidates:
        return unmatched("no_results", 0.0)

    # Score the first five candidates by title similarity; max() keeps the
    # earliest candidate among equal scores.
    scored = [
        (fuzz.token_set_ratio(title, cand.get("title", "") or "") / 100.0, cand)
        for cand in candidates[:5]
    ]
    top_score, top_work = max(scored, key=lambda pair: pair[0])
    if top_score <= 0.0 or not top_work:
        return unmatched("no_match", 0.0)

    # Without both years the difference stays at 99, which rules out
    # auto-acceptance.
    matched_year = top_work.get("publication_year")
    if year and matched_year:
        year_diff = abs(int(year) - int(matched_year))
    else:
        year_diff = 99

    is_accepted = top_score >= CONFIDENCE_THRESHOLD and year_diff <= YEAR_TOLERANCE

    source = (top_work.get("primary_location") or {}).get("source") or {}
    venue = source.get("display_name") if isinstance(source, dict) else None

    match_summary = {
        "id": top_work.get("id"),
        "title": top_work.get("title"),
        "doi": top_work.get("doi"),
        "year": top_work.get("publication_year"),
        "venue": venue,
        "raw": top_work,
    }
    return {
        **ref,
        "status": "accepted" if is_accepted else "flagged",
        "confidence": round(top_score, 3),
        "year_diff": year_diff,
        "openalex": match_summary,
    }


def _first_author_surname(work: dict) -> str:
    """Return the last word of the first author's display name, or ``"anon"``."""
    authorships = work.get("authorships") or []
    if not authorships:
        return "anon"
    author = (authorships[0] or {}).get("author") or {}
    words = (author.get("display_name") or "").split()
    return words[-1] if words else "anon"


def _guessed_first_author(ref: dict) -> str:
    """Return the first word of ``authors_guess`` reduced to a-z, or ``"anon"``."""
    words = (ref.get("authors_guess") or "Anon").split()
    first = words[0].lower() if words else ""
    return re.sub(r"[^a-z]", "", first) or "anon"


def _claim_key(base: str, seen_keys: set) -> str:
    """Return *base*, suffixed if necessary to be unique, and record it."""
    key = base
    attempt = 0
    while key in seen_keys:
        # Suffixes a..z as before; numbers afterwards so the suffix never
        # runs past "z" into "{" and other non-alphanumeric characters.
        if attempt < 26:
            key = f"{base}{chr(ord('a') + attempt)}"
        else:
            key = f"{base}{attempt - 25}"
        attempt += 1
    seen_keys.add(key)
    return key


def main() -> None:
    """Resolve all input references and write the JSON, BibTeX and report files.

    Steps: load ``INPUT``; resolve every reference; write ``RESOLVED``; build
    one BibTeX entry per reference (enriched entry for accepted matches,
    ``@misc`` otherwise); rewrite ``RESOLVED`` with the assigned keys; write
    ``BIB`` and ``REPORT``.
    """
    refs = json.loads(INPUT.read_text(encoding="utf-8"))
    print(f"Loaded {len(refs)} refs from {INPUT}")

    resolved = []
    for ref in refs:
        out = resolve(ref)
        resolved.append(out)
        print(f"  [{ref['num']:2d}] {out['status']:10s} conf={out['confidence']} year={ref.get('year_guess')}")

    # First save: keeps the lookup results even if BibTeX generation fails.
    RESOLVED.write_text(json.dumps(resolved, indent=2, ensure_ascii=False, default=str), encoding="utf-8")

    # Build BibTeX entries
    seen_keys: set[str] = set()
    bibtex_entries = []
    detail_lines = []
    accepted = flagged = url_only = errored = 0

    for r in resolved:
        status = r["status"]

        if status == "accepted" and r.get("openalex"):
            w = r["openalex"]["raw"]
            year = w.get("publication_year") or r.get("year_guess") or 0
            title = (w.get("title") or r.get("title_guess") or "").strip()
            key = _claim_key(bib_key(_first_author_surname(w), year, title), seen_keys)
            r["bib_key"] = key
            bibtex_entries.append(bibtex_from_work(key, w, r))
            accepted += 1
            continue

        is_url_only = status == "url_only"
        if is_url_only:
            url_only += 1
        else:
            # Lookup failures are tallied separately from low-confidence
            # matches; both are listed in the report for manual review.
            if status.startswith("error"):
                errored += 1
            else:
                flagged += 1
            detail_lines.append(f"\n## Flagged ref #{r['num']}")
            detail_lines.append(f"- Status: {status}")
            detail_lines.append(f"- Title guess: {r.get('title_guess','')}")
            detail_lines.append(f"- Confidence: {r.get('confidence')}")
            detail_lines.append(f"- Raw: {(r.get('raw') or '')[:200]}")

        # URL-only refs and unresolved refs both end up as @misc, so the bib
        # file contains an entry for every input reference.
        year = r.get("year_guess") or 0
        fallback_title = "web" if is_url_only else "unresolved"
        title = (r.get("title_guess") or "").strip() or fallback_title
        key = _claim_key(bib_key(_guessed_first_author(r), year, title), seen_keys)
        r["bib_key"] = key
        bibtex_entries.append(misc_bibtex(key, r))

    # Re-save with bib_keys populated
    RESOLVED.write_text(json.dumps(resolved, indent=2, ensure_ascii=False, default=str), encoding="utf-8")

    BIB.parent.mkdir(parents=True, exist_ok=True)
    BIB.write_text("% Auto-generated by scripts/refs_openalex.py\n% Author-year keys; Springer sn-basic format\n\n" + "\n".join(bibtex_entries), encoding="utf-8")

    report_lines = [
        "# OpenAlex resolution report\n",
        f"Total refs: {len(resolved)}\n",
        f"- Accepted: {accepted}",
        f"- Flagged (low-confidence): {flagged}",
        f"- URL-only (@misc): {url_only}",
        f"- Errored: {errored}\n",
        *detail_lines,
    ]
    REPORT.write_text("\n".join(report_lines), encoding="utf-8")

    print(f"\nWritten: {BIB}, {RESOLVED}, {REPORT}")
    print(f"Accepted={accepted}  Flagged={flagged}  URL-only={url_only}  Errored={errored}")


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
