"""Parse 35 references from the source markdown into structured JSON.

Reads: source.md (argv[1] or the default blockchain-security paper)
Writes: scripts/refs_input.json

Each record: {num, raw, title_guess, authors_guess, year_guess, venue_guess, is_url_only}
"""
import json
import re
import sys
from pathlib import Path

# Directory that holds this script; the default paper sits one level above it
# and the JSON output is written next to the script.
_SCRIPT_DIR = Path(__file__).parent

DEFAULT_SOURCE = (
    _SCRIPT_DIR.parent
    / "2024.02.20 Enhancing Security in Blockchain Networks_ Anomalies, Frauds, and Advanced Detection Techniques.md"
)
OUTPUT = _SCRIPT_DIR / "refs_input.json"

# A reference entry begins a line with a bracketed number whose brackets may be
# backslash-escaped, e.g.  \[1\] Ahmed, M., Mahmood, A.N. and Islam, M.R., 2016. A survey of...
# Group 1 is the reference number, group 2 the text that follows it.
REF_LINE = re.compile(r"^\\?\[(\d+)\\?\]\s*(.*)$", flags=re.MULTILINE)
# Stand-alone four-digit number beginning with 19 or 20.
YEAR_RE = re.compile(r"\b(19|20)\d{2}\b")
# http/https URL running up to the next whitespace character.
URL_RE = re.compile(r"https?://\S+")


def clean_escapes(s: str) -> str:
    r"""Remove Markdown backslash escapes from *s* and trim surrounding whitespace.

    Markdown allows any ASCII punctuation character to be backslash-escaped,
    so exported reference lists contain sequences such as ``\[``, ``\.``,
    ``\_`` or ``\&``. Every backslash that directly precedes an ASCII
    punctuation character is dropped and the character itself is kept. A
    backslash followed by anything else (letter, digit, whitespace) is left
    untouched because it is not an escape.

    Args:
        s: Text that may contain Markdown backslash escapes.

    Returns:
        The unescaped text with leading and trailing whitespace stripped.
    """
    # The four ranges together cover all ASCII punctuation:
    # !-/  :-@  [-`  {-~   (this includes the backslash itself, so a doubled
    # backslash collapses to a single literal one).
    s = re.sub(r"\\([!-/:-@\[-`{-~])", r"\1", s)
    return s.strip()


def parse_entry(num: int, raw: str) -> dict:
    """Build one reference record with heuristic field guesses.

    The raw text is first normalized with ``clean_escapes``. The first year
    found in it (per ``YEAR_RE``) acts as the pivot: the text before it is
    taken as the authors, the text after it is split into title and venue.
    When no year is found, the author, title and venue guesses stay empty and
    ``year_guess`` is ``None``.

    Args:
        num: Reference number taken from the bracketed marker.
        raw: Text of the reference entry following the marker.

    Returns:
        A dict with the keys ``num``, ``raw``, ``title_guess``,
        ``authors_guess``, ``year_guess``, ``venue_guess`` and ``is_url_only``.
    """
    text = clean_escapes(raw)

    # An entry counts as "URL only" when it contains a link but none of the
    # keywords that hint at a journal, conference or publisher.
    contains_url = URL_RE.search(text) is not None
    looks_academic = re.search(
        r"(pp\. ?\d|doi:|vol\. ?\d|journal|proceedings|IEEE|ACM|Springer|Elsevier|arxiv\.org|Lecture Notes|Conference|Wiley|Computing Surveys|Computational Statistics|Crime Science|Future Generation|Royal Society|Data Mining|Manufacturing Science|Biophysical|Environmental Management|Information Security for|Wireless Communications|DSAA|ICBC|IWCMC|ISSA|IUCC|Systems Architecture|Internet of Things|Review of Finance)",
        text,
        re.IGNORECASE,
    ) is not None

    record = {
        "num": num,
        "raw": text,
        "title_guess": "",
        "authors_guess": "",
        "year_guess": None,
        "venue_guess": "",
        "is_url_only": contains_url and not looks_academic,
    }

    year = YEAR_RE.search(text)
    if year is None:
        # Without a year there is no pivot to split on; keep the empty guesses.
        return record

    record["year_guess"] = int(year.group(0))
    record["authors_guess"] = text[: year.start()].rstrip(", .\t")

    tail = text[year.end():].lstrip(". ,\t")
    # The title is assumed to end at ". " followed by a capitalized word (the
    # start of the venue, or of "Available at"). A split point within the
    # first 15 characters is rejected.
    split = re.search(r"\.\s+(?=[A-Z][a-zA-Z]+)", tail)
    if split is not None and split.start() > 15:
        record["title_guess"] = tail[: split.start()].strip()
        record["venue_guess"] = tail[split.end():].strip()
    else:
        # Fallback: everything up to the first period (or the whole tail when
        # it has no period); the venue stays empty.
        record["title_guess"] = tail.partition(".")[0].strip()
    return record


def main(src: Path = DEFAULT_SOURCE) -> None:
    """Extract the reference list from *src* and write it to ``OUTPUT`` as JSON.

    The references section is located by its Markdown heading and ends at the
    next heading of the same or a shallower level, so a ``## References``
    section stops at the following ``##`` or ``#`` heading rather than only at
    a level-1 heading. Every line in the section matching ``REF_LINE`` is
    parsed with ``parse_entry``; the records are sorted by reference number,
    written to ``OUTPUT``, and a short summary is printed.

    Args:
        src: Path of the markdown file to read (UTF-8).

    Raises:
        SystemExit: If no references section is found in the source.
    """
    text = src.read_text(encoding="utf-8")

    # Prefer a real heading line so the heading level is known.
    heading = re.search(r"^(#{1,6})[ \t]+References\b.*$", text, re.MULTILINE)
    if heading is not None:
        level = len(heading.group(1))
        ref_text = text[heading.end():]
    else:
        # Fall back to a plain substring search for sources where the marker
        # does not start a line; treat it as a level-1 heading.
        marker = "# References"
        start = text.find(marker)
        if start < 0:
            raise SystemExit("References section not found in source")
        level = 1
        ref_text = text[start + len(marker):]

    # Cut at the next heading that is not nested below the references heading
    # (e.g. an appendix). Deeper sub-headings stay part of the section.
    following = re.search(rf"^#{{1,{level}}}[ \t]", ref_text, re.MULTILINE)
    if following is not None:
        ref_text = ref_text[: following.start()]

    entries = [
        parse_entry(int(match.group(1)), match.group(2))
        for match in REF_LINE.finditer(ref_text)
    ]
    entries.sort(key=lambda e: e["num"])

    OUTPUT.write_text(json.dumps(entries, indent=2, ensure_ascii=False), encoding="utf-8")
    url_only = [e["num"] for e in entries if e["is_url_only"]]
    print(f"Parsed {len(entries)} references -> {OUTPUT}")
    print(f"URL-only refs: {url_only}")


if __name__ == "__main__":
    # An optional first command-line argument overrides the default source path.
    cli_args = sys.argv[1:]
    main(Path(cli_args[0]) if cli_args else DEFAULT_SOURCE)
