"""Automated AI-tell + vague-claim + duplication linter for the source markdown.

Reads:  argv[1] (input .md), or DEFAULT_SOURCE when no argument is given
Writes: scripts/hostile_lint.json — machine-readable lint findings to be incorporated
        into reviewer-report.md by the author/executor.

Patterns intentionally err on the side of over-flagging — false positives are
acceptable because the hostile-reviewer posture is meant to be critical. Zero flags
on a known-generic LLM text would indicate the patterns need broadening.
"""
import json
import re
import sys
from pathlib import Path

# Directory that contains this file's own directory (two levels up from the file).
ROOT = Path(__file__).parent.parent
# Markdown file that main() lints when no path is passed on the command line.
DEFAULT_SOURCE = ROOT / "2024.02.20 Enhancing Security in Blockchain Networks_ Anomalies, Frauds, and Advanced Detection Techniques.md"
# Where main() writes the JSON findings report.
OUTPUT = ROOT / "scripts" / "hostile_lint.json"

# Regex sources for stock phrases that this linter treats as "AI tells".
# main() passes them to scan_patterns(), which applies each one
# case-insensitively to one line of the input at a time.
AI_TELL_PATTERNS = [
    r"dynamic(?:ally)? and rapidly evolving",
    r"Together, these studies",
    r"a rich and diverse body of research",
    r"sophisticated statistical models",
    r"it is important to note",
    r"multi-faceted approach",
    r"robust and effective",
    r"a comprehensive (?:review|overview|understanding)",
    r"continue to mature",
    r"highlight(?:s|ing)? the (?:need for|importance of|potential of)",
    r"significantly (?:impact|contribute|enhance)",
    r"plays a (?:crucial|key|vital) role",
    r"in the (?:context|realm|field) of",
    r"encompass(?:ing|es)",
    r"paramount importance",
    r"serve(?:s|d)? to (?:illustrate|demonstrate|highlight)",
    r"delve(?:s|d)? into",
    r"shed(?:s|ding)? light on",
    r"underscor(?:e|es|ing) the (?:importance|need|potential)",
]

# Regex sources for unattributed or hand-wavy claims ("several studies",
# "it is well-known", ...). main() passes them to scan_patterns(), which
# applies each one case-insensitively to one line of the input at a time.
VAGUE_CLAIM_PATTERNS = [
    r"has been shown",
    r"have been shown",
    r"it is well[- ]known",
    r"can be used to",
    r"various (?:techniques|methods|approaches|studies)",
    r"several (?:studies|techniques|methods|approaches)",
    r"a number of (?:studies|techniques|methods|approaches)",
    r"many (?:studies|researchers|approaches)",
    r"some (?:studies|researchers|approaches)",
    r"recent (?:studies|research|work)(?!\s+\w+,\s+\d{4})",  # negative lookahead: skip when directly followed by "<word>, <4 digits>" (a bare inline citation)
    r"prior research",
    r"previous work",
    r"it is widely",
    r"it is generally",
]


def scan_patterns(text: str, patterns: list[str]) -> list[dict]:
    """Collect every case-insensitive regex hit in *text*, one line at a time.

    The text is split on newline characters and each line is searched on its
    own, so a pattern never matches across a line break.

    Args:
        text: Document to scan.
        patterns: Regular-expression source strings.

    Returns:
        One dict per match, ordered by line, then by pattern order, then by
        position within the line. Keys: "line" (1-based line number),
        "pattern" (the regex source), "match" (the matched text) and
        "context" (the stripped line, truncated to 200 characters).
    """
    # Compile once up front; keep the source string for reporting.
    compiled = [(source, re.compile(source, re.IGNORECASE)) for source in patterns]

    hits = []
    for line_no, raw_line in enumerate(text.split("\n"), start=1):
        snippet = raw_line.strip()[:200]
        for source, regex in compiled:
            hits.extend(
                {
                    "line": line_no,
                    "pattern": source,
                    "match": found.group(0),
                    "context": snippet,
                }
                for found in regex.finditer(raw_line)
            )
    return hits


def detect_duplicate_bullets(text: str) -> list[dict]:
    """Report nearby numbered-list blocks that repeat the same items.

    The text is split into paragraphs on blank lines. A paragraph counts as a
    list block when it has at least three numbered lines ("1. ...", "2. ...").
    Each list block is compared with the next three paragraphs; a pair is
    reported when at least three of the first block's item signatures also
    occur in the second block.

    Args:
        text: Document to inspect.

    Returns:
        One dict per matching pair with keys "paragraph_a" and "paragraph_b"
        (0-based paragraph indices), "overlap" (number of shared signatures)
        and "example" (the first signature of the earlier block).
    """
    numbered_item = re.compile(r"^\s*\d+\.\s")

    def signatures(block: str) -> list[str]:
        # Signature = first 60 characters of a numbered line, then trimmed.
        return [
            row[:60].strip()
            for row in block.strip().split("\n")
            if numbered_item.match(row)
        ]

    blocks = re.split(r"\n\s*\n", text)
    # Extract each paragraph's signatures once instead of once per comparison.
    sigs_per_block = [signatures(block) for block in blocks]

    reports = []
    for first, sigs_a in enumerate(sigs_per_block):
        if len(sigs_a) < 3:
            continue
        # Look ahead at most three paragraphs.
        last_exclusive = min(first + 4, len(blocks))
        for second in range(first + 1, last_exclusive):
            sigs_b = sigs_per_block[second]
            if len(sigs_b) < 3:
                continue
            known = set(sigs_b)
            shared = sum(1 for sig in sigs_a if sig in known)
            if shared >= 3:
                reports.append({
                    "paragraph_a": first,
                    "paragraph_b": second,
                    "overlap": shared,
                    "example": sigs_a[0],
                })
    return reports


def count_post_2020_refs(source_text: str) -> int:
    """Count year tokens from 2020 onward after the "# References" heading.

    Scanning starts at the first occurrence of the text "# References" and
    runs to the end of the document. Every standalone four-digit number of
    the form 20xx with a value of 2020 or more is counted, so an entry that
    mentions several such years contributes more than once.

    Args:
        source_text: Full document text.

    Returns:
        The number of qualifying year tokens; 0 when the heading is absent.
    """
    start = source_text.find("# References")
    if start == -1:
        # No references heading: nothing to count.
        return 0
    tail = source_text[start:]
    recent = [
        found.group(1)
        for found in re.finditer(r"\b(20\d{2})\b", tail)
        if int(found.group(1)) >= 2020
    ]
    return len(recent)


def main() -> None:
    """Lint one markdown file and write the findings as JSON.

    The input path is taken from the first command-line argument, falling
    back to DEFAULT_SOURCE. The report is written to OUTPUT and a short
    summary is printed to stdout.

    Raises:
        SystemExit: If the source file does not exist (non-zero exit status
            with a message on stderr).
    """
    src = Path(sys.argv[1]) if len(sys.argv) > 1 else DEFAULT_SOURCE
    if not src.is_file():
        # Fail with a readable message rather than a raw traceback.
        sys.exit(f"Source markdown not found: {src}")
    text = src.read_text(encoding="utf-8")

    ai_tells = scan_patterns(text, AI_TELL_PATTERNS)
    vague = scan_patterns(text, VAGUE_CLAIM_PATTERNS)
    dupes = detect_duplicate_bullets(text)
    post_2020 = count_post_2020_refs(text)

    result = {
        "source": str(src),
        "ai_tells_count": len(ai_tells),
        "ai_tells": ai_tells,
        "vague_claims_count": len(vague),
        "vague_claims": vague,
        "duplicate_bullet_blocks": dupes,
        "post_2020_refs_count": post_2020,
    }

    # Make sure the report directory exists before writing into it.
    OUTPUT.parent.mkdir(parents=True, exist_ok=True)
    OUTPUT.write_text(json.dumps(result, indent=2, ensure_ascii=False), encoding="utf-8")

    print(f"Wrote {OUTPUT}")
    print(f"AI-tells: {len(ai_tells)}")
    print(f"Vague claims: {len(vague)}")
    print(f"Duplicate bullet blocks: {len(dupes)}")
    print(f"Post-2020 refs: {post_2020}")


# Run the linter only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()