# MultiPhysicsVault/scripts/tiling-check.py

#!/usr/bin/env python3
"""tiling-check.py — DragonScale Mechanism 3: semantic tiling lint.

Computes per-page embeddings via a local ollama instance and reports
candidate duplicate page pairs. Read-only; never modifies wiki pages.

Security model:
- Defaults to http://127.0.0.1:11434. Remote ollama endpoints require
  --allow-remote-ollama explicitly (vault bodies are POSTed as embedding
  input; a hostile env var would otherwise exfiltrate content).
- Rejects symlinked page files to prevent escape outside the vault root.

Feature-gated: exits 10 if ollama is unreachable or 11 if the embedding
model is not pulled, so the calling skill can no-op gracefully. Exits 0
on success, 2 on usage error, 3 on cache corruption, and 4 when the
number of candidate pages exceeds the hard scale limit.

Concurrency:
- Locks `.vault-meta/.tiling.lock` (flock exclusive) around cache I/O.
- Per-PID temp file to avoid shared-tempfile races.

Usage:
  tiling-check.py                      # run; exit 10/11 if ollama/model missing
  tiling-check.py --report PATH        # also write report to PATH
  tiling-check.py --rebuild-cache      # ignore cached embeddings
  tiling-check.py --peek               # structured diagnostics; no compute
  tiling-check.py --allow-remote-ollama # accept non-localhost OLLAMA_URL
"""

import argparse
import fcntl
import hashlib
import itertools
import json
import math
import os
import re
import sys
import urllib.error
import urllib.parse
import urllib.request
from datetime import datetime, timezone
from pathlib import Path

# Endpoint used when the OLLAMA_URL environment variable is unset.
DEFAULT_OLLAMA_URL = "http://127.0.0.1:11434"
# Embedding model used when --model is not given.
DEFAULT_MODEL = "nomic-embed-text"
# Timeout (seconds) for the /api/version and /api/tags probes.
OLLAMA_TIMEOUT_SEC = 3
# Timeout (seconds) for a single /api/embeddings request.
EMBED_TIMEOUT_SEC = 30
MAX_RESPONSE_BYTES = 4 * 1024 * 1024  # 4 MB; embeddings can be ~10 KB each

# The vault root is the directory two levels above this script
# (the parent of the directory containing it).
VAULT_ROOT = Path(__file__).resolve().parent.parent
WIKI_DIR = VAULT_ROOT / "wiki"
META_DIR = VAULT_ROOT / ".vault-meta"
CACHE_PATH = META_DIR / "tiling-cache.json"
# flock target that serialises cache reads/writes between processes.
CACHE_LOCK = META_DIR / ".tiling.lock"
THRESHOLDS_PATH = META_DIR / "tiling-thresholds.json"

# Pages whose frontmatter `type:` is one of these are not embedded.
EXCLUDE_TYPES = {"meta", "fold"}
# Pages with one of these file names are not embedded, wherever they live.
EXCLUDE_FILENAMES = {
    "_index.md", "index.md", "log.md", "hot.md", "overview.md",
    "dashboard.md", "Wiki Map.md", "getting-started.md",
}
# Vault-relative POSIX path prefixes whose pages are not embedded.
EXCLUDE_PATH_PREFIXES = ("wiki/folds/", "wiki/meta/")
# Files whose UTF-8 encoded text exceeds this size are skipped ("too_large").
MAX_BODY_BYTES = 128 * 1024
# Above this many candidate files a warning is logged before embedding.
SCALE_WARN_PAGES = 500
# Above this many candidate files the run aborts with EXIT_SCALE_EXCEEDED.
SCALE_HARD_FAIL_PAGES = 5000

# Process exit codes.
EXIT_OK = 0
EXIT_USAGE = 2
EXIT_CACHE_CORRUPT = 3
EXIT_SCALE_EXCEEDED = 4
EXIT_NO_OLLAMA = 10
EXIT_NO_MODEL = 11

# Matches a `---` fenced frontmatter block; applied with .match(), so it
# only matches at the very start of the text. Group 1 is the raw frontmatter.
FRONTMATTER_RE = re.compile(r"^---\n(.*?)\n---\n", re.DOTALL)
# Captures the first whitespace-delimited token after a line-leading `type:`.
TYPE_RE = re.compile(r"^type:\s*(\S+)", re.MULTILINE)


def log(msg: str) -> None:
    """Write *msg* followed by a newline to stderr.

    Diagnostics go to stderr so that stdout carries only the report / JSON.
    """
    sys.stderr.write(f"{msg}\n")


def _is_local_url(url: str) -> bool:
    try:
        host = urllib.parse.urlparse(url).hostname or ""
    except ValueError:
        return False
    return host in ("127.0.0.1", "localhost", "::1")


def _http_get_json(url: str, timeout: float) -> dict:
    """GET *url* and return the decoded JSON body.

    At most MAX_RESPONSE_BYTES + 1 bytes are read, so an oversized response
    is detected without buffering all of it.

    Raises:
        RuntimeError: the body is larger than MAX_RESPONSE_BYTES.
        ValueError: the body is not valid UTF-8 / JSON.
        urllib.error.URLError, OSError: propagated from urlopen.
    """
    limit = MAX_RESPONSE_BYTES
    with urllib.request.urlopen(url, timeout=timeout) as response:
        body = response.read(limit + 1)
    if len(body) <= limit:
        return json.loads(body.decode("utf-8"))
    raise RuntimeError("response exceeded size limit")


def _http_post_json(url: str, payload: dict, timeout: float) -> dict:
    """POST *payload* as JSON to *url* and return the decoded JSON reply.

    The reply is read with the same MAX_RESPONSE_BYTES cap as the GET helper.

    Raises:
        RuntimeError: the reply is larger than MAX_RESPONSE_BYTES.
        ValueError: the reply is not valid UTF-8 / JSON.
        urllib.error.URLError, OSError: propagated from urlopen.
    """
    request = urllib.request.Request(
        url,
        data=json.dumps(payload).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(request, timeout=timeout) as response:
        body = response.read(MAX_RESPONSE_BYTES + 1)
    if len(body) <= MAX_RESPONSE_BYTES:
        return json.loads(body.decode("utf-8"))
    raise RuntimeError("response exceeded size limit")


def detect_ollama(url: str) -> bool:
    """Return True if ``{url}/api/version`` answers with parseable JSON.

    Any network failure, timeout, oversized reply or undecodable body is
    reported as "not reachable" (False) rather than raised.
    """
    probe_failures = (
        urllib.error.URLError, OSError, ValueError, TimeoutError, RuntimeError,
    )
    try:
        _http_get_json(f"{url}/api/version", OLLAMA_TIMEOUT_SEC)
    except probe_failures:
        return False
    else:
        return True


def detect_model(url: str, model: str) -> bool:
    """Return True if *model* appears in the ``{url}/api/tags`` listing.

    A listed name matches when it equals *model* exactly or starts with
    ``"{model}:"`` (i.e. the same model with any tag).

    Any transport failure or unexpected response shape yields False; this
    function does not raise for malformed server replies.
    """
    try:
        data = _http_get_json(f"{url}/api/tags", OLLAMA_TIMEOUT_SEC)
    except (urllib.error.URLError, OSError, ValueError, TimeoutError, RuntimeError):
        return False
    # The helper returns whatever JSON value the server sent. A list or
    # scalar has no .get(), so treat any non-object reply as "not present"
    # instead of crashing with AttributeError.
    if not isinstance(data, dict):
        return False
    models = data.get("models")
    if not isinstance(models, list):
        return False
    tagged_prefix = f"{model}:"
    for entry in models:
        if not isinstance(entry, dict):
            continue
        name = entry.get("name", "")
        if isinstance(name, str) and (name == model or name.startswith(tagged_prefix)):
            return True
    return False


def parse_frontmatter(text: str) -> tuple[dict, str]:
    """Split *text* into ``(frontmatter, body)``.

    Only the ``type`` key is extracted from the frontmatter; surrounding
    whitespace and quote characters are stripped from its value. When
    *text* has no leading frontmatter block, returns ``({}, text)``.
    """
    match = FRONTMATTER_RE.match(text)
    if match is None:
        return {}, text
    meta: dict = {}
    type_match = TYPE_RE.search(match.group(1))
    if type_match is not None:
        value = type_match.group(1).strip()
        meta["type"] = value.strip('"').strip("'")
    return meta, text[match.end():]


def body_hash(body: str, model: str) -> str:
    """Return the SHA-256 hex digest of *body*, salted with the model name.

    The ``model=<name>\\n`` prefix makes the digest differ between models
    for the same body text.
    """
    salt = f"model={model}\n".encode("utf-8")
    return hashlib.sha256(salt + body.encode("utf-8")).hexdigest()


def cosine(a: list[float], b: list[float]) -> float:
    """Return the cosine similarity of vectors *a* and *b*.

    Returns 0.0 when either vector has zero magnitude.

    Raises:
        ValueError: the vectors have different lengths.
    """
    len_a, len_b = len(a), len(b)
    if len_a != len_b:
        raise ValueError(f"dim mismatch: {len_a} vs {len_b}")
    norm_a = math.sqrt(sum(v * v for v in a))
    norm_b = math.sqrt(sum(v * v for v in b))
    # A zero vector has no direction; define its similarity as 0.
    if norm_a == 0.0 or norm_b == 0.0:
        return 0.0
    dot = sum(x * y for x, y in zip(a, b))
    return dot / (norm_a * norm_b)


def _lock_cache():
    """Take an exclusive flock on CACHE_LOCK and return the open descriptor.

    Creates META_DIR and the lock file if needed. Blocks until the lock is
    granted. The caller must hand the descriptor to _unlock_cache().
    """
    META_DIR.mkdir(exist_ok=True)
    open_flags = os.O_CREAT | os.O_RDWR
    lock_fd = os.open(str(CACHE_LOCK), open_flags, 0o644)
    try:
        fcntl.flock(lock_fd, fcntl.LOCK_EX)
    except OSError:
        # Do not leak the descriptor when locking fails.
        os.close(lock_fd)
        raise
    else:
        return lock_fd


def _unlock_cache(fd: int) -> None:
    try:
        fcntl.flock(fd, fcntl.LOCK_UN)
    finally:
        os.close(fd)


def load_cache(current_model: str) -> dict:
    """Load the embedding cache from CACHE_PATH.

    Returns:
        The parsed cache dict, or a fresh empty cache when the file does
        not exist or was written for a different model.

    Exits the process with EXIT_CACHE_CORRUPT when the file cannot be read
    or decoded, its root is not a JSON object, its version is not 1, or
    its ``embeddings`` member is not a dict.
    """
    fresh = {"version": 1, "model": current_model, "embeddings": {}}
    # Open directly instead of checking exists() first, so there is no
    # window in which the file can vanish between the check and the open.
    try:
        with CACHE_PATH.open() as f:
            data = json.load(f)
    except FileNotFoundError:
        return fresh
    except (OSError, ValueError) as exc:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        log(f"ERR: cache read failed: {exc}")
        sys.exit(EXIT_CACHE_CORRUPT)
    # Valid JSON need not be an object; a list/scalar root would otherwise
    # crash on .get() with AttributeError instead of the documented exit 3.
    if not isinstance(data, dict):
        log(f"ERR: cache root is {type(data).__name__}, expected a JSON object")
        sys.exit(EXIT_CACHE_CORRUPT)
    if data.get("version") != 1:
        log(f"ERR: unknown cache version: {data.get('version')}")
        sys.exit(EXIT_CACHE_CORRUPT)
    cached_model = data.get("model", "")
    if cached_model != current_model:
        log(f"INFO: cached model '{cached_model}' differs from current '{current_model}'; invalidating cache")
        return fresh
    if not isinstance(data.get("embeddings"), dict):
        log("ERR: cache.embeddings is not a dict")
        sys.exit(EXIT_CACHE_CORRUPT)
    return data


def save_cache(cache: dict) -> None:
    """Write *cache* to CACHE_PATH via a per-PID temp file and rename.

    The temp file lives next to CACHE_PATH and is moved into place with
    Path.replace(). If serialisation or the rename fails, the temp file is
    removed before the exception propagates, so failed runs do not leave
    stale ``*.tmp`` files in META_DIR.
    """
    META_DIR.mkdir(exist_ok=True)
    tmp = CACHE_PATH.with_name(f"{CACHE_PATH.stem}.{os.getpid()}.tmp")
    try:
        with tmp.open("w") as f:
            json.dump(cache, f, indent=2)
        tmp.replace(CACHE_PATH)
    except BaseException:
        # Best-effort cleanup; the original failure is what the caller
        # needs to see, so a secondary unlink error is deliberately ignored.
        try:
            tmp.unlink(missing_ok=True)
        except OSError:
            pass
        raise


def load_thresholds() -> dict:
    """Return the similarity thresholds configuration.

    Reads THRESHOLDS_PATH when it exists; otherwise returns built-in,
    uncalibrated defaults (error band 0.90, review band 0.80).
    """
    if THRESHOLDS_PATH.exists():
        with THRESHOLDS_PATH.open() as f:
            return json.load(f)
    defaults = {
        "version": 1,
        "model": DEFAULT_MODEL,
        "bands": {"error": 0.90, "review": 0.80},
        "calibrated": False,
        "calibration_pairs_labeled": 0,
    }
    return defaults


def included(path: Path, fm: dict) -> tuple[bool, str]:
    """Decide whether the page at *path* takes part in the tiling check.

    Args:
        path: page file; must be located under VAULT_ROOT.
        fm: parsed frontmatter of the page (only ``type`` is consulted).

    Returns:
        ``(True, "included")`` or ``(False, reason)`` where *reason* names
        the first rule that excluded the page.
    """
    rel = path.relative_to(VAULT_ROOT).as_posix()
    if path.is_symlink():
        return False, "symlink"
    target = path.resolve()
    try:
        target.relative_to(VAULT_ROOT.resolve())
    except ValueError:
        return False, "escapes vault"
    if path.name in EXCLUDE_FILENAMES:
        return False, "excluded filename"
    matched_prefix = next(
        (prefix for prefix in EXCLUDE_PATH_PREFIXES if rel.startswith(prefix)),
        None,
    )
    if matched_prefix is not None:
        return False, f"under {matched_prefix}"
    page_type = fm.get("type")
    if page_type in EXCLUDE_TYPES:
        return False, f"type={page_type}"
    return True, "included"


def embed(text: str, model: str, url: str) -> list[float]:
    """Request an embedding for *text* from ``{url}/api/embeddings``.

    Args:
        text: the content to embed (sent as the ``prompt`` field).
        model: embedding model name.
        url: base URL of the ollama server.

    Returns:
        The non-empty list of finite numbers from the reply's ``embedding``
        field.

    Raises:
        RuntimeError: the reply is not a JSON object, has no usable
            ``embedding`` list, or the list holds non-numeric or
            non-finite values.
        urllib.error.URLError, OSError, ValueError: propagated from the
            HTTP helper.
    """
    data = _http_post_json(
        f"{url}/api/embeddings",
        {"model": model, "prompt": text},
        EMBED_TIMEOUT_SEC,
    )
    # The helper returns any JSON value; only an object can carry the field.
    if not isinstance(data, dict):
        raise RuntimeError(f"ollama returned a non-object response: {str(data)[:200]}")
    emb = data.get("embedding")
    if not isinstance(emb, list) or not emb:
        raise RuntimeError(f"ollama returned no embedding: {str(data)[:200]}")
    for v in emb:
        # bool is a subclass of int, so it must be rejected explicitly.
        if isinstance(v, bool) or not isinstance(v, (int, float)):
            raise RuntimeError("embedding contains non-numeric values")
        # json.loads accepts NaN/Infinity literals; they would poison every
        # similarity computed from this vector.
        if isinstance(v, float) and not math.isfinite(v):
            raise RuntimeError("embedding contains non-finite values")
    return emb


def _utc_timestamp() -> str:
    """Return the current UTC time formatted as ``YYYY-MM-DDTHH:MM:SSZ``."""
    # datetime.utcnow() is deprecated since Python 3.12; a timezone-aware
    # value formatted explicitly yields the same string.
    return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")


def _similar_pairs(
    pages: list[tuple[str, list[float]]],
    review: float,
) -> list[tuple[float, str, str]]:
    """Return ``(similarity, path_a, path_b)`` for pairs scoring >= *review*.

    Pairs whose vectors cannot be compared are logged and skipped. The
    result is sorted in descending order.
    """
    pairs: list[tuple[float, str, str]] = []
    for (a_path, a_emb), (b_path, b_emb) in itertools.combinations(pages, 2):
        try:
            sim = cosine(a_emb, b_emb)
        except (TypeError, ValueError) as exc:
            # ValueError: dimension mismatch. TypeError: a cached vector
            # holding non-numeric items (cache contents are not re-validated).
            log(f"WARN cosine skip ({a_path}, {b_path}): {exc}")
            continue
        if sim >= review:
            pairs.append((sim, a_path, b_path))
    pairs.sort(reverse=True)
    return pairs


def run_check(
    rebuild: bool,
    report_path: Path | None,
    ollama_url: str,
    model: str,
) -> int:
    """Embed every eligible wiki page and report near-duplicate pairs.

    Args:
        rebuild: ignore any existing cache and recompute all embeddings.
        report_path: optional file to write the report to; it must resolve
            to a location inside VAULT_ROOT.
        ollama_url: base URL of the ollama server.
        model: embedding model name.

    Returns:
        EXIT_OK on success; EXIT_NO_OLLAMA / EXIT_NO_MODEL when the backend
        is unavailable; EXIT_SCALE_EXCEEDED when there are more candidate
        files than SCALE_HARD_FAIL_PAGES; EXIT_USAGE when *report_path*
        escapes the vault. load_cache() may exit the process with
        EXIT_CACHE_CORRUPT.

    The report is always printed to stdout. The cache lock is held for the
    whole scan-and-embed phase, including the HTTP requests.
    """
    if not detect_ollama(ollama_url):
        log(f"ollama not reachable at {ollama_url}; skipping tiling check")
        return EXIT_NO_OLLAMA
    if not detect_model(ollama_url, model):
        log(f"model '{model}' not pulled; run: ollama pull {model}")
        return EXIT_NO_MODEL

    thresholds = load_thresholds()
    # Read the bands up front so a thresholds file without them fails before
    # any page is embedded rather than after all the work is done.
    review = thresholds["bands"]["review"]
    error_ = thresholds["bands"]["error"]

    # Loop invariant: resolve the vault root once, not once per page.
    vault_root = VAULT_ROOT.resolve()

    pages: list[tuple[str, list[float]]] = []
    computed = 0
    cached_hits = 0
    skipped_counts: dict[str, int] = {}
    live_paths: set[str] = set()

    def skip(reason: str) -> None:
        skipped_counts[reason] = skipped_counts.get(reason, 0) + 1

    lock_fd = _lock_cache()
    try:
        cache = (load_cache(model) if not rebuild
                 else {"version": 1, "model": model, "embeddings": {}})

        candidates = sorted(WIKI_DIR.rglob("*.md"))
        scale_n = len(candidates)
        if scale_n > SCALE_HARD_FAIL_PAGES:
            log(f"ERR: {scale_n} pages exceed hard-fail limit {SCALE_HARD_FAIL_PAGES}")
            return EXIT_SCALE_EXCEEDED
        if scale_n > SCALE_WARN_PAGES:
            log(f"WARN: {scale_n} pages; cold-cache embed will issue ~{scale_n} POSTs to ollama")
        scanned = scale_n

        for md in candidates:
            # Symlink and vault-root guards must run BEFORE read_text so a
            # hostile symlink cannot cause off-vault content to be read and
            # POSTed to the embedding endpoint.
            if md.is_symlink():
                skip("symlink")
                continue
            try:
                md.resolve(strict=True).relative_to(vault_root)
            except (OSError, RuntimeError, ValueError):
                # RuntimeError: symlink loop reported by older Pythons.
                skip("escapes vault")
                continue
            try:
                text = md.read_text(encoding="utf-8")
            except (OSError, UnicodeDecodeError):
                skip("read_error")
                continue
            if len(text.encode("utf-8")) > MAX_BODY_BYTES:
                skip("too_large")
                continue
            fm, body = parse_frontmatter(text)
            ok, reason = included(md, fm)
            if not ok:
                skip(reason)
                continue
            rel = md.relative_to(VAULT_ROOT).as_posix()
            live_paths.add(rel)
            h = body_hash(body, model)
            entry = cache["embeddings"].get(rel)
            # Only trust a cache entry that has the expected shape; anything
            # else (hand-edited or truncated cache) is treated as a miss and
            # recomputed instead of raising KeyError/AttributeError.
            if (
                isinstance(entry, dict)
                and entry.get("hash") == h
                and isinstance(entry.get("embedding"), list)
                and entry["embedding"]
            ):
                pages.append((rel, entry["embedding"]))
                cached_hits += 1
                continue
            try:
                emb = embed(body, model, ollama_url)
            except Exception as exc:
                # Best effort per page: one failed request must not abort
                # the whole run; the page is counted and skipped.
                log(f"ERR embedding {rel}: {exc}")
                skip("embed_error")
                continue
            cache["embeddings"][rel] = {
                "hash": h,
                "embedding": emb,
                "computed_at": _utc_timestamp(),
            }
            pages.append((rel, emb))
            computed += 1

        # Orphan GC: drop cache entries for paths that no longer exist.
        orphans = [k for k in cache["embeddings"] if k not in live_paths]
        for k in orphans:
            del cache["embeddings"][k]

        save_cache(cache)
    finally:
        _unlock_cache(lock_fd)

    pairs = _similar_pairs(pages, review)
    errors = [p for p in pairs if p[0] >= error_]
    reviews = [p for p in pairs if review <= p[0] < error_]

    out_lines: list[str] = []
    out_lines.append("# Semantic Tiling Report")
    out_lines.append("")
    out_lines.append(f"- generated: {_utc_timestamp()}")
    out_lines.append(f"- model: {model}")
    out_lines.append(f"- ollama_url: {ollama_url}")
    out_lines.append(f"- thresholds: error>={error_}, review={review}-{error_}")
    out_lines.append(f"- calibrated: {thresholds.get('calibrated', False)}"
                     + (" (using uncalibrated defaults)" if not thresholds.get("calibrated") else ""))
    out_lines.append(f"- pages scanned: {scanned}; embedded: {len(pages)}; skipped: {sum(skipped_counts.values())}")
    if skipped_counts:
        out_lines.append("- skipped reasons: " + ", ".join(f"{k}={v}" for k, v in sorted(skipped_counts.items())))
    out_lines.append(f"- cache hits: {cached_hits}; recomputed: {computed}; orphans pruned: {len(orphans)}")
    out_lines.append("")
    out_lines.append(f"## Errors (similarity >= {error_})")
    out_lines.append("")
    if errors:
        out_lines.extend(f"- `{sim:.4f}` {a} -- {b}" for sim, a, b in errors)
    else:
        out_lines.append("- none")
    out_lines.append("")
    out_lines.append(f"## Review ({review} <= similarity < {error_})")
    out_lines.append("")
    if reviews:
        out_lines.extend(f"- `{sim:.4f}` {a} -- {b}" for sim, a, b in reviews)
    else:
        out_lines.append("- none")
    report = "\n".join(out_lines) + "\n"

    print(report)
    if report_path is not None:
        # Confine report writes to VAULT_ROOT. A path that resolves outside
        # the vault is refused (prevents `--report /etc/passwd` style
        # accidents or hostile args from writing outside the repo).
        try:
            resolved_report = (
                report_path if report_path.is_absolute() else (Path.cwd() / report_path)
            ).resolve()
            resolved_report.relative_to(vault_root)
        except ValueError:
            log(f"ERR: --report path '{report_path}' escapes vault root {VAULT_ROOT}")
            return EXIT_USAGE
        resolved_report.parent.mkdir(parents=True, exist_ok=True)
        resolved_report.write_text(report, encoding="utf-8")
        log(f"report written: {resolved_report}")

    return EXIT_OK


def cmd_peek(ollama_url: str, model: str) -> int:
    """Print structured diagnostics as a JSON object on stdout.

    Probes the ollama server and inspects the cache and thresholds files
    without computing any embeddings. Malformed cache or thresholds files
    are reported in the diagnostics rather than raising.

    Returns:
        EXIT_NO_OLLAMA, EXIT_NO_MODEL or EXIT_CACHE_CORRUPT for the first
        problem found in that order, otherwise EXIT_OK.
    """
    diag: dict = {}
    script_path = Path(__file__).resolve()
    diag["script_path"] = str(script_path)
    diag["script_executable"] = os.access(script_path, os.X_OK)
    diag["python"] = sys.executable
    diag["vault_root"] = str(VAULT_ROOT)
    diag["ollama_url"] = ollama_url
    diag["ollama_reachable"] = detect_ollama(ollama_url)
    diag["model_requested"] = model
    diag["model_present"] = detect_model(ollama_url, model) if diag["ollama_reachable"] else False
    diag["cache_present"] = CACHE_PATH.exists()
    diag["cache_readable"] = False
    diag["cache_entries"] = 0
    diag["cache_model"] = None
    if diag["cache_present"]:
        try:
            with CACHE_PATH.open() as f:
                c = json.load(f)
        except (OSError, ValueError) as exc:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            diag["cache_error"] = str(exc)
        else:
            if isinstance(c, dict):
                embeddings = c.get("embeddings")
                embeddings_ok = isinstance(embeddings, dict)
                diag["cache_readable"] = c.get("version") == 1 and embeddings_ok
                # Count entries only when the member has the expected type;
                # len() on e.g. a number would raise TypeError.
                diag["cache_entries"] = len(embeddings) if embeddings_ok else 0
                diag["cache_model"] = c.get("model")
            else:
                # Valid JSON whose root is a list/scalar has no .get().
                diag["cache_error"] = "cache root is not a JSON object"
    diag["thresholds_present"] = THRESHOLDS_PATH.exists()
    diag["thresholds_readable"] = False
    if diag["thresholds_present"]:
        try:
            with THRESHOLDS_PATH.open() as f:
                t = json.load(f)
        except (OSError, ValueError):
            pass  # thresholds_readable stays False
        else:
            # A non-object root cannot carry the expected keys; report it
            # as unreadable instead of crashing on .get().
            if isinstance(t, dict):
                diag["thresholds_readable"] = True
                diag["thresholds_calibrated"] = bool(t.get("calibrated", False))
                diag["thresholds_bands"] = t.get("bands", {})
    print(json.dumps(diag, indent=2))
    if not diag["ollama_reachable"]:
        return EXIT_NO_OLLAMA
    if not diag["model_present"]:
        return EXIT_NO_MODEL
    if diag["cache_present"] and not diag["cache_readable"]:
        return EXIT_CACHE_CORRUPT
    return EXIT_OK


def main(argv: list[str]) -> int:
    """Parse *argv* and dispatch to the peek or check command.

    The ollama endpoint comes from the OLLAMA_URL environment variable,
    falling back to DEFAULT_OLLAMA_URL. A non-local OLLAMA_URL is refused
    with EXIT_USAGE unless --allow-remote-ollama is given.

    Returns:
        The exit code of the selected command, or EXIT_USAGE.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--report", type=Path, default=None)
    parser.add_argument("--rebuild-cache", action="store_true")
    parser.add_argument("--peek", action="store_true")
    parser.add_argument(
        "--allow-remote-ollama",
        action="store_true",
        help="allow OLLAMA_URL env override pointing outside localhost",
    )
    parser.add_argument("--model", default=DEFAULT_MODEL)
    opts = parser.parse_args(argv)

    env_url = os.environ.get("OLLAMA_URL")
    ollama_url = env_url if env_url else DEFAULT_OLLAMA_URL
    # Only an explicit environment override can point away from localhost.
    is_remote_override = bool(env_url) and not _is_local_url(ollama_url)
    if is_remote_override and not opts.allow_remote_ollama:
        log(f"ERR: OLLAMA_URL={ollama_url!r} is not localhost. "
            f"Vault content would be POSTed to a non-local host. "
            f"Pass --allow-remote-ollama to override.")
        return EXIT_USAGE

    if opts.peek:
        return cmd_peek(ollama_url, opts.model)
    return run_check(
        rebuild=opts.rebuild_cache,
        report_path=opts.report,
        ollama_url=ollama_url,
        model=opts.model,
    )


# Script entry point: pass the CLI arguments (without the program name) to
# main() and use its return value as the process exit status.
if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))