add claude-obsidian

2026-05-28 10:57:16 +09:00
parent 1b07531a45
commit 72dad72703
205 changed files with 41703 additions and 80 deletions
@@ -0,0 +1,496 @@
+#!/usr/bin/env python3
+"""tiling-check.py — DragonScale Mechanism 3: semantic tiling lint.
+
+Computes per-page embeddings via a local ollama instance and reports
+candidate duplicate page pairs. Read-only; never modifies wiki pages.
+
+Security model:
+- Defaults to http://127.0.0.1:11434. Remote ollama endpoints require
+  --allow-remote-ollama explicitly (vault bodies are POSTed as embedding
+  input; a hostile env var would otherwise exfiltrate content).
+- Rejects symlinked page files to prevent escape outside the vault root.
+
+Feature-gated: exits 10 if ollama is unreachable or 11 if the embedding
+model is not pulled, so the calling skill can no-op gracefully. Exits 0
+on success. Exit 3 on cache corruption. Exit 2 on usage error.
+
+Concurrency:
+- Locks `.vault-meta/.tiling.lock` (flock exclusive) around cache I/O.
+- Per-PID temp file to avoid shared-tempfile races.
+
+Usage:
+  tiling-check.py                      # run; exit 10/11 if ollama/model missing
+  tiling-check.py --report PATH        # also write report to PATH
+  tiling-check.py --rebuild-cache      # ignore cached embeddings
+  tiling-check.py --peek               # structured diagnostics; no compute
+  tiling-check.py --allow-remote-ollama # accept non-localhost OLLAMA_URL
+"""
+
+import argparse
+import fcntl
+import hashlib
+import json
+import math
+import os
+import re
+import sys
+import urllib.error
+import urllib.parse
+import urllib.request
+from datetime import datetime
+from pathlib import Path
+
+DEFAULT_OLLAMA_URL = "http://127.0.0.1:11434"
+DEFAULT_MODEL = "nomic-embed-text"
+OLLAMA_TIMEOUT_SEC = 3
+EMBED_TIMEOUT_SEC = 30
+MAX_RESPONSE_BYTES = 4 * 1024 * 1024  # 4 MB; embeddings can be ~10 KB each
+
+VAULT_ROOT = Path(__file__).resolve().parent.parent
+WIKI_DIR = VAULT_ROOT / "wiki"
+META_DIR = VAULT_ROOT / ".vault-meta"
+CACHE_PATH = META_DIR / "tiling-cache.json"
+CACHE_LOCK = META_DIR / ".tiling.lock"
+THRESHOLDS_PATH = META_DIR / "tiling-thresholds.json"
+
+EXCLUDE_TYPES = {"meta", "fold"}
+EXCLUDE_FILENAMES = {
+    "_index.md", "index.md", "log.md", "hot.md", "overview.md",
+    "dashboard.md", "Wiki Map.md", "getting-started.md",
+}
+EXCLUDE_PATH_PREFIXES = ("wiki/folds/", "wiki/meta/")
+MAX_BODY_BYTES = 128 * 1024
+SCALE_WARN_PAGES = 500
+SCALE_HARD_FAIL_PAGES = 5000
+
+EXIT_OK = 0
+EXIT_USAGE = 2
+EXIT_CACHE_CORRUPT = 3
+EXIT_SCALE_EXCEEDED = 4
+EXIT_NO_OLLAMA = 10
+EXIT_NO_MODEL = 11
+
+FRONTMATTER_RE = re.compile(r"^---\n(.*?)\n---\n", re.DOTALL)
+TYPE_RE = re.compile(r"^type:\s*(\S+)", re.MULTILINE)
+
+
+def log(msg: str) -> None:
+    print(msg, file=sys.stderr)
+
+
+def _is_local_url(url: str) -> bool:
+    try:
+        host = urllib.parse.urlparse(url).hostname or ""
+    except ValueError:
+        return False
+    return host in ("127.0.0.1", "localhost", "::1")
+
+
+def _http_get_json(url: str, timeout: float) -> dict:
+    with urllib.request.urlopen(url, timeout=timeout) as resp:
+        raw = resp.read(MAX_RESPONSE_BYTES + 1)
+    if len(raw) > MAX_RESPONSE_BYTES:
+        raise RuntimeError("response exceeded size limit")
+    return json.loads(raw.decode("utf-8"))
+
+
+def _http_post_json(url: str, payload: dict, timeout: float) -> dict:
+    data = json.dumps(payload).encode("utf-8")
+    req = urllib.request.Request(url, data=data, headers={"Content-Type": "application/json"})
+    with urllib.request.urlopen(req, timeout=timeout) as resp:
+        raw = resp.read(MAX_RESPONSE_BYTES + 1)
+    if len(raw) > MAX_RESPONSE_BYTES:
+        raise RuntimeError("response exceeded size limit")
+    return json.loads(raw.decode("utf-8"))
+
+
+def detect_ollama(url: str) -> bool:
+    try:
+        _http_get_json(f"{url}/api/version", OLLAMA_TIMEOUT_SEC)
+        return True
+    except (urllib.error.URLError, OSError, ValueError, TimeoutError, RuntimeError):
+        return False
+
+
+def detect_model(url: str, model: str) -> bool:
+    try:
+        data = _http_get_json(f"{url}/api/tags", OLLAMA_TIMEOUT_SEC)
+    except (urllib.error.URLError, OSError, ValueError, TimeoutError, RuntimeError):
+        return False
+    models = data.get("models")
+    if not isinstance(models, list):
+        return False
+    for entry in models:
+        if not isinstance(entry, dict):
+            continue
+        name = entry.get("name", "")
+        if isinstance(name, str) and (name == model or name.startswith(f"{model}:")):
+            return True
+    return False
+
+
+def parse_frontmatter(text: str) -> tuple[dict, str]:
+    m = FRONTMATTER_RE.match(text)
+    if not m:
+        return {}, text
+    fm_raw = m.group(1)
+    body = text[m.end():]
+    fm: dict = {}
+    tm = TYPE_RE.search(fm_raw)
+    if tm:
+        fm["type"] = tm.group(1).strip().strip('"').strip("'")
+    return fm, body
+
+
+def body_hash(body: str, model: str) -> str:
+    h = hashlib.sha256()
+    h.update(f"model={model}\n".encode("utf-8"))
+    h.update(body.encode("utf-8"))
+    return h.hexdigest()
+
+
+def cosine(a: list[float], b: list[float]) -> float:
+    if len(a) != len(b):
+        raise ValueError(f"dim mismatch: {len(a)} vs {len(b)}")
+    dot = sum(x * y for x, y in zip(a, b))
+    na = math.sqrt(sum(x * x for x in a))
+    nb = math.sqrt(sum(x * x for x in b))
+    if na == 0.0 or nb == 0.0:
+        return 0.0
+    return dot / (na * nb)
+
+
+def _lock_cache():
+    META_DIR.mkdir(exist_ok=True)
+    fd = os.open(str(CACHE_LOCK), os.O_CREAT | os.O_RDWR, 0o644)
+    try:
+        fcntl.flock(fd, fcntl.LOCK_EX)
+    except OSError:
+        os.close(fd)
+        raise
+    return fd
+
+
+def _unlock_cache(fd: int) -> None:
+    try:
+        fcntl.flock(fd, fcntl.LOCK_UN)
+    finally:
+        os.close(fd)
+
+
+def load_cache(current_model: str) -> dict:
+    if not CACHE_PATH.exists():
+        return {"version": 1, "model": current_model, "embeddings": {}}
+    try:
+        with CACHE_PATH.open() as f:
+            data = json.load(f)
+    except (OSError, json.JSONDecodeError) as exc:
+        log(f"ERR: cache read failed: {exc}")
+        sys.exit(EXIT_CACHE_CORRUPT)
+    if data.get("version") != 1:
+        log(f"ERR: unknown cache version: {data.get('version')}")
+        sys.exit(EXIT_CACHE_CORRUPT)
+    cached_model = data.get("model", "")
+    if cached_model != current_model:
+        log(f"INFO: cached model '{cached_model}' differs from current '{current_model}'; invalidating cache")
+        return {"version": 1, "model": current_model, "embeddings": {}}
+    if not isinstance(data.get("embeddings"), dict):
+        log("ERR: cache.embeddings is not a dict")
+        sys.exit(EXIT_CACHE_CORRUPT)
+    return data
+
+
+def save_cache(cache: dict) -> None:
+    META_DIR.mkdir(exist_ok=True)
+    tmp = CACHE_PATH.with_name(f"{CACHE_PATH.stem}.{os.getpid()}.tmp")
+    with tmp.open("w") as f:
+        json.dump(cache, f, indent=2)
+    tmp.replace(CACHE_PATH)
+
+
+def load_thresholds() -> dict:
+    if not THRESHOLDS_PATH.exists():
+        return {
+            "version": 1, "model": DEFAULT_MODEL,
+            "bands": {"error": 0.90, "review": 0.80},
+            "calibrated": False, "calibration_pairs_labeled": 0,
+        }
+    with THRESHOLDS_PATH.open() as f:
+        return json.load(f)
+
+
+def included(path: Path, fm: dict) -> tuple[bool, str]:
+    rel = path.relative_to(VAULT_ROOT).as_posix()
+    if path.is_symlink():
+        return False, "symlink"
+    resolved = path.resolve()
+    try:
+        resolved.relative_to(VAULT_ROOT.resolve())
+    except ValueError:
+        return False, "escapes vault"
+    if path.name in EXCLUDE_FILENAMES:
+        return False, "excluded filename"
+    for prefix in EXCLUDE_PATH_PREFIXES:
+        if rel.startswith(prefix):
+            return False, f"under {prefix}"
+    if fm.get("type") in EXCLUDE_TYPES:
+        return False, f"type={fm['type']}"
+    return True, "included"
+
+
+def embed(text: str, model: str, url: str) -> list[float]:
+    data = _http_post_json(
+        f"{url}/api/embeddings",
+        {"model": model, "prompt": text},
+        EMBED_TIMEOUT_SEC,
+    )
+    emb = data.get("embedding")
+    if not isinstance(emb, list) or not emb:
+        raise RuntimeError(f"ollama returned no embedding: {str(data)[:200]}")
+    for v in emb:
+        if not isinstance(v, (int, float)):
+            raise RuntimeError("embedding contains non-numeric values")
+    return emb
+
+
+def run_check(
+    rebuild: bool,
+    report_path: Path | None,
+    ollama_url: str,
+    model: str,
+) -> int:
+    if not detect_ollama(ollama_url):
+        log(f"ollama not reachable at {ollama_url}; skipping tiling check")
+        return EXIT_NO_OLLAMA
+    if not detect_model(ollama_url, model):
+        log(f"model '{model}' not pulled; run: ollama pull {model}")
+        return EXIT_NO_MODEL
+
+    thresholds = load_thresholds()
+
+    lock_fd = _lock_cache()
+    try:
+        cache = (load_cache(model) if not rebuild
+                 else {"version": 1, "model": model, "embeddings": {}})
+
+        pages: list[tuple[str, list[float]]] = []
+        scanned = 0
+        computed = 0
+        cached_hits = 0
+        skipped_counts: dict[str, int] = {}
+        live_paths: set[str] = set()
+
+        candidates = sorted(WIKI_DIR.rglob("*.md"))
+        scale_n = len(candidates)
+        if scale_n > SCALE_HARD_FAIL_PAGES:
+            log(f"ERR: {scale_n} pages exceed hard-fail limit {SCALE_HARD_FAIL_PAGES}")
+            return EXIT_SCALE_EXCEEDED
+        if scale_n > SCALE_WARN_PAGES:
+            log(f"WARN: {scale_n} pages; cold-cache embed will issue ~{scale_n} POSTs to ollama")
+
+        for md in candidates:
+            scanned += 1
+            # Symlink and vault-root guards must run BEFORE read_text so a
+            # hostile symlink cannot cause off-vault content to be read and
+            # POSTed to the embedding endpoint.
+            if md.is_symlink():
+                skipped_counts["symlink"] = skipped_counts.get("symlink", 0) + 1
+                continue
+            try:
+                resolved = md.resolve(strict=True)
+                resolved.relative_to(VAULT_ROOT.resolve())
+            except (OSError, ValueError):
+                skipped_counts["escapes vault"] = skipped_counts.get("escapes vault", 0) + 1
+                continue
+            try:
+                text = md.read_text(encoding="utf-8")
+            except (OSError, UnicodeDecodeError):
+                skipped_counts["read_error"] = skipped_counts.get("read_error", 0) + 1
+                continue
+            if len(text.encode("utf-8")) > MAX_BODY_BYTES:
+                skipped_counts["too_large"] = skipped_counts.get("too_large", 0) + 1
+                continue
+            fm, body = parse_frontmatter(text)
+            ok, reason = included(md, fm)
+            if not ok:
+                skipped_counts[reason] = skipped_counts.get(reason, 0) + 1
+                continue
+            rel = md.relative_to(VAULT_ROOT).as_posix()
+            live_paths.add(rel)
+            h = body_hash(body, model)
+            entry = cache["embeddings"].get(rel)
+            if entry and entry.get("hash") == h:
+                pages.append((rel, entry["embedding"]))
+                cached_hits += 1
+                continue
+            try:
+                emb = embed(body, model, ollama_url)
+            except Exception as exc:
+                log(f"ERR embedding {rel}: {exc}")
+                skipped_counts["embed_error"] = skipped_counts.get("embed_error", 0) + 1
+                continue
+            cache["embeddings"][rel] = {
+                "hash": h,
+                "embedding": emb,
+                "computed_at": datetime.utcnow().isoformat(timespec="seconds") + "Z",
+            }
+            pages.append((rel, emb))
+            computed += 1
+
+        # Orphan GC: drop cache entries for paths that no longer exist.
+        orphans = [k for k in cache["embeddings"] if k not in live_paths]
+        for k in orphans:
+            del cache["embeddings"][k]
+
+        save_cache(cache)
+    finally:
+        _unlock_cache(lock_fd)
+
+    review = thresholds["bands"]["review"]
+    error_ = thresholds["bands"]["error"]
+    pairs: list[tuple[float, str, str]] = []
+    for i in range(len(pages)):
+        for j in range(i + 1, len(pages)):
+            a_path, a_emb = pages[i]
+            b_path, b_emb = pages[j]
+            try:
+                sim = cosine(a_emb, b_emb)
+            except ValueError as exc:
+                log(f"WARN cosine skip ({a_path}, {b_path}): {exc}")
+                continue
+            if sim >= review:
+                pairs.append((sim, a_path, b_path))
+    pairs.sort(reverse=True)
+
+    errors = [p for p in pairs if p[0] >= error_]
+    reviews = [p for p in pairs if review <= p[0] < error_]
+
+    out_lines: list[str] = []
+    out_lines.append("# Semantic Tiling Report")
+    out_lines.append("")
+    out_lines.append(f"- generated: {datetime.utcnow().isoformat(timespec='seconds')}Z")
+    out_lines.append(f"- model: {model}")
+    out_lines.append(f"- ollama_url: {ollama_url}")
+    out_lines.append(f"- thresholds: error>={error_}, review={review}-{error_}")
+    out_lines.append(f"- calibrated: {thresholds.get('calibrated', False)}"
+                     + (" (using uncalibrated defaults)" if not thresholds.get("calibrated") else ""))
+    out_lines.append(f"- pages scanned: {scanned}; embedded: {len(pages)}; skipped: {sum(skipped_counts.values())}")
+    if skipped_counts:
+        out_lines.append("- skipped reasons: " + ", ".join(f"{k}={v}" for k, v in sorted(skipped_counts.items())))
+    out_lines.append(f"- cache hits: {cached_hits}; recomputed: {computed}; orphans pruned: {len(orphans)}")
+    out_lines.append("")
+    out_lines.append(f"## Errors (similarity >= {error_})")
+    out_lines.append("")
+    if not errors:
+        out_lines.append("- none")
+    else:
+        for sim, a, b in errors:
+            out_lines.append(f"- `{sim:.4f}` {a} -- {b}")
+    out_lines.append("")
+    out_lines.append(f"## Review ({review} <= similarity < {error_})")
+    out_lines.append("")
+    if not reviews:
+        out_lines.append("- none")
+    else:
+        for sim, a, b in reviews:
+            out_lines.append(f"- `{sim:.4f}` {a} -- {b}")
+    report = "\n".join(out_lines) + "\n"
+
+    print(report)
+    if report_path is not None:
+        # Confine report writes to VAULT_ROOT. A path that resolves outside
+        # the vault is refused (prevents `--report /etc/passwd` style
+        # accidents or hostile args from writing outside the repo).
+        try:
+            resolved_report = (
+                report_path if report_path.is_absolute() else (Path.cwd() / report_path)
+            ).resolve()
+            resolved_report.relative_to(VAULT_ROOT.resolve())
+        except ValueError:
+            log(f"ERR: --report path '{report_path}' escapes vault root {VAULT_ROOT}")
+            return EXIT_USAGE
+        resolved_report.parent.mkdir(parents=True, exist_ok=True)
+        resolved_report.write_text(report, encoding="utf-8")
+        log(f"report written: {resolved_report}")
+
+    return EXIT_OK
+
+
+def cmd_peek(ollama_url: str, model: str) -> int:
+    """Structured diagnostics. Prints a JSON object and a plain summary."""
+    diag: dict = {}
+    script_path = Path(__file__).resolve()
+    diag["script_path"] = str(script_path)
+    diag["script_executable"] = os.access(script_path, os.X_OK)
+    diag["python"] = sys.executable
+    diag["vault_root"] = str(VAULT_ROOT)
+    diag["ollama_url"] = ollama_url
+    diag["ollama_reachable"] = detect_ollama(ollama_url)
+    diag["model_requested"] = model
+    diag["model_present"] = detect_model(ollama_url, model) if diag["ollama_reachable"] else False
+    diag["cache_present"] = CACHE_PATH.exists()
+    diag["cache_readable"] = False
+    diag["cache_entries"] = 0
+    diag["cache_model"] = None
+    if diag["cache_present"]:
+        try:
+            with CACHE_PATH.open() as f:
+                c = json.load(f)
+            diag["cache_readable"] = (c.get("version") == 1
+                                      and isinstance(c.get("embeddings"), dict))
+            diag["cache_entries"] = len(c.get("embeddings", {}))
+            diag["cache_model"] = c.get("model")
+        except (OSError, json.JSONDecodeError) as exc:
+            diag["cache_readable"] = False
+            diag["cache_error"] = str(exc)
+    diag["thresholds_present"] = THRESHOLDS_PATH.exists()
+    diag["thresholds_readable"] = False
+    if diag["thresholds_present"]:
+        try:
+            with THRESHOLDS_PATH.open() as f:
+                t = json.load(f)
+            diag["thresholds_readable"] = True
+            diag["thresholds_calibrated"] = bool(t.get("calibrated", False))
+            diag["thresholds_bands"] = t.get("bands", {})
+        except (OSError, json.JSONDecodeError):
+            diag["thresholds_readable"] = False
+    print(json.dumps(diag, indent=2))
+    if not diag["ollama_reachable"]:
+        return EXIT_NO_OLLAMA
+    if not diag["model_present"]:
+        return EXIT_NO_MODEL
+    if diag["cache_present"] and not diag["cache_readable"]:
+        return EXIT_CACHE_CORRUPT
+    return EXIT_OK
+
+
+def main(argv: list[str]) -> int:
+    p = argparse.ArgumentParser()
+    p.add_argument("--report", type=Path, default=None)
+    p.add_argument("--rebuild-cache", action="store_true")
+    p.add_argument("--peek", action="store_true")
+    p.add_argument("--allow-remote-ollama", action="store_true",
+                   help="allow OLLAMA_URL env override pointing outside localhost")
+    p.add_argument("--model", default=DEFAULT_MODEL)
+    args = p.parse_args(argv)
+
+    env_url = os.environ.get("OLLAMA_URL")
+    ollama_url = env_url or DEFAULT_OLLAMA_URL
+    if env_url and not _is_local_url(ollama_url) and not args.allow_remote_ollama:
+        log(f"ERR: OLLAMA_URL={ollama_url!r} is not localhost. "
+            f"Vault content would be POSTed to a non-local host. "
+            f"Pass --allow-remote-ollama to override.")
+        return EXIT_USAGE
+
+    if args.peek:
+        return cmd_peek(ollama_url, args.model)
+    return run_check(
+        rebuild=args.rebuild_cache,
+        report_path=args.report,
+        ollama_url=ollama_url,
+        model=args.model,
+    )
+
+
+if __name__ == "__main__":
+    sys.exit(main(sys.argv[1:]))