Files
MultiPhysicsVault/scripts/tiling-check.py
T
김경종 72dad72703
Tests / Hermetic test suite (push) Has been cancelled
Tests / Skill frontmatter validation (push) Has been cancelled
add claude-obsidian
2026-05-28 10:57:16 +09:00

497 lines
18 KiB
Python

#!/usr/bin/env python3
"""tiling-check.py — DragonScale Mechanism 3: semantic tiling lint.
Computes per-page embeddings via a local ollama instance and reports
candidate duplicate page pairs. Read-only; never modifies wiki pages.
Security model:
- Defaults to http://127.0.0.1:11434. Remote ollama endpoints require
--allow-remote-ollama explicitly (vault bodies are POSTed as embedding
input; a hostile env var would otherwise exfiltrate content).
- Rejects symlinked page files to prevent escape outside the vault root.
Feature-gated: exits 10 if ollama is unreachable or 11 if the embedding
model is not pulled, so the calling skill can no-op gracefully. Exits 0
on success. Exit 2 on usage error. Exit 3 on cache corruption. Exit 4
when the number of wiki pages exceeds the hard scale limit.
Concurrency:
- Locks `.vault-meta/.tiling.lock` (flock exclusive) around cache I/O.
- Per-PID temp file to avoid shared-tempfile races.
Usage:
tiling-check.py # run; exit 10/11 if ollama/model missing
tiling-check.py --report PATH # also write report to PATH
tiling-check.py --rebuild-cache # ignore cached embeddings
tiling-check.py --peek # structured diagnostics; no compute
tiling-check.py --allow-remote-ollama # accept non-localhost OLLAMA_URL
tiling-check.py --model NAME # embedding model (default: nomic-embed-text)
"""
import argparse
import fcntl
import hashlib
import json
import math
import os
import re
import sys
import urllib.error
import urllib.parse
import urllib.request
from datetime import datetime
from pathlib import Path
# --- ollama endpoint and HTTP limits ---------------------------------------
DEFAULT_OLLAMA_URL = "http://127.0.0.1:11434"
DEFAULT_MODEL = "nomic-embed-text"
OLLAMA_TIMEOUT_SEC = 3  # probes: /api/version and /api/tags
EMBED_TIMEOUT_SEC = 30  # one embedding POST
MAX_RESPONSE_BYTES = 4 * 1024 * 1024  # 4 MB; embeddings can be ~10 KB each

# --- vault layout (this script lives one directory below the vault root) ----
VAULT_ROOT = Path(__file__).resolve().parent.parent
WIKI_DIR = VAULT_ROOT / "wiki"
META_DIR = VAULT_ROOT / ".vault-meta"
CACHE_PATH = META_DIR / "tiling-cache.json"
CACHE_LOCK = META_DIR / ".tiling.lock"
THRESHOLDS_PATH = META_DIR / "tiling-thresholds.json"

# --- pages that never take part in the comparison ---------------------------
EXCLUDE_TYPES = {"meta", "fold"}  # matched against the frontmatter `type:`
EXCLUDE_FILENAMES = {
    "_index.md",
    "index.md",
    "log.md",
    "hot.md",
    "overview.md",
    "dashboard.md",
    "Wiki Map.md",
    "getting-started.md",
}
EXCLUDE_PATH_PREFIXES = (
    "wiki/folds/",
    "wiki/meta/",
)  # vault-relative POSIX prefixes

# --- size and scale guards ---------------------------------------------------
MAX_BODY_BYTES = 128 * 1024  # larger pages are skipped as "too_large"
SCALE_WARN_PAGES = 500  # above this a warning is logged
SCALE_HARD_FAIL_PAGES = 5000  # above this the run aborts (EXIT_SCALE_EXCEEDED)

# --- process exit codes ------------------------------------------------------
EXIT_OK = 0
EXIT_USAGE = 2
EXIT_CACHE_CORRUPT = 3
EXIT_SCALE_EXCEEDED = 4
EXIT_NO_OLLAMA = 10
EXIT_NO_MODEL = 11
# Leading frontmatter block delimited by `---` lines; group 1 is its raw text.
FRONTMATTER_RE = re.compile(r"^---\n(.*?)\n---\n", re.DOTALL)
# `type:` key inside the frontmatter. Only spaces/tabs may separate the key
# from its value: `\s*` would also consume the newline, so an empty `type:`
# line would capture the first token of the *next* line as the page type.
TYPE_RE = re.compile(r"^type:[ \t]*(\S+)", re.MULTILINE)
def log(msg: str) -> None:
    """Emit one diagnostic line on stderr (stdout is used for report/JSON output)."""
    print(msg, end="\n", file=sys.stderr)
def _is_local_url(url: str) -> bool:
    """Return True when the host part of *url* is a loopback name or address.

    Only the literal hosts ``127.0.0.1``, ``localhost`` and ``::1`` count as
    local. A URL that cannot be parsed, or that has no host component, is
    treated as non-local.
    """
    loopback_hosts = {"127.0.0.1", "localhost", "::1"}
    try:
        hostname = urllib.parse.urlparse(url).hostname
    except ValueError:
        # urlparse rejects malformed netlocs (e.g. an unbalanced IPv6 bracket).
        return False
    if not hostname:
        return False
    return hostname in loopback_hosts
def _http_get_json(url: str, timeout: float) -> dict:
    """GET *url* and return its body parsed as UTF-8 JSON.

    Raises:
        RuntimeError: the body is larger than MAX_RESPONSE_BYTES.
        ValueError: the body is not valid UTF-8 / JSON.
        Whatever ``urllib.request.urlopen`` raises on connection problems.
    """
    with urllib.request.urlopen(url, timeout=timeout) as response:
        # Request one byte more than the cap so an oversized body can be told
        # apart from one that is exactly at the limit.
        payload = response.read(MAX_RESPONSE_BYTES + 1)
    if len(payload) > MAX_RESPONSE_BYTES:
        raise RuntimeError("response exceeded size limit")
    text = payload.decode("utf-8")
    return json.loads(text)
def _http_post_json(url: str, payload: dict, timeout: float) -> dict:
    """POST *payload* as JSON to *url* and return the JSON-decoded reply.

    Raises:
        RuntimeError: the reply is larger than MAX_RESPONSE_BYTES.
        ValueError: the reply is not valid UTF-8 / JSON.
        Whatever ``urllib.request.urlopen`` raises on connection problems.
    """
    request = urllib.request.Request(
        url,
        data=json.dumps(payload).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(request, timeout=timeout) as response:
        # Read one byte past the cap to detect an oversized reply.
        reply = response.read(MAX_RESPONSE_BYTES + 1)
    if len(reply) > MAX_RESPONSE_BYTES:
        raise RuntimeError("response exceeded size limit")
    text = reply.decode("utf-8")
    return json.loads(text)
def detect_ollama(url: str) -> bool:
    """Return True when an ollama server answers ``GET {url}/api/version``.

    Any failure to obtain a JSON reply (connection refused, timeout, HTTP
    error status, oversized or non-JSON body, malformed HTTP) yields False so
    the caller can exit with EXIT_NO_OLLAMA instead of crashing.
    """
    # Function-scope import: the module imports urllib.* but never binds the
    # name ``http``.
    import http.client

    try:
        _http_get_json(f"{url}/api/version", OLLAMA_TIMEOUT_SEC)
    except (
        urllib.error.URLError,
        # Protocol-level failures (e.g. BadStatusLine when something that is
        # not an HTTP server listens on the port) are raised by urlopen
        # unwrapped and are NOT OSError subclasses.
        http.client.HTTPException,
        OSError,
        ValueError,
        TimeoutError,
        RuntimeError,
    ):
        return False
    return True
def detect_model(url: str, model: str) -> bool:
    """Return True when *model* appears in the reply of ``GET {url}/api/tags``.

    An entry matches when its ``name`` equals *model* exactly or starts with
    ``"{model}:"`` (the model name followed by a tag). Any transport failure
    or unexpected reply shape yields False so the caller can exit with
    EXIT_NO_MODEL instead of crashing.
    """
    # Function-scope import: the module imports urllib.* but never binds the
    # name ``http``.
    import http.client

    try:
        data = _http_get_json(f"{url}/api/tags", OLLAMA_TIMEOUT_SEC)
    except (
        urllib.error.URLError,
        # Malformed HTTP replies surface as HTTPException, which is not an
        # OSError subclass.
        http.client.HTTPException,
        OSError,
        ValueError,
        TimeoutError,
        RuntimeError,
    ):
        return False
    # The reply may be any JSON value; only an object can carry "models".
    if not isinstance(data, dict):
        return False
    models = data.get("models")
    if not isinstance(models, list):
        return False
    tagged_prefix = f"{model}:"
    for entry in models:
        if not isinstance(entry, dict):
            continue
        name = entry.get("name", "")
        if isinstance(name, str) and (name == model or name.startswith(tagged_prefix)):
            return True
    return False
def parse_frontmatter(text: str) -> tuple[dict, str]:
    """Split *text* into ``(frontmatter, body)``.

    Only the ``type`` key is extracted from the frontmatter; surrounding
    whitespace and quote characters are stripped from its value. When *text*
    has no leading frontmatter block the result is ``({}, text)``.
    """
    block = FRONTMATTER_RE.match(text)
    if block is None:
        return {}, text

    meta: dict = {}
    type_match = TYPE_RE.search(block.group(1))
    if type_match is not None:
        value = type_match.group(1).strip()
        meta["type"] = value.strip('"').strip("'")
    # Everything after the closing `---` line is the page body.
    return meta, text[block.end():]
def body_hash(body: str, model: str) -> str:
    """Return the hex SHA-256 of *body* prefixed with a ``model=<name>`` line.

    The model name is part of the digest, so the same body hashed for a
    different model produces a different value.
    """
    prefix = f"model={model}\n".encode("utf-8")
    return hashlib.sha256(prefix + body.encode("utf-8")).hexdigest()
def cosine(a: list[float], b: list[float]) -> float:
    """Return the cosine similarity of vectors *a* and *b*.

    Returns 0.0 when either vector has zero norm (including two empty
    vectors).

    Raises:
        ValueError: the vectors have different lengths.
    """
    len_a, len_b = len(a), len(b)
    if len_a != len_b:
        raise ValueError(f"dim mismatch: {len_a} vs {len_b}")
    dot = sum([x * y for x, y in zip(a, b)])
    norm_a = math.sqrt(sum([x * x for x in a]))
    norm_b = math.sqrt(sum([y * y for y in b]))
    if norm_a == 0.0:
        return 0.0
    if norm_b == 0.0:
        return 0.0
    return dot / (norm_a * norm_b)
def _lock_cache():
    """Take an exclusive flock on CACHE_LOCK and return the open descriptor.

    META_DIR and the lock file are created when missing. The call blocks
    until the lock is granted. The descriptor is closed again if locking
    fails, and the OSError is re-raised.
    """
    META_DIR.mkdir(exist_ok=True)
    lock_fd = os.open(os.fspath(CACHE_LOCK), os.O_RDWR | os.O_CREAT, 0o644)
    try:
        fcntl.flock(lock_fd, fcntl.LOCK_EX)
    except OSError:
        # Do not leak the descriptor when the lock cannot be taken.
        os.close(lock_fd)
        raise
    else:
        return lock_fd
def _unlock_cache(fd: int) -> None:
    """Release the flock held on *fd* and close the descriptor.

    The descriptor is closed even when the explicit unlock call fails.
    """
    try:
        fcntl.flock(fd, fcntl.LOCK_UN)
    finally:
        os.close(fd)
def load_cache(current_model: str) -> dict:
    """Load the embedding cache from CACHE_PATH.

    Returns:
        The parsed cache, or a fresh empty cache when the file does not exist
        or was written for a model other than *current_model*.

    Exits the process with EXIT_CACHE_CORRUPT when the file cannot be read or
    decoded, its root is not a JSON object, its version is not 1, or its
    ``embeddings`` member is not an object.
    """
    fresh = {"version": 1, "model": current_model, "embeddings": {}}
    try:
        with CACHE_PATH.open(encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        # No cache yet. Opening directly (instead of exists() + open()) also
        # covers a file that vanishes between the two calls.
        return fresh
    except (OSError, ValueError) as exc:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        log(f"ERR: cache read failed: {exc}")
        sys.exit(EXIT_CACHE_CORRUPT)
    if not isinstance(data, dict):
        # Valid JSON whose root is e.g. a list would otherwise fail on .get().
        log("ERR: cache root is not a JSON object")
        sys.exit(EXIT_CACHE_CORRUPT)
    if data.get("version") != 1:
        log(f"ERR: unknown cache version: {data.get('version')}")
        sys.exit(EXIT_CACHE_CORRUPT)
    cached_model = data.get("model", "")
    if cached_model != current_model:
        log(f"INFO: cached model '{cached_model}' differs from current '{current_model}'; invalidating cache")
        return fresh
    if not isinstance(data.get("embeddings"), dict):
        log("ERR: cache.embeddings is not a dict")
        sys.exit(EXIT_CACHE_CORRUPT)
    return data
def save_cache(cache: dict) -> None:
    """Write *cache* to CACHE_PATH via a per-PID temp file and a rename.

    The temp file lives next to CACHE_PATH and is moved over it only after
    the JSON has been written completely. If writing or renaming fails, the
    temp file is removed so failed runs do not leave ``*.tmp`` files behind,
    and the original exception propagates.
    """
    META_DIR.mkdir(exist_ok=True)
    tmp = CACHE_PATH.with_name(f"{CACHE_PATH.stem}.{os.getpid()}.tmp")
    try:
        with tmp.open("w", encoding="utf-8") as f:
            json.dump(cache, f, indent=2)
        tmp.replace(CACHE_PATH)
    except BaseException:
        # Best-effort cleanup; a failure to unlink must not mask the error
        # that brought us here.
        try:
            tmp.unlink()
        except OSError:
            pass
        raise
def load_thresholds() -> dict:
    """Load similarity thresholds from THRESHOLDS_PATH.

    Returns:
        The parsed thresholds object when the file exists and carries numeric
        ``bands.error`` and ``bands.review`` values; otherwise the built-in
        uncalibrated defaults (error 0.90, review 0.80).

    A file that is unreadable, not valid JSON, or lacks usable bands is
    reported on stderr and replaced by the defaults instead of crashing the
    run with a traceback.
    """
    defaults = {
        "version": 1, "model": DEFAULT_MODEL,
        "bands": {"error": 0.90, "review": 0.80},
        "calibrated": False, "calibration_pairs_labeled": 0,
    }
    try:
        with THRESHOLDS_PATH.open(encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        return defaults
    except (OSError, ValueError) as exc:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        log(f"WARN: thresholds read failed ({exc}); using uncalibrated defaults")
        return defaults

    bands = data.get("bands") if isinstance(data, dict) else None
    usable = isinstance(bands, dict) and all(
        # bool is an int subclass; True/False are not meaningful thresholds.
        isinstance(bands.get(key), (int, float)) and not isinstance(bands.get(key), bool)
        for key in ("error", "review")
    )
    if not usable:
        log("WARN: thresholds file lacks numeric bands.error/bands.review; using uncalibrated defaults")
        return defaults
    return data
def included(path: Path, fm: dict) -> tuple[bool, str]:
    """Decide whether the page at *path* takes part in the tiling comparison.

    Returns ``(True, "included")`` or ``(False, reason)``. Checks run in this
    order: symlink, resolved location outside the vault, excluded filename,
    excluded path prefix, excluded frontmatter type.

    Raises:
        ValueError: *path* is not located under VAULT_ROOT.
    """
    rel = path.relative_to(VAULT_ROOT).as_posix()
    if path.is_symlink():
        return False, "symlink"

    resolved = path.resolve()
    try:
        resolved.relative_to(VAULT_ROOT.resolve())
    except ValueError:
        return False, "escapes vault"

    if path.name in EXCLUDE_FILENAMES:
        return False, "excluded filename"

    matched_prefix = next(
        (prefix for prefix in EXCLUDE_PATH_PREFIXES if rel.startswith(prefix)),
        None,
    )
    if matched_prefix is not None:
        return False, f"under {matched_prefix}"

    page_type = fm.get("type")
    if page_type in EXCLUDE_TYPES:
        return False, f"type={page_type}"
    return True, "included"
def embed(text: str, model: str, url: str) -> list[float]:
    """Request an embedding for *text* from ``POST {url}/api/embeddings``.

    Raises:
        RuntimeError: the reply has no non-empty ``embedding`` list, or the
            list contains a value that is neither int nor float.
        Whatever ``_http_post_json`` raises on transport or decoding errors.
    """
    request_body = {"model": model, "prompt": text}
    reply = _http_post_json(f"{url}/api/embeddings", request_body, EMBED_TIMEOUT_SEC)
    vector = reply.get("embedding")
    if not (isinstance(vector, list) and vector):
        # Include a truncated copy of the reply to aid debugging.
        raise RuntimeError(f"ollama returned no embedding: {str(reply)[:200]}")
    if not all(isinstance(value, (int, float)) for value in vector):
        raise RuntimeError("embedding contains non-numeric values")
    return vector
def _utc_timestamp() -> str:
    """Return the current UTC time formatted as ``YYYY-MM-DDTHH:MM:SSZ``."""
    # datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC
    # "now" and drop tzinfo so isoformat() emits no "+00:00" offset before the
    # literal "Z" is appended.
    from datetime import timezone

    now = datetime.now(timezone.utc).replace(tzinfo=None)
    return now.isoformat(timespec="seconds") + "Z"


def _resolve_report_path(report_path: Path) -> Path | None:
    """Return the resolved report path, or None when it lies outside VAULT_ROOT."""
    absolute = report_path if report_path.is_absolute() else Path.cwd() / report_path
    resolved = absolute.resolve()
    try:
        resolved.relative_to(VAULT_ROOT.resolve())
    except ValueError:
        return None
    return resolved


def _similar_pairs(
    pages: list[tuple[str, list[float]]], review: float
) -> list[tuple[float, str, str]]:
    """Return ``(similarity, path_a, path_b)`` for all pairs scoring >= *review*, best first."""
    pairs: list[tuple[float, str, str]] = []
    for i, (a_path, a_emb) in enumerate(pages):
        for b_path, b_emb in pages[i + 1:]:
            try:
                sim = cosine(a_emb, b_emb)
            except (ValueError, TypeError) as exc:
                # ValueError: dimension mismatch. TypeError: a cached vector
                # holding non-numeric values (e.g. a hand-edited cache).
                log(f"WARN cosine skip ({a_path}, {b_path}): {exc}")
                continue
            if sim >= review:
                pairs.append((sim, a_path, b_path))
    pairs.sort(reverse=True)
    return pairs


def run_check(
    rebuild: bool,
    report_path: Path | None,
    ollama_url: str,
    model: str,
) -> int:
    """Embed every eligible wiki page and report near-duplicate page pairs.

    Args:
        rebuild: ignore cached embeddings and recompute everything.
        report_path: optional file to write the report to; must resolve to a
            location inside VAULT_ROOT.
        ollama_url: base URL of the ollama server.
        model: embedding model name.

    Returns:
        EXIT_OK on success; EXIT_NO_OLLAMA / EXIT_NO_MODEL when the feature
        gate fails; EXIT_USAGE when *report_path* escapes the vault;
        EXIT_SCALE_EXCEEDED when there are more than SCALE_HARD_FAIL_PAGES
        candidate pages.

    The report is always printed to stdout. Cache reads and writes happen
    under the exclusive cache lock. An invalid *report_path* is rejected
    right after the ollama/model probes, before any page is embedded, so a
    usage error does not cost a full embedding pass.
    """
    if not detect_ollama(ollama_url):
        log(f"ollama not reachable at {ollama_url}; skipping tiling check")
        return EXIT_NO_OLLAMA
    if not detect_model(ollama_url, model):
        log(f"model '{model}' not pulled; run: ollama pull {model}")
        return EXIT_NO_MODEL

    # Confine report writes to VAULT_ROOT. A path that resolves outside the
    # vault is refused (prevents `--report /etc/passwd` style accidents or
    # hostile args from writing outside the repo).
    resolved_report: Path | None = None
    if report_path is not None:
        resolved_report = _resolve_report_path(report_path)
        if resolved_report is None:
            log(f"ERR: --report path '{report_path}' escapes vault root {VAULT_ROOT}")
            return EXIT_USAGE

    thresholds = load_thresholds()
    vault_root = VAULT_ROOT.resolve()

    pages: list[tuple[str, list[float]]] = []
    computed = 0
    cached_hits = 0
    skipped_counts: dict[str, int] = {}
    live_paths: set[str] = set()

    def skip(reason: str) -> None:
        """Count one skipped page under *reason*."""
        skipped_counts[reason] = skipped_counts.get(reason, 0) + 1

    lock_fd = _lock_cache()
    try:
        cache = (load_cache(model) if not rebuild
                 else {"version": 1, "model": model, "embeddings": {}})
        candidates = sorted(WIKI_DIR.rglob("*.md"))
        scanned = len(candidates)
        if scanned > SCALE_HARD_FAIL_PAGES:
            log(f"ERR: {scanned} pages exceed hard-fail limit {SCALE_HARD_FAIL_PAGES}")
            return EXIT_SCALE_EXCEEDED
        if scanned > SCALE_WARN_PAGES:
            log(f"WARN: {scanned} pages; cold-cache embed will issue ~{scanned} POSTs to ollama")

        for md in candidates:
            # Symlink and vault-root guards must run BEFORE read_text so a
            # hostile symlink cannot cause off-vault content to be read and
            # POSTed to the embedding endpoint.
            if md.is_symlink():
                skip("symlink")
                continue
            try:
                md.resolve(strict=True).relative_to(vault_root)
            except (OSError, ValueError):
                skip("escapes vault")
                continue
            try:
                text = md.read_text(encoding="utf-8")
            except (OSError, UnicodeDecodeError):
                skip("read_error")
                continue
            if len(text.encode("utf-8")) > MAX_BODY_BYTES:
                skip("too_large")
                continue
            fm, body = parse_frontmatter(text)
            ok, reason = included(md, fm)
            if not ok:
                skip(reason)
                continue

            rel = md.relative_to(VAULT_ROOT).as_posix()
            live_paths.add(rel)
            h = body_hash(body, model)
            entry = cache["embeddings"].get(rel)
            # Trust a cache entry only when it has the expected shape; a
            # malformed entry is recomputed instead of crashing the run.
            if (
                isinstance(entry, dict)
                and entry.get("hash") == h
                and isinstance(entry.get("embedding"), list)
                and entry["embedding"]
            ):
                pages.append((rel, entry["embedding"]))
                cached_hits += 1
                continue
            try:
                emb = embed(body, model, ollama_url)
            except Exception as exc:
                # Boundary handler: one failing page must not abort the scan.
                log(f"ERR embedding {rel}: {exc}")
                skip("embed_error")
                continue
            cache["embeddings"][rel] = {
                "hash": h,
                "embedding": emb,
                "computed_at": _utc_timestamp(),
            }
            pages.append((rel, emb))
            computed += 1

        # Orphan GC: drop cache entries for paths that no longer exist.
        orphans = [k for k in cache["embeddings"] if k not in live_paths]
        for k in orphans:
            del cache["embeddings"][k]
        save_cache(cache)
    finally:
        _unlock_cache(lock_fd)

    review = thresholds["bands"]["review"]
    error_ = thresholds["bands"]["error"]
    pairs = _similar_pairs(pages, review)
    errors = [p for p in pairs if p[0] >= error_]
    reviews = [p for p in pairs if review <= p[0] < error_]

    out_lines: list[str] = [
        "# Semantic Tiling Report",
        "",
        f"- generated: {_utc_timestamp()}",
        f"- model: {model}",
        f"- ollama_url: {ollama_url}",
        f"- thresholds: error>={error_}, review={review}-{error_}",
        f"- calibrated: {thresholds.get('calibrated', False)}"
        + (" (using uncalibrated defaults)" if not thresholds.get("calibrated") else ""),
        f"- pages scanned: {scanned}; embedded: {len(pages)}; skipped: {sum(skipped_counts.values())}",
    ]
    if skipped_counts:
        out_lines.append("- skipped reasons: " + ", ".join(f"{k}={v}" for k, v in sorted(skipped_counts.items())))
    out_lines.append(f"- cache hits: {cached_hits}; recomputed: {computed}; orphans pruned: {len(orphans)}")
    out_lines.append("")
    out_lines.append(f"## Errors (similarity >= {error_})")
    out_lines.append("")
    if errors:
        out_lines.extend(f"- `{sim:.4f}` {a} -- {b}" for sim, a, b in errors)
    else:
        out_lines.append("- none")
    out_lines.append("")
    out_lines.append(f"## Review ({review} <= similarity < {error_})")
    out_lines.append("")
    if reviews:
        out_lines.extend(f"- `{sim:.4f}` {a} -- {b}" for sim, a, b in reviews)
    else:
        out_lines.append("- none")

    report = "\n".join(out_lines) + "\n"
    print(report)
    if resolved_report is not None:
        resolved_report.parent.mkdir(parents=True, exist_ok=True)
        resolved_report.write_text(report, encoding="utf-8")
        log(f"report written: {resolved_report}")
    return EXIT_OK
def cmd_peek(ollama_url: str, model: str) -> int:
    """Print structured diagnostics as a JSON object on stdout; no embedding work.

    Reports script/interpreter paths, ollama reachability, model presence and
    the state of the cache and thresholds files. Malformed cache or
    thresholds files are reported in the diagnostics rather than raising.

    Returns:
        EXIT_NO_OLLAMA when ollama is unreachable, EXIT_NO_MODEL when the
        model is missing, EXIT_CACHE_CORRUPT when a cache file exists but is
        not usable, otherwise EXIT_OK.
    """
    script_path = Path(__file__).resolve()
    reachable = detect_ollama(ollama_url)
    # Insertion order is the key order of the printed JSON.
    diag: dict = {
        "script_path": str(script_path),
        "script_executable": os.access(script_path, os.X_OK),
        "python": sys.executable,
        "vault_root": str(VAULT_ROOT),
        "ollama_url": ollama_url,
        "ollama_reachable": reachable,
        "model_requested": model,
        # Only probe for the model when the server answered at all.
        "model_present": detect_model(ollama_url, model) if reachable else False,
        "cache_present": CACHE_PATH.exists(),
        "cache_readable": False,
        "cache_entries": 0,
        "cache_model": None,
    }

    if diag["cache_present"]:
        try:
            with CACHE_PATH.open() as f:
                c = json.load(f)
        except (OSError, ValueError) as exc:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            diag["cache_error"] = str(exc)
        else:
            if isinstance(c, dict):
                embeddings = c.get("embeddings")
                entries_ok = isinstance(embeddings, dict)
                diag["cache_readable"] = c.get("version") == 1 and entries_ok
                diag["cache_entries"] = len(embeddings) if entries_ok else 0
                diag["cache_model"] = c.get("model")
            else:
                # Valid JSON, but not the object the cache format requires.
                diag["cache_error"] = "cache root is not a JSON object"

    diag["thresholds_present"] = THRESHOLDS_PATH.exists()
    diag["thresholds_readable"] = False
    if diag["thresholds_present"]:
        try:
            with THRESHOLDS_PATH.open() as f:
                t = json.load(f)
        except (OSError, ValueError):
            pass  # thresholds_readable stays False
        else:
            if isinstance(t, dict):
                diag["thresholds_readable"] = True
                diag["thresholds_calibrated"] = bool(t.get("calibrated", False))
                diag["thresholds_bands"] = t.get("bands", {})

    print(json.dumps(diag, indent=2))
    if not diag["ollama_reachable"]:
        return EXIT_NO_OLLAMA
    if not diag["model_present"]:
        return EXIT_NO_MODEL
    if diag["cache_present"] and not diag["cache_readable"]:
        return EXIT_CACHE_CORRUPT
    return EXIT_OK
def main(argv: list[str]) -> int:
    """Parse *argv*, enforce the localhost-only policy and dispatch.

    The ollama base URL comes from the ``OLLAMA_URL`` environment variable
    when set, otherwise DEFAULT_OLLAMA_URL. Trailing slashes are removed so
    that request URLs built as ``f"{url}/api/..."`` never contain ``//api``.

    Returns:
        EXIT_USAGE when ``OLLAMA_URL`` points at a non-local host without
        ``--allow-remote-ollama``; otherwise the exit code of ``cmd_peek``
        (with ``--peek``) or ``run_check``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--report", type=Path, default=None)
    parser.add_argument("--rebuild-cache", action="store_true")
    parser.add_argument("--peek", action="store_true")
    parser.add_argument("--allow-remote-ollama", action="store_true",
                        help="allow OLLAMA_URL env override pointing outside localhost")
    parser.add_argument("--model", default=DEFAULT_MODEL)
    args = parser.parse_args(argv)

    env_url = os.environ.get("OLLAMA_URL")
    ollama_url = (env_url or DEFAULT_OLLAMA_URL).rstrip("/")
    # Only an environment override can point away from localhost; page bodies
    # are POSTed to this URL, so a remote host needs an explicit opt-in.
    if env_url and not _is_local_url(ollama_url) and not args.allow_remote_ollama:
        log(f"ERR: OLLAMA_URL={env_url!r} is not localhost. "
            f"Vault content would be POSTed to a non-local host. "
            f"Pass --allow-remote-ollama to override.")
        return EXIT_USAGE

    if args.peek:
        return cmd_peek(ollama_url, args.model)
    return run_check(
        rebuild=args.rebuild_cache,
        report_path=args.report,
        ollama_url=ollama_url,
        model=args.model,
    )
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main(sys.argv[1:]))