add claude-obsidian
This commit is contained in:
@@ -0,0 +1,496 @@
|
||||
#!/usr/bin/env python3
|
||||
"""tiling-check.py — DragonScale Mechanism 3: semantic tiling lint.
|
||||
|
||||
Computes per-page embeddings via a local ollama instance and reports
|
||||
candidate duplicate page pairs. Read-only; never modifies wiki pages.
|
||||
|
||||
Security model:
|
||||
- Defaults to http://127.0.0.1:11434. Remote ollama endpoints require
|
||||
--allow-remote-ollama explicitly (vault bodies are POSTed as embedding
|
||||
input; a hostile env var would otherwise exfiltrate content).
|
||||
- Rejects symlinked page files to prevent escape outside the vault root.
|
||||
|
||||
Feature-gated: exits 10 if ollama is unreachable or 11 if the embedding
|
||||
model is not pulled, so the calling skill can no-op gracefully. Exits 0
|
||||
on success. Exit 3 on cache corruption. Exit 2 on usage error.
|
||||
|
||||
Concurrency:
|
||||
- Locks `.vault-meta/.tiling.lock` (flock exclusive) around cache I/O.
|
||||
- Per-PID temp file to avoid shared-tempfile races.
|
||||
|
||||
Usage:
|
||||
tiling-check.py # run; exit 10/11 if ollama/model missing
|
||||
tiling-check.py --report PATH # also write report to PATH
|
||||
tiling-check.py --rebuild-cache # ignore cached embeddings
|
||||
tiling-check.py --peek # structured diagnostics; no compute
|
||||
tiling-check.py --allow-remote-ollama # accept non-localhost OLLAMA_URL
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import fcntl
|
||||
import hashlib
|
||||
import json
|
||||
import math
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import urllib.error
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
DEFAULT_OLLAMA_URL = "http://127.0.0.1:11434"
|
||||
DEFAULT_MODEL = "nomic-embed-text"
|
||||
OLLAMA_TIMEOUT_SEC = 3
|
||||
EMBED_TIMEOUT_SEC = 30
|
||||
MAX_RESPONSE_BYTES = 4 * 1024 * 1024 # 4 MB; embeddings can be ~10 KB each
|
||||
|
||||
VAULT_ROOT = Path(__file__).resolve().parent.parent
|
||||
WIKI_DIR = VAULT_ROOT / "wiki"
|
||||
META_DIR = VAULT_ROOT / ".vault-meta"
|
||||
CACHE_PATH = META_DIR / "tiling-cache.json"
|
||||
CACHE_LOCK = META_DIR / ".tiling.lock"
|
||||
THRESHOLDS_PATH = META_DIR / "tiling-thresholds.json"
|
||||
|
||||
EXCLUDE_TYPES = {"meta", "fold"}
|
||||
EXCLUDE_FILENAMES = {
|
||||
"_index.md", "index.md", "log.md", "hot.md", "overview.md",
|
||||
"dashboard.md", "Wiki Map.md", "getting-started.md",
|
||||
}
|
||||
EXCLUDE_PATH_PREFIXES = ("wiki/folds/", "wiki/meta/")
|
||||
MAX_BODY_BYTES = 128 * 1024
|
||||
SCALE_WARN_PAGES = 500
|
||||
SCALE_HARD_FAIL_PAGES = 5000
|
||||
|
||||
EXIT_OK = 0
|
||||
EXIT_USAGE = 2
|
||||
EXIT_CACHE_CORRUPT = 3
|
||||
EXIT_SCALE_EXCEEDED = 4
|
||||
EXIT_NO_OLLAMA = 10
|
||||
EXIT_NO_MODEL = 11
|
||||
|
||||
FRONTMATTER_RE = re.compile(r"^---\n(.*?)\n---\n", re.DOTALL)
|
||||
TYPE_RE = re.compile(r"^type:\s*(\S+)", re.MULTILINE)
|
||||
|
||||
|
||||
def log(msg: str) -> None:
|
||||
print(msg, file=sys.stderr)
|
||||
|
||||
|
||||
def _is_local_url(url: str) -> bool:
|
||||
try:
|
||||
host = urllib.parse.urlparse(url).hostname or ""
|
||||
except ValueError:
|
||||
return False
|
||||
return host in ("127.0.0.1", "localhost", "::1")
|
||||
|
||||
|
||||
def _http_get_json(url: str, timeout: float) -> dict:
|
||||
with urllib.request.urlopen(url, timeout=timeout) as resp:
|
||||
raw = resp.read(MAX_RESPONSE_BYTES + 1)
|
||||
if len(raw) > MAX_RESPONSE_BYTES:
|
||||
raise RuntimeError("response exceeded size limit")
|
||||
return json.loads(raw.decode("utf-8"))
|
||||
|
||||
|
||||
def _http_post_json(url: str, payload: dict, timeout: float) -> dict:
|
||||
data = json.dumps(payload).encode("utf-8")
|
||||
req = urllib.request.Request(url, data=data, headers={"Content-Type": "application/json"})
|
||||
with urllib.request.urlopen(req, timeout=timeout) as resp:
|
||||
raw = resp.read(MAX_RESPONSE_BYTES + 1)
|
||||
if len(raw) > MAX_RESPONSE_BYTES:
|
||||
raise RuntimeError("response exceeded size limit")
|
||||
return json.loads(raw.decode("utf-8"))
|
||||
|
||||
|
||||
def detect_ollama(url: str) -> bool:
|
||||
try:
|
||||
_http_get_json(f"{url}/api/version", OLLAMA_TIMEOUT_SEC)
|
||||
return True
|
||||
except (urllib.error.URLError, OSError, ValueError, TimeoutError, RuntimeError):
|
||||
return False
|
||||
|
||||
|
||||
def detect_model(url: str, model: str) -> bool:
|
||||
try:
|
||||
data = _http_get_json(f"{url}/api/tags", OLLAMA_TIMEOUT_SEC)
|
||||
except (urllib.error.URLError, OSError, ValueError, TimeoutError, RuntimeError):
|
||||
return False
|
||||
models = data.get("models")
|
||||
if not isinstance(models, list):
|
||||
return False
|
||||
for entry in models:
|
||||
if not isinstance(entry, dict):
|
||||
continue
|
||||
name = entry.get("name", "")
|
||||
if isinstance(name, str) and (name == model or name.startswith(f"{model}:")):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def parse_frontmatter(text: str) -> tuple[dict, str]:
|
||||
m = FRONTMATTER_RE.match(text)
|
||||
if not m:
|
||||
return {}, text
|
||||
fm_raw = m.group(1)
|
||||
body = text[m.end():]
|
||||
fm: dict = {}
|
||||
tm = TYPE_RE.search(fm_raw)
|
||||
if tm:
|
||||
fm["type"] = tm.group(1).strip().strip('"').strip("'")
|
||||
return fm, body
|
||||
|
||||
|
||||
def body_hash(body: str, model: str) -> str:
|
||||
h = hashlib.sha256()
|
||||
h.update(f"model={model}\n".encode("utf-8"))
|
||||
h.update(body.encode("utf-8"))
|
||||
return h.hexdigest()
|
||||
|
||||
|
||||
def cosine(a: list[float], b: list[float]) -> float:
|
||||
if len(a) != len(b):
|
||||
raise ValueError(f"dim mismatch: {len(a)} vs {len(b)}")
|
||||
dot = sum(x * y for x, y in zip(a, b))
|
||||
na = math.sqrt(sum(x * x for x in a))
|
||||
nb = math.sqrt(sum(x * x for x in b))
|
||||
if na == 0.0 or nb == 0.0:
|
||||
return 0.0
|
||||
return dot / (na * nb)
|
||||
|
||||
|
||||
def _lock_cache():
|
||||
META_DIR.mkdir(exist_ok=True)
|
||||
fd = os.open(str(CACHE_LOCK), os.O_CREAT | os.O_RDWR, 0o644)
|
||||
try:
|
||||
fcntl.flock(fd, fcntl.LOCK_EX)
|
||||
except OSError:
|
||||
os.close(fd)
|
||||
raise
|
||||
return fd
|
||||
|
||||
|
||||
def _unlock_cache(fd: int) -> None:
|
||||
try:
|
||||
fcntl.flock(fd, fcntl.LOCK_UN)
|
||||
finally:
|
||||
os.close(fd)
|
||||
|
||||
|
||||
def load_cache(current_model: str) -> dict:
|
||||
if not CACHE_PATH.exists():
|
||||
return {"version": 1, "model": current_model, "embeddings": {}}
|
||||
try:
|
||||
with CACHE_PATH.open() as f:
|
||||
data = json.load(f)
|
||||
except (OSError, json.JSONDecodeError) as exc:
|
||||
log(f"ERR: cache read failed: {exc}")
|
||||
sys.exit(EXIT_CACHE_CORRUPT)
|
||||
if data.get("version") != 1:
|
||||
log(f"ERR: unknown cache version: {data.get('version')}")
|
||||
sys.exit(EXIT_CACHE_CORRUPT)
|
||||
cached_model = data.get("model", "")
|
||||
if cached_model != current_model:
|
||||
log(f"INFO: cached model '{cached_model}' differs from current '{current_model}'; invalidating cache")
|
||||
return {"version": 1, "model": current_model, "embeddings": {}}
|
||||
if not isinstance(data.get("embeddings"), dict):
|
||||
log("ERR: cache.embeddings is not a dict")
|
||||
sys.exit(EXIT_CACHE_CORRUPT)
|
||||
return data
|
||||
|
||||
|
||||
def save_cache(cache: dict) -> None:
|
||||
META_DIR.mkdir(exist_ok=True)
|
||||
tmp = CACHE_PATH.with_name(f"{CACHE_PATH.stem}.{os.getpid()}.tmp")
|
||||
with tmp.open("w") as f:
|
||||
json.dump(cache, f, indent=2)
|
||||
tmp.replace(CACHE_PATH)
|
||||
|
||||
|
||||
def load_thresholds() -> dict:
|
||||
if not THRESHOLDS_PATH.exists():
|
||||
return {
|
||||
"version": 1, "model": DEFAULT_MODEL,
|
||||
"bands": {"error": 0.90, "review": 0.80},
|
||||
"calibrated": False, "calibration_pairs_labeled": 0,
|
||||
}
|
||||
with THRESHOLDS_PATH.open() as f:
|
||||
return json.load(f)
|
||||
|
||||
|
||||
def included(path: Path, fm: dict) -> tuple[bool, str]:
|
||||
rel = path.relative_to(VAULT_ROOT).as_posix()
|
||||
if path.is_symlink():
|
||||
return False, "symlink"
|
||||
resolved = path.resolve()
|
||||
try:
|
||||
resolved.relative_to(VAULT_ROOT.resolve())
|
||||
except ValueError:
|
||||
return False, "escapes vault"
|
||||
if path.name in EXCLUDE_FILENAMES:
|
||||
return False, "excluded filename"
|
||||
for prefix in EXCLUDE_PATH_PREFIXES:
|
||||
if rel.startswith(prefix):
|
||||
return False, f"under {prefix}"
|
||||
if fm.get("type") in EXCLUDE_TYPES:
|
||||
return False, f"type={fm['type']}"
|
||||
return True, "included"
|
||||
|
||||
|
||||
def embed(text: str, model: str, url: str) -> list[float]:
|
||||
data = _http_post_json(
|
||||
f"{url}/api/embeddings",
|
||||
{"model": model, "prompt": text},
|
||||
EMBED_TIMEOUT_SEC,
|
||||
)
|
||||
emb = data.get("embedding")
|
||||
if not isinstance(emb, list) or not emb:
|
||||
raise RuntimeError(f"ollama returned no embedding: {str(data)[:200]}")
|
||||
for v in emb:
|
||||
if not isinstance(v, (int, float)):
|
||||
raise RuntimeError("embedding contains non-numeric values")
|
||||
return emb
|
||||
|
||||
|
||||
def run_check(
|
||||
rebuild: bool,
|
||||
report_path: Path | None,
|
||||
ollama_url: str,
|
||||
model: str,
|
||||
) -> int:
|
||||
if not detect_ollama(ollama_url):
|
||||
log(f"ollama not reachable at {ollama_url}; skipping tiling check")
|
||||
return EXIT_NO_OLLAMA
|
||||
if not detect_model(ollama_url, model):
|
||||
log(f"model '{model}' not pulled; run: ollama pull {model}")
|
||||
return EXIT_NO_MODEL
|
||||
|
||||
thresholds = load_thresholds()
|
||||
|
||||
lock_fd = _lock_cache()
|
||||
try:
|
||||
cache = (load_cache(model) if not rebuild
|
||||
else {"version": 1, "model": model, "embeddings": {}})
|
||||
|
||||
pages: list[tuple[str, list[float]]] = []
|
||||
scanned = 0
|
||||
computed = 0
|
||||
cached_hits = 0
|
||||
skipped_counts: dict[str, int] = {}
|
||||
live_paths: set[str] = set()
|
||||
|
||||
candidates = sorted(WIKI_DIR.rglob("*.md"))
|
||||
scale_n = len(candidates)
|
||||
if scale_n > SCALE_HARD_FAIL_PAGES:
|
||||
log(f"ERR: {scale_n} pages exceed hard-fail limit {SCALE_HARD_FAIL_PAGES}")
|
||||
return EXIT_SCALE_EXCEEDED
|
||||
if scale_n > SCALE_WARN_PAGES:
|
||||
log(f"WARN: {scale_n} pages; cold-cache embed will issue ~{scale_n} POSTs to ollama")
|
||||
|
||||
for md in candidates:
|
||||
scanned += 1
|
||||
# Symlink and vault-root guards must run BEFORE read_text so a
|
||||
# hostile symlink cannot cause off-vault content to be read and
|
||||
# POSTed to the embedding endpoint.
|
||||
if md.is_symlink():
|
||||
skipped_counts["symlink"] = skipped_counts.get("symlink", 0) + 1
|
||||
continue
|
||||
try:
|
||||
resolved = md.resolve(strict=True)
|
||||
resolved.relative_to(VAULT_ROOT.resolve())
|
||||
except (OSError, ValueError):
|
||||
skipped_counts["escapes vault"] = skipped_counts.get("escapes vault", 0) + 1
|
||||
continue
|
||||
try:
|
||||
text = md.read_text(encoding="utf-8")
|
||||
except (OSError, UnicodeDecodeError):
|
||||
skipped_counts["read_error"] = skipped_counts.get("read_error", 0) + 1
|
||||
continue
|
||||
if len(text.encode("utf-8")) > MAX_BODY_BYTES:
|
||||
skipped_counts["too_large"] = skipped_counts.get("too_large", 0) + 1
|
||||
continue
|
||||
fm, body = parse_frontmatter(text)
|
||||
ok, reason = included(md, fm)
|
||||
if not ok:
|
||||
skipped_counts[reason] = skipped_counts.get(reason, 0) + 1
|
||||
continue
|
||||
rel = md.relative_to(VAULT_ROOT).as_posix()
|
||||
live_paths.add(rel)
|
||||
h = body_hash(body, model)
|
||||
entry = cache["embeddings"].get(rel)
|
||||
if entry and entry.get("hash") == h:
|
||||
pages.append((rel, entry["embedding"]))
|
||||
cached_hits += 1
|
||||
continue
|
||||
try:
|
||||
emb = embed(body, model, ollama_url)
|
||||
except Exception as exc:
|
||||
log(f"ERR embedding {rel}: {exc}")
|
||||
skipped_counts["embed_error"] = skipped_counts.get("embed_error", 0) + 1
|
||||
continue
|
||||
cache["embeddings"][rel] = {
|
||||
"hash": h,
|
||||
"embedding": emb,
|
||||
"computed_at": datetime.utcnow().isoformat(timespec="seconds") + "Z",
|
||||
}
|
||||
pages.append((rel, emb))
|
||||
computed += 1
|
||||
|
||||
# Orphan GC: drop cache entries for paths that no longer exist.
|
||||
orphans = [k for k in cache["embeddings"] if k not in live_paths]
|
||||
for k in orphans:
|
||||
del cache["embeddings"][k]
|
||||
|
||||
save_cache(cache)
|
||||
finally:
|
||||
_unlock_cache(lock_fd)
|
||||
|
||||
review = thresholds["bands"]["review"]
|
||||
error_ = thresholds["bands"]["error"]
|
||||
pairs: list[tuple[float, str, str]] = []
|
||||
for i in range(len(pages)):
|
||||
for j in range(i + 1, len(pages)):
|
||||
a_path, a_emb = pages[i]
|
||||
b_path, b_emb = pages[j]
|
||||
try:
|
||||
sim = cosine(a_emb, b_emb)
|
||||
except ValueError as exc:
|
||||
log(f"WARN cosine skip ({a_path}, {b_path}): {exc}")
|
||||
continue
|
||||
if sim >= review:
|
||||
pairs.append((sim, a_path, b_path))
|
||||
pairs.sort(reverse=True)
|
||||
|
||||
errors = [p for p in pairs if p[0] >= error_]
|
||||
reviews = [p for p in pairs if review <= p[0] < error_]
|
||||
|
||||
out_lines: list[str] = []
|
||||
out_lines.append("# Semantic Tiling Report")
|
||||
out_lines.append("")
|
||||
out_lines.append(f"- generated: {datetime.utcnow().isoformat(timespec='seconds')}Z")
|
||||
out_lines.append(f"- model: {model}")
|
||||
out_lines.append(f"- ollama_url: {ollama_url}")
|
||||
out_lines.append(f"- thresholds: error>={error_}, review={review}-{error_}")
|
||||
out_lines.append(f"- calibrated: {thresholds.get('calibrated', False)}"
|
||||
+ (" (using uncalibrated defaults)" if not thresholds.get("calibrated") else ""))
|
||||
out_lines.append(f"- pages scanned: {scanned}; embedded: {len(pages)}; skipped: {sum(skipped_counts.values())}")
|
||||
if skipped_counts:
|
||||
out_lines.append("- skipped reasons: " + ", ".join(f"{k}={v}" for k, v in sorted(skipped_counts.items())))
|
||||
out_lines.append(f"- cache hits: {cached_hits}; recomputed: {computed}; orphans pruned: {len(orphans)}")
|
||||
out_lines.append("")
|
||||
out_lines.append(f"## Errors (similarity >= {error_})")
|
||||
out_lines.append("")
|
||||
if not errors:
|
||||
out_lines.append("- none")
|
||||
else:
|
||||
for sim, a, b in errors:
|
||||
out_lines.append(f"- `{sim:.4f}` {a} -- {b}")
|
||||
out_lines.append("")
|
||||
out_lines.append(f"## Review ({review} <= similarity < {error_})")
|
||||
out_lines.append("")
|
||||
if not reviews:
|
||||
out_lines.append("- none")
|
||||
else:
|
||||
for sim, a, b in reviews:
|
||||
out_lines.append(f"- `{sim:.4f}` {a} -- {b}")
|
||||
report = "\n".join(out_lines) + "\n"
|
||||
|
||||
print(report)
|
||||
if report_path is not None:
|
||||
# Confine report writes to VAULT_ROOT. A path that resolves outside
|
||||
# the vault is refused (prevents `--report /etc/passwd` style
|
||||
# accidents or hostile args from writing outside the repo).
|
||||
try:
|
||||
resolved_report = (
|
||||
report_path if report_path.is_absolute() else (Path.cwd() / report_path)
|
||||
).resolve()
|
||||
resolved_report.relative_to(VAULT_ROOT.resolve())
|
||||
except ValueError:
|
||||
log(f"ERR: --report path '{report_path}' escapes vault root {VAULT_ROOT}")
|
||||
return EXIT_USAGE
|
||||
resolved_report.parent.mkdir(parents=True, exist_ok=True)
|
||||
resolved_report.write_text(report, encoding="utf-8")
|
||||
log(f"report written: {resolved_report}")
|
||||
|
||||
return EXIT_OK
|
||||
|
||||
|
||||
def cmd_peek(ollama_url: str, model: str) -> int:
|
||||
"""Structured diagnostics. Prints a JSON object and a plain summary."""
|
||||
diag: dict = {}
|
||||
script_path = Path(__file__).resolve()
|
||||
diag["script_path"] = str(script_path)
|
||||
diag["script_executable"] = os.access(script_path, os.X_OK)
|
||||
diag["python"] = sys.executable
|
||||
diag["vault_root"] = str(VAULT_ROOT)
|
||||
diag["ollama_url"] = ollama_url
|
||||
diag["ollama_reachable"] = detect_ollama(ollama_url)
|
||||
diag["model_requested"] = model
|
||||
diag["model_present"] = detect_model(ollama_url, model) if diag["ollama_reachable"] else False
|
||||
diag["cache_present"] = CACHE_PATH.exists()
|
||||
diag["cache_readable"] = False
|
||||
diag["cache_entries"] = 0
|
||||
diag["cache_model"] = None
|
||||
if diag["cache_present"]:
|
||||
try:
|
||||
with CACHE_PATH.open() as f:
|
||||
c = json.load(f)
|
||||
diag["cache_readable"] = (c.get("version") == 1
|
||||
and isinstance(c.get("embeddings"), dict))
|
||||
diag["cache_entries"] = len(c.get("embeddings", {}))
|
||||
diag["cache_model"] = c.get("model")
|
||||
except (OSError, json.JSONDecodeError) as exc:
|
||||
diag["cache_readable"] = False
|
||||
diag["cache_error"] = str(exc)
|
||||
diag["thresholds_present"] = THRESHOLDS_PATH.exists()
|
||||
diag["thresholds_readable"] = False
|
||||
if diag["thresholds_present"]:
|
||||
try:
|
||||
with THRESHOLDS_PATH.open() as f:
|
||||
t = json.load(f)
|
||||
diag["thresholds_readable"] = True
|
||||
diag["thresholds_calibrated"] = bool(t.get("calibrated", False))
|
||||
diag["thresholds_bands"] = t.get("bands", {})
|
||||
except (OSError, json.JSONDecodeError):
|
||||
diag["thresholds_readable"] = False
|
||||
print(json.dumps(diag, indent=2))
|
||||
if not diag["ollama_reachable"]:
|
||||
return EXIT_NO_OLLAMA
|
||||
if not diag["model_present"]:
|
||||
return EXIT_NO_MODEL
|
||||
if diag["cache_present"] and not diag["cache_readable"]:
|
||||
return EXIT_CACHE_CORRUPT
|
||||
return EXIT_OK
|
||||
|
||||
|
||||
def main(argv: list[str]) -> int:
|
||||
p = argparse.ArgumentParser()
|
||||
p.add_argument("--report", type=Path, default=None)
|
||||
p.add_argument("--rebuild-cache", action="store_true")
|
||||
p.add_argument("--peek", action="store_true")
|
||||
p.add_argument("--allow-remote-ollama", action="store_true",
|
||||
help="allow OLLAMA_URL env override pointing outside localhost")
|
||||
p.add_argument("--model", default=DEFAULT_MODEL)
|
||||
args = p.parse_args(argv)
|
||||
|
||||
env_url = os.environ.get("OLLAMA_URL")
|
||||
ollama_url = env_url or DEFAULT_OLLAMA_URL
|
||||
if env_url and not _is_local_url(ollama_url) and not args.allow_remote_ollama:
|
||||
log(f"ERR: OLLAMA_URL={ollama_url!r} is not localhost. "
|
||||
f"Vault content would be POSTed to a non-local host. "
|
||||
f"Pass --allow-remote-ollama to override.")
|
||||
return EXIT_USAGE
|
||||
|
||||
if args.peek:
|
||||
return cmd_peek(ollama_url, args.model)
|
||||
return run_check(
|
||||
rebuild=args.rebuild_cache,
|
||||
report_path=args.report,
|
||||
ollama_url=ollama_url,
|
||||
model=args.model,
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main(sys.argv[1:]))
|
||||
Reference in New Issue
Block a user