add claude-obsidian
Tests / Hermetic test suite (push) Has been cancelled
Tests / Skill frontmatter validation (push) Has been cancelled

This commit is contained in:
김경종
2026-05-28 10:57:16 +09:00
parent 1b07531a45
commit 72dad72703
205 changed files with 41703 additions and 80 deletions
+102
View File
@@ -0,0 +1,102 @@
#!/usr/bin/env bash
# allocate-address.sh — atomic creation-order address allocation for the vault.
#
# Reserves the next address of the form c-NNNNNN and increments the counter
# under an exclusive flock. On missing counter file, recovers by scanning the
# vault for the highest existing c-NNNNNN in page frontmatter and resuming from
# max+1. Never silently resets to 1 in a non-empty vault.
#
# Usage:
# ./scripts/allocate-address.sh # prints the reserved address (e.g. c-000042) to stdout
# ./scripts/allocate-address.sh --peek # prints the next value without incrementing
# ./scripts/allocate-address.sh --rebuild # recomputes counter from max observed and exits
#
# Exit codes:
# 0 — success
# 1 — lock acquisition failed (another writer is holding the lock)
# 2 — vault-meta directory missing and cannot be created
# 3 — counter value corrupt or non-numeric, or unknown mode argument
set -euo pipefail

# The vault root is the parent of the directory that holds this script.
VAULT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
COUNTER_FILE="${VAULT_ROOT}/.vault-meta/address-counter.txt"
LOCK_FILE="${VAULT_ROOT}/.vault-meta/.address.lock"
WIKI_DIR="${VAULT_ROOT}/wiki"

# First CLI argument selects the mode; with no argument the script allocates.
MODE="${1:-allocate}"

# The counter and the lock file both live in .vault-meta/; create it on demand.
if ! mkdir -p "$(dirname "$COUNTER_FILE")"; then
  echo "ERR: cannot create .vault-meta/" >&2
  exit 2
fi

# Open the lock file on fd 9 and take an exclusive flock, waiting at most 5s.
# Nothing below closes fd 9, so the lock is held for the rest of the script.
exec 9>"$LOCK_FILE"
flock -x -w 5 9 || {
  echo "ERR: could not acquire address allocator lock within 5s" >&2
  exit 1
}
scan_max_c_address() {
  # Print the largest NNNNNN (as a plain integer) found on "address: c-NNNNNN"
  # lines inside the first YAML frontmatter block of each .md file under
  # $WIKI_DIR. Only files whose very first line is "---" are considered, and
  # scanning of a file stops at the closing "---", so body prose and code-block
  # examples never count. Prints 0 when the wiki directory is missing or no
  # address is found.
  [ -d "$WIKI_DIR" ] || { echo 0; return; }

  find "$WIKI_DIR" -type f -name '*.md' -print0 2>/dev/null \
    | xargs -0 awk '
        # Line 1 decides whether this file has frontmatter at all.
        FNR == 1 { in_fm = ($0 == "---"); next }
        # Closing delimiter: nothing after it can be frontmatter.
        in_fm && $0 == "---" { nextfile }
        in_fm && /^address:[[:space:]]+c-[0-9]{6}[[:space:]]*$/ {
          if (match($0, /c-[0-9]{6}/)) {
            print substr($0, RSTART, RLENGTH)
          }
        }
      ' 2>/dev/null \
    | sed 's/^c-0*//;s/^$/0/' \
    | sort -n \
    | tail -1 \
    | awk 'BEGIN{n=0} {n=$0} END{print (n+0)}'
}
read_or_recover_counter() {
  # Print the next unallocated counter value as a plain decimal integer.
  #
  # If the counter file is missing it is first recreated from a vault scan
  # (highest observed address + 1) and an INFO line is written to stderr.
  # Exits with status 3 when the file content is not a run of decimal digits.
  if [ ! -f "$COUNTER_FILE" ]; then
    local max_c
    max_c="$(scan_max_c_address)"
    echo $((max_c + 1)) > "$COUNTER_FILE"
    echo "INFO: counter file missing; recovered from vault scan, set to $((max_c + 1))" >&2
  fi
  local raw
  raw="$(cat "$COUNTER_FILE")"
  if ! [[ "$raw" =~ ^[0-9]+$ ]]; then
    echo "ERR: counter file content is not a positive integer: $raw" >&2
    exit 3
  fi
  # Force base 10. A hand-edited value such as "000042" or "08" passes the
  # digit check above, but bash arithmetic and printf %d would read a leading
  # zero as octal ("08" is an error, "010" silently becomes 8).
  echo "$((10#$raw))"
}
# Replace the counter file's content with "$1" by writing a sibling temp file
# and renaming it over the target. A plain `> "$COUNTER_FILE"` truncates first,
# so an interrupted write could leave an empty counter that fails the numeric
# check (exit 3) on the next run.
write_counter_atomic() {
  local tmp="${COUNTER_FILE}.tmp.$$"
  echo "$1" > "$tmp"
  mv -f "$tmp" "$COUNTER_FILE"
}

case "$MODE" in
  --peek)
    # Report the next value without consuming it.
    read_or_recover_counter
    ;;
  --rebuild)
    # Recompute the counter from the highest address observed in the vault.
    max_c="$(scan_max_c_address)"
    write_counter_atomic "$((max_c + 1))"
    echo "Counter rebuilt: next = $((max_c + 1))"
    ;;
  allocate|"")
    # Persist the incremented counter BEFORE printing the reserved address, so
    # a crash between the two steps skips an address rather than reusing one.
    current="$(read_or_recover_counter)"
    next=$((current + 1))
    write_counter_atomic "$next"
    printf 'c-%06d\n' "$current"
    ;;
  *)
    echo "ERR: unknown mode: $MODE" >&2
    echo "Usage: $0 [allocate|--peek|--rebuild]" >&2
    exit 3
    ;;
esac
+167
View File
@@ -0,0 +1,167 @@
#!/usr/bin/env python3
"""baseline-v16.py — simulate the v1.6 hot→index→drill retrieval chain.
Exists ONLY for benchmarking v1.7's hybrid retrieval against the legacy
v1.6 behavior. Not used by any v1.7 skill; not feature-gated; not part of
the regular vault workflow.
The v1.6 query path (per skills/wiki-query/SKILL.md before v1.7):
1. Read wiki/hot.md (always; quick context)
2. Read wiki/index.md (scan for descriptions matching query terms)
3. Read top-N pages cited in the index whose entries best match query
4. Caller synthesizes answer
This script approximates that path by:
1. Tokenizing the query (same stopword-filtered, Unicode-aware tokenizer as bm25-index.py)
2. Scoring each *.md page in wiki/ by the count of distinct query terms it contains
(case-insensitive substring on the full file body; no semantic matching)
3. Returning top-K pages by score; on top of the distinct-term count, the score adds:
a. Presence in hot.md (boost +5)
b. Presence in index.md (boost +3)
c. Total raw term-occurrence count
The simulation is intentionally simple — it represents what a human or a
basic agent does when reading hot/index "by hand" without any retrieval
infrastructure. Anything fancier would not be a fair v1.6 baseline.
Usage:
baseline-v16.py "your query" [--top 5]
baseline-v16.py "query" --top 5 --json # output as JSON (default: text)
Exit codes:
0 — success
2 — usage error
3 — wiki directory missing
"""
import argparse
import json
import re
import sys
from collections import Counter
from pathlib import Path
# The vault root is two directory levels above this file.
VAULT_ROOT = Path(__file__).resolve().parent.parent
WIKI_DIR = VAULT_ROOT / "wiki"
# The two files whose text is searched for page citations when boosting.
HOT_PATH = WIKI_DIR / "hot.md"
INDEX_PATH = WIKI_DIR / "index.md"
# Mirror bm25-index.py's tokenizer + stopword list so comparisons are fair.
STOPWORDS = frozenset("""
a an and are as at be by for from has have he her him his i if in is it its
of on or that the their them they this to was were will with you your
""".split())
# Mirrors bm25-index.py's Unicode-aware tokenizer (v1.7.2; closes M2).
# A token starts with a word character and may continue with word characters,
# apostrophes, or hyphens.
TOKEN_RE = re.compile(r"\w[\w'\-]*", re.UNICODE)
# Amounts added to a page's score when hot.md / index.md mention it.
HOT_BOOST = 5.0
INDEX_BOOST = 3.0
# Process exit codes (see the module docstring).
EXIT_OK = 0
EXIT_USAGE = 2
EXIT_NO_WIKI = 3
def tokenize(text):
    """Split *text* into lowercase terms.

    Tokens are the matches of TOKEN_RE; single-character tokens and tokens
    whose lowercase form is in STOPWORDS are dropped. Order and duplicates
    are preserved.
    """
    terms = []
    for raw in TOKEN_RE.findall(text):
        lowered = raw.lower()
        if len(raw) > 1 and lowered not in STOPWORDS:
            terms.append(lowered)
    return terms
def page_paths():
    """Return the sorted list of markdown pages under WIKI_DIR.

    Pages inside hidden files or directories (any path component starting
    with ".") are skipped. Hidden components are judged relative to WIKI_DIR
    only: testing the absolute path would also reject every page whenever the
    vault itself lives below a dot-directory.

    Prints an error to stderr and exits with EXIT_NO_WIKI when WIKI_DIR is
    not a directory.
    """
    if not WIKI_DIR.is_dir():
        print(f"ERR: no wiki directory at {WIKI_DIR}", file=sys.stderr)
        sys.exit(EXIT_NO_WIKI)
    return sorted(
        p for p in WIKI_DIR.rglob("*.md")
        if not any(part.startswith(".")
                   for part in p.relative_to(WIKI_DIR).parts)
    )
# Text of hot.md / index.md keyed by path (None = missing or unreadable).
# Filled lazily so the files are read once per process, not once per page.
_CITATION_TEXT_CACHE = {}


def _citation_text(path):
    """Return the cached text of *path*, or None if it is missing or unreadable."""
    if path not in _CITATION_TEXT_CACHE:
        text = None
        if path.is_file():
            try:
                text = path.read_text(encoding="utf-8", errors="replace")
            except OSError:
                text = None
        _CITATION_TEXT_CACHE[path] = text
    return _CITATION_TEXT_CACHE[path]


def score_page(page_path, query_terms_set, query_terms_counter):
    """Score one page against the query terms.

    The base score is the number of distinct query terms found as substrings
    of the lowercased page text, plus 0.01 per raw occurrence. HOT_BOOST and
    INDEX_BOOST are then added when hot.md / index.md contain the page's
    filename stem or its vault-relative path.

    Args:
        page_path: path of the page to score.
        query_terms_set: distinct (lowercase) query terms.
        query_terms_counter: accepted for interface compatibility; not used.

    Returns:
        (score, distinct_matches, total_occurrences); (0.0, 0, 0) when the
        page cannot be read.
    """
    try:
        body = page_path.read_text(encoding="utf-8", errors="replace").lower()
    except OSError:
        return (0.0, 0, 0)
    distinct = sum(1 for term in query_terms_set if term in body)
    total = sum(body.count(term) for term in query_terms_set)
    score = float(distinct) + 0.01 * total  # distinct dominates; total is tiebreak

    page_stem = page_path.stem
    for source, boost in ((HOT_PATH, HOT_BOOST), (INDEX_PATH, INDEX_BOOST)):
        cited_text = _citation_text(source)
        if cited_text is None:
            continue
        # Substring match on the stem first; fall back to the relative path.
        if (page_stem in cited_text
                or str(page_path.relative_to(VAULT_ROOT)) in cited_text):
            score += boost
    return (score, distinct, total)
def baseline_query(query, top_k=5):
    """Return the top-K pages for *query* as a list of result dicts.

    Each dict has the keys "path" (relative to VAULT_ROOT), "score" (rounded
    to 4 places), "distinct_terms" and "total_occurrences". Results are
    ordered by descending score; the sort is stable, so ties keep the
    page_paths() order. Returns [] when the query has no usable terms.

    Only pages containing at least one query term are candidates. Filtering
    on the boosted score instead would return every page cited in hot.md or
    index.md for any query at all, including queries that match nothing.
    """
    terms = tokenize(query)
    if not terms:
        return []
    terms_set = set(terms)
    terms_counter = Counter(terms)
    scored = []
    for p in page_paths():
        score, distinct, total = score_page(p, terms_set, terms_counter)
        if distinct <= 0:
            continue
        scored.append({
            "path": str(p.relative_to(VAULT_ROOT)),
            "score": round(score, 4),
            "distinct_terms": distinct,
            "total_occurrences": total,
        })
    scored.sort(key=lambda d: d["score"], reverse=True)
    return scored[:top_k]
def main():
    """CLI entry point: run the baseline query and print text or JSON.

    Returns EXIT_OK. Invalid arguments (including --top < 1) terminate via
    argparse with exit status 2.
    """
    parser = argparse.ArgumentParser(description="v1.6 baseline retrieval simulator.")
    parser.add_argument("query", help="Natural-language query")
    parser.add_argument("--top", type=int, default=5, help="Top-K results")
    parser.add_argument("--json", action="store_true", help="Output as JSON")
    args = parser.parse_args()
    # A zero or negative K would be applied as a slice bound and silently
    # return an empty or tail-truncated list; reject it as a usage error.
    if args.top < 1:
        parser.error("--top must be >= 1")
    results = baseline_query(args.query, top_k=args.top)
    if args.json:
        print(json.dumps({
            "query": args.query,
            "strategy": "baseline-v1.6:hot+index+keyword",
            "top_k": args.top,
            "candidates": results,
        }, indent=2))
    elif not results:
        print("(no matches)")
    else:
        print(f"v1.6 baseline for: {args.query!r}")
        for i, r in enumerate(results, 1):
            print(f" {i}. {r['path']} score={r['score']} distinct={r['distinct_terms']} occ={r['total_occurrences']}")
    return EXIT_OK


if __name__ == "__main__":
    sys.exit(main())
+215
View File
@@ -0,0 +1,215 @@
#!/usr/bin/env python3
"""benchmark-runner.py — score v1.7 hybrid retrieval vs v1.6 baseline.
Reads the 50-query corpus at wiki/meta/retrieval-benchmark-v1.7.md, runs both
pipelines for each query, scores top-1 / top-5 accuracy, prints a comparison
table. Used by the v1.7.0 audit.
Pure stdlib + subprocess. No network or LLM calls of its own — the subprocess
calls to retrieve.py may hit ollama (if installed) for rerank. baseline-v16.py
is pure filesystem.
Usage:
benchmark-runner.py # run all 50 queries, print summary
benchmark-runner.py --json results.json # also write per-query results
benchmark-runner.py --limit 5 # smoke: first 5 queries only
"""
import argparse
import json
import re
import subprocess
import sys
from pathlib import Path
# The vault root is two directory levels above this file.
VAULT_ROOT = Path(__file__).resolve().parent.parent
# Markdown file holding the benchmark queries that parse_corpus() reads.
CORPUS = VAULT_ROOT / "wiki" / "meta" / "retrieval-benchmark-v1.7.md"
def _corpus_field(body, field):
    """Return the value of the ``- <field>: value`` line in *body*, or ""."""
    # [ \t]* (not \s*) keeps the match on one line: with \s* an empty value
    # would swallow the newline and capture the NEXT line as this field.
    m = re.search(rf"^- {re.escape(field)}:[ \t]*(.*)$", body, re.MULTILINE)
    return m.group(1).strip() if m else ""


def _corpus_list(body, field):
    """Return a comma-separated field as a list; empty or "null" gives []."""
    raw = _corpus_field(body, field)
    if not raw or raw == "null":
        return []
    return [s.strip() for s in raw.split(",") if s.strip()]


def parse_corpus(corpus_path):
    """Parse the ``### <id>`` blocks of the corpus file into query dicts.

    Only headings whose id matches ``D<digits>`` or ``H<digits>`` are treated
    as queries; other ``###`` sections are ignored. Each block ends at the
    next ``## `` header.

    Args:
        corpus_path: path object of the corpus markdown file (read as UTF-8).

    Returns:
        A list of dicts with keys "id", "query", "correct", "relevant",
        "category" and "rationale". "correct" and "relevant" are lists; the
        others are strings ("" when the field is absent).
    """
    text = corpus_path.read_text(encoding="utf-8")
    queries = []
    # Element 0 is the prelude before the first "### " heading.
    for blk in re.split(r"\n### ", text)[1:]:
        heading, newline, rest = blk.partition("\n")
        if not newline:
            continue
        qid = heading.strip()
        # Ignore non-ID headings (e.g. "Schema", "Scoring rules").
        if not re.match(r"^[DH]\d+$", qid):
            continue
        # Stop at the next "## " section header.
        body = re.split(r"\n## ", rest, maxsplit=1)[0]
        queries.append({
            "id": qid,
            "query": _corpus_field(body, "query"),
            "correct": _corpus_list(body, "correct"),
            "relevant": _corpus_list(body, "relevant"),
            "category": _corpus_field(body, "category"),
            "rationale": _corpus_field(body, "rationale"),
        })
    return queries
def run_v17(query, top_k=5):
    """Run scripts/retrieve.py for *query* and collect its ranked page paths.

    Returns:
        (paths, error): *paths* is the ordered list of "page_path" values
        from the JSON "candidates" array and *error* is None on success. On
        any failure (non-zero exit, timeout, unparsable or malformed output)
        *paths* is [] and *error* is a short description.
    """
    # Launch with the interpreter running this script rather than whatever
    # "python3" resolves to on PATH.
    cmd = [sys.executable or "python3",
           str(VAULT_ROOT / "scripts" / "retrieve.py"),
           query, "--top", str(top_k)]
    try:
        result = subprocess.run(
            cmd, capture_output=True, text=True, timeout=60, check=False,
        )
        if result.returncode != 0:
            return [], f"rc={result.returncode}: {result.stderr.strip()[:200]}"
        data = json.loads(result.stdout)
        return [c["page_path"] for c in data.get("candidates", [])], None
    except (subprocess.TimeoutExpired, json.JSONDecodeError, OSError) as e:
        return [], str(e)
    except (KeyError, TypeError, AttributeError) as e:
        # Valid JSON of an unexpected shape must not abort the whole run.
        return [], f"malformed output: {e!r}"
def run_v16(query, top_k=5):
    """Run scripts/baseline-v16.py for *query* and collect its ranked paths.

    Returns:
        (paths, error): *paths* is the ordered list of "path" values from the
        JSON "candidates" array and *error* is None on success. On any
        failure (non-zero exit, timeout, unparsable or malformed output)
        *paths* is [] and *error* is a short description.
    """
    # Launch with the interpreter running this script rather than whatever
    # "python3" resolves to on PATH.
    cmd = [sys.executable or "python3",
           str(VAULT_ROOT / "scripts" / "baseline-v16.py"),
           query, "--top", str(top_k), "--json"]
    try:
        result = subprocess.run(
            cmd, capture_output=True, text=True, timeout=30, check=False,
        )
        if result.returncode != 0:
            # Include the head of stderr, matching run_v17's error format.
            return [], f"rc={result.returncode}: {result.stderr.strip()[:200]}"
        data = json.loads(result.stdout)
        return [c["path"] for c in data.get("candidates", [])], None
    except (subprocess.TimeoutExpired, json.JSONDecodeError, OSError) as e:
        return [], str(e)
    except (KeyError, TypeError, AttributeError) as e:
        # Valid JSON of an unexpected shape must not abort the whole run.
        return [], f"malformed output: {e!r}"
def score_query(results, correct, relevant, category):
    """Score one ranked result list; returns (top1_success, top5_success) as 0/1.

    A query is treated as negative when its category is "negative" or its
    *correct* list is empty. For a negative query an empty result list is a
    full success; otherwise hits are judged against *relevant*. For a normal
    query hits are judged against *correct*. Top-5 always looks at the first
    five results.
    """
    is_negative = category == "negative" or not correct
    if is_negative and not results:
        return (1, 1)  # no results = correctly "found nothing"
    accepted = relevant if is_negative else correct
    first_hit = bool(results) and results[0] in accepted
    any_hit = any(candidate in accepted for candidate in results[:5])
    return (int(first_hit), int(any_hit))
def main():
    """CLI entry point: run both pipelines over the corpus and print a comparison.

    Prints one progress line per query, then a per-category table, the total
    row and the two ship-gate verdicts. With --json PATH the summary,
    per-category stats and per-query records are also written to PATH.
    Exits with an error message when the corpus file is missing and with a
    usage error when --limit is negative.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--limit", type=int, default=None, help="Only run first N queries")
    parser.add_argument("--json", help="Write per-query results to PATH")
    parser.add_argument("--top", type=int, default=5)
    args = parser.parse_args()
    # A negative limit would be applied as a slice bound and silently drop the
    # LAST N queries instead of keeping the first N.
    if args.limit is not None and args.limit < 0:
        parser.error("--limit must be >= 0")
    if not CORPUS.is_file():
        sys.exit(f"ERR: benchmark corpus not found at {CORPUS}")
    queries = parse_corpus(CORPUS)
    if args.limit:
        queries = queries[: args.limit]
    print(f"Parsed {len(queries)} queries from {CORPUS.relative_to(VAULT_ROOT)}\n")
    per_query = []
    cat_stats = {}  # category -> {v17_t1, v17_t5, v16_t1, v16_t5, n}
    for q in queries:
        v17_results, v17_err = run_v17(q["query"], top_k=args.top)
        v16_results, v16_err = run_v16(q["query"], top_k=args.top)
        v17_top1, v17_top5 = score_query(v17_results, q["correct"], q["relevant"], q["category"])
        v16_top1, v16_top5 = score_query(v16_results, q["correct"], q["relevant"], q["category"])
        record = {
            "id": q["id"],
            "category": q["category"],
            "query": q["query"][:80] + ("..." if len(q["query"]) > 80 else ""),
            "correct": q["correct"],
            "v17_top1": v17_top1,
            "v17_top5": v17_top5,
            "v17_results": v17_results[:args.top],
            "v17_err": v17_err,
            "v16_top1": v16_top1,
            "v16_top5": v16_top5,
            "v16_results": v16_results[:args.top],
            "v16_err": v16_err,
        }
        per_query.append(record)
        cat = q["category"]
        if cat not in cat_stats:
            cat_stats[cat] = {"v17_t1": 0, "v17_t5": 0, "v16_t1": 0, "v16_t5": 0, "n": 0}
        cat_stats[cat]["v17_t1"] += v17_top1
        cat_stats[cat]["v17_t5"] += v17_top5
        cat_stats[cat]["v16_t1"] += v16_top1
        cat_stats[cat]["v16_t5"] += v16_top5
        cat_stats[cat]["n"] += 1
        # Live progress: a visible one-character marker for hit and miss alike,
        # so the columns stay aligned and a top-1 hit is not rendered as blank.
        marker = "✓" if v17_top1 else "·"
        v16marker = "✓" if v16_top1 else "·"
        print(f" {q['id']:4} [{q['category']:14}] v17:{marker} v16:{v16marker} {q['query'][:60]}")
    # Aggregate
    total_v17_t1 = sum(c["v17_t1"] for c in cat_stats.values())
    total_v17_t5 = sum(c["v17_t5"] for c in cat_stats.values())
    total_v16_t1 = sum(c["v16_t1"] for c in cat_stats.values())
    total_v16_t5 = sum(c["v16_t5"] for c in cat_stats.values())
    total_n = sum(c["n"] for c in cat_stats.values())

    def pct(x, n):
        return f"{100.0 * x / n:5.1f}%" if n else " n/a"

    print()
    print("=" * 80)
    print(f"{'Category':<16} {'N':>4} {'v17 top-1':>10} {'v17 top-5':>10} {'v16 top-1':>10} {'v16 top-5':>10} Δ top-1")
    print("-" * 80)
    for cat, c in sorted(cat_stats.items()):
        delta = (c["v17_t1"] - c["v16_t1"]) / c["n"] * 100 if c["n"] else 0
        print(f"{cat:<16} {c['n']:>4} {pct(c['v17_t1'], c['n']):>10} {pct(c['v17_t5'], c['n']):>10} {pct(c['v16_t1'], c['n']):>10} {pct(c['v16_t5'], c['n']):>10} {delta:+6.1f}pp")
    delta_total = (total_v17_t1 - total_v16_t1) / total_n * 100 if total_n else 0
    print("-" * 80)
    print(f"{'TOTAL':<16} {total_n:>4} {pct(total_v17_t1, total_n):>10} {pct(total_v17_t5, total_n):>10} {pct(total_v16_t1, total_n):>10} {pct(total_v16_t5, total_n):>10} {delta_total:+6.1f}pp")
    print()
    print("Plan §7 ship-gate target: ≥30 percentage-point improvement in top-1")
    print(f"Actual: {delta_total:+.1f}pp ({'PASS' if delta_total >= 30 else 'INFO'} — pp gain alone, not failure-reduction %)")
    # Also compute as a relative reduction in "wrong page cited" errors
    v17_wrong = total_n - total_v17_t1
    v16_wrong = total_n - total_v16_t1
    err_reduction = (v16_wrong - v17_wrong) / v16_wrong * 100 if v16_wrong else 0
    print(f"Error-reduction (the gate's actual framing): {err_reduction:+.1f}% ({'PASS' if err_reduction >= 30 else 'FAIL'})")
    print()
    if args.json:
        Path(args.json).write_text(json.dumps({
            "summary": {
                "v17_top1_pct": 100 * total_v17_t1 / total_n if total_n else 0,
                "v17_top5_pct": 100 * total_v17_t5 / total_n if total_n else 0,
                "v16_top1_pct": 100 * total_v16_t1 / total_n if total_n else 0,
                "v16_top5_pct": 100 * total_v16_t5 / total_n if total_n else 0,
                "delta_top1_pp": delta_total,
                "error_reduction_pct": err_reduction,
            },
            # Every category in cat_stats has n >= 1, so these divisions are safe.
            "by_category": {cat: {**c, "v17_top1_pct": 100*c["v17_t1"]/c["n"], "v16_top1_pct": 100*c["v16_t1"]/c["n"]} for cat, c in cat_stats.items()},
            "per_query": per_query,
        }, indent=2, ensure_ascii=False), encoding="utf-8")
        print(f"Wrote per-query results to {args.json}")


if __name__ == "__main__":
    main()
+293
View File
@@ -0,0 +1,293 @@
#!/usr/bin/env python3
"""bm25-index.py — sparse BM25 inverted index over contextualized wiki chunks.
Pure stdlib (no rank_bm25 dep). Standard Okapi BM25 with k1=1.5, b=0.75.
Indexes the `contextualized_text` field of every chunk under .vault-meta/chunks/,
emits a single JSON file at .vault-meta/bm25/index.json with the schema below.
Concurrency:
- Locks .vault-meta/.bm25.lock (fcntl exclusive) around any index write.
- Atomic .tmp + rename for the index file.
Index schema (.vault-meta/bm25/index.json):
{
"schema_version": 1,
"params": {"k1": 1.5, "b": 0.75},
"doc_count": 1234,
"avg_dl": 487.5,
"updated_at": "2026-05-17T...",
"vocab": {
"<term>": {"df": 17, "postings": [["c-000001:0", 3], ["c-000042:2", 1], ...]}
},
"docs": {
"<chunk_id>": {"path": ".vault-meta/chunks/c-000001/chunk-000.json", "dl": 487}
}
}
Chunk id format: "<page-address>:<chunk-index>" (e.g. "c-000042:3").
Tokenization: lowercase, collapse whitespace, drop punctuation except in-word
apostrophes and hyphens. ASCII-only stopwords filtered (small list; favors
recall over precision).
Query interface (used by retrieve.py at query time):
bm25-index.py query "your text here" [--top 20]
Build interface:
bm25-index.py build # full rebuild (always; incremental is v1.7.x scope)
bm25-index.py stats # print index stats
Exit codes:
0 — success
1 — lock acquisition failed
2 — usage error
3 — index file missing or corrupt (query mode)
4 — chunks directory missing
"""
import argparse
import fcntl
import json
import math
import os
import re
import sys
from collections import Counter, defaultdict
from datetime import datetime, timezone
from pathlib import Path
# The vault root is two directory levels above this file.
VAULT_ROOT = Path(__file__).resolve().parent.parent
META_DIR = VAULT_ROOT / ".vault-meta"
# Input: one JSON file per chunk, globbed as <CHUNKS_DIR>/*/chunk-*.json.
CHUNKS_DIR = META_DIR / "chunks"
# Output: the single serialized index file.
BM25_DIR = META_DIR / "bm25"
INDEX_PATH = BM25_DIR / "index.json"
# File locked exclusively while the index is being rebuilt.
LOCK_PATH = META_DIR / ".bm25.lock"
# BM25 parameters: K1 scales term-frequency saturation, B scales document
# length normalization. Both are stored in the index under "params".
K1 = 1.5
B = 0.75
# Small high-frequency-stopword list (English). Conservative — keep recall high.
STOPWORDS = frozenset("""
a an and are as at be by for from has have he her him his i if in is it its
of on or that the their them they this to was were will with you your
""".split())
# Unicode-aware tokenizer (v1.7.2; closes audit M2). \w under re.UNICODE
# matches letters and digits from any script (CJK, Cyrillic, accented Latin,
# Devanagari, etc.) plus underscore. Internal apostrophes and hyphens are
# preserved so "user's" and "well-formed" stay single tokens. Pure-symbol or
# pure-emoji tokens fail the leading \w anchor and are correctly skipped.
TOKEN_RE = re.compile(r"\w[\w'\-]*", re.UNICODE)
# Process exit codes (see the module docstring).
EXIT_OK = 0
EXIT_LOCK = 1
EXIT_USAGE = 2
EXIT_INDEX_MISSING = 3
EXIT_NO_CHUNKS = 4
def log(msg):
    """Write *msg* and a trailing newline to stderr."""
    sys.stderr.write(f"{msg}\n")
def tokenize(text):
    """Return the lowercase terms of *text*, in order, duplicates included.

    Terms are the TOKEN_RE matches; one-character matches and matches whose
    lowercase form is a stopword are discarded.
    """
    terms = []
    for match in TOKEN_RE.finditer(text):
        raw = match.group(0)
        if len(raw) <= 1:
            continue
        term = raw.lower()
        if term in STOPWORDS:
            continue
        terms.append(term)
    return terms
def acquire_lock():
    """Take a non-blocking exclusive flock on LOCK_PATH and return its fd.

    Creates META_DIR and the lock file if needed. If the lock cannot be taken
    the descriptor is closed, an error is logged, and the process exits with
    EXIT_LOCK. The caller passes the returned fd to release_lock().
    """
    META_DIR.mkdir(parents=True, exist_ok=True)
    lock_fd = os.open(str(LOCK_PATH), os.O_CREAT | os.O_WRONLY, 0o644)
    try:
        fcntl.flock(lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
    except OSError:
        os.close(lock_fd)
        log("ERR: could not acquire bm25 lock")
        sys.exit(EXIT_LOCK)
    else:
        return lock_fd
def release_lock(fd):
    """Unlock the flock held on *fd* and close the descriptor.

    The descriptor is closed even if the unlock call raises.
    """
    try:
        fcntl.flock(fd, fcntl.LOCK_UN)
    finally:
        os.close(fd)
def discover_chunks():
    """Yield (chunk_id, path, text) for every usable chunk file on disk.

    Chunk files are globbed as ``<CHUNKS_DIR>/*/chunk-*.json`` in sorted
    order. *chunk_id* is "<page_address>:<chunk_index>". *path* is relative
    to the directory two levels above CHUNKS_DIR, which works both for the
    real ``<vault>/.vault-meta/chunks`` and when tests point CHUNKS_DIR at a
    sandbox ``<tmp>/.vault-meta/chunks``. *text* is "contextualized_text",
    falling back to "raw_text", and is always a str.

    Files that cannot be read or decoded, whose JSON is not an object, or
    whose text is not a string are logged and skipped; files missing
    "page_address" or "chunk_index" are skipped silently. Exits with
    EXIT_NO_CHUNKS when CHUNKS_DIR is not a directory.
    """
    if not CHUNKS_DIR.is_dir():
        log(f"ERR: no chunks directory at {CHUNKS_DIR}")
        sys.exit(EXIT_NO_CHUNKS)
    rel_root = CHUNKS_DIR.parent.parent
    for chunk_file in sorted(CHUNKS_DIR.glob("*/chunk-*.json")):
        try:
            data = json.loads(chunk_file.read_text(encoding="utf-8"))
        except (ValueError, OSError) as e:
            # ValueError covers both JSONDecodeError and UnicodeDecodeError.
            log(f" skip (unreadable): {chunk_file}: {e}")
            continue
        if not isinstance(data, dict):
            log(f" skip (not a JSON object): {chunk_file}")
            continue
        address = data.get("page_address")
        idx = data.get("chunk_index")
        if address is None or idx is None:
            continue
        # `or ""` also covers an explicit JSON null in "raw_text".
        text = data.get("contextualized_text") or data.get("raw_text") or ""
        if not isinstance(text, str):
            log(f" skip (text is not a string): {chunk_file}")
            continue
        chunk_id = f"{address}:{idx}"
        rel_path = str(chunk_file.relative_to(rel_root))
        yield chunk_id, rel_path, text
def build_index():
    """Tokenize every discovered chunk and assemble the index dict.

    Returns None (after logging a warning) when no chunk was found; otherwise
    a dict with "schema_version", "params", "doc_count", "avg_dl",
    "updated_at", "vocab" and "docs". Each vocab entry holds the term's
    document frequency and its postings as [chunk_id, term_count] pairs;
    vocab keys are inserted in sorted order.
    """
    docs = {}
    postings = defaultdict(list)
    for chunk_id, rel_path, text in discover_chunks():
        tokens = tokenize(text)
        docs[chunk_id] = {"path": rel_path, "dl": len(tokens)}
        for term, count in Counter(tokens).items():
            postings[term].append([chunk_id, count])
    if not docs:
        log("WARN: no chunks indexed")
        return None

    total_length = sum(entry["dl"] for entry in docs.values())
    # One posting is appended per (chunk, term), so df is the postings length.
    vocab = {
        term: {"df": len(postings[term]), "postings": postings[term]}
        for term in sorted(postings)
    }
    stamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    return {
        "schema_version": 1,
        "params": {"k1": K1, "b": B},
        "doc_count": len(docs),
        "avg_dl": total_length / len(docs),
        "updated_at": stamp,
        "vocab": vocab,
        "docs": docs,
    }
def write_index(index):
    """Serialize *index* to INDEX_PATH via a temp file and os.replace.

    The temp file sits next to INDEX_PATH and carries the current PID in its
    name. Any temp file still present afterwards is removed, whether or not
    the write succeeded.
    """
    BM25_DIR.mkdir(parents=True, exist_ok=True)
    scratch = INDEX_PATH.with_suffix(f".{os.getpid()}.tmp")
    try:
        payload = json.dumps(index, ensure_ascii=False)
        scratch.write_text(payload, encoding="utf-8")
        os.replace(scratch, INDEX_PATH)
    finally:
        if scratch.exists():
            scratch.unlink(missing_ok=True)
def load_index():
    """Load and return the index dict from INDEX_PATH.

    Logs an error and exits with EXIT_INDEX_MISSING when the file does not
    exist, cannot be read, is not valid UTF-8 JSON, or does not contain a
    JSON object at the top level.
    """
    if not INDEX_PATH.is_file():
        log(f"ERR: no index at {INDEX_PATH}. Run `bm25-index.py build` first.")
        sys.exit(EXIT_INDEX_MISSING)
    try:
        index = json.loads(INDEX_PATH.read_text(encoding="utf-8"))
    except (ValueError, OSError) as e:
        # ValueError covers JSONDecodeError and also UnicodeDecodeError, which
        # a file with invalid UTF-8 bytes raises before JSON parsing starts.
        log(f"ERR: index corrupt: {e}")
        sys.exit(EXIT_INDEX_MISSING)
    if not isinstance(index, dict):
        log(f"ERR: index corrupt: expected a JSON object, got {type(index).__name__}")
        sys.exit(EXIT_INDEX_MISSING)
    return index
def query(text, top_k=20):
    """Rank indexed chunks against *text* with BM25.

    Loads the index from disk, tokenizes *text*, and accumulates one BM25
    contribution per (query term, posting). A term repeated in the query
    contributes once per repetition.

    Returns:
        Up to *top_k* dicts with "chunk_id", "score" (rounded to 6 places)
        and "path", ordered by descending score; [] when the query has no
        usable terms.
    """
    idx = load_index()
    vocab, docs = idx["vocab"], idx["docs"]
    params = idx["params"]
    avg_dl = idx["avg_dl"]
    N = idx["doc_count"]
    k1, b = params["k1"], params["b"]

    qterms = tokenize(text)
    if not qterms:
        return []

    # Defensive guard (v1.7.2; closes audit L7): a zero avg_dl would make the
    # length-normalization divide below fail, so substitute 1.0.
    avg_dl_safe = avg_dl or 1.0

    scores = defaultdict(float)
    for term in qterms:
        entry = vocab.get(term)
        if not entry:
            continue
        df = entry["df"]
        idf = math.log(1 + (N - df + 0.5) / (df + 0.5))
        for cid, cnt in entry["postings"]:
            dl = docs[cid]["dl"]
            denom = cnt + k1 * (1 - b + b * dl / avg_dl_safe)
            scores[cid] += idf * (cnt * (k1 + 1)) / denom

    best = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:top_k]
    hits = []
    for cid, score in best:
        hits.append({
            "chunk_id": cid,
            "score": round(score, 6),
            "path": docs[cid]["path"],
        })
    return hits
def stats():
    """Print a JSON summary of the on-disk index to stdout.

    The summary contains the document count, the average document length
    rounded to 2 places, the vocabulary size, the update timestamp and the
    BM25 parameters.
    """
    idx = load_index()
    summary = {
        "doc_count": idx["doc_count"],
        "avg_dl": round(idx["avg_dl"], 2),
        "vocab_size": len(idx["vocab"]),
        "updated_at": idx["updated_at"],
        "params": idx["params"],
    }
    print(json.dumps(summary, indent=2))
def main():
    """CLI entry point; dispatches the build / query / stats subcommands.

    "build" runs under the exclusive lock and writes the index (or logs that
    there was nothing to index); "query" and "stats" print JSON to stdout.
    Returns the process exit code.
    """
    parser = argparse.ArgumentParser(description="BM25 inverted index over wiki chunks.")
    commands = parser.add_subparsers(dest="cmd", required=True)
    commands.add_parser("build", help="Build the index (full rebuild every time in v1.7).")
    query_parser = commands.add_parser("query", help="Query the index.")
    query_parser.add_argument("text", help="Query text")
    query_parser.add_argument("--top", type=int, default=20, help="Top-K results")
    commands.add_parser("stats", help="Print index stats.")
    args = parser.parse_args()

    if args.cmd == "query":
        print(json.dumps(query(args.text, top_k=args.top), indent=2))
        return EXIT_OK
    if args.cmd == "stats":
        stats()
        return EXIT_OK
    if args.cmd != "build":
        return EXIT_USAGE

    # Hold the lock for the whole rebuild, including the final rename.
    fd = acquire_lock()
    try:
        index = build_index()
        if index is None:
            log("Nothing to index.")
        else:
            write_index(index)
            log(f"Wrote {INDEX_PATH} docs={index['doc_count']} vocab={len(index['vocab'])} avg_dl={index['avg_dl']:.1f}")
    finally:
        release_lock(fd)
    return EXIT_OK


if __name__ == "__main__":
    sys.exit(main())
+312
View File
@@ -0,0 +1,312 @@
#!/usr/bin/env python3
"""boundary-score.py — DragonScale Mechanism 4: boundary-first autoresearch scorer.
Reads `wiki/**/*.md`, builds a wikilink graph, and emits per-page boundary
scores to stdout (text) or as JSON for tooling.
boundary_score(p) = (out_degree(p) - in_degree(p)) * recency_weight(p)
- out_degree(p): count of distinct wikilinks in p that resolve to a
scoreable page (scoreable = non-meta, non-fold, non-excluded).
- in_degree(p): count of distinct scoreable pages that link to p.
- recency_weight(p): exp(-days_since_updated / RECENCY_HALFLIFE_DAYS).
No floor; very old pages approach zero weight, which is the intended
semantic of "frontier" (recently-touched and outward-pointing).
High score = the page points at many things, is pointed at by few, and
has been touched recently. That is a vault frontier page. Low or
negative score = hub / integrated page.
Feature-gated opt-in: autoresearch only invokes this when DragonScale
setup is detected. Safe to run standalone even without DragonScale set
up (reads wiki/ only; never writes).
This script is intentionally stdout-only. There is no `--report PATH`
equivalent to `tiling-check.py --report` because the helper is small
enough to pipe directly (`./scripts/boundary-score.py --json | jq ...`)
and keeping it read-only removes a write-path attack surface.
Usage:
boundary-score.py # top-10 frontier, text
boundary-score.py --top N # top N frontier
boundary-score.py --json # JSON output
boundary-score.py --page PATH # score for a single page
boundary-score.py --include-score-zero # include pages with score=0
Exit codes:
0 success
2 usage error
"""
import argparse
import json
import math
import re
import sys
from datetime import date, datetime, timezone
from pathlib import Path
# The vault root is two directory levels above this file.
VAULT_ROOT = Path(__file__).resolve().parent.parent
WIKI_DIR = VAULT_ROOT / "wiki"
# Pages are excluded from scoring when their frontmatter `type`, their
# filename, or their vault-relative path prefix matches one of these.
EXCLUDE_TYPES = {"meta", "fold"}
EXCLUDE_FILENAMES = {
    "_index.md", "index.md", "log.md", "hot.md", "overview.md",
    "dashboard.md", "Wiki Map.md", "getting-started.md",
}
EXCLUDE_PATH_PREFIXES = ("wiki/folds/", "wiki/meta/")
# Divisor in recency_weight(): weight = exp(-days / RECENCY_HALFLIFE_DAYS).
# With that formula a page this many days old has weight 1/e (about 0.37),
# not 0.5, despite the "halflife" name.
RECENCY_HALFLIFE_DAYS = 30.0
# No recency floor: a truly stale page should NOT dominate the frontier
# ranking, even if its out-degree is high. The exponential decay takes
# weight toward zero for year-old pages, which is the intended semantic
# of "frontier" (recently-touched and outward-pointing).
DEFAULT_TOP = 10
# Pages whose UTF-8 encoding exceeds this many bytes are skipped.
MAX_BODY_BYTES = 256 * 1024
# CommonMark-ish fence tracking: opening fence records (char, length);
# a closing fence must use the SAME char with SAME-OR-LONGER run length.
# Tilde fences (~~~) are supported alongside backtick fences (```). Indented
# code blocks (4+ spaces) are NOT filtered; in Obsidian usage, indented
# bullets commonly contain wikilinks and should count as edges.
# Frontmatter must open on the first line and use "\n" line endings.
FRONTMATTER_RE = re.compile(r"^---\n(.*?)\n---\n", re.DOTALL)
# Single-field extractors applied to the raw frontmatter text.
TYPE_RE = re.compile(r"^type:\s*(\S+)", re.MULTILINE)
UPDATED_RE = re.compile(r"^updated:\s*([0-9]{4}-[0-9]{2}-[0-9]{2})", re.MULTILINE)
CREATED_RE = re.compile(r"^created:\s*([0-9]{4}-[0-9]{2}-[0-9]{2})", re.MULTILINE)
TITLE_RE = re.compile(r'^title:\s*"?([^"\n]+?)"?\s*$', re.MULTILINE)
# Obsidian wikilinks: [[Target]] or [[Target|Alias]] or [[Target#Heading]]
# Group 1 captures the target only (no heading, no alias).
WIKILINK_RE = re.compile(r"\[\[([^\]|#]+)(?:#[^\]|]+)?(?:\|[^\]]+)?\]\]")
# Process exit codes (see the module docstring).
EXIT_OK = 0
EXIT_USAGE = 2
def log(msg: str) -> None:
    """Write *msg* and a trailing newline to stderr."""
    sys.stderr.write(f"{msg}\n")
def parse_frontmatter(text: str) -> tuple[dict, str]:
    """Split *text* into (frontmatter fields, body).

    Only the fields "type", "updated", "created" and "title" are extracted,
    each by its own regex over the raw frontmatter; surrounding whitespace
    and quotes are stripped from the value. When *text* has no leading
    frontmatter block the result is ({}, text).
    """
    block_match = FRONTMATTER_RE.match(text)
    if block_match is None:
        return {}, text

    header = block_match.group(1)
    extractors = {
        "type": TYPE_RE,
        "updated": UPDATED_RE,
        "created": CREATED_RE,
        "title": TITLE_RE,
    }
    fields: dict = {}
    for key, pattern in extractors.items():
        hit = pattern.search(header)
        if hit is not None:
            fields[key] = hit.group(1).strip().strip('"').strip("'")
    return fields, text[block_match.end():]
def included(path: Path, fm: dict) -> bool:
    """Return True when *path* is a scoreable page.

    A page is rejected when it is a symlink, does not resolve to an existing
    location inside the vault, has an excluded filename, lives under an
    excluded path prefix, or declares an excluded frontmatter `type`.
    """
    if path.is_symlink():
        return False
    try:
        # strict=True raises for a missing file; relative_to raises when the
        # resolved location is outside the vault.
        path.resolve(strict=True).relative_to(VAULT_ROOT.resolve())
    except (OSError, ValueError):
        return False

    rel = path.relative_to(VAULT_ROOT).as_posix()
    if path.name in EXCLUDE_FILENAMES:
        return False
    if rel.startswith(EXCLUDE_PATH_PREFIXES):
        return False
    return fm.get("type") not in EXCLUDE_TYPES
def days_since(date_str: str | None) -> float:
"""Return days since the given YYYY-MM-DD string, or a large sentinel if missing."""
if not date_str:
return 10_000.0
try:
d = date.fromisoformat(date_str)
except ValueError:
return 10_000.0
delta = (date.today() - d).days
return max(0.0, float(delta))
def recency_weight(days: float,
                   halflife: float = RECENCY_HALFLIFE_DAYS) -> float:
    """Return the exponential decay weight exp(-days / halflife).

    The weight is 1.0 at days == 0 and falls toward 0 as *days* grows.
    """
    exponent = -days / halflife
    return math.exp(exponent)
_FENCE_RE = re.compile(r"^(\s*)(`{3,}|~{3,})")


def extract_wikilinks(body: str) -> set[str]:
    """Extract unique link targets (without alias or heading suffix) from the body.

    Skips wikilinks inside fenced code blocks so documentation examples
    (including in this repo's own skill files) do not pollute the graph.

    Fence handling: backtick AND tilde fences, with length tracking per
    CommonMark: the opening run sets (char, min_len); the closing line
    must use the SAME char with a run of SAME-OR-LONGER length and carry
    nothing but whitespace after the run. A line such as "```python" inside
    an open block is therefore block content, not a closing fence. Indented
    code blocks (4+ spaces) are intentionally NOT filtered — indented
    bullets in Obsidian often contain wikilinks.

    Returns:
        The set of target filename stems. Folder qualifiers are dropped, and
        a trailing backslash left by a table-escaped alias pipe
        (``[[Target\\|Alias]]``) is removed.
    """
    cleaned: list[str] = []
    fence_char: str | None = None
    fence_len: int = 0
    for line in body.splitlines():
        m = _FENCE_RE.match(line)
        if m:
            run = m.group(2)
            if fence_char is None:
                # Opening fence; an info string after the run is allowed.
                fence_char = run[0]
                fence_len = len(run)
                continue
            closes = (run[0] == fence_char
                      and len(run) >= fence_len
                      and not line[m.end():].strip())
            if closes:
                fence_char = None
                fence_len = 0
                continue
        if fence_char is not None:
            # Inside a fenced block, including fence-like lines that did not
            # qualify as the closer.
            continue
        cleaned.append(line)
    scan = "\n".join(cleaned)
    results: set[str] = set()
    for m in WIKILINK_RE.finditer(scan):
        # Inside markdown tables the alias pipe is written "\|", which leaves
        # the backslash at the end of the captured target.
        raw = m.group(1).strip().rstrip("\\").strip()
        # Folder-qualified links like [[notes/Foo]] resolve to Foo.md by stem.
        # This matches Obsidian default behavior for unique filenames.
        stem = raw.rsplit("/", 1)[-1]
        if stem:
            results.add(stem)
    return results
def collect_pages() -> dict[str, dict]:
    """Scan wiki/ and return {title_key: {path, title, body, fm}} for scoreable pages.

    `title_key` is the filename stem, which is what Obsidian wikilinks resolve
    to by default. Assumes filenames are unique across the vault (enforced by
    wiki-lint naming convention); if two pages share a stem, the one visited
    later replaces the earlier entry.

    Files that cannot be read or decoded as UTF-8, that exceed MAX_BODY_BYTES
    once encoded, or that included() rejects are skipped. Returns {} when
    WIKI_DIR is not a directory.
    """
    collected: dict[str, dict] = {}
    if not WIKI_DIR.is_dir():
        return collected
    for page_file in sorted(WIKI_DIR.rglob("*.md")):
        try:
            raw = page_file.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError):
            continue
        if len(raw.encode("utf-8")) > MAX_BODY_BYTES:
            continue
        fm, body = parse_frontmatter(raw)
        if not included(page_file, fm):
            continue
        stem = page_file.stem  # Obsidian wikilinks are filename-based
        collected[stem] = {
            "path": page_file.relative_to(VAULT_ROOT).as_posix(),
            "title": fm.get("title", stem),
            "body": body,
            "fm": fm,
        }
    return collected
def build_graph(pages: dict[str, dict]) -> tuple[dict[str, set[str]], dict[str, set[str]]]:
    """Return (out_edges, in_edges), each mapping title_key -> set(title_key).

    An edge is recorded only when its target is a key of *pages*; links to
    unknown pages and self-links are dropped. Every page gets an entry in
    both mappings, possibly an empty set.
    """
    outgoing: dict[str, set[str]] = {key: set() for key in pages}
    incoming: dict[str, set[str]] = {key: set() for key in pages}
    for source, entry in pages.items():
        resolved = {
            target for target in extract_wikilinks(entry["body"])
            if target != source and target in pages
        }
        outgoing[source] |= resolved
        for target in resolved:
            incoming[target].add(source)
    return outgoing, incoming
def score_page(title_key: str,
               pages: dict[str, dict],
               out_edges: dict[str, set[str]],
               in_edges: dict[str, set[str]]) -> dict:
    """Compute the boundary score record for one page.

    score = (out_degree - in_degree) * recency_weight, where the page's age
    comes from its `updated` frontmatter date, falling back to `created`.

    Returns:
        A dict with "title", "title_key", "path", "out_degree", "in_degree",
        "age_days", "recency_weight" and "score"; the last two are rounded
        to 4 places.
    """
    entry = pages[title_key]
    frontmatter = entry["fm"]
    n_out = len(out_edges.get(title_key, set()))
    n_in = len(in_edges.get(title_key, set()))
    age = days_since(frontmatter.get("updated") or frontmatter.get("created"))
    weight = recency_weight(age)
    raw_score = (n_out - n_in) * weight
    return {
        "title": entry["title"],
        "title_key": title_key,
        "path": entry["path"],
        "out_degree": n_out,
        "in_degree": n_in,
        "age_days": age,
        "recency_weight": round(weight, 4),
        "score": round(raw_score, 4),
    }
def run(top: int, want_json: bool, include_zero: bool, page_filter: str | None) -> int:
    """Score the vault and print a report as JSON or a markdown table.

    With `page_filter`, only pages whose stem or vault-relative path matches
    are reported (no sorting, no truncation); EXIT_USAGE is returned when
    nothing matches. Without it, pages with non-positive scores are dropped
    unless `include_zero` is set, the rest are ordered by descending score
    (ties broken by title_key) and cut to the first `top` entries.
    """
    pages = collect_pages()
    graph_out, graph_in = build_graph(pages)
    rows = [score_page(name, pages, graph_out, graph_in) for name in pages]

    if page_filter:
        wanted_stem = Path(page_filter).stem
        rows = [r for r in rows
                if r["title_key"] == wanted_stem or r["path"] == page_filter]
        if not rows:
            log(f"ERR: no scoreable page matches '{page_filter}'")
            return EXIT_USAGE
    else:
        if not include_zero:
            rows = [r for r in rows if r["score"] > 0.0]
        rows = sorted(rows, key=lambda r: (-r["score"], r["title_key"]))[:top]

    if want_json:
        report = {
            "generated": datetime.now(timezone.utc).isoformat(timespec="seconds").replace("+00:00", "Z"),
            "halflife_days": RECENCY_HALFLIFE_DAYS,
            "page_count_scoreable": len(pages),
            "results": rows,
        }
        print(json.dumps(report, indent=2))
        return EXIT_OK

    print("# Boundary Score Report")
    print(f"scoreable pages: {len(pages)}; halflife: {RECENCY_HALFLIFE_DAYS} days")
    if not rows:
        print("\nNo positive-score frontier pages found.")
        return EXIT_OK
    print("")
    print("| # | score | out | in | age_d | title | path |")
    print("|---|---|---|---|---|---|---|")
    for rank, row in enumerate(rows, 1):
        print(f"| {rank} | {row['score']:.3f} | {row['out_degree']} | {row['in_degree']} | "
              f"{int(row['age_days'])} | {row['title']} | {row['path']} |")
    return EXIT_OK
def main(argv: list[str]) -> int:
    """Parse command-line options and delegate to run().

    Returns EXIT_USAGE when --top is below 1, otherwise whatever run() returns.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--top", type=int, default=DEFAULT_TOP)
    parser.add_argument("--json", action="store_true")
    parser.add_argument("--include-score-zero", action="store_true",
                        help="Include pages whose score is zero or negative in the output")
    parser.add_argument("--page", default=None, help="Score a single page by path or stem")
    opts = parser.parse_args(argv)
    if opts.top >= 1:
        return run(opts.top, opts.json, opts.include_score_zero, opts.page)
    log("ERR: --top must be >= 1")
    return EXIT_USAGE


if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
+505
View File
@@ -0,0 +1,505 @@
#!/usr/bin/env python3
"""contextual-prefix.py — chunk wiki pages and generate per-chunk contextual prefixes.
Implements the ingest-side of Anthropic's Sept 2024 Contextual Retrieval pattern
(https://www.anthropic.com/news/contextual-retrieval). For each chunk of a wiki
page, generates a 1-2 sentence prefix situating the chunk in its source. The
prefixed text is what gets BM25-indexed and embedded, materially improving
retrieval accuracy (Anthropic measured 35-49% failure reduction).
Three-tier prefix generation (chosen per-run automatically):
1. If ANTHROPIC_API_KEY is set → direct Anthropic API call (Haiku 4.5)
with prompt caching on the page body
(only when the body clears the ~16 KB
Haiku 4.5 cache floor; see
cache_control_for()).
~$12 / 1000 docs per Anthropic figures.
REQUIRES --allow-egress (sends bodies off-machine).
2. Elif `claude` binary on PATH → `claude -p` subprocess (uses CC subscription;
no API key needed; slower per call).
REQUIRES --allow-egress (subprocess egresses).
3. Else (default) → synthetic prefix from page frontmatter +
first paragraph (zero-cost floor; loses
most of the contextual benefit but BM25
and vector channels still work).
Data-egress posture (v1.7.1+):
Tiers 1 and 2 send wiki page bodies off-machine. Both are GATED behind
--allow-egress (default off). Without the flag, pick_prefix_tier() always
returns "synthetic" regardless of env vars or claude binary presence.
Mirror of scripts/tiling-check.py:351 --allow-remote-ollama precedent.
Chunk schema written to .vault-meta/chunks/<page-address>/chunk-NNN.json:
{
"schema_version": 1,
"page_path": "wiki/concepts/Foo.md",
"page_address": "c-000042",
"chunk_index": 3,
"raw_text": "...",
"contextualized_text": "<prefix> + blank line + <raw_text>",
"prefix": "<prefix>",
"prefix_source": "anthropic-api" | "claude-cli" | "synthetic" | "skipped",
"char_count": 487,
"body_hash": "sha256:...", # of raw_text
"page_body_hash": "sha256:...", # of the WHOLE source page (for invalidation)
"created_at": "2026-05-17T..."
}
Pages without an `address:` frontmatter field are still chunked (filed under a
synthetic `syn-XXXXXX` address built from the first six hex digits of a SHA-1
hash of the vault-relative path) so this tool works on v1.6 vaults without
DragonScale Mechanism 2 enabled.
Usage:
contextual-prefix.py PATH # process a single page
contextual-prefix.py --all # process every wiki/*.md
contextual-prefix.py PATH --no-llm # force synthetic-prefix tier 3
contextual-prefix.py PATH --rebuild # ignore existing chunks
contextual-prefix.py PATH --peek # print what would happen; write nothing
Exit codes:
0 — success
2 — usage error
3 — page file missing or unreadable
4 — chunk dir creation failed
"""
import argparse
import hashlib
import json
import os
import re
import shutil
import subprocess
import sys
import urllib.error
import urllib.request
from datetime import datetime, timezone
from pathlib import Path
VAULT_ROOT = Path(__file__).resolve().parent.parent
WIKI_DIR = VAULT_ROOT / "wiki"
META_DIR = VAULT_ROOT / ".vault-meta"
CHUNKS_DIR = META_DIR / "chunks"
CHUNK_TARGET_TOKENS = 500 # rough; we approximate via chars/4
CHUNK_TARGET_CHARS = CHUNK_TARGET_TOKENS * 4
CHUNK_OVERLAP_CHARS = 200
ANTHROPIC_MODEL = "claude-haiku-4-5-20251001"
ANTHROPIC_API_URL = "https://api.anthropic.com/v1/messages"
ANTHROPIC_TIMEOUT_SEC = 30
CLAUDE_CLI_TIMEOUT_SEC = 60
# Anthropic prompt caching ignores any cached prefix below the model's minimum
# cacheable size — 4,096 tokens for Haiku 4.5 (verified against the prompt-caching
# docs, 2026-05). At ~4 chars/token that is ~16 KB. We attach cache_control only
# when the body clears this floor so the marker reflects reality: below the floor
# the API treats it as a silent no-op. The per-call cache telemetry in
# anthropic_api_prefix() is what actually measures hit rate. The check counts the
# body only — a deliberately conservative ~370-char underestimate that ignores the
# system_msg + <page> wrapper also inside the cached prefix — so near the boundary
# it errs toward not-marking, never toward a wrongly-attached marker.
HAIKU_CACHE_MIN_CHARS = 16384 # 4096 tokens * 4 chars/token
EXIT_OK = 0
EXIT_USAGE = 2
EXIT_PAGE_MISSING = 3
EXIT_CHUNK_DIR = 4
FRONTMATTER_RE = re.compile(r"^---\n(.*?)\n---\n", re.DOTALL)
ADDRESS_RE = re.compile(r"^address:\s*(c-\d{6})\s*$", re.MULTILINE)
TITLE_RE = re.compile(r"^title:\s*['\"]?(.+?)['\"]?\s*$", re.MULTILINE)
def log(msg):
    """Write `msg` plus a newline to stderr, keeping stdout free for results."""
    sys.stderr.write(str(msg) + "\n")
def sha256(text):
    """Return the SHA-256 of `text` (UTF-8 encoded) as "sha256:<hex digest>"."""
    digest = hashlib.sha256(text.encode("utf-8")).hexdigest()
    return f"sha256:{digest}"
def read_page(path):
    """Return the text of the page at `path`, decoded as UTF-8.

    Undecodable bytes are replaced rather than raising, so a page with a few
    bad bytes is still processed.

    Raises:
        SystemExit(EXIT_PAGE_MISSING): the path is missing, is not a regular
            file, or cannot be read. The module documents exit code 3 as
            "page file missing or unreadable", so permission and I/O errors
            take the same route as a missing file instead of surfacing as a
            raw traceback.
    """
    try:
        return path.read_text(encoding="utf-8", errors="replace")
    except OSError as e:
        log(f"ERR: cannot read page {path}: {e}")
        raise SystemExit(EXIT_PAGE_MISSING) from e
def parse_frontmatter(body):
    """Split a page into (frontmatter fields, remaining text).

    When the text does not open with a frontmatter block matched by
    FRONTMATTER_RE, returns ({}, body) unchanged. Otherwise returns a dict
    with "address" and "title" (each None when absent) plus "raw" (the
    frontmatter text between the fences), and the text after the closing fence.
    """
    match = FRONTMATTER_RE.match(body)
    if match is None:
        return {}, body
    header = match.group(1)
    address_hit = ADDRESS_RE.search(header)
    title_hit = TITLE_RE.search(header)
    fields = {
        "address": address_hit.group(1) if address_hit else None,
        "title": title_hit.group(1) if title_hit else None,
        "raw": header,
    }
    return fields, body[match.end():]
def derive_synthetic_address(page_path):
    """Return a deterministic stand-in address for a page with no `address:`.

    Format: "syn-" followed by the first six hex digits of the SHA-1 of the
    page's path relative to VAULT_ROOT. The "syn-" prefix keeps these apart
    from allocator addresses (c-NNNNNN); the value is used only to name the
    page's chunk directory.
    """
    relative = str(page_path.relative_to(VAULT_ROOT))
    digest = hashlib.sha1(relative.encode("utf-8")).hexdigest()
    return f"syn-{digest[:6]}"
def chunk_body(body, target_chars=CHUNK_TARGET_CHARS, overlap=CHUNK_OVERLAP_CHARS):
    """Split `body` into overlapping chunks on paragraph boundaries.

    Paragraphs (separated by blank lines, stripped, empties dropped) are
    accumulated until the running length reaches `target_chars`; the chunk is
    then flushed and its last `overlap` characters seed the next chunk.
    Paragraphs are never split, so a single paragraph longer than
    `target_chars` becomes one oversized chunk.

    Args:
        body: page text with frontmatter already removed.
        target_chars: approximate chunk size in characters.
        overlap: number of trailing characters carried into the next chunk;
            0 or less disables the overlap.

    Returns:
        List of chunk strings; empty when `body` has no non-blank content.
    """
    paragraphs = [p.strip() for p in re.split(r"\n\s*\n", body) if p.strip()]
    chunks = []
    cur = []
    cur_len = 0
    # True once `cur` holds a paragraph that has not been flushed yet. Without
    # this flag, a flush on the very last paragraph left only the overlap seed
    # in `cur`, and that seed was emitted as an extra final chunk made purely
    # of text already present in the previous chunk.
    has_new_content = False
    for p in paragraphs:
        cur.append(p)
        cur_len += len(p) + 2  # +2 for the "\n\n" joiner
        has_new_content = True
        if cur_len >= target_chars:
            chunk_text = "\n\n".join(cur)
            chunks.append(chunk_text)
            # Seed the next chunk with the tail of this one.
            tail = chunk_text[-overlap:] if overlap > 0 else ""
            cur = [tail] if tail else []
            cur_len = len(tail)
            has_new_content = False
    if has_new_content:
        chunks.append("\n\n".join(cur))
    return chunks
def synthetic_prefix(fm, body, chunk_text):
    """Tier-3 prefix built locally from the page title and opening sentence.

    No network and no randomness: the same inputs always give the same
    prefix. The opening sentence comes from the page body rather than the
    chunk, so every chunk of a page gets the same page-level frame;
    `chunk_text` is accepted for signature parity with the other tiers and is
    not used. The opening sentence is capped at 300 characters.
    """
    page_title = (fm.get("title") or "").strip()
    if not page_title:
        page_title = "(untitled)"
    # Cut after the first sentence-ending punctuation mark followed by whitespace.
    pieces = re.split(r"(?<=[.!?])\s+", body.strip(), maxsplit=1)
    opening = pieces[0][:300] if pieces else ""
    return f"This passage is from the wiki page \"{page_title}\". The page opens: {opening}"
def cache_control_for(page_body):
    """Decide whether the page body is worth marking for prompt caching.

    Returns {"type": "ephemeral"} when the body has at least
    HAIKU_CACHE_MIN_CHARS characters, otherwise None. Kept as a pure function
    so the threshold can be tested without touching the network.
    """
    return {"type": "ephemeral"} if len(page_body) >= HAIKU_CACHE_MIN_CHARS else None
def anthropic_api_prefix(api_key, page_title, page_body, chunk_text):
    """Tier-1 prefix: direct Anthropic API call, Haiku, prompt-cached page body.

    The page body is the stable prefix shared by every chunk of a page, so it
    goes in `system` behind a cache breakpoint (attached only when
    cache_control_for() says the body is large enough) and the variable chunk
    goes in `messages`. Cache reads only land because chunks are processed
    sequentially (chunk 0 warms the prefix) — see the loop note in
    process_page().

    Args:
        api_key: value for the `x-api-key` header.
        page_title: title interpolated into the <page> wrapper.
        page_body: full page text sent as the cached system block.
        chunk_text: the chunk to contextualize.

    Returns:
        The first line of the first non-empty text block in the response, or
        None on any transport, decoding, or response-shape failure. This tier
        is best-effort: the caller falls back to another tier on None, so
        failures are logged and never raised.
    """
    system_msg = (
        "You are a retrieval-augmentation assistant. Given a wiki page and one "
        "chunk extracted from it, write a single short sentence (under 35 words) "
        "that situates the chunk within the page's scope and topic. Output only "
        "the sentence — no prefix, no quotation marks, no commentary."
    )
    page_block = {
        "type": "text",
        "text": f"<page title=\"{page_title}\">\n{page_body}\n</page>",
    }
    cc = cache_control_for(page_body)
    if cc:
        page_block["cache_control"] = cc
    payload = {
        "model": ANTHROPIC_MODEL,
        "max_tokens": 100,
        "system": [
            {"type": "text", "text": system_msg},
            page_block,
        ],
        "messages": [
            {
                "role": "user",
                "content": (
                    "Write the single contextualizing sentence for this chunk:\n\n"
                    f"<chunk>\n{chunk_text}\n</chunk>"
                ),
            }
        ],
    }
    body = json.dumps(payload).encode("utf-8")
    req = urllib.request.Request(
        ANTHROPIC_API_URL,
        data=body,
        headers={
            "Content-Type": "application/json",
            "x-api-key": api_key,
            "anthropic-version": "2023-06-01",
        },
        method="POST",
    )
    try:
        with urllib.request.urlopen(req, timeout=ANTHROPIC_TIMEOUT_SEC) as resp:
            data = json.loads(resp.read().decode("utf-8"))
        # Cache telemetry: integer token counts only, never page content, so
        # the data-egress posture holds. Confirms whether the body cache is
        # actually firing given the Haiku floor (wrote>0 on chunk 0, read>0
        # on later chunks of the same page).
        usage = data.get("usage") or {}
        log(f" cache: wrote={usage.get('cache_creation_input_tokens', 0)} "
            f"read={usage.get('cache_read_input_tokens', 0)} tok")
        for block in data.get("content") or []:
            if block.get("type") == "text":
                lines = block["text"].strip().splitlines()
                # An empty or whitespace-only text block has no first line;
                # indexing [0] on it used to raise an uncaught IndexError.
                if lines:
                    return lines[0]
    except (OSError, ValueError, KeyError) as e:
        # OSError covers urllib.error.URLError/HTTPError and also the
        # TimeoutError raised when the timeout expires while reading the
        # response body; ValueError covers json.JSONDecodeError and
        # UnicodeDecodeError.
        log(f" anthropic-api call failed: {e}")
        return None
    return None
def claude_cli_prefix(page_title, page_body, chunk_text):
    """Tier-2 prefix: `claude -p` subprocess (uses CC subscription, no API key).

    Only the first 4000 characters of `page_body` are included in the prompt.

    Returns:
        The first line of the subprocess's stdout when it exits 0 with
        non-blank output; otherwise None. This tier is best-effort: timeouts
        and failures to launch the binary are logged and reported as None so
        the caller can fall back to the synthetic prefix.
    """
    prompt = (
        f"Wiki page \"{page_title}\":\n\n"
        f"---\n{page_body[:4000]}\n---\n\n"
        f"Chunk:\n<chunk>\n{chunk_text}\n</chunk>\n\n"
        "Write one short sentence (under 35 words) situating this chunk within "
        "the page's scope. Output only the sentence."
    )
    try:
        result = subprocess.run(
            ["claude", "-p", prompt],
            capture_output=True,
            text=True,
            timeout=CLAUDE_CLI_TIMEOUT_SEC,
            check=False,
        )
    except (subprocess.TimeoutExpired, OSError) as e:
        # OSError (not just FileNotFoundError) so that a present-but-not-
        # executable binary (PermissionError) degrades to the next tier
        # instead of aborting the whole run.
        log(f" claude-cli call failed: {e}")
        return None
    if result.returncode == 0 and result.stdout.strip():
        return result.stdout.strip().splitlines()[0]
    log(f" claude-cli rc={result.returncode}: {result.stderr.strip()[:200]}")
    return None
def pick_prefix_tier(force_synthetic, allow_egress=False):
    """Select which prefix-generation tier a run will use.

    Default-deny for data egress: unless `allow_egress` is true (and
    `force_synthetic` is false) the answer is always "synthetic", whatever
    the environment looks like. The Anthropic API tier and the claude CLI
    tier both send page bodies off-machine, so they need the explicit
    --allow-egress opt-in from the CLI layer.

    With egress allowed: "anthropic-api" when ANTHROPIC_API_KEY is set and
    non-empty, else "claude-cli" when a `claude` binary is on PATH, else
    "synthetic".
    """
    egress_permitted = allow_egress and not force_synthetic
    if egress_permitted:
        if os.environ.get("ANTHROPIC_API_KEY"):
            return "anthropic-api"
        if shutil.which("claude"):
            return "claude-cli"
    return "synthetic"
def generate_prefix(tier, fm, body, chunk_text):
    """Produce (prefix, prefix_source) for one chunk, with one-way fallback.

    The fallback chain is deliberately asymmetric:
      - tier="anthropic-api": try the API; on failure try the claude CLI if a
        `claude` binary is on PATH; then synthetic. The CLI is the
        closer-in-quality fallback to the tier the user asked for.
      - tier="claude-cli": try the CLI; on failure go straight to synthetic.
        Climbing up to the API would spend money the user did not authorize.
      - any other tier: synthetic only.

    `prefix_source` names the tier that actually produced the prefix.
    """
    title = fm.get("title") or "(untitled)"

    if tier == "anthropic-api":
        api_result = anthropic_api_prefix(
            os.environ["ANTHROPIC_API_KEY"], title, body, chunk_text
        )
        if api_result:
            return api_result, "anthropic-api"
        try_cli = bool(shutil.which("claude"))
    else:
        try_cli = tier == "claude-cli"

    if try_cli:
        cli_result = claude_cli_prefix(title, body, chunk_text)
        if cli_result:
            return cli_result, "claude-cli"

    return synthetic_prefix(fm, body, chunk_text), "synthetic"
def process_page(page_path, force_synthetic=False, rebuild=False, peek=False,
                 allow_egress=False, progress_label=""):
    """Chunk one wiki page and write a JSON record per chunk.

    Args:
        page_path: path of the page; must be located under VAULT_ROOT because
            it is passed to relative_to(VAULT_ROOT).
        force_synthetic: forwarded to pick_prefix_tier() to force the
            synthetic tier.
        rebuild: when true, rewrite chunk files even if their hashes match.
        peek: when true, log what would be written and write nothing (the
            chunk directory is not created either).
        allow_egress: forwarded to pick_prefix_tier(); gates the off-machine
            tiers.
        progress_label: optional text (e.g. "[3/47]") prepended to log lines.

    Returns:
        {"address": ..., "written": [chunk file names], "skipped": count of
        unchanged chunks, "tier": tier chosen for this run}.

    Raises:
        SystemExit(EXIT_CHUNK_DIR): the chunk directory cannot be created.
    """
    body = read_page(page_path)
    fm, content = parse_frontmatter(body)
    # Pages without an `address:` field are filed under a path-derived address.
    address = fm.get("address") or derive_synthetic_address(page_path)
    # Hash of the whole file (frontmatter included); stored on every chunk so
    # any edit to the page invalidates all of its chunks.
    page_body_hash = sha256(body)
    chunk_dir = CHUNKS_DIR / address
    if not peek:
        try:
            chunk_dir.mkdir(parents=True, exist_ok=True)
        except OSError as e:
            log(f"ERR: cannot create chunk dir {chunk_dir}: {e}")
            raise SystemExit(EXIT_CHUNK_DIR)
    chunks = chunk_body(content)
    tier = pick_prefix_tier(force_synthetic, allow_egress=allow_egress)
    progress = (progress_label + " ") if progress_label else ""
    if not chunks:
        # v1.7.2 / closes audit M6: previously this logged "chunks=0" with no
        # explanation and silently produced no index entries. Now: explicit WARN
        # so the user notices empty-body pages (often frontmatter-only stubs).
        log(f"{progress}WARN: {page_path.relative_to(VAULT_ROOT)} has no chunkable body content "
            f"(empty after frontmatter strip). Skipping; no chunks written.")
        return {"address": address, "written": [], "skipped": 0, "tier": tier}
    log(f"{progress}-> {page_path.relative_to(VAULT_ROOT)} address={address} chunks={len(chunks)} tier={tier}")
    written = []
    skipped = 0
    # Keep this loop sequential. The tier-1 Anthropic path caches the page body;
    # a cache entry is only readable after the first response begins (Anthropic
    # prompt-caching concurrency rule), so chunk 0 warms the prefix and chunks
    # 1..N read it. Parallelizing here would silently zero every cache read.
    for idx, raw in enumerate(chunks):
        chunk_path = chunk_dir / f"chunk-{idx:03d}.json"
        body_hash = sha256(raw)
        # Skip only when both the chunk text and the whole page are unchanged.
        if chunk_path.exists() and not rebuild:
            try:
                existing = json.loads(chunk_path.read_text(encoding="utf-8"))
                if existing.get("body_hash") == body_hash and \
                        existing.get("page_body_hash") == page_body_hash:
                    skipped += 1
                    continue
            except (json.JSONDecodeError, OSError):
                pass  # corrupted; overwrite
        if peek:
            log(f" would write {chunk_path.name} ({len(raw)} chars)")
            continue
        # The prefix is generated against the frontmatter-stripped page text.
        prefix, prefix_source = generate_prefix(tier, fm, content, raw)
        contextualized = f"{prefix}\n\n{raw}" if prefix else raw
        record = {
            "schema_version": 1,
            "page_path": str(page_path.relative_to(VAULT_ROOT)),
            "page_address": address,
            "chunk_index": idx,
            "raw_text": raw,
            "contextualized_text": contextualized,
            "prefix": prefix or "",
            "prefix_source": prefix_source,
            "char_count": len(raw),
            "body_hash": body_hash,
            "page_body_hash": page_body_hash,
            "created_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
        }
        # Stage to a PID-suffixed temp file, then rename over the target so a
        # reader never sees a half-written chunk file.
        tmp = chunk_path.with_suffix(f".{os.getpid()}.tmp")
        try:
            tmp.write_text(json.dumps(record, ensure_ascii=False, indent=2), encoding="utf-8")
            os.replace(tmp, chunk_path)
        finally:
            # After a successful replace the temp file is gone; this only
            # cleans up when the write or rename failed.
            if tmp.exists():
                tmp.unlink(missing_ok=True)
        written.append(chunk_path.name)
    log(f" wrote={len(written)} skipped(unchanged)={skipped}")
    return {"address": address, "written": written, "skipped": skipped, "tier": tier}
def collect_pages(target):
    """Resolve the CLI target into a list of page paths.

    Args:
        target: "--all" or None to select every `*.md` under WIKI_DIR
            (recursively, sorted), or a single page path. A relative path is
            interpreted relative to VAULT_ROOT.

    Returns:
        A list of Path objects. For the "all" case, files that sit inside a
        dot-directory of the wiki, or whose own name starts with a dot, are
        excluded. A single explicit target is returned as-is without checking
        that it exists.
    """
    if target == "--all" or target is None:
        # Test only the path components below WIKI_DIR. Checking the absolute
        # path's components would exclude every page whenever the vault itself
        # lives under a dot-directory (e.g. ~/.vaults/notes/wiki/...).
        return sorted(
            p for p in WIKI_DIR.rglob("*.md")
            if not any(part.startswith(".") for part in p.relative_to(WIKI_DIR).parts)
        )
    p = Path(target)
    if not p.is_absolute():
        p = VAULT_ROOT / p
    return [p]
def main():
    """CLI entry point: parse arguments, select pages, process each one.

    Returns:
        EXIT_OK on success; EXIT_USAGE when an explicit path resolves outside
        the vault; EXIT_PAGE_MISSING when an explicit path is not a file.
        process_page() may additionally terminate the run via SystemExit.
    """
    parser = argparse.ArgumentParser(description="Chunk + contextualize wiki pages.")
    parser.add_argument("path", nargs="?",
                        help="Page path relative to vault root. Omit (or pass --all) "
                             "to process every wiki page.")
    parser.add_argument("--all", action="store_true",
                        help="Process every wiki page (equivalent to omitting path).")
    parser.add_argument("--no-llm", action="store_true",
                        help="Force tier-3 synthetic prefix (skip LLM calls).")
    parser.add_argument("--allow-egress", action="store_true",
                        help="Allow tier-1 (Anthropic API) or tier-2 (claude CLI "
                             "subprocess) prefix generation. Without this flag, page "
                             "bodies stay on-machine and only the tier-3 synthetic "
                             "prefix is used. Mirror of tiling-check.py's "
                             "--allow-remote-ollama guard.")
    parser.add_argument("--rebuild", action="store_true",
                        help="Re-process chunks even if body_hash matches.")
    parser.add_argument("--peek", action="store_true",
                        help="Print plan, write nothing.")
    args = parser.parse_args()
    # No explicit path means "every page", with or without --all. An explicit
    # path takes precedence over --all when both are given.
    if not args.path:
        args.path = "--all"
    pages = collect_pages(args.path)
    # Explicit single-path invocations must point at a readable file inside the
    # vault. --all only ever yields in-vault files, so this guard is explicit-only.
    # Without it a typo'd path exited 0 silently, and an out-of-vault path raised
    # a raw ValueError from relative_to().
    if args.path != "--all":
        target = pages[0].resolve()
        if not target.is_relative_to(VAULT_ROOT):
            log(f"ERR: {args.path} resolves outside the vault ({VAULT_ROOT}).")
            return EXIT_USAGE
        if not target.is_file():
            log(f"ERR: {args.path} is not a readable file.")
            return EXIT_PAGE_MISSING
        # Process the resolved path that was just validated. Handing the
        # unresolved path to process_page() would still raise ValueError from
        # relative_to() for a path reached through a symlink, and a path
        # containing ".." would hash to a different synthetic address than the
        # same page reached via its canonical path.
        pages = [target]
    # Filter to actual files up front so progress counter is meaningful
    # (v1.7.2; closes audit L2: tier-2 over 47 pages can take 5+ min — the
    # user needs a count, not just per-page log lines).
    files = [p for p in pages if p.is_file()]
    skipped_non_files = len(pages) - len(files)
    if skipped_non_files:
        log(f"({skipped_non_files} non-file paths skipped)")
    total = len(files)
    total_written = 0
    total_skipped = 0
    for i, page in enumerate(files, 1):
        result = process_page(
            page,
            force_synthetic=args.no_llm,
            rebuild=args.rebuild,
            peek=args.peek,
            allow_egress=args.allow_egress,
            progress_label=f"[{i}/{total}]",
        )
        total_written += len(result["written"])
        total_skipped += result["skipped"]
    log(f"\nDone. pages={total} chunks_written={total_written} chunks_unchanged={total_skipped}")
    return EXIT_OK


if __name__ == "__main__":
    sys.exit(main())
+229
View File
@@ -0,0 +1,229 @@
#!/usr/bin/env bash
# detect-transport.sh — discover which vault-mutation transports are available
# on this machine, write a normalized JSON snapshot to .vault-meta/transport.json,
# and pick a preferred transport per the v1.7 fallback chain.
#
# Fallback chain (highest to lowest precedence):
# 1. cli — Obsidian CLI binary (Obsidian 1.12+). No MCP server, no TLS, no plugin.
# 2. mcp-obsidian — REST-API-backed MCP server (Local REST API plugin required).
# 3. mcpvault — Filesystem-backed MCP server (BM25 search; no Obsidian plugin).
# 4. filesystem — Direct Read/Write/Edit tools. Always available (ultimate floor).
#
# MCP auto-detection is deferred to a v1.7.x patch (calling `claude mcp list` from
# inside a running claude session has reentrancy concerns). For v1.7, we detect
# CLI + filesystem and leave MCP fields as `{"present": null, "detection": "deferred"}`.
# Users with MCP transports configured can either edit transport.json manually or
# follow the legacy guidance in wiki/references/mcp-setup.md.
#
# Usage:
# ./scripts/detect-transport.sh # detect and write .vault-meta/transport.json
# ./scripts/detect-transport.sh --peek # print result to stdout without writing
# ./scripts/detect-transport.sh --force # refresh even if existing snapshot is fresh (<7d)
# ./scripts/detect-transport.sh --quiet # suppress informational stderr output
#
# Exit codes:
# 0 — success (transport.json written or peeked)
# 2 — vault-meta/ missing and cannot be created
# 3 — unrecognized flag
# Abort on any command failure, on use of an unset variable, and on a failure
# anywhere inside a pipeline.
set -euo pipefail
# The vault root is the parent of the directory that holds this script.
VAULT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
META_DIR="${VAULT_ROOT}/.vault-meta"
OUTPUT_FILE="${META_DIR}/transport.json"
# An existing snapshot modified within this many days is reused as-is
# (see the freshness check below).
STALE_AFTER_DAYS=7
# MODE is one of: write (default), peek (print only), force (skip freshness check).
MODE="write"
QUIET=false
while [ $# -gt 0 ]; do
  case "$1" in
    --peek) MODE="peek" ;;
    --force) MODE="force" ;;
    --quiet) QUIET=true ;;
    -h|--help)
      # Print lines 2-28 of this script with the leading "# " stripped.
      sed -n '2,28p' "$0" | sed 's/^# \{0,1\}//'
      exit 0
      ;;
    *)
      echo "ERR: unknown flag: $1" >&2
      exit 3
      ;;
  esac
  shift
done
# log: write the arguments to stderr unless --quiet was given. $QUIET holds the
# word "true" or "false", which is executed as the command of that name.
log() { $QUIET || echo "$@" >&2; }
# json_escape: read stdin and emit a JSON-encoded string (including the
# surrounding double quotes). Used for any untrusted value that lands in the
# transport.json heredoc — newlines, backslashes, control chars in upstream
# binaries (obsidian-cli --version) would otherwise break the JSON.
# Leading and trailing whitespace is stripped from the input before encoding.
json_escape() {
  python3 -c 'import json,sys; print(json.dumps(sys.stdin.read().strip()), end="")'
}
mkdir -p "$META_DIR" || {
  echo "ERR: cannot create .vault-meta/ at $META_DIR" >&2
  exit 2
}
# ── 0. Honor manual_override from existing transport.json ────────────────────
# Users can pin a non-detected transport (mcp-obsidian, mcpvault, or any custom
# value) by editing transport.json to set:
#   "manual_override": true
#   "preferred": "<their-choice>"
#   "fallback_chain": [...]
# Auto-detection still runs (to refresh CLI/Obsidian-running flags for visibility),
# but PREFERRED and CHAIN are preserved from the existing file across both the
# normal write path AND --force runs. Documented at
# wiki/references/transport-fallback.md §Manual override.
#
# The embedded Python prints two lines when manual_override is true:
#   line 1 = preferred
#   line 2 = the chain as comma-separated JSON string literals, ready to be
#            dropped between [ ] in the snapshot heredoc.
# Each chain entry goes through json.dumps so that quotes, backslashes or
# control characters in a hand-edited entry cannot produce invalid JSON on the
# next rewrite. A fallback_chain given as a bare string is treated as a
# one-element chain rather than being split into characters.
# Any parse failure prints nothing, which leaves the override disabled.
MANUAL_OVERRIDE_FLAG=false
MANUAL_OVERRIDE_PREFERRED=""
MANUAL_OVERRIDE_CHAIN=""
if [ -f "$OUTPUT_FILE" ]; then
  MANUAL_PARSE="$(python3 - "$OUTPUT_FILE" 2>/dev/null <<'PYEOF'
import json, sys
try:
    with open(sys.argv[1]) as fh:
        data = json.load(fh)
    if data.get("manual_override") is True:
        pref = data.get("preferred", "")
        chain = data.get("fallback_chain", [])
        if isinstance(chain, str):
            chain = [chain]
        print(pref)
        print(",".join(json.dumps(str(c)) for c in chain))
except Exception:
    pass
PYEOF
  )" || MANUAL_PARSE=""
  if [ -n "${MANUAL_PARSE:-}" ]; then
    MANUAL_OVERRIDE_FLAG=true
    MANUAL_OVERRIDE_PREFERRED="$(printf '%s\n' "$MANUAL_PARSE" | sed -n '1p')"
    MANUAL_OVERRIDE_CHAIN="$(printf '%s\n' "$MANUAL_PARSE" | sed -n '2p')"
    log "manual_override=true; preserving preferred=${MANUAL_OVERRIDE_PREFERRED}"
  fi
fi
# ── Freshness check: skip detection if snapshot is recent ────────────────────
# Applies to the default write mode only; --peek and --force always re-detect.
if [ "$MODE" = "write" ] && [ -f "$OUTPUT_FILE" ]; then
  # find prints the file only when it was modified less than STALE_AFTER_DAYS ago.
  if find "$OUTPUT_FILE" -mtime -${STALE_AFTER_DAYS} -print 2>/dev/null | grep -q .; then
    log "transport.json is fresh (<${STALE_AFTER_DAYS}d). Use --force to refresh."
    cat "$OUTPUT_FILE"
    exit 0
  fi
fi
# ── 1. CLI detection ─────────────────────────────────────────────────────────
CLI_PRESENT=false
CLI_BINARY=""
CLI_VERSION=""
CLI_VERSION_RAW=""
if command -v obsidian-cli >/dev/null 2>&1; then
  CLI_PRESENT=true
  CLI_BINARY="obsidian-cli"
  # Keep two views of the version: RAW for the human log line, JSON-escaped
  # for the transport.json heredoc. CLI_VERSION below is pre-quoted (includes
  # the surrounding double quotes), so the heredoc emits ${CLI_VERSION}
  # without wrapping quotes.
  CLI_VERSION_RAW="$(obsidian-cli --version 2>/dev/null | head -1 || echo unknown)"
  CLI_VERSION="$(printf '%s' "$CLI_VERSION_RAW" | json_escape || echo '"unknown"')"
elif command -v obsidian >/dev/null 2>&1; then
  # Obsidian 1.12+ ships `obsidian` as the CLI binary on some platforms.
  # We treat it as cli-capable only if `obsidian --version` exits successfully;
  # that is the single probe performed here.
  if obsidian --version >/dev/null 2>&1; then
    CLI_PRESENT=true
    CLI_BINARY="obsidian"
    CLI_VERSION_RAW="$(obsidian --version 2>/dev/null | head -1 || echo unknown)"
    CLI_VERSION="$(printf '%s' "$CLI_VERSION_RAW" | json_escape || echo '"unknown"')"
  fi
fi
# Fallback default when neither binary was found: must still be a valid JSON literal.
if [ -z "$CLI_VERSION" ]; then
  CLI_VERSION='""'
  CLI_VERSION_RAW=""
fi
# ── 2. Obsidian app running? (informational only; CLI works either way) ──────
OBSIDIAN_RUNNING=false
if command -v pgrep >/dev/null 2>&1; then
  # -f matches against the full command line, -i ignores case.
  # NOTE(review): this can also match unrelated processes whose command line
  # merely contains "obsidian" (for example this script started via a path
  # that includes the word) — confirm whether a stricter match is wanted.
  if pgrep -if 'obsidian' >/dev/null 2>&1; then
    OBSIDIAN_RUNNING=true
  fi
fi
# ── 3. Compute preferred + fallback chain ────────────────────────────────────
# CHAIN is a pre-formatted list of JSON string literals; the snapshot heredoc
# places it between [ ] verbatim.
if $CLI_PRESENT; then
  PREFERRED="cli"
  CHAIN='"cli", "filesystem"'
else
  PREFERRED="filesystem"
  CHAIN='"filesystem"'
fi
# ── 3b. Apply manual_override if it was parsed from the existing snapshot ────
# Auto-detected PREFERRED/CHAIN above are overridden so the user's pinned
# transport survives every refresh cycle including --force.
if $MANUAL_OVERRIDE_FLAG; then
  PREFERRED="$MANUAL_OVERRIDE_PREFERRED"
  CHAIN="$MANUAL_OVERRIDE_CHAIN"
fi
# ── 4. Build JSON snapshot ───────────────────────────────────────────────────
TIMESTAMP="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
HOSTNAME="$(hostname 2>/dev/null || echo unknown)"

# json_quote: emit "$1" as a JSON string literal, surrounding quotes included.
# Unlike json_escape it takes the value as an argument and does not strip
# whitespace, so a path is encoded exactly as given. If python3 is unavailable
# it falls back to plain double-quoting, which is what this script emitted
# before the value was escaped at all.
json_quote() {
  python3 -c 'import json,sys; print(json.dumps(sys.argv[1]), end="")' "$1" 2>/dev/null \
    || printf '"%s"' "$1"
}

# snapshot: print the transport.json document to stdout.
# Every free-form string (host name, vault path, preferred transport, binary
# name) is JSON-encoded first: a double quote or backslash in any of them, e.g.
# in the vault's directory name or a hand-edited "preferred" value, would
# otherwise yield a file that no JSON parser accepts. CLI_VERSION is already
# a JSON literal, and CHAIN is a pre-formatted list of JSON string literals.
snapshot() {
  local host_json vault_json preferred_json binary_json
  host_json="$(json_quote "$HOSTNAME")"
  vault_json="$(json_quote "$VAULT_ROOT")"
  preferred_json="$(json_quote "$PREFERRED")"
  binary_json="$(json_quote "$CLI_BINARY")"
  cat <<JSON
{
  "schema_version": 1,
  "detected_at": "${TIMESTAMP}",
  "host": ${host_json},
  "vault_root": ${vault_json},
  "manual_override": ${MANUAL_OVERRIDE_FLAG},
  "preferred": ${preferred_json},
  "fallback_chain": [${CHAIN}],
  "available": {
    "cli": {
      "present": ${CLI_PRESENT},
      "binary": ${binary_json},
      "version_string": ${CLI_VERSION},
      "obsidian_app_running": ${OBSIDIAN_RUNNING}
    },
    "filesystem": {
      "present": true,
      "vault_root": ${vault_json},
      "note": "ultimate fallback; uses Claude's Read/Write/Edit tools directly"
    },
    "mcp_obsidian": {
      "present": null,
      "detection": "deferred",
      "note": "v1.7 does not auto-detect MCP servers. Configure manually per wiki/references/mcp-setup.md and edit this file by hand if needed."
    },
    "mcpvault": {
      "present": null,
      "detection": "deferred",
      "note": "v1.7 does not auto-detect MCP servers. Configure manually per wiki/references/mcp-setup.md and edit this file by hand if needed."
    }
  }
}
JSON
}
# Peek mode: print the snapshot to stdout and leave transport.json untouched.
if [ "$MODE" = "peek" ]; then
  snapshot
  exit 0
fi
# Atomic write: stage to .tmp then rename. Avoids partial files if killed mid-write.
# The staging name includes this shell's PID ($$).
TMP="${OUTPUT_FILE}.$$.tmp"
# Remove the staging file if the script exits before the rename completes.
trap 'rm -f "$TMP"' EXIT
snapshot > "$TMP"
mv "$TMP" "$OUTPUT_FILE"
# The rename succeeded, so there is nothing left to clean up.
trap - EXIT
log "Wrote: ${OUTPUT_FILE}"
log "Preferred transport: ${PREFERRED}"
# The CLI line is logged only when a CLI binary was detected.
$CLI_PRESENT && log " CLI: ${CLI_BINARY} (${CLI_VERSION_RAW})"
log " Filesystem: always available (Read/Write/Edit tools)"
log " MCP: not auto-detected (see wiki/references/mcp-setup.md to configure)"
+312
View File
@@ -0,0 +1,312 @@
#!/usr/bin/env python3
"""rerank.py — query-time reranker for chunk candidates.
Takes a query string + a list of candidate chunks (from BM25, vector, or any
upstream stage) and reorders them using semantic similarity.
v1.7 strategy (in preference order, automatically chosen at runtime):
1. If ollama is reachable AND nomic-embed-text is pulled
→ embed the query, embed each candidate's contextualized_text,
rank by cosine. Caches per-chunk embeddings in
.vault-meta/embed-cache.json keyed by body_hash.
2. Otherwise
→ no-op rerank: return candidates in input order with a synthesized
note. Caller (retrieve.py) still gets a useful result; downstream
drill-into-page logic is unchanged.
Future v1.7.x upgrade paths:
- Cross-encoder reranker (sentence-transformers BGE-base) if installed
- Cohere Rerank API if COHERE_API_KEY set
- Voyage Rerank API if VOYAGE_API_KEY set
Mirrors the localhost-only OLLAMA_URL guard from scripts/tiling-check.py:
remote ollama endpoints require --allow-remote-ollama because page bodies
are POSTed as embedding input.
Usage:
rerank.py "query string" --candidates candidates.json [--top 5]
rerank.py "query string" --candidates - --top 5 # stdin
rerank.py --peek "query string" # show strategy chosen
Candidates JSON shape:
[{"chunk_id": "c-000042:3", "path": ".vault-meta/chunks/.../chunk-003.json", "score": 7.1}, ...]
Output: ranked candidates with `rerank_score` added.
Exit codes:
0 — success
2 — usage error
3 — candidate input malformed
10 — ollama unreachable (no-op rerank performed, exit 0 with note)
11 — model not pulled (no-op rerank performed, exit 0 with note)
"""
import argparse
import fcntl
import json
import math
import os
import shutil
import sys
import time
import urllib.error
import urllib.parse
import urllib.request
from datetime import datetime, timezone
from pathlib import Path
VAULT_ROOT = Path(__file__).resolve().parent.parent
META_DIR = VAULT_ROOT / ".vault-meta"
EMBED_CACHE_PATH = META_DIR / "embed-cache.json"
CACHE_LOCK = META_DIR / ".embed-cache.lock"
DEFAULT_OLLAMA_URL = "http://127.0.0.1:11434"
DEFAULT_MODEL = "nomic-embed-text"
OLLAMA_TIMEOUT_SEC = 3
EMBED_TIMEOUT_SEC = 30
MAX_RESPONSE_BYTES = 4 * 1024 * 1024
EXIT_OK = 0
EXIT_USAGE = 2
EXIT_CANDIDATES = 3
EXIT_NO_OLLAMA = 10
EXIT_NO_MODEL = 11
def log(msg):
    """Write `msg` plus a newline to stderr, keeping stdout free for results."""
    sys.stderr.write(str(msg) + "\n")
def cosine(a, b):
    """Cosine similarity of two equal-length numeric sequences.

    Returns 0.0 instead of raising when either sequence is empty, when the
    lengths differ, or when either vector has zero magnitude.
    """
    if not a or not b:
        return 0.0
    if len(a) != len(b):
        return 0.0
    norm_a = math.sqrt(sum(v * v for v in a))
    norm_b = math.sqrt(sum(v * v for v in b))
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return sum(x * y for x, y in zip(a, b)) / (norm_a * norm_b)
def ollama_url(allow_remote):
    """Return the Ollama base URL, refusing non-local hosts by default.

    The URL comes from the OLLAMA_URL environment variable (falling back to
    DEFAULT_OLLAMA_URL) with trailing slashes removed. Unless `allow_remote`
    is true, a host other than 127.0.0.1, localhost or ::1 is rejected: the
    reason and the remedies are logged and the process exits with EXIT_USAGE.
    """
    endpoint = os.environ.get("OLLAMA_URL", DEFAULT_OLLAMA_URL).rstrip("/")
    if allow_remote:
        return endpoint
    hostname = urllib.parse.urlparse(endpoint).hostname or ""
    if hostname in ("127.0.0.1", "localhost", "::1"):
        return endpoint
    log(f"ERR: OLLAMA_URL={endpoint} points off-localhost (host={hostname!r}).")
    log(" Either: (a) run ollama locally — `systemctl --user start ollama` or `ollama serve`")
    log(" Or: (b) pass --allow-remote-ollama through retrieve.py, which forwards it here.")
    log(" Or: (c) unset OLLAMA_URL to fall back to the local default (127.0.0.1:11434).")
    sys.exit(EXIT_USAGE)
def ollama_alive(url):
    """Probe the Ollama server at `url` and list the models it reports.

    Issues GET {url}/api/tags with a short timeout and reads at most
    MAX_RESPONSE_BYTES of the reply.

    Returns:
        (True, [model names with any ":tag" suffix removed]) when the server
        answers with a JSON object; (False, []) when it is unreachable or the
        reply is not usable. This is a liveness probe, so it never raises for
        a bad reply.
    """
    try:
        req = urllib.request.Request(f"{url}/api/tags", method="GET")
        with urllib.request.urlopen(req, timeout=OLLAMA_TIMEOUT_SEC) as resp:
            data = json.loads(resp.read(MAX_RESPONSE_BYTES))
    except (OSError, ValueError):
        # OSError covers urllib.error.URLError and socket timeouts; ValueError
        # covers json.JSONDecodeError and the UnicodeDecodeError raised for
        # bytes that are not valid UTF-8/16/32.
        return False, []
    if not isinstance(data, dict):
        # Something answered, but not with the object shape expected here.
        return False, []
    models = [
        str(m.get("name") or "").split(":")[0]
        for m in data.get("models") or []
        if isinstance(m, dict)
    ]
    return True, models
def embed_one(url, model, text):
    """Request one embedding from Ollama and return it as a list.

    POSTs {"model", "prompt"} as JSON to {url}/api/embeddings and reads at
    most MAX_RESPONSE_BYTES of the reply. Returns the reply's "embedding"
    value, or [] when that key is missing or empty. Network and JSON errors
    are not handled here and propagate to the caller.
    """
    request = urllib.request.Request(
        f"{url}/api/embeddings",
        data=json.dumps({"model": model, "prompt": text}).encode("utf-8"),
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(request, timeout=EMBED_TIMEOUT_SEC) as response:
        raw = response.read(MAX_RESPONSE_BYTES)
    vector = json.loads(raw).get("embedding")
    return vector if vector else []
def load_cache():
    """Load the embedding cache from EMBED_CACHE_PATH.

    Returns:
        The cached dict, or an empty dict when the file is missing,
        unreadable, not valid UTF-8, not valid JSON, or holds a JSON value
        that is not an object. The cache is disposable, so any kind of
        corruption is treated as "start over" rather than as an error.
    """
    try:
        cache = json.loads(EMBED_CACHE_PATH.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError: missing or unreadable file. ValueError: json.JSONDecodeError
        # and UnicodeDecodeError.
        return {}
    return cache if isinstance(cache, dict) else {}
def save_cache(cache):
    """Persist the embed cache atomically.

    v1.7.2 / closes audit M7: previously used blocking fcntl.LOCK_EX with no
    timeout, which could hang indefinitely on a non-flock-capable filesystem
    (some NFS mounts, network shares, FUSE backends without lock support).
    Uses LOCK_NB with up to 3 attempts, then falls back to writing without the
    lock (with a WARN) so the rerank pipeline never hangs the user's session.
    The temp + os.replace pattern provides write atomicity even without the
    lock; the lock only serializes concurrent writers.

    A flock failure other than "would block" (e.g. the filesystem does not
    support locking) also takes the unlocked fallback immediately instead of
    propagating, and the temp file is removed if the write fails.
    """
    META_DIR.mkdir(parents=True, exist_ok=True)
    fd = os.open(str(CACHE_LOCK), os.O_CREAT | os.O_WRONLY, 0o644)
    locked = False
    reason = "unavailable after 3 tries"
    try:
        for attempt in range(3):
            try:
                fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
                locked = True
                break
            except BlockingIOError:
                # Another writer holds the lock; pause only if we will retry.
                if attempt < 2:
                    time.sleep(0.1)
            except OSError as exc:
                # Locking is not supported here; retrying cannot help.
                reason = f"unsupported ({exc})"
                break
        if not locked:
            msg = (f"WARN: rerank embed-cache lock {reason}; "
                   "writing unlocked (atomic via temp+rename). Concurrent writers "
                   "may overwrite each other's last update.")
            log(msg)
            # v1.9.1 / closes audit Data M1: also route to .vault-meta/hook.log so
            # the user sees the event via wiki-lint (stderr alone is invisible to
            # most callers; this matches the hook's logging shape).
            try:
                META_DIR.mkdir(parents=True, exist_ok=True)
                hook_log = META_DIR / "hook.log"
                ts = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
                with hook_log.open("a", encoding="utf-8") as fh:
                    fh.write(f"{ts} rerank embed-cache lock unavailable; wrote unlocked\n")
            except OSError:
                pass  # never block on a logging failure
        # Per-PID temp name so concurrent writers never share a temp file.
        tmp = EMBED_CACHE_PATH.with_suffix(f".{os.getpid()}.tmp")
        try:
            tmp.write_text(json.dumps(cache, ensure_ascii=False), encoding="utf-8")
            os.replace(tmp, EMBED_CACHE_PATH)
        except Exception:
            # Do not leave a stale temp file behind on a failed write.
            try:
                tmp.unlink()
            except OSError:
                pass
            raise
    finally:
        if locked:
            try:
                fcntl.flock(fd, fcntl.LOCK_UN)
            except OSError:
                pass
        os.close(fd)
def load_chunk(chunk_rel_path):
    """Load one chunk JSON file given its vault-relative path.

    Returns:
        The parsed chunk dict, or ``None`` when the path is unusable, the file
        is missing/unreadable, the content is not valid UTF-8 JSON, or the
        JSON root is not an object.
    """
    try:
        raw = (VAULT_ROOT / chunk_rel_path).read_text(encoding="utf-8")
        chunk = json.loads(raw)
    # TypeError: non-path argument (e.g. a null "path" in a candidate).
    # ValueError: covers JSONDecodeError and UnicodeDecodeError.
    except (OSError, ValueError, TypeError):
        return None
    return chunk if isinstance(chunk, dict) else None
def _fallback_score(candidate):
    """Return the upstream score of a candidate for use when cosine is unavailable.

    Accepts either ``score`` (bm25-index output) or ``bm25_score`` (the key
    retrieve.py uses); returns 0.0 when neither holds a usable number.
    """
    for key in ("score", "bm25_score"):
        value = candidate.get(key)
        if value is None:
            continue
        try:
            return float(value)
        except (TypeError, ValueError):
            continue
    return 0.0


def _noop_rerank(candidates, source, top_k):
    """Tag every candidate with its upstream score and ``source``; keep input order."""
    for c in candidates:
        c["rerank_score"] = _fallback_score(c)
        c["rerank_source"] = source
    return candidates[:top_k]


def rerank(query, candidates, top_k=5, allow_remote=False):
    """Rerank candidates by cosine similarity between query and chunk embeddings.

    Args:
        query: Query text to embed.
        candidates: List of dicts; each is mutated in place to gain
            ``rerank_score`` and ``rerank_source``. ``path`` names the chunk
            JSON file relative to the vault root.
        top_k: Maximum number of candidates returned.
        allow_remote: Passed to ``ollama_url``; permits a non-local endpoint.

    Returns:
        The candidates sorted by ``rerank_score`` (descending) and truncated
        to ``top_k``. When ollama is unreachable, the model is not present, or
        the query cannot be embedded, the input order is kept and each
        candidate is tagged with a ``noop-*`` source and its upstream score.
    """
    url = ollama_url(allow_remote)
    alive, models = ollama_alive(url)
    if not alive:
        log("ollama unreachable — no-op rerank")
        return _noop_rerank(candidates, "noop-no-ollama", top_k)
    if DEFAULT_MODEL not in models:
        log(f"model {DEFAULT_MODEL} not pulled — no-op rerank")
        return _noop_rerank(candidates, "noop-no-model", top_k)
    try:
        q_emb = embed_one(url, DEFAULT_MODEL, query)
    except Exception as e:
        log(f"query embed failed: {e}")
        return _noop_rerank(candidates, "noop-embed-error", top_k)
    if not q_emb:
        # An empty query vector would score every chunk 0.0 under a "cosine"
        # label; report it as the embed failure it is.
        log("query embed failed: empty embedding returned")
        return _noop_rerank(candidates, "noop-embed-error", top_k)

    cache = load_cache()
    cache_dirty = False
    for c in candidates:
        chunk = load_chunk(c.get("path", ""))
        if not chunk:
            c["rerank_score"] = 0.0
            c["rerank_source"] = "missing-chunk"
            continue
        body_hash = chunk.get("body_hash", "")
        # Without a body hash there is no safe cache key: every such chunk
        # would share "<model>:" and reuse one another's vectors.
        cache_key = f"{DEFAULT_MODEL}:{body_hash}" if body_hash else None
        emb = cache.get(cache_key) if cache_key else None
        if not emb:
            text = chunk.get("contextualized_text") or chunk.get("raw_text", "")
            try:
                emb = embed_one(url, DEFAULT_MODEL, text)
            except Exception as e:
                log(f"embed failed for {c.get('chunk_id')}: {e}")
                # NOTE(review): this mixes an upstream score with cosine scores
                # in one sort; kept as-is to preserve existing ranking.
                c["rerank_score"] = float(c.get("score", 0.0))
                c["rerank_source"] = "embed-error"
                continue
            if emb and cache_key:
                # Never persist empty vectors; they would be re-embedded anyway.
                cache[cache_key] = emb
                cache_dirty = True
        c["rerank_score"] = cosine(q_emb, emb)
        c["rerank_source"] = f"cosine:{DEFAULT_MODEL}"
    if cache_dirty:
        save_cache(cache)
    ranked = sorted(candidates, key=lambda x: x.get("rerank_score", 0.0), reverse=True)
    return ranked[:top_k]
def main():
    """CLI entry point: rerank a JSON list of candidates for a query.

    Returns:
        EXIT_OK on success; EXIT_USAGE when required arguments are missing;
        EXIT_CANDIDATES when the candidates input cannot be read or is not a
        JSON list of objects.
    """
    parser = argparse.ArgumentParser(description="Rerank chunk candidates by semantic similarity.")
    parser.add_argument("query", nargs="?", help="Query text")
    parser.add_argument("--candidates", help="Path to candidates JSON or `-` for stdin",
                        default=None)
    parser.add_argument("--top", type=int, default=5, help="Top-K to return")
    parser.add_argument("--peek", action="store_true",
                        help="Print rerank strategy chosen and exit")
    parser.add_argument("--allow-remote-ollama", action="store_true",
                        help="Accept non-localhost OLLAMA_URL (potential data exfil)")
    args = parser.parse_args()

    if args.peek:
        if not args.query:
            log("--peek needs a query string")
            return EXIT_USAGE
        url = ollama_url(args.allow_remote_ollama)
        alive, models = ollama_alive(url)
        strategy = "noop-no-ollama"
        if alive:
            strategy = f"cosine:{DEFAULT_MODEL}" if DEFAULT_MODEL in models else "noop-no-model"
        print(json.dumps({
            "query": args.query,
            "strategy": strategy,
            "ollama_url": url,
            "ollama_alive": alive,
            "model_present": DEFAULT_MODEL in models,
            "checked_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
        }, indent=2))
        return EXIT_OK

    if not args.query or args.candidates is None:
        log("usage: rerank.py <query> --candidates <path|-> [--top N]")
        return EXIT_USAGE

    try:
        if args.candidates == "-":
            cand_text = sys.stdin.read()
        else:
            cand_text = Path(args.candidates).read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as e:
        # A missing or unreadable file is an input problem, not a crash.
        log(f"ERR: cannot read candidates: {e}")
        return EXIT_CANDIDATES

    try:
        candidates = json.loads(cand_text)
        if not isinstance(candidates, list):
            raise ValueError("candidates must be a JSON list")
        # rerank() reads and writes keys on each item, so each must be an object.
        if not all(isinstance(c, dict) for c in candidates):
            raise ValueError("every candidate must be a JSON object")
    except (json.JSONDecodeError, ValueError) as e:
        log(f"ERR: bad candidates JSON: {e}")
        return EXIT_CANDIDATES

    result = rerank(args.query, candidates, top_k=args.top,
                    allow_remote=args.allow_remote_ollama)
    print(json.dumps(result, indent=2))
    return EXIT_OK


if __name__ == "__main__":
    sys.exit(main())
+195
View File
@@ -0,0 +1,195 @@
#!/usr/bin/env python3
"""retrieve.py — hybrid retrieval orchestrator for the Compound Vault.
Pipeline (v1.7):
query → bm25-index.py query (top-K candidates by BM25 over contextualized chunks)
→ rerank.py (cosine on nomic-embed-text vectors via ollama,
or no-op if ollama unavailable)
→ drill (return chunk pages with absolute paths so the
caller can Read them and synthesize)
Loads sibling scripts as Python modules (no subprocess overhead). Falls back
gracefully when index or rerank stage is missing:
- If .vault-meta/bm25/index.json is absent → exit 10 with friendly message;
caller falls back to v1.6 legacy
hot→index→drill read order.
- If .vault-meta/chunks/ is empty → exit 10 (same).
- If rerank stage cannot embed (no ollama) → no-op rerank, returns BM25 order.
Output schema (JSON to stdout):
{
"query": "...",
"strategy": "bm25+rerank:<rerank_source>" (e.g. "bm25+rerank:cosine:nomic-embed-text", "bm25+rerank:noop-no-ollama") | "bm25-only" (with --no-rerank),
"top_k": 5,
"candidates": [
{
"chunk_id": "c-000042:3",
"page_address": "c-000042",
"page_path": "wiki/concepts/Foo.md",
"absolute_path": "/abs/path/to/wiki/concepts/Foo.md",
"chunk_index": 3,
"bm25_score": 7.12,
"rerank_score": 0.81,
"rerank_source": "cosine:nomic-embed-text",
"snippet": "... first 200 chars of the chunk ..."
},
...
]
}
Usage:
retrieve.py "your query here" # standard: BM25 top-20, rerank to top-5
retrieve.py "query" --top 10 # change result count
retrieve.py "query" --no-rerank # skip rerank, BM25-only
retrieve.py "query" --explain # include per-stage diagnostics
Exit codes:
0 — success
2 — usage error
10 — feature not provisioned (no chunks or no BM25 index); caller falls back
"""
import argparse
import importlib.util
import json
import sys
from pathlib import Path
# The vault root is the parent of the directory that contains this script.
VAULT_ROOT = Path(__file__).resolve().parent.parent
SCRIPTS_DIR = VAULT_ROOT / "scripts"  # sibling helpers loaded by import_sibling()
META_DIR = VAULT_ROOT / ".vault-meta"
CHUNKS_DIR = META_DIR / "chunks"  # must exist and be non-empty for retrieval to run
BM25_INDEX = META_DIR / "bm25" / "index.json"  # must exist for retrieval to run
# Process exit codes.
EXIT_OK = 0
EXIT_USAGE = 2
EXIT_NOT_PROVISIONED = 10  # missing index, chunks, or sibling helper
def log(msg):
    """Write one diagnostic line to stderr, leaving stdout free for JSON output."""
    sys.stderr.write(f"{msg}\n")
def import_sibling(name, filename):
    """Load a sibling script from SCRIPTS_DIR as a module named ``name``.

    The helpers have hyphenated filenames, so they are loaded by path through
    importlib rather than with a normal ``import``. A missing file, or an
    ImportError/SyntaxError/AttributeError while loading (v1.7.2; closes audit
    M5), logs a diagnostic and exits with EXIT_NOT_PROVISIONED instead of
    surfacing a bare traceback.
    """
    helper_path = SCRIPTS_DIR / filename
    if not helper_path.is_file():
        log(f"ERR: sibling helper {filename} not found at {helper_path}")
        log(" Run `bash bin/setup-retrieve.sh --check` to verify the install.")
        sys.exit(EXIT_NOT_PROVISIONED)
    try:
        spec = importlib.util.spec_from_file_location(name, helper_path)
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)
    except (ImportError, SyntaxError, AttributeError) as e:
        for line in (
            f"ERR: failed to import sibling helper {filename}: {type(e).__name__}: {e}",
            " This likely means the helper script is corrupted or has a syntax error.",
            " Run `python3 scripts/<helper>.py --help` directly to see the underlying error.",
            " If it persists: re-clone the repo or check `git status` for local damage.",
        ):
            log(line)
        sys.exit(EXIT_NOT_PROVISIONED)
    else:
        return module
def chunk_snippet(chunk_data, max_chars=200):
    """Return a short preview of a chunk's ``raw_text``.

    Text of at most ``max_chars`` characters is returned unchanged. Longer
    text is cut to ``max_chars``, stripped of trailing whitespace, and given
    a "…" suffix so readers can tell it was truncated. A missing or null
    ``raw_text`` yields "".
    """
    text = chunk_data.get("raw_text") or ""
    if len(text) <= max_chars:
        return text
    return text[:max_chars].rstrip() + "…"
def main():
    """CLI entry point: BM25 candidates -> optional rerank -> per-page dedupe.

    Chunks are collapsed to the best-scoring chunk per page BEFORE truncating
    to ``--top``, so several chunks of one page cannot crowd other pages out
    of the final result.

    Returns:
        EXIT_OK on success; EXIT_NOT_PROVISIONED when the BM25 index or the
        chunk store is missing.
    """
    parser = argparse.ArgumentParser(description="Hybrid retrieval over the vault.")
    parser.add_argument("query", help="Natural-language query")
    parser.add_argument("--top", type=int, default=5, help="Final result count (post-rerank)")
    parser.add_argument("--bm25-top", type=int, default=20,
                        help="Candidate count from BM25 (pre-rerank)")
    parser.add_argument("--no-rerank", action="store_true",
                        help="Skip the rerank stage; return BM25-only")
    parser.add_argument("--explain", action="store_true",
                        help="Include per-stage diagnostics in output")
    parser.add_argument("--allow-remote-ollama", action="store_true",
                        help="Forwarded to rerank.py")
    args = parser.parse_args()

    if not BM25_INDEX.is_file():
        log(f"ERR: no BM25 index at {BM25_INDEX}. Run `bash bin/setup-retrieve.sh` "
            "to provision, or fall back to legacy hot→index→drill.")
        return EXIT_NOT_PROVISIONED
    if not CHUNKS_DIR.is_dir() or not any(CHUNKS_DIR.iterdir()):
        log(f"ERR: no chunks at {CHUNKS_DIR}. Run "
            "`python3 scripts/contextual-prefix.py --all` first.")
        return EXIT_NOT_PROVISIONED

    bm25 = import_sibling("bm25_index", "bm25-index.py")
    reranker = import_sibling("rerank", "rerank.py")
    bm25_hits = bm25.query(args.query, top_k=args.bm25_top)
    log(f"bm25: {len(bm25_hits)} hits")

    candidates = []
    for h in bm25_hits:
        chunk_path = VAULT_ROOT / h["path"]
        try:
            chunk = json.loads(chunk_path.read_text(encoding="utf-8"))
        # ValueError covers both malformed JSON and undecodable bytes.
        except (OSError, ValueError):
            continue
        if not isinstance(chunk, dict):
            continue
        page_path = chunk.get("page_path") or ""
        candidates.append({
            "chunk_id": h["chunk_id"],
            "page_address": chunk.get("page_address"),
            "page_path": chunk.get("page_path"),
            "absolute_path": str((VAULT_ROOT / page_path).resolve()),
            "chunk_index": chunk.get("chunk_index"),
            "bm25_score": h["score"],
            "path": h["path"],
            "snippet": chunk_snippet(chunk),
        })

    if args.no_rerank:
        final = candidates
        strategy = "bm25-only"
        for c in final:
            c["rerank_score"] = c["bm25_score"]
            c["rerank_source"] = "skipped"
    else:
        # Score every candidate; truncation happens after the page dedupe below.
        final = reranker.rerank(
            args.query, candidates, top_k=len(candidates),
            allow_remote=args.allow_remote_ollama,
        )
        # Derive strategy from first candidate's rerank_source
        first_src = (final[0].get("rerank_source") if final else "unknown")
        strategy = f"bm25+rerank:{first_src}"

    # Dedupe by page (we may have multiple chunks of the same page; collapse to best)
    by_page = {}
    for c in final:
        addr = c.get("page_address")
        if addr not in by_page or c.get("rerank_score", 0) > by_page[addr].get("rerank_score", 0):
            by_page[addr] = c
    deduped = sorted(by_page.values(), key=lambda c: c.get("rerank_score", 0), reverse=True)

    out = {
        "query": args.query,
        "strategy": strategy,
        "top_k": args.top,
        "candidates": deduped[:args.top],
    }
    if args.explain:
        out["explain"] = {
            "bm25_candidate_count": len(bm25_hits),
            # Number of candidates scored by the rerank stage (before dedupe).
            "post_rerank_count": len(final),
            "deduped_count": len(deduped),
            "bm25_top_param": args.bm25_top,
        }
    print(json.dumps(out, indent=2, ensure_ascii=False))
    return EXIT_OK


if __name__ == "__main__":
    sys.exit(main())
+496
View File
@@ -0,0 +1,496 @@
#!/usr/bin/env python3
"""tiling-check.py — DragonScale Mechanism 3: semantic tiling lint.
Computes per-page embeddings via a local ollama instance and reports
candidate duplicate page pairs. Read-only; never modifies wiki pages.
Security model:
- Defaults to http://127.0.0.1:11434. Remote ollama endpoints require
--allow-remote-ollama explicitly (vault bodies are POSTed as embedding
input; a hostile env var would otherwise exfiltrate content).
- Rejects symlinked page files to prevent escape outside the vault root.
Feature-gated: exits 10 if ollama is unreachable or 11 if the embedding
model is not pulled, so the calling skill can no-op gracefully. Exits 0
on success. Exit 2 on usage error. Exit 3 on cache corruption. Exit 4 when
the number of candidate pages exceeds the hard scale limit.
Concurrency:
- Locks `.vault-meta/.tiling.lock` (flock exclusive) around cache I/O.
- Per-PID temp file to avoid shared-tempfile races.
Usage:
tiling-check.py # run; exit 10/11 if ollama/model missing
tiling-check.py --report PATH # also write report to PATH
tiling-check.py --rebuild-cache # ignore cached embeddings
tiling-check.py --peek # structured diagnostics; no compute
tiling-check.py --allow-remote-ollama # accept non-localhost OLLAMA_URL
"""
import argparse
import fcntl
import hashlib
import json
import math
import os
import re
import sys
import urllib.error
import urllib.parse
import urllib.request
from datetime import datetime
from pathlib import Path
# Endpoint and model used when OLLAMA_URL / --model are not supplied.
DEFAULT_OLLAMA_URL = "http://127.0.0.1:11434"
DEFAULT_MODEL = "nomic-embed-text"
OLLAMA_TIMEOUT_SEC = 3  # timeout (seconds) for the /api/version and /api/tags probes
EMBED_TIMEOUT_SEC = 30  # timeout (seconds) for one /api/embeddings request
MAX_RESPONSE_BYTES = 4 * 1024 * 1024 # 4 MB; embeddings can be ~10 KB each
# Filesystem layout, all relative to the parent of this script's directory.
VAULT_ROOT = Path(__file__).resolve().parent.parent
WIKI_DIR = VAULT_ROOT / "wiki"
META_DIR = VAULT_ROOT / ".vault-meta"
CACHE_PATH = META_DIR / "tiling-cache.json"
CACHE_LOCK = META_DIR / ".tiling.lock"
THRESHOLDS_PATH = META_DIR / "tiling-thresholds.json"
# Pages excluded from the check, by frontmatter type, file name, or path prefix.
EXCLUDE_TYPES = {"meta", "fold"}
EXCLUDE_FILENAMES = {
    "_index.md", "index.md", "log.md", "hot.md", "overview.md",
    "dashboard.md", "Wiki Map.md", "getting-started.md",
}
EXCLUDE_PATH_PREFIXES = ("wiki/folds/", "wiki/meta/")
MAX_BODY_BYTES = 128 * 1024  # pages whose UTF-8 text is larger are skipped as "too_large"
SCALE_WARN_PAGES = 500  # above this many .md files a warning is logged
SCALE_HARD_FAIL_PAGES = 5000  # above this many .md files the run aborts
# Process exit codes.
EXIT_OK = 0
EXIT_USAGE = 2
EXIT_CACHE_CORRUPT = 3
EXIT_SCALE_EXCEEDED = 4
EXIT_NO_OLLAMA = 10
EXIT_NO_MODEL = 11
# Frontmatter must start at the very first byte and use "\n" line endings.
FRONTMATTER_RE = re.compile(r"^---\n(.*?)\n---\n", re.DOTALL)
# NOTE(review): "\s*" also matches a newline, so an empty "type:" line would
# capture the first token of the following line — confirm this is acceptable.
TYPE_RE = re.compile(r"^type:\s*(\S+)", re.MULTILINE)
def log(msg: str) -> None:
    """Write one diagnostic line to stderr, leaving stdout free for the report."""
    sys.stderr.write(f"{msg}\n")
def _is_local_url(url: str) -> bool:
    """Return True when the URL's host is 127.0.0.1, localhost, or ::1.

    A URL that cannot be parsed, or that has no host, counts as non-local.
    """
    loopback_hosts = ("127.0.0.1", "localhost", "::1")
    try:
        parsed = urllib.parse.urlparse(url)
        hostname = parsed.hostname or ""
    except ValueError:
        return False
    return hostname in loopback_hosts
def _http_get_json(url: str, timeout: float) -> dict:
    """GET ``url`` and decode the body as UTF-8 JSON.

    Raises:
        RuntimeError: the body is larger than MAX_RESPONSE_BYTES.
    """
    limit = MAX_RESPONSE_BYTES
    with urllib.request.urlopen(url, timeout=timeout) as resp:
        # Read one byte past the limit so an oversized body is detectable.
        payload = resp.read(limit + 1)
    if len(payload) > limit:
        raise RuntimeError("response exceeded size limit")
    return json.loads(payload.decode("utf-8"))
def _http_post_json(url: str, payload: dict, timeout: float) -> dict:
    """POST ``payload`` as JSON to ``url`` and decode the UTF-8 JSON reply.

    Raises:
        RuntimeError: the reply is larger than MAX_RESPONSE_BYTES.
    """
    limit = MAX_RESPONSE_BYTES
    request = urllib.request.Request(
        url,
        data=json.dumps(payload).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(request, timeout=timeout) as resp:
        # Read one byte past the limit so an oversized reply is detectable.
        reply = resp.read(limit + 1)
    if len(reply) > limit:
        raise RuntimeError("response exceeded size limit")
    return json.loads(reply.decode("utf-8"))
def detect_ollama(url: str) -> bool:
    """Return True when ``{url}/api/version`` answers with decodable JSON."""
    probe_errors = (urllib.error.URLError, OSError, ValueError, TimeoutError, RuntimeError)
    try:
        _http_get_json(f"{url}/api/version", OLLAMA_TIMEOUT_SEC)
    except probe_errors:
        return False
    return True
def detect_model(url: str, model: str) -> bool:
    """Return True when ``{url}/api/tags`` lists ``model``.

    A listed name matches when it equals ``model`` exactly or starts with
    ``"<model>:"`` (i.e. the same model with a tag). Any request/decoding
    failure, or a response that is not a JSON object with a ``models`` list,
    yields False rather than raising.
    """
    try:
        data = _http_get_json(f"{url}/api/tags", OLLAMA_TIMEOUT_SEC)
    except (urllib.error.URLError, OSError, ValueError, TimeoutError, RuntimeError):
        return False
    # The body may be any JSON value; only an object can carry "models".
    if not isinstance(data, dict):
        return False
    models = data.get("models")
    if not isinstance(models, list):
        return False
    tagged_prefix = f"{model}:"
    for entry in models:
        if not isinstance(entry, dict):
            continue
        name = entry.get("name", "")
        if isinstance(name, str) and (name == model or name.startswith(tagged_prefix)):
            return True
    return False
def parse_frontmatter(text: str) -> tuple[dict, str]:
    """Split a page into (frontmatter fields, body).

    Only the ``type`` field is extracted, with surrounding whitespace and
    quotes removed. When the text has no leading frontmatter block, returns
    ``({}, text)`` unchanged.
    """
    match = FRONTMATTER_RE.match(text)
    if match is None:
        return {}, text
    fields: dict = {}
    type_match = TYPE_RE.search(match.group(1))
    if type_match is not None:
        fields["type"] = type_match.group(1).strip().strip('"').strip("'")
    return fields, text[match.end():]
def body_hash(body: str, model: str) -> str:
    """Return the SHA-256 hex digest of ``"model=<model>\\n"`` followed by the body.

    The model name is part of the digest, so the same body hashes differently
    under a different model.
    """
    digest = hashlib.sha256(f"model={model}\n".encode("utf-8"))
    digest.update(body.encode("utf-8"))
    return digest.hexdigest()
def cosine(a: list[float], b: list[float]) -> float:
    """Return the cosine similarity of two vectors of equal length.

    Returns 0.0 when either vector has zero magnitude.

    Raises:
        ValueError: the vectors differ in length.
    """
    len_a, len_b = len(a), len(b)
    if len_a != len_b:
        raise ValueError(f"dim mismatch: {len_a} vs {len_b}")
    norm_a = math.sqrt(sum(v * v for v in a))
    norm_b = math.sqrt(sum(v * v for v in b))
    if norm_a == 0.0 or norm_b == 0.0:
        return 0.0
    dot = sum(p * q for p, q in zip(a, b))
    return dot / (norm_a * norm_b)
def _lock_cache():
    """Take an exclusive, blocking flock on CACHE_LOCK and return its descriptor.

    Creates META_DIR and the lock file when absent. If flock raises OSError
    the descriptor is closed before the error is re-raised. Pass the returned
    descriptor to ``_unlock_cache``.
    """
    META_DIR.mkdir(exist_ok=True)
    lock_fd = os.open(str(CACHE_LOCK), os.O_CREAT | os.O_RDWR, 0o644)
    try:
        fcntl.flock(lock_fd, fcntl.LOCK_EX)
    except OSError:
        os.close(lock_fd)
        raise
    else:
        return lock_fd
def _unlock_cache(fd: int) -> None:
    """Release the flock held on ``fd`` and close the descriptor.

    The descriptor is closed even when the unlock call raises.
    """
    try:
        fcntl.flock(fd, fcntl.LOCK_UN)
    finally:
        # Always give the descriptor back, whatever the unlock outcome.
        os.close(fd)
def load_cache(current_model: str) -> dict:
    """Load the tiling embedding cache for ``current_model``.

    Returns:
        The cache dict (``version``, ``model``, ``embeddings``). A fresh empty
        cache is returned when the file does not exist or was built with a
        different model.

    Exits with EXIT_CACHE_CORRUPT when the file cannot be read or parsed,
    its root is not a JSON object, its version is not 1, or ``embeddings``
    is not a dict.
    """
    empty = {"version": 1, "model": current_model, "embeddings": {}}
    if not CACHE_PATH.exists():
        return empty
    try:
        with CACHE_PATH.open(encoding="utf-8") as f:
            data = json.load(f)
    # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
    except (OSError, ValueError) as exc:
        log(f"ERR: cache read failed: {exc}")
        sys.exit(EXIT_CACHE_CORRUPT)
    if not isinstance(data, dict):
        # Valid JSON but not an object (e.g. a list): corrupt, not a crash.
        log("ERR: cache root is not a JSON object")
        sys.exit(EXIT_CACHE_CORRUPT)
    if data.get("version") != 1:
        log(f"ERR: unknown cache version: {data.get('version')}")
        sys.exit(EXIT_CACHE_CORRUPT)
    cached_model = data.get("model", "")
    if cached_model != current_model:
        log(f"INFO: cached model '{cached_model}' differs from current '{current_model}'; invalidating cache")
        return empty
    if not isinstance(data.get("embeddings"), dict):
        log("ERR: cache.embeddings is not a dict")
        sys.exit(EXIT_CACHE_CORRUPT)
    return data
def save_cache(cache: dict) -> None:
    """Write the cache as indented JSON via a per-PID temp file and a rename.

    The temp file lives next to CACHE_PATH and is moved over it once fully
    written.
    """
    META_DIR.mkdir(exist_ok=True)
    scratch = CACHE_PATH.with_name(f"{CACHE_PATH.stem}.{os.getpid()}.tmp")
    with scratch.open("w") as handle:
        json.dump(cache, handle, indent=2)
    scratch.replace(CACHE_PATH)
def load_thresholds() -> dict:
    """Load similarity thresholds, falling back to uncalibrated defaults.

    Returns:
        The parsed thresholds file when it exists and carries numeric
        ``bands.error`` and ``bands.review``. Otherwise the built-in defaults
        (error 0.90, review 0.80, ``calibrated`` False); a WARN is logged when
        the file exists but is unreadable or malformed, so the run degrades
        visibly instead of dying with a traceback.
    """
    defaults = {
        "version": 1, "model": DEFAULT_MODEL,
        "bands": {"error": 0.90, "review": 0.80},
        "calibrated": False, "calibration_pairs_labeled": 0,
    }
    if not THRESHOLDS_PATH.exists():
        return defaults
    try:
        with THRESHOLDS_PATH.open(encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError) as exc:
        log(f"WARN: thresholds file unreadable ({exc}); using uncalibrated defaults")
        return defaults

    def _is_number(value) -> bool:
        # bool is an int subclass but is never a meaningful threshold.
        return isinstance(value, (int, float)) and not isinstance(value, bool)

    bands = data.get("bands") if isinstance(data, dict) else None
    if not (isinstance(bands, dict)
            and _is_number(bands.get("error"))
            and _is_number(bands.get("review"))):
        log("WARN: thresholds file lacks numeric bands.error/bands.review; "
            "using uncalibrated defaults")
        return defaults
    return data
def included(path: Path, fm: dict) -> tuple[bool, str]:
    """Decide whether a page takes part in the tiling check.

    Returns:
        ``(True, "included")`` or ``(False, reason)``. Pages are rejected when
        they are symlinks, resolve outside the vault root, have an excluded
        file name, sit under an excluded path prefix, or declare an excluded
        frontmatter ``type``.
    """
    rel = path.relative_to(VAULT_ROOT).as_posix()
    if path.is_symlink():
        return False, "symlink"
    target = path.resolve()
    try:
        target.relative_to(VAULT_ROOT.resolve())
    except ValueError:
        return False, "escapes vault"
    if path.name in EXCLUDE_FILENAMES:
        return False, "excluded filename"
    matched_prefix = next(
        (prefix for prefix in EXCLUDE_PATH_PREFIXES if rel.startswith(prefix)),
        None,
    )
    if matched_prefix is not None:
        return False, f"under {matched_prefix}"
    page_type = fm.get("type")
    if page_type in EXCLUDE_TYPES:
        return False, f"type={page_type}"
    return True, "included"
def embed(text: str, model: str, url: str) -> list[float]:
    """Request an embedding for ``text`` from ``{url}/api/embeddings``.

    Raises:
        RuntimeError: the reply has no non-empty ``embedding`` list, or the
            list holds a non-numeric value.
    """
    reply = _http_post_json(
        f"{url}/api/embeddings",
        {"model": model, "prompt": text},
        EMBED_TIMEOUT_SEC,
    )
    vector = reply.get("embedding")
    if not (isinstance(vector, list) and vector):
        raise RuntimeError(f"ollama returned no embedding: {str(reply)[:200]}")
    if not all(isinstance(component, (int, float)) for component in vector):
        raise RuntimeError("embedding contains non-numeric values")
    return vector
def run_check(
    rebuild: bool,
    report_path: Path | None,
    ollama_url: str,
    model: str,
) -> int:
    """Embed every eligible wiki page and report near-duplicate page pairs.

    Args:
        rebuild: Ignore cached embeddings and start from an empty cache.
        report_path: Optional file to also write the report to; it must
            resolve inside the vault root.
        ollama_url: Base URL of the ollama endpoint.
        model: Embedding model name.

    Returns:
        EXIT_OK after printing the report; EXIT_NO_OLLAMA / EXIT_NO_MODEL when
        the endpoint or model is unavailable; EXIT_SCALE_EXCEEDED when there
        are more than SCALE_HARD_FAIL_PAGES markdown files; EXIT_USAGE when
        ``report_path`` escapes the vault.
    """
    # Function-scope import: the module header imports only `datetime`.
    from datetime import timezone

    def utc_stamp() -> str:
        # Timezone-aware replacement for the deprecated datetime.utcnow();
        # renders the same "YYYY-MM-DDTHH:MM:SSZ" text as before.
        return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    if not detect_ollama(ollama_url):
        log(f"ollama not reachable at {ollama_url}; skipping tiling check")
        return EXIT_NO_OLLAMA
    if not detect_model(ollama_url, model):
        log(f"model '{model}' not pulled; run: ollama pull {model}")
        return EXIT_NO_MODEL
    thresholds = load_thresholds()

    skipped_counts: dict[str, int] = {}

    def skip(reason: str) -> None:
        skipped_counts[reason] = skipped_counts.get(reason, 0) + 1

    lock_fd = _lock_cache()
    try:
        cache = (load_cache(model) if not rebuild
                 else {"version": 1, "model": model, "embeddings": {}})
        pages: list[tuple[str, list[float]]] = []
        scanned = 0
        computed = 0
        cached_hits = 0
        live_paths: set[str] = set()
        candidates = sorted(WIKI_DIR.rglob("*.md"))
        scale_n = len(candidates)
        if scale_n > SCALE_HARD_FAIL_PAGES:
            log(f"ERR: {scale_n} pages exceed hard-fail limit {SCALE_HARD_FAIL_PAGES}")
            return EXIT_SCALE_EXCEEDED
        if scale_n > SCALE_WARN_PAGES:
            log(f"WARN: {scale_n} pages; cold-cache embed will issue ~{scale_n} POSTs to ollama")
        for md in candidates:
            scanned += 1
            # Symlink and vault-root guards must run BEFORE read_text so a
            # hostile symlink cannot cause off-vault content to be read and
            # POSTed to the embedding endpoint.
            if md.is_symlink():
                skip("symlink")
                continue
            try:
                resolved = md.resolve(strict=True)
                resolved.relative_to(VAULT_ROOT.resolve())
            except (OSError, ValueError):
                skip("escapes vault")
                continue
            try:
                text = md.read_text(encoding="utf-8")
            except (OSError, UnicodeDecodeError):
                skip("read_error")
                continue
            if len(text.encode("utf-8")) > MAX_BODY_BYTES:
                skip("too_large")
                continue
            fm, body = parse_frontmatter(text)
            ok, reason = included(md, fm)
            if not ok:
                skip(reason)
                continue
            rel = md.relative_to(VAULT_ROOT).as_posix()
            live_paths.add(rel)
            h = body_hash(body, model)
            entry = cache["embeddings"].get(rel)
            # A malformed cache entry (not a dict, or no usable vector) is
            # treated as a miss and recomputed instead of raising.
            cached_emb = entry.get("embedding") if isinstance(entry, dict) else None
            if (isinstance(entry, dict) and entry.get("hash") == h
                    and isinstance(cached_emb, list) and cached_emb):
                pages.append((rel, cached_emb))
                cached_hits += 1
                continue
            try:
                emb = embed(body, model, ollama_url)
            except Exception as exc:
                log(f"ERR embedding {rel}: {exc}")
                skip("embed_error")
                continue
            cache["embeddings"][rel] = {
                "hash": h,
                "embedding": emb,
                "computed_at": utc_stamp(),
            }
            pages.append((rel, emb))
            computed += 1
        # Orphan GC: drop cache entries for paths that no longer exist.
        orphans = [k for k in cache["embeddings"] if k not in live_paths]
        for k in orphans:
            del cache["embeddings"][k]
        save_cache(cache)
    finally:
        _unlock_cache(lock_fd)

    review = thresholds["bands"]["review"]
    error_ = thresholds["bands"]["error"]
    pairs: list[tuple[float, str, str]] = []
    for i, (a_path, a_emb) in enumerate(pages):
        for j in range(i + 1, len(pages)):
            b_path, b_emb = pages[j]
            try:
                sim = cosine(a_emb, b_emb)
            except ValueError as exc:
                log(f"WARN cosine skip ({a_path}, {b_path}): {exc}")
                continue
            if sim >= review:
                pairs.append((sim, a_path, b_path))
    pairs.sort(reverse=True)
    errors = [p for p in pairs if p[0] >= error_]
    reviews = [p for p in pairs if review <= p[0] < error_]

    out_lines: list[str] = []
    out_lines.append("# Semantic Tiling Report")
    out_lines.append("")
    out_lines.append(f"- generated: {utc_stamp()}")
    out_lines.append(f"- model: {model}")
    out_lines.append(f"- ollama_url: {ollama_url}")
    out_lines.append(f"- thresholds: error>={error_}, review={review}-{error_}")
    out_lines.append(f"- calibrated: {thresholds.get('calibrated', False)}"
                     + (" (using uncalibrated defaults)" if not thresholds.get("calibrated") else ""))
    out_lines.append(f"- pages scanned: {scanned}; embedded: {len(pages)}; skipped: {sum(skipped_counts.values())}")
    if skipped_counts:
        out_lines.append("- skipped reasons: " + ", ".join(f"{k}={v}" for k, v in sorted(skipped_counts.items())))
    out_lines.append(f"- cache hits: {cached_hits}; recomputed: {computed}; orphans pruned: {len(orphans)}")
    out_lines.append("")
    out_lines.append(f"## Errors (similarity >= {error_})")
    out_lines.append("")
    if not errors:
        out_lines.append("- none")
    else:
        for sim, a, b in errors:
            out_lines.append(f"- `{sim:.4f}` {a} -- {b}")
    out_lines.append("")
    out_lines.append(f"## Review ({review} <= similarity < {error_})")
    out_lines.append("")
    if not reviews:
        out_lines.append("- none")
    else:
        for sim, a, b in reviews:
            out_lines.append(f"- `{sim:.4f}` {a} -- {b}")
    report = "\n".join(out_lines) + "\n"
    print(report)

    if report_path is not None:
        # Confine report writes to VAULT_ROOT. A path that resolves outside
        # the vault is refused (prevents `--report /etc/passwd` style
        # accidents or hostile args from writing outside the repo).
        try:
            resolved_report = (
                report_path if report_path.is_absolute() else (Path.cwd() / report_path)
            ).resolve()
            resolved_report.relative_to(VAULT_ROOT.resolve())
        except ValueError:
            log(f"ERR: --report path '{report_path}' escapes vault root {VAULT_ROOT}")
            return EXIT_USAGE
        resolved_report.parent.mkdir(parents=True, exist_ok=True)
        resolved_report.write_text(report, encoding="utf-8")
        log(f"report written: {resolved_report}")
    return EXIT_OK
def cmd_peek(ollama_url: str, model: str) -> int:
    """Print structured diagnostics as one JSON object; performs no embedding.

    Reports script/interpreter paths, ollama and model availability, and the
    state of the cache and thresholds files. Malformed cache or thresholds
    content is reported in the diagnostics rather than raising.

    Returns:
        EXIT_NO_OLLAMA, EXIT_NO_MODEL, or EXIT_CACHE_CORRUPT for the first
        failing check in that order; EXIT_OK otherwise.
    """
    diag: dict = {}
    script_path = Path(__file__).resolve()
    diag["script_path"] = str(script_path)
    diag["script_executable"] = os.access(script_path, os.X_OK)
    diag["python"] = sys.executable
    diag["vault_root"] = str(VAULT_ROOT)
    diag["ollama_url"] = ollama_url
    diag["ollama_reachable"] = detect_ollama(ollama_url)
    diag["model_requested"] = model
    diag["model_present"] = detect_model(ollama_url, model) if diag["ollama_reachable"] else False

    diag["cache_present"] = CACHE_PATH.exists()
    diag["cache_readable"] = False
    diag["cache_entries"] = 0
    diag["cache_model"] = None
    if diag["cache_present"]:
        try:
            with CACHE_PATH.open(encoding="utf-8") as f:
                c = json.load(f)
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        except (OSError, ValueError) as exc:
            diag["cache_error"] = str(exc)
        else:
            if isinstance(c, dict):
                embeddings = c.get("embeddings")
                diag["cache_readable"] = (c.get("version") == 1
                                          and isinstance(embeddings, dict))
                diag["cache_entries"] = len(embeddings) if isinstance(embeddings, dict) else 0
                diag["cache_model"] = c.get("model")
            else:
                diag["cache_error"] = "cache root is not a JSON object"

    diag["thresholds_present"] = THRESHOLDS_PATH.exists()
    diag["thresholds_readable"] = False
    if diag["thresholds_present"]:
        try:
            with THRESHOLDS_PATH.open(encoding="utf-8") as f:
                t = json.load(f)
        except (OSError, ValueError):
            t = None
        if isinstance(t, dict):
            diag["thresholds_readable"] = True
            diag["thresholds_calibrated"] = bool(t.get("calibrated", False))
            diag["thresholds_bands"] = t.get("bands", {})

    print(json.dumps(diag, indent=2))
    if not diag["ollama_reachable"]:
        return EXIT_NO_OLLAMA
    if not diag["model_present"]:
        return EXIT_NO_MODEL
    if diag["cache_present"] and not diag["cache_readable"]:
        return EXIT_CACHE_CORRUPT
    return EXIT_OK
def main(argv: list[str]) -> int:
    """Parse CLI arguments, enforce the localhost-only rule, and dispatch.

    Args:
        argv: Command-line arguments without the program name.

    Returns:
        The exit code of ``cmd_peek`` or ``run_check``, or EXIT_USAGE when
        OLLAMA_URL points off-localhost without ``--allow-remote-ollama``.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--report", type=Path, default=None)
    p.add_argument("--rebuild-cache", action="store_true")
    p.add_argument("--peek", action="store_true")
    p.add_argument("--allow-remote-ollama", action="store_true",
                   help="allow OLLAMA_URL env override pointing outside localhost")
    p.add_argument("--model", default=DEFAULT_MODEL)
    args = p.parse_args(argv)

    env_url = os.environ.get("OLLAMA_URL")
    # Strip a trailing slash so "<url>/api/..." never becomes "//api/...".
    ollama_url = (env_url or DEFAULT_OLLAMA_URL).rstrip("/")
    if env_url and not _is_local_url(ollama_url) and not args.allow_remote_ollama:
        log(f"ERR: OLLAMA_URL={ollama_url!r} is not localhost. "
            f"Vault content would be POSTed to a non-local host. "
            f"Pass --allow-remote-ollama to override.")
        return EXIT_USAGE

    if args.peek:
        return cmd_peek(ollama_url, args.model)
    return run_check(
        rebuild=args.rebuild_cache,
        report_path=args.report,
        ollama_url=ollama_url,
        model=args.model,
    )


if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
+341
View File
@@ -0,0 +1,341 @@
#!/usr/bin/env bash
# wiki-lock.sh — per-file advisory locking for safe multi-writer vault mutation.
#
# Closes the latent multi-writer corruption bug in v1.6 where two parallel
# sub-agents writing to the same wiki page could silently trample each other.
# The README and skills/wiki-ingest/SKILL.md §259-264 documented "single-writer
# only" as a convention; this script makes it an enforceable guard.
#
# Design (age-based, not flock-style):
# flock(2) advisory locks release when the holding process exits. That
# doesn't fit our model where `acquire` and `release` are SEPARATE bash
# invocations from the same skill (each Bash tool call is its own short-
# lived process — neither's PID survives long enough to mean anything).
# So we use atomic lockfile creation with `set -o noclobber` plus
# epoch-timestamp AGE-based staleness detection. Race-safe because the
# noclobber write itself is atomic on POSIX filesystems.
#
# The PID written into the lockfile is informational only (helpful for
# `list` and debugging). The acquire decision considers AGE only:
# - If lockfile age < STALE_AFTER_SEC → refuse (return 75 EX_TEMPFAIL)
# - If lockfile age >= STALE_AFTER_SEC → reap and acquire
# Default STALE_AFTER_SEC=60. Long enough for any single skill operation
# (page writes are milliseconds; a multi-write ingest pass is seconds);
# short enough that a crashed holder unblocks quickly.
#
# Semantics:
# acquire <vault-rel-path>
# - Computes lock_file = .vault-meta/locks/<sha1(path)>.lock
# - Atomically creates the lockfile with this process's PID + epoch
# - Returns 0 if acquired, 75 (EX_TEMPFAIL) if held and age < threshold
# - Auto-reaps locks older than STALE_AFTER_SEC
# release <vault-rel-path>
# - Removes the lockfile unconditionally (rm -f). Idempotent.
# - Cross-process release IS allowed by design — acquire and release
# are typically separate bash invocations from the same skill, and
# PID-matching would never succeed. Skill authors are trusted not to
# release locks they don't own; that's no weaker than `rm` on the
# lockfile directly.
# list
# - Prints currently-held lock records (one per line: pid age path).
# clear-stale [--max-age N]
# - Removes lockfiles whose PID is dead OR whose age > N seconds.
# Default N = 3600 (1h). Returns count removed via stdout.
# (The N=3600 default is intentionally generous because clear-stale
# is admin-grade cleanup, distinct from the per-acquire age threshold.)
# peek <vault-rel-path>
# - Prints holder info or "unheld"; exit 0; never mutates.
#
# Globals:
# STALE_AFTER_SEC — default 60. Override via --stale-after-sec N on any cmd.
#
# Age-threshold naming (v1.7.2; closes audit L6):
# - STALE_AFTER_SEC (default 60) is the PER-ACQUIRE threshold. A new
# acquire that finds an existing lock will reap-and-take if the lock is
# older than this; refuse otherwise. Tuned for "single skill operation
# completes within 60s."
# - `clear-stale --max-age N` (default 3600) is the ADMIN reaper threshold,
# meant to be run periodically by an operator or hook to sweep abandoned
# locks. Tuned for "anything older than an hour is definitely abandoned."
# These are two distinct concerns; both are time-since-acquire but operate
# at different scopes. Do not unify the defaults.
#
# Usage:
# bash scripts/wiki-lock.sh acquire wiki/concepts/Foo.md
# bash scripts/wiki-lock.sh release wiki/concepts/Foo.md
# bash scripts/wiki-lock.sh list
# bash scripts/wiki-lock.sh clear-stale --max-age 1800
# bash scripts/wiki-lock.sh peek wiki/concepts/Foo.md
#
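# Exit codes:
# 0 — success
# 1 — meta-lock could not be acquired within 5s (from with_meta_lock)
# 2 — usage error
# 75 — acquire failed (EX_TEMPFAIL: lock is held and younger than STALE_AFTER_SEC)
# 3 — vault-meta/locks dir creation failed
# 4 — invalid vault-relative path (escape attempt)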
set -euo pipefail
# Vault root = parent of the directory containing this script.
VAULT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
META_DIR="${VAULT_ROOT}/.vault-meta"
# One lockfile per locked path lives here, named <sha1(path)>.lock.
LOCK_DIR="${META_DIR}/locks"
# File that with_meta_lock flocks to serialize commands.
META_LOCK="${META_DIR}/.wiki-lock.meta"
# Per-acquire staleness threshold in seconds (see header comment).
STALE_AFTER_SEC=60
# ── helpers ──────────────────────────────────────────────────────────────────
# die MESSAGE [EXIT_CODE] — print "ERR: MESSAGE" to stderr and exit.
# EXIT_CODE defaults to 2 (usage error). Only MESSAGE is printed: echoing
# "$*" would append the exit-code argument to the message text.
die() {
  local msg="${1:-}"
  local code="${2:-2}"
  echo "ERR: ${msg}" >&2
  exit "$code"
}
# log ARGS... — write the arguments, space-joined, as one line on stderr.
log() {
  >&2 echo "$*"
}
# Allow tests / non-default vault roots to override
# When WIKI_LOCK_VAULT is set and non-empty it replaces VAULT_ROOT, and every
# path derived from VAULT_ROOT is recomputed so they stay consistent.
if [ -n "${WIKI_LOCK_VAULT:-}" ]; then
  VAULT_ROOT="$WIKI_LOCK_VAULT"
  META_DIR="${VAULT_ROOT}/.vault-meta"
  LOCK_DIR="${META_DIR}/locks"
  META_LOCK="${META_DIR}/.wiki-lock.meta"
fi
# sha1_of STRING — print the hex SHA-1 of STRING (no trailing newline hashed).
sha1_of() {
  local input="$1"
  if ! command -v sha1sum >/dev/null 2>&1; then
    # macOS fallback
    printf '%s' "$input" | shasum -a 1 | awk '{print $1}'
    return
  fi
  printf '%s' "$input" | sha1sum | awk '{print $1}'
}
# ensure_dirs — create LOCK_DIR (and parents); exit 3 when that fails.
ensure_dirs() {
  if ! mkdir -p "$LOCK_DIR" 2>/dev/null; then
    die "cannot create $LOCK_DIR" 3
  fi
}
# validate_path PATH — exit 4 (via die) unless PATH is an acceptable
# vault-relative path; return 0 when it is.
validate_path() {
  # Reject empty, absolute, escape, or newline-bearing paths to prevent
  # lock-namespace pollution. v1.7.2 / closes audit M4: newlines would break
  # the meta-lock line format (key=value lines separated by literal \n).
  # v1.9.1 / closes audit M3 (symlink escape): when a vault-relative path
  # resolves through a symlink to outside VAULT_ROOT, treat as path traversal.
  local p="$1"
  [ -z "$p" ] && die "path cannot be empty" 4
  # NOTE: the "*..*" pattern rejects ANY path containing two consecutive dots,
  # including harmless names such as "a..b.md".
  case "$p" in
    /*) die "path must be vault-relative, not absolute: $p" 4 ;;
    *..*) die "path may not contain '..': $p" 4 ;;
    *$'\n'*) die "path may not contain newlines (lockfile format would break)" 4 ;;
    *$'\r'*) die "path may not contain carriage returns" 4 ;;
  esac
  # Symlink canonicalization (only when the path or one of its parents exists).
  # Non-existent paths can pass; the lock acquire itself creates leaves under
  # LOCK_DIR, not the path itself. We resolve via python3 (portable across
  # GNU coreutils + macOS BSD where realpath flag semantics differ).
  # When python3 is not installed this check is skipped entirely.
  if command -v python3 >/dev/null 2>&1; then
    local resolved root
    # Values are passed through the environment, not interpolated into the
    # Python source, so quotes in the path cannot alter the program text.
    resolved=$(VAULT_ROOT_BASH="$VAULT_ROOT" P_BASH="$p" python3 -c '
import os, sys
root = os.path.realpath(os.environ["VAULT_ROOT_BASH"])
candidate = os.environ["P_BASH"]
target = os.path.realpath(os.path.join(root, candidate))
common = os.path.commonpath([root, target]) if target else ""
sys.stdout.write("INSIDE" if common == root else "OUTSIDE")
' 2>/dev/null)
    [ "$resolved" = "OUTSIDE" ] && die "path resolves outside vault via symlink: $p" 4
  fi
  return 0
}
# Print the current time as whole seconds since the Unix epoch.
now_epoch() {
  date +%s
}
# Succeed (status 0) when a process with PID $1 exists and this user is
# allowed to signal it. Signal 0 performs the existence/permission check
# without delivering anything; kill's error output is discarded.
is_alive() {
  local pid="$1"
  kill -0 "$pid" 2>/dev/null
}
# Run "$@" while holding an exclusive flock on META_LOCK, so that commands
# which read or mutate LOCK_DIR (acquire / release / list / clear-stale / peek)
# do not race against each other.
#   $@ — command and arguments to execute under the lock
# The lock lives on file descriptor 9 of a subshell and is therefore dropped
# when the subshell ends. Exits through die with status 1 if the lock cannot
# be taken within 5 seconds; otherwise the status of "$@" is returned.
with_meta_lock() {
  ensure_dirs
  (
    if ! flock -x -w 5 9; then
      die "could not acquire meta-lock within 5s" 1
    fi
    "$@"
  ) 9>"$META_LOCK"
}
# Print the first line of lockfile $1, which has the form
# "<pid> <epoch> <path>". Prints nothing when the file is missing or cannot
# be read. Always returns 0 so callers under `set -e` can capture the output
# without special-casing failure.
read_lockfile() {
  local lockfile="$1"
  if [ -f "$lockfile" ]; then
    head -n 1 "$lockfile" 2>/dev/null || true
  fi
  return 0
}
# ── commands ─────────────────────────────────────────────────────────────────
# Atomically create lockfile $1 holding the record "<pid> <epoch> <path>".
#   $1 — lockfile path   $2 — epoch seconds   $3 — vault-relative path
# With noclobber set, the redirect fails if the file already exists, so a
# zero status means this call created the lock. $$ is the script's PID even
# inside the subshell.
_write_lock_record() {
  (set -o noclobber; printf '%s %s %s\n' "$$" "$2" "$3" > "$1") 2>/dev/null
}

# Acquire the advisory lock for a vault-relative path.
#   $1 — vault-relative path (checked by validate_path)
# Returns 0 when the lock was taken. Returns 75 when another holder has a
# lock that is not yet stale, or when re-creating a reaped lock failed
# (75 is presumably sysexits EX_TEMPFAIL — confirm against the script header).
# Staleness is judged by age only, never by the holder PID: an existing lock
# is reaped when its record is empty, its epoch field is not a plain
# non-negative integer, or it is older than STALE_AFTER_SEC seconds.
_cmd_acquire() {
  local path="$1"
  validate_path "$path"
  ensure_dirs
  local lf="${LOCK_DIR}/$(sha1_of "$path").lock"
  local now
  now=$(now_epoch)

  # Cheap path first: noclobber-atomic create.
  if _write_lock_record "$lf" "$now" "$path"; then
    return 0
  fi

  # Lockfile already exists — decide whether it is stale.
  local existing eepoch stale=0
  existing=$(read_lockfile "$lf")
  if [ -z "$existing" ]; then
    # Empty/unreadable record.
    stale=1
  else
    eepoch=$(printf '%s' "$existing" | awk '{print $2}')
    case "$eepoch" in
      ''|*[!0-9]*)
        # Corrupt epoch field.
        stale=1
        ;;
      *)
        # 10# forces base 10, so a leading zero is not parsed as octal.
        if [ $((now - 10#$eepoch)) -gt "$STALE_AFTER_SEC" ]; then
          stale=1
        fi
        ;;
    esac
  fi

  if [ "$stale" -eq 1 ]; then
    # Reap and retry exactly once.
    rm -f "$lf"
    if _write_lock_record "$lf" "$now" "$path"; then
      return 0
    fi
  fi

  # Held and not yet stale by age (or the retry lost) — refuse.
  return 75
}
# Release the advisory lock for a vault-relative path.
#   $1 — vault-relative path (checked by validate_path)
# The lockfile is removed unconditionally: cross-process release is allowed by
# design, because acquire and release typically run as separate bash
# invocations and a PID comparison would never match. Returns 0 whether or not
# a lock existed.
_cmd_release() {
  local target="$1"
  validate_path "$target"
  ensure_dirs
  local lock_path="${LOCK_DIR}/$(sha1_of "$target").lock"
  rm -f "$lock_path"
  return 0
}
# Print one line per lock currently present in LOCK_DIR:
#   pid=<pid> age=<seconds>s path=<vault-relative path>
# Lockfiles with an empty record are skipped. A record whose epoch field is
# not a plain non-negative integer is still listed, with age shown as "?",
# so one corrupt lockfile cannot abort the listing under `set -eu`.
# Always returns 0.
_cmd_list() {
  ensure_dirs
  local lf rec pid epoch path age now
  # One timestamp for the whole listing keeps the ages mutually consistent.
  now=$(now_epoch)
  for lf in "$LOCK_DIR"/*.lock; do
    # An unmatched glob stays literal; the -f test filters that case out.
    [ -f "$lf" ] || continue
    rec=$(read_lockfile "$lf")
    [ -n "$rec" ] || continue
    pid=$(printf '%s' "$rec" | awk '{print $1}')
    epoch=$(printf '%s' "$rec" | awk '{print $2}')
    # The path is everything after the second field and may contain spaces.
    path=$(printf '%s' "$rec" | cut -d' ' -f3-)
    case "$epoch" in
      ''|*[!0-9]*) age="?" ;;
      # 10# forces base 10, so a leading zero is not parsed as octal.
      *)           age="$((now - 10#$epoch))s" ;;
    esac
    printf 'pid=%s age=%s path=%s\n' "$pid" "$age" "$path"
  done
  return 0
}
# Remove stale lockfiles from LOCK_DIR and print how many were removed.
#   $1 — maximum age in seconds (must be a non-negative integer)
# A lockfile is removed when its record is empty, its epoch field is not a
# plain non-negative integer (corrupt — same policy as acquire), its recorded
# PID is not alive, or it is older than $1 seconds. Always returns 0; exits
# through die when $1 is not numeric.
# NOTE(review): acquire deliberately judges staleness by age and not by PID,
# and release notes that acquire/release usually run as separate processes.
# If so, the recorded PID is normally already dead and the liveness test below
# would reap almost every lock regardless of age — confirm this is intended.
_cmd_clear_stale() {
  local max_age="$1"
  case "$max_age" in
    ''|*[!0-9]*) die "max age must be a non-negative integer (seconds): $max_age" ;;
  esac
  ensure_dirs
  local lf rec pid epoch age now
  local removed=0
  now=$(now_epoch)
  for lf in "$LOCK_DIR"/*.lock; do
    # An unmatched glob stays literal; the -f test filters that case out.
    [ -f "$lf" ] || continue
    rec=$(read_lockfile "$lf")
    if [ -z "$rec" ]; then
      rm -f "$lf"; removed=$((removed + 1)); continue
    fi
    pid=$(printf '%s' "$rec" | awk '{print $1}')
    epoch=$(printf '%s' "$rec" | awk '{print $2}')
    case "$epoch" in
      ''|*[!0-9]*)
        # Corrupt epoch: reap instead of crashing in the arithmetic below.
        rm -f "$lf"; removed=$((removed + 1)); continue
        ;;
    esac
    # 10# forces base 10, so a leading zero is not parsed as octal.
    age=$((now - 10#$epoch))
    if ! is_alive "$pid" || [ "$age" -gt "$max_age" ]; then
      rm -f "$lf"; removed=$((removed + 1))
    fi
  done
  echo "$removed"
  return 0
}
# Report the state of the lock for a vault-relative path without changing it.
#   $1 — vault-relative path (checked by validate_path)
# Prints "unheld" when no lockfile exists; otherwise prints the lockfile's
# first line ("<pid> <epoch> <path>"), which may be empty. Always returns 0.
_cmd_peek() {
  local target="$1"
  validate_path "$target"
  ensure_dirs
  local lock_path="${LOCK_DIR}/$(sha1_of "$target").lock"
  if [ -f "$lock_path" ]; then
    local record
    record=$(read_lockfile "$lock_path")
    echo "$record"
  else
    echo "unheld"
  fi
  return 0
}
# ── arg parsing (flags accepted in any position) ─────────────────────────────
# With no arguments at all, print the usage text taken from this script's
# header comment (lines 2-46 with the leading "# " stripped) and exit 2.
if [ $# -lt 1 ]; then
  sed -n '2,46p' "$0" | sed 's/^# \{0,1\}//'
  exit 2
fi

CMD=""               # first non-flag word
ARGS=()              # remaining non-flag words, in order
MAX_AGE_OVERRIDE=""  # value of --max-age, if given

# Flags may appear before, between, or after the command and its arguments.
while [ $# -gt 0 ]; do
  case "$1" in
    --stale-after-sec)
      # Check presence and shape here: under `set -u` a missing value would
      # otherwise die on an unbound $2 with a cryptic message, and a
      # non-numeric value would only surface later inside an integer test.
      [ $# -ge 2 ] || die "--stale-after-sec requires a value"
      case "$2" in
        ''|*[!0-9]*) die "--stale-after-sec expects a non-negative integer, got: $2" ;;
      esac
      STALE_AFTER_SEC="$2"
      shift 2
      ;;
    --max-age)
      [ $# -ge 2 ] || die "--max-age requires a value"
      case "$2" in
        ''|*[!0-9]*) die "--max-age expects a non-negative integer, got: $2" ;;
      esac
      MAX_AGE_OVERRIDE="$2"
      shift 2
      ;;
    -h|--help)
      sed -n '2,46p' "$0" | sed 's/^# \{0,1\}//'
      exit 0
      ;;
    --)
      # Everything after "--" is positional, even if it starts with "-".
      shift
      while [ $# -gt 0 ]; do ARGS+=("$1"); shift; done
      ;;
    -*)
      die "unknown flag: $1"
      ;;
    *)
      if [ -z "$CMD" ]; then
        CMD="$1"
      else
        ARGS+=("$1")
      fi
      shift
      ;;
  esac
done
[ -n "$CMD" ] || die "no command given"
# Dispatch the parsed command. Every handler runs under the meta-lock so that
# concurrent invocations of this script are serialized.
case "$CMD" in
  acquire|release|peek)
    # The three path-taking commands share one shape: require a path, then
    # call the handler named after the command (_cmd_acquire, _cmd_release,
    # _cmd_peek).
    [ ${#ARGS[@]} -ge 1 ] || die "$CMD needs a path"
    with_meta_lock "_cmd_${CMD}" "${ARGS[0]}"
    ;;
  list)
    with_meta_lock _cmd_list
    ;;
  clear-stale)
    # Precedence for the age threshold: --max-age, then the first positional
    # argument, then 3600 seconds.
    MAX="${MAX_AGE_OVERRIDE:-${ARGS[0]:-3600}}"
    with_meta_lock _cmd_clear_stale "$MAX"
    ;;
  *)
    die "unknown command: $CMD (try acquire|release|list|clear-stale|peek)"
    ;;
esac
+252
View File
@@ -0,0 +1,252 @@
#!/usr/bin/env python3
"""wiki-mode.py — read + route helper for v1.8 methodology modes.
Single source of truth for "which mode is this vault in" and "where should
new content of type X be filed under mode Y." Consumed by:
- skills/wiki-ingest/SKILL.md (where to file new source/entity/concept pages)
- skills/save/SKILL.md (where to file session notes)
- skills/autoresearch/SKILL.md (where to file research output)
- bin/setup-mode.sh (writes .vault-meta/mode.json)
If `.vault-meta/mode.json` is absent → mode = "generic" → behavior identical
to v1.7. No skill needs to special-case the missing-config path.
CLI:
wiki-mode.py get # print current mode (default: generic)
wiki-mode.py config # print full config JSON
wiki-mode.py route TYPE NAME # print suggested path for new content
# TYPE: source|entity|concept|session|research
wiki-mode.py set MODE # write mode (lyt|para|zettelkasten|generic)
wiki-mode.py id # mint a Zettelkasten ID (timestamp)
wiki-mode.py templates # list per-mode template files
Exit codes:
0 — success
2 — usage error
3 — invalid mode string
4 — invalid content type
"""
import argparse
import json
import re
import sys
import tempfile
from datetime import datetime, timezone
from pathlib import Path
# The vault root is two levels above this file.
VAULT_ROOT = Path(__file__).resolve().parent.parent
META_DIR = VAULT_ROOT.joinpath(".vault-meta")
MODE_PATH = META_DIR.joinpath("mode.json")

# Closed sets accepted by the CLI and by route_path().
VALID_MODES = ("generic", "lyt", "para", "zettelkasten")
VALID_TYPES = ("source", "entity", "concept", "session", "research")

# Configuration used when mode.json is absent, and the base that a
# partially filled mode.json is merged onto. Folder values are
# vault-relative and end with "/", because route_path() builds paths by
# plain string concatenation.
DEFAULT_CONFIG = {
    "schema_version": 1,
    "mode": "generic",
    "configured_at": None,  # set to a UTC timestamp by the `set` command
    "config": {
        "lyt": {
            "moc_folder": "wiki/mocs/",
            "notes_folder": "wiki/notes/",
        },
        "para": {
            "projects_folder": "wiki/projects/",
            "areas_folder": "wiki/areas/",
            "resources_folder": "wiki/resources/",
            "archives_folder": "wiki/archives/",
        },
        "zettelkasten": {
            "id_format": "YYYYMMDDHHMMSSffffff",
            "no_folders": True,
            "root_folder": "wiki/",
        },
        "generic": {
            "sources_folder": "wiki/sources/",
            "entities_folder": "wiki/entities/",
            "concepts_folder": "wiki/concepts/",
            "sessions_folder": "wiki/sessions/",
        },
    },
}
def load_config():
    """Return the effective configuration as a fresh dict.

    Starts from a deep copy of DEFAULT_CONFIG and, when mode.json exists and
    parses to a JSON object, overlays its ``mode``, ``configured_at`` and
    per-mode ``config`` sections. Unknown sections and non-dict section
    values are ignored, so partially configured files still work.

    When the file is absent the defaults are returned (mode ``generic``).
    When it is unreadable, not valid UTF-8, not valid JSON, or not a JSON
    object, a diagnostic is printed to stderr and the defaults are returned.

    The returned dict never shares nested state with DEFAULT_CONFIG, so
    callers may mutate it freely.
    """
    import copy  # function-scope import: the file's import block stays as-is

    # Deep copy: a shallow dict() copy would share the nested "config" dicts,
    # and the .update() below would then rewrite the module-level defaults.
    merged = copy.deepcopy(DEFAULT_CONFIG)
    if not MODE_PATH.is_file():
        return merged
    try:
        loaded = json.loads(MODE_PATH.read_text(encoding="utf-8"))
        if not isinstance(loaded, dict):
            raise ValueError("top-level JSON value must be an object")
    except (ValueError, OSError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"ERR: cannot parse {MODE_PATH}: {e}", file=sys.stderr)
        print(" Falling back to mode=generic. Re-run `bash bin/setup-mode.sh` to fix.",
              file=sys.stderr)
        return merged

    merged["mode"] = loaded.get("mode", "generic")
    merged["configured_at"] = loaded.get("configured_at")
    loaded_config = loaded.get("config", {})
    if isinstance(loaded_config, dict):
        for section, overrides in loaded_config.items():
            if section in merged["config"] and isinstance(overrides, dict):
                merged["config"][section].update(overrides)
    return merged
def save_config(cfg):
    """Write *cfg* to mode.json as indented JSON, replacing any existing file.

    The payload is first written to a temporary file created inside META_DIR
    and then moved over MODE_PATH with ``Path.replace``. If anything fails
    along the way, the temporary file is removed (best effort) and the
    original exception is re-raised.
    """
    META_DIR.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(cfg, indent=2, ensure_ascii=False) + "\n"
    handle, staging_name = tempfile.mkstemp(
        prefix="mode.", suffix=".tmp", dir=str(META_DIR)
    )
    staging = Path(staging_name)
    try:
        # open() takes ownership of the descriptor and closes it on exit.
        with open(handle, "w", encoding="utf-8") as out:
            out.write(serialized)
        staging.replace(MODE_PATH)
    except Exception:
        try:
            staging.unlink()
        except OSError:
            # Cleanup is best effort; the original error matters more.
            pass
        raise
def slugify(name):
    """Return a filesystem-safe slug for *name*.

    Every run of characters that are neither word characters nor hyphens is
    replaced by a single hyphen, repeated hyphens are collapsed, and leading
    and trailing hyphens are dropped, so that
    'v1.8 launch! prep?' -> 'v1-8-launch-prep' (not 'v18launchprep').
    Case is preserved, as are Unicode word characters (CJK, accented Latin,
    Cyrillic, etc.). Returns 'untitled' when nothing is left.
    """
    hyphenated = re.sub(r"[^\w\-]+", "-", name, flags=re.UNICODE)
    collapsed = re.sub(r"-+", "-", hyphenated)
    trimmed = collapsed.strip("-")
    if not trimmed:
        return "untitled"
    return trimmed
def safe_name(name):
    """Return *name* made safe for use as a file name, keeping case and spaces.

    Used for entity/concept names. Forward slashes, backslashes, and the
    control characters U+0000 to U+001F are deleted, then any leading dots
    or hyphens are dropped, so the result cannot escape its parent directory
    or be read as a hidden file or a command-line flag. Returns 'untitled'
    when nothing is left.
    """
    without_separators = re.sub(r"[/\\\x00-\x1f]+", "", name)
    trimmed = without_separators.lstrip(".-")
    if trimmed:
        return trimmed
    return "untitled"
def mint_zettel_id():
    """Return the current UTC time as a 20-digit ``YYYYMMDDHHMMSSffffff`` string.

    Because the value is UTC it does not depend on the local timezone, and
    the fixed-width layout sorts lexicographically in time order. The
    microsecond suffix closes the v1.8.0 verifier LOW, where two rapid calls
    in the same second produced the same 14-digit ID and would have
    generated colliding filenames.
    """
    moment = datetime.now(timezone.utc)
    return moment.strftime("%Y%m%d%H%M%S%f")
def route_path(mode, content_type, name, cfg):
    """Return the suggested vault-relative path for new content under *mode*.

    *content_type* must be one of VALID_TYPES, *name* is the human-readable
    content name, and *cfg* is a configuration dict shaped like
    DEFAULT_CONFIG. Paths are built by concatenating a configured folder
    with a file name ending in ".md".

    Raises SystemExit(4) for an unknown content type (checked first) and
    SystemExit(3) for an unknown mode.
    """
    if content_type not in VALID_TYPES:
        raise SystemExit(4)

    slug = slugify(name)
    raw = safe_name(name)  # case + spaces preserved, but path-traversal stripped

    def page(*parts):
        # Join folder/name fragments and append the markdown extension.
        return "".join(parts) + ".md"

    if mode == "generic":
        folders = cfg["config"]["generic"]
        by_type = {
            "source": page(folders["sources_folder"], slug),
            # Entities and concepts keep their capitalization.
            "entity": page(folders["entities_folder"], raw),
            "concept": page(folders["concepts_folder"], raw),
            "session": page(folders["sessions_folder"], slug),
            # Research output is filed alongside concepts.
            "research": page(folders["concepts_folder"], raw),
        }
        return by_type[content_type]

    if mode == "lyt":
        # All atomic notes sit flat in the notes folder, whatever their type.
        return page(cfg["config"]["lyt"]["notes_folder"], slug)

    if mode == "para":
        folders = cfg["config"]["para"]
        resources = folders["resources_folder"]
        by_type = {
            # New sources land in a generic 'incoming' bucket under
            # resources; the user sorts them into topics afterwards.
            "source": page(resources, "incoming/", slug),
            "entity": page(resources, "people/", raw),
            "concept": page(resources, "concepts/", raw),
            # Session notes land in projects/inbox/; the user reroutes them.
            "session": page(folders["projects_folder"], "inbox/", slug),
            # Research gets its own <slug>/ folder under resources.
            "research": page(resources, slug, "/", slug),
        }
        return by_type[content_type]

    if mode == "zettelkasten":
        root = cfg["config"]["zettelkasten"]["root_folder"]
        zid = mint_zettel_id()
        return root + f"{zid}-{slug}.md"

    raise SystemExit(3)
def main():
    """Parse the command line, run the requested sub-command, return its exit status.

    Sub-commands: get, config, route, set, id, templates. The configuration
    is loaded once, before dispatch. Returns 0 on success and 2 when the
    templates directory is missing or the command has no handler; argparse
    itself exits on malformed arguments.
    """
    parser = argparse.ArgumentParser(description="Methodology-mode router for v1.8 Compound Vault.")
    commands = parser.add_subparsers(dest="cmd", required=True)

    commands.add_parser("get", help="Print current mode")
    commands.add_parser("config", help="Print full config JSON")

    route_cmd = commands.add_parser("route", help="Print suggested vault path for new content")
    route_cmd.add_argument("type", choices=VALID_TYPES)
    route_cmd.add_argument("name", help="Content name (will be slugified for filenames)")
    route_cmd.add_argument("--mode", choices=VALID_MODES, default=None,
                           help="Preview routing under MODE without writing mode.json (default: use current vault mode)")

    set_cmd = commands.add_parser("set", help="Write a mode to .vault-meta/mode.json")
    set_cmd.add_argument("mode", choices=VALID_MODES)

    commands.add_parser("id", help="Mint a Zettelkasten ID (timestamp)")
    commands.add_parser("templates", help="List per-mode template files")

    args = parser.parse_args()
    cfg = load_config()

    def show_mode():
        print(cfg["mode"])
        return 0

    def show_config():
        print(json.dumps(cfg, indent=2, ensure_ascii=False))
        return 0

    def show_route():
        # --mode previews another mode without touching mode.json.
        active_mode = args.mode if args.mode else cfg["mode"]
        print(route_path(active_mode, args.type, args.name, cfg))
        return 0

    def write_mode():
        cfg["mode"] = args.mode
        cfg["configured_at"] = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
        save_config(cfg)
        print(f"mode set: {args.mode}")
        return 0

    def show_id():
        print(mint_zettel_id())
        return 0

    def list_templates():
        templates_dir = VAULT_ROOT / "skills" / "wiki-mode" / "templates"
        if not templates_dir.is_dir():
            print(f"ERR: templates dir missing: {templates_dir}", file=sys.stderr)
            return 2
        for template in sorted(templates_dir.rglob("*.md")):
            print(str(template.relative_to(VAULT_ROOT)))
        return 0

    handlers = {
        "get": show_mode,
        "config": show_config,
        "route": show_route,
        "set": write_mode,
        "id": show_id,
        "templates": list_templates,
    }
    handler = handlers.get(args.cmd)
    if handler is None:
        return 2
    return handler()
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())