add claude-obsidian
This commit is contained in:
@@ -0,0 +1,102 @@
|
||||
#!/usr/bin/env bash
|
||||
# allocate-address.sh — atomic creation-order address allocation for the vault.
|
||||
#
|
||||
# Reserves the next address of the form c-NNNNNN and increments the counter
|
||||
# under an exclusive flock. On missing counter file, recovers by scanning the
|
||||
# vault for the highest existing c-NNNNNN in page frontmatter and resuming from
|
||||
# max+1. Never silently resets to 1 in a non-empty vault.
|
||||
#
|
||||
# Usage:
|
||||
# ./scripts/allocate-address.sh # prints the reserved address (e.g. c-000042) to stdout
|
||||
# ./scripts/allocate-address.sh --peek # prints the next value without incrementing
|
||||
# ./scripts/allocate-address.sh --rebuild # recomputes counter from max observed and exits
|
||||
#
|
||||
# Exit codes:
|
||||
# 0 — success
|
||||
# 1 — lock acquisition failed (another writer is holding the lock)
|
||||
# 2 — vault-meta directory missing and cannot be created
|
||||
#   3 — counter value corrupt or non-numeric, or unknown mode argument
|
||||
|
||||
set -euo pipefail

# The vault root is the parent of the directory that contains this script.
VAULT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
COUNTER_FILE="${VAULT_ROOT}/.vault-meta/address-counter.txt"
LOCK_FILE="${VAULT_ROOT}/.vault-meta/.address.lock"
WIKI_DIR="${VAULT_ROOT}/wiki"

# First positional argument selects the mode; no argument means "allocate".
MODE="${1:-allocate}"

# The counter and the lock file share one directory; make sure it exists.
if ! mkdir -p "$(dirname "$COUNTER_FILE")"; then
  echo "ERR: cannot create .vault-meta/" >&2
  exit 2
fi

# Open the lock file on fd 9 and take an exclusive flock, waiting at most 5 s.
# Nothing below unlocks explicitly: the lock goes away when fd 9 is closed,
# i.e. when this process exits.
exec 9>"$LOCK_FILE"
flock -x -w 5 9 || {
  echo "ERR: could not acquire address allocator lock within 5s" >&2
  exit 1
}
|
||||
|
||||
#######################################
# Print the largest numeric value N found in "address: c-NNNNNN" lines that
# sit inside the FIRST YAML frontmatter block (opened by "---" on line 1 and
# closed by the next "---") of each *.md file under WIKI_DIR. Address-looking
# text in the page body or in files without frontmatter is ignored.
# Globals:   WIKI_DIR (read)
# Arguments: none
# Outputs:   a single integer on stdout; 0 when WIKI_DIR is missing or no
#            address is found
#######################################
scan_max_c_address() {
  if [ ! -d "$WIKI_DIR" ]; then
    echo 0
    return
  fi
  # Portability notes for the awk program:
  #  - digits are spelled out instead of using the {6} interval operator, and
  #    [ \t] is used instead of [[:space:]], because some awk implementations
  #    lack those; with stderr discarded, such an awk would silently report
  #    "no addresses" and the counter would restart from 1.
  #  - a trailing CR is stripped so CRLF-encoded pages are still recognised.
  #  - six OR MORE digits are accepted, since the allocator's %06d format
  #    produces wider addresses once the counter passes 999999.
  # `-exec ... {} +` batches files like xargs but never runs awk with an
  # empty file list.
  find "$WIKI_DIR" -type f -name '*.md' -exec awk '
      { sub(/\r$/, "") }
      FNR == 1 { state = ($0 == "---") ? "fm" : "pre"; next }
      state != "fm" { next }
      $0 == "---" { state = "body"; nextfile }
      /^address:[ \t]+c-[0-9][0-9][0-9][0-9][0-9][0-9][0-9]*[ \t]*$/ {
        sub(/^address:[ \t]+/, "")
        sub(/[ \t]+$/, "")
        print
      }
    ' {} + 2>/dev/null \
    | sed 's/^c-0*//;s/^$/0/' \
    | sort -n \
    | tail -1 \
    | awk 'BEGIN{n=0} {n=$0} END{print (n+0)}'
}
|
||||
|
||||
#######################################
# Print the next counter value, recreating the counter file first if it is
# missing (value = highest address found by scan_max_c_address, plus one).
# Globals:   COUNTER_FILE (read; written when missing)
# Arguments: none
# Outputs:   the counter as a plain base-10 integer on stdout; an INFO line
#            on stderr when the file had to be recovered
# Exits:     3 when the file content is not a string of decimal digits
#######################################
read_or_recover_counter() {
  local raw
  if [ ! -f "$COUNTER_FILE" ]; then
    local max_c next
    max_c="$(scan_max_c_address)"
    next=$((max_c + 1))
    echo "$next" > "$COUNTER_FILE"
    echo "INFO: counter file missing; recovered from vault scan, set to ${next}" >&2
  fi
  raw="$(cat "$COUNTER_FILE")"
  if ! [[ "$raw" =~ ^[0-9]+$ ]]; then
    echo "ERR: counter file content is not a positive integer: $raw" >&2
    exit 3
  fi
  # Force base 10: a value with leading zeros (e.g. a hand-edited "0042" or
  # "08") would otherwise be read as octal by $(( )) and printf %d in callers,
  # yielding a wrong number or an arithmetic error.
  echo "$((10#$raw))"
}
|
||||
|
||||
#######################################
# Replace COUNTER_FILE with the given value by writing a sibling temp file
# and renaming it over the target, so an interrupted write never leaves a
# truncated or empty counter behind.
# Globals:   COUNTER_FILE (written)
# Arguments: $1 - integer value to store
# Returns:   non-zero when the temp file cannot be written or renamed
#######################################
_write_counter() {
  local tmp="${COUNTER_FILE}.$$.tmp"
  if ! echo "$1" > "$tmp"; then
    rm -f -- "$tmp"
    return 1
  fi
  mv -f -- "$tmp" "$COUNTER_FILE"
}

# Mode dispatch. All branches run while the allocator lock is held.
case "$MODE" in
  --peek)
    # Show the next value without consuming it.
    read_or_recover_counter
    ;;
  --rebuild)
    # Recompute the counter from the highest address present in the vault.
    max_c="$(scan_max_c_address)"
    _write_counter "$((max_c + 1))"
    echo "Counter rebuilt: next = $((max_c + 1))"
    ;;
  allocate|"")
    current="$(read_or_recover_counter)"
    # Force base 10 so a zero-padded counter is never interpreted as octal.
    current=$((10#$current))
    next=$((current + 1))
    # Persist the incremented counter BEFORE announcing the address, so an
    # address is never handed out unless the reservation was saved.
    _write_counter "$next"
    printf 'c-%06d\n' "$current"
    ;;
  *)
    echo "ERR: unknown mode: $MODE" >&2
    echo "Usage: $0 [allocate|--peek|--rebuild]" >&2
    exit 3
    ;;
esac
|
||||
@@ -0,0 +1,167 @@
|
||||
#!/usr/bin/env python3
|
||||
"""baseline-v16.py — simulate the v1.6 hot→index→drill retrieval chain.
|
||||
|
||||
Exists ONLY for benchmarking v1.7's hybrid retrieval against the legacy
|
||||
v1.6 behavior. Not used by any v1.7 skill; not feature-gated; not part of
|
||||
the regular vault workflow.
|
||||
|
||||
The v1.6 query path (per skills/wiki-query/SKILL.md before v1.7):
|
||||
1. Read wiki/hot.md (always; quick context)
|
||||
2. Read wiki/index.md (scan for descriptions matching query terms)
|
||||
3. Read top-N pages cited in the index whose entries best match query
|
||||
4. Caller synthesizes answer
|
||||
|
||||
This script approximates that path by:
|
||||
1. Tokenizing the query (same stopword-filtered, Unicode-aware tokenizer as bm25-index.py)
|
||||
2. Scoring each *.md page in wiki/ by the count of distinct query terms it contains
|
||||
(case-insensitive substring on the full file body; no semantic matching)
|
||||
3. Returning top-K pages by score, where a page's score is the sum of:
a. Its distinct-term count, plus 0.01 x its total raw term-occurrence count
b. A +5 boost if the page is named in hot.md
c. A +3 boost if the page is named in index.md
|
||||
|
||||
The simulation is intentionally simple — it represents what a human or a
|
||||
basic agent does when reading hot/index "by hand" without any retrieval
|
||||
infrastructure. Anything fancier would not be a fair v1.6 baseline.
|
||||
|
||||
Usage:
|
||||
baseline-v16.py "your query" [--top 5]
|
||||
baseline-v16.py "query" --top 5 --json # output as JSON (default: text)
|
||||
|
||||
Exit codes:
|
||||
0 — success
|
||||
2 — usage error
|
||||
3 — wiki directory missing
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
from collections import Counter
|
||||
from pathlib import Path
|
||||
|
||||
VAULT_ROOT = Path(__file__).resolve().parent.parent
|
||||
WIKI_DIR = VAULT_ROOT / "wiki"
|
||||
HOT_PATH = WIKI_DIR / "hot.md"
|
||||
INDEX_PATH = WIKI_DIR / "index.md"
|
||||
|
||||
# Mirror bm25-index.py's tokenizer + stopword list so comparisons are fair.
|
||||
STOPWORDS = frozenset("""
|
||||
a an and are as at be by for from has have he her him his i if in is it its
|
||||
of on or that the their them they this to was were will with you your
|
||||
""".split())
|
||||
|
||||
# Mirrors bm25-index.py's Unicode-aware tokenizer (v1.7.2; closes M2).
|
||||
TOKEN_RE = re.compile(r"\w[\w'\-]*", re.UNICODE)
|
||||
|
||||
HOT_BOOST = 5.0
|
||||
INDEX_BOOST = 3.0
|
||||
|
||||
EXIT_OK = 0
|
||||
EXIT_USAGE = 2
|
||||
EXIT_NO_WIKI = 3
|
||||
|
||||
|
||||
def tokenize(text):
    """Return the lowercase terms of `text`, minus stopwords and 1-char tokens."""
    terms = []
    for raw in TOKEN_RE.findall(text):
        lowered = raw.lower()
        # Same two filters as the index-side tokenizer: stopword and length.
        if lowered in STOPWORDS or len(raw) <= 1:
            continue
        terms.append(lowered)
    return terms
|
||||
|
||||
|
||||
def page_paths():
    """Return the sorted list of wiki *.md pages, skipping hidden entries.

    A page is skipped when any component of its path BELOW the wiki directory
    starts with a dot. Components above the wiki directory are deliberately
    not inspected: testing the absolute path would exclude every page whenever
    the vault itself lives under a dot-directory.

    Exits with EXIT_NO_WIKI when the wiki directory does not exist.
    """
    if not WIKI_DIR.is_dir():
        print(f"ERR: no wiki directory at {WIKI_DIR}", file=sys.stderr)
        sys.exit(EXIT_NO_WIKI)
    return sorted(
        p for p in WIKI_DIR.rglob("*.md")
        if not any(part.startswith(".") for part in p.relative_to(WIKI_DIR).parts)
    )
|
||||
|
||||
|
||||
def score_page(page_path, query_terms_set, query_terms_counter):
    """Score one page against the query terms.

    The base score is the number of distinct query terms found in the
    lowercased page text (substring match) plus 0.01 times the total number
    of occurrences. HOT_BOOST / INDEX_BOOST are then added when the page's
    stem or its vault-relative path appears in hot.md / index.md.

    `query_terms_counter` is accepted but not used by the scoring.

    Returns (score, distinct_matches, total_occurrences); an unreadable page
    yields (0.0, 0, 0).
    """
    try:
        body = page_path.read_text(encoding="utf-8", errors="replace").lower()
    except OSError:
        return (0.0, 0, 0)

    distinct = 0
    total = 0
    for term in query_terms_set:
        occurrences = body.count(term)
        total += occurrences
        if occurrences:
            distinct += 1
    score = float(distinct) + 0.01 * total  # distinct dominates; total is tiebreak

    # Both reference files are handled the same way: a missing or unreadable
    # file simply contributes no boost.
    for ref_path, boost in ((HOT_PATH, HOT_BOOST), (INDEX_PATH, INDEX_BOOST)):
        if not ref_path.is_file():
            continue
        try:
            ref_body = ref_path.read_text(encoding="utf-8", errors="replace")
            if (page_path.stem in ref_body
                    or str(page_path.relative_to(VAULT_ROOT)) in ref_body):
                score += boost
        except OSError:
            pass

    return (score, distinct, total)
|
||||
|
||||
|
||||
def baseline_query(query, top_k=5):
    """Return the top-K matching pages for `query`, best score first.

    Each result is a dict with keys `path` (relative to the vault root),
    `score`, `distinct_terms` and `total_occurrences`. An empty list is
    returned when the query tokenizes to nothing or no page contains any
    query term.

    A page qualifies only if it contains at least one query term. Filtering
    on the final score instead would let the hot.md / index.md boosts alone
    push pages with zero term matches into the results, so a query that
    matches nothing could never come back empty.
    """
    terms = tokenize(query)
    if not terms:
        return []
    terms_set = set(terms)
    terms_counter = Counter(terms)

    scored = []
    for p in page_paths():
        score, distinct, total = score_page(p, terms_set, terms_counter)
        if distinct <= 0:
            continue
        scored.append({
            "path": str(p.relative_to(VAULT_ROOT)),
            "score": round(score, 4),
            "distinct_terms": distinct,
            "total_occurrences": total,
        })

    scored.sort(key=lambda d: d["score"], reverse=True)
    # Clamp so a negative top_k yields no results instead of "all but the last N".
    return scored[:max(0, top_k)]
|
||||
|
||||
|
||||
def main():
    """CLI entry point: run the baseline query and print text or JSON results."""
    parser = argparse.ArgumentParser(description="v1.6 baseline retrieval simulator.")
    parser.add_argument("query", help="Natural-language query")
    parser.add_argument("--top", type=int, default=5, help="Top-K results")
    parser.add_argument("--json", action="store_true", help="Output as JSON")
    args = parser.parse_args()

    results = baseline_query(args.query, top_k=args.top)

    if args.json:
        payload = {
            "query": args.query,
            "strategy": "baseline-v1.6:hot+index+keyword",
            "top_k": args.top,
            "candidates": results,
        }
        print(json.dumps(payload, indent=2))
        return EXIT_OK

    if not results:
        print("(no matches)")
        return EXIT_OK

    print(f"v1.6 baseline for: {args.query!r}")
    for rank, hit in enumerate(results, 1):
        print(f" {rank}. {hit['path']} score={hit['score']} distinct={hit['distinct_terms']} occ={hit['total_occurrences']}")
    return EXIT_OK
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
@@ -0,0 +1,215 @@
|
||||
#!/usr/bin/env python3
|
||||
"""benchmark-runner.py — score v1.7 hybrid retrieval vs v1.6 baseline.
|
||||
|
||||
Reads the 50-query corpus at wiki/meta/retrieval-benchmark-v1.7.md, runs both
|
||||
pipelines for each query, scores top-1 / top-5 accuracy, prints a comparison
|
||||
table. Used by the v1.7.0 audit.
|
||||
|
||||
Pure stdlib + subprocess. No network or LLM calls of its own — the subprocess
|
||||
calls to retrieve.py may hit ollama (if installed) for rerank. baseline-v16.py
|
||||
is pure filesystem.
|
||||
|
||||
Usage:
|
||||
benchmark-runner.py # run all 50 queries, print summary
|
||||
benchmark-runner.py --json results.json # also write per-query results
|
||||
benchmark-runner.py --limit 5 # smoke: first 5 queries only
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
VAULT_ROOT = Path(__file__).resolve().parent.parent
|
||||
CORPUS = VAULT_ROOT / "wiki" / "meta" / "retrieval-benchmark-v1.7.md"
|
||||
|
||||
|
||||
def _corpus_field(body, field):
    """Return the stripped value of the `- <field>: value` line in body, or ""."""
    # [ \t]* (not \s*) keeps the match on a single line: \s would run across
    # the newline of an empty field and capture the NEXT line as its value.
    m = re.search(rf"^- {re.escape(field)}:[ \t]*(.*)$", body, re.MULTILINE)
    return m.group(1).strip() if m else ""


def _corpus_field_list(body, field):
    """Return a comma-separated field as a list; empty or "null" gives []."""
    raw = _corpus_field(body, field)
    if not raw or raw == "null":
        return []
    return [s.strip() for s in raw.split(",") if s.strip()]


def parse_corpus(corpus_path):
    """Parse the `### <id>` blocks of the benchmark corpus into query dicts.

    Only blocks whose heading is an id of the form D<digits> or H<digits> are
    kept; other `###` headings are ignored. A block ends at the next `### `
    heading or at the next `## ` section heading.

    Returns a list of dicts with keys: id, query, correct (list),
    relevant (list), category, rationale. Missing fields become "" or [].
    """
    text = corpus_path.read_text(encoding="utf-8")
    # Split on "### " at the start of any line, including the first line of
    # the file. Element 0 is whatever precedes the first heading.
    blocks = re.split(r"^### ", text, flags=re.MULTILINE)
    queries = []
    for blk in blocks[1:]:  # skip prelude
        # First line is the id (e.g. "D1"); the remainder is the block body.
        lines = blk.split("\n", 1)
        if len(lines) < 2:
            continue
        qid = lines[0].strip()
        if not re.match(r"^[DH]\d+$", qid):
            continue
        # Stop at the next "## " section header.
        body = re.split(r"\n## ", lines[1], 1)[0]
        queries.append({
            "id": qid,
            "query": _corpus_field(body, "query"),
            "correct": _corpus_field_list(body, "correct"),
            "relevant": _corpus_field_list(body, "relevant"),
            "category": _corpus_field(body, "category"),
            "rationale": _corpus_field(body, "rationale"),
        })
    return queries
|
||||
|
||||
|
||||
def run_v17(query, top_k=5):
    """Run scripts/retrieve.py for one query.

    Returns (page_paths, error): the ordered `page_path` values of the
    returned candidates and None on success, or ([], message) when the
    subprocess fails, times out (60 s), or prints output that is not the
    expected JSON object with a `candidates` list of objects.
    """
    # Use the interpreter running this benchmark so both sides share one
    # Python; fall back to PATH lookup if it is unknown.
    cmd = [sys.executable or "python3",
           str(VAULT_ROOT / "scripts" / "retrieve.py"),
           query, "--top", str(top_k)]
    try:
        result = subprocess.run(
            cmd, capture_output=True, text=True, timeout=60, check=False,
        )
    except (subprocess.TimeoutExpired, OSError) as e:
        return [], str(e)
    if result.returncode != 0:
        return [], f"rc={result.returncode}: {result.stderr.strip()[:200]}"
    try:
        data = json.loads(result.stdout)
        return [c["page_path"] for c in data.get("candidates", [])], None
    except json.JSONDecodeError as e:
        return [], str(e)
    except (AttributeError, KeyError, TypeError) as e:
        # Valid JSON of an unexpected shape: record it as this query's error
        # instead of aborting the whole benchmark run.
        return [], f"malformed retrieve.py output: {e!r}"
|
||||
|
||||
|
||||
def run_v16(query, top_k=5):
    """Run scripts/baseline-v16.py --json for one query.

    Returns (page_paths, error): the ordered `path` values of the returned
    candidates and None on success, or ([], message) when the subprocess
    fails, times out (30 s), or prints output that is not the expected JSON
    object with a `candidates` list of objects.
    """
    # Use the interpreter running this benchmark so both sides share one
    # Python; fall back to PATH lookup if it is unknown.
    cmd = [sys.executable or "python3",
           str(VAULT_ROOT / "scripts" / "baseline-v16.py"),
           query, "--top", str(top_k), "--json"]
    try:
        result = subprocess.run(
            cmd, capture_output=True, text=True, timeout=30, check=False,
        )
    except (subprocess.TimeoutExpired, OSError) as e:
        return [], str(e)
    if result.returncode != 0:
        # Include a stderr excerpt, matching what run_v17 reports.
        return [], f"rc={result.returncode}: {result.stderr.strip()[:200]}"
    try:
        data = json.loads(result.stdout)
        return [c["path"] for c in data.get("candidates", [])], None
    except json.JSONDecodeError as e:
        return [], str(e)
    except (AttributeError, KeyError, TypeError) as e:
        # Valid JSON of an unexpected shape: record it as this query's error
        # instead of aborting the whole benchmark run.
        return [], f"malformed baseline-v16.py output: {e!r}"
|
||||
|
||||
|
||||
def score_query(results, correct, relevant, category):
    """Return (top1_success, top5_success) as 0/1 integers.

    Normal queries (non-empty `correct`, category other than "negative"):
    top-1 succeeds when the first result is in `correct`; top-5 succeeds when
    any of the first five results is.

    Negative queries (category "negative" or empty `correct`): an empty
    result list counts as success on both measures; otherwise the same checks
    are made against `relevant` instead of `correct`.
    """
    window = results[:5]

    if category != "negative" and correct:
        first_hit = bool(results) and results[0] in correct
        any_hit = any(r in correct for r in window)
        return (int(first_hit), int(any_hit))

    if not results:
        return (1, 1)  # no results = correctly "found nothing"
    return (
        int(results[0] in relevant),
        int(any(r in relevant for r in window)),
    )
|
||||
|
||||
|
||||
def main():
    """CLI entry point: run every corpus query through both pipelines.

    Prints one progress line per query, then a per-category and total
    comparison table, then the two ship-gate figures. With --json PATH the
    summary, per-category and per-query records are also written to PATH.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--limit", type=int, default=None, help="Only run first N queries")
    parser.add_argument("--json", help="Write per-query results to PATH")
    parser.add_argument("--top", type=int, default=5)
    args = parser.parse_args()

    # Fail with a usage-style message rather than a traceback when the corpus
    # file is absent.
    if not CORPUS.is_file():
        parser.error(f"benchmark corpus not found: {CORPUS}")

    queries = parse_corpus(CORPUS)
    # Compare against None so that "--limit 0" means zero queries instead of
    # being mistaken for "no limit"; negative values are clamped to zero.
    if args.limit is not None:
        queries = queries[: max(0, args.limit)]

    print(f"Parsed {len(queries)} queries from {CORPUS.relative_to(VAULT_ROOT)}\n")

    per_query = []
    cat_stats = {}  # category -> {"v17_t1", "v17_t5", "v16_t1", "v16_t5", "n"}

    for q in queries:
        v17_results, v17_err = run_v17(q["query"], top_k=args.top)
        v16_results, v16_err = run_v16(q["query"], top_k=args.top)
        v17_top1, v17_top5 = score_query(v17_results, q["correct"], q["relevant"], q["category"])
        v16_top1, v16_top5 = score_query(v16_results, q["correct"], q["relevant"], q["category"])

        record = {
            "id": q["id"],
            "category": q["category"],
            "query": q["query"][:80] + ("..." if len(q["query"]) > 80 else ""),
            "correct": q["correct"],
            "v17_top1": v17_top1,
            "v17_top5": v17_top5,
            "v17_results": v17_results[:args.top],
            "v17_err": v17_err,
            "v16_top1": v16_top1,
            "v16_top5": v16_top5,
            "v16_results": v16_results[:args.top],
            "v16_err": v16_err,
        }
        per_query.append(record)

        cat = q["category"]
        if cat not in cat_stats:
            cat_stats[cat] = {"v17_t1": 0, "v17_t5": 0, "v16_t1": 0, "v16_t5": 0, "n": 0}
        cat_stats[cat]["v17_t1"] += v17_top1
        cat_stats[cat]["v17_t5"] += v17_top5
        cat_stats[cat]["v16_t1"] += v16_top1
        cat_stats[cat]["v16_t5"] += v16_top5
        cat_stats[cat]["n"] += 1

        # Live progress
        marker = "✓" if v17_top1 else "·"
        v16marker = "✓" if v16_top1 else "·"
        print(f" {q['id']:4} [{q['category']:14}] v17:{marker} v16:{v16marker} {q['query'][:60]}")

    # Aggregate
    total_v17_t1 = sum(c["v17_t1"] for c in cat_stats.values())
    total_v17_t5 = sum(c["v17_t5"] for c in cat_stats.values())
    total_v16_t1 = sum(c["v16_t1"] for c in cat_stats.values())
    total_v16_t5 = sum(c["v16_t5"] for c in cat_stats.values())
    total_n = sum(c["n"] for c in cat_stats.values())

    def pct(x, n):
        return f"{100.0 * x / n:5.1f}%" if n else " n/a"

    print()
    print("=" * 80)
    print(f"{'Category':<16} {'N':>4} {'v17 top-1':>10} {'v17 top-5':>10} {'v16 top-1':>10} {'v16 top-5':>10} Δ top-1")
    print("-" * 80)
    for cat, c in sorted(cat_stats.items()):
        delta = (c["v17_t1"] - c["v16_t1"]) / c["n"] * 100 if c["n"] else 0
        print(f"{cat:<16} {c['n']:>4} {pct(c['v17_t1'], c['n']):>10} {pct(c['v17_t5'], c['n']):>10} {pct(c['v16_t1'], c['n']):>10} {pct(c['v16_t5'], c['n']):>10} {delta:+6.1f}pp")
    delta_total = (total_v17_t1 - total_v16_t1) / total_n * 100 if total_n else 0
    print("-" * 80)
    print(f"{'TOTAL':<16} {total_n:>4} {pct(total_v17_t1, total_n):>10} {pct(total_v17_t5, total_n):>10} {pct(total_v16_t1, total_n):>10} {pct(total_v16_t5, total_n):>10} {delta_total:+6.1f}pp")
    print()
    print(f"Plan §7 ship-gate target: ≥30 percentage-point improvement in top-1")
    print(f"Actual: {delta_total:+.1f}pp ({'PASS' if delta_total >= 30 else 'INFO'} — pp gain alone, not failure-reduction %)")
    # Also compute as a relative reduction in "wrong page cited" errors
    v17_wrong = total_n - total_v17_t1
    v16_wrong = total_n - total_v16_t1
    err_reduction = (v16_wrong - v17_wrong) / v16_wrong * 100 if v16_wrong else 0
    print(f"Error-reduction (the gate's actual framing): {err_reduction:+.1f}% ({'PASS' if err_reduction >= 30 else 'FAIL'})")
    print()

    if args.json:
        Path(args.json).write_text(json.dumps({
            "summary": {
                "v17_top1_pct": 100 * total_v17_t1 / total_n if total_n else 0,
                "v17_top5_pct": 100 * total_v17_t5 / total_n if total_n else 0,
                "v16_top1_pct": 100 * total_v16_t1 / total_n if total_n else 0,
                "v16_top5_pct": 100 * total_v16_t5 / total_n if total_n else 0,
                "delta_top1_pp": delta_total,
                "error_reduction_pct": err_reduction,
            },
            # Every category entry is created together with its first query,
            # so c["n"] is at least 1 here.
            "by_category": {cat: {**c, "v17_top1_pct": 100*c["v17_t1"]/c["n"], "v16_top1_pct": 100*c["v16_t1"]/c["n"]} for cat, c in cat_stats.items()},
            "per_query": per_query,
        }, indent=2, ensure_ascii=False), encoding="utf-8")
        print(f"Wrote per-query results to {args.json}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,293 @@
|
||||
#!/usr/bin/env python3
|
||||
"""bm25-index.py — sparse BM25 inverted index over contextualized wiki chunks.
|
||||
|
||||
Pure stdlib (no rank_bm25 dep). Standard Okapi BM25 with k1=1.5, b=0.75.
|
||||
Indexes the `contextualized_text` field of every chunk under .vault-meta/chunks/,
|
||||
emits a single JSON file at .vault-meta/bm25/index.json with the schema below.
|
||||
|
||||
Concurrency:
|
||||
- Locks .vault-meta/.bm25.lock (fcntl exclusive) around any index write.
|
||||
- Atomic .tmp + rename for the index file.
|
||||
|
||||
Index schema (.vault-meta/bm25/index.json):
|
||||
{
|
||||
"schema_version": 1,
|
||||
"params": {"k1": 1.5, "b": 0.75},
|
||||
"doc_count": 1234,
|
||||
"avg_dl": 487.5,
|
||||
"updated_at": "2026-05-17T...",
|
||||
"vocab": {
|
||||
"<term>": {"df": 17, "postings": [["c-000001:0", 3], ["c-000042:2", 1], ...]}
|
||||
},
|
||||
"docs": {
|
||||
"<chunk_id>": {"path": ".vault-meta/chunks/c-000001/chunk-000.json", "dl": 487}
|
||||
}
|
||||
}
|
||||
|
||||
Chunk id format: "<page-address>:<chunk-index>" (e.g. "c-000042:3").
|
||||
|
||||
Tokenization: lowercase, collapse whitespace, drop punctuation except in-word
|
||||
apostrophes and hyphens. ASCII-only stopwords filtered (small list; favors
|
||||
recall over precision).
|
||||
|
||||
Query interface (used by retrieve.py at query time):
|
||||
bm25-index.py query "your text here" [--top 20]
|
||||
|
||||
Build interface:
|
||||
bm25-index.py build # full rebuild (always; incremental is v1.7.x scope)
|
||||
bm25-index.py stats # print index stats
|
||||
|
||||
Exit codes:
|
||||
0 — success
|
||||
1 — lock acquisition failed
|
||||
2 — usage error
|
||||
3 — index file missing or corrupt (query mode)
|
||||
4 — chunks directory missing
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import fcntl
|
||||
import json
|
||||
import math
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from collections import Counter, defaultdict
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
VAULT_ROOT = Path(__file__).resolve().parent.parent
|
||||
META_DIR = VAULT_ROOT / ".vault-meta"
|
||||
CHUNKS_DIR = META_DIR / "chunks"
|
||||
BM25_DIR = META_DIR / "bm25"
|
||||
INDEX_PATH = BM25_DIR / "index.json"
|
||||
LOCK_PATH = META_DIR / ".bm25.lock"
|
||||
|
||||
K1 = 1.5
|
||||
B = 0.75
|
||||
|
||||
# Small high-frequency-stopword list (English). Conservative — keep recall high.
|
||||
STOPWORDS = frozenset("""
|
||||
a an and are as at be by for from has have he her him his i if in is it its
|
||||
of on or that the their them they this to was were will with you your
|
||||
""".split())
|
||||
|
||||
# Unicode-aware tokenizer (v1.7.2; closes audit M2). \w under re.UNICODE
|
||||
# matches letters and digits from any script (CJK, Cyrillic, accented Latin,
|
||||
# Devanagari, etc.) plus underscore. Internal apostrophes and hyphens are
|
||||
# preserved so "user's" and "well-formed" stay single tokens. Pure-symbol or
|
||||
# pure-emoji tokens fail the leading \w anchor and are correctly skipped.
|
||||
TOKEN_RE = re.compile(r"\w[\w'\-]*", re.UNICODE)
|
||||
|
||||
EXIT_OK = 0
|
||||
EXIT_LOCK = 1
|
||||
EXIT_USAGE = 2
|
||||
EXIT_INDEX_MISSING = 3
|
||||
EXIT_NO_CHUNKS = 4
|
||||
|
||||
|
||||
def log(msg):
    """Write `msg` followed by a newline to stderr, keeping stdout clean."""
    sys.stderr.write(str(msg) + "\n")
|
||||
|
||||
|
||||
def tokenize(text):
    """Return the lowercase terms of `text`, minus stopwords and 1-char tokens.

    Tokens are whatever TOKEN_RE matches; punctuation outside a token is
    dropped by the pattern itself.
    """
    terms = []
    for raw in TOKEN_RE.findall(text):
        lowered = raw.lower()
        if lowered in STOPWORDS or len(raw) <= 1:
            continue
        terms.append(lowered)
    return terms
|
||||
|
||||
|
||||
def acquire_lock():
    """Take the exclusive, non-blocking bm25 lock and return its descriptor.

    Creates the meta directory and the lock file when needed. If the lock is
    already held (flock raises OSError), the descriptor is closed, an error
    is logged and the process exits with EXIT_LOCK.
    """
    META_DIR.mkdir(parents=True, exist_ok=True)
    open_flags = os.O_CREAT | os.O_WRONLY
    fd = os.open(str(LOCK_PATH), open_flags, 0o644)
    try:
        # LOCK_NB: fail immediately instead of waiting for the current holder.
        fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
    except OSError:
        os.close(fd)
        log("ERR: could not acquire bm25 lock")
        sys.exit(EXIT_LOCK)
    else:
        return fd
|
||||
|
||||
|
||||
def release_lock(fd):
    """Unlock the descriptor returned by acquire_lock() and close it.

    The descriptor is closed even if the unlock call raises.
    """
    try:
        fcntl.flock(fd, fcntl.LOCK_UN)
    finally:
        os.close(fd)
|
||||
|
||||
|
||||
def discover_chunks():
    """Yield (chunk_id, path, contextualized_text) for every chunk on disk.

    The yielded `path` is relative to the directory two levels above
    CHUNKS_DIR (i.e. .vault-meta/chunks/<addr>/ -> relative to the vault
    root). This works both in production (CHUNKS_DIR is
    `<vault>/.vault-meta/chunks`) and when tests monkey-patch CHUNKS_DIR to a
    sandbox `<tmp>/.vault-meta/chunks`.

    `chunk_id` is "<page_address>:<chunk_index>". The text is the chunk's
    `contextualized_text`, falling back to `raw_text`, then to "".

    Chunk files are skipped (with a log line where noted) when they cannot be
    read or decoded, do not hold a JSON object, lack `page_address` or
    `chunk_index` (silently), or carry non-string text. One bad file
    therefore never aborts the whole build.

    Exits with EXIT_NO_CHUNKS when CHUNKS_DIR does not exist.
    """
    if not CHUNKS_DIR.is_dir():
        log(f"ERR: no chunks directory at {CHUNKS_DIR}")
        sys.exit(EXIT_NO_CHUNKS)
    rel_root = CHUNKS_DIR.parent.parent
    for chunk_file in sorted(CHUNKS_DIR.glob("*/chunk-*.json")):
        try:
            data = json.loads(chunk_file.read_text(encoding="utf-8"))
        except (ValueError, OSError) as e:
            # ValueError covers both json.JSONDecodeError and the
            # UnicodeDecodeError raised for bytes that are not valid UTF-8.
            log(f" skip (unreadable): {chunk_file} — {e}")
            continue
        if not isinstance(data, dict):
            log(f" skip (not a JSON object): {chunk_file}")
            continue
        address = data.get("page_address")
        idx = data.get("chunk_index")
        if address is None or idx is None:
            continue
        # `or ""` also covers an explicit null raw_text, which would
        # otherwise reach the tokenizer as None.
        text = data.get("contextualized_text") or data.get("raw_text") or ""
        if not isinstance(text, str):
            log(f" skip (non-string text): {chunk_file}")
            continue
        chunk_id = f"{address}:{idx}"
        rel_path = str(chunk_file.relative_to(rel_root))
        yield chunk_id, rel_path, text
|
||||
|
||||
|
||||
def build_index():
    """Build the full in-memory BM25 index from every discovered chunk.

    Returns the index dict (schema_version, params, doc_count, avg_dl,
    updated_at, vocab, docs), or None when no chunk was indexed.
    """
    docs = {}
    postings = {}  # term -> [[chunk_id, term_frequency], ...]

    for chunk_id, rel_path, text in discover_chunks():
        tokens = tokenize(text)
        docs[chunk_id] = {"path": rel_path, "dl": len(tokens)}
        for term, count in Counter(tokens).items():
            postings.setdefault(term, []).append([chunk_id, count])

    if not docs:
        log("WARN: no chunks indexed")
        return None

    total_len = sum(entry["dl"] for entry in docs.values())
    avg_dl = total_len / len(docs)

    # One posting is appended per (term, chunk), so the document frequency of
    # a term equals the length of its postings list.
    vocab = {
        term: {"df": len(postings[term]), "postings": postings[term]}
        for term in sorted(postings)
    }
    built_at = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    return {
        "schema_version": 1,
        "params": {"k1": K1, "b": B},
        "doc_count": len(docs),
        "avg_dl": avg_dl,
        "updated_at": built_at,
        "vocab": vocab,
        "docs": docs,
    }
|
||||
|
||||
|
||||
def write_index(index):
    """Serialize `index` to INDEX_PATH via a per-process temp file + rename.

    The temp file sits next to the target and is removed if it is still
    present afterwards (i.e. when the write or the rename failed).
    """
    BM25_DIR.mkdir(parents=True, exist_ok=True)
    staging = INDEX_PATH.with_suffix(f".{os.getpid()}.tmp")
    # Serialize first; a failure here happens before any file is created.
    payload = json.dumps(index, ensure_ascii=False)
    try:
        staging.write_text(payload, encoding="utf-8")
        os.replace(staging, INDEX_PATH)
    finally:
        if staging.exists():
            staging.unlink(missing_ok=True)
|
||||
|
||||
|
||||
def load_index():
    """Load and return the index dict from INDEX_PATH.

    Exits with EXIT_INDEX_MISSING when the file is absent, cannot be read,
    is not valid UTF-8 JSON, or does not contain a JSON object.
    """
    if not INDEX_PATH.is_file():
        log(f"ERR: no index at {INDEX_PATH}. Run `bm25-index.py build` first.")
        sys.exit(EXIT_INDEX_MISSING)
    try:
        data = json.loads(INDEX_PATH.read_text(encoding="utf-8"))
    except (ValueError, OSError) as e:
        # ValueError covers json.JSONDecodeError and also UnicodeDecodeError,
        # so a file with invalid bytes is reported as corrupt rather than
        # surfacing as a traceback.
        log(f"ERR: index corrupt: {e}")
        sys.exit(EXIT_INDEX_MISSING)
    if not isinstance(data, dict):
        log(f"ERR: index corrupt: expected a JSON object, got {type(data).__name__}")
        sys.exit(EXIT_INDEX_MISSING)
    return data
|
||||
|
||||
|
||||
def query(text, top_k=20):
    """Score all indexed chunks against `text` with BM25; return the best `top_k`.

    Uses the k1 / b parameters stored in the index. Each result is a dict
    with `chunk_id`, `score` (rounded to 6 decimals) and `path`. Returns []
    when the query tokenizes to nothing.
    """
    idx = load_index()
    vocab = idx["vocab"]
    docs = idx["docs"]
    params = idx["params"]
    avg_dl = idx["avg_dl"]
    doc_total = idx["doc_count"]
    k1 = params["k1"]
    b = params["b"]

    qterms = tokenize(text)
    if not qterms:
        return []

    # Defensive guard (v1.7.2; closes audit L7): an avg_dl of 0 is replaced
    # by 1.0 so the length normalisation below can never divide by zero.
    avg_dl_safe = avg_dl or 1.0

    scores = {}
    for term in qterms:
        entry = vocab.get(term)
        if not entry:
            continue
        df = entry["df"]
        idf = math.log(1 + (doc_total - df + 0.5) / (df + 0.5))
        for cid, cnt in entry["postings"]:
            dl = docs[cid]["dl"]
            denom = cnt + k1 * (1 - b + b * dl / avg_dl_safe)
            scores[cid] = scores.get(cid, 0.0) + idf * (cnt * (k1 + 1)) / denom

    ranked = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:top_k]

    hits = []
    for cid, score in ranked:
        hits.append({
            "chunk_id": cid,
            "score": round(score, 6),
            "path": docs[cid]["path"],
        })
    return hits
|
||||
|
||||
|
||||
def stats():
    """Print a JSON summary of the stored index to stdout."""
    idx = load_index()
    summary = {
        "doc_count": idx["doc_count"],
        "avg_dl": round(idx["avg_dl"], 2),
        "vocab_size": len(idx["vocab"]),
        "updated_at": idx["updated_at"],
        "params": idx["params"],
    }
    print(json.dumps(summary, indent=2))
|
||||
|
||||
|
||||
def main():
    """CLI entry point for the `build`, `query` and `stats` subcommands.

    Returns the process exit code.
    """
    parser = argparse.ArgumentParser(description="BM25 inverted index over wiki chunks.")
    sub = parser.add_subparsers(dest="cmd", required=True)

    sub.add_parser("build", help="Build the index (full rebuild every time in v1.7).")

    sp_query = sub.add_parser("query", help="Query the index.")
    sp_query.add_argument("text", help="Query text")
    sp_query.add_argument("--top", type=int, default=20, help="Top-K results")

    sub.add_parser("stats", help="Print index stats.")

    args = parser.parse_args()
    command = args.cmd

    if command == "build":
        # Hold the lock for the whole rebuild; it is released on every path.
        fd = acquire_lock()
        try:
            index = build_index()
            if index is not None:
                write_index(index)
                log(f"Wrote {INDEX_PATH} docs={index['doc_count']} vocab={len(index['vocab'])} avg_dl={index['avg_dl']:.1f}")
            else:
                log("Nothing to index.")
        finally:
            release_lock(fd)
        return EXIT_OK

    if command == "query":
        hits = query(args.text, top_k=args.top)
        print(json.dumps(hits, indent=2))
        return EXIT_OK

    if command == "stats":
        stats()
        return EXIT_OK

    return EXIT_USAGE
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
@@ -0,0 +1,312 @@
|
||||
#!/usr/bin/env python3
|
||||
"""boundary-score.py — DragonScale Mechanism 4: boundary-first autoresearch scorer.
|
||||
|
||||
Reads `wiki/**/*.md`, builds a wikilink graph, and emits per-page boundary
|
||||
scores to stdout (text) or as JSON for tooling.
|
||||
|
||||
boundary_score(p) = (out_degree(p) - in_degree(p)) * recency_weight(p)
|
||||
|
||||
- out_degree(p): count of distinct wikilinks in p that resolve to a
|
||||
scoreable page (scoreable = non-meta, non-fold, non-excluded).
|
||||
- in_degree(p): count of distinct scoreable pages that link to p.
|
||||
- recency_weight(p): exp(-days_since_updated / RECENCY_HALFLIFE_DAYS).
|
||||
No floor; very old pages approach zero weight, which is the intended
|
||||
semantic of "frontier" (recently-touched and outward-pointing).
|
||||
|
||||
High score = the page points at many things, is pointed at by few, and
|
||||
has been touched recently. That is a vault frontier page. Low or
|
||||
negative score = hub / integrated page.
|
||||
|
||||
Feature-gated opt-in: autoresearch only invokes this when DragonScale
|
||||
setup is detected. Safe to run standalone even without DragonScale set
|
||||
up (reads wiki/ only; never writes).
|
||||
|
||||
This script is intentionally stdout-only. There is no `--report PATH`
|
||||
equivalent to `tiling-check.py --report` because the helper is small
|
||||
enough to pipe directly (`./scripts/boundary-score.py --json | jq ...`)
|
||||
and keeping it read-only removes a write-path attack surface.
|
||||
|
||||
Usage:
|
||||
boundary-score.py # top-10 frontier, text
|
||||
boundary-score.py --top N # top N frontier
|
||||
boundary-score.py --json # JSON output
|
||||
boundary-score.py --page PATH # score for a single page
|
||||
boundary-score.py --include-score-zero # include pages with score=0
|
||||
|
||||
Exit codes:
|
||||
0 success
|
||||
2 usage error
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import math
|
||||
import re
|
||||
import sys
|
||||
from datetime import date, datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
VAULT_ROOT = Path(__file__).resolve().parent.parent
|
||||
WIKI_DIR = VAULT_ROOT / "wiki"
|
||||
|
||||
EXCLUDE_TYPES = {"meta", "fold"}
|
||||
EXCLUDE_FILENAMES = {
|
||||
"_index.md", "index.md", "log.md", "hot.md", "overview.md",
|
||||
"dashboard.md", "Wiki Map.md", "getting-started.md",
|
||||
}
|
||||
EXCLUDE_PATH_PREFIXES = ("wiki/folds/", "wiki/meta/")
|
||||
|
||||
RECENCY_HALFLIFE_DAYS = 30.0
|
||||
# No recency floor: a truly stale page should NOT dominate the frontier
|
||||
# ranking, even if its out-degree is high. The exponential decay takes
|
||||
# weight toward zero for year-old pages, which is the intended semantic
|
||||
# of "frontier" (recently-touched and outward-pointing).
|
||||
DEFAULT_TOP = 10
|
||||
MAX_BODY_BYTES = 256 * 1024
|
||||
# CommonMark-ish fence tracking: opening fence records (char, length);
|
||||
# a closing fence must use the SAME char with SAME-OR-LONGER run length.
|
||||
# Tilde fences (~~~) are supported alongside backtick fences (```). Indented
|
||||
# code blocks (4+ spaces) are NOT filtered; in Obsidian usage, indented
|
||||
# bullets commonly contain wikilinks and should count as edges.
|
||||
|
||||
FRONTMATTER_RE = re.compile(r"^---\n(.*?)\n---\n", re.DOTALL)
|
||||
TYPE_RE = re.compile(r"^type:\s*(\S+)", re.MULTILINE)
|
||||
UPDATED_RE = re.compile(r"^updated:\s*([0-9]{4}-[0-9]{2}-[0-9]{2})", re.MULTILINE)
|
||||
CREATED_RE = re.compile(r"^created:\s*([0-9]{4}-[0-9]{2}-[0-9]{2})", re.MULTILINE)
|
||||
TITLE_RE = re.compile(r'^title:\s*"?([^"\n]+?)"?\s*$', re.MULTILINE)
|
||||
# Obsidian wikilinks: [[Target]] or [[Target|Alias]] or [[Target#Heading]]
|
||||
WIKILINK_RE = re.compile(r"\[\[([^\]|#]+)(?:#[^\]|]+)?(?:\|[^\]]+)?\]\]")
|
||||
|
||||
EXIT_OK = 0
|
||||
EXIT_USAGE = 2
|
||||
|
||||
|
||||
def log(msg: str) -> None:
    """Emit one diagnostic line on stderr (stdout is reserved for the report)."""
    print(msg, file=sys.stderr)
|
||||
|
||||
|
||||
def parse_frontmatter(text: str) -> tuple[dict, str]:
    """Split a page into (frontmatter fields, body).

    Only the `type`, `updated`, `created` and `title` fields are extracted,
    each via its module-level regex; surrounding whitespace and quote
    characters are stripped from the captured value. When the text does not
    start with a frontmatter block, returns ({}, text) unchanged.
    """
    match = FRONTMATTER_RE.match(text)
    if match is None:
        return {}, text
    header = match.group(1)
    patterns = {
        "type": TYPE_RE,
        "updated": UPDATED_RE,
        "created": CREATED_RE,
        "title": TITLE_RE,
    }
    fields: dict = {}
    for name, pattern in patterns.items():
        hit = pattern.search(header)
        if hit is not None:
            fields[name] = hit.group(1).strip().strip('"').strip("'")
    return fields, text[match.end():]
|
||||
|
||||
|
||||
def included(path: Path, fm: dict) -> bool:
    """Decide whether a wiki page takes part in scoring.

    A page is rejected when it is a symlink, when it cannot be resolved to a
    location inside the vault root, when its filename is in EXCLUDE_FILENAMES,
    when its vault-relative path starts with one of EXCLUDE_PATH_PREFIXES, or
    when its frontmatter `type` is in EXCLUDE_TYPES.
    """
    if path.is_symlink():
        return False
    try:
        # strict=True raises OSError for missing files; relative_to raises
        # ValueError when the resolved location is outside the vault.
        path.resolve(strict=True).relative_to(VAULT_ROOT.resolve())
    except (OSError, ValueError):
        return False
    rel = path.relative_to(VAULT_ROOT).as_posix()
    if path.name in EXCLUDE_FILENAMES:
        return False
    if rel.startswith(EXCLUDE_PATH_PREFIXES):
        return False
    return fm.get("type") not in EXCLUDE_TYPES
|
||||
|
||||
|
||||
def days_since(date_str: str | None) -> float:
|
||||
"""Return days since the given YYYY-MM-DD string, or a large sentinel if missing."""
|
||||
if not date_str:
|
||||
return 10_000.0
|
||||
try:
|
||||
d = date.fromisoformat(date_str)
|
||||
except ValueError:
|
||||
return 10_000.0
|
||||
delta = (date.today() - d).days
|
||||
return max(0.0, float(delta))
|
||||
|
||||
|
||||
def recency_weight(days: float,
                   halflife: float = RECENCY_HALFLIFE_DAYS) -> float:
    """Exponential recency weight in (0, 1] for a page that is `days` old.

    Computes exp(-days / halflife). NOTE(review): with this formula the
    weight at days == halflife is 1/e (about 0.368), not 0.5, so `halflife`
    acts as an e-folding time constant rather than a true half-life.
    """
    exponent = -days / halflife
    return math.exp(exponent)
|
||||
|
||||
|
||||
_FENCE_RE = re.compile(r"^(\s*)(`{3,}|~{3,})")
|
||||
|
||||
|
||||
def extract_wikilinks(body: str) -> set[str]:
    """Extract unique link targets (without alias or heading suffix) from the body.

    Skips wikilinks inside fenced code blocks so documentation examples
    (including in this repo's own skill files) do not pollute the graph.

    Fence handling: backtick AND tilde fences, with length tracking per
    CommonMark: the opening run sets (char, min_len); the closing line
    must use the SAME char with a run of SAME-OR-LONGER length and carry
    nothing but whitespace after the run (a closing fence has no info
    string, so a line like "```python" inside an open block is content,
    not a closer). Indented code blocks (4+ spaces) are intentionally NOT
    filtered — indented bullets in Obsidian often contain wikilinks.

    Returns the set of filename stems referenced; folder-qualified targets
    are reduced to their last path component.
    """
    cleaned: list[str] = []
    fence_char: str | None = None
    fence_len: int = 0
    for line in body.splitlines():
        m = _FENCE_RE.match(line)
        if m:
            run = m.group(2)
            if fence_char is None:
                # Opening fence: remember its character and run length.
                fence_char = run[0]
                fence_len = len(run)
                continue
            trailing = line[m.end():].strip()
            if run[0] == fence_char and len(run) >= fence_len and not trailing:
                fence_char = None
                fence_len = 0
                continue
        if fence_char is not None:
            # Inside a fenced block: drop the line, including fence-like
            # lines that do not qualify as a closer.
            continue
        cleaned.append(line)
    scan = "\n".join(cleaned)
    results: set[str] = set()
    for m in WIKILINK_RE.finditer(scan):
        raw = m.group(1).strip()
        # Folder-qualified links like [[notes/Foo]] resolve to Foo.md by stem.
        # This matches Obsidian default behavior for unique filenames.
        stem = raw.rsplit("/", 1)[-1]
        if stem:
            results.add(stem)
    return results
|
||||
|
||||
|
||||
def collect_pages() -> dict[str, dict]:
    """Walk wiki/ and build {stem: {path, title, body, fm}} for scoreable pages.

    The key is the filename stem, which is what Obsidian wikilinks resolve to
    by default. Filenames are assumed unique across the vault (enforced by the
    wiki-lint naming convention); a later duplicate stem replaces an earlier
    one. Unreadable or non-UTF-8 files, files larger than MAX_BODY_BYTES, and
    pages rejected by included() are skipped. Returns an empty dict when
    wiki/ does not exist.
    """
    pages: dict[str, dict] = {}
    if not WIKI_DIR.is_dir():
        return pages
    for md_path in sorted(WIKI_DIR.rglob("*.md")):
        try:
            raw = md_path.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError):
            continue
        too_large = len(raw.encode("utf-8")) > MAX_BODY_BYTES
        if too_large:
            continue
        meta, body = parse_frontmatter(raw)
        if not included(md_path, meta):
            continue
        stem = md_path.stem  # Obsidian wikilinks are filename-based
        pages[stem] = {
            "path": md_path.relative_to(VAULT_ROOT).as_posix(),
            "title": meta.get("title", stem),
            "body": body,
            "fm": meta,
        }
    return pages
|
||||
|
||||
|
||||
def build_graph(pages: dict[str, dict]) -> tuple[dict[str, set[str]], dict[str, set[str]]]:
    """Build the wikilink graph as (out_edges, in_edges), both keyed by title_key.

    Every page gets an entry in both maps, possibly empty. An edge is recorded
    only when its target is itself a scoreable page; links from a page to
    itself are dropped.
    """
    outgoing: dict[str, set[str]] = {key: set() for key in pages}
    incoming: dict[str, set[str]] = {key: set() for key in pages}
    for source, page in pages.items():
        for dest in extract_wikilinks(page["body"]):
            if dest != source and dest in pages:
                outgoing[source].add(dest)
                incoming[dest].add(source)
    return outgoing, incoming
|
||||
|
||||
|
||||
def score_page(title_key: str,
               pages: dict[str, dict],
               out_edges: dict[str, set[str]],
               in_edges: dict[str, set[str]]) -> dict:
    """Compute the boundary score record for one page.

    score = (out_degree - in_degree) * recency_weight(age), where age is
    taken from the `updated` frontmatter date, falling back to `created`.
    The returned dict carries the raw inputs alongside the rounded weight
    and score.
    """
    page = pages[title_key]
    meta = page["fm"]
    n_out = len(out_edges.get(title_key, set()))
    n_in = len(in_edges.get(title_key, set()))
    age = days_since(meta.get("updated") or meta.get("created"))
    weight = recency_weight(age)
    raw_score = (n_out - n_in) * weight
    return {
        "title": page["title"],
        "title_key": title_key,
        "path": page["path"],
        "out_degree": n_out,
        "in_degree": n_in,
        "age_days": age,
        "recency_weight": round(weight, 4),
        "score": round(raw_score, 4),
    }
|
||||
|
||||
|
||||
def run(top: int, want_json: bool, include_zero: bool, page_filter: str | None) -> int:
    """Score every scoreable page and print the report to stdout.

    Args:
        top: maximum number of rows in ranking mode.
        want_json: emit a JSON document instead of a Markdown table.
        include_zero: in ranking mode, keep rows whose score is <= 0.
        page_filter: when set, report only the page(s) whose stem or
            vault-relative path matches; ranking, filtering and `top` are
            bypassed.

    Returns:
        EXIT_OK, or EXIT_USAGE when `page_filter` matches no scoreable page.
    """
    pages = collect_pages()
    out_edges, in_edges = build_graph(pages)
    rows = [score_page(key, pages, out_edges, in_edges) for key in pages]

    if page_filter:
        wanted_stem = Path(page_filter).stem
        rows = [r for r in rows
                if r["title_key"] == wanted_stem or r["path"] == page_filter]
        if not rows:
            log(f"ERR: no scoreable page matches '{page_filter}'")
            return EXIT_USAGE
    else:
        if not include_zero:
            rows = [r for r in rows if r["score"] > 0.0]
        # Highest score first; ties broken alphabetically by stem.
        rows = sorted(rows, key=lambda r: (-r["score"], r["title_key"]))[:top]

    if want_json:
        stamp = datetime.now(timezone.utc).isoformat(timespec="seconds").replace("+00:00", "Z")
        document = {
            "generated": stamp,
            "halflife_days": RECENCY_HALFLIFE_DAYS,
            "page_count_scoreable": len(pages),
            "results": rows,
        }
        print(json.dumps(document, indent=2))
        return EXIT_OK

    print("# Boundary Score Report")
    print(f"scoreable pages: {len(pages)}; halflife: {RECENCY_HALFLIFE_DAYS} days")
    if not rows:
        print("\nNo positive-score frontier pages found.")
        return EXIT_OK
    print("")
    print("| # | score | out | in | age_d | title | path |")
    print("|---|---|---|---|---|---|---|")
    for rank, row in enumerate(rows, 1):
        print(f"| {rank} | {row['score']:.3f} | {row['out_degree']} | {row['in_degree']} | "
              f"{int(row['age_days'])} | {row['title']} | {row['path']} |")
    return EXIT_OK
|
||||
|
||||
|
||||
def main(argv: list[str]) -> int:
    """Parse command-line arguments and delegate to run().

    Returns EXIT_USAGE when --top is below 1; otherwise whatever run() returns.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--top", type=int, default=DEFAULT_TOP)
    parser.add_argument("--json", action="store_true")
    parser.add_argument(
        "--include-score-zero",
        action="store_true",
        help="Include pages whose score is zero or negative in the output",
    )
    parser.add_argument("--page", default=None,
                        help="Score a single page by path or stem")
    opts = parser.parse_args(argv)
    if opts.top < 1:
        log("ERR: --top must be >= 1")
        return EXIT_USAGE
    return run(opts.top, opts.json, opts.include_score_zero, opts.page)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main(sys.argv[1:]))
|
||||
@@ -0,0 +1,505 @@
|
||||
#!/usr/bin/env python3
|
||||
"""contextual-prefix.py — chunk wiki pages and generate per-chunk contextual prefixes.
|
||||
|
||||
Implements the ingest-side of Anthropic's Sept 2024 Contextual Retrieval pattern
|
||||
(https://www.anthropic.com/news/contextual-retrieval). For each chunk of a wiki
|
||||
page, generates a 1-2 sentence prefix situating the chunk in its source. The
|
||||
prefixed text is what gets BM25-indexed and embedded, materially improving
|
||||
retrieval accuracy (Anthropic measured 35-49% failure reduction).
|
||||
|
||||
Three-tier prefix generation (chosen per-run automatically):
|
||||
1. If ANTHROPIC_API_KEY is set → direct Anthropic API call (Haiku 4.5)
|
||||
with prompt caching on the page body
|
||||
(only when the body clears the ~16 KB
|
||||
Haiku 4.5 cache floor; see
|
||||
cache_control_for()).
|
||||
~$12 / 1000 docs per Anthropic figures.
|
||||
REQUIRES --allow-egress (sends bodies off-machine).
|
||||
2. Elif `claude` binary on PATH → `claude -p` subprocess (uses CC subscription;
|
||||
no API key needed; slower per call).
|
||||
REQUIRES --allow-egress (subprocess egresses).
|
||||
3. Else (default) → synthetic prefix from page frontmatter +
|
||||
first paragraph (zero-cost floor; loses
|
||||
most of the contextual benefit but BM25
|
||||
and vector channels still work).
|
||||
|
||||
Data-egress posture (v1.7.1+):
|
||||
Tiers 1 and 2 send wiki page bodies off-machine. Both are GATED behind
|
||||
--allow-egress (default off). Without the flag, pick_prefix_tier() always
|
||||
returns "synthetic" regardless of env vars or claude binary presence.
|
||||
Mirror of scripts/tiling-check.py:351 --allow-remote-ollama precedent.
|
||||
|
||||
Chunk schema written to .vault-meta/chunks/<page-address>/chunk-NNN.json:
|
||||
{
|
||||
"schema_version": 1,
|
||||
"page_path": "wiki/concepts/Foo.md",
|
||||
"page_address": "c-000042",
|
||||
"chunk_index": 3,
|
||||
"raw_text": "...",
|
||||
"contextualized_text": "<prefix>, a blank line, then <raw_text>",
"prefix": "<prefix>",
|
||||
"prefix_source": "anthropic-api" | "claude-cli" | "synthetic" | "skipped",
|
||||
"char_count": 487,
|
||||
"body_hash": "sha256:...", # of raw_text
|
||||
"page_body_hash": "sha256:...", # of the WHOLE source page (for invalidation)
|
||||
"created_at": "2026-05-17T..."
|
||||
}
|
||||
|
||||
Pages without an `address:` frontmatter field are still chunked (using a
|
||||
synthetic address derived from the path slug) so this tool works on v1.6 vaults
|
||||
without DragonScale Mechanism 2 enabled.
|
||||
|
||||
Usage:
|
||||
contextual-prefix.py PATH # process a single page
|
||||
contextual-prefix.py --all # process every wiki/*.md
|
||||
contextual-prefix.py PATH --no-llm # force synthetic-prefix tier 3
|
||||
contextual-prefix.py PATH --rebuild # ignore existing chunks
|
||||
contextual-prefix.py PATH --peek # print what would happen; write nothing
|
||||
|
||||
Exit codes:
|
||||
0 — success
|
||||
2 — usage error
|
||||
3 — page file missing or unreadable
|
||||
4 — chunk dir creation failed
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
import urllib.error
|
||||
import urllib.request
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
VAULT_ROOT = Path(__file__).resolve().parent.parent
|
||||
WIKI_DIR = VAULT_ROOT / "wiki"
|
||||
META_DIR = VAULT_ROOT / ".vault-meta"
|
||||
CHUNKS_DIR = META_DIR / "chunks"
|
||||
|
||||
CHUNK_TARGET_TOKENS = 500 # rough; we approximate via chars/4
|
||||
CHUNK_TARGET_CHARS = CHUNK_TARGET_TOKENS * 4
|
||||
CHUNK_OVERLAP_CHARS = 200
|
||||
|
||||
ANTHROPIC_MODEL = "claude-haiku-4-5-20251001"
|
||||
ANTHROPIC_API_URL = "https://api.anthropic.com/v1/messages"
|
||||
ANTHROPIC_TIMEOUT_SEC = 30
|
||||
CLAUDE_CLI_TIMEOUT_SEC = 60
|
||||
|
||||
# Anthropic prompt caching ignores any cached prefix below the model's minimum
|
||||
# cacheable size — 4,096 tokens for Haiku 4.5 (verified against the prompt-caching
|
||||
# docs, 2026-05). At ~4 chars/token that is ~16 KB. We attach cache_control only
|
||||
# when the body clears this floor so the marker reflects reality: below the floor
|
||||
# the API treats it as a silent no-op. The per-call cache telemetry in
|
||||
# anthropic_api_prefix() is what actually measures hit rate. The check counts the
|
||||
# body only — a deliberately conservative ~370-char underestimate that ignores the
|
||||
# system_msg + <page> wrapper also inside the cached prefix — so near the boundary
|
||||
# it errs toward not-marking, never toward a wrongly-attached marker.
|
||||
HAIKU_CACHE_MIN_CHARS = 16384 # 4096 tokens * 4 chars/token
|
||||
|
||||
EXIT_OK = 0
|
||||
EXIT_USAGE = 2
|
||||
EXIT_PAGE_MISSING = 3
|
||||
EXIT_CHUNK_DIR = 4
|
||||
|
||||
FRONTMATTER_RE = re.compile(r"^---\n(.*?)\n---\n", re.DOTALL)
# `[ \t]*` (not `\s*`) after the colon: `\s*` also matches a newline, so an
# empty "title:" / "address:" line would capture the following line instead.
ADDRESS_RE = re.compile(r"^address:[ \t]*(c-\d{6})\s*$", re.MULTILINE)
TITLE_RE = re.compile(r"^title:[ \t]*['\"]?(.+?)['\"]?\s*$", re.MULTILINE)
|
||||
|
||||
|
||||
def log(msg):
    """Print a progress/diagnostic message to stderr."""
    print(msg, file=sys.stderr)
|
||||
|
||||
|
||||
def sha256(text):
    """Return "sha256:<hex digest>" of `text` encoded as UTF-8."""
    hasher = hashlib.sha256()
    hasher.update(text.encode("utf-8"))
    return "sha256:" + hasher.hexdigest()
|
||||
|
||||
|
||||
def read_page(path):
    """Read a page as UTF-8 text, replacing undecodable bytes.

    Raises SystemExit(EXIT_PAGE_MISSING) when `path` is not a regular file.
    """
    if path.is_file():
        return path.read_text(encoding="utf-8", errors="replace")
    raise SystemExit(EXIT_PAGE_MISSING)
|
||||
|
||||
|
||||
def parse_frontmatter(body):
    """Split page text into (frontmatter info, remaining content).

    The info dict has keys `address` and `title` (None when the field is
    absent) and `raw` (the frontmatter text between the `---` markers).
    When the text has no leading frontmatter block, returns ({}, body).
    """
    match = FRONTMATTER_RE.match(body)
    if match is None:
        return {}, body
    header = match.group(1)
    address_hit = ADDRESS_RE.search(header)
    title_hit = TITLE_RE.search(header)
    info = {
        "address": address_hit.group(1) if address_hit else None,
        "title": title_hit.group(1) if title_hit else None,
        "raw": header,
    }
    return info, body[match.end():]
|
||||
|
||||
|
||||
def derive_synthetic_address(page_path):
    """Stable per-path address-shaped string when no real address is set.

    Format: "syn-" followed by the first 6 hex digits of the SHA-1 of the
    vault-relative path (deterministic). The "syn-" prefix keeps these
    distinct from allocator addresses (c-NNNNNN); used only for chunk filing.

    The path is hashed in POSIX form (forward slashes) so the same page maps
    to the same address regardless of the operating system's path separator.

    Raises ValueError (from relative_to) when `page_path` is not under
    VAULT_ROOT.
    """
    rel = page_path.relative_to(VAULT_ROOT)
    h = hashlib.sha1(rel.as_posix().encode("utf-8")).hexdigest()
    return "syn-" + h[:6]
|
||||
|
||||
|
||||
def chunk_body(body, target_chars=CHUNK_TARGET_CHARS, overlap=CHUNK_OVERLAP_CHARS):
    """Split body into overlapping chunks on paragraph boundaries when possible.

    Heuristic: walk the body, accumulate paragraphs until len exceeds target,
    flush, then keep the trailing `overlap` chars as the seed of the next chunk.
    Empty paragraphs collapse to single boundaries.

    The overlap seed alone never becomes a chunk: a final chunk is emitted
    only when at least one new paragraph was added after the last flush.
    Otherwise the last chunk would be a pure duplicate of the previous
    chunk's tail.

    Args:
        body: page text with frontmatter already removed.
        target_chars: flush threshold in characters.
        overlap: number of trailing characters carried into the next chunk;
            values <= 0 disable overlap.

    Returns:
        List of chunk strings; empty when `body` is blank.
    """
    paragraphs = [p.strip() for p in re.split(r"\n\s*\n", body) if p.strip()]
    chunks = []
    cur = []
    cur_len = 0
    has_new = False  # True once `cur` holds a paragraph beyond the overlap seed
    for p in paragraphs:
        cur.append(p)
        cur_len += len(p) + 2  # +2 accounts for the "\n\n" joiner
        has_new = True
        if cur_len >= target_chars:
            chunk_text = "\n\n".join(cur)
            chunks.append(chunk_text)
            # seed next chunk with the tail
            tail = chunk_text[-overlap:] if overlap > 0 else ""
            cur = [tail] if tail else []
            cur_len = len(tail)
            has_new = False
    if has_new and "".join(cur).strip():
        chunks.append("\n\n".join(cur))
    if not chunks and body.strip():
        # tiny page — single chunk
        chunks = [body.strip()]
    return chunks
|
||||
|
||||
|
||||
def synthetic_prefix(fm, body, chunk_text):
    """Tier-3 prefix built from the page title and the page's opening sentence.

    No network and no randomness: the result depends only on `fm["title"]`
    and `body`. The opening sentence is taken from the page body (not from
    `chunk_text`, which is unused) and capped at 300 characters, so every
    chunk of a page shares the same page-level frame.
    """
    page_title = (fm.get("title") or "").strip()
    if not page_title:
        page_title = "(untitled)"
    # Split once at the first whitespace that follows ., ! or ?
    pieces = re.split(r"(?<=[.!?])\s+", body.strip(), maxsplit=1)
    opening = pieces[0][:300] if pieces else ""
    return "This passage is from the wiki page \"" + page_title + "\". The page opens: " + opening
|
||||
|
||||
|
||||
def cache_control_for(page_body):
    """Return {"type": "ephemeral"} when `page_body` has at least
    HAIKU_CACHE_MIN_CHARS characters, else None.

    Kept free of I/O so the threshold decision can be tested without any
    network access.
    """
    clears_floor = len(page_body) >= HAIKU_CACHE_MIN_CHARS
    return {"type": "ephemeral"} if clears_floor else None
|
||||
|
||||
|
||||
def anthropic_api_prefix(api_key, page_title, page_body, chunk_text):
    """Tier-1 prefix: direct Anthropic API call, Haiku, prompt-cached page body.

    The page body is the stable prefix shared by every chunk of a page, so it
    goes in `system` behind a cache breakpoint and the variable chunk goes in
    `messages`. Cache reads only land because chunks are processed sequentially
    (chunk 0 warms the prefix) — see the loop note in process_page().

    Args:
        api_key: value sent in the `x-api-key` header.
        page_title: title embedded in the <page> wrapper.
        page_body: full page content placed in the system block.
        chunk_text: the chunk to contextualize.

    Returns:
        The first line of the first non-empty text block of the response, or
        None on any transport, decoding or response-shape failure (the caller
        then falls back to another tier).
    """
    system_msg = (
        "You are a retrieval-augmentation assistant. Given a wiki page and one "
        "chunk extracted from it, write a single short sentence (under 35 words) "
        "that situates the chunk within the page's scope and topic. Output only "
        "the sentence — no prefix, no quotation marks, no commentary."
    )
    page_block = {
        "type": "text",
        "text": f"<page title=\"{page_title}\">\n{page_body}\n</page>",
    }
    cc = cache_control_for(page_body)
    if cc:
        page_block["cache_control"] = cc
    payload = {
        "model": ANTHROPIC_MODEL,
        "max_tokens": 100,
        "system": [
            {"type": "text", "text": system_msg},
            page_block,
        ],
        "messages": [
            {
                "role": "user",
                "content": (
                    "Write the single contextualizing sentence for this chunk:\n\n"
                    f"<chunk>\n{chunk_text}\n</chunk>"
                ),
            }
        ],
    }
    body = json.dumps(payload).encode("utf-8")
    req = urllib.request.Request(
        ANTHROPIC_API_URL,
        data=body,
        headers={
            "Content-Type": "application/json",
            "x-api-key": api_key,
            "anthropic-version": "2023-06-01",
        },
        method="POST",
    )
    try:
        with urllib.request.urlopen(req, timeout=ANTHROPIC_TIMEOUT_SEC) as resp:
            data = json.loads(resp.read().decode("utf-8"))
        # Cache telemetry: integer token counts only, never page content, so
        # the data-egress posture holds. Confirms whether the body cache is
        # actually firing given the Haiku floor (wrote>0 on chunk 0, read>0
        # on later chunks of the same page).
        usage = data.get("usage") or {}
        log(f"    cache: wrote={usage.get('cache_creation_input_tokens', 0)} "
            f"read={usage.get('cache_read_input_tokens', 0)} tok")
        for block in data.get("content", []):
            if block.get("type") == "text":
                lines = block["text"].strip().splitlines()
                # An empty/whitespace-only text block has no first line;
                # keep looking instead of indexing into an empty list.
                if lines:
                    return lines[0]
    except (OSError, ValueError, KeyError, AttributeError, TypeError) as e:
        # OSError covers urllib.error.URLError/HTTPError and raw socket
        # timeouts during read; ValueError covers json.JSONDecodeError and
        # UnicodeDecodeError; the rest cover unexpected response shapes.
        log(f"    anthropic-api call failed: {e}")
        return None
    return None
|
||||
|
||||
|
||||
def claude_cli_prefix(page_title, page_body, chunk_text):
    """Tier-2 prefix: `claude -p` subprocess (uses CC subscription, no API key).

    Only the first 4000 characters of the page body are included in the
    prompt. The prompt is passed as a single argv element.

    Returns:
        The first line of the subprocess's stdout on a zero exit status with
        non-empty output; None on timeout, on a non-zero exit status, or when
        the process cannot be launched at all (binary missing or not
        executable, argument too long, prompt containing a NUL byte), so the
        caller can fall back to the synthetic tier.
    """
    prompt = (
        f"Wiki page \"{page_title}\":\n\n"
        f"---\n{page_body[:4000]}\n---\n\n"
        f"Chunk:\n<chunk>\n{chunk_text}\n</chunk>\n\n"
        "Write one short sentence (under 35 words) situating this chunk within "
        "the page's scope. Output only the sentence."
    )
    try:
        result = subprocess.run(
            ["claude", "-p", prompt],
            capture_output=True,
            text=True,
            timeout=CLAUDE_CLI_TIMEOUT_SEC,
            check=False,
        )
        if result.returncode == 0 and result.stdout.strip():
            return result.stdout.strip().splitlines()[0]
        log(f"    claude-cli rc={result.returncode}: {result.stderr.strip()[:200]}")
    except (subprocess.TimeoutExpired, OSError, ValueError) as e:
        # OSError includes FileNotFoundError, PermissionError and E2BIG;
        # ValueError is raised for an embedded NUL byte in an argument.
        log(f"    claude-cli call failed: {e}")
    return None
|
||||
|
||||
|
||||
def pick_prefix_tier(force_synthetic, allow_egress=False):
    """Select which prefix-generation tier a run should use.

    Data-egress guard: unless `allow_egress` is true (and `force_synthetic`
    is false) the answer is always "synthetic", whatever the environment
    looks like, because the other two tiers send page bodies off-machine.

    With egress permitted, the order of preference is:
      1. "anthropic-api" when ANTHROPIC_API_KEY is set to a non-empty value,
      2. "claude-cli" when a `claude` executable is found on PATH,
      3. "synthetic" otherwise.
    """
    egress_permitted = allow_egress and not force_synthetic
    if egress_permitted:
        if os.environ.get("ANTHROPIC_API_KEY"):
            return "anthropic-api"
        if shutil.which("claude"):
            return "claude-cli"
    return "synthetic"
|
||||
|
||||
|
||||
def generate_prefix(tier, fm, body, chunk_text):
    """Produce (prefix, prefix_source) for one chunk, with one-way fallback.

    Fallback is deliberately asymmetric:
      - tier="anthropic-api": on failure, try the claude CLI (only if a
        `claude` binary is on PATH) before settling for synthetic.
      - tier="claude-cli": on failure, go straight to synthetic. The API is
        never tried from here, so no API spend happens without the user
        having selected that tier.
      - any other tier: synthetic.

    `prefix_source` names the tier that actually produced the text.
    """
    title = fm.get("title") or "(untitled)"

    if tier == "anthropic-api":
        text = anthropic_api_prefix(
            os.environ["ANTHROPIC_API_KEY"], title, body, chunk_text
        )
        if text:
            return text, "anthropic-api"

    # Reached directly for tier "claude-cli", or as the fallback of a failed
    # API call when the CLI is installed.
    if tier == "claude-cli" or (tier == "anthropic-api" and shutil.which("claude")):
        text = claude_cli_prefix(title, body, chunk_text)
        if text:
            return text, "claude-cli"

    return synthetic_prefix(fm, body, chunk_text), "synthetic"
|
||||
|
||||
|
||||
def process_page(page_path, force_synthetic=False, rebuild=False, peek=False,
                 allow_egress=False, progress_label=""):
    """Chunk one page and write a JSON record per chunk under CHUNKS_DIR/<address>/.

    Args:
        page_path: path of the page; must be located under VAULT_ROOT.
        force_synthetic: force the synthetic prefix tier.
        rebuild: rewrite chunks even when the stored hashes match.
        peek: log what would be written; create and write nothing.
        allow_egress: permit the tiers that send page content off-machine.
        progress_label: optional text (e.g. "[3/47]") prepended to log lines.

    Returns:
        dict with keys "address", "written" (list of chunk filenames written),
        "skipped" (count of unchanged chunks) and "tier".

    Raises:
        SystemExit(EXIT_PAGE_MISSING) via read_page() when the page is not a
        file; SystemExit(EXIT_CHUNK_DIR) when the chunk directory cannot be
        created.
    """
    body = read_page(page_path)
    fm, content = parse_frontmatter(body)
    address = fm.get("address") or derive_synthetic_address(page_path)
    page_body_hash = sha256(body)

    chunk_dir = CHUNKS_DIR / address
    if not peek:
        try:
            chunk_dir.mkdir(parents=True, exist_ok=True)
        except OSError as e:
            log(f"ERR: cannot create chunk dir {chunk_dir}: {e}")
            raise SystemExit(EXIT_CHUNK_DIR)

    chunks = chunk_body(content)
    tier = pick_prefix_tier(force_synthetic, allow_egress=allow_egress)

    progress = (progress_label + " ") if progress_label else ""
    if not chunks:
        # v1.7.2 / closes audit M6: previously this logged "chunks=0" with no
        # explanation and silently produced no index entries. Now: explicit WARN
        # so the user notices empty-body pages (often frontmatter-only stubs).
        log(f"{progress}WARN: {page_path.relative_to(VAULT_ROOT)} has no chunkable body content "
            f"(empty after frontmatter strip). Skipping; no chunks written.")
        return {"address": address, "written": [], "skipped": 0, "tier": tier}

    log(f"{progress}-> {page_path.relative_to(VAULT_ROOT)}  address={address}  chunks={len(chunks)}  tier={tier}")

    written = []
    skipped = 0
    # Keep this loop sequential. The tier-1 Anthropic path caches the page body;
    # a cache entry is only readable after the first response begins (Anthropic
    # prompt-caching concurrency rule), so chunk 0 warms the prefix and chunks
    # 1..N read it. Parallelizing here would silently zero every cache read.
    for idx, raw in enumerate(chunks):
        chunk_path = chunk_dir / f"chunk-{idx:03d}.json"
        body_hash = sha256(raw)

        if chunk_path.exists() and not rebuild:
            try:
                existing = json.loads(chunk_path.read_text(encoding="utf-8"))
                # A file holding valid JSON that is not an object (list,
                # string, null) is as corrupt as unparseable JSON: fall
                # through and overwrite it rather than crash on .get().
                if isinstance(existing, dict) and \
                        existing.get("body_hash") == body_hash and \
                        existing.get("page_body_hash") == page_body_hash:
                    skipped += 1
                    continue
            except (ValueError, OSError):
                # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
                pass  # corrupted; overwrite

        if peek:
            log(f"    would write {chunk_path.name} ({len(raw)} chars)")
            continue

        prefix, prefix_source = generate_prefix(tier, fm, content, raw)
        contextualized = f"{prefix}\n\n{raw}" if prefix else raw

        record = {
            "schema_version": 1,
            "page_path": str(page_path.relative_to(VAULT_ROOT)),
            "page_address": address,
            "chunk_index": idx,
            "raw_text": raw,
            "contextualized_text": contextualized,
            "prefix": prefix or "",
            "prefix_source": prefix_source,
            "char_count": len(raw),
            "body_hash": body_hash,
            "page_body_hash": page_body_hash,
            "created_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
        }
        # Write to a per-process temp file, then rename into place so a reader
        # never sees a half-written chunk.
        tmp = chunk_path.with_suffix(f".{os.getpid()}.tmp")
        try:
            tmp.write_text(json.dumps(record, ensure_ascii=False, indent=2), encoding="utf-8")
            os.replace(tmp, chunk_path)
        finally:
            if tmp.exists():
                tmp.unlink(missing_ok=True)
        written.append(chunk_path.name)

    log(f"    wrote={len(written)}  skipped(unchanged)={skipped}")
    return {"address": address, "written": written, "skipped": skipped, "tier": tier}
|
||||
|
||||
|
||||
def collect_pages(target):
    """Resolve the CLI target into a list of page paths.

    For "--all" or None: every *.md under WIKI_DIR, sorted, excluding files
    that sit in (or are themselves) a dot-prefixed entry BELOW the wiki
    directory. Only the wiki-relative parts are inspected, so a vault that
    itself lives under a dot-directory (e.g. ~/.vaults/notes) is not
    filtered out wholesale.

    For anything else: a single-element list holding the target path, taken
    relative to VAULT_ROOT when it is not absolute. Existence is not checked
    here.
    """
    if target == "--all" or target is None:
        return sorted(
            p for p in WIKI_DIR.rglob("*.md")
            if not any(part.startswith(".")
                       for part in p.relative_to(WIKI_DIR).parts)
        )
    p = Path(target)
    if not p.is_absolute():
        p = VAULT_ROOT / p
    return [p]
|
||||
|
||||
|
||||
def main():
    """CLI entry point: parse arguments, process the selected pages, log totals.

    Returns:
        EXIT_OK on success; EXIT_USAGE when an explicit path resolves outside
        the vault; EXIT_PAGE_MISSING when an explicit path is not a file.
        process_page() may additionally terminate the run via SystemExit.
    """
    parser = argparse.ArgumentParser(description="Chunk + contextualize wiki pages.")
    parser.add_argument("path", nargs="?",
                        help="Page path relative to vault root. Omit (or pass --all) "
                             "to process every wiki page.")
    parser.add_argument("--all", action="store_true",
                        help="Process every wiki page (equivalent to omitting path).")
    parser.add_argument("--no-llm", action="store_true",
                        help="Force tier-3 synthetic prefix (skip LLM calls).")
    parser.add_argument("--allow-egress", action="store_true",
                        help="Allow tier-1 (Anthropic API) or tier-2 (claude CLI "
                             "subprocess) prefix generation. Without this flag, page "
                             "bodies stay on-machine and only the tier-3 synthetic "
                             "prefix is used. Mirror of tiling-check.py's "
                             "--allow-remote-ollama guard.")
    parser.add_argument("--rebuild", action="store_true",
                        help="Re-process chunks even if body_hash matches.")
    parser.add_argument("--peek", action="store_true",
                        help="Print plan, write nothing.")
    args = parser.parse_args()

    # No explicit path means "all pages", with or without --all (matches the
    # help text). An explicit path takes precedence over --all.
    if not args.path:
        args.path = "--all"

    pages = collect_pages(args.path)
    # Explicit single-path invocations must point at a readable file inside the
    # vault. --all only ever yields in-vault files, so this guard is explicit-only.
    # Without it a typo'd path exited 0 silently, and an out-of-vault path raised
    # a raw ValueError from relative_to().
    if args.path != "--all":
        target = pages[0].resolve()
        if not target.is_relative_to(VAULT_ROOT):
            log(f"ERR: {args.path} resolves outside the vault ({VAULT_ROOT}).")
            return EXIT_USAGE
        if not target.is_file():
            log(f"ERR: {args.path} is not a readable file.")
            return EXIT_PAGE_MISSING
        # Process the RESOLVED path that was just validated. The unresolved
        # form (absolute path through a symlinked prefix, or one containing
        # "..") can still fail or mis-key in relative_to(VAULT_ROOT) later.
        pages = [target]
    # Filter to actual files up front so progress counter is meaningful
    # (v1.7.2; closes audit L2: tier-2 over 47 pages can take 5+ min — the
    # user needs a count, not just per-page log lines).
    files = [p for p in pages if p.is_file()]
    skipped_non_files = len(pages) - len(files)
    if skipped_non_files:
        log(f"({skipped_non_files} non-file paths skipped)")
    total = len(files)
    total_written = 0
    total_skipped = 0
    for i, page in enumerate(files, 1):
        result = process_page(
            page,
            force_synthetic=args.no_llm,
            rebuild=args.rebuild,
            peek=args.peek,
            allow_egress=args.allow_egress,
            progress_label=f"[{i}/{total}]",
        )
        total_written += len(result["written"])
        total_skipped += result["skipped"]

    log(f"\nDone. pages={total}  chunks_written={total_written}  chunks_unchanged={total_skipped}")
    return EXIT_OK
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
@@ -0,0 +1,229 @@
|
||||
#!/usr/bin/env bash
|
||||
# detect-transport.sh — discover which vault-mutation transports are available
|
||||
# on this machine, write a normalized JSON snapshot to .vault-meta/transport.json,
|
||||
# and pick a preferred transport per the v1.7 fallback chain.
|
||||
#
|
||||
# Fallback chain (highest to lowest precedence):
|
||||
# 1. cli — Obsidian CLI binary (Obsidian 1.12+). No MCP server, no TLS, no plugin.
|
||||
# 2. mcp-obsidian — REST-API-backed MCP server (Local REST API plugin required).
|
||||
# 3. mcpvault — Filesystem-backed MCP server (BM25 search; no Obsidian plugin).
|
||||
# 4. filesystem — Direct Read/Write/Edit tools. Always available (ultimate floor).
|
||||
#
|
||||
# MCP auto-detection is deferred to a v1.7.x patch (calling `claude mcp list` from
|
||||
# inside a running claude session has reentrancy concerns). For v1.7, we detect
|
||||
# CLI + filesystem and leave MCP fields as `{"present": null, "detection": "deferred"}`.
|
||||
# Users with MCP transports configured can either edit transport.json manually or
|
||||
# follow the legacy guidance in wiki/references/mcp-setup.md.
|
||||
#
|
||||
# Usage:
|
||||
# ./scripts/detect-transport.sh # detect and write .vault-meta/transport.json
|
||||
# ./scripts/detect-transport.sh --peek # print result to stdout without writing
|
||||
# ./scripts/detect-transport.sh --force # refresh even if existing snapshot is fresh (<7d)
|
||||
# ./scripts/detect-transport.sh --quiet # suppress informational stderr output
|
||||
#
|
||||
# Exit codes:
|
||||
# 0 — success (transport.json written or peeked)
|
||||
# 2 — vault-meta/ missing and cannot be created
|
||||
# 3 — unrecognized flag
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
VAULT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
|
||||
META_DIR="${VAULT_ROOT}/.vault-meta"
|
||||
OUTPUT_FILE="${META_DIR}/transport.json"
|
||||
STALE_AFTER_DAYS=7
|
||||
|
||||
# Run-mode flags. MODE is one of write|peek|force; QUIET silences log().
MODE="write"
QUIET=false

# Consume every argument as a flag; the last of --peek/--force wins.
# -h/--help prints the header comment of this script and exits 0.
# Any other argument is an error (exit 3).
while (( $# > 0 )); do
  case "$1" in
    --peek)
      MODE="peek"
      ;;
    --force)
      MODE="force"
      ;;
    --quiet)
      QUIET=true
      ;;
    -h | --help)
      sed -n '2,28p' "$0" | sed 's/^# \{0,1\}//'
      exit 0
      ;;
    *)
      echo "ERR: unknown flag: $1" >&2
      exit 3
      ;;
  esac
  shift
done
|
||||
|
||||
# log: print the arguments to stderr unless QUIET is "true".
log() {
  if ! $QUIET; then
    echo "$@" >&2
  fi
}
|
||||
|
||||
# json_escape: read stdin and emit a JSON-encoded string (including the
|
||||
# surrounding double quotes). Used for any untrusted value that lands in the
|
||||
# transport.json heredoc — newlines, backslashes, control chars in upstream
|
||||
# binaries (obsidian-cli --version) would otherwise break the JSON.
|
||||
json_escape() {
  # Reads all of stdin, trims surrounding whitespace, and prints the value
  # as one JSON string literal (quotes included, no trailing newline).
  local -r encoder='import json,sys; print(json.dumps(sys.stdin.read().strip()), end="")'
  python3 -c "${encoder}"
}
|
||||
|
||||
# Ensure the metadata directory exists; without it nothing can be written.
if ! mkdir -p "$META_DIR"; then
  echo "ERR: cannot create .vault-meta/ at $META_DIR" >&2
  exit 2
fi
|
||||
|
||||
# ── 0. Honor manual_override from existing transport.json ────────────────────
|
||||
# Users can pin a non-detected transport (mcp-obsidian, mcpvault, or any custom
|
||||
# value) by editing transport.json to set:
|
||||
# "manual_override": true
|
||||
# "preferred": "<their-choice>"
|
||||
# "fallback_chain": [...]
|
||||
# Auto-detection still runs (to refresh CLI/Obsidian-running flags for visibility),
|
||||
# but PREFERRED and CHAIN are preserved from the existing file across both the
|
||||
# normal write path AND --force runs. Documented at
|
||||
# wiki/references/transport-fallback.md §Manual override.
|
||||
MANUAL_OVERRIDE_FLAG=false
MANUAL_OVERRIDE_PREFERRED=""
MANUAL_OVERRIDE_CHAIN=""
if [ -f "$OUTPUT_FILE" ]; then
  # The helper prints nothing unless "manual_override" is literally true.
  # Any read or parse error is swallowed on purpose: an unreadable snapshot
  # simply means "no override". The snapshot is read as UTF-8 regardless of
  # locale, and each chain entry is JSON-encoded so quotes or backslashes in
  # a user-edited value cannot corrupt the JSON this value is later placed in.
  MANUAL_PARSE="$(python3 - "$OUTPUT_FILE" 2>/dev/null <<'PYEOF'
import json, sys
try:
    with open(sys.argv[1], encoding="utf-8") as fh:
        data = json.load(fh)
    if data.get("manual_override") is True:
        pref = data.get("preferred", "")
        chain = data.get("fallback_chain", [])
        # Output: line 1 = preferred; line 2 = comma-separated JSON-encoded chain entries.
        print(pref)
        print(",".join(json.dumps(str(c)) for c in chain))
except Exception:
    pass
PYEOF
  )" || MANUAL_PARSE=""
  if [ -n "${MANUAL_PARSE:-}" ]; then
    MANUAL_OVERRIDE_FLAG=true
    MANUAL_OVERRIDE_PREFERRED="$(printf '%s\n' "$MANUAL_PARSE" | sed -n '1p')"
    MANUAL_OVERRIDE_CHAIN="$(printf '%s\n' "$MANUAL_PARSE" | sed -n '2p')"
    log "manual_override=true; preserving preferred=${MANUAL_OVERRIDE_PREFERRED}"
  fi
fi
|
||||
|
||||
# ── Freshness check: skip detection if snapshot is recent ────────────────────
# Only the default "write" mode short-circuits. When the existing snapshot was
# modified less than STALE_AFTER_DAYS days ago it is printed as-is and the
# script exits 0 without re-detecting.
if [[ "$MODE" == "write" && -f "$OUTPUT_FILE" ]] \
  && find "$OUTPUT_FILE" -mtime "-${STALE_AFTER_DAYS}" -print 2>/dev/null | grep -q .; then
  log "transport.json is fresh (<${STALE_AFTER_DAYS}d). Use --force to refresh."
  cat "$OUTPUT_FILE"
  exit 0
fi
|
||||
|
||||
# ── 1. CLI detection ─────────────────────────────────────────────────────────
CLI_PRESENT=false
CLI_BINARY=""
CLI_VERSION=""
CLI_VERSION_RAW=""

# Pick the binary. `obsidian-cli` wins whenever it is on PATH. Otherwise
# `obsidian` is accepted only if `obsidian --version` succeeds: Obsidian 1.12+
# ships `obsidian` as the CLI binary on some platforms, and we treat it as
# cli-capable if it answers --version without launching the GUI.
if command -v obsidian-cli >/dev/null 2>&1; then
  CLI_BINARY="obsidian-cli"
elif command -v obsidian >/dev/null 2>&1 && obsidian --version >/dev/null 2>&1; then
  CLI_BINARY="obsidian"
fi

if [ -n "$CLI_BINARY" ]; then
  CLI_PRESENT=true
  # Two views of the version: RAW for the human log line, and a JSON-escaped
  # one for the transport.json heredoc. CLI_VERSION is pre-quoted (it includes
  # the surrounding double quotes), so the heredoc emits ${CLI_VERSION}
  # without wrapping quotes.
  CLI_VERSION_RAW="$("$CLI_BINARY" --version 2>/dev/null | head -1 || echo unknown)"
  CLI_VERSION="$(printf '%s' "$CLI_VERSION_RAW" | json_escape || echo '"unknown"')"
fi

# Fallback default when neither binary was found: must still be a valid JSON literal.
if [ -z "$CLI_VERSION" ]; then
  CLI_VERSION='""'
  CLI_VERSION_RAW=""
fi
|
||||
|
||||
# ── 2. Obsidian app running? (informational only; CLI works either way) ──────
OBSIDIAN_RUNNING=false
if command -v pgrep >/dev/null 2>&1; then
  # `pgrep -f` matches against full command lines, and this script's own
  # command line includes its path. When that path contains "obsidian" the
  # script would detect itself and always report true, so our own PID ($$) is
  # filtered out of the match list. grep writes to /dev/null (no -q) so pgrep
  # never sees a closed pipe.
  if pgrep -if 'obsidian' 2>/dev/null | grep -vx -- "$$" >/dev/null; then
    OBSIDIAN_RUNNING=true
  fi
fi
|
||||
|
||||
# ── 3. Compute preferred + fallback chain ────────────────────────────────────
# Start from the filesystem-only default and upgrade when a CLI was detected.
PREFERRED="filesystem"
CHAIN='"filesystem"'
if $CLI_PRESENT; then
  PREFERRED="cli"
  CHAIN='"cli", "filesystem"'
fi

# ── 3b. Apply manual_override if it was parsed from the existing snapshot ────
# The auto-detected PREFERRED/CHAIN above are replaced so that the user's
# pinned transport survives every refresh cycle, including --force.
if $MANUAL_OVERRIDE_FLAG; then
  PREFERRED="$MANUAL_OVERRIDE_PREFERRED"
  CHAIN="$MANUAL_OVERRIDE_CHAIN"
fi
|
||||
|
||||
# ── 4. Build JSON snapshot ───────────────────────────────────────────────────
TIMESTAMP="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
HOSTNAME="$(hostname 2>/dev/null || echo unknown)"

# _json_string VALUE
#   Prints VALUE as a JSON string literal, surrounding quotes included, without
#   altering its content. If python3 cannot be run, falls back to plain
#   double-quoting.
_json_string() {
  python3 -c 'import json, sys; sys.stdout.write(json.dumps(sys.argv[1]))' "$1" 2>/dev/null \
    || printf '"%s"' "$1"
}

# snapshot
#   Prints the transport.json document to stdout.
#   Reads: TIMESTAMP, HOSTNAME, VAULT_ROOT, MANUAL_OVERRIDE_FLAG, PREFERRED,
#          CHAIN, CLI_PRESENT, CLI_BINARY, CLI_VERSION, OBSIDIAN_RUNNING.
#   Free-form strings (host, vault path, preferred transport, binary name) are
#   JSON-encoded first, so a quote or backslash in any of them cannot produce an
#   invalid document. CHAIN and CLI_VERSION arrive pre-encoded and are emitted
#   verbatim; the boolean flags are emitted as bare true/false.
snapshot() {
  local host_json vault_json preferred_json binary_json
  host_json="$(_json_string "$HOSTNAME")"
  vault_json="$(_json_string "$VAULT_ROOT")"
  preferred_json="$(_json_string "$PREFERRED")"
  binary_json="$(_json_string "$CLI_BINARY")"

  cat <<JSON
{
  "schema_version": 1,
  "detected_at": "${TIMESTAMP}",
  "host": ${host_json},
  "vault_root": ${vault_json},
  "manual_override": ${MANUAL_OVERRIDE_FLAG},
  "preferred": ${preferred_json},
  "fallback_chain": [${CHAIN}],
  "available": {
    "cli": {
      "present": ${CLI_PRESENT},
      "binary": ${binary_json},
      "version_string": ${CLI_VERSION},
      "obsidian_app_running": ${OBSIDIAN_RUNNING}
    },
    "filesystem": {
      "present": true,
      "vault_root": ${vault_json},
      "note": "ultimate fallback; uses Claude's Read/Write/Edit tools directly"
    },
    "mcp_obsidian": {
      "present": null,
      "detection": "deferred",
      "note": "v1.7 does not auto-detect MCP servers. Configure manually per wiki/references/mcp-setup.md and edit this file by hand if needed."
    },
    "mcpvault": {
      "present": null,
      "detection": "deferred",
      "note": "v1.7 does not auto-detect MCP servers. Configure manually per wiki/references/mcp-setup.md and edit this file by hand if needed."
    }
  }
}
JSON
}
|
||||
|
||||
# --peek: print the freshly computed snapshot and stop without touching disk.
case "$MODE" in
  peek)
    snapshot
    exit 0
    ;;
esac

# Atomic write: stage into a per-PID temp file next to the target, then rename
# over it, so a kill mid-write never leaves a partial transport.json. The EXIT
# trap removes the staging file if we die before the rename and is cleared
# once the rename has succeeded.
TMP="${OUTPUT_FILE}.$$.tmp"
trap 'rm -f "$TMP"' EXIT
snapshot > "$TMP"
mv "$TMP" "$OUTPUT_FILE"
trap - EXIT

# Human-readable summary (stderr; suppressed by --quiet).
log "Wrote: ${OUTPUT_FILE}"
log "Preferred transport: ${PREFERRED}"
if $CLI_PRESENT; then
  log "  CLI: ${CLI_BINARY} (${CLI_VERSION_RAW})"
fi
log "  Filesystem: always available (Read/Write/Edit tools)"
log "  MCP: not auto-detected (see wiki/references/mcp-setup.md to configure)"
|
||||
@@ -0,0 +1,312 @@
|
||||
#!/usr/bin/env python3
|
||||
"""rerank.py — query-time reranker for chunk candidates.
|
||||
|
||||
Takes a query string + a list of candidate chunks (from BM25, vector, or any
|
||||
upstream stage) and reorders them using semantic similarity.
|
||||
|
||||
v1.7 strategy (in preference order, automatically chosen at runtime):
|
||||
1. If ollama is reachable AND nomic-embed-text is pulled
|
||||
→ embed the query, embed each candidate's contextualized_text,
|
||||
rank by cosine. Caches per-chunk embeddings in
|
||||
.vault-meta/embed-cache.json keyed by body_hash.
|
||||
2. Otherwise
|
||||
→ no-op rerank: return candidates in input order with a synthesized
|
||||
note. Caller (retrieve.py) still gets a useful result; downstream
|
||||
drill-into-page logic is unchanged.
|
||||
|
||||
Future v1.7.x upgrade paths:
|
||||
- Cross-encoder reranker (sentence-transformers BGE-base) if installed
|
||||
- Cohere Rerank API if COHERE_API_KEY set
|
||||
- Voyage Rerank API if VOYAGE_API_KEY set
|
||||
|
||||
Mirrors the localhost-only OLLAMA_URL guard from scripts/tiling-check.py:
|
||||
remote ollama endpoints require --allow-remote-ollama because page bodies
|
||||
are POSTed as embedding input.
|
||||
|
||||
Usage:
|
||||
rerank.py "query string" --candidates candidates.json [--top 5]
|
||||
rerank.py "query string" --candidates - --top 5 # stdin
|
||||
rerank.py --peek "query string" # show strategy chosen
|
||||
|
||||
Candidates JSON shape:
|
||||
[{"chunk_id": "c-000042:3", "path": ".vault-meta/chunks/.../chunk-003.json", "score": 7.1}, ...]
|
||||
|
||||
Output: ranked candidates with `rerank_score` added.
|
||||
|
||||
Exit codes:
|
||||
0 — success
|
||||
2 — usage error
|
||||
3 — candidate input malformed
|
||||
10 — reserved (EXIT_NO_OLLAMA): not returned today; an unreachable ollama yields a no-op rerank and exit 0
|
||||
11 — reserved (EXIT_NO_MODEL): not returned today; a missing model yields a no-op rerank and exit 0
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import fcntl
|
||||
import json
|
||||
import math
|
||||
import os
|
||||
import shutil
|
||||
import sys
|
||||
import time
|
||||
import urllib.error
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
VAULT_ROOT = Path(__file__).resolve().parent.parent
|
||||
META_DIR = VAULT_ROOT / ".vault-meta"
|
||||
EMBED_CACHE_PATH = META_DIR / "embed-cache.json"
|
||||
CACHE_LOCK = META_DIR / ".embed-cache.lock"
|
||||
|
||||
DEFAULT_OLLAMA_URL = "http://127.0.0.1:11434"
|
||||
DEFAULT_MODEL = "nomic-embed-text"
|
||||
OLLAMA_TIMEOUT_SEC = 3
|
||||
EMBED_TIMEOUT_SEC = 30
|
||||
MAX_RESPONSE_BYTES = 4 * 1024 * 1024
|
||||
|
||||
EXIT_OK = 0
|
||||
EXIT_USAGE = 2
|
||||
EXIT_CANDIDATES = 3
|
||||
EXIT_NO_OLLAMA = 10
|
||||
EXIT_NO_MODEL = 11
|
||||
|
||||
|
||||
def log(msg):
    """Write one diagnostic line to stderr so stdout stays reserved for JSON output."""
    sys.stderr.write(f"{msg}\n")
|
||||
|
||||
|
||||
def cosine(a, b):
    """Cosine similarity of two equal-length numeric sequences.

    Returns 0.0 instead of raising when either input is empty, when the
    lengths differ, or when either vector has zero norm.
    """
    if a and b and len(a) == len(b):
        dot = sum(x * y for x, y in zip(a, b))
        norm_a = math.sqrt(sum(x * x for x in a))
        norm_b = math.sqrt(sum(y * y for y in b))
        if norm_a != 0 and norm_b != 0:
            return dot / (norm_a * norm_b)
    return 0.0
|
||||
|
||||
|
||||
def ollama_url(allow_remote):
    """Return the ollama base URL (OLLAMA_URL or the default), trailing '/' removed.

    Unless ``allow_remote`` is true, the URL's host must be 127.0.0.1,
    localhost or ::1; otherwise guidance is logged and the process exits with
    EXIT_USAGE.
    """
    base = os.environ.get("OLLAMA_URL", DEFAULT_OLLAMA_URL).rstrip("/")
    if allow_remote:
        return base

    hostname = urllib.parse.urlparse(base).hostname or ""
    if hostname in ("127.0.0.1", "localhost", "::1"):
        return base

    for line in (
        f"ERR: OLLAMA_URL={base} points off-localhost (host={hostname!r}).",
        "     Either: (a) run ollama locally — `systemctl --user start ollama` or `ollama serve`",
        "     Or:     (b) pass --allow-remote-ollama through retrieve.py, which forwards it here.",
        "     Or:     (c) unset OLLAMA_URL to fall back to the local default (127.0.0.1:11434).",
    ):
        log(line)
    sys.exit(EXIT_USAGE)
|
||||
|
||||
|
||||
def ollama_alive(url):
    """Probe ``{url}/api/tags`` and report whether ollama answered.

    Returns ``(alive, models)``. ``models`` holds each listed model name with
    any ``:tag`` suffix removed. Connection failures, timeouts, undecodable
    bodies and responses whose JSON root is not an object all yield
    ``(False, [])``. A missing or non-list ``models`` field, or entries that
    are not objects with a string ``name``, are ignored rather than raising,
    so a surprising response degrades to the no-op rerank path instead of a
    traceback.
    """
    try:
        req = urllib.request.Request(f"{url}/api/tags", method="GET")
        with urllib.request.urlopen(req, timeout=OLLAMA_TIMEOUT_SEC) as resp:
            data = json.loads(resp.read(MAX_RESPONSE_BYTES))
    except (urllib.error.URLError, ValueError, OSError):
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        return False, []

    if not isinstance(data, dict):
        return False, []

    entries = data.get("models")
    if not isinstance(entries, list):
        entries = []

    models = []
    for entry in entries:
        name = entry.get("name") if isinstance(entry, dict) else None
        if isinstance(name, str):
            models.append(name.split(":")[0])
    return True, models
|
||||
|
||||
|
||||
def embed_one(url, model, text):
    """POST ``text`` to ``{url}/api/embeddings`` for ``model`` and return the vector.

    Returns the response's ``embedding`` value, or ``[]`` when that field is
    missing or empty. Network and JSON errors propagate to the caller.
    """
    body = {"model": model, "prompt": text}
    request = urllib.request.Request(
        f"{url}/api/embeddings",
        data=json.dumps(body).encode("utf-8"),
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(request, timeout=EMBED_TIMEOUT_SEC) as resp:
        raw = resp.read(MAX_RESPONSE_BYTES)
    vector = json.loads(raw).get("embedding")
    return vector if vector else []
|
||||
|
||||
|
||||
def load_cache():
    """Load the embedding cache from EMBED_CACHE_PATH.

    Always returns a dict. A missing file, an unreadable or undecodable file,
    invalid JSON, or JSON whose root is not an object all produce an empty
    cache, so callers can use ``.get`` / item assignment unconditionally.
    """
    if not EMBED_CACHE_PATH.is_file():
        return {}
    try:
        data = json.loads(EMBED_CACHE_PATH.read_text(encoding="utf-8"))
    except (ValueError, OSError):
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        return {}
    return data if isinstance(data, dict) else {}
|
||||
|
||||
|
||||
def save_cache(cache):
    """Persist the embed cache atomically.

    v1.7.2 / closes audit M7: previously used blocking fcntl.LOCK_EX with no
    timeout, which could hang indefinitely on a non-flock-capable filesystem
    (some NFS mounts, network shares, FUSE backends without lock support).
    Now uses LOCK_NB with up to 3 attempts, then falls back to writing
    without the lock (with a WARN) so the rerank pipeline never hangs the
    user's session. The temp + os.replace pattern provides write atomicity
    even without the lock; the lock only serializes concurrent writers.

    Both lock failures take that fallback: BlockingIOError (another process
    holds the lock; retried) and any other OSError from flock (locking not
    available on this filesystem; retrying cannot help, so we stop at once).
    The per-PID temp file is removed if the write or rename fails.
    """
    META_DIR.mkdir(parents=True, exist_ok=True)
    fd = os.open(str(CACHE_LOCK), os.O_CREAT | os.O_WRONLY, 0o644)
    tmp = EMBED_CACHE_PATH.with_suffix(f".{os.getpid()}.tmp")
    locked = False
    attempts = 3
    try:
        for attempt in range(attempts):
            try:
                fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
                locked = True
                break
            except BlockingIOError:
                # Lock is held elsewhere: wait briefly, but not after the last try.
                if attempt < attempts - 1:
                    time.sleep(0.1)
            except OSError:
                # Locking is not usable here at all; go straight to the fallback.
                break
        if not locked:
            msg = ("WARN: rerank embed-cache lock unavailable after 3 tries; "
                   "writing unlocked (atomic via temp+rename). Concurrent writers "
                   "may overwrite each other's last update.")
            log(msg)
            # v1.9.1 / closes audit Data M1: also route to .vault-meta/hook.log so
            # the user sees the event via wiki-lint (stderr alone is invisible to
            # most callers; this matches the hook's logging shape).
            try:
                META_DIR.mkdir(parents=True, exist_ok=True)
                hook_log = META_DIR / "hook.log"
                ts = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
                with hook_log.open("a", encoding="utf-8") as fh:
                    fh.write(f"{ts} rerank embed-cache lock unavailable; wrote unlocked\n")
            except OSError:
                pass  # never block on a logging failure
        tmp.write_text(json.dumps(cache, ensure_ascii=False), encoding="utf-8")
        os.replace(tmp, EMBED_CACHE_PATH)
    finally:
        if locked:
            try:
                fcntl.flock(fd, fcntl.LOCK_UN)
            except OSError:
                pass
        os.close(fd)
        # After a successful replace the temp name is already gone; this only
        # cleans up a staging file left behind by a failed write/rename.
        try:
            tmp.unlink()
        except OSError:
            pass
|
||||
|
||||
|
||||
def load_chunk(chunk_rel_path):
    """Read and parse the chunk JSON at ``VAULT_ROOT / chunk_rel_path``.

    Returns the parsed JSON value, or None when the path is not a regular
    file, cannot be read, or does not contain valid JSON.
    """
    target = VAULT_ROOT / chunk_rel_path
    if target.is_file():
        try:
            raw = target.read_text(encoding="utf-8")
            return json.loads(raw)
        except (json.JSONDecodeError, OSError):
            pass
    return None
|
||||
|
||||
|
||||
def _upstream_score(candidate):
    """Return the candidate's upstream relevance score as a float.

    Reads ``score`` (the shape documented for rerank.py input) and falls back
    to ``bm25_score`` (the key retrieve.py uses); 0.0 when neither is set.
    """
    raw = candidate.get("score")
    if raw is None:
        raw = candidate.get("bm25_score")
    return float(raw or 0.0)


def _mark_noop(candidates, source, top_k):
    """Tag every candidate with its upstream score and ``source``; keep input order."""
    for c in candidates:
        c["rerank_score"] = _upstream_score(c)
        c["rerank_source"] = source
    return candidates[:top_k]


def rerank(query, candidates, top_k=5, allow_remote=False):
    """Returns candidates list, possibly truncated to top_k, with rerank_score added.

    Each candidate dict is mutated in place and gains ``rerank_score`` and
    ``rerank_source``. When ollama is unreachable, the model is not pulled, or
    the query cannot be embedded, the input order is kept and the upstream
    score is copied (``rerank_source`` is ``noop-no-ollama``, ``noop-no-model``
    or ``noop-embed-error``). Otherwise candidates are sorted by cosine
    similarity between the query embedding and each chunk's embedding.

    Chunk embeddings are cached under ``"<model>:<body_hash>"``. A chunk
    without a ``body_hash`` is never cached: all such chunks would share one
    key and silently reuse each other's vectors. A failure to persist the
    cache is logged and does not discard the computed ranking.

    Exits the process (via ollama_url) when OLLAMA_URL is off-localhost and
    ``allow_remote`` is false.
    """
    url = ollama_url(allow_remote)
    alive, models = ollama_alive(url)
    if not alive:
        log("ollama unreachable — no-op rerank")
        return _mark_noop(candidates, "noop-no-ollama", top_k)
    if DEFAULT_MODEL not in models:
        log(f"model {DEFAULT_MODEL} not pulled — no-op rerank")
        return _mark_noop(candidates, "noop-no-model", top_k)

    try:
        q_emb = embed_one(url, DEFAULT_MODEL, query)
    except Exception as e:
        log(f"query embed failed: {e}")
        return _mark_noop(candidates, "noop-embed-error", top_k)

    cache = load_cache()
    cache_dirty = False
    for c in candidates:
        chunk = load_chunk(c.get("path", ""))
        if not chunk:
            c["rerank_score"] = 0.0
            c["rerank_source"] = "missing-chunk"
            continue
        body_hash = chunk.get("body_hash", "")
        cache_key = f"{DEFAULT_MODEL}:{body_hash}" if body_hash else None
        emb = cache.get(cache_key) if cache_key else None
        if not emb:
            text = chunk.get("contextualized_text") or chunk.get("raw_text", "")
            try:
                emb = embed_one(url, DEFAULT_MODEL, text)
            except Exception as e:
                log(f"embed failed for {c.get('chunk_id')}: {e}")
                # NOTE(review): this keeps the upstream score, which is on a
                # different scale than the cosine scores it is sorted against,
                # so such a candidate can outrank successfully embedded ones.
                c["rerank_score"] = _upstream_score(c)
                c["rerank_source"] = "embed-error"
                continue
            if cache_key and emb:
                cache[cache_key] = emb
                cache_dirty = True
        c["rerank_score"] = cosine(q_emb, emb)
        c["rerank_source"] = f"cosine:{DEFAULT_MODEL}"

    if cache_dirty:
        try:
            save_cache(cache)
        except OSError as e:
            # The cache is an optimisation; never lose a finished ranking over it.
            log(f"WARN: could not persist embed cache: {e}")

    ranked = sorted(candidates, key=lambda x: x.get("rerank_score", 0.0), reverse=True)
    return ranked[:top_k]
|
||||
|
||||
|
||||
def main():
    """CLI entry point; returns the process exit code.

    ``--peek`` prints the strategy that would be used (needs a query) and
    returns EXIT_OK. Otherwise both a query and ``--candidates`` are required
    (EXIT_USAGE if not). Candidates are read from the given path or stdin;
    an unreadable source, invalid JSON, a non-list root, or any entry that is
    not a JSON object returns EXIT_CANDIDATES with a one-line error instead
    of a traceback. On success the reranked list is printed as JSON.
    """
    parser = argparse.ArgumentParser(description="Rerank chunk candidates by semantic similarity.")
    parser.add_argument("query", nargs="?", help="Query text")
    parser.add_argument("--candidates", help="Path to candidates JSON or `-` for stdin",
                        default=None)
    parser.add_argument("--top", type=int, default=5, help="Top-K to return")
    parser.add_argument("--peek", action="store_true",
                        help="Print rerank strategy chosen and exit")
    parser.add_argument("--allow-remote-ollama", action="store_true",
                        help="Accept non-localhost OLLAMA_URL (potential data exfil)")
    args = parser.parse_args()

    if args.peek:
        if not args.query:
            log("--peek needs a query string")
            sys.exit(EXIT_USAGE)
        url = ollama_url(args.allow_remote_ollama)
        alive, models = ollama_alive(url)
        strategy = "noop-no-ollama"
        if alive:
            strategy = f"cosine:{DEFAULT_MODEL}" if DEFAULT_MODEL in models else "noop-no-model"
        print(json.dumps({
            "query": args.query,
            "strategy": strategy,
            "ollama_url": url,
            "ollama_alive": alive,
            "model_present": DEFAULT_MODEL in models,
            "checked_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
        }, indent=2))
        return EXIT_OK

    if not args.query or args.candidates is None:
        log("usage: rerank.py <query> --candidates <path|-> [--top N]")
        return EXIT_USAGE

    try:
        if args.candidates == "-":
            cand_text = sys.stdin.read()
        else:
            cand_text = Path(args.candidates).read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as e:
        # Missing/unreadable/undecodable input is malformed candidate input.
        log(f"ERR: cannot read candidates from {args.candidates}: {e}")
        return EXIT_CANDIDATES

    try:
        candidates = json.loads(cand_text)
        if not isinstance(candidates, list):
            raise ValueError("candidates must be a JSON list")
        # rerank() calls .get() on every entry and mutates it, so each must be an object.
        if not all(isinstance(c, dict) for c in candidates):
            raise ValueError("every candidate must be a JSON object")
    except (json.JSONDecodeError, ValueError) as e:
        log(f"ERR: bad candidates JSON: {e}")
        return EXIT_CANDIDATES

    result = rerank(args.query, candidates, top_k=args.top,
                    allow_remote=args.allow_remote_ollama)
    print(json.dumps(result, indent=2))
    return EXIT_OK
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
@@ -0,0 +1,195 @@
|
||||
#!/usr/bin/env python3
|
||||
"""retrieve.py — hybrid retrieval orchestrator for the Compound Vault.
|
||||
|
||||
Pipeline (v1.7):
|
||||
query → bm25-index.py query (top-K candidates by BM25 over contextualized chunks)
|
||||
→ rerank.py (cosine on nomic-embed-text vectors via ollama,
|
||||
or no-op if ollama unavailable)
|
||||
→ drill (return chunk pages with absolute paths so the
|
||||
caller can Read them and synthesize)
|
||||
|
||||
Loads sibling scripts as Python modules (no subprocess overhead). Falls back
|
||||
gracefully when index or rerank stage is missing:
|
||||
- If .vault-meta/bm25/index.json is absent → exit 10 with friendly message;
|
||||
caller falls back to v1.6 legacy
|
||||
hot→index→drill read order.
|
||||
- If .vault-meta/chunks/ is empty → exit 10 (same).
|
||||
- If rerank stage cannot embed (no ollama) → no-op rerank, returns BM25 order.
|
||||
|
||||
Output schema (JSON to stdout):
|
||||
{
|
||||
"query": "...",
|
||||
"strategy": "bm25+rerank:cosine:nomic-embed-text" | "bm25+rerank:noop-no-ollama" | "bm25-only",
|
||||
"top_k": 5,
|
||||
"candidates": [
|
||||
{
|
||||
"chunk_id": "c-000042:3",
|
||||
"page_address": "c-000042",
|
||||
"page_path": "wiki/concepts/Foo.md",
|
||||
"absolute_path": "/abs/path/to/wiki/concepts/Foo.md",
|
||||
"chunk_index": 3,
|
||||
"bm25_score": 7.12,
|
||||
"rerank_score": 0.81,
|
||||
"rerank_source": "cosine:nomic-embed-text",
|
||||
"snippet": "... first 200 chars of the chunk ..."
|
||||
},
|
||||
...
|
||||
]
|
||||
}
|
||||
|
||||
Usage:
|
||||
retrieve.py "your query here" # standard: BM25 top-20, rerank to top-5
|
||||
retrieve.py "query" --top 10 # change result count
|
||||
retrieve.py "query" --no-rerank # skip rerank, BM25-only
|
||||
retrieve.py "query" --explain # include per-stage diagnostics
|
||||
|
||||
Exit codes:
|
||||
0 — success
|
||||
2 — usage error
|
||||
10 — feature not provisioned (no chunks or no BM25 index); caller falls back
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import importlib.util
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
VAULT_ROOT = Path(__file__).resolve().parent.parent
|
||||
SCRIPTS_DIR = VAULT_ROOT / "scripts"
|
||||
META_DIR = VAULT_ROOT / ".vault-meta"
|
||||
CHUNKS_DIR = META_DIR / "chunks"
|
||||
BM25_INDEX = META_DIR / "bm25" / "index.json"
|
||||
|
||||
EXIT_OK = 0
|
||||
EXIT_USAGE = 2
|
||||
EXIT_NOT_PROVISIONED = 10
|
||||
|
||||
|
||||
def log(msg):
    """Send a diagnostic line to stderr; stdout carries only the JSON result."""
    sys.stderr.write(str(msg) + "\n")
|
||||
|
||||
|
||||
def import_sibling(name, filename):
    """Load ``SCRIPTS_DIR / filename`` as a module named ``name`` and return it.

    This exists because the sibling helpers have hyphenated file names that a
    plain ``import`` statement cannot reference.

    Exits with EXIT_NOT_PROVISIONED, after logging guidance, when the file is
    missing or when loading it raises ImportError, SyntaxError or
    AttributeError (v1.7.2; closes audit M5) - so a broken helper gives a
    friendly diagnostic rather than a bare traceback on the first retrieve
    call.
    """
    target = SCRIPTS_DIR / filename
    if target.is_file():
        try:
            spec = importlib.util.spec_from_file_location(name, target)
            module = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(module)
        except (ImportError, SyntaxError, AttributeError) as e:
            for line in (
                f"ERR: failed to import sibling helper {filename}: {type(e).__name__}: {e}",
                "     This likely means the helper script is corrupted or has a syntax error.",
                "     Run `python3 scripts/<helper>.py --help` directly to see the underlying error.",
                "     If it persists: re-clone the repo or check `git status` for local damage.",
            ):
                log(line)
            sys.exit(EXIT_NOT_PROVISIONED)
        return module

    log(f"ERR: sibling helper {filename} not found at {target}")
    log("     Run `bash bin/setup-retrieve.sh --check` to verify the install.")
    sys.exit(EXIT_NOT_PROVISIONED)
|
||||
|
||||
|
||||
def chunk_snippet(chunk_data, max_chars=200):
    """Return a preview of the chunk's ``raw_text``.

    Text of at most ``max_chars`` characters is returned unchanged. Longer
    text is cut to ``max_chars``, stripped of trailing whitespace and suffixed
    with an ellipsis character.
    """
    text = chunk_data.get("raw_text", "")
    needs_cut = len(text) > max_chars
    return f"{text[:max_chars].rstrip()}…" if needs_cut else text
|
||||
|
||||
|
||||
def main():
    """CLI entry point for hybrid retrieval; returns the process exit code.

    Steps: check that the BM25 index and chunk directory exist
    (EXIT_NOT_PROVISIONED otherwise), query BM25 for ``--bm25-top`` hits, load
    each hit's chunk file (unreadable chunks are skipped), optionally rerank,
    collapse to the best chunk per page, then print the top ``--top`` pages as
    JSON on stdout.

    Truncation to ``--top`` happens only after the per-page collapse. Doing it
    before would return fewer than ``--top`` pages whenever several of the
    leading chunks belong to the same page, even though further pages were
    available among the candidates.
    """
    parser = argparse.ArgumentParser(description="Hybrid retrieval over the vault.")
    parser.add_argument("query", help="Natural-language query")
    parser.add_argument("--top", type=int, default=5, help="Final result count (post-rerank)")
    parser.add_argument("--bm25-top", type=int, default=20,
                        help="Candidate count from BM25 (pre-rerank)")
    parser.add_argument("--no-rerank", action="store_true",
                        help="Skip the rerank stage; return BM25-only")
    parser.add_argument("--explain", action="store_true",
                        help="Include per-stage diagnostics in output")
    parser.add_argument("--allow-remote-ollama", action="store_true",
                        help="Forwarded to rerank.py")
    args = parser.parse_args()

    if not BM25_INDEX.is_file():
        log(f"ERR: no BM25 index at {BM25_INDEX}. Run `bash bin/setup-retrieve.sh` "
            "to provision, or fall back to legacy hot→index→drill.")
        return EXIT_NOT_PROVISIONED
    if not CHUNKS_DIR.is_dir() or not any(CHUNKS_DIR.iterdir()):
        log(f"ERR: no chunks at {CHUNKS_DIR}. Run "
            "`python3 scripts/contextual-prefix.py --all` first.")
        return EXIT_NOT_PROVISIONED

    bm25 = import_sibling("bm25_index", "bm25-index.py")
    reranker = import_sibling("rerank", "rerank.py")

    bm25_hits = bm25.query(args.query, top_k=args.bm25_top)
    log(f"bm25: {len(bm25_hits)} hits")

    candidates = []
    for h in bm25_hits:
        chunk_path = VAULT_ROOT / h["path"]
        try:
            chunk = json.loads(chunk_path.read_text(encoding="utf-8"))
        except (json.JSONDecodeError, OSError):
            continue
        # `or ""` also covers an explicit null page_path, which would
        # otherwise raise TypeError when joined onto VAULT_ROOT.
        page_path = chunk.get("page_path") or ""
        candidates.append({
            "chunk_id": h["chunk_id"],
            "page_address": chunk.get("page_address"),
            "page_path": chunk.get("page_path"),
            "absolute_path": str((VAULT_ROOT / page_path).resolve()),
            "chunk_index": chunk.get("chunk_index"),
            "bm25_score": h["score"],
            "path": h["path"],
            "snippet": chunk_snippet(chunk),
        })

    if args.no_rerank:
        final = candidates
        strategy = "bm25-only"
        for c in final:
            c["rerank_score"] = c["bm25_score"]
            c["rerank_source"] = "skipped"
    else:
        # Ask for every candidate back; the cut to --top is made after dedupe.
        final = reranker.rerank(
            args.query, candidates, top_k=len(candidates),
            allow_remote=args.allow_remote_ollama,
        )
        # Derive strategy from first candidate's rerank_source
        first_src = (final[0].get("rerank_source") if final else "unknown")
        strategy = f"bm25+rerank:{first_src}"

    # Dedupe by page (we may have multiple chunks of the same page; collapse to best).
    # Chunks without a page address are keyed by page path, then chunk id, so
    # unrelated address-less chunks are not merged into a single entry.
    by_page = {}
    for c in final:
        key = c.get("page_address") or c.get("page_path") or c.get("chunk_id")
        if key not in by_page or c.get("rerank_score", 0) > by_page[key].get("rerank_score", 0):
            by_page[key] = c
    deduped = list(by_page.values())
    deduped.sort(key=lambda c: c.get("rerank_score", 0), reverse=True)

    out = {
        "query": args.query,
        "strategy": strategy,
        "top_k": args.top,
        "candidates": deduped[:args.top],
    }
    if args.explain:
        out["explain"] = {
            "bm25_candidate_count": len(bm25_hits),
            "post_rerank_count": len(final),
            "deduped_count": len(deduped),
            "bm25_top_param": args.bm25_top,
        }

    print(json.dumps(out, indent=2, ensure_ascii=False))
    return EXIT_OK
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
@@ -0,0 +1,496 @@
|
||||
#!/usr/bin/env python3
|
||||
"""tiling-check.py — DragonScale Mechanism 3: semantic tiling lint.
|
||||
|
||||
Computes per-page embeddings via a local ollama instance and reports
|
||||
candidate duplicate page pairs. Read-only; never modifies wiki pages.
|
||||
|
||||
Security model:
|
||||
- Defaults to http://127.0.0.1:11434. Remote ollama endpoints require
|
||||
--allow-remote-ollama explicitly (vault bodies are POSTed as embedding
|
||||
input; a hostile env var would otherwise exfiltrate content).
|
||||
- Rejects symlinked page files to prevent escape outside the vault root.
|
||||
|
||||
Feature-gated: exits 10 if ollama is unreachable or 11 if the embedding
|
||||
model is not pulled, so the calling skill can no-op gracefully. Exits 0
|
||||
on success. Exit 3 on cache corruption. Exit 2 on usage error.
|
||||
|
||||
Concurrency:
|
||||
- Locks `.vault-meta/.tiling.lock` (flock exclusive) around cache I/O.
|
||||
- Per-PID temp file to avoid shared-tempfile races.
|
||||
|
||||
Usage:
|
||||
tiling-check.py # run; exit 10/11 if ollama/model missing
|
||||
tiling-check.py --report PATH # also write report to PATH
|
||||
tiling-check.py --rebuild-cache # ignore cached embeddings
|
||||
tiling-check.py --peek # structured diagnostics; no compute
|
||||
tiling-check.py --allow-remote-ollama # accept non-localhost OLLAMA_URL
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import fcntl
|
||||
import hashlib
|
||||
import json
|
||||
import math
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import urllib.error
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
DEFAULT_OLLAMA_URL = "http://127.0.0.1:11434"
|
||||
DEFAULT_MODEL = "nomic-embed-text"
|
||||
OLLAMA_TIMEOUT_SEC = 3
|
||||
EMBED_TIMEOUT_SEC = 30
|
||||
MAX_RESPONSE_BYTES = 4 * 1024 * 1024 # 4 MB; embeddings can be ~10 KB each
|
||||
|
||||
VAULT_ROOT = Path(__file__).resolve().parent.parent
|
||||
WIKI_DIR = VAULT_ROOT / "wiki"
|
||||
META_DIR = VAULT_ROOT / ".vault-meta"
|
||||
CACHE_PATH = META_DIR / "tiling-cache.json"
|
||||
CACHE_LOCK = META_DIR / ".tiling.lock"
|
||||
THRESHOLDS_PATH = META_DIR / "tiling-thresholds.json"
|
||||
|
||||
EXCLUDE_TYPES = {"meta", "fold"}
|
||||
EXCLUDE_FILENAMES = {
|
||||
"_index.md", "index.md", "log.md", "hot.md", "overview.md",
|
||||
"dashboard.md", "Wiki Map.md", "getting-started.md",
|
||||
}
|
||||
EXCLUDE_PATH_PREFIXES = ("wiki/folds/", "wiki/meta/")
|
||||
MAX_BODY_BYTES = 128 * 1024
|
||||
SCALE_WARN_PAGES = 500
|
||||
SCALE_HARD_FAIL_PAGES = 5000
|
||||
|
||||
EXIT_OK = 0
|
||||
EXIT_USAGE = 2
|
||||
EXIT_CACHE_CORRUPT = 3
|
||||
EXIT_SCALE_EXCEEDED = 4
|
||||
EXIT_NO_OLLAMA = 10
|
||||
EXIT_NO_MODEL = 11
|
||||
|
||||
FRONTMATTER_RE = re.compile(r"^---\n(.*?)\n---\n", re.DOTALL)
|
||||
TYPE_RE = re.compile(r"^type:\s*(\S+)", re.MULTILINE)
|
||||
|
||||
|
||||
def log(msg: str) -> None:
    """Emit ``msg`` followed by a newline on stderr."""
    sys.stderr.write("%s\n" % (msg,))
|
||||
|
||||
|
||||
def _is_local_url(url: str) -> bool:
|
||||
try:
|
||||
host = urllib.parse.urlparse(url).hostname or ""
|
||||
except ValueError:
|
||||
return False
|
||||
return host in ("127.0.0.1", "localhost", "::1")
|
||||
|
||||
|
||||
def _http_get_json(url: str, timeout: float) -> dict:
|
||||
with urllib.request.urlopen(url, timeout=timeout) as resp:
|
||||
raw = resp.read(MAX_RESPONSE_BYTES + 1)
|
||||
if len(raw) > MAX_RESPONSE_BYTES:
|
||||
raise RuntimeError("response exceeded size limit")
|
||||
return json.loads(raw.decode("utf-8"))
|
||||
|
||||
|
||||
def _http_post_json(url: str, payload: dict, timeout: float) -> dict:
|
||||
data = json.dumps(payload).encode("utf-8")
|
||||
req = urllib.request.Request(url, data=data, headers={"Content-Type": "application/json"})
|
||||
with urllib.request.urlopen(req, timeout=timeout) as resp:
|
||||
raw = resp.read(MAX_RESPONSE_BYTES + 1)
|
||||
if len(raw) > MAX_RESPONSE_BYTES:
|
||||
raise RuntimeError("response exceeded size limit")
|
||||
return json.loads(raw.decode("utf-8"))
|
||||
|
||||
|
||||
def detect_ollama(url: str) -> bool:
    """Return True when ``{url}/api/version`` answers with parseable JSON.

    Network, timeout, decode and size-limit errors all mean "not reachable".
    """
    probe_errors = (urllib.error.URLError, OSError, ValueError, TimeoutError, RuntimeError)
    try:
        _http_get_json(f"{url}/api/version", OLLAMA_TIMEOUT_SEC)
    except probe_errors:
        return False
    return True
|
||||
|
||||
|
||||
def detect_model(url: str, model: str) -> bool:
    """Return True when ``{url}/api/tags`` lists ``model``.

    An entry matches when its ``name`` is exactly ``model`` or starts with
    ``"<model>:"``. Request failures, a non-list ``models`` field and
    malformed entries all count as "not present".
    """
    try:
        listing = _http_get_json(f"{url}/api/tags", OLLAMA_TIMEOUT_SEC)
    except (urllib.error.URLError, OSError, ValueError, TimeoutError, RuntimeError):
        return False

    entries = listing.get("models")
    if not isinstance(entries, list):
        return False

    tagged_prefix = f"{model}:"

    def _matches(entry) -> bool:
        if not isinstance(entry, dict):
            return False
        name = entry.get("name", "")
        return isinstance(name, str) and (name == model or name.startswith(tagged_prefix))

    return any(_matches(entry) for entry in entries)
|
||||
|
||||
|
||||
def parse_frontmatter(text: str) -> tuple[dict, str]:
    """Split a page into ``(frontmatter, body)``.

    Only the ``type`` key is extracted from the frontmatter (surrounding
    quotes removed); all other keys are ignored. Text without a leading
    frontmatter block yields ``({}, text)``.
    """
    block = FRONTMATTER_RE.match(text)
    if block is None:
        return {}, text

    meta: dict = {}
    type_match = TYPE_RE.search(block.group(1))
    if type_match is not None:
        meta["type"] = type_match.group(1).strip().strip('"').strip("'")
    return meta, text[block.end():]
|
||||
|
||||
|
||||
def body_hash(body: str, model: str) -> str:
    """Return the SHA-256 hex digest of ``"model=<model>\\n"`` followed by ``body``.

    The model name is part of the hashed bytes, so the same body hashes
    differently under a different model.
    """
    prefix = f"model={model}\n".encode("utf-8")
    return hashlib.sha256(prefix + body.encode("utf-8")).hexdigest()
|
||||
|
||||
|
||||
def cosine(a: list[float], b: list[float]) -> float:
    """Cosine similarity of two vectors.

    Raises ValueError when the dimensions differ. Returns 0.0 when either
    vector has zero norm.
    """
    if len(a) != len(b):
        raise ValueError(f"dim mismatch: {len(a)} vs {len(b)}")
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(x * x for x in b))
    if norm_a != 0.0 and norm_b != 0.0:
        return dot / (norm_a * norm_b)
    return 0.0
|
||||
|
||||
|
||||
def _lock_cache():
|
||||
META_DIR.mkdir(exist_ok=True)
|
||||
fd = os.open(str(CACHE_LOCK), os.O_CREAT | os.O_RDWR, 0o644)
|
||||
try:
|
||||
fcntl.flock(fd, fcntl.LOCK_EX)
|
||||
except OSError:
|
||||
os.close(fd)
|
||||
raise
|
||||
return fd
|
||||
|
||||
|
||||
def _unlock_cache(fd: int) -> None:
|
||||
try:
|
||||
fcntl.flock(fd, fcntl.LOCK_UN)
|
||||
finally:
|
||||
os.close(fd)
|
||||
|
||||
|
||||
def load_cache(current_model: str) -> dict:
    """Load the tiling embedding cache for ``current_model``.

    Returns a fresh empty cache (``version`` 1, no embeddings) when the cache
    file does not exist or was written for a different model. Exits the
    process with EXIT_CACHE_CORRUPT when the file cannot be read or decoded,
    is not valid JSON, has a JSON root that is not an object, carries an
    unknown ``version``, or has an ``embeddings`` field that is not a dict.
    """
    if not CACHE_PATH.exists():
        return {"version": 1, "model": current_model, "embeddings": {}}
    try:
        with CACHE_PATH.open() as f:
            data = json.load(f)
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        log(f"ERR: cache read failed: {exc}")
        sys.exit(EXIT_CACHE_CORRUPT)
    if not isinstance(data, dict):
        # Valid JSON of the wrong shape is corruption too, not a traceback.
        log(f"ERR: cache root is not a JSON object (got {type(data).__name__})")
        sys.exit(EXIT_CACHE_CORRUPT)
    if data.get("version") != 1:
        log(f"ERR: unknown cache version: {data.get('version')}")
        sys.exit(EXIT_CACHE_CORRUPT)
    cached_model = data.get("model", "")
    if cached_model != current_model:
        log(f"INFO: cached model '{cached_model}' differs from current '{current_model}'; invalidating cache")
        return {"version": 1, "model": current_model, "embeddings": {}}
    if not isinstance(data.get("embeddings"), dict):
        log("ERR: cache.embeddings is not a dict")
        sys.exit(EXIT_CACHE_CORRUPT)
    return data
|
||||
|
||||
|
||||
def save_cache(cache: dict) -> None:
    """Write ``cache`` to CACHE_PATH via a temp file and rename.

    The JSON is first written to a sibling file named
    ``<stem>.<pid>.tmp`` and then moved over CACHE_PATH with
    ``Path.replace``, so the cache file is never left half-written. If
    serialization or the rename fails, the temp file is removed and the
    original exception is re-raised; the previous cache file is untouched.
    """
    META_DIR.mkdir(exist_ok=True)
    tmp = CACHE_PATH.with_name(f"{CACHE_PATH.stem}.{os.getpid()}.tmp")
    try:
        with tmp.open("w") as f:
            json.dump(cache, f, indent=2)
        tmp.replace(CACHE_PATH)
    except BaseException:
        # Do not leave a partial temp file behind on failure or interrupt.
        try:
            tmp.unlink()
        except OSError:
            pass
        raise
|
||||
|
||||
|
||||
def load_thresholds() -> dict:
    """Return the similarity thresholds used to band page pairs.

    When THRESHOLDS_PATH exists its JSON content is returned as-is (no schema
    validation happens here). Otherwise a built-in uncalibrated default is
    returned with bands error=0.90 and review=0.80.
    """
    if THRESHOLDS_PATH.exists():
        with THRESHOLDS_PATH.open() as handle:
            return json.load(handle)
    defaults = {
        "version": 1,
        "model": DEFAULT_MODEL,
        "bands": {"error": 0.90, "review": 0.80},
        "calibrated": False,
        "calibration_pairs_labeled": 0,
    }
    return defaults
|
||||
|
||||
|
||||
def included(path: Path, fm: dict) -> tuple[bool, str]:
    """Decide whether a page takes part in the tiling check.

    Returns ``(True, "included")`` or ``(False, <reason>)``. Checks run in
    this order: symlink, resolves outside the vault, excluded filename,
    excluded path prefix, excluded frontmatter ``type``.

    Raises ValueError (from ``relative_to``) if ``path`` is not located under
    VAULT_ROOT to begin with.
    """
    vault_rel = path.relative_to(VAULT_ROOT).as_posix()

    if path.is_symlink():
        return False, "symlink"

    real = path.resolve()
    try:
        real.relative_to(VAULT_ROOT.resolve())
    except ValueError:
        return False, "escapes vault"

    if path.name in EXCLUDE_FILENAMES:
        return False, "excluded filename"

    matched = next(
        (pre for pre in EXCLUDE_PATH_PREFIXES if vault_rel.startswith(pre)),
        None,
    )
    if matched is not None:
        return False, f"under {matched}"

    if fm.get("type") in EXCLUDE_TYPES:
        return False, f"type={fm['type']}"

    return True, "included"
|
||||
|
||||
|
||||
def embed(text: str, model: str, url: str) -> list[float]:
    """Request an embedding for ``text`` from ``<url>/api/embeddings``.

    POSTs ``{"model": model, "prompt": text}`` with a timeout of
    EMBED_TIMEOUT_SEC and returns the ``embedding`` list from the reply.

    Raises:
        RuntimeError: if the reply has no non-empty ``embedding`` list, or if
            any element of it is not an int or float.
    """
    endpoint = f"{url}/api/embeddings"
    request_body = {"model": model, "prompt": text}
    reply = _http_post_json(endpoint, request_body, EMBED_TIMEOUT_SEC)

    vector = reply.get("embedding")
    if not (isinstance(vector, list) and vector):
        raise RuntimeError(f"ollama returned no embedding: {str(reply)[:200]}")
    if any(not isinstance(component, (int, float)) for component in vector):
        raise RuntimeError("embedding contains non-numeric values")
    return vector
|
||||
|
||||
|
||||
def run_check(
    rebuild: bool,
    report_path: Path | None,
    ollama_url: str,
    model: str,
) -> int:
    """Embed every eligible wiki page and report near-duplicate pairs.

    Args:
        rebuild: when true, ignore the on-disk cache and start from an empty one.
        report_path: optional file to also write the report to; it must
            resolve inside VAULT_ROOT.
        ollama_url: base URL of the ollama server.
        model: embedding model name.

    Returns:
        EXIT_NO_OLLAMA / EXIT_NO_MODEL when the backend is unavailable,
        EXIT_SCALE_EXCEEDED when there are more candidate files than
        SCALE_HARD_FAIL_PAGES, EXIT_USAGE when ``report_path`` resolves
        outside the vault, otherwise EXIT_OK.

    The report is always printed to stdout. Note that the ``report_path``
    confinement check runs only after all embedding work and after the report
    has been printed.
    """
    if not detect_ollama(ollama_url):
        log(f"ollama not reachable at {ollama_url}; skipping tiling check")
        return EXIT_NO_OLLAMA
    if not detect_model(ollama_url, model):
        log(f"model '{model}' not pulled; run: ollama pull {model}")
        return EXIT_NO_MODEL

    thresholds = load_thresholds()

    # The cache lock is held for the whole scan/embed/save phase and released
    # in the finally clause, including on the early scale-limit return.
    lock_fd = _lock_cache()
    try:
        cache = (load_cache(model) if not rebuild
                 else {"version": 1, "model": model, "embeddings": {}})

        pages: list[tuple[str, list[float]]] = []
        scanned = 0
        computed = 0
        cached_hits = 0
        skipped_counts: dict[str, int] = {}
        live_paths: set[str] = set()

        # Sorted so pages are processed (and pairs reported) in a stable order.
        candidates = sorted(WIKI_DIR.rglob("*.md"))
        scale_n = len(candidates)
        if scale_n > SCALE_HARD_FAIL_PAGES:
            log(f"ERR: {scale_n} pages exceed hard-fail limit {SCALE_HARD_FAIL_PAGES}")
            return EXIT_SCALE_EXCEEDED
        if scale_n > SCALE_WARN_PAGES:
            log(f"WARN: {scale_n} pages; cold-cache embed will issue ~{scale_n} POSTs to ollama")

        for md in candidates:
            scanned += 1
            # Symlink and vault-root guards must run BEFORE read_text so a
            # hostile symlink cannot cause off-vault content to be read and
            # POSTed to the embedding endpoint.
            if md.is_symlink():
                skipped_counts["symlink"] = skipped_counts.get("symlink", 0) + 1
                continue
            try:
                resolved = md.resolve(strict=True)
                resolved.relative_to(VAULT_ROOT.resolve())
            except (OSError, ValueError):
                skipped_counts["escapes vault"] = skipped_counts.get("escapes vault", 0) + 1
                continue
            try:
                text = md.read_text(encoding="utf-8")
            except (OSError, UnicodeDecodeError):
                skipped_counts["read_error"] = skipped_counts.get("read_error", 0) + 1
                continue
            # The size limit is applied to the whole file text (frontmatter
            # included), before the frontmatter is split off.
            if len(text.encode("utf-8")) > MAX_BODY_BYTES:
                skipped_counts["too_large"] = skipped_counts.get("too_large", 0) + 1
                continue
            fm, body = parse_frontmatter(text)
            ok, reason = included(md, fm)
            if not ok:
                skipped_counts[reason] = skipped_counts.get(reason, 0) + 1
                continue
            rel = md.relative_to(VAULT_ROOT).as_posix()
            # Marked live before embedding, so a page whose embed call fails
            # below keeps any existing cache entry through the orphan GC.
            live_paths.add(rel)
            h = body_hash(body, model)
            entry = cache["embeddings"].get(rel)
            # Cache hit: the stored hash matches the current body+model hash.
            # NOTE(review): entry["embedding"] is trusted as-is here; its type
            # and length are not re-validated.
            if entry and entry.get("hash") == h:
                pages.append((rel, entry["embedding"]))
                cached_hits += 1
                continue
            try:
                emb = embed(body, model, ollama_url)
            except Exception as exc:
                # Best-effort: one failing page is logged and counted, and the
                # scan continues with the remaining pages.
                log(f"ERR embedding {rel}: {exc}")
                skipped_counts["embed_error"] = skipped_counts.get("embed_error", 0) + 1
                continue
            cache["embeddings"][rel] = {
                "hash": h,
                "embedding": emb,
                "computed_at": datetime.utcnow().isoformat(timespec="seconds") + "Z",
            }
            pages.append((rel, emb))
            computed += 1

        # Orphan GC: drop cache entries for paths that no longer exist.
        # Because membership is decided by live_paths, entries for pages that
        # were skipped before reaching live_paths.add() this run (unreadable,
        # too large, excluded) are dropped as well.
        orphans = [k for k in cache["embeddings"] if k not in live_paths]
        for k in orphans:
            del cache["embeddings"][k]

        save_cache(cache)
    finally:
        _unlock_cache(lock_fd)

    review = thresholds["bands"]["review"]
    error_ = thresholds["bands"]["error"]
    # All-pairs comparison: every unordered pair of embedded pages is scored.
    pairs: list[tuple[float, str, str]] = []
    for i in range(len(pages)):
        for j in range(i + 1, len(pages)):
            a_path, a_emb = pages[i]
            b_path, b_emb = pages[j]
            try:
                sim = cosine(a_emb, b_emb)
            except ValueError as exc:
                # cosine raises ValueError when the two vectors differ in length.
                log(f"WARN cosine skip ({a_path}, {b_path}): {exc}")
                continue
            if sim >= review:
                pairs.append((sim, a_path, b_path))
    # Highest similarity first (ties fall back to path ordering, descending).
    pairs.sort(reverse=True)

    errors = [p for p in pairs if p[0] >= error_]
    reviews = [p for p in pairs if review <= p[0] < error_]

    # Build the Markdown report.
    out_lines: list[str] = []
    out_lines.append("# Semantic Tiling Report")
    out_lines.append("")
    out_lines.append(f"- generated: {datetime.utcnow().isoformat(timespec='seconds')}Z")
    out_lines.append(f"- model: {model}")
    out_lines.append(f"- ollama_url: {ollama_url}")
    out_lines.append(f"- thresholds: error>={error_}, review={review}-{error_}")
    out_lines.append(f"- calibrated: {thresholds.get('calibrated', False)}"
                     + (" (using uncalibrated defaults)" if not thresholds.get("calibrated") else ""))
    out_lines.append(f"- pages scanned: {scanned}; embedded: {len(pages)}; skipped: {sum(skipped_counts.values())}")
    if skipped_counts:
        out_lines.append("- skipped reasons: " + ", ".join(f"{k}={v}" for k, v in sorted(skipped_counts.items())))
    out_lines.append(f"- cache hits: {cached_hits}; recomputed: {computed}; orphans pruned: {len(orphans)}")
    out_lines.append("")
    out_lines.append(f"## Errors (similarity >= {error_})")
    out_lines.append("")
    if not errors:
        out_lines.append("- none")
    else:
        for sim, a, b in errors:
            out_lines.append(f"- `{sim:.4f}` {a} -- {b}")
    out_lines.append("")
    out_lines.append(f"## Review ({review} <= similarity < {error_})")
    out_lines.append("")
    if not reviews:
        out_lines.append("- none")
    else:
        for sim, a, b in reviews:
            out_lines.append(f"- `{sim:.4f}` {a} -- {b}")
    report = "\n".join(out_lines) + "\n"

    print(report)
    if report_path is not None:
        # Confine report writes to VAULT_ROOT. A path that resolves outside
        # the vault is refused (prevents `--report /etc/passwd` style
        # accidents or hostile args from writing outside the repo).
        try:
            resolved_report = (
                report_path if report_path.is_absolute() else (Path.cwd() / report_path)
            ).resolve()
            resolved_report.relative_to(VAULT_ROOT.resolve())
        except ValueError:
            log(f"ERR: --report path '{report_path}' escapes vault root {VAULT_ROOT}")
            return EXIT_USAGE
        resolved_report.parent.mkdir(parents=True, exist_ok=True)
        resolved_report.write_text(report, encoding="utf-8")
        log(f"report written: {resolved_report}")

    return EXIT_OK
|
||||
|
||||
|
||||
def cmd_peek(ollama_url: str, model: str) -> int:
    """Structured diagnostics: print one JSON object describing the setup.

    The object reports the script path and executability, the Python
    interpreter, vault root, ollama reachability, model presence, and the
    state of the cache and thresholds files. Nothing is modified.

    Malformed cache or thresholds files (unreadable, undecodable, invalid
    JSON, or JSON that is not an object) are reported through the
    ``*_readable`` fields instead of raising.

    Returns:
        EXIT_NO_OLLAMA if ollama is unreachable, EXIT_NO_MODEL if the model is
        missing, EXIT_CACHE_CORRUPT if a cache file exists but is unusable,
        otherwise EXIT_OK.
    """
    diag: dict = {}
    script_path = Path(__file__).resolve()
    diag["script_path"] = str(script_path)
    diag["script_executable"] = os.access(script_path, os.X_OK)
    diag["python"] = sys.executable
    diag["vault_root"] = str(VAULT_ROOT)
    diag["ollama_url"] = ollama_url
    diag["ollama_reachable"] = detect_ollama(ollama_url)
    diag["model_requested"] = model
    # Only probe for the model when the server answered at all.
    diag["model_present"] = detect_model(ollama_url, model) if diag["ollama_reachable"] else False

    diag["cache_present"] = CACHE_PATH.exists()
    diag["cache_readable"] = False
    diag["cache_entries"] = 0
    diag["cache_model"] = None
    if diag["cache_present"]:
        try:
            with CACHE_PATH.open() as f:
                c = json.load(f)
        except (OSError, ValueError) as exc:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            diag["cache_error"] = str(exc)
        else:
            if isinstance(c, dict):
                embeddings = c.get("embeddings")
                diag["cache_readable"] = (c.get("version") == 1
                                          and isinstance(embeddings, dict))
                diag["cache_entries"] = len(embeddings) if isinstance(embeddings, dict) else 0
                diag["cache_model"] = c.get("model")
            else:
                # e.g. a bare list or string: valid JSON, but not a cache.
                diag["cache_error"] = (
                    f"top-level JSON value is {type(c).__name__}, expected an object"
                )

    diag["thresholds_present"] = THRESHOLDS_PATH.exists()
    diag["thresholds_readable"] = False
    if diag["thresholds_present"]:
        try:
            with THRESHOLDS_PATH.open() as f:
                t = json.load(f)
        except (OSError, ValueError):
            pass  # thresholds_readable stays False
        else:
            if isinstance(t, dict):
                diag["thresholds_readable"] = True
                diag["thresholds_calibrated"] = bool(t.get("calibrated", False))
                diag["thresholds_bands"] = t.get("bands", {})

    print(json.dumps(diag, indent=2))
    if not diag["ollama_reachable"]:
        return EXIT_NO_OLLAMA
    if not diag["model_present"]:
        return EXIT_NO_MODEL
    if diag["cache_present"] and not diag["cache_readable"]:
        return EXIT_CACHE_CORRUPT
    return EXIT_OK
|
||||
|
||||
|
||||
def main(argv: list[str]) -> int:
    """Parse CLI arguments and dispatch to ``cmd_peek`` or ``run_check``.

    The ollama URL comes from the OLLAMA_URL environment variable when set,
    else DEFAULT_OLLAMA_URL. An environment-supplied URL that
    ``_is_local_url`` rejects is refused with EXIT_USAGE unless
    ``--allow-remote-ollama`` is passed, because vault content would be
    POSTed to that host.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--report", type=Path, default=None)
    parser.add_argument("--rebuild-cache", action="store_true")
    parser.add_argument("--peek", action="store_true")
    parser.add_argument(
        "--allow-remote-ollama",
        action="store_true",
        help="allow OLLAMA_URL env override pointing outside localhost",
    )
    parser.add_argument("--model", default=DEFAULT_MODEL)
    opts = parser.parse_args(argv)

    env_url = os.environ.get("OLLAMA_URL")
    ollama_url = env_url if env_url else DEFAULT_OLLAMA_URL

    # Only an environment override can point away from the default host.
    is_remote_override = bool(env_url) and not _is_local_url(ollama_url)
    if is_remote_override and not opts.allow_remote_ollama:
        log(f"ERR: OLLAMA_URL={ollama_url!r} is not localhost. "
            f"Vault content would be POSTed to a non-local host. "
            f"Pass --allow-remote-ollama to override.")
        return EXIT_USAGE

    if opts.peek:
        return cmd_peek(ollama_url, opts.model)

    return run_check(
        rebuild=opts.rebuild_cache,
        report_path=opts.report,
        ollama_url=ollama_url,
        model=opts.model,
    )
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main(sys.argv[1:]))
|
||||
@@ -0,0 +1,341 @@
|
||||
#!/usr/bin/env bash
|
||||
# wiki-lock.sh — per-file advisory locking for safe multi-writer vault mutation.
|
||||
#
|
||||
# Closes the latent multi-writer corruption bug in v1.6 where two parallel
|
||||
# sub-agents writing to the same wiki page could silently trample each other.
|
||||
# The README and skills/wiki-ingest/SKILL.md §259-264 documented "single-writer
|
||||
# only" as a convention; this script makes it an enforceable guard.
|
||||
#
|
||||
# Design (age-based, not flock-style):
|
||||
# flock(2) advisory locks release when the holding process exits. That
|
||||
# doesn't fit our model where `acquire` and `release` are SEPARATE bash
|
||||
# invocations from the same skill (each Bash tool call is its own short-
|
||||
# lived process — neither's PID survives long enough to mean anything).
|
||||
# So we use atomic lockfile creation with `set -o noclobber` plus
|
||||
# epoch-timestamp AGE-based staleness detection. Race-safe because the
|
||||
# noclobber write itself is atomic on POSIX filesystems.
|
||||
#
|
||||
# The PID written into the lockfile is informational only (helpful for
|
||||
# `list` and debugging). The acquire decision considers AGE only:
|
||||
# - If lockfile age < STALE_AFTER_SEC → refuse (return 75 EX_TEMPFAIL)
|
||||
# - If lockfile age >= STALE_AFTER_SEC → reap and acquire
|
||||
# Default STALE_AFTER_SEC=60. Long enough for any single skill operation
|
||||
# (page writes are milliseconds; a multi-write ingest pass is seconds);
|
||||
# short enough that a crashed holder unblocks quickly.
|
||||
#
|
||||
# Semantics:
|
||||
# acquire <vault-rel-path>
|
||||
# - Computes lock_file = .vault-meta/locks/<sha1(path)>.lock
|
||||
# - Atomically creates the lockfile with this process's PID + epoch
|
||||
# - Returns 0 if acquired, 75 (EX_TEMPFAIL) if held and age < threshold
|
||||
# - Auto-reaps locks older than STALE_AFTER_SEC
|
||||
# release <vault-rel-path>
|
||||
# - Removes the lockfile unconditionally (rm -f). Idempotent.
|
||||
# - Cross-process release IS allowed by design — acquire and release
|
||||
# are typically separate bash invocations from the same skill, and
|
||||
# PID-matching would never succeed. Skill authors are trusted not to
|
||||
# release locks they don't own; that's no weaker than `rm` on the
|
||||
# lockfile directly.
|
||||
# list
|
||||
# - Prints currently-held lock records (one per line: pid age path).
|
||||
# clear-stale [--max-age N]
|
||||
# - Removes lockfiles whose PID is dead OR whose age > N seconds.
|
||||
# Default N = 3600 (1h). Returns count removed via stdout.
|
||||
# (The N=3600 default is intentionally generous because clear-stale
|
||||
# is admin-grade cleanup, distinct from the per-acquire age threshold.)
|
||||
# peek <vault-rel-path>
|
||||
# - Prints holder info or "unheld"; exit 0; never mutates.
|
||||
#
|
||||
# Globals:
|
||||
# STALE_AFTER_SEC — default 60. Override via --stale-after-sec N on any cmd.
|
||||
#
|
||||
# Age-threshold naming (v1.7.2; closes audit L6):
|
||||
# - STALE_AFTER_SEC (default 60) is the PER-ACQUIRE threshold. A new
|
||||
# acquire that finds an existing lock will reap-and-take if the lock is
|
||||
# older than this; refuse otherwise. Tuned for "single skill operation
|
||||
# completes within 60s."
|
||||
# - `clear-stale --max-age N` (default 3600) is the ADMIN reaper threshold,
|
||||
# meant to be run periodically by an operator or hook to sweep abandoned
|
||||
# locks. Tuned for "anything older than an hour is definitely abandoned."
|
||||
# These are two distinct concerns; both are time-since-acquire but operate
|
||||
# at different scopes. Do not unify the defaults.
|
||||
#
|
||||
# Usage:
|
||||
# bash scripts/wiki-lock.sh acquire wiki/concepts/Foo.md
|
||||
# bash scripts/wiki-lock.sh release wiki/concepts/Foo.md
|
||||
# bash scripts/wiki-lock.sh list
|
||||
# bash scripts/wiki-lock.sh clear-stale --max-age 1800
|
||||
# bash scripts/wiki-lock.sh peek wiki/concepts/Foo.md
|
||||
#
|
||||
# Exit codes:
|
||||
# 0 — success
|
||||
# 2 — usage error
|
||||
#   75 — acquire failed (lock exists and is not yet stale by age)
|
||||
# 3 — vault-meta/locks dir creation failed
|
||||
# 4 — invalid vault-relative path (escape attempt)
|
||||
|
||||
# Strict mode: exit on unhandled errors, on unset variables, and on a failure
# in any stage of a pipeline.
set -euo pipefail

# Vault root = parent of the directory that contains this script.
VAULT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
META_DIR="${VAULT_ROOT}/.vault-meta"
# One <sha1(path)>.lock file per locked vault path lives here.
LOCK_DIR="${META_DIR}/locks"
# File that with_meta_lock opens on fd 9 and flocks around each command.
META_LOCK="${META_DIR}/.wiki-lock.meta"
# Per-acquire staleness threshold in seconds (override: --stale-after-sec N).
STALE_AFTER_SEC=60
|
||||
|
||||
# ── helpers ──────────────────────────────────────────────────────────────────
|
||||
# die MESSAGE [EXIT_CODE]
# Print "ERR: MESSAGE" to stderr and exit with EXIT_CODE (default 2).
# Only the first argument is treated as the message, so the optional exit
# code is not echoed as part of it.
die() {
  local msg="${1-}"
  local code="${2:-2}"
  printf 'ERR: %s\n' "$msg" >&2
  exit "$code"
}
|
||||
# log WORDS...
# Write the arguments, joined into one line, to stderr.
log() {
  echo "$*" 1>&2
}
|
||||
|
||||
# Allow tests / non-default vault roots to override
|
||||
# WIKI_LOCK_VAULT, when set and non-empty, replaces the script-relative vault
# root (used by tests and non-default vault locations). All derived paths are
# recomputed from the new root.
if [[ -n "${WIKI_LOCK_VAULT:-}" ]]; then
  VAULT_ROOT="${WIKI_LOCK_VAULT}"
  META_DIR="${VAULT_ROOT}/.vault-meta"
  LOCK_DIR="${META_DIR}/locks"
  META_LOCK="${META_DIR}/.wiki-lock.meta"
fi
|
||||
|
||||
# sha1_of STRING
# Print the lowercase hex SHA-1 of STRING (no trailing newline is hashed).
# Uses sha1sum when available, otherwise falls back to `shasum -a 1` (macOS).
sha1_of() {
  local -a hasher=(shasum -a 1)
  if command -v sha1sum >/dev/null 2>&1; then
    hasher=(sha1sum)
  fi
  # Both tools print "<hex>  -"; keep only the digest column.
  printf '%s' "$1" | "${hasher[@]}" | awk '{print $1}'
}
|
||||
|
||||
# ensure_dirs
# Create LOCK_DIR (and parents) if needed; exit with status 3 via die when
# that is not possible. mkdir's own error output is suppressed.
ensure_dirs() {
  if ! mkdir -p "$LOCK_DIR" 2>/dev/null; then
    die "cannot create $LOCK_DIR" 3
  fi
}
|
||||
|
||||
# validate_path VAULT_REL_PATH
# Exit with status 4 (via die) unless the argument is an acceptable
# vault-relative path. Rejected: the empty string, absolute paths, anything
# containing "..", and anything containing a newline or carriage return
# (those would break the one-record-per-line lockfile format).
#
# When python3 is available the path is additionally canonicalised against
# VAULT_ROOT; a path that resolves outside the vault (e.g. through a symlink)
# is rejected too. Without python3 that extra check is skipped.
validate_path() {
  local candidate="$1"

  if [ -z "$candidate" ]; then
    die "path cannot be empty" 4
  fi
  if [[ "$candidate" == /* ]]; then
    die "path must be vault-relative, not absolute: $candidate" 4
  fi
  if [[ "$candidate" == *..* ]]; then
    die "path may not contain '..': $candidate" 4
  fi
  if [[ "$candidate" == *$'\n'* ]]; then
    die "path may not contain newlines (lockfile format would break)" 4
  fi
  if [[ "$candidate" == *$'\r'* ]]; then
    die "path may not contain carriage returns" 4
  fi

  # Symlink canonicalisation. The path itself need not exist: realpath
  # resolves whatever prefix does. python3 is used instead of realpath(1)
  # because the latter's flags differ between GNU coreutils and macOS.
  if command -v python3 >/dev/null 2>&1; then
    local verdict
    verdict=$(VAULT_ROOT_BASH="$VAULT_ROOT" P_BASH="$candidate" python3 -c '
import os, sys
root = os.path.realpath(os.environ["VAULT_ROOT_BASH"])
candidate = os.environ["P_BASH"]
target = os.path.realpath(os.path.join(root, candidate))
common = os.path.commonpath([root, target]) if target else ""
sys.stdout.write("INSIDE" if common == root else "OUTSIDE")
' 2>/dev/null)
    if [ "$verdict" = "OUTSIDE" ]; then
      die "path resolves outside vault via symlink: $candidate" 4
    fi
  fi
  return 0
}
|
||||
|
||||
# now_epoch
# Print the current time as seconds since the Unix epoch.
now_epoch() {
  date '+%s'
}
|
||||
|
||||
# is_alive PID
# Succeed when a process with that PID exists and may be signalled by us.
# Signal 0 performs the existence/permission check without delivering
# anything; kill's error output is discarded.
is_alive() {
  local pid="$1"
  kill -0 "$pid" 2>/dev/null
}
|
||||
|
||||
# Atomic meta-lock wrapper. Funcs that mutate LOCK_DIR call under this lock so
|
||||
# acquire/release/clear-stale don't race against each other.
|
||||
# with_meta_lock COMMAND [ARGS...]
# Run COMMAND while holding an exclusive flock on META_LOCK (fd 9), waiting
# at most 5 seconds for it. COMMAND runs inside a subshell, so its variable
# changes do not leak to the caller; its exit status becomes the status of
# this function. The lock is dropped when the subshell closes fd 9.
with_meta_lock() {
  ensure_dirs
  (
    if ! flock -x -w 5 9; then
      die "could not acquire meta-lock within 5s" 1
    fi
    "$@"
  ) 9>"$META_LOCK"
}
|
||||
|
||||
# read_lockfile LOCKFILE
# Print the first line of LOCKFILE ("<pid> <epoch> <path>"). Prints nothing
# when the file is missing or unreadable. Always returns 0.
read_lockfile() {
  local lockfile="$1"
  if [ ! -f "$lockfile" ]; then
    return 0
  fi
  head -n 1 "$lockfile" 2>/dev/null || true
}
|
||||
|
||||
# ── commands ─────────────────────────────────────────────────────────────────
|
||||
# _try_create_lockfile LOCKFILE EPOCH PATH
# Atomically create LOCKFILE containing "<pid> <epoch> <path>". Fails (without
# touching the file) when LOCKFILE already exists: the write happens under
# noclobber in a subshell so the option does not leak to the caller.
_try_create_lockfile() {
  (set -o noclobber; printf '%s %s %s\n' "$$" "$2" "$3" > "$1") 2>/dev/null
}

# _cmd_acquire VAULT_REL_PATH
# Take the advisory lock for a vault path.
# Returns: 0 when acquired, 75 (EX_TEMPFAIL) when the lock is held and
#          younger than STALE_AFTER_SEC, or when a competing writer won the
#          re-create after a reap.
# An existing lock is reaped and re-taken when its record is empty or
# corrupt, or when its age is >= STALE_AFTER_SEC. The holder PID is never
# consulted.
_cmd_acquire() {
  local path="$1"
  validate_path "$path"
  ensure_dirs

  local digest lf now
  digest=$(sha1_of "$path")
  lf="${LOCK_DIR}/${digest}.lock"
  now=$(now_epoch)

  # Cheap path first: noclobber-atomic create.
  if _try_create_lockfile "$lf" "$now" "$path"; then
    return 0
  fi

  # Lockfile already exists: decide by age, not PID.
  local existing eepoch
  existing=$(read_lockfile "$lf")
  eepoch=$(printf '%s' "$existing" | awk '{print $2}')

  local stale=0
  case "$eepoch" in
    '' | *[!0-9]*)
      # Empty, unreadable or corrupt record: treat as stale.
      stale=1
      ;;
    *)
      # 10# forces base 10 so a zero-padded epoch is not parsed as octal.
      if [ "$((now - 10#$eepoch))" -ge "$STALE_AFTER_SEC" ]; then
        stale=1
      fi
      ;;
  esac

  if [ "$stale" -eq 1 ]; then
    rm -f -- "$lf"
    if _try_create_lockfile "$lf" "$now" "$path"; then
      return 0
    fi
  fi

  # Held and not yet stale by age, or lost the re-create race: refuse.
  return 75
}
|
||||
|
||||
# _cmd_release VAULT_REL_PATH
# Drop the advisory lock for a vault path. The lockfile is removed no matter
# who created it (acquire and release normally run as separate processes, so
# a PID ownership check could never pass). Releasing an unheld path is a
# no-op. Returns 0.
_cmd_release() {
  local target="$1"
  validate_path "$target"
  ensure_dirs
  local lockfile="${LOCK_DIR}/$(sha1_of "$target").lock"
  rm -f "$lockfile"
  return 0
}
|
||||
|
||||
# _cmd_list
# Print one line per lockfile in LOCK_DIR:
#   pid=<pid> age=<seconds>s path=<vault-rel-path>
# A lockfile whose epoch field is missing or non-numeric is still listed,
# with age=unknown, instead of aborting the listing on an arithmetic error.
# Lockfiles with an empty first line are skipped. Returns 0.
_cmd_list() {
  ensure_dirs
  local lf rec pid epoch path now age
  now=$(now_epoch)
  for lf in "$LOCK_DIR"/*.lock; do
    # With no matches the glob stays literal; -f filters that out.
    [ -f "$lf" ] || continue
    rec=$(read_lockfile "$lf")
    [ -n "$rec" ] || continue
    pid=$(printf '%s' "$rec" | awk '{print $1}')
    epoch=$(printf '%s' "$rec" | awk '{print $2}')
    # cut (not awk) so that spaces inside the path are preserved verbatim.
    path=$(printf '%s' "$rec" | cut -d' ' -f3-)
    case "$epoch" in
      '' | *[!0-9]*) age="unknown" ;;
      # 10# forces base 10 so a zero-padded epoch is not parsed as octal.
      *) age="$((now - 10#$epoch))s" ;;
    esac
    printf 'pid=%s age=%s path=%s\n' "$pid" "$age" "$path"
  done
  return 0
}
|
||||
|
||||
# _cmd_clear_stale MAX_AGE
# Admin sweep over LOCK_DIR. Removes every lockfile that
#   - has an empty/unreadable record, or
#   - has a missing or non-numeric epoch (corrupt record), or
#   - names a PID that is not alive, or
#   - is older than MAX_AGE seconds.
# Prints the number of lockfiles removed. Returns 0.
# MAX_AGE must be a non-negative integer; anything else is a usage error
# (exit 2 via die) rather than being silently ignored by the age test.
# NOTE(review): because the recorded PID is usually that of a short-lived
# acquire process, the dead-PID rule will typically match young locks too —
# confirm this is the intended sweep policy.
_cmd_clear_stale() {
  local max_age="$1"
  case "$max_age" in
    '' | *[!0-9]*) die "max-age must be a non-negative integer: $max_age" ;;
  esac
  ensure_dirs

  local removed=0
  local now lf rec pid epoch reap
  now=$(now_epoch)
  for lf in "$LOCK_DIR"/*.lock; do
    # With no matches the glob stays literal; -f filters that out.
    [ -f "$lf" ] || continue
    rec=$(read_lockfile "$lf")
    reap=0
    if [ -z "$rec" ]; then
      reap=1
    else
      pid=$(printf '%s' "$rec" | awk '{print $1}')
      epoch=$(printf '%s' "$rec" | awk '{print $2}')
      case "$epoch" in
        '' | *[!0-9]*)
          reap=1
          ;;
        *)
          # 10# forces base 10 so a zero-padded epoch is not parsed as octal.
          if ! is_alive "$pid" || [ "$((now - 10#$epoch))" -gt "$max_age" ]; then
            reap=1
          fi
          ;;
      esac
    fi
    if [ "$reap" -eq 1 ]; then
      rm -f -- "$lf"
      removed=$((removed + 1))
    fi
  done
  echo "$removed"
  return 0
}
|
||||
|
||||
# _cmd_peek VAULT_REL_PATH
# Report who holds the lock for a vault path: prints the lockfile record
# ("<pid> <epoch> <path>") when a lockfile exists, otherwise "unheld".
# No lockfile is created or removed. Returns 0.
_cmd_peek() {
  local target="$1"
  validate_path "$target"
  ensure_dirs
  local lockfile="${LOCK_DIR}/$(sha1_of "$target").lock"
  if [ -f "$lockfile" ]; then
    local holder
    holder=$(read_lockfile "$lockfile")
    echo "$holder"
    return 0
  fi
  echo "unheld"
  return 0
}
|
||||
|
||||
# ── arg parsing (flags accepted in any position) ─────────────────────────────
|
||||
# print_usage
# Print this script's header comment as help text: every line after the
# shebang up to the first non-comment line, with the leading "# " removed.
# Reading up to the end of the comment block (rather than a fixed line range)
# keeps the Usage and Exit-codes sections in the output as the header grows.
print_usage() {
  awk 'NR == 1 { next } /^#/ { sub(/^# ?/, ""); print; next } { exit }' "$0"
}

if [ $# -lt 1 ]; then
  print_usage
  exit 2
fi

CMD=""
ARGS=()
MAX_AGE_OVERRIDE=""
while [ $# -gt 0 ]; do
  case "$1" in
    --stale-after-sec | --max-age)
      # Both flags take one non-negative integer. Check it here so a missing
      # or malformed value is a clear usage error instead of an unbound
      # variable or a failed numeric test later on.
      [ $# -ge 2 ] || die "$1 requires a value"
      case "$2" in
        '' | *[!0-9]*) die "$1 expects a non-negative integer, got: $2" ;;
      esac
      if [ "$1" = "--stale-after-sec" ]; then
        STALE_AFTER_SEC="$2"
      else
        MAX_AGE_OVERRIDE="$2"
      fi
      shift 2
      ;;
    -h | --help)
      print_usage
      exit 0
      ;;
    --)
      # Everything after "--" is positional, even if it starts with "-".
      shift
      ARGS+=("$@")
      break
      ;;
    -*)
      die "unknown flag: $1"
      ;;
    *)
      # First bare word is the command; the rest are its arguments.
      if [ -z "$CMD" ]; then
        CMD="$1"
      else
        ARGS+=("$1")
      fi
      shift
      ;;
  esac
done
|
||||
|
||||
# Command dispatch. Every command runs under the meta-lock so that
# acquire/release/clear-stale cannot interleave with one another.
if [ -z "$CMD" ]; then
  die "no command given"
fi

case "$CMD" in
  acquire | release | peek)
    # The three path-taking commands share one shape: _cmd_<name> <path>.
    if [ "${#ARGS[@]}" -lt 1 ]; then
      die "$CMD needs a path"
    fi
    with_meta_lock "_cmd_${CMD}" "${ARGS[0]}"
    ;;
  list)
    with_meta_lock _cmd_list
    ;;
  clear-stale)
    # Precedence: --max-age flag, then first positional argument, then 3600.
    MAX="${MAX_AGE_OVERRIDE:-${ARGS[0]:-3600}}"
    with_meta_lock _cmd_clear_stale "$MAX"
    ;;
  *)
    die "unknown command: $CMD (try acquire|release|list|clear-stale|peek)"
    ;;
esac
|
||||
@@ -0,0 +1,252 @@
|
||||
#!/usr/bin/env python3
|
||||
"""wiki-mode.py — read + route helper for v1.8 methodology modes.
|
||||
|
||||
Single source of truth for "which mode is this vault in" and "where should
|
||||
new content of type X be filed under mode Y." Consumed by:
|
||||
|
||||
- skills/wiki-ingest/SKILL.md (where to file new source/entity/concept pages)
|
||||
- skills/save/SKILL.md (where to file session notes)
|
||||
- skills/autoresearch/SKILL.md (where to file research output)
|
||||
- bin/setup-mode.sh (writes .vault-meta/mode.json)
|
||||
|
||||
If `.vault-meta/mode.json` is absent → mode = "generic" → behavior identical
|
||||
to v1.7. No skill needs to special-case the missing-config path.
|
||||
|
||||
CLI:
|
||||
wiki-mode.py get # print current mode (default: generic)
|
||||
wiki-mode.py config # print full config JSON
|
||||
wiki-mode.py route TYPE NAME # print suggested path for new content
|
||||
# TYPE: source|entity|concept|session|research
|
||||
wiki-mode.py set MODE # write mode (lyt|para|zettelkasten|generic)
|
||||
wiki-mode.py id # mint a Zettelkasten ID (timestamp)
|
||||
wiki-mode.py templates # list per-mode template files
|
||||
|
||||
Exit codes:
|
||||
0 — success
|
||||
2 — usage error
|
||||
3 — invalid mode string
|
||||
4 — invalid content type
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
import tempfile
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
VAULT_ROOT = Path(__file__).resolve().parent.parent
|
||||
META_DIR = VAULT_ROOT / ".vault-meta"
|
||||
MODE_PATH = META_DIR / "mode.json"
|
||||
|
||||
VALID_MODES = ("generic", "lyt", "para", "zettelkasten")
|
||||
VALID_TYPES = ("source", "entity", "concept", "session", "research")
|
||||
|
||||
DEFAULT_CONFIG = {
|
||||
"schema_version": 1,
|
||||
"mode": "generic",
|
||||
"configured_at": None,
|
||||
"config": {
|
||||
"lyt": {
|
||||
"moc_folder": "wiki/mocs/",
|
||||
"notes_folder": "wiki/notes/",
|
||||
},
|
||||
"para": {
|
||||
"projects_folder": "wiki/projects/",
|
||||
"areas_folder": "wiki/areas/",
|
||||
"resources_folder": "wiki/resources/",
|
||||
"archives_folder": "wiki/archives/",
|
||||
},
|
||||
"zettelkasten": {
|
||||
"id_format": "YYYYMMDDHHMMSSffffff",
|
||||
"no_folders": True,
|
||||
"root_folder": "wiki/",
|
||||
},
|
||||
"generic": {
|
||||
"sources_folder": "wiki/sources/",
|
||||
"entities_folder": "wiki/entities/",
|
||||
"concepts_folder": "wiki/concepts/",
|
||||
"sessions_folder": "wiki/sessions/",
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
def load_config():
    """Return the effective mode configuration.

    Starts from a deep copy of DEFAULT_CONFIG, so neither the merge below nor
    later edits by the caller can alter the module-level defaults. When
    MODE_PATH exists, its ``mode``, ``configured_at`` and per-mode ``config``
    sections are layered on top; only sections already present in the
    defaults are merged, and only when the override is a dict.

    If MODE_PATH is absent the defaults (mode='generic') are returned. If it
    cannot be read, decoded or parsed, or its top-level JSON value is not an
    object, an error is printed to stderr and the defaults are returned.
    """
    import copy  # local import: not part of this module's top-level imports

    merged = copy.deepcopy(DEFAULT_CONFIG)
    if not MODE_PATH.is_file():
        return merged
    try:
        loaded = json.loads(MODE_PATH.read_text(encoding="utf-8"))
        if not isinstance(loaded, dict):
            raise ValueError("top-level JSON value is not an object")
    except (ValueError, OSError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"ERR: cannot parse {MODE_PATH}: {e}", file=sys.stderr)
        print("     Falling back to mode=generic. Re-run `bash bin/setup-mode.sh` to fix.",
              file=sys.stderr)
        return merged

    # Merge with defaults so partially-configured files still work.
    merged["mode"] = loaded.get("mode", "generic")
    merged["configured_at"] = loaded.get("configured_at")
    overrides = loaded.get("config", {})
    if isinstance(overrides, dict):
        for section, values in overrides.items():
            if section in merged["config"] and isinstance(values, dict):
                merged["config"][section].update(values)
    return merged
|
||||
|
||||
|
||||
def save_config(cfg):
    """Write ``cfg`` to MODE_PATH as indented UTF-8 JSON via temp file + rename.

    META_DIR is created if missing. The payload is written to a
    ``mode.*.tmp`` file inside META_DIR and then moved over MODE_PATH with
    ``Path.replace``. If writing or the rename raises, the temp file is
    removed (best effort) and the exception is re-raised.
    """
    META_DIR.mkdir(parents=True, exist_ok=True)
    text = json.dumps(cfg, indent=2, ensure_ascii=False) + "\n"
    handle, tmp_name = tempfile.mkstemp(prefix="mode.", suffix=".tmp", dir=str(META_DIR))
    staging = Path(tmp_name)
    try:
        with open(handle, "w", encoding="utf-8") as out:
            out.write(text)
        staging.replace(MODE_PATH)
    except Exception:
        try:
            staging.unlink()
        except OSError:
            pass
        raise
|
||||
|
||||
|
||||
def slugify(name):
    """Turn ``name`` into a filesystem-safe slug.

    Runs of word characters and hyphens are kept; everything between them
    becomes a single hyphen, so 'v1.8 launch! prep?' -> 'v1-8-launch-prep'.
    Case is left as-is and Unicode word characters (CJK, accented Latin,
    Cyrillic, ...) are preserved. Leading/trailing hyphens are stripped and an
    empty result becomes 'untitled'.
    """
    kept_runs = re.findall(r"[\w\-]+", name)
    joined = "-".join(kept_runs)
    collapsed = re.sub(r"-+", "-", joined).strip("-")
    if not collapsed:
        return "untitled"
    return collapsed
|
||||
|
||||
|
||||
def safe_name(name):
    """Sanitize a display name while keeping its case and spaces.

    Removes forward slashes, backslashes and the control characters
    U+0000-U+001F, then strips leading dots and hyphens, so the result cannot
    contain a path separator, start as a hidden file, or look like a flag.
    An empty result becomes 'untitled'.
    """
    kept = "".join(
        ch for ch in name
        if ch not in "/\\" and ord(ch) > 0x1F
    )
    trimmed = kept.lstrip(".-")
    return trimmed if trimmed else "untitled"
|
||||
|
||||
|
||||
def mint_zettel_id():
    """Return the current UTC time as a 20-digit ID: YYYYMMDDHHMMSSffffff.

    The value does not depend on the local timezone and sorts
    lexicographically in time order. The six trailing digits are
    microseconds, which keeps two calls within the same second from yielding
    the same ID (the earlier 14-digit form collided in that case).
    """
    stamp = datetime.now(timezone.utc)
    return f"{stamp:%Y%m%d%H%M%S%f}"
|
||||
|
||||
|
||||
def route_path(mode, content_type, name, cfg):
    """Return the suggested vault-relative path for new content under `mode`.

    Args:
        mode: one of "generic", "lyt", "para", "zettelkasten".
        content_type: one of VALID_TYPES.
        name: human-readable content name; it is slugified, or sanitized with
            case and spaces preserved, depending on mode and type.
        cfg: config dict shaped like DEFAULT_CONFIG (per-mode folders live
            under cfg["config"][<mode>]).

    Raises:
        SystemExit(4): unknown content type.
        SystemExit(3): unknown mode.

    Configured folders are joined with exactly one "/" before the filename,
    so a hand-edited value such as "wiki/sources" (no trailing slash) still
    yields "wiki/sources/<name>.md". An empty folder value is left empty.
    """
    if content_type not in VALID_TYPES:
        raise SystemExit(4)

    def folder(section, key):
        # Folder values can come from a user-edited mode.json; tolerate a
        # missing trailing slash.
        value = cfg["config"][section][key]
        if value and not value.endswith("/"):
            value += "/"
        return value

    slug = slugify(name)
    raw = safe_name(name)  # case + spaces preserved, but path-traversal stripped

    if mode == "generic":
        mapping = {
            "source": folder("generic", "sources_folder") + slug + ".md",
            # Capitalization is preserved for entities and concepts.
            "entity": folder("generic", "entities_folder") + raw + ".md",
            "concept": folder("generic", "concepts_folder") + raw + ".md",
            "session": folder("generic", "sessions_folder") + slug + ".md",
            # Research output is filed alongside concepts.
            "research": folder("generic", "concepts_folder") + raw + ".md",
        }
        return mapping[content_type]

    if mode == "lyt":
        # All atomic notes flat in the notes folder; routing is the same
        # regardless of type.
        return folder("lyt", "notes_folder") + slug + ".md"

    if mode == "para":
        resources = folder("para", "resources_folder")
        projects = folder("para", "projects_folder")
        mapping = {
            # New sources land in a generic 'incoming' bucket under resources;
            # the user sorts them into specific topics via their own workflow.
            "source": resources + "incoming/" + slug + ".md",
            "entity": resources + "people/" + raw + ".md",
            "concept": resources + "concepts/" + raw + ".md",
            # Session notes land in projects/inbox/; user reroutes to
            # specific projects.
            "session": projects + "inbox/" + slug + ".md",
            # Research gets its own sub-folder named after the slug.
            "research": resources + slug + "/" + slug + ".md",
        }
        return mapping[content_type]

    if mode == "zettelkasten":
        # Flat layout: a freshly minted timestamp ID prefixes the slug.
        zid = mint_zettel_id()
        return folder("zettelkasten", "root_folder") + f"{zid}-{slug}.md"

    raise SystemExit(3)
|
||||
|
||||
|
||||
def main():
    """Command-line entry point for the methodology-mode router.

    Subcommands:
        get        Print the current mode from the loaded config.
        config     Print the full config as indented JSON.
        route      Print the suggested vault path for a content type and name,
                   optionally previewing under a different --mode.
        set        Store a new mode plus a UTC `configured_at` timestamp and
                   save the config.
        id         Print a freshly minted Zettelkasten ID.
        templates  List the Markdown template files, relative to VAULT_ROOT.

    Returns:
        0 on success; 2 when the templates directory is missing or when no
        subcommand branch matched.
    """
    cli = argparse.ArgumentParser(description="Methodology-mode router for v1.8 Compound Vault.")
    commands = cli.add_subparsers(dest="cmd", required=True)

    commands.add_parser("get", help="Print current mode")
    commands.add_parser("config", help="Print full config JSON")

    route_cmd = commands.add_parser("route", help="Print suggested vault path for new content")
    route_cmd.add_argument("type", choices=VALID_TYPES)
    route_cmd.add_argument("name", help="Content name (will be slugified for filenames)")
    route_cmd.add_argument(
        "--mode",
        choices=VALID_MODES,
        default=None,
        help="Preview routing under MODE without writing mode.json (default: use current vault mode)",
    )

    set_cmd = commands.add_parser("set", help="Write a mode to .vault-meta/mode.json")
    set_cmd.add_argument("mode", choices=VALID_MODES)

    commands.add_parser("id", help="Mint a Zettelkasten ID (timestamp)")
    commands.add_parser("templates", help="List per-mode template files")

    args = cli.parse_args()
    # The config is loaded for every subcommand, including those that do not
    # read it.
    cfg = load_config()
    command = args.cmd

    if command == "get":
        print(cfg["mode"])
    elif command == "config":
        print(json.dumps(cfg, indent=2, ensure_ascii=False))
    elif command == "route":
        # An explicit --mode overrides the stored mode for this call only.
        effective_mode = args.mode or cfg["mode"]
        print(route_path(effective_mode, args.type, args.name, cfg))
    elif command == "set":
        cfg["mode"] = args.mode
        cfg["configured_at"] = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
        save_config(cfg)
        print(f"mode set: {args.mode}")
    elif command == "id":
        print(mint_zettel_id())
    elif command == "templates":
        templates_dir = VAULT_ROOT / "skills" / "wiki-mode" / "templates"
        if not templates_dir.is_dir():
            print(f"ERR: templates dir missing: {templates_dir}", file=sys.stderr)
            return 2
        for template in sorted(templates_dir.rglob("*.md")):
            print(str(template.relative_to(VAULT_ROOT)))
    else:
        # Fallback for a command with no branch above.
        return 2

    return 0
|
||||
|
||||
|
||||
# Run the CLI when executed as a script; main()'s return value becomes the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
Reference in New Issue
Block a user