add claude-obsidian
This commit is contained in:
@@ -0,0 +1,293 @@
|
||||
#!/usr/bin/env python3
|
||||
"""bm25-index.py — sparse BM25 inverted index over contextualized wiki chunks.
|
||||
|
||||
Pure stdlib (no rank_bm25 dep). Standard Okapi BM25 with k1=1.5, b=0.75.
|
||||
Indexes the `contextualized_text` field of every chunk under .vault-meta/chunks/,
|
||||
emits a single JSON file at .vault-meta/bm25/index.json with the schema below.
|
||||
|
||||
Concurrency:
|
||||
- Locks .vault-meta/.bm25.lock (fcntl exclusive) around any index write.
|
||||
- Atomic .tmp + rename for the index file.
|
||||
|
||||
Index schema (.vault-meta/bm25/index.json):
|
||||
{
|
||||
"schema_version": 1,
|
||||
"params": {"k1": 1.5, "b": 0.75},
|
||||
"doc_count": 1234,
|
||||
"avg_dl": 487.5,
|
||||
"updated_at": "2026-05-17T...",
|
||||
"vocab": {
|
||||
"<term>": {"df": 17, "postings": [["c-000001:0", 3], ["c-000042:2", 1], ...]}
|
||||
},
|
||||
"docs": {
|
||||
"<chunk_id>": {"path": ".vault-meta/chunks/c-000001/chunk-000.json", "dl": 487}
|
||||
}
|
||||
}
|
||||
|
||||
Chunk id format: "<page-address>:<chunk-index>" (e.g. "c-000042:3").
|
||||
|
||||
Tokenization: lowercase, collapse whitespace, drop punctuation except in-word
|
||||
apostrophes and hyphens. ASCII-only stopwords filtered (small list; favors
|
||||
recall over precision).
|
||||
|
||||
Query interface (used by retrieve.py at query time):
|
||||
bm25-index.py query "your text here" [--top 20]
|
||||
|
||||
Build interface:
|
||||
bm25-index.py build # full rebuild (always; incremental is v1.7.x scope)
|
||||
bm25-index.py stats # print index stats
|
||||
|
||||
Exit codes:
|
||||
0 — success
|
||||
1 — lock acquisition failed
|
||||
2 — usage error
|
||||
3 — index file missing or corrupt (query mode)
|
||||
4 — chunks directory missing
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import fcntl
|
||||
import json
|
||||
import math
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from collections import Counter, defaultdict
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
VAULT_ROOT = Path(__file__).resolve().parent.parent
|
||||
META_DIR = VAULT_ROOT / ".vault-meta"
|
||||
CHUNKS_DIR = META_DIR / "chunks"
|
||||
BM25_DIR = META_DIR / "bm25"
|
||||
INDEX_PATH = BM25_DIR / "index.json"
|
||||
LOCK_PATH = META_DIR / ".bm25.lock"
|
||||
|
||||
K1 = 1.5
|
||||
B = 0.75
|
||||
|
||||
# Small high-frequency-stopword list (English). Conservative — keep recall high.
|
||||
STOPWORDS = frozenset("""
|
||||
a an and are as at be by for from has have he her him his i if in is it its
|
||||
of on or that the their them they this to was were will with you your
|
||||
""".split())
|
||||
|
||||
# Unicode-aware tokenizer (v1.7.2; closes audit M2). \w under re.UNICODE
|
||||
# matches letters and digits from any script (CJK, Cyrillic, accented Latin,
|
||||
# Devanagari, etc.) plus underscore. Internal apostrophes and hyphens are
|
||||
# preserved so "user's" and "well-formed" stay single tokens. Pure-symbol or
|
||||
# pure-emoji tokens fail the leading \w anchor and are correctly skipped.
|
||||
TOKEN_RE = re.compile(r"\w[\w'\-]*", re.UNICODE)
|
||||
|
||||
EXIT_OK = 0
|
||||
EXIT_LOCK = 1
|
||||
EXIT_USAGE = 2
|
||||
EXIT_INDEX_MISSING = 3
|
||||
EXIT_NO_CHUNKS = 4
|
||||
|
||||
|
||||
def log(msg):
|
||||
print(msg, file=sys.stderr)
|
||||
|
||||
|
||||
def tokenize(text):
|
||||
"""Lowercase, strip punctuation, drop stopwords. Returns a list of terms."""
|
||||
return [t.lower() for t in TOKEN_RE.findall(text)
|
||||
if t.lower() not in STOPWORDS and len(t) > 1]
|
||||
|
||||
|
||||
def acquire_lock():
|
||||
META_DIR.mkdir(parents=True, exist_ok=True)
|
||||
fd = os.open(str(LOCK_PATH), os.O_CREAT | os.O_WRONLY, 0o644)
|
||||
try:
|
||||
fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
|
||||
except OSError:
|
||||
os.close(fd)
|
||||
log("ERR: could not acquire bm25 lock")
|
||||
sys.exit(EXIT_LOCK)
|
||||
return fd
|
||||
|
||||
|
||||
def release_lock(fd):
|
||||
try:
|
||||
fcntl.flock(fd, fcntl.LOCK_UN)
|
||||
finally:
|
||||
os.close(fd)
|
||||
|
||||
|
||||
def discover_chunks():
|
||||
"""Yield (chunk_id, path, contextualized_text) for every chunk on disk.
|
||||
|
||||
The yielded `path` is relative to the directory two levels above CHUNKS_DIR
|
||||
(i.e. .vault-meta/chunks/<addr>/ → relative to the vault root). This works
|
||||
both in production (CHUNKS_DIR is `<vault>/.vault-meta/chunks`) and when
|
||||
tests monkey-patch CHUNKS_DIR to a sandbox `<tmp>/.vault-meta/chunks`.
|
||||
"""
|
||||
if not CHUNKS_DIR.is_dir():
|
||||
log(f"ERR: no chunks directory at {CHUNKS_DIR}")
|
||||
sys.exit(EXIT_NO_CHUNKS)
|
||||
rel_root = CHUNKS_DIR.parent.parent
|
||||
for chunk_file in sorted(CHUNKS_DIR.glob("*/chunk-*.json")):
|
||||
try:
|
||||
data = json.loads(chunk_file.read_text(encoding="utf-8"))
|
||||
except (json.JSONDecodeError, OSError) as e:
|
||||
log(f" skip (unreadable): {chunk_file} — {e}")
|
||||
continue
|
||||
address = data.get("page_address")
|
||||
idx = data.get("chunk_index")
|
||||
text = data.get("contextualized_text") or data.get("raw_text", "")
|
||||
if address is None or idx is None:
|
||||
continue
|
||||
chunk_id = f"{address}:{idx}"
|
||||
rel_path = str(chunk_file.relative_to(rel_root))
|
||||
yield chunk_id, rel_path, text
|
||||
|
||||
|
||||
def build_index():
|
||||
docs = {}
|
||||
df = Counter()
|
||||
postings = defaultdict(list)
|
||||
|
||||
for chunk_id, rel_path, text in discover_chunks():
|
||||
tokens = tokenize(text)
|
||||
tf = Counter(tokens)
|
||||
docs[chunk_id] = {"path": rel_path, "dl": len(tokens)}
|
||||
for term, count in tf.items():
|
||||
df[term] += 1
|
||||
postings[term].append([chunk_id, count])
|
||||
|
||||
if not docs:
|
||||
log("WARN: no chunks indexed")
|
||||
return None
|
||||
|
||||
avg_dl = sum(d["dl"] for d in docs.values()) / len(docs)
|
||||
vocab = {term: {"df": df[term], "postings": postings[term]}
|
||||
for term in sorted(df.keys())}
|
||||
|
||||
return {
|
||||
"schema_version": 1,
|
||||
"params": {"k1": K1, "b": B},
|
||||
"doc_count": len(docs),
|
||||
"avg_dl": avg_dl,
|
||||
"updated_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
|
||||
"vocab": vocab,
|
||||
"docs": docs,
|
||||
}
|
||||
|
||||
|
||||
def write_index(index):
|
||||
BM25_DIR.mkdir(parents=True, exist_ok=True)
|
||||
tmp = INDEX_PATH.with_suffix(f".{os.getpid()}.tmp")
|
||||
try:
|
||||
tmp.write_text(json.dumps(index, ensure_ascii=False), encoding="utf-8")
|
||||
os.replace(tmp, INDEX_PATH)
|
||||
finally:
|
||||
if tmp.exists():
|
||||
tmp.unlink(missing_ok=True)
|
||||
|
||||
|
||||
def load_index():
|
||||
if not INDEX_PATH.is_file():
|
||||
log(f"ERR: no index at {INDEX_PATH}. Run `bm25-index.py build` first.")
|
||||
sys.exit(EXIT_INDEX_MISSING)
|
||||
try:
|
||||
return json.loads(INDEX_PATH.read_text(encoding="utf-8"))
|
||||
except (json.JSONDecodeError, OSError) as e:
|
||||
log(f"ERR: index corrupt: {e}")
|
||||
sys.exit(EXIT_INDEX_MISSING)
|
||||
|
||||
|
||||
def query(text, top_k=20):
|
||||
idx = load_index()
|
||||
vocab = idx["vocab"]
|
||||
docs = idx["docs"]
|
||||
params = idx["params"]
|
||||
avg_dl = idx["avg_dl"]
|
||||
N = idx["doc_count"]
|
||||
k1 = params["k1"]
|
||||
b = params["b"]
|
||||
|
||||
qterms = tokenize(text)
|
||||
if not qterms:
|
||||
return []
|
||||
|
||||
# Defensive guard (v1.7.2; closes audit L7): avg_dl can only be 0 if the
|
||||
# vocab is also empty (all chunks have zero tokens), in which case the
|
||||
# loop never enters this divide path. But future refactors could change
|
||||
# that invariant; the `or 1.0` keeps it safe by construction.
|
||||
avg_dl_safe = avg_dl or 1.0
|
||||
scores = defaultdict(float)
|
||||
for term in qterms:
|
||||
v = vocab.get(term)
|
||||
if not v:
|
||||
continue
|
||||
df = v["df"]
|
||||
idf = math.log(1 + (N - df + 0.5) / (df + 0.5))
|
||||
for cid, cnt in v["postings"]:
|
||||
dl = docs[cid]["dl"]
|
||||
denom = cnt + k1 * (1 - b + b * dl / avg_dl_safe)
|
||||
scores[cid] += idf * (cnt * (k1 + 1)) / denom
|
||||
|
||||
ranked = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:top_k]
|
||||
return [
|
||||
{
|
||||
"chunk_id": cid,
|
||||
"score": round(score, 6),
|
||||
"path": docs[cid]["path"],
|
||||
}
|
||||
for cid, score in ranked
|
||||
]
|
||||
|
||||
|
||||
def stats():
|
||||
idx = load_index()
|
||||
print(json.dumps({
|
||||
"doc_count": idx["doc_count"],
|
||||
"avg_dl": round(idx["avg_dl"], 2),
|
||||
"vocab_size": len(idx["vocab"]),
|
||||
"updated_at": idx["updated_at"],
|
||||
"params": idx["params"],
|
||||
}, indent=2))
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="BM25 inverted index over wiki chunks.")
|
||||
sub = parser.add_subparsers(dest="cmd", required=True)
|
||||
|
||||
sub.add_parser("build", help="Build the index (full rebuild every time in v1.7).")
|
||||
|
||||
sp_query = sub.add_parser("query", help="Query the index.")
|
||||
sp_query.add_argument("text", help="Query text")
|
||||
sp_query.add_argument("--top", type=int, default=20, help="Top-K results")
|
||||
|
||||
sub.add_parser("stats", help="Print index stats.")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.cmd == "build":
|
||||
fd = acquire_lock()
|
||||
try:
|
||||
index = build_index()
|
||||
if index is None:
|
||||
log("Nothing to index.")
|
||||
return EXIT_OK
|
||||
write_index(index)
|
||||
log(f"Wrote {INDEX_PATH} docs={index['doc_count']} vocab={len(index['vocab'])} avg_dl={index['avg_dl']:.1f}")
|
||||
finally:
|
||||
release_lock(fd)
|
||||
return EXIT_OK
|
||||
|
||||
if args.cmd == "query":
|
||||
results = query(args.text, top_k=args.top)
|
||||
print(json.dumps(results, indent=2))
|
||||
return EXIT_OK
|
||||
|
||||
if args.cmd == "stats":
|
||||
stats()
|
||||
return EXIT_OK
|
||||
|
||||
return EXIT_USAGE
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
Reference in New Issue
Block a user