Files
MultiPhysicsVault/scripts/retrieve.py
T
김경종 72dad72703
Tests / Hermetic test suite (push) Has been cancelled
Tests / Skill frontmatter validation (push) Has been cancelled
add claude-obsidian
2026-05-28 10:57:16 +09:00

196 lines
7.2 KiB
Python

#!/usr/bin/env python3
"""retrieve.py — hybrid retrieval orchestrator for the Compound Vault.
Pipeline (v1.7):
query → bm25-index.py query (top-K candidates by BM25 over contextualized chunks)
→ rerank.py (cosine on nomic-embed-text vectors via ollama,
or no-op if ollama unavailable)
→ drill (return chunk pages with absolute paths so the
caller can Read them and synthesize)
Loads sibling scripts as Python modules (no subprocess overhead). Falls back
gracefully when index or rerank stage is missing:
- If .vault-meta/bm25/index.json is absent → exit 10 with friendly message;
caller falls back to v1.6 legacy
hot→index→drill read order.
- If .vault-meta/chunks/ is empty → exit 10 (same).
- If rerank stage cannot embed (no ollama) → no-op rerank, returns BM25 order.
Output schema (JSON to stdout):
{
"query": "...",
"strategy": "bm25+rerank:cosine:nomic-embed-text" | "bm25+noop-rerank",
"top_k": 5,
"candidates": [
{
"chunk_id": "c-000042:3",
"page_address": "c-000042",
"page_path": "wiki/concepts/Foo.md",
"absolute_path": "/abs/path/to/wiki/concepts/Foo.md",
"chunk_index": 3,
"bm25_score": 7.12,
"rerank_score": 0.81,
"rerank_source": "cosine:nomic-embed-text",
"snippet": "... first 200 chars of the chunk ..."
},
...
]
}
Usage:
retrieve.py "your query here" # standard: BM25 top-20, rerank to top-5
retrieve.py "query" --top 10 # change result count
retrieve.py "query" --no-rerank # skip rerank, BM25-only
retrieve.py "query" --explain # include per-stage diagnostics
Exit codes:
0 — success
2 — usage error
10 — feature not provisioned (no chunks or no BM25 index); caller falls back
"""
import argparse
import importlib.util
import json
import sys
from pathlib import Path
VAULT_ROOT = Path(__file__).resolve().parent.parent
SCRIPTS_DIR = VAULT_ROOT / "scripts"
META_DIR = VAULT_ROOT / ".vault-meta"
CHUNKS_DIR = META_DIR / "chunks"
BM25_INDEX = META_DIR / "bm25" / "index.json"
EXIT_OK = 0
EXIT_USAGE = 2
EXIT_NOT_PROVISIONED = 10
def log(msg):
print(msg, file=sys.stderr)
def import_sibling(name, filename):
"""Import a hyphenated sibling .py file as a Python module.
Wrapped in try/except (v1.7.2; closes audit M5) so a syntax error or
missing dependency in a sibling helper produces a friendly diagnostic
instead of a bare Python traceback at the user's first retrieve call.
"""
target = SCRIPTS_DIR / filename
if not target.is_file():
log(f"ERR: sibling helper {filename} not found at {target}")
log(" Run `bash bin/setup-retrieve.sh --check` to verify the install.")
sys.exit(EXIT_NOT_PROVISIONED)
try:
spec = importlib.util.spec_from_file_location(name, target)
mod = importlib.util.module_from_spec(spec)
spec.loader.exec_module(mod)
return mod
except (ImportError, SyntaxError, AttributeError) as e:
log(f"ERR: failed to import sibling helper {filename}: {type(e).__name__}: {e}")
log(" This likely means the helper script is corrupted or has a syntax error.")
log(" Run `python3 scripts/<helper>.py --help` directly to see the underlying error.")
log(" If it persists: re-clone the repo or check `git status` for local damage.")
sys.exit(EXIT_NOT_PROVISIONED)
def chunk_snippet(chunk_data, max_chars=200):
text = chunk_data.get("raw_text", "")
if len(text) <= max_chars:
return text
return text[:max_chars].rstrip() + ""
def main():
parser = argparse.ArgumentParser(description="Hybrid retrieval over the vault.")
parser.add_argument("query", help="Natural-language query")
parser.add_argument("--top", type=int, default=5, help="Final result count (post-rerank)")
parser.add_argument("--bm25-top", type=int, default=20,
help="Candidate count from BM25 (pre-rerank)")
parser.add_argument("--no-rerank", action="store_true",
help="Skip the rerank stage; return BM25-only")
parser.add_argument("--explain", action="store_true",
help="Include per-stage diagnostics in output")
parser.add_argument("--allow-remote-ollama", action="store_true",
help="Forwarded to rerank.py")
args = parser.parse_args()
if not BM25_INDEX.is_file():
log(f"ERR: no BM25 index at {BM25_INDEX}. Run `bash bin/setup-retrieve.sh` "
"to provision, or fall back to legacy hot→index→drill.")
return EXIT_NOT_PROVISIONED
if not CHUNKS_DIR.is_dir() or not any(CHUNKS_DIR.iterdir()):
log(f"ERR: no chunks at {CHUNKS_DIR}. Run "
"`python3 scripts/contextual-prefix.py --all` first.")
return EXIT_NOT_PROVISIONED
bm25 = import_sibling("bm25_index", "bm25-index.py")
reranker = import_sibling("rerank", "rerank.py")
bm25_hits = bm25.query(args.query, top_k=args.bm25_top)
log(f"bm25: {len(bm25_hits)} hits")
candidates = []
for h in bm25_hits:
chunk_path = VAULT_ROOT / h["path"]
try:
chunk = json.loads(chunk_path.read_text(encoding="utf-8"))
except (json.JSONDecodeError, OSError):
continue
candidates.append({
"chunk_id": h["chunk_id"],
"page_address": chunk.get("page_address"),
"page_path": chunk.get("page_path"),
"absolute_path": str((VAULT_ROOT / chunk.get("page_path", "")).resolve()),
"chunk_index": chunk.get("chunk_index"),
"bm25_score": h["score"],
"path": h["path"],
"snippet": chunk_snippet(chunk),
})
if args.no_rerank:
final = candidates[:args.top]
strategy = "bm25-only"
for c in final:
c["rerank_score"] = c["bm25_score"]
c["rerank_source"] = "skipped"
else:
final = reranker.rerank(
args.query, candidates, top_k=args.top,
allow_remote=args.allow_remote_ollama,
)
# Derive strategy from first candidate's rerank_source
first_src = (final[0].get("rerank_source") if final else "unknown")
strategy = f"bm25+rerank:{first_src}"
# Dedupe by page (we may have multiple chunks of the same page; collapse to best)
by_page = {}
for c in final:
addr = c.get("page_address")
if addr not in by_page or c.get("rerank_score", 0) > by_page[addr].get("rerank_score", 0):
by_page[addr] = c
deduped = list(by_page.values())
deduped.sort(key=lambda c: c.get("rerank_score", 0), reverse=True)
out = {
"query": args.query,
"strategy": strategy,
"top_k": args.top,
"candidates": deduped[:args.top],
}
if args.explain:
out["explain"] = {
"bm25_candidate_count": len(bm25_hits),
"post_rerank_count": len(final),
"deduped_count": len(deduped),
"bm25_top_param": args.bm25_top,
}
print(json.dumps(out, indent=2, ensure_ascii=False))
return EXIT_OK
if __name__ == "__main__":
sys.exit(main())