#!/usr/bin/env python3
"""baseline-v16.py — simulate the v1.6 hot→index→drill retrieval chain.

Exists ONLY for benchmarking v1.7's hybrid retrieval against the legacy v1.6
behavior. Not used by any v1.7 skill; not feature-gated; not part of the
regular vault workflow.

The v1.6 query path (per skills/wiki-query/SKILL.md before v1.7):
  1. Read wiki/hot.md (always; quick context)
  2. Read wiki/index.md (scan for descriptions matching query terms)
  3. Read top-N pages cited in the index whose entries best match query
  4. Caller synthesizes answer

This script approximates that path by:
  1. Tokenizing the query (stopword-filtered, Unicode-aware tokenizer kept in
     step with bm25-index.py)
  2. Scoring each *.md page in wiki/ by the count of distinct query terms it
     contains (case-insensitive substring on the full file body; no semantic
     matching)
  3. For every page that matched at least one term, adding to that score:
       a. +5 if the page's file stem appears in hot.md
       b. +3 if the page's file stem appears in index.md
       c. +0.01 per raw term occurrence, capped so it stays below 1.0 and
          can only separate pages with the same distinct-term count
  4. Returning the top-K pages by score

The simulation is intentionally simple — it represents what a human or a basic
agent does when reading hot/index "by hand" without any retrieval
infrastructure. Anything fancier would not be a fair v1.6 baseline.

Usage:
    baseline-v16.py "your query" [--top 5]
    baseline-v16.py "query" --top 5 --json   # output as JSON (default: text)

Exit codes:
    0 — success
    2 — usage error
    3 — wiki directory missing
"""

import argparse
import json
import re
import sys
from collections import Counter
from pathlib import Path

VAULT_ROOT = Path(__file__).resolve().parent.parent
WIKI_DIR = VAULT_ROOT / "wiki"
HOT_PATH = WIKI_DIR / "hot.md"
INDEX_PATH = WIKI_DIR / "index.md"

# Mirror bm25-index.py's tokenizer + stopword list so comparisons are fair.
STOPWORDS = frozenset("""
a an and are as at be by for from has have he her him his i if in is it its
of on or that the their them they this to was were will with you your
""".split())

# Mirrors bm25-index.py's Unicode-aware tokenizer (v1.7.2; closes M2).
TOKEN_RE = re.compile(r"\w[\w'\-]*", re.UNICODE)

HOT_BOOST = 5.0
INDEX_BOOST = 3.0

# Weight of one raw term occurrence, and the cap on how many occurrences may
# count. 0.01 * 99 < 1.0, so the occurrence component can never outweigh one
# additional distinct matching term.
OCCURRENCE_WEIGHT = 0.01
MAX_TIEBREAK_OCCURRENCES = 99

EXIT_OK = 0
EXIT_USAGE = 2
EXIT_NO_WIKI = 3


def tokenize(text):
    """Split *text* into lowercase tokens.

    Drops stopwords and tokens that are a single character long.
    """
    tokens = []
    for raw in TOKEN_RE.findall(text):
        lowered = raw.lower()
        if len(raw) > 1 and lowered not in STOPWORDS:
            tokens.append(lowered)
    return tokens


def page_paths():
    """Return the sorted *.md paths under WIKI_DIR, skipping dot-prefixed entries.

    Prints an error and exits with EXIT_NO_WIKI when WIKI_DIR is not a
    directory.
    """
    if not WIKI_DIR.is_dir():
        print(f"ERR: no wiki directory at {WIKI_DIR}", file=sys.stderr)
        sys.exit(EXIT_NO_WIKI)
    # Only inspect the components below WIKI_DIR: checking the absolute path
    # would exclude every page whenever the vault itself lives under a
    # dot-directory (e.g. ~/.vaults/...).
    return sorted(
        p for p in WIKI_DIR.rglob("*.md")
        if not any(part.startswith(".") for part in p.relative_to(WIKI_DIR).parts)
    )


def _read_reference(path):
    """Return the text of *path*, or "" when it is missing or unreadable."""
    try:
        return path.read_text(encoding="utf-8", errors="replace")
    except OSError:
        return ""


def score_page(page_path, query_terms_set, query_terms_counter,
               hot_body=None, index_body=None):
    """Score by distinct-query-term-presence + boost if cited in hot/index.

    Args:
        page_path: Path of the page to score.
        query_terms_set: Distinct lowercase query terms.
        query_terms_counter: Accepted for interface compatibility; not used
            by the scoring.
        hot_body: Text of hot.md. When None it is read from HOT_PATH.
        index_body: Text of index.md. When None it is read from INDEX_PATH.

    Returns:
        (score, distinct_matches, total_occurrences). A page that cannot be
        read, or that contains none of the query terms, scores (0.0, 0, 0)
        and receives no boost.
    """
    try:
        body = page_path.read_text(encoding="utf-8", errors="replace").lower()
    except OSError:
        return (0.0, 0, 0)

    occurrences = [body.count(term) for term in query_terms_set]
    distinct = sum(1 for count in occurrences if count)
    if distinct == 0:
        # A citation in hot/index must not turn a page with no matching term
        # into a candidate.
        return (0.0, 0, 0)
    total = sum(occurrences)

    # distinct dominates; total is tiebreak (capped so it stays below 1.0)
    score = float(distinct) + OCCURRENCE_WEIGHT * min(total, MAX_TIEBREAK_OCCURRENCES)

    if hot_body is None:
        hot_body = _read_reference(HOT_PATH)
    if index_body is None:
        index_body = _read_reference(INDEX_PATH)

    # A citation by relative path necessarily contains the file stem, so the
    # stem test alone covers both ways of referencing a page.
    stem = page_path.stem
    if stem in hot_body:
        score += HOT_BOOST
    if stem in index_body:
        score += INDEX_BOOST

    return (score, distinct, total)


def baseline_query(query, top_k=5):
    """Return the top-K pages for *query*, best score first.

    Each entry is a dict with the keys "path" (relative to VAULT_ROOT),
    "score" (rounded to 4 decimals), "distinct_terms" and
    "total_occurrences". Returns an empty list when the query has no usable
    terms or *top_k* is not positive.
    """
    terms = tokenize(query)
    if not terms or top_k <= 0:
        return []

    terms_set = set(terms)
    terms_counter = Counter(terms)

    # Read the two reference files once rather than once per page.
    hot_body = _read_reference(HOT_PATH)
    index_body = _read_reference(INDEX_PATH)

    scored = []
    for p in page_paths():
        score, distinct, total = score_page(
            p, terms_set, terms_counter,
            hot_body=hot_body, index_body=index_body,
        )
        if score > 0:
            scored.append({
                "path": str(p.relative_to(VAULT_ROOT)),
                "score": round(score, 4),
                "distinct_terms": distinct,
                "total_occurrences": total,
            })

    # list.sort is stable, so equal scores keep the path order of page_paths().
    scored.sort(key=lambda d: d["score"], reverse=True)
    return scored[:top_k]


def main(argv=None):
    """Run the command-line interface and return the process exit code.

    Args:
        argv: Argument list to parse; None means the process arguments.
    """
    parser = argparse.ArgumentParser(description="v1.6 baseline retrieval simulator.")
    parser.add_argument("query", help="Natural-language query")
    parser.add_argument("--top", type=int, default=5, help="Top-K results")
    parser.add_argument("--json", action="store_true", help="Output as JSON")
    args = parser.parse_args(argv)

    if args.top < 1:
        # parser.error prints the usage text and exits with status 2.
        parser.error("--top must be a positive integer")

    results = baseline_query(args.query, top_k=args.top)

    if args.json:
        print(json.dumps({
            "query": args.query,
            "strategy": "baseline-v1.6:hot+index+keyword",
            "top_k": args.top,
            "candidates": results,
        }, indent=2))
    else:
        if not results:
            print("(no matches)")
        else:
            print(f"v1.6 baseline for: {args.query!r}")
            for i, r in enumerate(results, 1):
                print(f"  {i}. {r['path']}  score={r['score']}  distinct={r['distinct_terms']}  occ={r['total_occurrences']}")
    return EXIT_OK


if __name__ == "__main__":
    sys.exit(main())