add claude-obsidian
This commit is contained in:
@@ -0,0 +1,167 @@
|
||||
#!/usr/bin/env python3
|
||||
"""baseline-v16.py — simulate the v1.6 hot→index→drill retrieval chain.
|
||||
|
||||
Exists ONLY for benchmarking v1.7's hybrid retrieval against the legacy
|
||||
v1.6 behavior. Not used by any v1.7 skill; not feature-gated; not part of
|
||||
the regular vault workflow.
|
||||
|
||||
The v1.6 query path (per skills/wiki-query/SKILL.md before v1.7):
|
||||
1. Read wiki/hot.md (always; quick context)
|
||||
2. Read wiki/index.md (scan for descriptions matching query terms)
|
||||
3. Read top-N pages cited in the index whose entries best match query
|
||||
4. Caller synthesizes answer
|
||||
|
||||
This script approximates that path by:
|
||||
1. Tokenizing the query (same stopword-filtered, Unicode-aware tokenizer as bm25-index.py)
|
||||
2. Scoring each *.md page in wiki/ by the count of distinct query terms it contains
|
||||
(case-insensitive substring on the full file body; no semantic matching)
|
||||
3. Returning top-K pages by score, where the score is the distinct-term count plus:
a. +5 if the page's name appears in hot.md
b. +3 if the page's name appears in index.md
c. 0.01 * the total raw term-occurrence count (tiebreak between equal pages)
|
||||
|
||||
The simulation is intentionally simple — it represents what a human or a
|
||||
basic agent does when reading hot/index "by hand" without any retrieval
|
||||
infrastructure. Anything fancier would not be a fair v1.6 baseline.
|
||||
|
||||
Usage:
|
||||
baseline-v16.py "your query" [--top 5]
|
||||
baseline-v16.py "query" --top 5 --json # output as JSON (default: text)
|
||||
|
||||
Exit codes:
|
||||
0 — success
|
||||
2 — usage error
|
||||
3 — wiki directory missing
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
from collections import Counter
|
||||
from pathlib import Path
|
||||
|
||||
# The script lives one directory below the vault root; every other location
# is derived from that.
_SCRIPT_DIR = Path(__file__).resolve().parent
VAULT_ROOT = _SCRIPT_DIR.parent
WIKI_DIR = VAULT_ROOT.joinpath("wiki")
# The two files the v1.6 query path consults before drilling into pages.
HOT_PATH = WIKI_DIR.joinpath("hot.md")
INDEX_PATH = WIKI_DIR.joinpath("index.md")
|
||||
|
||||
# Stopword list and tokenizer are kept identical to bm25-index.py so that the
# baseline and the BM25 index see exactly the same query terms.
_STOPWORD_TEXT = """
a an and are as at be by for from has have he her him his i if in is it its
of on or that the their them they this to was were will with you your
"""
STOPWORDS = frozenset(_STOPWORD_TEXT.split())

# Unicode-aware token pattern, mirroring bm25-index.py (v1.7.2; closes M2):
# a word character followed by any run of word characters, apostrophes or
# hyphens.
TOKEN_RE = re.compile(r"\w[\w'\-]*", flags=re.UNICODE)
|
||||
|
||||
# Score bonuses for pages named in hot.md / index.md respectively.
HOT_BOOST, INDEX_BOOST = 5.0, 3.0

# Process exit codes: success, usage error, wiki directory missing.
EXIT_OK, EXIT_USAGE, EXIT_NO_WIKI = 0, 2, 3
|
||||
|
||||
|
||||
def tokenize(text):
    """Split *text* into lowercase search terms.

    Tokens are extracted with TOKEN_RE, lowercased, and dropped when they are
    a single character long or appear in STOPWORDS. Order and duplicates are
    preserved.
    """
    kept = []
    for raw in TOKEN_RE.findall(text):
        lowered = raw.lower()
        # Length is measured on the raw token, as the filter always has.
        if len(raw) <= 1 or lowered in STOPWORDS:
            continue
        kept.append(lowered)
    return kept
|
||||
|
||||
|
||||
def page_paths(wiki_dir=None):
    """Return every markdown page under the wiki directory, sorted by path.

    Args:
        wiki_dir: Directory to scan. Defaults to the module-level WIKI_DIR.

    Returns:
        A sorted list of Path objects for each ``*.md`` entry found
        recursively, excluding anything inside (or named as) a dot-prefixed
        path component below the wiki directory.

    Exits the process with EXIT_NO_WIKI (after printing to stderr) when the
    directory does not exist.
    """
    root = WIKI_DIR if wiki_dir is None else Path(wiki_dir)
    if not root.is_dir():
        print(f"ERR: no wiki directory at {root}", file=sys.stderr)
        sys.exit(EXIT_NO_WIKI)

    def is_hidden(path):
        # Only components *below* the wiki root count. Checking the absolute
        # path would hide every page whenever the vault itself lives under a
        # dotted directory (e.g. ~/.vaults/notes/wiki).
        return any(part.startswith(".") for part in path.relative_to(root).parts)

    return sorted(p for p in root.rglob("*.md") if not is_hidden(p))
|
||||
|
||||
|
||||
def _is_cited(page_path, reference_path):
    """Return True if the text of *reference_path* contains the page's stem.

    A missing or unreadable reference file counts as "not cited". The match is
    a case-sensitive substring test on the file stem; the vault-relative path
    always contains the stem, so testing the stem alone covers both forms.
    """
    if not reference_path.is_file():
        return False
    try:
        reference_text = reference_path.read_text(encoding="utf-8", errors="replace")
    except OSError:
        return False
    return page_path.stem in reference_text


def score_page(page_path, query_terms_set, query_terms_counter):
    """Score one page against the query terms.

    The base score is the number of distinct query terms found in the page
    (case-insensitive substring match on the whole file) plus 0.01 per raw
    occurrence as a tiebreak. A page that matches at least one term then gains
    HOT_BOOST if it is named in hot.md and INDEX_BOOST if it is named in
    index.md.

    Args:
        page_path: Path of the markdown page to score.
        query_terms_set: Distinct query terms; expected to be lowercase, since
            the page body is lowercased before matching.
        query_terms_counter: Accepted for interface compatibility; not used.

    Returns:
        (score, distinct_matches, total_occurrences). All three are zero when
        the page cannot be read or contains none of the query terms.
    """
    try:
        body = page_path.read_text(encoding="utf-8", errors="replace").lower()
    except OSError:
        return (0.0, 0, 0)

    occurrences = [body.count(term) for term in query_terms_set]
    distinct = sum(1 for count in occurrences if count)
    total = sum(occurrences)

    # A page with no matching term is not a candidate: without this guard the
    # boosts below would give every cited page a positive score for any query.
    if distinct == 0:
        return (0.0, 0, 0)

    score = float(distinct) + 0.01 * total  # distinct dominates; total is tiebreak

    if _is_cited(page_path, HOT_PATH):
        score += HOT_BOOST
    if _is_cited(page_path, INDEX_PATH):
        score += INDEX_BOOST

    return (score, distinct, total)
|
||||
|
||||
|
||||
def baseline_query(query, top_k=5):
    """Return the top-K pages for *query* under the v1.6 baseline scoring.

    Args:
        query: Natural-language query string.
        top_k: Maximum number of results; zero or a negative value yields an
            empty list.

    Returns:
        A list of dicts with keys ``path`` (relative to VAULT_ROOT), ``score``
        (rounded to 4 places), ``distinct_terms`` and ``total_occurrences``,
        ordered by descending score. Only pages containing at least one query
        term are included. Returns an empty list when the query has no usable
        terms after tokenization.
    """
    terms = tokenize(query)
    if not terms:
        return []
    terms_set = set(terms)
    # score_page takes the counter positionally, so it is still supplied.
    terms_counter = Counter(terms)

    scored = []
    for page in page_paths():
        score, distinct, total = score_page(page, terms_set, terms_counter)
        # Require a real term match: a hot/index boost alone must not turn an
        # unrelated page into a candidate.
        if distinct > 0:
            scored.append({
                "path": str(page.relative_to(VAULT_ROOT)),
                "score": round(score, 4),
                "distinct_terms": distinct,
                "total_occurrences": total,
            })

    # The sort is stable, so equal scores keep the path order of page_paths().
    scored.sort(key=lambda entry: entry["score"], reverse=True)
    # Clamp so a negative top_k cannot slice results off the tail instead.
    return scored[:max(top_k, 0)]
|
||||
|
||||
|
||||
def main(argv=None):
    """Command-line entry point: run one baseline query and print the results.

    Args:
        argv: Argument list to parse, excluding the program name. Defaults to
            ``sys.argv[1:]`` when None.

    Returns:
        EXIT_OK on success. Invalid arguments, including a ``--top`` value
        below 1, terminate through argparse with exit status 2.
    """
    parser = argparse.ArgumentParser(description="v1.6 baseline retrieval simulator.")
    parser.add_argument("query", help="Natural-language query")
    parser.add_argument("--top", type=int, default=5, help="Top-K results")
    parser.add_argument("--json", action="store_true", help="Output as JSON")
    args = parser.parse_args(argv)

    # A zero or negative K can never produce a meaningful result list, so
    # report it as a usage error rather than printing misleading output.
    if args.top < 1:
        parser.error("--top must be a positive integer")

    results = baseline_query(args.query, top_k=args.top)

    if args.json:
        payload = {
            "query": args.query,
            "strategy": "baseline-v1.6:hot+index+keyword",
            "top_k": args.top,
            "candidates": results,
        }
        print(json.dumps(payload, indent=2))
    elif not results:
        print("(no matches)")
    else:
        print(f"v1.6 baseline for: {args.query!r}")
        for i, r in enumerate(results, 1):
            print(f" {i}. {r['path']} score={r['score']} distinct={r['distinct_terms']} occ={r['total_occurrences']}")

    return EXIT_OK
|
||||
|
||||
|
||||
# Run as a script: propagate main()'s return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
Reference in New Issue
Block a user