add claude-obsidian
Tests / Hermetic test suite (push) Has been cancelled
Tests / Skill frontmatter validation (push) Has been cancelled

This commit is contained in:
김경종
2026-05-28 10:57:16 +09:00
parent 1b07531a45
commit 72dad72703
205 changed files with 41703 additions and 80 deletions
+167
View File
@@ -0,0 +1,167 @@
#!/usr/bin/env python3
"""baseline-v16.py — simulate the v1.6 hot→index→drill retrieval chain.
Exists ONLY for benchmarking v1.7's hybrid retrieval against the legacy
v1.6 behavior. Not used by any v1.7 skill; not feature-gated; not part of
the regular vault workflow.
The v1.6 query path (per skills/wiki-query/SKILL.md before v1.7):
1. Read wiki/hot.md (always; quick context)
2. Read wiki/index.md (scan for descriptions matching query terms)
3. Read top-N pages cited in the index whose entries best match query
4. Caller synthesizes answer
This script approximates that path by:
1. Tokenizing the query (same stopword-filtered, Unicode-aware tokenizer as bm25-index.py)
2. Scoring each *.md page in wiki/ by the count of distinct query terms it contains
(case-insensitive substring on the full file body; no semantic matching)
3. Returning top-K pages by score, where the score additionally includes:
a. A +5 boost when the page's name appears in hot.md
b. A +3 boost when the page's name appears in index.md
c. A small bonus of 0.01 per raw term occurrence (intended as a tiebreak)
The simulation is intentionally simple — it represents what a human or a
basic agent does when reading hot/index "by hand" without any retrieval
infrastructure. Anything fancier would not be a fair v1.6 baseline.
Usage:
baseline-v16.py "your query" [--top 5]
baseline-v16.py "query" --top 5 --json # output as JSON (default: text)
Exit codes:
0 — success
2 — usage error
3 — wiki directory missing
"""
import argparse
import functools
import json
import re
import sys
from collections import Counter
from pathlib import Path
# --- Vault layout ------------------------------------------------------------
# The vault root is the grandparent of this script's resolved location; the
# wiki, its hot cache and its index are addressed relative to that root.
VAULT_ROOT = Path(__file__).resolve().parent.parent
WIKI_DIR = VAULT_ROOT.joinpath("wiki")
HOT_PATH = WIKI_DIR.joinpath("hot.md")
INDEX_PATH = WIKI_DIR.joinpath("index.md")

# --- Tokenization ------------------------------------------------------------
# Both the stopword list and the token pattern are meant to stay in step with
# bm25-index.py so that the two retrieval strategies are compared fairly.
STOPWORDS = frozenset(
    (
        "a an and are as at be by for from has have he her him his i if in is it its "
        "of on or that the their them they this to was were will with you your"
    ).split()
)

# Unicode-aware pattern (v1.7.2; closes M2): a word character followed by any
# run of word characters, apostrophes or hyphens.
TOKEN_RE = re.compile(r"\w[\w'\-]*", flags=re.UNICODE)

# --- Scoring boosts ----------------------------------------------------------
HOT_BOOST = 5.0
INDEX_BOOST = 3.0

# --- Process exit codes ------------------------------------------------------
EXIT_OK, EXIT_USAGE, EXIT_NO_WIKI = 0, 2, 3
def tokenize(text):
    """Split *text* into lowercase tokens for keyword matching.

    Candidate tokens are found with ``TOKEN_RE``. A candidate is discarded
    when its raw (pre-lowercasing) form is a single character or when its
    lowercase form is listed in ``STOPWORDS``. Order and duplicates of the
    surviving tokens are preserved.
    """
    tokens = []
    for raw in TOKEN_RE.findall(text):
        lowered = raw.lower()
        # The length test is on the raw token: lowercasing can change length.
        if len(raw) <= 1 or lowered in STOPWORDS:
            continue
        tokens.append(lowered)
    return tokens
def page_paths():
    """Return all Markdown pages under ``WIKI_DIR`` as a sorted list of paths.

    Pages located in, or named as, a dot-prefixed entry *inside* the wiki
    tree are skipped.

    If ``WIKI_DIR`` is not a directory, an error is printed to stderr and the
    process exits with ``EXIT_NO_WIKI``.
    """
    if not WIKI_DIR.is_dir():
        print(f"ERR: no wiki directory at {WIKI_DIR}", file=sys.stderr)
        sys.exit(EXIT_NO_WIKI)
    # Only components below WIKI_DIR are inspected. Checking the absolute
    # path would drop every page whenever the vault itself sits under a
    # dot-directory (for example ~/.claude/...).
    return sorted(
        page
        for page in WIKI_DIR.rglob("*.md")
        if not any(
            part.startswith(".") for part in page.relative_to(WIKI_DIR).parts
        )
    )
@functools.lru_cache(maxsize=None)
def _citation_text(path):
    """Return the text of a citation source such as hot.md, or None.

    None means the file is missing or unreadable. Results are cached per
    path so that scoring many pages reads each source once per process.
    """
    try:
        return path.read_text(encoding="utf-8", errors="replace")
    except OSError:
        return None


def _is_cited(page_path, source_path):
    """Tell whether the page's stem occurs anywhere in the citation source."""
    text = _citation_text(source_path)
    # A stem match subsumes a match on the page's relative path, because the
    # relative path always contains the stem as a substring.
    return text is not None and page_path.stem in text


def score_page(page_path, query_terms_set, query_terms_counter):
    """Score one page against the query terms.

    The base score is the number of distinct query terms found in the
    lowercased page body (plain substring search) plus 0.01 per raw
    occurrence. ``HOT_BOOST`` and ``INDEX_BOOST`` are added when the page's
    stem appears in hot.md and index.md respectively.

    Boosts are applied only to pages that match at least one query term, so
    a page that is merely cited never outranks a page that actually matches.

    Args:
        page_path: Path of the page to score.
        query_terms_set: Iterable of distinct, lowercase query terms.
        query_terms_counter: Unused; retained so existing callers keep working.

    Returns:
        A ``(score, distinct_matches, total_occurrences)`` tuple. It is
        ``(0.0, 0, 0)`` when the page cannot be read or matches no term.
    """
    try:
        body = page_path.read_text(encoding="utf-8", errors="replace").lower()
    except OSError:
        return (0.0, 0, 0)

    occurrences = [body.count(term) for term in query_terms_set]
    distinct = sum(1 for count in occurrences if count)
    total = sum(occurrences)
    if distinct == 0:
        return (0.0, 0, 0)

    # NOTE(review): the occurrence bonus is meant as a tiebreak, but at 100+
    # total occurrences it outweighs one distinct-term match.
    score = float(distinct) + 0.01 * total
    if _is_cited(page_path, HOT_PATH):
        score += HOT_BOOST
    if _is_cited(page_path, INDEX_PATH):
        score += INDEX_BOOST
    return (score, distinct, total)
def baseline_query(query, top_k=5):
    """Return the best-scoring wiki pages for *query*.

    Args:
        query: Natural-language query string.
        top_k: Maximum number of results; zero or a negative value yields
            an empty list.

    Returns:
        A list of dicts sorted by descending score, each with the keys
        ``path`` (relative to the vault root), ``score`` (rounded to four
        decimals), ``distinct_terms`` and ``total_occurrences``. The list is
        empty when the query has no usable tokens.
    """
    terms = tokenize(query)
    if not terms:
        return []
    terms_set = set(terms)
    terms_counter = Counter(terms)

    scored = []
    for page in page_paths():
        score, distinct, total = score_page(page, terms_set, terms_counter)
        # Keep only pages that contain at least one query term; a page whose
        # score comes from citation boosts alone is not a match.
        if distinct <= 0:
            continue
        scored.append({
            "path": str(page.relative_to(VAULT_ROOT)),
            "score": round(score, 4),
            "distinct_terms": distinct,
            "total_occurrences": total,
        })

    # list.sort is stable, so equal scores keep the sorted-path order.
    scored.sort(key=lambda entry: entry["score"], reverse=True)
    # Clamp so a negative top_k cannot slice from the end of the list.
    return scored[:max(top_k, 0)]
def main():
    """Parse the command line, run the baseline query and print the results.

    Output is JSON when ``--json`` is given, otherwise a short text listing
    (or ``(no matches)``).

    Returns:
        ``EXIT_OK`` on success. Invalid arguments, including ``--top`` below
        1, end the process through argparse with exit status 2.
    """
    parser = argparse.ArgumentParser(description="v1.6 baseline retrieval simulator.")
    parser.add_argument("query", help="Natural-language query")
    parser.add_argument("--top", type=int, default=5, help="Top-K results")
    parser.add_argument("--json", action="store_true", help="Output as JSON")
    args = parser.parse_args()
    # Zero or negative K has no meaningful result set; report it as a usage
    # error rather than printing a truncated or empty listing.
    if args.top < 1:
        parser.error("--top must be a positive integer")

    results = baseline_query(args.query, top_k=args.top)
    if args.json:
        print(json.dumps({
            "query": args.query,
            "strategy": "baseline-v1.6:hot+index+keyword",
            "top_k": args.top,
            "candidates": results,
        }, indent=2))
    elif not results:
        print("(no matches)")
    else:
        print(f"v1.6 baseline for: {args.query!r}")
        for i, r in enumerate(results, 1):
            print(f" {i}. {r['path']} score={r['score']} distinct={r['distinct_terms']} occ={r['total_occurrences']}")
    return EXIT_OK


if __name__ == "__main__":
    sys.exit(main())