add claude-obsidian

2026-05-28 10:57:16 +09:00
parent 1b07531a45
commit 72dad72703
205 changed files with 41703 additions and 80 deletions
@@ -0,0 +1,167 @@
+#!/usr/bin/env python3
+"""baseline-v16.py — simulate the v1.6 hot→index→drill retrieval chain.
+
+Exists ONLY for benchmarking v1.7's hybrid retrieval against the legacy
+v1.6 behavior. Not used by any v1.7 skill; not feature-gated; not part of
+the regular vault workflow.
+
+The v1.6 query path (per skills/wiki-query/SKILL.md before v1.7):
+  1. Read wiki/hot.md (always; quick context)
+  2. Read wiki/index.md (scan for descriptions matching query terms)
+  3. Read top-N pages cited in the index whose entries best match query
+  4. Caller synthesizes answer
+
+This script approximates that path by:
+  1. Tokenizing the query (same stopword-filtered, Unicode-aware tokenizer as bm25-index.py)
+  2. Scoring each *.md page in wiki/ by the count of distinct query terms it contains
+     (case-insensitive substring on the full file body; no semantic matching)
+  3. Returning top-K pages by score, where each page's score also adds:
+     a. Presence in hot.md (boost +5)
+     b. Presence in index.md (boost +3)
+     c. Total raw term-occurrence count (0.01 per occurrence, as a tie-break)
+
+The simulation is intentionally simple — it represents what a human or a
+basic agent does when reading hot/index "by hand" without any retrieval
+infrastructure. Anything fancier would not be a fair v1.6 baseline.
+
+Usage:
+  baseline-v16.py "your query" [--top 5]
+  baseline-v16.py "query" --top 5 --json   # output as JSON (default: text)
+
+Exit codes:
+  0 — success
+  2 — usage error
+  3 — wiki directory missing
+"""
+
+import argparse
+import json
+import re
+import sys
+from collections import Counter
+from pathlib import Path
+
+VAULT_ROOT = Path(__file__).resolve().parent.parent  # parent of the directory that holds this script
+WIKI_DIR = VAULT_ROOT / "wiki"
+HOT_PATH = WIKI_DIR / "hot.md"  # pages whose name appears here earn HOT_BOOST
+INDEX_PATH = WIKI_DIR / "index.md"  # pages whose name appears here earn INDEX_BOOST
+
+# Mirror bm25-index.py's tokenizer + stopword list so comparisons are fair.
+STOPWORDS = frozenset("""
+a an and are as at be by for from has have he her him his i if in is it its
+of on or that the their them they this to was were will with you your
+""".split())
+
+# Mirrors bm25-index.py's Unicode-aware tokenizer (v1.7.2; closes M2).
+TOKEN_RE = re.compile(r"\w[\w'\-]*", re.UNICODE)  # a word char, then word chars, apostrophes or hyphens
+
+HOT_BOOST = 5.0  # added to a page's score when hot.md mentions it
+INDEX_BOOST = 3.0  # added to a page's score when index.md mentions it
+
+EXIT_OK = 0
+EXIT_USAGE = 2  # not referenced in the visible code; argparse itself exits with 2 on bad arguments
+EXIT_NO_WIKI = 3  # used by page_paths() when WIKI_DIR is not a directory
+
+
+def tokenize(text):
+    return [t.lower() for t in TOKEN_RE.findall(text)
+            if t.lower() not in STOPWORDS and len(t) > 1]
+
+
+def page_paths():
+    if not WIKI_DIR.is_dir():
+        print(f"ERR: no wiki directory at {WIKI_DIR}", file=sys.stderr)
+        sys.exit(EXIT_NO_WIKI)
+    return sorted(p for p in WIKI_DIR.rglob("*.md")
+                  if not any(part.startswith(".") for part in p.parts))
+
+
+def score_page(page_path, query_terms_set, query_terms_counter):
+    """Score by distinct-query-term-presence + boost if cited in hot/index.
+
+    Returns (score, distinct_matches, total_occurrences).
+    """
+    try:
+        body = page_path.read_text(encoding="utf-8", errors="replace").lower()
+    except OSError:
+        return (0.0, 0, 0)
+
+    distinct = sum(1 for term in query_terms_set if term in body)
+    total = sum(body.count(term) for term in query_terms_set)
+    score = float(distinct) + 0.01 * total  # distinct dominates; total is tiebreak
+
+    # Hot-cache boost: if the page is referenced by name in hot.md
+    if HOT_PATH.is_file():
+        try:
+            hot_body = HOT_PATH.read_text(encoding="utf-8", errors="replace")
+            page_stem = page_path.stem
+            if page_stem in hot_body or str(page_path.relative_to(VAULT_ROOT)) in hot_body:
+                score += HOT_BOOST
+        except OSError:
+            pass
+
+    # Index boost: page is cited in index.md
+    if INDEX_PATH.is_file():
+        try:
+            index_body = INDEX_PATH.read_text(encoding="utf-8", errors="replace")
+            page_stem = page_path.stem
+            if page_stem in index_body or str(page_path.relative_to(VAULT_ROOT)) in index_body:
+                score += INDEX_BOOST
+        except OSError:
+            pass
+
+    return (score, distinct, total)
+
+
+def baseline_query(query, top_k=5):
+    """Return list of {path, score, distinct, total} for top-K pages."""
+    terms = tokenize(query)
+    if not terms:
+        return []
+    terms_set = set(terms)
+    terms_counter = Counter(terms)
+
+    scored = []
+    for p in page_paths():
+        score, distinct, total = score_page(p, terms_set, terms_counter)
+        if score > 0:
+            scored.append({
+                "path": str(p.relative_to(VAULT_ROOT)),
+                "score": round(score, 4),
+                "distinct_terms": distinct,
+                "total_occurrences": total,
+            })
+
+    scored.sort(key=lambda d: d["score"], reverse=True)
+    return scored[:top_k]
+
+
+def main():
+    parser = argparse.ArgumentParser(description="v1.6 baseline retrieval simulator.")
+    parser.add_argument("query", help="Natural-language query")
+    parser.add_argument("--top", type=int, default=5, help="Top-K results")
+    parser.add_argument("--json", action="store_true", help="Output as JSON")
+    args = parser.parse_args()
+
+    results = baseline_query(args.query, top_k=args.top)
+
+    if args.json:
+        print(json.dumps({
+            "query": args.query,
+            "strategy": "baseline-v1.6:hot+index+keyword",
+            "top_k": args.top,
+            "candidates": results,
+        }, indent=2))
+    else:
+        if not results:
+            print("(no matches)")
+        else:
+            print(f"v1.6 baseline for: {args.query!r}")
+            for i, r in enumerate(results, 1):
+                print(f"  {i}. {r['path']}  score={r['score']}  distinct={r['distinct_terms']}  occ={r['total_occurrences']}")
+
+    return EXIT_OK
+
+
+if __name__ == "__main__":
+    sys.exit(main())  # main()'s return value becomes the process exit status