add claude-obsidian
Tests / Hermetic test suite (push) Has been cancelled
Tests / Skill frontmatter validation (push) Has been cancelled

This commit is contained in:
김경종
2026-05-28 10:57:16 +09:00
parent 1b07531a45
commit 72dad72703
205 changed files with 41703 additions and 80 deletions
+312
View File
@@ -0,0 +1,312 @@
#!/usr/bin/env python3
"""boundary-score.py — DragonScale Mechanism 4: boundary-first autoresearch scorer.
Reads `wiki/**/*.md`, builds a wikilink graph, and emits per-page boundary
scores to stdout (text) or as JSON for tooling.
boundary_score(p) = (out_degree(p) - in_degree(p)) * recency_weight(p)
- out_degree(p): count of distinct wikilinks in p that resolve to a
scoreable page (scoreable = non-meta, non-fold, non-excluded).
- in_degree(p): count of distinct scoreable pages that link to p.
- recency_weight(p): exp(-days_since_updated / RECENCY_HALFLIFE_DAYS).
No floor; very old pages approach zero weight, which is the intended
semantic of "frontier" (recently-touched and outward-pointing).
High score = the page points at many things, is pointed at by few, and
has been touched recently. That is a vault frontier page. Low or
negative score = hub / integrated page.
Feature-gated opt-in: autoresearch only invokes this when DragonScale
setup is detected. Safe to run standalone even without DragonScale set
up (reads wiki/ only; never writes).
This script is intentionally stdout-only. There is no `--report PATH`
equivalent to `tiling-check.py --report` because the helper is small
enough to pipe directly (`./scripts/boundary-score.py --json | jq ...`)
and keeping it read-only removes a write-path attack surface.
Usage:
boundary-score.py # top-10 frontier, text
boundary-score.py --top N # top N frontier
boundary-score.py --json # JSON output
boundary-score.py --page PATH # score for a single page
boundary-score.py --include-score-zero # include pages with score=0
Exit codes:
0 success
2 usage error
"""
import argparse
import json
import math
import re
import sys
from datetime import date, datetime, timezone
from pathlib import Path
VAULT_ROOT = Path(__file__).resolve().parent.parent
WIKI_DIR = VAULT_ROOT / "wiki"
EXCLUDE_TYPES = {"meta", "fold"}
EXCLUDE_FILENAMES = {
"_index.md", "index.md", "log.md", "hot.md", "overview.md",
"dashboard.md", "Wiki Map.md", "getting-started.md",
}
EXCLUDE_PATH_PREFIXES = ("wiki/folds/", "wiki/meta/")
RECENCY_HALFLIFE_DAYS = 30.0
# No recency floor: a truly stale page should NOT dominate the frontier
# ranking, even if its out-degree is high. The exponential decay takes
# weight toward zero for year-old pages, which is the intended semantic
# of "frontier" (recently-touched and outward-pointing).
DEFAULT_TOP = 10
MAX_BODY_BYTES = 256 * 1024
# CommonMark-ish fence tracking: opening fence records (char, length);
# a closing fence must use the SAME char with SAME-OR-LONGER run length.
# Tilde fences (~~~) are supported alongside backtick fences (```). Indented
# code blocks (4+ spaces) are NOT filtered; in Obsidian usage, indented
# bullets commonly contain wikilinks and should count as edges.
FRONTMATTER_RE = re.compile(r"^---\n(.*?)\n---\n", re.DOTALL)
TYPE_RE = re.compile(r"^type:\s*(\S+)", re.MULTILINE)
UPDATED_RE = re.compile(r"^updated:\s*([0-9]{4}-[0-9]{2}-[0-9]{2})", re.MULTILINE)
CREATED_RE = re.compile(r"^created:\s*([0-9]{4}-[0-9]{2}-[0-9]{2})", re.MULTILINE)
TITLE_RE = re.compile(r'^title:\s*"?([^"\n]+?)"?\s*$', re.MULTILINE)
# Obsidian wikilinks: [[Target]] or [[Target|Alias]] or [[Target#Heading]]
WIKILINK_RE = re.compile(r"\[\[([^\]|#]+)(?:#[^\]|]+)?(?:\|[^\]]+)?\]\]")
EXIT_OK = 0
EXIT_USAGE = 2
def log(msg: str) -> None:
print(msg, file=sys.stderr)
def parse_frontmatter(text: str) -> tuple[dict, str]:
m = FRONTMATTER_RE.match(text)
if not m:
return {}, text
fm_raw = m.group(1)
body = text[m.end():]
fm: dict = {}
for key, regex in (("type", TYPE_RE), ("updated", UPDATED_RE),
("created", CREATED_RE), ("title", TITLE_RE)):
tm = regex.search(fm_raw)
if tm:
fm[key] = tm.group(1).strip().strip('"').strip("'")
return fm, body
def included(path: Path, fm: dict) -> bool:
if path.is_symlink():
return False
try:
resolved = path.resolve(strict=True)
resolved.relative_to(VAULT_ROOT.resolve())
except (OSError, ValueError):
return False
rel = path.relative_to(VAULT_ROOT).as_posix()
if path.name in EXCLUDE_FILENAMES:
return False
for prefix in EXCLUDE_PATH_PREFIXES:
if rel.startswith(prefix):
return False
if fm.get("type") in EXCLUDE_TYPES:
return False
return True
def days_since(date_str: str | None) -> float:
"""Return days since the given YYYY-MM-DD string, or a large sentinel if missing."""
if not date_str:
return 10_000.0
try:
d = date.fromisoformat(date_str)
except ValueError:
return 10_000.0
delta = (date.today() - d).days
return max(0.0, float(delta))
def recency_weight(days: float,
halflife: float = RECENCY_HALFLIFE_DAYS) -> float:
return math.exp(-days / halflife)
_FENCE_RE = re.compile(r"^(\s*)(`{3,}|~{3,})")
def extract_wikilinks(body: str) -> set[str]:
"""Extract unique link targets (without alias or heading suffix) from the body.
Skips wikilinks inside fenced code blocks so documentation examples
(including in this repo's own skill files) do not pollute the graph.
Fence handling: backtick AND tilde fences, with length tracking per
CommonMark: the opening run sets (char, min_len); the closing line
must use the SAME char with a run of SAME-OR-LONGER length. Indented
code blocks (4+ spaces) are intentionally NOT filtered — indented
bullets in Obsidian often contain wikilinks.
"""
cleaned: list[str] = []
fence_char: str | None = None
fence_len: int = 0
for line in body.splitlines():
m = _FENCE_RE.match(line)
if m:
char = m.group(2)[0]
length = len(m.group(2))
if fence_char is None:
fence_char = char
fence_len = length
continue
if char == fence_char and length >= fence_len:
fence_char = None
fence_len = 0
continue
if fence_char is not None:
continue
cleaned.append(line)
scan = "\n".join(cleaned)
results: set[str] = set()
for m in WIKILINK_RE.finditer(scan):
raw = m.group(1).strip()
# Folder-qualified links like [[notes/Foo]] resolve to Foo.md by stem.
# This matches Obsidian default behavior for unique filenames.
stem = raw.rsplit("/", 1)[-1]
if stem:
results.add(stem)
return results
def collect_pages() -> dict[str, dict]:
"""Scan wiki/, return {title_key: {path, title, body, fm}} for scoreable pages.
`title_key` is the filename stem, which is what Obsidian wikilinks resolve
to by default. Assumes filenames are unique across the vault (enforced by
wiki-lint naming convention).
"""
pages: dict[str, dict] = {}
if not WIKI_DIR.is_dir():
return pages
for md in sorted(WIKI_DIR.rglob("*.md")):
try:
text = md.read_text(encoding="utf-8")
except (OSError, UnicodeDecodeError):
continue
if len(text.encode("utf-8")) > MAX_BODY_BYTES:
continue
fm, body = parse_frontmatter(text)
if not included(md, fm):
continue
title_key = md.stem # Obsidian wikilinks are filename-based
pages[title_key] = {
"path": md.relative_to(VAULT_ROOT).as_posix(),
"title": fm.get("title", title_key),
"body": body,
"fm": fm,
}
return pages
def build_graph(pages: dict[str, dict]) -> tuple[dict[str, set[str]], dict[str, set[str]]]:
"""Return (out_edges, in_edges) where each maps title_key -> set(title_key).
Only edges whose target is a known scoreable page are counted. Self-loops
are ignored.
"""
out_edges: dict[str, set[str]] = {k: set() for k in pages}
in_edges: dict[str, set[str]] = {k: set() for k in pages}
for src, entry in pages.items():
links = extract_wikilinks(entry["body"])
for target in links:
if target == src:
continue
if target in pages:
out_edges[src].add(target)
in_edges[target].add(src)
return out_edges, in_edges
def score_page(title_key: str,
pages: dict[str, dict],
out_edges: dict[str, set[str]],
in_edges: dict[str, set[str]]) -> dict:
entry = pages[title_key]
fm = entry["fm"]
out_deg = len(out_edges.get(title_key, set()))
in_deg = len(in_edges.get(title_key, set()))
date_str = fm.get("updated") or fm.get("created")
days = days_since(date_str)
rw = recency_weight(days)
score = (out_deg - in_deg) * rw
return {
"title": entry["title"],
"title_key": title_key,
"path": entry["path"],
"out_degree": out_deg,
"in_degree": in_deg,
"age_days": days,
"recency_weight": round(rw, 4),
"score": round(score, 4),
}
def run(top: int, want_json: bool, include_zero: bool, page_filter: str | None) -> int:
pages = collect_pages()
out_edges, in_edges = build_graph(pages)
scored = [score_page(k, pages, out_edges, in_edges) for k in pages]
if page_filter:
key = Path(page_filter).stem
matched = [s for s in scored if s["title_key"] == key or s["path"] == page_filter]
if not matched:
log(f"ERR: no scoreable page matches '{page_filter}'")
return EXIT_USAGE
scored = matched
else:
if not include_zero:
scored = [s for s in scored if s["score"] > 0.0]
scored.sort(key=lambda s: (-s["score"], s["title_key"]))
scored = scored[:top]
if want_json:
print(json.dumps({
"generated": datetime.now(timezone.utc).isoformat(timespec="seconds").replace("+00:00", "Z"),
"halflife_days": RECENCY_HALFLIFE_DAYS,
"page_count_scoreable": len(pages),
"results": scored,
}, indent=2))
else:
print("# Boundary Score Report")
print(f"scoreable pages: {len(pages)}; halflife: {RECENCY_HALFLIFE_DAYS} days")
if not scored:
print("\nNo positive-score frontier pages found.")
else:
print("")
print("| # | score | out | in | age_d | title | path |")
print("|---|---|---|---|---|---|---|")
for i, s in enumerate(scored, 1):
print(f"| {i} | {s['score']:.3f} | {s['out_degree']} | {s['in_degree']} | "
f"{int(s['age_days'])} | {s['title']} | {s['path']} |")
return EXIT_OK
def main(argv: list[str]) -> int:
p = argparse.ArgumentParser()
p.add_argument("--top", type=int, default=DEFAULT_TOP)
p.add_argument("--json", action="store_true")
p.add_argument("--include-score-zero", action="store_true",
help="Include pages whose score is zero or negative in the output")
p.add_argument("--page", default=None, help="Score a single page by path or stem")
args = p.parse_args(argv)
if args.top < 1:
log("ERR: --top must be >= 1")
return EXIT_USAGE
return run(args.top, args.json, args.include_score_zero, args.page)
if __name__ == "__main__":
sys.exit(main(sys.argv[1:]))