#!/usr/bin/env python3
# MultiPhysicsVault/scripts/boundary-score.py
"""boundary-score.py — DragonScale Mechanism 4: boundary-first autoresearch scorer.

Reads `wiki/**/*.md`, builds a wikilink graph, and emits per-page boundary
scores to stdout (text) or as JSON for tooling.

boundary_score(p) = (out_degree(p) - in_degree(p)) * recency_weight(p)

- out_degree(p): count of distinct wikilinks in p that resolve to a
  scoreable page (scoreable = non-meta, non-fold, non-excluded).
- in_degree(p):  count of distinct scoreable pages that link to p.
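- recency_weight(p): exp(-days_since_updated / RECENCY_HALFLIFE_DAYS),
  where days_since_updated is measured from the `updated:` frontmatter
  date (falling back to `created:`). Despite its name, the constant acts
  as an e-folding time: the weight is 1/e, not 1/2, after that many days.
  No floor; very old pages approach zero weight, which is the intended
  semantic of "frontier" (recently-touched and outward-pointing).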

High score = the page points at many things, is pointed at by few, and
has been touched recently. That is a vault frontier page. Low or
negative score = hub / integrated page.

Feature-gated opt-in: autoresearch only invokes this when DragonScale
setup is detected. Safe to run standalone even without DragonScale set
up (reads wiki/ only; never writes).

This script is intentionally stdout-only. There is no `--report PATH`
equivalent to `tiling-check.py --report` because the helper is small
enough to pipe directly (`./scripts/boundary-score.py --json | jq ...`)
and keeping it read-only removes a write-path attack surface.

Usage:
  boundary-score.py                         # top-10 frontier, text
  boundary-score.py --top N                 # top N frontier
  boundary-score.py --json                  # JSON output
  boundary-score.py --page PATH             # score for a single page
  boundary-score.py --include-score-zero    # also include pages with score <= 0

Exit codes:
  0  success
  2  usage error (including --page matching no scoreable page)
"""

import argparse
import json
import math
import re
import sys
from datetime import date, datetime, timezone
from pathlib import Path

# Vault layout: the vault root is two directory levels above this script's
# resolved location, and all scanned pages live under its wiki/ folder.
VAULT_ROOT = Path(__file__).resolve().parent.parent
WIKI_DIR = VAULT_ROOT.joinpath("wiki")

# Pages that are never scored: by frontmatter `type`, by exact filename, or
# by vault-relative POSIX path prefix.
EXCLUDE_TYPES = {"fold", "meta"}
EXCLUDE_FILENAMES = {
    "_index.md",
    "index.md",
    "log.md",
    "hot.md",
    "overview.md",
    "dashboard.md",
    "Wiki Map.md",
    "getting-started.md",
}
EXCLUDE_PATH_PREFIXES = (
    "wiki/folds/",
    "wiki/meta/",
)

# Decay constant (in days) of the exponential recency weight. There is
# deliberately no floor: a stale page must not dominate the frontier ranking
# just because its out-degree is high, so year-old pages decay toward zero.
RECENCY_HALFLIFE_DAYS = 30.0
# Number of rows reported when --top is not given.
DEFAULT_TOP = 10
# Pages whose UTF-8 size exceeds this many bytes are skipped entirely.
MAX_BODY_BYTES = 256 * 1024
# Fence-tracking rules used by extract_wikilinks (CommonMark-ish): an opening
# fence records (char, length); a closing fence must use the SAME char with a
# SAME-OR-LONGER run length. Tilde fences (~~~) are supported alongside
# backtick fences (```). Indented code blocks (4+ spaces) are NOT filtered;
# in Obsidian usage, indented bullets commonly contain wikilinks and should
# count as edges.

# YAML frontmatter: a leading `---` line, the raw block, and a closing `---`
# line. The closing delimiter may be the very last line of the file, so a
# frontmatter-only page without a trailing newline is still recognised.
FRONTMATTER_RE = re.compile(r"^---\n(.*?)\n---(?:\n|\Z)", re.DOTALL)
TYPE_RE = re.compile(r"^type:\s*(\S+)", re.MULTILINE)
# Dates may be written bare (`updated: 2024-01-02`) or quoted
# (`updated: "2024-01-02"`); the optional opening quote is skipped so quoted
# dates are not silently treated as missing.
UPDATED_RE = re.compile(r"""^updated:\s*["']?([0-9]{4}-[0-9]{2}-[0-9]{2})""", re.MULTILINE)
CREATED_RE = re.compile(r"""^created:\s*["']?([0-9]{4}-[0-9]{2}-[0-9]{2})""", re.MULTILINE)
TITLE_RE = re.compile(r'^title:\s*"?([^"\n]+?)"?\s*$', re.MULTILINE)
# Obsidian wikilinks: [[Target]] or [[Target|Alias]] or [[Target#Heading]].
# Group 1 is the target without heading or alias.
WIKILINK_RE = re.compile(r"\[\[([^\]|#]+)(?:#[^\]|]+)?(?:\|[^\]]+)?\]\]")

# Process exit statuses: 0 for success, 2 for a usage error.
EXIT_OK, EXIT_USAGE = 0, 2


def log(msg: str) -> None:
    """Write *msg* followed by a newline to stderr, keeping stdout clean."""
    sys.stderr.write(f"{msg}\n")


def parse_frontmatter(text: str) -> tuple[dict, str]:
    """Split a page into its recognised frontmatter fields and its body.

    Returns ``(fields, body)``. When *text* does not start with a frontmatter
    block, ``fields`` is empty and ``body`` is *text* unchanged. Only the
    ``type``, ``updated``, ``created`` and ``title`` keys are extracted; each
    value has surrounding whitespace and quote characters removed. Keys whose
    pattern does not match are simply absent from the result.
    """
    header = FRONTMATTER_RE.match(text)
    if header is None:
        return {}, text
    raw_block = header.group(1)
    patterns = {
        "type": TYPE_RE,
        "updated": UPDATED_RE,
        "created": CREATED_RE,
        "title": TITLE_RE,
    }
    hits = ((field, pattern.search(raw_block)) for field, pattern in patterns.items())
    fields: dict = {
        field: hit.group(1).strip().strip('"').strip("'")
        for field, hit in hits
        if hit
    }
    # Everything after the closing delimiter line is the page body.
    return fields, text[header.end():]


def included(path: Path, fm: dict) -> bool:
    """Return True when *path* is a scoreable page.

    A page is rejected when any of the following holds:

    - the path itself is a symlink;
    - it cannot be resolved, or its resolved location is outside the vault;
    - its filename is listed in ``EXCLUDE_FILENAMES``;
    - its vault-relative location starts with one of ``EXCLUDE_PATH_PREFIXES``;
    - its frontmatter ``type`` (from *fm*) is listed in ``EXCLUDE_TYPES``.

    The prefix test uses the *resolved* location, the same one the
    containment test uses. That way a directory symlink inside the vault
    cannot smuggle an excluded folder in under another name, and a path that
    is not lexically below the vault root is rejected instead of raising.
    """
    if path.is_symlink():
        return False
    try:
        resolved = path.resolve(strict=True)
        rel = resolved.relative_to(VAULT_ROOT.resolve()).as_posix()
    except (OSError, ValueError):
        # Missing file, unreadable path component, or outside the vault.
        return False
    if path.name in EXCLUDE_FILENAMES:
        return False
    if rel.startswith(EXCLUDE_PATH_PREFIXES):
        return False
    return fm.get("type") not in EXCLUDE_TYPES


def days_since(date_str: str | None) -> float:
    """Return days since the given YYYY-MM-DD string, or a large sentinel if missing."""
    if not date_str:
        return 10_000.0
    try:
        d = date.fromisoformat(date_str)
    except ValueError:
        return 10_000.0
    delta = (date.today() - d).days
    return max(0.0, float(delta))


def recency_weight(days: float,
                   halflife: float = RECENCY_HALFLIFE_DAYS) -> float:
    """Return the exponential recency weight ``exp(-days / halflife)``.

    The weight is 1.0 at ``days == 0`` and decays toward zero with no floor.
    NOTE: *halflife* is used as the e-folding time of the decay, so the
    weight after ``halflife`` days is 1/e (about 0.37), not 0.5.
    """
    decay_exponent = -days / halflife
    return math.exp(decay_exponent)


# Leading whitespace followed by a run of 3+ backticks or 3+ tildes.
_FENCE_RE = re.compile(r"^(\s*)(`{3,}|~{3,})")


def extract_wikilinks(body: str) -> set[str]:
    """Extract unique link targets (without alias or heading suffix) from the body.

    Wikilinks inside fenced code blocks are skipped so documentation examples
    do not pollute the graph. Fence handling follows CommonMark:

    - Backtick AND tilde fences are recognised; the opening run sets
      ``(char, min_len)``.
    - A line opening with backticks whose remainder contains another backtick
      (e.g. a one-line ```` ```code``` ```` span) is NOT an opening fence,
      because a backtick fence's info string may not contain backticks.
    - A closing fence must use the SAME char, a run of SAME-OR-LONGER length,
      and be followed by nothing but whitespace. A line such as
      ```` ```python ```` inside an open block is therefore content, not a
      closer.
    - An unclosed fence runs to the end of the body.

    Indented code blocks (4+ spaces) are intentionally NOT filtered, since
    indented bullets in Obsidian often contain wikilinks.

    Returns the set of link stems: folder-qualified links such as
    ``[[notes/Foo]]`` are reduced to ``Foo``, and the table-escaped alias form
    ``[[Foo\\|Alias]]`` is reduced to ``Foo``.
    """
    cleaned: list[str] = []
    fence_char: str | None = None
    fence_len: int = 0
    for line in body.splitlines():
        m = _FENCE_RE.match(line)
        if m:
            run = m.group(2)
            rest = line[m.end():]
            if fence_char is None:
                # Backtick info strings cannot contain backticks; such a line
                # is inline code and is scanned like any other text line.
                if not (run[0] == "`" and "`" in rest):
                    fence_char, fence_len = run[0], len(run)
                    continue
            elif run[0] == fence_char and len(run) >= fence_len and not rest.strip():
                fence_char, fence_len = None, 0
                continue
        if fence_char is not None:
            # Inside a fenced block, including fence-looking lines that do
            # not qualify as the closer.
            continue
        cleaned.append(line)
    scan = "\n".join(cleaned)
    results: set[str] = set()
    for m in WIKILINK_RE.finditer(scan):
        raw = m.group(1).strip()
        # Inside Markdown tables the alias pipe is written as `\|`; the
        # backslash then lands at the end of the captured target.
        if raw.endswith("\\"):
            raw = raw[:-1].rstrip()
        # Folder-qualified links like [[notes/Foo]] resolve to Foo.md by stem.
        # This matches Obsidian default behavior for unique filenames.
        stem = raw.rsplit("/", 1)[-1]
        if stem:
            results.add(stem)
    return results


def collect_pages() -> dict[str, dict]:
    """Scan wiki/, return {title_key: {path, title, body, fm}} for scoreable pages.

    `title_key` is the filename stem, which is what Obsidian wikilinks resolve
    to by default. Filenames are assumed to be unique across the vault (the
    original author notes this is enforced by the wiki-lint naming
    convention). If two scoreable pages nevertheless share a stem, a warning
    is written to stderr and the later path in sorted order wins.

    Files are visited in sorted order. A file is skipped when it is a symlink
    or not a regular file, cannot be read or decoded as UTF-8, exceeds
    ``MAX_BODY_BYTES`` once decoded, or is rejected by ``included``. Returns
    an empty dict when the wiki directory does not exist.
    """
    pages: dict[str, dict] = {}
    if not WIKI_DIR.is_dir():
        return pages
    for md in sorted(WIKI_DIR.rglob("*.md")):
        try:
            # Cheap filesystem checks come first so symlinks, FIFOs/devices
            # and huge files are never opened or loaded into memory.
            if md.is_symlink() or not md.is_file():
                continue
            # Newline translation can at most halve the byte count (CRLF ->
            # LF), so anything over twice the limit on disk is certain to
            # fail the exact check below.
            if md.stat().st_size > 2 * MAX_BODY_BYTES:
                continue
            text = md.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError):
            continue
        if len(text.encode("utf-8")) > MAX_BODY_BYTES:
            continue
        fm, body = parse_frontmatter(text)
        if not included(md, fm):
            continue
        title_key = md.stem  # Obsidian wikilinks are filename-based
        rel_path = md.relative_to(VAULT_ROOT).as_posix()
        shadowed = pages.get(title_key)
        if shadowed is not None:
            log(f"WARN: duplicate page stem '{title_key}': "
                f"{shadowed['path']} is shadowed by {rel_path}")
        pages[title_key] = {
            "path": rel_path,
            "title": fm.get("title", title_key),
            "body": body,
            "fm": fm,
        }
    return pages


def build_graph(pages: dict[str, dict]) -> tuple[dict[str, set[str]], dict[str, set[str]]]:
    """Build the wikilink graph over the scoreable pages.

    Returns ``(out_edges, in_edges)``; both map every title_key in *pages* to
    a set of title_keys. A link counts as an edge only when its target is
    itself a key of *pages*, and a page linking to itself contributes nothing.
    """
    outgoing: dict[str, set[str]] = {key: set() for key in pages}
    incoming: dict[str, set[str]] = {key: set() for key in pages}
    for source_key, record in pages.items():
        resolved_targets = {
            target
            for target in extract_wikilinks(record["body"])
            if target != source_key and target in pages
        }
        outgoing[source_key] |= resolved_targets
        # Mirror every outgoing edge on the target's incoming set.
        for target in resolved_targets:
            incoming[target].add(source_key)
    return outgoing, incoming


def score_page(title_key: str,
               pages: dict[str, dict],
               out_edges: dict[str, set[str]],
               in_edges: dict[str, set[str]]) -> dict:
    """Compute the boundary score record for one page.

    The score is ``(out_degree - in_degree) * recency_weight``, where the
    page's age is taken from its ``updated`` frontmatter date, falling back
    to ``created``. The returned dict carries the title, title_key, path,
    both degrees, the age in days, and the weight and score rounded to four
    decimal places.
    """
    page = pages[title_key]
    meta = page["fm"]
    n_out = len(out_edges.get(title_key, ()))
    n_in = len(in_edges.get(title_key, ()))
    age = days_since(meta.get("updated") or meta.get("created"))
    weight = recency_weight(age)
    return {
        "title": page["title"],
        "title_key": title_key,
        "path": page["path"],
        "out_degree": n_out,
        "in_degree": n_in,
        "age_days": age,
        "recency_weight": round(weight, 4),
        "score": round((n_out - n_in) * weight, 4),
    }


def _table_cell(value: object) -> str:
    """Render *value* as text that stays inside a single Markdown table cell."""
    return str(value).replace("|", "\\|")


def run(top: int, want_json: bool, include_zero: bool, page_filter: str | None) -> int:
    """Score the vault and print the report to stdout.

    Args:
        top: maximum number of rows in ranking mode (ignored with a filter).
        want_json: emit a JSON document instead of the Markdown text table.
        include_zero: in ranking mode, keep pages whose score is <= 0.
        page_filter: when given, report only the page whose stem or
            vault-relative path matches; ranking and ``top`` are skipped.

    Returns:
        ``EXIT_OK`` on success, or ``EXIT_USAGE`` when *page_filter* matches
        no scoreable page (an error is logged to stderr in that case).
    """
    pages = collect_pages()
    out_edges, in_edges = build_graph(pages)
    scored = [score_page(k, pages, out_edges, in_edges) for k in pages]
    if page_filter:
        key = Path(page_filter).stem
        matched = [s for s in scored if s["title_key"] == key or s["path"] == page_filter]
        if not matched:
            log(f"ERR: no scoreable page matches '{page_filter}'")
            return EXIT_USAGE
        scored = matched
    else:
        if not include_zero:
            scored = [s for s in scored if s["score"] > 0.0]
        # Highest score first; ties are broken alphabetically for stable output.
        scored.sort(key=lambda s: (-s["score"], s["title_key"]))
        scored = scored[:top]

    if want_json:
        print(json.dumps({
            "generated": datetime.now(timezone.utc).isoformat(timespec="seconds").replace("+00:00", "Z"),
            "halflife_days": RECENCY_HALFLIFE_DAYS,
            "page_count_scoreable": len(pages),
            "results": scored,
        }, indent=2))
    else:
        print("# Boundary Score Report")
        print(f"scoreable pages: {len(pages)}; halflife: {RECENCY_HALFLIFE_DAYS} days")
        if not scored:
            print("\nNo positive-score frontier pages found.")
        else:
            print("")
            print("| # | score | out | in | age_d | title | path |")
            print("|---|---|---|---|---|---|---|")
            for i, s in enumerate(scored, 1):
                # Titles and paths are free text; an unescaped pipe would
                # split the cell and misalign the row.
                print(f"| {i} | {s['score']:.3f} | {s['out_degree']} | {s['in_degree']} | "
                      f"{int(s['age_days'])} | {_table_cell(s['title'])} | "
                      f"{_table_cell(s['path'])} |")
    return EXIT_OK


def main(argv: list[str]) -> int:
    """Parse command-line arguments and run the scorer.

    *argv* is the argument list without the program name. Returns the
    process exit status: ``EXIT_USAGE`` when ``--top`` is below 1, otherwise
    whatever ``run`` returns.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--top", type=int, default=DEFAULT_TOP)
    parser.add_argument("--json", action="store_true")
    parser.add_argument(
        "--include-score-zero",
        action="store_true",
        help="Include pages whose score is zero or negative in the output",
    )
    parser.add_argument("--page", default=None, help="Score a single page by path or stem")
    opts = parser.parse_args(argv)
    if opts.top >= 1:
        return run(opts.top, opts.json, opts.include_score_zero, opts.page)
    log("ERR: --top must be >= 1")
    return EXIT_USAGE


# Script entry point: the status returned by main() becomes the exit code.
if __name__ == "__main__":
    raise SystemExit(main(sys.argv[1:]))