add files

2026-04-30 17:05:19 +09:00
parent f3e01b5a8c
commit 7e985ae94a
135 changed files with 41205 additions and 0 deletions
@@ -0,0 +1,12 @@
+name = "conversion_architect"
+description = "Read-only conversion architecture specialist for parser boundaries, internal block models, chunk policy, renderer contracts, and output structure."
+model = "gpt-5.4"
+model_reasoning_effort = "high"
+sandbox_mode = "read-only"
+developer_instructions = """
+Read AGENTS.md, PLAN.md, PROGRESS.md, docs/ARCHITECTURE.md, docs/CONVERSION_POLICY.md, and docs/ADR.md before proposing architecture.
+Keep Marker as the document-structure source, Nougat as formula-only, and PyMuPDF as pre-analysis/chunk planning unless the user explicitly asks to revisit ADRs.
+Define interfaces and invariants rather than line-by-line implementation.
+Surface risks around chunk boundaries, fallback paths, deterministic naming, and Markdown output integrity.
+Do not edit files unless explicitly instructed.
+"""
@@ -0,0 +1,12 @@
+name = "formula_pipeline_specialist"
+description = "Read-only formula pipeline specialist for Nougat handoff, formula detection, LaTeX validation, numbering, references, and fallback behavior."
+model = "gpt-5.4"
+model_reasoning_effort = "high"
+sandbox_mode = "read-only"
+developer_instructions = """
+Read AGENTS.md, PLAN.md, PROGRESS.md, docs/CONVERSION_POLICY.md, docs/TOOLCHAIN.md, and docs/ADR.md before advising.
+Treat Nougat as formula-only and Marker source text as the required fallback.
+Focus on equation block detection, inline/block formula classification, formula numbering, reference anchors, delimiter repair, and begin/end validation.
+Call out confidence thresholds and failure modes explicitly.
+Do not edit files unless explicitly instructed.
+"""
@@ -0,0 +1,11 @@
+name = "harness_reviewer"
+description = "Read-only reviewer for Harness projects, focused on architecture drift, critical rule violations, and missing validation."
+model = "gpt-5.4"
+model_reasoning_effort = "high"
+sandbox_mode = "read-only"
+developer_instructions = """
+Review changes like a repository owner.
+Prioritize correctness, architecture compliance, behavior regressions, and missing tests over style.
+Always compare the patch against AGENTS.md, docs/ARCHITECTURE.md, docs/ADR.md, and the requested acceptance criteria.
+Lead with concrete findings and file references. If no material issues are found, say so explicitly and mention residual risks.
+"""
@@ -0,0 +1,11 @@
+name = "layout_table_figure_specialist"
+description = "Read-only layout/table/figure specialist for reading order, paragraph stitching, table rendering, figure extraction, captions, and references."
+model = "gpt-5.4"
+model_reasoning_effort = "high"
+sandbox_mode = "read-only"
+developer_instructions = """
+Read AGENTS.md, PLAN.md, PROGRESS.md, docs/CONVERSION_POLICY.md, docs/ARCHITECTURE.md, and docs/PRD.md before advising.
+Focus on logical reading order, multi-column layouts, header/footer removal, paragraph stitching, Markdown vs HTML table decisions, table screenshot fallback, figure asset naming, deduplication, captions, and internal references.
+Return testable heuristics and edge cases grounded in sample PDFs when available.
+Do not edit files unless explicitly instructed.
+"""
@@ -0,0 +1,12 @@
+name = "pdf_toolchain_researcher"
+description = "Read-only PDF toolchain researcher for Marker, Nougat, PyMuPDF, PyTorch/CUDA, model cache, and licensing compatibility."
+model = "gpt-5.4"
+model_reasoning_effort = "high"
+sandbox_mode = "read-only"
+developer_instructions = """
+Read AGENTS.md, PLAN.md, PROGRESS.md, docs/TOOLCHAIN.md, docs/ARCHITECTURE.md, docs/CONVERSION_POLICY.md, and docs/ADR.md before answering.
+Focus on official or primary sources for Marker, Nougat, PyMuPDF, PyTorch/CUDA, Markdown math, and comparison baselines.
+Return compatibility findings, recommended dependency pins, runtime risks, model cache implications, and licensing questions.
+Do not edit files unless the parent agent explicitly asks for a patch.
+Do not propose replacing Marker as the primary parser without explaining architecture and ADR impact.
+"""
@@ -0,0 +1,12 @@
+name = "phase_planner"
+description = "Read-heavy Harness planner that decomposes docs into minimal, self-contained phase and step files."
+model = "gpt-5.4"
+model_reasoning_effort = "high"
+sandbox_mode = "read-only"
+developer_instructions = """
+Plan before implementing.
+Read AGENTS.md and the docs directory, identify the smallest coherent phase boundaries, and draft self-contained steps.
+Keep each step scoped to one layer or one module when possible.
+Do not make code changes unless the parent agent explicitly asks you to write files.
+Return concrete file paths, acceptance commands, and blocking assumptions.
+"""
@@ -0,0 +1,12 @@
+name = "quality_evaluator"
+description = "Read-only quality evaluator for focused PDF-to-Markdown tests, sample corpus coverage, regression strategy, and validation gaps."
+model = "gpt-5.4"
+model_reasoning_effort = "high"
+sandbox_mode = "read-only"
+developer_instructions = """
+Read AGENTS.md, PLAN.md, PROGRESS.md, docs/PRD.md, docs/CONVERSION_POLICY.md, and docs/ARCHITECTURE.md before evaluating quality.
+Prefer focused assertions over full Markdown snapshots.
+Prioritize tests for headings, formula delimiters, LaTeX environment pairs, table parseability, image links, caption matching, chunk integrity, Windows paths, Korean filenames, and no-exception conversion.
+Return concrete pytest targets, fixture needs, and residual risks.
+Do not write tests unless explicitly asked.
+"""
@@ -0,0 +1,12 @@
+name = "sample_corpus_analyst"
+description = "Read-only analyst for samples/ PDFs, focused on page traits, text-layer quality, OCR needs, formulas, tables, figures, and regression metadata."
+model = "gpt-5.4"
+model_reasoning_effort = "high"
+sandbox_mode = "read-only"
+developer_instructions = """
+Read AGENTS.md, PLAN.md, PROGRESS.md, docs/PRD.md, docs/CONVERSION_POLICY.md, and docs/TOOLCHAIN.md before analyzing samples.
+Use PyMuPDF-oriented evidence when possible: page count, first-page text length, image count, suspected scan pages, OCR candidates, and layout complexity.
+Design sample metadata schema and quality test implications, but do not create or modify metadata files unless explicitly asked.
+Preserve Korean filenames exactly in reports.
+Return concrete next tests and any sample coverage gaps.
+"""
@@ -0,0 +1,26 @@
+---
+description: Review conversion policy, architecture, ADRs, and AGENTS.md for consistency.
+argument-hint: [optional-topic]
+allowed-tools: [Read, Glob, Grep, Bash]
+---
+
+# /conversion-policy-review
+
+## Arguments
+
+The user invoked this command with: $ARGUMENTS
+
+## Workflow
+
+1. Read `AGENTS.md`, `docs/ARCHITECTURE.md`, `docs/CONVERSION_POLICY.md`, `docs/ADR.md`, `docs/PRD.md`, and `docs/TOOLCHAIN.md`.
+2. Check for drift in parser responsibilities, output contract, runtime policy, logging/resume policy, environment pins, and sidecar scope.
+3. Lead with concrete inconsistencies and file references.
+4. Run `python scripts\validate_workspace.py` if file changes were made or if the user asks.
+5. Do not edit files unless explicitly asked.
+
+## Output
+
+- **Findings**
+- **Consistency Status**
+- **Open Questions**
+- **Suggested Fixes**
@@ -0,0 +1,28 @@
+---
+description: Verify the repo-local PDFtoMD Python environment, CUDA, and Nougat CLI.
+argument-hint: [quick|full]
+allowed-tools: [Read, Bash]
+---
+
+# /env-check
+
+## Arguments
+
+The user invoked this command with: $ARGUMENTS
+
+## Workflow
+
+1. Read `AGENTS.md`, `requirements.txt`, `docs/TOOLCHAIN.md`, and `PROGRESS.md`.
+2. Run `.\venv\python.exe -m pip check`.
+3. Run a CUDA smoke test with `torch.ones((1,), device="cuda")` unless `$ARGUMENTS` says `quick`.
+4. Run `.\venv\Scripts\nougat.exe --help`.
+5. Summarize versions and failures.
+6. Do not install or upgrade packages unless the user explicitly asks.
+
+## Output
+
+- **Environment**
+- **CUDA**
+- **Nougat**
+- **Dependency Health**
+- **Action Needed**
@@ -0,0 +1,25 @@
+---
+description: Check model cache and offline-readiness assumptions for Marker, Nougat, and Hugging Face assets.
+argument-hint: [cache-path-or-empty]
+allowed-tools: [Read, Bash]
+---
+
+# /model-cache-check
+
+## Arguments
+
+The user invoked this command with: $ARGUMENTS
+
+## Workflow
+
+1. Read `AGENTS.md`, `docs/TOOLCHAIN.md`, `docs/ARCHITECTURE.md`, and `docs/CONVERSION_POLICY.md`.
+2. Inspect relevant environment variables and common Hugging Face cache paths.
+3. Check whether local cache paths are explicit enough for offline execution.
+4. Do not download model weights unless the user explicitly asks.
+
+## Output
+
+- **Cache Paths**
+- **Offline Readiness**
+- **Missing Assets**
+- **Documentation Gaps**
@@ -0,0 +1,28 @@
+---
+description: Draft Harness phase steps for PDFtoMD implementation without executing them.
+argument-hint: [phase-goal]
+allowed-tools: [Read, Glob, Grep, Write, Edit]
+---
+
+# /phase-draft
+
+## Arguments
+
+The user invoked this command with: $ARGUMENTS
+
+## Workflow
+
+1. Read `AGENTS.md`, `PLAN.md`, `PROGRESS.md`, `docs/PRD.md`, `docs/ARCHITECTURE.md`, `docs/CONVERSION_POLICY.md`, `docs/HARNESS.md`, `docs/ADR.md`, and `docs/TOOLCHAIN.md`.
+2. Use `$harness-workflow` guidance if phase files should be created.
+3. Keep each step self-contained and scoped to one layer or module.
+4. Include executable acceptance commands.
+5. Include a Sprint Contract with done criteria, hard thresholds, owned files, and dependencies.
+6. Do not create phase files unless the user explicitly requested file generation.
+
+## Output
+
+- **Phase Goal**
+- **Step List**
+- **Dependencies**
+- **Acceptance Commands**
+- **Do Not**
@@ -0,0 +1,26 @@
+---
+description: Draft focused pytest coverage for PDFtoMD conversion quality.
+argument-hint: [feature-or-sample-focus]
+allowed-tools: [Read, Glob, Grep]
+---
+
+# /quality-plan
+
+## Arguments
+
+The user invoked this command with: $ARGUMENTS
+
+## Workflow
+
+1. Read `AGENTS.md`, `PLAN.md`, `PROGRESS.md`, `docs/PRD.md`, `docs/ARCHITECTURE.md`, and `docs/CONVERSION_POLICY.md`.
+2. Identify focused tests for headings, formulas, tables, images, captions, links, chunk boundaries, Windows paths, Korean filenames, and no-exception conversion.
+3. Prefer concrete pytest names and fixture inputs.
+4. Do not write tests unless explicitly asked.
+
+## Output
+
+- **Test Goals**
+- **Proposed Test Files**
+- **Fixture Needs**
+- **Acceptance Commands**
+- **Residual Risks**
@@ -0,0 +1,27 @@
+---
+description: Audit samples/ PDFs for page counts, text-layer quality, images, and OCR candidates.
+argument-hint: [pdf-glob-or-empty]
+allowed-tools: [Read, Glob, Bash, Write, Edit]
+---
+
+# /sample-audit
+
+## Arguments
+
+The user invoked this command with: $ARGUMENTS
+
+## Workflow
+
+1. Read `AGENTS.md`, `PLAN.md`, `PROGRESS.md`, and `docs/CONVERSION_POLICY.md`.
+2. Use PyMuPDF from `.\venv` to inspect matching `samples/*.pdf` files.
+3. Report page count, first-page text length, image counts, suspected scan/OCR pages, Korean filename coverage, and obvious layout risks.
+4. If the user asks to write metadata, create or update `samples/metadata.json`; otherwise only report.
+5. Update `PROGRESS.md` when files are changed.
+
+## Output
+
+- **Corpus Summary**
+- **Per-PDF Traits**
+- **OCR Candidates**
+- **Test Implications**
+- **Recommended Metadata Changes**
@@ -0,0 +1,31 @@
+---
+description: Draft or review a step-level generator/evaluator contract before implementation.
+argument-hint: [phase-dir step-number]
+allowed-tools: [Read, Glob, Grep, Edit]
+---
+
+# /sprint-contract
+
+## Arguments
+
+The user invoked this command with: $ARGUMENTS
+
+## Workflow
+
+1. Read `AGENTS.md`, `PLAN.md`, `PROGRESS.md`, `docs/HARNESS.md`, and the target `phases/{phase}/stepN.md`.
+2. Confirm the step has a concrete Sprint Contract:
+   - Done means
+   - Hard thresholds
+   - Files owned
+   - Dependencies
+   - Acceptance commands
+   - Explicit Do Not list
+3. If the contract is missing or vague, edit only the target step file to make the contract executable by a fresh agent.
+4. Do not implement the step.
+
+## Output
+
+- **Target Step**
+- **Contract Status**: ready | updated | blocked
+- **Evaluator Thresholds**
+- **Remaining Ambiguity**
@@ -0,0 +1,27 @@
+---
+description: Summarize current PDFtoMD plan, progress, blockers, and next work.
+argument-hint: [optional-focus]
+allowed-tools: [Read, Glob, Grep, Bash]
+---
+
+# /status
+
+## Arguments
+
+The user invoked this command with: $ARGUMENTS
+
+## Workflow
+
+1. Read `AGENTS.md`, `PLAN.md`, `PROGRESS.md`, and `docs/HARNESS.md`.
+2. Summarize the current project goal, scope, completed work, in-progress work, blockers, and next work.
+3. If `$ARGUMENTS` names an area, focus the summary on that area.
+4. Do not modify files.
+
+## Output
+
+- **Goal**
+- **Current State**
+- **Next Work**
+- **Blockers**
+- **Relevant Files**
+- **Active Phase/Step**
@@ -0,0 +1,9 @@
+# Project-scoped Codex defaults for the Harness template.
+# As of 2026-04-15, hooks are experimental and disabled on native Windows.
+
+[features]
+codex_hooks = true
+
+[agents]
+max_threads = 6
+max_depth = 1
@@ -0,0 +1,40 @@
+{
+  "hooks": {
+    "PreToolUse": [
+      {
+        "matcher": "Bash",
+        "hooks": [
+          {
+            "type": "command",
+            "command": "python \".codex/hooks/pre_tool_use_policy.py\"",
+            "statusMessage": "Checking risky shell command"
+          }
+        ]
+      }
+    ],
+    "Stop": [
+      {
+        "hooks": [
+          {
+            "type": "command",
+            "command": "python \".codex/hooks/stop_continue.py\"",
+            "statusMessage": "Running Harness validation",
+            "timeout": 300
+          },
+          {
+            "type": "command",
+            "command": "python \".codex/hooks/handoff_policy.py\"",
+            "statusMessage": "Checking PLAN/PROGRESS handoff",
+            "timeout": 60
+          },
+          {
+            "type": "command",
+            "command": "python \".codex/hooks/drift_policy.py\"",
+            "statusMessage": "Checking documentation drift",
+            "timeout": 60
+          }
+        ]
+      }
+    ]
+  }
+}
@@ -0,0 +1,81 @@
+#!/usr/bin/env python3
+"""Catch high-confidence documentation drift before a Codex turn ends."""
+
+from __future__ import annotations
+
+import json
+import subprocess
+import sys
+from pathlib import Path
+
+
+def changed_paths(root: Path) -> set[str]:
+    result = subprocess.run(
+        ["git", "status", "--porcelain"],
+        cwd=root,
+        capture_output=True,
+        text=True,
+        timeout=20,
+    )
+    if result.returncode != 0:
+        return set()
+
+    paths: set[str] = set()
+    for line in result.stdout.splitlines():
+        if not line.strip():
+            continue
+        path = line[3:].replace("\\", "/")
+        if " -> " in path:
+            path = path.split(" -> ", 1)[1]
+        paths.add(path)
+    return paths
+
+
+def block(reason: str) -> int:
+    json.dump({"decision": "block", "reason": reason}, sys.stdout)
+    return 0
+
+
+def main() -> int:
+    try:
+        payload = json.load(sys.stdin)
+    except json.JSONDecodeError:
+        return 0
+
+    if payload.get("stop_hook_active"):
+        return 0
+
+    root = Path(payload.get("cwd") or ".").resolve()
+    paths = changed_paths(root)
+
+    if "requirements.txt" in paths and "docs/TOOLCHAIN.md" not in paths:
+        return block(
+            "requirements.txt changed without docs/TOOLCHAIN.md. "
+            "Update the toolchain notes with dependency compatibility rationale."
+        )
+
+    sample_pdf_changed = any(path.startswith("samples/") and path.lower().endswith(".pdf") for path in paths)
+    metadata_changed = "samples/metadata.json" in paths
+    if sample_pdf_changed and not metadata_changed:
+        return block(
+            "A sample PDF changed without samples/metadata.json. "
+            "Update the sample metadata mapping so quality tests know the corpus traits."
+        )
+
+    policy_docs = {
+        "docs/ARCHITECTURE.md",
+        "docs/CONVERSION_POLICY.md",
+        "docs/ADR.md",
+    }
+    touched_policy_docs = policy_docs.intersection(paths)
+    if touched_policy_docs and "PROGRESS.md" not in paths:
+        return block(
+            "Architecture or conversion policy docs changed without PROGRESS.md. "
+            "Record the decision and handoff context in PROGRESS.md."
+        )
+
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
@@ -0,0 +1,95 @@
+#!/usr/bin/env python3
+"""Require PLAN/PROGRESS handoff discipline for multi-agent work."""
+
+from __future__ import annotations
+
+import json
+import subprocess
+import sys
+from pathlib import Path
+
+
+# Repo-relative path prefixes that count as coordination-relevant changes.
+# Entries ending in "/" are directory prefixes; the others are file names.
+# ``is_coordination_relevant`` tests a changed path against every entry, and
+# ``main`` requires a PROGRESS.md update whenever any changed path matches.
+TRACKED_PREFIXES = (
+    ".agents/",
+    ".codex/",
+    "AGENTS.md",
+    "PLAN.md",
+    "docs/",
+    "phases/",
+    "plugins/",
+    "requirements.txt",
+    "scripts/",
+    "src/",
+    "tests/",
+)
+
+
+def git_status_names(root: Path) -> list[str]:
+    result = subprocess.run(
+        ["git", "status", "--porcelain"],
+        cwd=root,
+        capture_output=True,
+        text=True,
+        timeout=20,
+    )
+    if result.returncode != 0:
+        return []
+
+    names: list[str] = []
+    for line in result.stdout.splitlines():
+        if not line.strip():
+            continue
+        path = line[3:].replace("\\", "/")
+        if " -> " in path:
+            path = path.split(" -> ", 1)[1]
+        names.append(path)
+    return names
+
+
+def is_coordination_relevant(path: str) -> bool:
+    return any(path == prefix or path.startswith(prefix) for prefix in TRACKED_PREFIXES)
+
+
+def block(reason: str) -> int:
+    json.dump({"decision": "block", "reason": reason}, sys.stdout)
+    return 0
+
+
+def main() -> int:
+    try:
+        payload = json.load(sys.stdin)
+    except json.JSONDecodeError:
+        return 0
+
+    if payload.get("stop_hook_active"):
+        return 0
+
+    root = Path(payload.get("cwd") or ".").resolve()
+    plan = root / "PLAN.md"
+    progress = root / "PROGRESS.md"
+
+    if not plan.exists() or not progress.exists():
+        return block(
+            "Multi-agent coordination requires PLAN.md and PROGRESS.md. "
+            "Create or restore both files before ending the turn."
+        )
+
+    changed = git_status_names(root)
+    if not changed:
+        return 0
+
+    relevant = [path for path in changed if is_coordination_relevant(path)]
+    progress_changed = "PROGRESS.md" in changed
+
+    if relevant and not progress_changed:
+        return block(
+            "Repository planning, docs, code, tests, requirements, or .codex files changed, "
+            "but PROGRESS.md was not updated. Add a concise handoff note so the next agent "
+            "can see what changed, what was verified, and what remains next."
+        )
+
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
@@ -0,0 +1,50 @@
+#!/usr/bin/env python3
+"""Block obviously destructive shell commands before Codex runs them."""
+
+from __future__ import annotations
+
+import json
+import re
+import sys
+
+
+BLOCK_PATTERNS = (
+    r"\brm\s+-rf\b",
+    r"\bgit\s+push\s+--force(?:-with-lease)?\b",
+    r"\bgit\s+reset\s+--hard\b",
+    r"\bgit\s+clean\s+-[a-zA-Z]*f[a-zA-Z]*[dx][a-zA-Z]*\b",
+    r"\bgit\s+checkout\s+--\s+\.\b",
+    r"\bDROP\s+TABLE\b",
+    r"\btruncate\s+table\b",
+    r"\bRemove-Item\b.*\b-Recurse\b",
+    r"\bdel\b\s+/s\b",
+    r"\bconda\s+(?:env\s+)?remove\b.*\b--all\b",
+)
+
+
+def main() -> int:
+    try:
+        payload = json.load(sys.stdin)
+    except json.JSONDecodeError:
+        return 0
+
+    command = payload.get("tool_input", {}).get("command", "")
+    for pattern in BLOCK_PATTERNS:
+        if re.search(pattern, command, re.IGNORECASE):
+            json.dump(
+                {
+                    "hookSpecificOutput": {
+                        "hookEventName": "PreToolUse",
+                        "permissionDecision": "deny",
+                        "permissionDecisionReason": "Harness guardrail blocked a risky shell command.",
+                    }
+                },
+                sys.stdout,
+            )
+            return 0
+
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
@@ -0,0 +1,55 @@
+#!/usr/bin/env python3
+"""Run repository validation when a Codex turn stops and request one more pass if it fails."""
+
+from __future__ import annotations
+
+import json
+import subprocess
+import sys
+from pathlib import Path
+
+
+def main() -> int:
+    try:
+        payload = json.load(sys.stdin)
+    except json.JSONDecodeError:
+        return 0
+
+    if payload.get("stop_hook_active"):
+        return 0
+
+    root = Path(payload.get("cwd") or ".").resolve()
+    validator = root / "scripts" / "validate_workspace.py"
+    if not validator.exists():
+        return 0
+
+    result = subprocess.run(
+        [sys.executable, str(validator)],
+        cwd=root,
+        capture_output=True,
+        text=True,
+        timeout=240,
+    )
+
+    if result.returncode == 0:
+        return 0
+
+    summary = (result.stdout or result.stderr or "workspace validation failed").strip()
+    if len(summary) > 1200:
+        summary = summary[:1200].rstrip() + "..."
+
+    json.dump(
+        {
+            "decision": "block",
+            "reason": (
+                "Validation failed. Review the output, fix the repo, then continue.\n\n"
+                f"{summary}"
+            ),
+        },
+        sys.stdout,
+    )
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
@@ -0,0 +1,23 @@
+---
+name: conversion-architecture
+description: Design PDFtoMD conversion architecture, parser boundaries, internal block models, chunk policy, renderer contracts, output structure, logging, and resume behavior. Use when planning or reviewing conversion engine design.
+---
+
+# Conversion Architecture
+
+## Workflow
+
+1. Read `AGENTS.md`, `PLAN.md`, `PROGRESS.md`, `docs/ARCHITECTURE.md`, `docs/CONVERSION_POLICY.md`, and `docs/ADR.md`.
+2. Keep responsibilities stable:
+   - Marker: layout, OCR, reading order, body, headings, tables, figures, captions
+   - Nougat: formula-only LaTeX parsing
+   - PyMuPDF: page pre-analysis, text-layer quality, page counts, chunk planning
+3. Define interfaces and invariants before implementation.
+4. Keep output deterministic and chunked under the documented output contract.
+5. Record architecture changes in `docs/ADR.md` when decisions change.
+
+## Guardrails
+
+- Do not place conversion logic in a future PyQt UI.
+- Do not add document sidecars unless explicitly requested.
+- Do not let chunking split a paragraph, table, figure, or formula without a fallback plan.
@@ -0,0 +1,4 @@
+interface:
+  display_name: "Conversion Architecture"
+  short_description: "Plan parser and renderer boundaries"
+  default_prompt: "Use $conversion-architecture to design the next PDFtoMD engine phase."
@@ -0,0 +1,24 @@
+---
+name: formula-quality
+description: Plan and review formula extraction quality for PDFtoMD. Use when Codex needs Nougat handoff rules, inline/block formula classification, LaTeX delimiter checks, equation numbering, reference anchors, or Marker fallback behavior.
+---
+
+# Formula Quality
+
+## Workflow
+
+1. Read `AGENTS.md`, `docs/CONVERSION_POLICY.md`, `docs/TOOLCHAIN.md`, and `docs/ADR.md`.
+2. Identify formula candidates from Marker equation blocks or mathematical text patterns.
+3. Classify formulas as inline or block based on layout context.
+4. Validate:
+   - `$ ... $` and `$$ ... $$` balance
+   - `\begin{...}` / `\end{...}` pairs
+   - formula numbering
+   - body references such as `Eq. (3)` or Korean equation references
+5. Use Marker source text as fallback when Nougat fails.
+
+## Guardrails
+
+- Do not pass whole documents through Nougat as the primary parser.
+- Do not discard formula text on parse failure.
+- Do not rewrite references as links unless the target confidence is sufficient.
@@ -0,0 +1,4 @@
+interface:
+  display_name: "Formula Quality"
+  short_description: "Validate equations and LaTeX output"
+  default_prompt: "Use $formula-quality to design formula parsing tests and fallback behavior."
@@ -0,0 +1,27 @@
+---
+name: markdown-quality
+description: Plan and review Markdown output quality for PDFtoMD. Use when Codex needs tests or policies for headings, tables, HTML fallback, image links, captions, frontmatter, chunk integrity, and deterministic output.
+---
+
+# Markdown Quality
+
+## Workflow
+
+1. Read `AGENTS.md`, `docs/PRD.md`, `docs/ARCHITECTURE.md`, and `docs/CONVERSION_POLICY.md`.
+2. Prefer focused assertions over full snapshots.
+3. Validate:
+   - heading hierarchy
+   - table parseability
+   - limited HTML table fallback
+   - image link existence
+   - figure/table captions
+   - internal references
+   - chunk frontmatter
+   - deterministic filenames and anchors
+4. Use Markdown or HTML parsers when practical.
+
+## Guardrails
+
+- Do not inject runtime warnings into generated Markdown.
+- Do not rely only on brittle whole-file snapshots.
+- Do not lose complex table content without linking a fallback asset.
@@ -0,0 +1,4 @@
+interface:
+  display_name: "Markdown Quality"
+  short_description: "Check chunk Markdown and assets"
+  default_prompt: "Use $markdown-quality to plan focused Markdown output validation."
@@ -0,0 +1,23 @@
+---
+name: pdf-toolchain
+description: Research and maintain PDFtoMD toolchain compatibility for Marker, Nougat, PyMuPDF, PyTorch/CUDA, model cache, and licensing. Use when Codex needs dependency pins, runtime compatibility checks, official-source research, or updates to docs/TOOLCHAIN.md and related ADRs.
+---
+
+# PDF Toolchain
+
+## Workflow
+
+1. Read `AGENTS.md`, `PLAN.md`, `PROGRESS.md`, `docs/TOOLCHAIN.md`, `docs/ARCHITECTURE.md`, and `docs/ADR.md`.
+2. Prefer official or primary sources for current facts.
+3. Verify local facts with commands when relevant:
+   - `.\venv\python.exe -m pip check`
+   - `.\venv\python.exe -c "import torch; print(torch.__version__, torch.version.cuda, torch.cuda.is_available())"`
+   - `.\venv\Scripts\nougat.exe --help`
+4. Preserve the verified GTX 1070 Ti baseline unless a replacement is tested.
+5. Update `docs/TOOLCHAIN.md` and `docs/ADR.md` when dependency decisions change.
+
+## Guardrails
+
+- Do not upgrade `torch`, `transformers`, `albumentations`, `pypdfium2`, `opencv-python-headless`, `Pillow`, or `fsspec` without re-running compatibility checks.
+- Do not switch the primary parser away from Marker without an ADR update.
+- Do not download model weights unless the user explicitly asks.
@@ -0,0 +1,4 @@
+interface:
+  display_name: "PDF Toolchain"
+  short_description: "PDF parser and CUDA dependency guidance"
+  default_prompt: "Use $pdf-toolchain to verify PDFtoMD dependency compatibility and update toolchain notes."
@@ -0,0 +1,27 @@
+---
+name: sample-corpus
+description: Analyze and maintain the PDFtoMD samples corpus. Use when Codex needs to classify samples/ PDFs, design samples/metadata.json, identify OCR candidates, or connect corpus traits to focused regression tests.
+---
+
+# Sample Corpus
+
+## Workflow
+
+1. Read `AGENTS.md`, `PLAN.md`, `PROGRESS.md`, `docs/PRD.md`, and `docs/CONVERSION_POLICY.md`.
+2. Inspect PDFs with PyMuPDF before proposing tests.
+3. Track these traits per PDF:
+   - page count
+   - text-layer quality
+   - scanned or mixed pages
+   - multi-column layout
+   - formula density
+   - table density
+   - figure density
+   - Korean filename/path coverage
+4. If writing metadata, use `samples/metadata.json` and update `PROGRESS.md`.
+
+## Guardrails
+
+- Preserve original sample PDFs.
+- Do not rename Korean sample files unless the user explicitly asks.
+- Do not treat first-page text length as the only OCR signal.
@@ -0,0 +1,4 @@
+interface:
+  display_name: "Sample Corpus"
+  short_description: "Classify PDF samples for quality tests"
+  default_prompt: "Use $sample-corpus to audit samples/ PDFs and propose regression metadata."
@@ -0,0 +1,23 @@
+---
+name: windows-runtime
+description: Maintain Windows-native PDFtoMD runtime behavior. Use when Codex needs guidance for repo-local venv, CUDA/OOM handling, Korean paths, long paths, model cache, offline operation, stderr logs, or resume cache behavior.
+---
+
+# Windows Runtime
+
+## Workflow
+
+1. Read `AGENTS.md`, `docs/TOOLCHAIN.md`, `docs/ARCHITECTURE.md`, and `docs/CONVERSION_POLICY.md`.
+2. Verify environment health with:
+   - `.\venv\python.exe -m pip check`
+   - CUDA smoke test
+   - `.\venv\Scripts\nougat.exe --help`
+3. Use `pathlib` for path design and tests.
+4. Include Korean filenames, spaces, and long Windows paths in test plans.
+5. Keep model cache and offline behavior explicit.
+
+## Guardrails
+
+- Do not silently fall back to CPU when the user explicitly requested CUDA.
+- Do not choose batch sizes that assume more than 8 GB VRAM.
+- Do not delete local environments or sample PDFs without explicit approval.
@@ -0,0 +1,4 @@
+interface:
+  display_name: "Windows Runtime"
+  short_description: "Windows, CUDA, paths, and offline checks"
+  default_prompt: "Use $windows-runtime to verify PDFtoMD local runtime assumptions."