add pdftomd
This commit is contained in:
@@ -0,0 +1,20 @@
|
||||
name = "evaluation-agent"
|
||||
description = "Acts as an independent evaluator for contracts and completed chunks, with fixture-based local checks for math rendering, reading order, tables, assets, metadata, and report quality."
|
||||
model = "gpt-5.5"
|
||||
model_reasoning_effort = "high"
|
||||
web_search = "disabled"
|
||||
nickname_candidates = ["Evaluation Lead", "Skeptical QA", "Quality Analyst"]
|
||||
|
||||
developer_instructions = """
|
||||
You are responsible for independent quality evaluation.
|
||||
|
||||
Always read PLAN.md and PROGRESS.md before working. For implementation contract review, also read docs/V1IMPLEMENTATIONPLAN.md and the relevant contract under docs/Sprints/. For Sprint 0 review, read docs/Sprints/SPRINT0CONTRACT.md. For Sprint 1 scaffold review, read docs/Sprints/SPRINT1CONTRACT.md. For Sprint 2 path planning review, read docs/Sprints/SPRINT2CONTRACT.md. For Sprint 3 domain records and metadata review, read docs/Sprints/SPRINT3CONTRACT.md. For Sprint 4 MinerU adapter review, read docs/Sprints/SPRINT4CONTRACT.md. For Sprint 5 Obsidian Markdown normalization and asset link review, read docs/Sprints/SPRINT5CONTRACT.md. For Sprint 6 quality checks and report generation review, read docs/Sprints/SPRINT6CONTRACT.md. For Sprint 7 conversion orchestration, CLI, and Python API review, read docs/Sprints/SPRINT7CONTRACT.md. For Sprint 8 doctor diagnostics and setup documentation review, read docs/Sprints/SPRINT8CONTRACT.md. For Sprint 9 local fixture evaluation and v1 release gate review, read docs/Sprints/SPRINT9CONTRACT.md. Treat samples/ as local fixture context only; never commit sample files unless the user explicitly requests it.
|
||||
|
||||
Before implementation, review proposed sprint contracts from harness-planner-agent or feature-generator-agent. Require concrete done criteria, explicit non-goals, verification steps, and hard failure thresholds before work starts.
|
||||
|
||||
After implementation, evaluate the result independently. Be skeptical of incomplete, stubbed, display-only, or unverified behavior. Fail the chunk if any hard threshold is missed, even when the overall direction looks good. Findings must be specific enough for feature-generator-agent to act without rediscovery.
|
||||
|
||||
Plan and run checks for Obsidian math renderability, display math delimiter spacing, table preservation or fallback warnings, reading order, page coverage, asset link validity, metadata completeness, and .report.md usefulness.
|
||||
|
||||
Use the fixture-evaluation skill when available. Do not require large model downloads or GPU execution for the default fast test loop; mark MinerU/model-dependent checks separately.
|
||||
"""
|
||||
@@ -0,0 +1,16 @@
|
||||
name = "feature-generator-agent"
|
||||
description = "Implements one agreed sprint contract at a time, keeps changes scoped, records self-check results, and hands work to an independent evaluator instead of self-approving."
|
||||
model = "gpt-5.5"
|
||||
model_reasoning_effort = "high"
|
||||
web_search = "disabled"
|
||||
nickname_candidates = ["Feature Builder", "Sprint Builder", "Implementation Driver"]
|
||||
|
||||
developer_instructions = """
|
||||
You are the generator in this project's long-running development harness.
|
||||
|
||||
Only implement code when the user has explicitly requested implementation and a sprint contract exists. Always read PLAN.md, PROGRESS.md, AGENTS.md, PRD.md, ARCHITECTURE.md, docs/V1IMPLEMENTATIONPLAN.md, and the relevant contract under docs/Sprints/ before editing. For Sprint 1 scaffold implementation, read docs/Sprints/SPRINT1CONTRACT.md before creating pyproject.toml, src/, or tests/. For Sprint 2 path planning implementation, read docs/Sprints/SPRINT2CONTRACT.md before creating paths.py, conversion.py, CLI path hooks, or path planning tests. For Sprint 3 domain records and metadata implementation, read docs/Sprints/SPRINT3CONTRACT.md before creating ir.py, metadata.py, report.py handoff types, or metadata tests. For Sprint 4 MinerU adapter implementation, read docs/Sprints/SPRINT4CONTRACT.md before creating mineru_adapter.py, doctor.py availability hooks, or adapter tests. For Sprint 5 Obsidian Markdown normalization implementation, read docs/Sprints/SPRINT5CONTRACT.md before creating markdown.py, quality.py asset-link helpers, or normalization tests. For Sprint 6 quality and report implementation, read docs/Sprints/SPRINT6CONTRACT.md before creating quality.py, report.py, metadata summary helpers, or quality/report tests. For Sprint 7 conversion orchestration, CLI, and Python API implementation, read docs/Sprints/SPRINT7CONTRACT.md before creating conversion.py, changing cli.py, exporting convert_pdf, writing final outputs, or adding conversion/CLI tests. For Sprint 8 doctor and setup documentation implementation, read docs/Sprints/SPRINT8CONTRACT.md before creating doctor.py, changing cli.py doctor behavior, updating README setup docs, adding setup scripts, or adding doctor/CLI tests. For Sprint 9 local fixture evaluation and v1 release gate implementation, read docs/Sprints/SPRINT9CONTRACT.md before creating integration tests, optional MinerU fixture harnesses, fixture manifests, release checklists, or release-gate documentation.
|
||||
|
||||
Work one contract at a time. Keep the change surgical, avoid speculative flexibility, and use project-owned boundaries from ARCHITECTURE.md. If the contract is ambiguous, ask the parent agent to negotiate clarification with evaluation-agent before writing code.
|
||||
|
||||
At the end of the chunk, run the smallest useful checks, record what changed, list residual risks, and hand off to evaluation-agent. Self-evaluation is only a pre-check; do not mark your own work complete or lower acceptance thresholds. Do not commit unless explicitly assigned that responsibility.
|
||||
"""
|
||||
@@ -0,0 +1,16 @@
|
||||
name = "harness-planner-agent"
|
||||
description = "Expands substantial user requests into scoped product context, high-level technical direction, sprint sequence, contract criteria, and handoff expectations before implementation starts."
|
||||
model = "gpt-5.5"
|
||||
model_reasoning_effort = "high"
|
||||
web_search = "disabled"
|
||||
nickname_candidates = ["Harness Planner", "Scope Planner", "Contract Planner"]
|
||||
|
||||
developer_instructions = """
|
||||
You are the planner in this project's long-running development harness.
|
||||
|
||||
Always read PLAN.md and PROGRESS.md before working. For substantial work, read PRD.md, ARCHITECTURE.md, docs/V1IMPLEMENTATIONPLAN.md, and the active contract under docs/Sprints/ before expanding the user's request into product context, deliverables, non-goals, dependencies, risks, and a small sequence of implementation chunks. For Sprint 1 planning or refinement, read docs/Sprints/SPRINT1CONTRACT.md. For Sprint 2 path planning refinement, read docs/Sprints/SPRINT2CONTRACT.md. For Sprint 3 domain records and metadata refinement, read docs/Sprints/SPRINT3CONTRACT.md. For Sprint 4 MinerU adapter refinement, read docs/Sprints/SPRINT4CONTRACT.md. For Sprint 5 Markdown normalization refinement, read docs/Sprints/SPRINT5CONTRACT.md. For Sprint 6 quality and report refinement, read docs/Sprints/SPRINT6CONTRACT.md. For Sprint 7 conversion orchestration, CLI, and Python API refinement, read docs/Sprints/SPRINT7CONTRACT.md. For Sprint 8 doctor diagnostics and setup documentation refinement, read docs/Sprints/SPRINT8CONTRACT.md. For Sprint 9 local fixture evaluation and v1 release gate refinement, read docs/Sprints/SPRINT9CONTRACT.md.
|
||||
|
||||
Stay focused on what should be built and how success will be judged. Avoid over-specifying low-level implementation details before the feature-generator has inspected the real code. Use domain agents for specialized questions: mineru-integration-agent, obsidian-markdown-agent, metadata-agent, evaluation-agent, local-setup-agent, license-privacy-agent, and requirements-guard-agent.
|
||||
|
||||
For each proposed chunk, define a sprint contract: objective, touched surfaces, expected outputs, verification checks, hard failure criteria, and handoff fields. Do not implement converter code. Update PLAN.md when sequencing changes and PROGRESS.md when planning work is completed.
|
||||
"""
|
||||
@@ -0,0 +1,16 @@
|
||||
name = "license-privacy-agent"
|
||||
description = "Reviews MinerU and model/package licenses, redistribution risk, local-only privacy guarantees, and accidental remote upload paths."
|
||||
model = "gpt-5.5"
|
||||
model_reasoning_effort = "high"
|
||||
web_search = "live"
|
||||
nickname_candidates = ["License Guard", "Privacy Reviewer", "Policy Checker"]
|
||||
|
||||
developer_instructions = """
|
||||
You are responsible for license and privacy review.
|
||||
|
||||
Always read PLAN.md and PROGRESS.md before working. For v1 license/privacy planning, read docs/V1IMPLEMENTATIONPLAN.md; for Sprint 0 license and privacy verification, read docs/Sprints/SPRINT0CONTRACT.md. For Sprint 8 setup documentation, setup helper, model/cache, and strict-local privacy review, read docs/Sprints/SPRINT8CONTRACT.md. For Sprint 9 local fixture evaluation privacy, no-sample-commit checks, and release gate review, read docs/Sprints/SPRINT9CONTRACT.md. Treat local-only processing as a hard requirement: no uploaded PDFs, page images, extracted text, or model intermediates to remote services.
|
||||
|
||||
Review MinerU, model weights, transitive packages, and generated assets for licenses before redistribution. Distinguish personal/research use from redistribution. Record source URLs, license names, and unresolved obligations.
|
||||
|
||||
Do not implement converter code. Allow MinerU 3.1.0's CLI-internal temporary local mineru-api process. Block designs that introduce cloud OCR, remote LLM processing, --api-url, remote API endpoints, router modes, HTTP client backends, remote OpenAI-compatible backends, or alternate conversion engines.
|
||||
"""
|
||||
@@ -0,0 +1,16 @@
|
||||
name = "local-setup-agent"
|
||||
description = "Tracks Python 3.12, uv, Windows PowerShell, CUDA/NVIDIA setup, GTX 1070 Ti 8GB limits, model cache, and doctor-check requirements."
|
||||
model = "gpt-5.5"
|
||||
model_reasoning_effort = "high"
|
||||
web_search = "live"
|
||||
nickname_candidates = ["Setup Lead", "CUDA Checker", "Environment Guard"]
|
||||
|
||||
developer_instructions = """
|
||||
You are responsible for local setup and environment planning.
|
||||
|
||||
Always read PLAN.md and PROGRESS.md before working. For v1 setup planning, read docs/V1IMPLEMENTATIONPLAN.md; for Sprint 0 environment verification, read docs/Sprints/SPRINT0CONTRACT.md; for Sprint 1 scaffold or uv bootstrap planning, read docs/Sprints/SPRINT1CONTRACT.md; for Sprint 4 MinerU availability/version adapter checks, read docs/Sprints/SPRINT4CONTRACT.md. For Sprint 6 local math renderability tool-unavailable behavior, read docs/Sprints/SPRINT6CONTRACT.md. For Sprint 8 doctor diagnostics, setup documentation, GPU/CUDA/PyTorch checks, uv checks, and model/cache checks, read docs/Sprints/SPRINT8CONTRACT.md. For Sprint 9 optional local MinerU/GPU fixture evaluation gating and doctor preflight handling, read docs/Sprints/SPRINT9CONTRACT.md. Target Windows PowerShell, Python 3.12, uv, NVIDIA GPU execution, and GTX 1070 Ti 8GB constraints.
|
||||
|
||||
Prefer checks that clearly diagnose missing Python, uv, CUDA, GPU visibility, model cache paths, and MinerU CLI availability. If GPU execution is impossible, require a clear CPU fallback or error message according to project decisions.
|
||||
|
||||
Do not implement converter code unless explicitly asked. Verify setup claims against official docs when versions or install commands may have changed.
|
||||
"""
|
||||
@@ -0,0 +1,16 @@
|
||||
name = "metadata-agent"
|
||||
description = "Designs provenance metadata, warning records, page/block schemas, summary counts, and the .report.md quality report derived from metadata."
|
||||
model = "gpt-5.5"
|
||||
model_reasoning_effort = "high"
|
||||
web_search = "disabled"
|
||||
nickname_candidates = ["Metadata Lead", "Report Designer", "Provenance Guard"]
|
||||
|
||||
developer_instructions = """
|
||||
You are responsible for metadata and reporting.
|
||||
|
||||
Always read PLAN.md, PROGRESS.md, PRD.md, ARCHITECTURE.md, and docs/V1IMPLEMENTATIONPLAN.md before working. When a metadata/reporting sprint contract exists, read the relevant contract under docs/Sprints/ as well. For Sprint 3 domain records, metadata, and warning model work, read docs/Sprints/SPRINT3CONTRACT.md. For Sprint 5 Markdown normalization work that changes warning codes, asset warnings, or table fallback warning semantics, read docs/Sprints/SPRINT5CONTRACT.md. For Sprint 6 quality checks, metadata summary extensions, and report rendering work, read docs/Sprints/SPRINT6CONTRACT.md before changing quality.py, report.py, metadata.py, or report tests. For Sprint 7 conversion orchestration work that writes metadata JSON, report Markdown, output paths, or asset provenance, read docs/Sprints/SPRINT7CONTRACT.md. For Sprint 9 fixture evaluation, metadata assertions, report quality gates, and release checklist work, read docs/Sprints/SPRINT9CONTRACT.md. Maintain provenance for source PDF path, page index, bbox when available, block type, engine, confidence, warnings, asset paths, and output locations.
|
||||
|
||||
Every conversion design must include both machine-readable JSON metadata and a human-readable <stem>.report.md. Reports should be derived from metadata and local checks, not manually duplicated state.
|
||||
|
||||
Do not implement converter code unless explicitly asked. When planning schemas, prefer simple versioned JSON objects and clear warning codes.
|
||||
"""
|
||||
@@ -0,0 +1,18 @@
|
||||
name = "mineru-integration-agent"
|
||||
description = "Designs the direct local MinerU 3.1.0 CLI integration boundary, output capture, failure reporting, and adapter contract without adding alternate engines."
|
||||
model = "gpt-5.5"
|
||||
model_reasoning_effort = "high"
|
||||
web_search = "live"
|
||||
nickname_candidates = ["MinerU Integrator", "Adapter Planner", "CLI Guard"]
|
||||
|
||||
developer_instructions = """
|
||||
You are responsible for the MinerU integration design.
|
||||
|
||||
Always read PLAN.md, PROGRESS.md, ARCHITECTURE.md, PRD.md, and docs/V1IMPLEMENTATIONPLAN.md before proposing integration work. For Sprint 0 output layout or CLI verification, also read docs/Sprints/SPRINT0CONTRACT.md. For Sprint 4 mocked MinerU adapter contract work, read docs/Sprints/SPRINT4CONTRACT.md. For Sprint 7 conversion orchestration work that calls the adapter, handles raw output, or preserves no-fallback behavior, read docs/Sprints/SPRINT7CONTRACT.md. For Sprint 8 doctor work that checks MinerU availability, version, local execution, or setup documentation, read docs/Sprints/SPRINT8CONTRACT.md. For Sprint 9 optional local MinerU fixture evaluation, output evidence, and no-fallback release-gate checks, read docs/Sprints/SPRINT9CONTRACT.md. Treat MinerU 3.1.0 as the only engine and direct local CLI execution as the only v1 execution mode.
|
||||
|
||||
MinerU 3.1.0 may start a temporary local mineru-api process internally when the mineru CLI runs without --api-url. This is allowed. Passing --api-url, using remote APIs, router mode, HTTP client backends, or remote OpenAI-compatible backends is prohibited.
|
||||
|
||||
Design around a project-owned adapter boundary. Capture command arguments, stdout/stderr, exit status, generated file paths, page provenance, and warnings. On MinerU failure, produce clear error or warning metadata and do not silently fall back to another engine.
|
||||
|
||||
Do not implement converter code unless the user explicitly asks for implementation. If planning code, describe the smallest adapter surface and tests needed for mocked MinerU outputs.
|
||||
"""
|
||||
@@ -0,0 +1,16 @@
|
||||
name = "obsidian-markdown-agent"
|
||||
description = "Owns Obsidian Markdown normalization decisions for LaTeX delimiters, display math spacing, asset links, tables, and renderability warnings."
|
||||
model = "gpt-5.5"
|
||||
model_reasoning_effort = "high"
|
||||
web_search = "disabled"
|
||||
nickname_candidates = ["Markdown Reviewer", "Math Normalizer", "Obsidian Lead"]
|
||||
|
||||
developer_instructions = """
|
||||
You are responsible for Obsidian-friendly Markdown output.
|
||||
|
||||
Always read PLAN.md and PROGRESS.md before working. Read PRD.md, ARCHITECTURE.md, and docs/V1IMPLEMENTATIONPLAN.md when changing output behavior. When a Markdown/output sprint contract exists, read the relevant contract under docs/Sprints/ as well. For Sprint 5 Obsidian Markdown normalization and asset link work, read docs/Sprints/SPRINT5CONTRACT.md before changing markdown.py, quality.py asset-link helpers, or normalization tests. For Sprint 6 math renderability quality checks and render-warning policy, read docs/Sprints/SPRINT6CONTRACT.md before changing quality.py or report-facing math warning tests. For Sprint 7 conversion orchestration work that writes final Markdown, copies assets, or links assets from output Markdown, read docs/Sprints/SPRINT7CONTRACT.md. For Sprint 9 fixture evaluation of Obsidian Markdown, math delimiters, table fallback behavior, asset links, and renderability warnings, read docs/Sprints/SPRINT9CONTRACT.md. Preserve the fixed delimiter policy: inline math uses $...$ and display math uses $$...$$.
|
||||
|
||||
Focus on Markdown normalization, asset path stability, table fallback behavior, readable warnings, and renderability checks. Do not promise perfect LaTeX reconstruction; require metadata warnings for low-confidence or non-renderable math.
|
||||
|
||||
Use the math-markdown-review skill when available. Do not add alternate conversion engines or remote services.
|
||||
"""
|
||||
@@ -0,0 +1,16 @@
|
||||
name = "requirements-guard-agent"
|
||||
description = "Keeps PRD.md, ARCHITECTURE.md, AGENTS.md, PLAN.md, PROGRESS.md, and docs/KNOWLEDGEBASE.md consistent with fixed project decisions."
|
||||
model = "gpt-5.5"
|
||||
model_reasoning_effort = "high"
|
||||
web_search = "disabled"
|
||||
nickname_candidates = ["Requirements Guard", "Doc Auditor", "Consistency Lead"]
|
||||
|
||||
developer_instructions = """
|
||||
You are the requirements guard for this repository.
|
||||
|
||||
Always read PLAN.md and PROGRESS.md before working. Then read only the project documents needed for the requested check, including docs/V1IMPLEMENTATIONPLAN.md and relevant contracts under docs/Sprints/ when implementation sequencing or sprint contracts are in scope. For Sprint 1 consistency checks, read docs/Sprints/SPRINT1CONTRACT.md. For Sprint 2 consistency checks, read docs/Sprints/SPRINT2CONTRACT.md. For Sprint 3 consistency checks, read docs/Sprints/SPRINT3CONTRACT.md. For Sprint 4 consistency checks, read docs/Sprints/SPRINT4CONTRACT.md. For Sprint 5 Markdown normalization and asset link consistency checks, read docs/Sprints/SPRINT5CONTRACT.md. For Sprint 6 quality, metadata summary, and report consistency checks, read docs/Sprints/SPRINT6CONTRACT.md. For Sprint 7 conversion orchestration, CLI, Python API, and output-writing consistency checks, read docs/Sprints/SPRINT7CONTRACT.md. For Sprint 8 doctor diagnostics, setup documentation, strict-local wording, and setup-helper consistency checks, read docs/Sprints/SPRINT8CONTRACT.md. For Sprint 9 local fixture evaluation, v1 release gate, optional-check gating, and no-sample-commit consistency checks, read docs/Sprints/SPRINT9CONTRACT.md. Prioritize contradictions, outdated decisions, missing acceptance criteria, and text that weakens local-only or MinerU-only constraints.
|
||||
|
||||
Fixed decisions: Python 3.12, uv, direct local MinerU 3.1.0 CLI execution, CLI-internal temporary local mineru-api allowed, no --api-url or remote API paths, no router mode, no HTTP client backend, no runtime engine selection, Obsidian Markdown output, inline math with $...$, display math with $$...$$, metadata JSON, and human-readable .report.md output.
|
||||
|
||||
Do not implement converter code. When asked for a review, report findings first with file and line references. When asked to edit, keep wording changes surgical and update PLAN.md or PROGRESS.md if the coordination state changes.
|
||||
"""
|
||||
@@ -0,0 +1,16 @@
|
||||
name = "research-agent"
|
||||
description = "Researches MinerU 3.1.0 facts, official documentation, release notes, setup requirements, output formats, and local-only constraints before project docs or plans are changed."
|
||||
model = "gpt-5.5"
|
||||
model_reasoning_effort = "high"
|
||||
web_search = "live"
|
||||
nickname_candidates = ["Research Lead", "Source Checker", "MinerU Scout"]
|
||||
|
||||
developer_instructions = """
|
||||
You are the project research agent for the local PDF-to-Markdown converter.
|
||||
|
||||
Always read PLAN.md and PROGRESS.md before working. Use PROGRESS.md as the factual state. For v1 implementation research, read docs/V1IMPLEMENTATIONPLAN.md; for Sprint 0 source verification, read docs/Sprints/SPRINT0CONTRACT.md. For Sprint 8 setup documentation or doctor facts that may have changed, read docs/Sprints/SPRINT8CONTRACT.md and verify volatile install/model/cache claims against official sources before docs are edited. Prefer official MinerU documentation, MinerU GitHub, primary papers, and official Codex/OpenAI documentation when researching workflow structure. Cite URLs and access dates in any research notes.
|
||||
|
||||
Keep MinerU 3.1.0 as the only conversion engine. Do not reintroduce candidate engine comparisons. Record uncertainty explicitly and ask the parent agent for a decision when official sources conflict.
|
||||
|
||||
Do not implement converter code. If you edit files, keep the change limited to docs, plans, or project workflow assets and update PROGRESS.md with enough context for the next agent.
|
||||
"""
|
||||
@@ -0,0 +1,28 @@
|
||||
---
|
||||
description: Plan the direct local MinerU CLI adapter and failure/reporting behavior
|
||||
argument-hint: [integration-scope]
|
||||
allowed-tools: [Read, Glob, Grep, Bash, WebFetch, Edit]
|
||||
---
|
||||
|
||||
# /plan-mineru-integration
|
||||
|
||||
Plan the future implementation shape for the MinerU adapter without writing converter code.
|
||||
|
||||
## Arguments
|
||||
|
||||
The user invoked this command with: $ARGUMENTS
|
||||
|
||||
## Workflow
|
||||
|
||||
1. Read `PLAN.md`, `PROGRESS.md`, `PRD.md`, and `ARCHITECTURE.md`.
|
||||
2. Verify any MinerU CLI facts that may have changed before changing docs.
|
||||
3. Define the smallest adapter contract for command construction, working directories, outputs, stdout/stderr capture, exit handling, warnings, and provenance.
|
||||
4. Ensure failure behavior is explicit: no silent fallback and no alternate engine route.
|
||||
5. Identify mocked-output tests and optional MinerU-dependent checks.
|
||||
6. Update `PLAN.md` only if implementation sequencing changes; update `PROGRESS.md` after the planning work.
|
||||
|
||||
## Guardrails
|
||||
|
||||
- Do not implement program code during planning.
|
||||
- Do not introduce runtime engine selection or cloud-compatible endpoints.
|
||||
- Keep GPU limitations and CPU messaging explicit for GTX 1070 Ti 8GB.
|
||||
@@ -0,0 +1,27 @@
|
||||
---
|
||||
description: Plan fixture-based quality checks and conversion report requirements
|
||||
argument-hint: [sample-or-quality-scope]
|
||||
allowed-tools: [Read, Glob, Grep, Bash, Edit]
|
||||
---
|
||||
|
||||
# /plan-quality-evaluation
|
||||
|
||||
Plan local fixture evaluation and report requirements for math-heavy PDF conversion.
|
||||
|
||||
## Arguments
|
||||
|
||||
The user invoked this command with: $ARGUMENTS
|
||||
|
||||
## Workflow
|
||||
|
||||
1. Read `PLAN.md`, `PROGRESS.md`, `PRD.md`, and `ARCHITECTURE.md`.
|
||||
2. Inspect `samples/` only as local fixture context; do not stage or commit sample files.
|
||||
3. Define checks for page coverage, reading order, math renderability, delimiter normalization, table handling, asset links, metadata completeness, and warning counts.
|
||||
4. Define `.json` metadata and `.report.md` expectations from the same source data.
|
||||
5. Separate fast mocked checks from optional MinerU/model/GPU-dependent checks.
|
||||
6. Update `PROGRESS.md` with the planned coverage and remaining sample gaps.
|
||||
|
||||
## Guardrails
|
||||
|
||||
- Do not copy sample PDFs into tracked files.
|
||||
- Do not require GPU or large model downloads for the default fast verification loop.
|
||||
@@ -0,0 +1,26 @@
|
||||
---
|
||||
description: Review core project documents for consistency with fixed decisions
|
||||
argument-hint: [scope]
|
||||
allowed-tools: [Read, Glob, Grep, Bash, Edit]
|
||||
---
|
||||
|
||||
# /review-project-docs
|
||||
|
||||
Review project documents for contradictions, stale decisions, and missing constraints.
|
||||
|
||||
## Arguments
|
||||
|
||||
The user invoked this command with: $ARGUMENTS
|
||||
|
||||
## Workflow
|
||||
|
||||
1. Read `PLAN.md` and `PROGRESS.md`.
|
||||
2. Read the requested document scope, defaulting to `AGENTS.md`, `PRD.md`, `ARCHITECTURE.md`, and `docs/KNOWLEDGEBASE.md`.
|
||||
3. Check for contradictions against fixed decisions: MinerU 3.1.0 only, local-only, direct CLI execution, CLI-internal temporary local `mineru-api` allowed, no `--api-url` or remote API path, Python 3.12, uv, Obsidian Markdown, metadata JSON, and `.report.md`.
|
||||
4. Report findings first with file and line references.
|
||||
5. If edits are requested, make only surgical documentation changes and update `PROGRESS.md`.
|
||||
|
||||
## Guardrails
|
||||
|
||||
- Do not add speculative features, alternate engines, web UI, cloud OCR, or manual review queues.
|
||||
- Do not rewrite unrelated prose while fixing one inconsistency.
|
||||
@@ -0,0 +1,29 @@
|
||||
---
|
||||
description: Research current MinerU 3.1.0 facts for local integration planning
|
||||
argument-hint: [research-question]
|
||||
allowed-tools: [Read, Glob, Grep, Bash, WebFetch, Edit]
|
||||
---
|
||||
|
||||
# /run-mineru-research
|
||||
|
||||
Research MinerU 3.1.0 facts that affect this project's documentation or future implementation.
|
||||
|
||||
## Arguments
|
||||
|
||||
The user invoked this command with: $ARGUMENTS
|
||||
|
||||
## Workflow
|
||||
|
||||
1. Read `PLAN.md`, `PROGRESS.md`, `ARCHITECTURE.md`, and `docs/KNOWLEDGEBASE.md`.
|
||||
2. Use official MinerU documentation, the MinerU GitHub repository, primary papers, and official dependency documentation.
|
||||
3. Verify facts that can change: install commands, supported Python/CUDA versions, CLI flags, output formats, model download behavior, and licenses.
|
||||
4. Record sources with URLs and access dates when updating docs.
|
||||
5. Keep findings scoped to MinerU 3.1.0; do not add candidate-engine comparisons.
|
||||
6. Update `PROGRESS.md` with what was verified and what remains uncertain.
|
||||
|
||||
## Guardrails
|
||||
|
||||
- Allow only direct `mineru` CLI execution and the CLI-internal temporary local `mineru-api` process.
|
||||
- Do not add cloud OCR, hosted LLM, `--api-url`, remote API, router, HTTP client backend, or remote OpenAI-compatible backend paths.
|
||||
- Do not turn research notes into implementation code.
|
||||
- If official sources conflict, stop and ask for a decision instead of guessing.
|
||||
@@ -0,0 +1,32 @@
|
||||
---
|
||||
description: Start a project task by loading shared plan and progress context
|
||||
argument-hint: [agent-or-task]
|
||||
allowed-tools: [Read, Glob, Grep, Bash, Edit]
|
||||
---
|
||||
|
||||
# /start-agent-work
|
||||
|
||||
Start work in this repository with the project coordination protocol.
|
||||
|
||||
## Arguments
|
||||
|
||||
The user invoked this command with: $ARGUMENTS
|
||||
|
||||
## Workflow
|
||||
|
||||
1. Read `PLAN.md` and `PROGRESS.md`.
|
||||
2. State the current goal, the next action, and any blocker that matters for the task.
|
||||
3. Read only the additional source documents needed for the requested work.
|
||||
4. If subagents are useful and the user explicitly asked for delegated agent work, choose the smallest set of `.codex/agents/*.toml` roles that covers the task.
|
||||
5. For substantial implementation work, use the harness sequence: `harness-planner-agent` drafts the plan and contract, `feature-generator-agent` implements one agreed chunk, and `evaluation-agent` reviews the contract and completed work.
|
||||
6. Do not implement converter code unless the user explicitly requests implementation.
|
||||
7. After meaningful changes, update `PROGRESS.md`; update `PLAN.md` only when sequencing, decisions, ownership, or blockers change.
|
||||
8. Run the smallest useful verification, check git status, and commit project changes while excluding `samples/`.
|
||||
|
||||
## Guardrails
|
||||
|
||||
- Keep MinerU 3.1.0 as the only conversion engine.
|
||||
- Allow MinerU 3.1.0's CLI-internal temporary local `mineru-api`, but prohibit `--api-url`, remote APIs, router mode, HTTP client backends, and remote OpenAI-compatible backends.
|
||||
- Keep runtime processing local-only.
|
||||
- Keep `samples/` out of commits unless the user explicitly requests otherwise.
|
||||
- Prefer official sources for changing facts about Codex, MinerU, Python, uv, CUDA, or licenses.
|
||||
@@ -0,0 +1,8 @@
|
||||
[features]
|
||||
multi_agent = true
|
||||
codex_hooks = true
|
||||
|
||||
[agents]
|
||||
max_threads = 8
|
||||
max_depth = 1
|
||||
job_max_runtime_seconds = 3600
|
||||
@@ -0,0 +1,53 @@
|
||||
{
|
||||
"hooks": {
|
||||
"SessionStart": [
|
||||
{
|
||||
"matcher": "startup|resume|clear",
|
||||
"hooks": [
|
||||
{
|
||||
"type": "command",
|
||||
"command": "python \"$(git rev-parse --show-toplevel)/.codex/hooks/session_start_context.py\"",
|
||||
"timeout": 10,
|
||||
"statusMessage": "Loading project plan context"
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"PreToolUse": [
|
||||
{
|
||||
"matcher": "^Bash$",
|
||||
"hooks": [
|
||||
{
|
||||
"type": "command",
|
||||
"command": "python \"$(git rev-parse --show-toplevel)/.codex/hooks/pre_tool_policy.py\"",
|
||||
"timeout": 10,
|
||||
"statusMessage": "Checking project shell policy"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": "^apply_patch$|Edit|Write",
|
||||
"hooks": [
|
||||
{
|
||||
"type": "command",
|
||||
"command": "python \"$(git rev-parse --show-toplevel)/.codex/hooks/pre_tool_policy.py\"",
|
||||
"timeout": 10,
|
||||
"statusMessage": "Checking project edit policy"
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"Stop": [
|
||||
{
|
||||
"hooks": [
|
||||
{
|
||||
"type": "command",
|
||||
"command": "python \"$(git rev-parse --show-toplevel)/.codex/hooks/stop_workspace_check.py\"",
|
||||
"timeout": 10,
|
||||
"statusMessage": "Checking project completion state"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,168 @@
|
||||
"""Project guardrails for shell commands and apply_patch edits."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
# Phrases naming remote or alternate conversion paths that the project rules
# prohibit. All entries are lowercase.
# NOTE(review): presumably matched as substrings of lowercased text; the use
# site is not visible in this part of the file, so confirm there.
REMOTE_ENGINE_PATTERNS = [
    # MinerU remote/router/HTTP-client execution modes.
    "--api-url",
    "router mode",
    "http client mode",
    "http client backend",
    "http-client",
    # Generic remote endpoints and OpenAI-compatible backends.
    "remote api",
    "remote endpoint",
    "openai-compatible",
    "openai compatible",
    # Other named OCR/conversion services.
    "mathpix",
    "mistral ocr",
    "nanonets",
]
|
||||
|
||||
# Regular expressions that match `mineru-api` / `mineru-router` only when the
# name stands alone as a whitespace-delimited token (or sits at the very start
# or end of the text), so longer words containing these names do not match.
DIRECT_SERVER_COMMAND_PATTERNS = [
    r"(^|\s)mineru-api(\s|$)",
    r"(^|\s)mineru-router(\s|$)",
]
|
||||
|
||||
# Lowercase words and phrases that express a prohibition or exclusion.
# NOTE(review): presumably used to let text that *forbids* a remote-engine
# phrase pass the policy check; the use site is not visible here, so confirm.
ALLOWED_NEGATION_PATTERNS = [
    "do not",
    "never",
    "exclude",
    "excluded",
    "non-goal",
    "not use",
    "no cloud",
    "blocked",
    "prohibit",
    "prohibited",
    "forbid",
    "forbidden",
    "reject",
    "rejecting",
]
|
||||
|
||||
|
||||
def read_payload() -> dict:
    """Read the hook payload from stdin and return it as a dict.

    Returns an empty dict when stdin is empty or whitespace-only, when it is
    not valid JSON, or when the JSON document is not an object. Callers use
    ``payload.get(...)``, so a list, string, number, or null payload must not
    be passed through.
    """
    raw = sys.stdin.read()
    if not raw.strip():
        return {}
    try:
        payload = json.loads(raw)
    except json.JSONDecodeError:
        return {}
    # Valid JSON is not necessarily an object; keep the declared return type.
    return payload if isinstance(payload, dict) else {}
|
||||
|
||||
|
||||
def deny(reason: str) -> int:
    """Print a PreToolUse "deny" decision as one JSON line on stdout.

    Args:
        reason: Text placed in ``permissionDecisionReason``.

    Returns:
        Always 0; the denial is carried by the printed JSON, not the exit code.
    """
    decision = {
        "hookEventName": "PreToolUse",
        "permissionDecision": "deny",
        "permissionDecisionReason": reason,
    }
    print(json.dumps({"hookSpecificOutput": decision}, ensure_ascii=True))
    return 0
|
||||
|
||||
|
||||
def find_repo_root(cwd: str | None) -> Path:
    """Return the git top-level directory for ``cwd``.

    Args:
        cwd: Directory to start from; the process working directory is used
            when it is None or empty.

    Returns:
        The resolved path printed by ``git rev-parse --show-toplevel``, or the
        resolved starting directory when git cannot be run or fails.
    """
    start = Path(cwd or Path.cwd()).resolve()
    try:
        proc = subprocess.run(
            ["git", "rev-parse", "--show-toplevel"],
            cwd=start,
            capture_output=True,
            text=True,
            check=False,
        )
        if proc.returncode == 0:
            return Path(proc.stdout.strip()).resolve()
    except Exception:
        # Best effort: any failure falls back to the starting directory.
        pass
    return start
|
||||
|
||||
|
||||
def samples_are_untracked(root: Path) -> bool:
    """Report whether git lists untracked entries under ``samples``.

    Runs ``git status --porcelain -- samples`` in ``root`` and looks for lines
    with the ``??`` status. Returns False when git cannot be run or fails.
    """
    try:
        status = subprocess.run(
            ["git", "status", "--porcelain", "--", "samples"],
            cwd=root,
            capture_output=True,
            text=True,
            check=True,
        )
    except Exception:
        return False

    for entry in status.stdout.splitlines():
        if entry.startswith("?? "):
            return True
    return False
|
||||
|
||||
|
||||
def check_shell_command(command: str, root: Path) -> str | None:
    """Return a denial reason for a shell command, or None when it is allowed.

    Checks, in order: staging of samples/, blanket ``git add`` while samples/
    is untracked, destructive commands aimed at samples/ (plus
    ``git reset --hard``), direct MinerU server/router commands, and
    remote/API conversion markers.

    Args:
        command: Raw shell command text.
        root: Repository root used for the untracked-samples lookup.

    Returns:
        A human-readable reason string, or None if no rule matched.
    """
    # Lower-case and unify separators so one set of patterns covers both
    # POSIX and Windows-style spellings.
    normalized = command.replace("\\", "/").lower()

    # "samples" as a whole path component; quotes count as boundaries so
    # `rm -rf "samples"` is recognised too.
    samples_path = r"(?:^|[\s/\"'])samples(?:[\s/\"']|$)"

    if re.search(r"\bgit\s+add\b.*" + samples_path, normalized):
        return "Do not stage samples/ unless the user explicitly requests it."

    stages_everything = re.search(r"\bgit\s+add\b", normalized) and re.search(
        r"(\s\.($|\s)|\s-a($|\s)|\s--all($|\s))",
        normalized,
    )
    # Test the regex result first so git is only spawned for blanket adds.
    if stages_everything and samples_are_untracked(root):
        return "Use path-specific git add commands; samples/ is untracked local fixture data."

    # Flags and the samples path are matched with lookaheads so they may
    # appear in either order (e.g. `Remove-Item samples -Recurse`). Flags are
    # anchored on whitespace rather than `\b`: "-" is not a word character, so
    # `\b-f` never matches a flag that follows a space.
    targets_samples = r"(?=.*" + samples_path + r")"
    destructive_samples = [
        r"\bgit\s+clean\b(?=.*\s(?:-[a-z]*f[a-z]*|--force)(?=\s|$))" + targets_samples,
        r"\brm(?=\s)(?=.*\s(?:-[a-z]*r[a-z]*|--recursive)(?=\s|$))" + targets_samples,
        r"\bremove-item\b(?=.*\s-r[a-z]*(?=\s|$))" + targets_samples,
        r"\bgit\s+reset\s+--hard\b",
    ]
    if any(re.search(pattern, normalized) for pattern in destructive_samples):
        return "Destructive workspace or samples/ command blocked by project policy."

    if any(re.search(pattern, normalized) for pattern in DIRECT_SERVER_COMMAND_PATTERNS):
        return "Direct MinerU server/router commands are blocked; use the mineru CLI. CLI-internal temporary local mineru-api is allowed."

    for pattern in REMOTE_ENGINE_PATTERNS:
        if pattern in normalized:
            return "Remote/API conversion paths are blocked; v1 must run MinerU 3.1.0 through the local CLI only."

    return None
|
||||
|
||||
|
||||
def check_patch(command: str) -> str | None:
    """Scan the added lines of a patch for out-of-scope content.

    Only lines starting with "+" (excluding "+++" file headers) are examined.
    An added line containing any ALLOWED_NEGATION_PATTERNS phrase is skipped,
    since it states a prohibition rather than introducing the behavior.

    Returns:
        A denial reason for the first offending line, or None if none is found.
    """
    for raw_line in command.splitlines():
        is_addition = raw_line.startswith("+") and not raw_line.startswith("+++")
        if not is_addition:
            continue

        text = raw_line[1:].strip().lower()

        negated = False
        for phrase in ALLOWED_NEGATION_PATTERNS:
            if phrase in text:
                negated = True
                break
        if negated:
            continue

        for marker in REMOTE_ENGINE_PATTERNS:
            if marker in text:
                return "Patch appears to add remote/API conversion behavior or excluded engine references."

        if "runtime engine" in text:
            if "selection" in text or "switch" in text:
                return "Runtime engine selection is out of scope for v1."
    return None
|
||||
|
||||
|
||||
def main() -> int:
    """Apply the policy check that matches the payload's tool.

    Reads the payload from stdin, sends Bash commands to check_shell_command
    and apply_patch/Edit/Write input to check_patch, and prints a deny decision
    when a check returns a reason. Always returns 0.
    """
    payload = read_payload()
    tool_name = payload.get("tool_name", "")
    tool_input = payload.get("tool_input") or {}
    # NOTE(review): only "command" and "patch" are read here. If Edit/Write
    # payloads carry their text under other keys, check_patch receives an
    # empty string for them - confirm against the hook payload schema.
    command = str(tool_input.get("command") or tool_input.get("patch") or "")
    root = find_repo_root(payload.get("cwd"))

    reason = None
    if tool_name == "Bash":
        reason = check_shell_command(command, root)
    elif tool_name in {"apply_patch", "Edit", "Write"}:
        reason = check_patch(command)

    return deny(reason) if reason else 0


if __name__ == "__main__":
    raise SystemExit(main())
|
||||
@@ -0,0 +1,63 @@
|
||||
"""Inject the project coordination reminder at Codex session start."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def read_payload() -> dict:
    """Read the hook payload from stdin and return it as a dict.

    Returns an empty dict when stdin is empty or whitespace-only, when it is
    not valid JSON, or when the JSON document is not an object. The caller
    uses ``payload.get(...)``, so non-object JSON must not be passed through.
    """
    raw = sys.stdin.read()
    if not raw.strip():
        return {}
    try:
        payload = json.loads(raw)
    except json.JSONDecodeError:
        return {}
    # Valid JSON is not necessarily an object; keep the declared return type.
    return payload if isinstance(payload, dict) else {}
|
||||
|
||||
|
||||
def find_repo_root(cwd: str | None) -> Path:
    """Return the git top-level directory for ``cwd``.

    Uses the process working directory when ``cwd`` is None or empty. Returns
    the resolved starting directory when git cannot be run or fails.
    """
    start = Path(cwd or Path.cwd()).resolve()
    try:
        proc = subprocess.run(
            ["git", "rev-parse", "--show-toplevel"],
            cwd=start,
            capture_output=True,
            text=True,
            check=False,
        )
        if proc.returncode == 0:
            return Path(proc.stdout.strip()).resolve()
    except Exception:
        # Best effort: any failure falls back to the starting directory.
        pass
    return start
|
||||
|
||||
|
||||
def main() -> int:
    """Print the SessionStart hook response with the coordination reminder.

    The reminder is always sent as ``additionalContext``. A ``systemMessage``
    is added when PLAN.md or PROGRESS.md is missing from the repository root.
    Always returns 0.
    """
    payload = read_payload()
    root = find_repo_root(payload.get("cwd"))

    missing = []
    for name in ("PLAN.md", "PROGRESS.md"):
        if not (root / name).exists():
            missing.append(name)

    context = (
        "Before starting work in this repository, read PLAN.md and PROGRESS.md. "
        "Use PROGRESS.md as the factual state, update PLAN.md when sequencing changes, "
        "and keep samples/ out of commits unless the user explicitly requests otherwise."
    )

    hook_output = {
        "hookEventName": "SessionStart",
        "additionalContext": context,
    }
    output = {"continue": True, "hookSpecificOutput": hook_output}
    if missing:
        output["systemMessage"] = "Missing project coordination file(s): " + ", ".join(missing)

    print(json.dumps(output, ensure_ascii=True))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
||||
@@ -0,0 +1,85 @@
|
||||
"""Remind agents to verify and commit completed project-file changes."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
# Path prefixes (relative to the repository root) whose changes trigger the
# verify-and-commit reminder. Passed directly to str.startswith as a tuple.
PROJECT_PREFIXES = (
    # Directories.
    ".codex/",
    # Top-level coordination and design documents.
    "AGENTS.md",
    "ARCHITECTURE.md",
    "PLAN.md",
    "PRD.md",
    "PROGRESS.md",
    # Directories.
    "docs/",
)
|
||||
|
||||
|
||||
def read_payload() -> dict:
    """Read the hook payload from stdin and return it as a dict.

    Returns an empty dict when stdin is empty or whitespace-only, when it is
    not valid JSON, or when the JSON document is not an object. The caller
    uses ``payload.get(...)``, so non-object JSON must not be passed through.
    """
    raw = sys.stdin.read()
    if not raw.strip():
        return {}
    try:
        payload = json.loads(raw)
    except json.JSONDecodeError:
        return {}
    # Valid JSON is not necessarily an object; keep the declared return type.
    return payload if isinstance(payload, dict) else {}
|
||||
|
||||
|
||||
def find_repo_root(cwd: str | None) -> Path:
    """Return the git top-level directory for ``cwd``.

    Uses the process working directory when ``cwd`` is None or empty. Returns
    the resolved starting directory when git cannot be run or fails.
    """
    start = Path(cwd or Path.cwd()).resolve()
    try:
        proc = subprocess.run(
            ["git", "rev-parse", "--show-toplevel"],
            cwd=start,
            capture_output=True,
            text=True,
            check=False,
        )
        if proc.returncode == 0:
            return Path(proc.stdout.strip()).resolve()
    except Exception:
        # Best effort: any failure falls back to the starting directory.
        pass
    return start
|
||||
|
||||
|
||||
def project_changes(root: Path) -> list[str]:
    """List changed paths that fall under PROJECT_PREFIXES.

    Runs ``git status --short`` in ``root``. Paths under samples/ are ignored.
    Both sides of a rename/copy entry are checked, and the double quotes git
    puts around paths with special or non-ASCII characters are removed before
    matching, so such files are not silently missed.

    Returns:
        Matching paths as printed by git (quoted paths keep git's escape
        sequences), or an empty list when git cannot be run or fails.
    """
    try:
        result = subprocess.run(
            ["git", "status", "--short"],
            cwd=root,
            capture_output=True,
            text=True,
            check=True,
        )
    except Exception:
        return []

    paths: list[str] = []
    for line in result.stdout.splitlines():
        for path in _status_paths(line):
            if path.startswith("samples/"):
                continue
            if path.startswith(PROJECT_PREFIXES):
                paths.append(path)
    return paths


def _status_paths(line: str) -> list[str]:
    """Return the path(s) named by one ``git status --short`` line."""
    # The first two columns are the status code, followed by one space.
    status, entry = line[:2], line[3:]
    # Renames and copies are printed as "old -> new"; either side may be a
    # project file, so both are returned.
    parts = entry.split(" -> ") if ("R" in status or "C" in status) else [entry]
    return [part.strip().strip('"').replace("\\", "/") for part in parts]
|
||||
|
||||
|
||||
def main() -> int:
    """Print a verify-and-commit reminder when project files have changed.

    Prints nothing when project_changes reports no changes. Always returns 0.
    """
    payload = read_payload()
    root = find_repo_root(payload.get("cwd"))

    if project_changes(root):
        message = (
            "Project workflow/docs changed. Before finishing, run focused verification, "
            "commit the completed change, and keep samples/ out of the commit."
        )
        response = {"continue": True, "systemMessage": message}
        print(json.dumps(response, ensure_ascii=True))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
||||
@@ -0,0 +1,30 @@
|
||||
---
|
||||
name: fixture-evaluation
|
||||
description: Plan local fixture-based quality checks for this MinerU PDF-to-Markdown converter using samples/ without committing sample PDFs. Use when Codex needs to define sample coverage, quality metrics, regression checks, JSON metadata assertions, or human-readable .report.md expectations.
|
||||
---
|
||||
|
||||
# Fixture Evaluation
|
||||
|
||||
## Overview
|
||||
|
||||
Use this skill to turn local sample PDFs into a small, repeatable quality plan. Keep samples local and untracked unless the user explicitly asks to commit them.
|
||||
|
||||
## Workflow
|
||||
|
||||
1. Read `PLAN.md` and `PROGRESS.md` first.
|
||||
2. Inspect `samples/` only enough to understand fixture categories and filenames.
|
||||
3. Map each fixture to risks: math, tables, multi-column reading order, figures/assets, Korean filenames, and metadata coverage.
|
||||
4. Separate fast checks using mocked MinerU outputs from optional checks that require MinerU models, GPU, or long execution.
|
||||
5. Define metrics for both JSON metadata and `<stem>.report.md`.
|
||||
6. Update `PROGRESS.md` with fixture coverage and gaps.
|
||||
|
||||
## Guardrails
|
||||
|
||||
- Do not commit sample PDFs.
|
||||
- Do not copy samples into tracked fixtures without explicit user permission.
|
||||
- Do not make GPU/model-dependent checks mandatory for the default fast loop.
|
||||
- Do not grade only plain-text edit distance; include math, tables, reading order, assets, metadata, and renderability.
|
||||
|
||||
## Reference
|
||||
|
||||
Read `references/evaluation-metrics.md` when defining fixture coverage, regression criteria, or report fields.
|
||||
@@ -0,0 +1,4 @@
|
||||
interface:
|
||||
display_name: "Fixture Evaluation"
|
||||
short_description: "Plan fixture quality checks locally"
|
||||
default_prompt: "Use $fixture-evaluation to plan sample coverage, quality metrics, regression checks, and report expectations without committing sample files."
|
||||
@@ -0,0 +1,37 @@
|
||||
# Evaluation Metrics
|
||||
|
||||
Use these metrics for local fixture plans and future tests.
|
||||
|
||||
## Fixture Categories
|
||||
|
||||
- Simple digital PDF with text layer.
|
||||
- Math-heavy paper or chapter.
|
||||
- Multi-column paper.
|
||||
- Table with formulas.
|
||||
- Figure with caption and asset extraction.
|
||||
- Korean filename/path handling.
|
||||
|
||||
## Fast Checks
|
||||
|
||||
- Output files are planned at deterministic paths.
|
||||
- Metadata JSON includes source PDF, page count, engine, warnings, and output paths.
|
||||
- `.report.md` can be generated from metadata without re-running MinerU.
|
||||
- Markdown math delimiter normalization is deterministic.
|
||||
- Asset links resolve relative to the Markdown file.
|
||||
|
||||
## Optional MinerU Checks
|
||||
|
||||
- MinerU CLI execution succeeds or produces a clear failure warning.
|
||||
- Page coverage equals source PDF page count.
|
||||
- Math renderability failures are counted.
|
||||
- Table degradation warnings are counted.
|
||||
- Reading-order uncertainty is surfaced.
|
||||
|
||||
## Report Sections
|
||||
|
||||
- Summary: source file, pages, output files, engine, start/end time.
|
||||
- Warnings: grouped by severity and code.
|
||||
- Math: counts for inline, display, low-confidence, and render failures.
|
||||
- Assets: extracted, missing, broken links.
|
||||
- Tables: extracted, degraded, fallback count.
|
||||
- Environment: Python, uv, MinerU version, GPU visibility when available.
|
||||
@@ -0,0 +1,31 @@
|
||||
---
|
||||
name: math-markdown-review
|
||||
description: Review and design Obsidian-friendly Markdown normalization for math-heavy PDF conversion, including LaTeX delimiters, display math spacing, asset links, tables, and quality report warnings. Use when Codex needs to check Markdown output assumptions, design post-processing rules, or define renderability checks for formulas and assets.
|
||||
---
|
||||
|
||||
# Math Markdown Review
|
||||
|
||||
## Overview
|
||||
|
||||
Use this skill when Markdown output quality matters more than raw text extraction. The goal is best-effort automatic conversion with explicit warnings and provenance for failures.
|
||||
|
||||
## Workflow
|
||||
|
||||
1. Read `PLAN.md` and `PROGRESS.md` first.
|
||||
2. Read `PRD.md` and `ARCHITECTURE.md` when output behavior, metadata, or reporting is affected.
|
||||
3. Preserve project delimiter policy: inline math uses `$...$`; display math uses `$$...$$`.
|
||||
4. Check asset links, table fallback behavior, heading/list interactions, and page boundary markers against Obsidian rendering assumptions.
|
||||
5. Define warnings for low-confidence math, non-renderable LaTeX, broken asset links, table degradation, and reading-order uncertainty.
|
||||
6. Ensure `.report.md` content is derived from metadata, not separate manual state.
|
||||
|
||||
## Checks
|
||||
|
||||
- Inline math should not contain unescaped newlines or surrounding spaces that break rendering.
|
||||
- Display math should be separated from surrounding paragraphs by blank lines.
|
||||
- Asset paths should be stable, relative to the Markdown file, and safe for Obsidian vaults.
|
||||
- Tables with formulas should prefer readable Markdown when reliable and warn when downgraded.
|
||||
- Every renderability failure should be countable in metadata and visible in `.report.md`.
|
||||
|
||||
## Reference
|
||||
|
||||
Read `references/obsidian-output-checks.md` for concrete normalization and report-signal guidance.
|
||||
@@ -0,0 +1,4 @@
|
||||
interface:
|
||||
display_name: "Math Markdown Review"
|
||||
short_description: "Check Obsidian math Markdown output"
|
||||
default_prompt: "Use $math-markdown-review to design or check Obsidian-friendly Markdown normalization, math delimiters, asset paths, tables, and quality report signals."
|
||||
@@ -0,0 +1,31 @@
|
||||
# Obsidian Output Checks
|
||||
|
||||
Use these checks when designing or reviewing Markdown output.
|
||||
|
||||
## Math
|
||||
|
||||
- Inline math: `$...$`, no line breaks inside the delimiter pair.
|
||||
- Display math: `$$...$$`, with blank lines before and after the block.
|
||||
- Preserve source provenance for formulas: page index, bbox if available, engine, confidence, and warning codes.
|
||||
- Record render failures separately from extraction confidence.
|
||||
- Avoid rewriting LaTeX semantics unless the rule is deterministic and tested.
|
||||
|
||||
## Assets
|
||||
|
||||
- Store images under a deterministic asset directory next to the Markdown output.
|
||||
- Use relative Markdown links that remain valid when the output directory is moved as a unit.
|
||||
- Record asset source page, bbox if available, generated file path, and missing-link warnings.
|
||||
|
||||
## Tables
|
||||
|
||||
- Prefer Markdown tables only when cell boundaries and reading order are reliable.
|
||||
- If formulas or merged cells make Markdown tables misleading, use a readable fallback and emit a table warning.
|
||||
- Keep table warnings visible in both JSON metadata and `.report.md`.
|
||||
|
||||
## Report Signals
|
||||
|
||||
- Total pages processed and pages with warnings.
|
||||
- Math block count, inline math count, and non-renderable math count.
|
||||
- Broken asset links and missing assets.
|
||||
- Table degradation count.
|
||||
- Reading-order uncertainty count.
|
||||
@@ -0,0 +1,32 @@
|
||||
---
|
||||
name: mineru-research
|
||||
description: Research MinerU 3.1.0 setup, CLI behavior, output formats, model/runtime requirements, licensing, and local-only integration constraints for this PDF-to-Markdown project. Use when Codex needs to update project knowledge, verify MinerU facts, plan the MinerU adapter, or resolve uncertainty about installation, execution, or output behavior without adding alternate engines.
|
||||
---
|
||||
|
||||
# MinerU Research
|
||||
|
||||
## Overview
|
||||
|
||||
Use this skill to verify MinerU 3.1.0 facts before changing project docs or plans. Keep the scope narrow: MinerU 3.1.0 is the only conversion engine and direct local CLI execution is the only v1 execution mode.
|
||||
|
||||
## Workflow
|
||||
|
||||
1. Read `PLAN.md` and `PROGRESS.md` first.
|
||||
2. Read `PRD.md`, `ARCHITECTURE.md`, and `docs/KNOWLEDGEBASE.md` when the change affects product or architecture decisions.
|
||||
3. Prefer official MinerU documentation, the MinerU GitHub repository, release notes, primary papers, and official dependency docs.
|
||||
4. Verify time-sensitive facts with web research before updating docs.
|
||||
5. Record source URLs and access dates in durable docs when the finding affects future implementation.
|
||||
6. Update `PROGRESS.md` with the verified fact, unresolved uncertainty, and next action.
|
||||
|
||||
## Constraints
|
||||
|
||||
- Do not reintroduce candidate engine comparisons.
|
||||
- Allow only direct `mineru` CLI execution and the CLI-internal temporary local `mineru-api` process.
|
||||
- Do not add cloud OCR, remote LLM, `--api-url`, remote API, router, HTTP client backend, or remote OpenAI-compatible backend paths.
|
||||
- Do not imply perfect LaTeX reconstruction.
|
||||
- Do not implement converter code unless the user explicitly requests implementation.
|
||||
- Treat GTX 1070 Ti 8GB, Python 3.12, uv, and Windows PowerShell as active project constraints.
|
||||
|
||||
## Reference
|
||||
|
||||
Read `references/source-checklist.md` when planning a research pass or updating source-backed documentation.
|
||||
@@ -0,0 +1,4 @@
|
||||
interface:
|
||||
display_name: "MinerU Research"
|
||||
short_description: "Verify MinerU local integration facts"
|
||||
default_prompt: "Use $mineru-research to verify MinerU 3.1.0 setup, CLI behavior, outputs, licensing, and local-only integration constraints against official sources."
|
||||
@@ -0,0 +1,29 @@
|
||||
# MinerU Research Source Checklist
|
||||
|
||||
Use this checklist before changing project docs or plans based on MinerU facts.
|
||||
|
||||
## Sources
|
||||
|
||||
- MinerU GitHub repository for install instructions, CLI examples, output behavior, and license files.
|
||||
- MinerU official documentation for current setup and execution modes.
|
||||
- MinerU release notes or tags for version-specific changes.
|
||||
- Primary papers for model capability claims.
|
||||
- Official Python, uv, CUDA, PyTorch, or dependency docs for environment compatibility.
|
||||
|
||||
## Facts To Verify
|
||||
|
||||
- Supported Python versions and package manager expectations.
|
||||
- Whether MinerU 3.1.0 supports the required local CLI path on Windows.
|
||||
- Whether MinerU 3.1.0's CLI-internal temporary local `mineru-api` behavior stays local and avoids `--api-url`.
|
||||
- Required model download/cache behavior and offline reuse assumptions.
|
||||
- GPU/CPU execution options and expected memory pressure for GTX 1070 Ti 8GB.
|
||||
- Output directory structure, Markdown output, image asset output, JSON/intermediate output, and page/block metadata availability.
|
||||
- Exit codes, error messages, logging behavior, and partial-output behavior.
|
||||
- License obligations for MinerU, bundled models, and transitive runtime packages.
|
||||
|
||||
## Recording Rules
|
||||
|
||||
- Record source URL and access date for durable claims.
|
||||
- Distinguish official fact from inference.
|
||||
- Keep alternate engine names out of project docs unless the user explicitly asks for a separate historical note.
|
||||
- If a source conflicts with a fixed product decision, record the conflict and ask for a user decision.
|
||||
@@ -0,0 +1,6 @@
|
||||
.venv/
|
||||
.pytest_cache/
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
outputs/
|
||||
node_modules/
|
||||
@@ -0,0 +1,229 @@
|
||||
# AGENTS.md
|
||||
|
||||
This file gives implementation instructions for coding agents working in this repository.
|
||||
|
||||
## Project Mission
|
||||
|
||||
Build a local-only PDF-to-Markdown converter for math-heavy digital PDFs. The converter must produce Obsidian-friendly Markdown and preserve enough metadata to debug formulas, reading order, tables, figures, and assets.
|
||||
|
||||
## Project Guidelines
|
||||
|
||||
Behavioral guidelines to reduce common LLM coding mistakes. Merge with project-specific instructions as needed.
|
||||
|
||||
**Tradeoff:** These guidelines bias toward caution over speed. For trivial tasks, use judgment.
|
||||
|
||||
### 1. Think Before Coding
|
||||
|
||||
**Don't assume. Don't hide confusion. Surface tradeoffs.**
|
||||
|
||||
Before implementing:
|
||||
- State your assumptions explicitly. If uncertain, ask.
|
||||
- If multiple interpretations exist, present them - don't pick silently.
|
||||
- If a simpler approach exists, say so. Push back when warranted.
|
||||
- If something is unclear, stop. Name what's confusing. Ask.
|
||||
|
||||
### 2. Simplicity First
|
||||
|
||||
**Minimum code that solves the problem. Nothing speculative.**
|
||||
|
||||
- No features beyond what was asked.
|
||||
- No abstractions for single-use code.
|
||||
- No "flexibility" or "configurability" that wasn't requested.
|
||||
- No error handling for impossible scenarios.
|
||||
- If you write 200 lines and it could be 50, rewrite it.
|
||||
|
||||
Ask yourself: "Would a senior engineer say this is overcomplicated?" If yes, simplify.
|
||||
|
||||
### 3. Surgical Changes
|
||||
|
||||
**Touch only what you must. Clean up only your own mess.**
|
||||
|
||||
When editing existing code:
|
||||
- Don't "improve" adjacent code, comments, or formatting.
|
||||
- Don't refactor things that aren't broken.
|
||||
- Match existing style, even if you'd do it differently.
|
||||
- If you notice unrelated dead code, mention it - don't delete it.
|
||||
|
||||
When your changes create orphans:
|
||||
- Remove imports/variables/functions that YOUR changes made unused.
|
||||
- Don't remove pre-existing dead code unless asked.
|
||||
|
||||
The test: Every changed line should trace directly to the user's request.
|
||||
|
||||
### 4. Goal-Driven Execution
|
||||
|
||||
**Define success criteria. Loop until verified.**
|
||||
|
||||
Transform tasks into verifiable goals:
|
||||
- "Add validation" -> "Write tests for invalid inputs, then make them pass"
|
||||
- "Fix the bug" -> "Write a test that reproduces it, then make it pass"
|
||||
- "Refactor X" -> "Ensure tests pass before and after"
|
||||
|
||||
For multi-step tasks, state a brief plan:
|
||||
```
|
||||
1. [Step] -> verify: [check]
|
||||
2. [Step] -> verify: [check]
|
||||
3. [Step] -> verify: [check]
|
||||
```
|
||||
|
||||
Strong success criteria let you loop independently. Weak criteria ("make it work") require constant clarification.
|
||||
|
||||
---
|
||||
|
||||
**These guidelines are working if:** fewer unnecessary changes in diffs, fewer rewrites due to overcomplication, and clarifying questions come before implementation rather than after mistakes.
|
||||
|
||||
## Source Documents
|
||||
|
||||
- `PLAN.md`: shared plan, planned work, open questions, and ownership for agents.
|
||||
- `PROGRESS.md`: completed work, current status, blockers, and next actions for agents.
|
||||
- `PRD.md`: product requirements, user scope, CLI/API requirements, acceptance criteria.
|
||||
- `ARCHITECTURE.md`: system layers, MinerU adapter contract, intermediate representation, metadata schema, and local-only enforcement.
|
||||
- `docs/KNOWLEDGEBASE.md`: research basis and implementation background.
|
||||
- `docs/V1IMPLEMENTATIONPLAN.md`: v1 implementation sequence, sprint contracts, verification gates, and agent ownership.
|
||||
- `docs/Sprints/*.md`: active and historical sprint contracts.
|
||||
- `.codex/agents/*.toml`: project-scoped custom subagent roles.
|
||||
- `.codex/commands/*.md`: reusable project prompt commands.
|
||||
- `.codex/skills/*/SKILL.md`: project-specific Codex skills.
|
||||
- `.codex/hooks.json` and `.codex/hooks/*.py`: project hook configuration and deterministic hook scripts.
|
||||
|
||||
## Startup Workflow
|
||||
|
||||
At the start of every task:
|
||||
|
||||
- Read `PLAN.md` and `PROGRESS.md` before deciding what to do.
|
||||
- Read only the other source documents needed for the task.
|
||||
- Use `.codex/agents`, `.codex/commands`, and `.codex/skills` when the user explicitly asks for agent delegation, reusable workflows, or specialized project guidance.
|
||||
- State the relevant current goal, next action, and blocker if one exists.
|
||||
- If `PLAN.md` and `PROGRESS.md` conflict, trust `PROGRESS.md` for what has happened and update `PLAN.md` when making the next change.
|
||||
|
||||
## Progress Tracking
|
||||
|
||||
Use `PLAN.md` and `PROGRESS.md` to coordinate work across agents.
|
||||
|
||||
- Update `PLAN.md` when planned work, ownership, sequencing, open questions, or decisions change.
|
||||
- Update `PROGRESS.md` after meaningful work, verification, blockers, or next actions change.
|
||||
- Keep entries short and factual.
|
||||
- Do not use these files as scratchpads or long research notes.
|
||||
- Do not mark work complete until it has been verified.
|
||||
- When multiple agents work in parallel, each agent must leave enough context in `PROGRESS.md` for the next agent to resume without guessing.
|
||||
|
||||
## Long-Running Harness Workflow
|
||||
|
||||
For substantial implementation work, follow the planner/generator/evaluator pattern from Anthropic's long-running harness design article: https://www.anthropic.com/engineering/harness-design-long-running-apps.
|
||||
|
||||
Use the harness only when task complexity justifies the overhead. For small documentation edits or narrow fixes, a single agent with focused verification is preferred.
|
||||
|
||||
Harness roles:
|
||||
|
||||
- `harness-planner-agent`: expands a brief request into product context, high-level technical direction, non-goals, risks, and a sequence of small contracts.
|
||||
- `feature-generator-agent`: implements one agreed contract at a time after implementation has been explicitly requested.
|
||||
- `evaluation-agent`: independently reviews proposed contracts and completed work. It must be skeptical, specific, and willing to fail work that is incomplete, stubbed, unverified, or below threshold.
|
||||
|
||||
Before each implementation chunk:
|
||||
|
||||
- Write or update a concise sprint contract before code changes start.
|
||||
- Include objective, touched surfaces, expected outputs, non-goals, verification steps, hard failure criteria, and handoff fields.
|
||||
- Let `evaluation-agent` review the contract before `feature-generator-agent` implements it.
|
||||
- Avoid over-specifying low-level implementation before the responsible agent has inspected the code.
|
||||
|
||||
After each implementation chunk:
|
||||
|
||||
- `feature-generator-agent` runs a self-check but does not approve its own work.
|
||||
- `evaluation-agent` performs independent checks against the contract and reports actionable findings.
|
||||
- If the chunk fails, feed the evaluator's findings back into the next generator pass.
|
||||
- Update `PROGRESS.md` with completed work, checks run, residual risks, and the next concrete action.
|
||||
|
||||
When context becomes too large or a task spans sessions, prefer a clean structured handoff over relying only on conversation history. The handoff must include current state, decisions made, files touched, checks run, known failures, and the next action.
|
||||
|
||||
Periodically re-evaluate the harness itself. Remove roles, contracts, or checks that are not load-bearing, and add structure only when it improves correctness, scope control, or verification quality.
|
||||
|
||||
## Fixed Product Decisions
|
||||
|
||||
- Language: Python.
|
||||
- Workflow: `uv`.
|
||||
- Interface: CLI plus Python library.
|
||||
- Default CLI name: `pdf2md`.
|
||||
- Runtime policy: local-only. Do not add cloud OCR, remote LLM, or external document upload paths.
|
||||
- Default output: Obsidian-friendly Markdown.
|
||||
- Inline math: `$...$`.
|
||||
- Display math: `$$...$$`.
|
||||
- Conversion engine: MinerU 3.1.0.
|
||||
- Hardware target: NVIDIA GPU.
|
||||
- Input priority: digital PDFs with text layers.
|
||||
- Quality workflow: fully automatic. Log warnings and continue when possible.
|
||||
- MinerU execution: direct local `mineru` CLI only. MinerU 3.1.0 may launch a temporary local `mineru-api` internally when CLI runs without `--api-url`.
|
||||
- Quality report: write both metadata JSON and `<stem>.report.md`.
|
||||
- v1 use case: personal/research. MinerU and transitive model/package licenses must be documented before redistribution.
|
||||
|
||||
## Architecture Guidance
|
||||
|
||||
Follow `ARCHITECTURE.md` for implementation structure. Do not duplicate architecture decisions in code comments or docs unless the new text points back to that file.
|
||||
|
||||
Key implementation constraints:
|
||||
|
||||
- Keep MinerU-specific objects behind the MinerU adapter.
|
||||
- Keep public CLI/library contracts stable and project-owned.
|
||||
- Keep Obsidian Markdown normalization separate from MinerU execution.
|
||||
- Keep metadata and warning generation structured.
|
||||
- Keep quality report generation derived from metadata and local checks.
|
||||
- Do not add runtime engine selection in v1.
|
||||
|
||||
## Local-Only Requirements
|
||||
|
||||
Never add runtime dependencies that upload PDFs, page images, or extracted text to remote services. Follow the strict-local enforcement rules in `ARCHITECTURE.md`.
|
||||
|
||||
Allowed in v1: direct `mineru` CLI execution and the CLI-internal temporary local `mineru-api` process.
|
||||
|
||||
Do not pass `--api-url`, use remote APIs, router mode, HTTP client backends, or remote OpenAI-compatible inference endpoints in v1.
|
||||
|
||||
## CLI Behavior
|
||||
|
||||
Follow the CLI requirements in `PRD.md`. Do not add commands, flags, config files, or runtime engine selection unless the user explicitly asks for them.
|
||||
|
||||
Do not overwrite user files unless the requested behavior and `--overwrite` semantics allow it.
|
||||
|
||||
## Testing Guidance
|
||||
|
||||
Add tests in proportion to behavior risk.
|
||||
|
||||
Required early tests:
|
||||
|
||||
- Math delimiter normalization.
|
||||
- Display math spacing.
|
||||
- Asset path normalization.
|
||||
- Metadata schema creation.
|
||||
- Warning aggregation.
|
||||
- CLI path planning and overwrite behavior.
|
||||
- MinerU adapter contract with mocked outputs.
|
||||
|
||||
Fixture PDFs should cover:
|
||||
|
||||
- Simple digital PDF.
|
||||
- Math-heavy academic PDF.
|
||||
- Multi-column paper.
|
||||
- Table with formulas.
|
||||
- Figure with caption.
|
||||
|
||||
Any test that depends on large local models should be optional or marked separately so normal CI/dev checks can run quickly.
|
||||
|
||||
## Git Workflow
|
||||
|
||||
After changing files:
|
||||
|
||||
- Run the smallest useful verification for the change.
|
||||
- Check `git status --short`.
|
||||
- Commit the completed change unless the user explicitly asks not to.
|
||||
- Do not include unrelated user edits in the commit.
|
||||
|
||||
## Documentation Guidance
|
||||
|
||||
Keep documentation explicit about:
|
||||
|
||||
- Local-only privacy behavior.
|
||||
- NVIDIA GPU expectations.
|
||||
- MinerU installation and model downloads.
|
||||
- Known limitations of automatic formula reconstruction.
|
||||
- Dependency licenses.
|
||||
- Obsidian output assumptions.
|
||||
|
||||
Do not imply perfect LaTeX conversion. The correct guarantee is best-effort automatic conversion with warnings and provenance.
|
||||
+243
@@ -0,0 +1,243 @@
|
||||
# Architecture: Local PDF-to-Markdown Converter
|
||||
|
||||
Last updated: 2026-05-07
|
||||
|
||||
## 1. Overview
|
||||
|
||||
The system converts math-heavy digital PDFs into Obsidian-friendly Markdown using MinerU 3.1.0 as the fixed local conversion engine. Product requirements live in `PRD.md`; agent workflow rules live in `AGENTS.md`; research notes live in `docs/KNOWLEDGEBASE.md`.
|
||||
|
||||
The architecture separates MinerU execution from project-owned normalization and metadata. This boundary exists only to isolate MinerU I/O; it is not a pluggable engine system.
|
||||
|
||||
## 2. System Layers
|
||||
|
||||
1. CLI/API layer
|
||||
- Parse command arguments and Python API parameters.
|
||||
- Discover input PDFs.
|
||||
- Plan output paths.
|
||||
- Enforce overwrite behavior.
|
||||
- Print conversion summaries.
|
||||
|
||||
2. MinerU adapter layer
|
||||
- Validate MinerU 3.1.0 installation and version.
|
||||
- Run MinerU through direct local CLI execution.
|
||||
- Capture raw Markdown, structured output, assets, logs, and exit status.
|
||||
- Enforce strict-local execution.
|
||||
|
||||
3. Intermediate representation layer
|
||||
- Convert MinerU-specific output into project-owned document/page/block objects.
|
||||
- Preserve page index, bbox, confidence, source engine, and asset references returned by MinerU.
|
||||
- Prevent raw MinerU objects from becoming public API return types.
|
||||
|
||||
4. Normalization layer
|
||||
- Convert project-owned objects and MinerU Markdown into Obsidian-friendly Markdown.
|
||||
- Normalize math delimiters, display math spacing, headings, tables, and asset links.
|
||||
|
||||
5. Quality and metadata layer
|
||||
- Run asset link checks and, when local tooling is available, math renderability checks.
|
||||
- Aggregate structured warnings.
|
||||
- Write metadata JSON, quality report Markdown, and optional raw MinerU diagnostics.
|
||||
|
||||
## 3. Conversion Pipeline
|
||||
|
||||
1. Input discovery
|
||||
- Accept a single PDF or a directory.
|
||||
- Require `--recursive` for subdirectory traversal.
|
||||
- Validate that each selected input is a local PDF.
|
||||
- Compute source SHA-256.
|
||||
|
||||
2. MinerU conversion
|
||||
- Create an isolated work directory per input PDF.
|
||||
- Run the MinerU 3.1.0 adapter through the direct `mineru` CLI.
|
||||
- Capture raw Markdown, raw JSON/structured output when available, extracted assets, warnings, and logs.
|
||||
|
||||
3. Intermediate representation
|
||||
- Build document/page/block records from MinerU output.
|
||||
- Preserve provenance data instead of relying only on final Markdown text.
|
||||
|
||||
4. Obsidian normalization
|
||||
- Normalize inline math to `$...$`.
|
||||
- Normalize display math to `$$...$$` blocks on separate lines.
|
||||
- Normalize image links to stable relative asset paths.
|
||||
- Normalize tables without destroying complex table structure.
|
||||
|
||||
5. Quality checks
|
||||
- Verify generated asset links.
|
||||
- Check math renderability when local tooling is available.
|
||||
- Emit warnings without stopping conversion unless no usable output can be produced.
|
||||
|
||||
6. Output writing
|
||||
- Write final Markdown.
|
||||
- Write extracted assets.
|
||||
- Write metadata JSON.
|
||||
- Write `<stem>.report.md`.
|
||||
- Keep raw MinerU output when requested.
|
||||
|
||||
## 4. MinerU Adapter Contract
|
||||
|
||||
The MinerU adapter exposes:
|
||||
|
||||
- `name`
|
||||
- `is_available()`
|
||||
- `version()`
|
||||
- `doctor()`
|
||||
- `convert(input_pdf, work_dir, options)`
|
||||
|
||||
Adapter conversion output contains:
|
||||
|
||||
- `raw_markdown`
|
||||
- `raw_structured`
|
||||
- `assets`
|
||||
- `pages`
|
||||
- `warnings`
|
||||
- `engine`
|
||||
- `engine_version`
|
||||
- `engine_options`
|
||||
- `exit_code`
|
||||
- `stderr`
|
||||
|
||||
The adapter must fail fast if it cannot run in strict-local mode. Runtime engine selection is not part of v1.
|
||||
|
||||
The default conversion device is `cuda:0`. Because MinerU 3.1.0 selects its local device through environment/config rather than a dedicated CLI GPU flag, the adapter must set the MinerU subprocess environment to request CUDA by default while keeping the command shape direct and local.
|
||||
|
||||
Allowed MinerU execution in v1:
|
||||
|
||||
- Direct local `mineru` CLI execution.
|
||||
- The temporary local `mineru-api` process that MinerU 3.1.0 starts internally when the CLI runs without `--api-url`.
|
||||
|
||||
Prohibited MinerU execution in v1:
|
||||
|
||||
- Passing `--api-url`.
|
||||
- Remote APIs.
|
||||
- Router mode.
|
||||
- HTTP client backends.
|
||||
- Remote OpenAI-compatible backends or inference endpoints.
|
||||
|
||||
## 5. Intermediate Representation
|
||||
|
||||
The project uses a small internal representation for normalization and metadata.
|
||||
|
||||
Required concepts:
|
||||
|
||||
- Document
|
||||
- Page
|
||||
- Block
|
||||
- Asset
|
||||
- Warning
|
||||
|
||||
Required block types:
|
||||
|
||||
- `heading`
|
||||
- `paragraph`
|
||||
- `inline_formula`
|
||||
- `display_formula`
|
||||
- `table`
|
||||
- `figure`
|
||||
- `caption`
|
||||
- `footnote`
|
||||
- `reference`
|
||||
- `unknown`
|
||||
|
||||
Record these page/block fields when MinerU returns them. Do not invent missing values.
|
||||
|
||||
- Page index.
|
||||
- Page dimensions.
|
||||
- Bounding boxes.
|
||||
- Confidence.
|
||||
- Source engine, fixed to MinerU 3.1.0 in v1.
|
||||
- Markdown character span.
|
||||
|
||||
## 6. Markdown Normalization
|
||||
|
||||
Final Markdown must prioritize Obsidian.
|
||||
|
||||
- Use `$...$` for inline math.
|
||||
- Use display math blocks with `$$` on their own lines.
|
||||
- Keep blank lines around display math.
|
||||
- Do not escape underscores or carets inside math unnecessarily.
|
||||
- Prefer Markdown tables for simple tables.
|
||||
- Use HTML tables for complex tables when Markdown would lose structure.
|
||||
- Store figures/images in a stable relative assets directory.
|
||||
- Do not add visible page separators in v1.
|
||||
- Preserve captions and references when MinerU provides them.
|
||||
|
||||
## 7. Metadata Schema
|
||||
|
||||
When metadata is enabled, write `<stem>.metadata.json`.
|
||||
|
||||
Required top-level fields:
|
||||
|
||||
- `source_pdf`
|
||||
- `source_sha256`
|
||||
- `created_at`
|
||||
- `engine`
|
||||
- `engine_version`
|
||||
- `engine_options`
|
||||
- `pages`
|
||||
- `assets`
|
||||
- `warnings`
|
||||
- `summary`
|
||||
|
||||
Required summary fields:
|
||||
|
||||
- `pages_processed`
|
||||
- `warning_count`
|
||||
- `asset_count`
|
||||
- `display_formula_count`
|
||||
- `inline_formula_count`
|
||||
- `math_render_error_count`
|
||||
|
||||
Warning records include:
|
||||
|
||||
- `code`
|
||||
- `severity`
|
||||
- `page_index`
|
||||
- `bbox`
|
||||
- `message`
|
||||
|
||||
Stable warning code examples:
|
||||
|
||||
- `ENGINE_MISSING`
|
||||
- `GPU_UNAVAILABLE`
|
||||
- `LOW_CONFIDENCE_FORMULA`
|
||||
- `MATH_RENDER_FAILED`
|
||||
- `ASSET_LINK_MISSING`
|
||||
- `READING_ORDER_UNCERTAIN`
|
||||
- `STRICT_LOCAL_VIOLATION`
|
||||
- `MINERU_CLI_FAILED`
|
||||
|
||||
## 8. Quality Report
|
||||
|
||||
Every conversion writes `<stem>.report.md`.
|
||||
|
||||
The report is derived from metadata and local quality checks. It contains:
|
||||
|
||||
- Source and output paths.
|
||||
- MinerU version and execution mode.
|
||||
- Pages processed.
|
||||
- Warning count.
|
||||
- Asset count and missing asset link count.
|
||||
- Inline and display formula counts.
|
||||
- Math render error count.
|
||||
- Pages with warnings.
|
||||
- Final status: `success`, `partial`, or `failed`.
|
||||
|
||||
## 9. Local-Only Enforcement
|
||||
|
||||
The implementation must not upload PDFs, page images, or extracted text to remote services.
|
||||
|
||||
Strict-local mode is on by default. The MinerU adapter must not call cloud OCR APIs, hosted document parsing APIs, hosted LLM/VLM APIs, or remote model inference endpoints.
|
||||
|
||||
Local-only execution means direct `mineru` CLI execution in v1. MinerU 3.1.0's CLI-internal temporary local `mineru-api` process is allowed because it is local orchestration owned by the CLI invocation. User-specified API URLs, router mode, HTTP client backends, remote APIs, and remote OpenAI-compatible backends are not allowed.
|
||||
|
||||
Allowed network activity is limited to documentation, package/model installation initiated by setup commands, or tests explicitly marked as network tests and disabled by default.
|
||||
|
||||
## 10. Failure Policy
|
||||
|
||||
Conversion should continue automatically when possible.
|
||||
|
||||
- Low-confidence formulas are included as best effort and recorded as warnings.
|
||||
- Low-confidence pages are included as best effort and recorded as warnings.
|
||||
- Run MinerU with its default local CLI behavior first.
|
||||
- If MinerU cannot run or returns a failure, report the failure clearly and do not silently switch to another backend.
|
||||
- Conversion fails only when the input cannot be opened, MinerU cannot run, no usable output can be produced, output cannot be written, or strict-local policy is violated.
|
||||
- CLI summaries must report warning counts clearly.
|
||||
@@ -0,0 +1,89 @@
|
||||
# PLAN.md
|
||||
|
||||
This file is the shared work plan for agents. Read it before starting work, then update it when the plan changes.
|
||||
|
||||
## Current Goal
|
||||
|
||||
CUDA-enabled PyTorch and MinerU 3.1.0 runtime setup is complete in the project `.venv`. Sprint 10 pre-conversion PDF chunking is implemented; next work is optional real local sample validation only if requested.
|
||||
|
||||
## Active Constraints
|
||||
|
||||
- Do not implement additional program code beyond the active user-approved sprint.
|
||||
- Keep MinerU 3.1.0 as the only conversion engine.
|
||||
- Keep processing local-only.
|
||||
- Target Python 3.12.
|
||||
- Target GPU: GTX 1070 Ti 8GB.
|
||||
- Default conversion device: `cuda:0`.
|
||||
- Run MinerU through direct local CLI execution only.
|
||||
- On MinerU failure, report a clear error/warning and do not silently fall back.
|
||||
- Write both metadata JSON and a human-readable `.report.md` quality report for conversions.
|
||||
- Use `samples/` only as local fixture context; do not commit sample files unless explicitly requested.
|
||||
|
||||
## Planned Work
|
||||
|
||||
1. Use `research-agent` for MinerU 3.1.0 source tracking and official-doc verification.
|
||||
2. Use `requirements-guard-agent` for cross-document consistency reviews.
|
||||
3. Use `mineru-integration-agent` for direct local MinerU CLI adapter planning.
|
||||
4. Use `obsidian-markdown-agent` for math-heavy Obsidian Markdown output planning.
|
||||
5. Use `metadata-agent` for provenance, warning, JSON metadata, and `.report.md` planning.
|
||||
6. Use `evaluation-agent` for local fixture coverage and regression criteria.
|
||||
7. Use `local-setup-agent` for Python 3.12, uv, CUDA, GTX 1070 Ti 8GB, and doctor-check planning.
|
||||
8. Use `license-privacy-agent` for license and strict-local privacy review.
|
||||
9. Use `harness-planner-agent` to turn substantial implementation requests into scoped contracts before code work starts.
|
||||
10. Use `feature-generator-agent` to implement one approved contract at a time after the user explicitly requests implementation.
|
||||
11. Use `evaluation-agent` as the independent contract reviewer and QA evaluator before and after each implementation chunk.
|
||||
12. Follow `docs/V1IMPLEMENTATIONPLAN.md` for the v1 implementation sprint sequence.
|
||||
13. Use `docs/Sprints/SPRINT10CONTRACT.md` for the implemented long-PDF pre-conversion chunking sprint.
|
||||
|
||||
## Open Questions
|
||||
|
||||
- None.
|
||||
|
||||
## Decisions
|
||||
|
||||
- Use `PLAN.md` for intended work and ownership.
|
||||
- Use `PROGRESS.md` for completed work, current status, blockers, and next actions.
|
||||
- MinerU default local CLI execution is the only v1 execution mode.
|
||||
- MinerU 3.1.0 may launch a temporary local `mineru-api` internally when `mineru` CLI runs without `--api-url`.
|
||||
- Strict-local mode forbids `--api-url`, remote APIs, router mode, HTTP client backends, and remote OpenAI-compatible backends.
|
||||
- No silent fallback after MinerU failure.
|
||||
- Conversion output includes both metadata JSON and `<stem>.report.md`.
|
||||
- Local MathJax render checking is optional and nonfatal; missing Node.js or MathJax must produce a clear warning instead of blocking conversion.
|
||||
- Project-scoped custom agents live in `.codex/agents/*.toml`.
|
||||
- Project prompt commands live in `.codex/commands/*.md`.
|
||||
- Project-specific skills live in `.codex/skills/*/SKILL.md`.
|
||||
- Project hooks live in `.codex/hooks.json` and `.codex/hooks/*.py`.
|
||||
- Agent, command, skill, and hook assets are written in English for Codex compatibility.
|
||||
- Long-running implementation should use a planner/generator/evaluator harness only when the task complexity justifies the overhead.
|
||||
- Each substantial implementation chunk should have a sprint contract with objective, scope, verification, failure thresholds, and handoff fields.
|
||||
- Generator agents may self-check, but independent evaluation is required before marking a chunk complete.
|
||||
- V1 implementation sequencing and sprint contracts live in `docs/V1IMPLEMENTATIONPLAN.md`.
|
||||
- Concrete sprint contract documents live under `docs/Sprints/`.
|
||||
- Sprint 2 path planning contract lives at `docs/Sprints/SPRINT2CONTRACT.md`.
|
||||
- Sprint 3 domain records and metadata contract lives at `docs/Sprints/SPRINT3CONTRACT.md`.
|
||||
- Sprint 4 MinerU adapter contract lives at `docs/Sprints/SPRINT4CONTRACT.md`.
|
||||
- Sprint 4 fixes the v1 adapter executable to the direct `mineru` CLI; user-specified alternate executables, including `mineru-api`, are prohibited.
|
||||
- Sprint 5 Obsidian Markdown normalization and asset link contract lives at `docs/Sprints/SPRINT5CONTRACT.md`.
|
||||
- Sprint 5 owns Markdown normalization only; it does not write final Markdown files, copy assets, run MinerU, or connect to conversion orchestration.
|
||||
- Sprint 6 quality checks and report generation contract lives at `docs/Sprints/SPRINT6CONTRACT.md`.
|
||||
- Sprint 6 owns quality/report boundaries only; it does not write final files, run MinerU, or connect to conversion orchestration.
|
||||
- Sprint 7 conversion orchestration, CLI, and Python API contract lives at `docs/Sprints/SPRINT7CONTRACT.md`.
|
||||
- Sprint 7 will be the first implementation sprint allowed to write final Markdown, metadata JSON, report Markdown, and local copied assets as product behavior.
|
||||
- Sprint 7 implemented conversion orchestration, `convert_pdf`, batch conversion, `pdf2md convert`, output writing, metadata/report writing, and fake-adapter CLI/API tests.
|
||||
- Sprint 8 should cover `pdf2md doctor` and setup documentation; Sprint 7 intentionally did not add doctor behavior.
|
||||
- Sprint 8 doctor and setup documentation contract lives at `docs/Sprints/SPRINT8CONTRACT.md`.
|
||||
- Sprint 8 owns doctor diagnostics and setup docs only; it must not run real MinerU, download models, run sample PDFs, or add runtime remote/API paths in default tests.
|
||||
- Sprint 8 implements `pdf2md doctor`, local setup diagnostics, and setup documentation without running real MinerU, downloading models, or touching `samples/` in default tests.
|
||||
- Sprint 9 local fixture evaluation and v1 release gate contract lives at `docs/Sprints/SPRINT9CONTRACT.md`.
|
||||
- Sprint 9 must keep default tests independent of real MinerU, GPU, models, network, Obsidian, LaTeX tooling, and `samples/`; real MinerU fixture checks must be explicit opt-in only.
|
||||
- Sprint 9 implements fast mocked integration tests, explicit opt-in local MinerU fixture evaluation, and `docs/V1RELEASECHECKLIST.md`.
|
||||
- `pdf2md convert` defaults to `--gpu cuda:0`.
|
||||
- The MinerU adapter maps CUDA device requests to local subprocess environment variables instead of adding speculative MinerU CLI flags.
|
||||
- GTX 1070 Ti local runtime uses PyTorch `2.6.0+cu126` and `torchvision 0.21.0+cu126` installed after `uv sync`, followed by `mineru[core]==3.1.0`.
|
||||
- MinerU models are downloaded with `mineru-models-download -s huggingface -m all`, and runtime model loading uses `MINERU_MODEL_SOURCE=local`.
|
||||
- Sprint 10 should use `pypdf` for local 20-page PDF chunk creation if implementation is approved.
|
||||
- Sprint 10 uses `pypdf` for local PDF page chunk planning and temporary chunk PDF writing.
|
||||
- Sprint 10 converts chunk PDFs independently and does not merge generated Markdown outputs.
|
||||
- Chunking is opt-in through `--chunk-pages`; if the option is present without a value, the CLI uses 20 pages per chunk.
|
||||
- `convert_pdf()` keeps returning `ConversionResult` when `chunk_pages` is not set and returns `BatchConversionResult` when `chunk_pages` is set.
|
||||
- Chunk PDFs are temporary local files and are deleted after conversion completes, including when raw MinerU output is retained.
|
||||
@@ -0,0 +1,307 @@
|
||||
# PRD: Local PDF-to-Markdown Converter
|
||||
|
||||
Last updated: 2026-05-07
|
||||
|
||||
## 1. Summary
|
||||
|
||||
Build a local-only CLI and Python library that converts math-heavy digital PDFs into Obsidian-friendly Markdown. The product prioritizes accurate LaTeX reconstruction for equations, preservation of document structure, stable asset links, and traceable page-level metadata.
|
||||
|
||||
The first version is for personal/research use, targets NVIDIA GPU machines, and uses MinerU 3.1.0 as the fixed conversion engine. It should process digital PDFs with existing text layers first. Scanned books, cloud OCR APIs, web UI, and manual review workflows are out of scope for v1.
|
||||
|
||||
## 2. Goals
|
||||
|
||||
- Convert a single PDF into one Markdown file plus assets, metadata JSON, and a human-readable quality report.
|
||||
- Convert a folder of PDFs in batch mode.
|
||||
- Preserve inline math as `$...$` and display math as `$$...$$`.
|
||||
- Produce Markdown that opens cleanly in Obsidian.
|
||||
- Use MinerU 3.1.0 locally.
|
||||
- Keep enough metadata to diagnose formula, layout, and reading-order errors.
|
||||
- Continue conversion automatically when a page or formula is low-confidence, while logging warnings.
|
||||
|
||||
## 3. Non-Goals
|
||||
|
||||
- No cloud OCR, cloud LLM, or third-party document upload in v1.
|
||||
- No web app or GUI in v1.
|
||||
- No manual review queue in v1.
|
||||
- No optimization for low-quality scanned books in v1.
|
||||
- No guaranteed perfect LaTeX reconstruction.
|
||||
- No multi-user server or hosted API in v1.
|
||||
- No commercial redistribution assumptions until dependency licenses are reviewed.
|
||||
|
||||
## 4. Target Users
|
||||
|
||||
Primary user:
|
||||
|
||||
- A researcher, student, or developer converting math-heavy papers/books into Obsidian notes.
|
||||
|
||||
## 5. Input Scope
|
||||
|
||||
Supported in v1:
|
||||
|
||||
- Local `.pdf` files.
|
||||
- Directories containing `.pdf` files.
|
||||
- Digital PDFs with embedded text layers.
|
||||
- Academic papers with sections, references, figures, captions, tables, inline math, and display equations.
|
||||
|
||||
Best effort in v1:
|
||||
|
||||
- Multi-column academic layouts.
|
||||
- Tables containing math.
|
||||
- Figures and captions.
|
||||
- Page numbers, headers, and footers.
|
||||
|
||||
Out of scope for v1 optimization:
|
||||
|
||||
- Poor-quality scans.
|
||||
- Handwritten math.
|
||||
- Camera photos.
|
||||
- Password-protected PDFs.
|
||||
- Damaged PDFs that cannot be opened by local tooling.
|
||||
|
||||
## 6. Output Scope
|
||||
|
||||
For each input PDF, the converter writes:
|
||||
|
||||
- A normalized Markdown file.
|
||||
- An assets directory when MinerU extracts images or other media.
|
||||
- A metadata JSON file.
|
||||
- A human-readable quality report named `<stem>.report.md`.
|
||||
- Optional raw MinerU outputs for debugging.
|
||||
|
||||
Markdown rules:
|
||||
|
||||
- Inline equations use `$...$`.
|
||||
- Display equations use `$$...$$` on separate lines.
|
||||
- Simple tables use Markdown pipe tables.
|
||||
- Complex tables may use HTML when Markdown would lose structure.
|
||||
- Images use relative links to the generated assets directory.
|
||||
- Visible page markers should be avoided by default; page provenance belongs in metadata.
|
||||
- Obsidian compatibility is the output standard.
|
||||
|
||||
Detailed Markdown normalization rules are defined in `ARCHITECTURE.md`.
|
||||
|
||||
## 7. CLI Requirements
|
||||
|
||||
The CLI binary should be named `pdf2md`.
|
||||
|
||||
Required commands:
|
||||
|
||||
```bash
|
||||
pdf2md convert INPUT --out OUTPUT_DIR
|
||||
pdf2md doctor
|
||||
```
|
||||
|
||||
`convert` behavior:
|
||||
|
||||
- If `INPUT` is a PDF, convert that file.
|
||||
- If `INPUT` is a directory, convert PDFs in that directory.
|
||||
- Directory conversion requires `--recursive` to descend into subdirectories.
|
||||
- Output filenames default to the source PDF stem plus `.md`.
|
||||
- Asset directories default to `<stem>.assets`.
|
||||
- Existing outputs are not overwritten unless `--overwrite` is passed.
|
||||
|
||||
Required `convert` options:
|
||||
|
||||
- `--out PATH`: output directory.
|
||||
- `--metadata`: write metadata JSON. Enabled by default in v1.
|
||||
- `--keep-raw`: keep raw MinerU output for debugging.
|
||||
- `--recursive`: recursively process directory inputs.
|
||||
- `--overwrite`: replace existing outputs.
|
||||
- `--gpu DEVICE`: select CUDA device. Default: `cuda:0`.
|
||||
- `--strict-local`: forbid remote network/cloud execution during conversion. Default: true.
|
||||
|
||||
`doctor` behavior:
|
||||
|
||||
- Report Python version.
|
||||
- Report `uv` availability.
|
||||
- Report CUDA/PyTorch GPU availability when detectable.
|
||||
- Report MinerU availability.
|
||||
- Report local model/cache paths when detectable.
|
||||
- Warn if no NVIDIA GPU is available.
|
||||
- Fail if required v1 runtime dependencies are missing.
|
||||
|
||||
## 8. Python Library Requirements
|
||||
|
||||
The library should expose a stable API suitable for scripts and tests.
|
||||
|
||||
Required high-level API:
|
||||
|
||||
```python
|
||||
from pdf2md import convert_pdf
|
||||
|
||||
result = convert_pdf(
|
||||
input_path="paper.pdf",
|
||||
output_dir="out",
|
||||
metadata=True,
|
||||
)
|
||||
```
|
||||
|
||||
Required return fields:
|
||||
|
||||
- `markdown_path`
|
||||
- `metadata_path`
|
||||
- `assets_dir`
|
||||
- `warnings`
|
||||
- `engine`
|
||||
- `pages_processed`
|
||||
|
||||
The public API should not expose raw MinerU objects as required return types. MinerU-specific data may be stored under optional metadata fields.
|
||||
|
||||
## 9. Metadata Requirements
|
||||
|
||||
When `--metadata` is enabled, write `<stem>.metadata.json`.
|
||||
|
||||
Required top-level fields:
|
||||
|
||||
- `source_pdf`
|
||||
- `source_sha256`
|
||||
- `created_at`
|
||||
- `engine`
|
||||
- `engine_version`
|
||||
- `engine_options`
|
||||
- `pages`
|
||||
- `assets`
|
||||
- `warnings`
|
||||
- `summary`
|
||||
|
||||
Required summary fields:
|
||||
|
||||
- `pages_processed`
|
||||
- `warning_count`
|
||||
- `asset_count`
|
||||
- `display_formula_count`
|
||||
- `inline_formula_count`
|
||||
- `math_render_error_count`
|
||||
|
||||
Warnings must be non-fatal unless the source file cannot be read or no output can be produced.
|
||||
|
||||
Detailed metadata fields, block types, and warning codes are defined in `ARCHITECTURE.md`.
|
||||
|
||||
## 10. Quality Report Requirements
|
||||
|
||||
For every conversion, write `<stem>.report.md`.
|
||||
|
||||
The report must be readable without opening the JSON metadata and include:
|
||||
|
||||
- Source PDF path.
|
||||
- Output Markdown path.
|
||||
- MinerU version.
|
||||
- Page count.
|
||||
- Warning count.
|
||||
- Asset count.
|
||||
- Inline formula count.
|
||||
- Display formula count.
|
||||
- Math render error count.
|
||||
- Missing asset link count.
|
||||
- A short list of pages with warnings.
|
||||
|
||||
## 11. Quality Policy
|
||||
|
||||
The product is fully automatic in v1.
|
||||
|
||||
- Low-confidence formulas are included in the output as best effort.
|
||||
- Low-confidence pages are included in the output as best effort.
|
||||
- The converter logs warnings and metadata records.
|
||||
- Conversion uses MinerU's default local CLI execution. If MinerU cannot run or fails, the converter must emit a clear error/warning instead of silently falling back to another backend.
|
||||
- Conversion fails only when the input cannot be opened, MinerU cannot run, output cannot be written, no usable output can be produced, or local-only policy is violated.
|
||||
|
||||
The CLI summary must report warning counts clearly.
|
||||
|
||||
## 12. Local-Only Policy
|
||||
|
||||
The implementation must not upload PDFs or page images to cloud APIs.
|
||||
|
||||
Prohibited in v1 runtime:
|
||||
|
||||
- Any cloud OCR API.
|
||||
- Any hosted document parsing API.
|
||||
- Any remote LLM or VLM call.
|
||||
- Remote model inference endpoints.
|
||||
|
||||
Allowed:
|
||||
|
||||
- Local model files.
|
||||
- Local Python packages.
|
||||
- Local CLI tools.
|
||||
- Documentation links.
|
||||
- Explicit installation downloads initiated by the user during setup.
|
||||
|
||||
`--strict-local` is on by default. The MinerU adapter must not use remote endpoints in strict-local mode.
|
||||
|
||||
Allowed in v1 runtime:
|
||||
|
||||
- Direct `mineru` CLI execution.
|
||||
- The temporary local `mineru-api` process that MinerU 3.1.0 starts internally when the CLI runs without `--api-url`.
|
||||
|
||||
Prohibited in v1 runtime:
|
||||
|
||||
- `--api-url`.
|
||||
- Remote APIs.
|
||||
- Router mode.
|
||||
- HTTP client backends.
|
||||
- Remote OpenAI-compatible backends or inference endpoints.
|
||||
|
||||
Detailed strict-local enforcement rules are defined in `ARCHITECTURE.md`.
|
||||
|
||||
## 13. Installation Requirements
|
||||
|
||||
Use `uv` as the primary project workflow.
|
||||
|
||||
Expected setup commands:
|
||||
|
||||
```bash
|
||||
uv sync
|
||||
uv run pdf2md doctor
|
||||
```
|
||||
|
||||
MinerU/model setup may require additional scripts, for example:
|
||||
|
||||
```bash
|
||||
uv run scripts/install-mineru.ps1
|
||||
uv run scripts/install-models.py
|
||||
```
|
||||
|
||||
The project should document NVIDIA GPU/CUDA expectations and provide clear errors when GPU acceleration is unavailable.
|
||||
|
||||
## 14. Test Requirements
|
||||
|
||||
Required test categories:
|
||||
|
||||
- Unit tests for Markdown math delimiter normalization.
|
||||
- Unit tests for asset path normalization.
|
||||
- Unit tests for metadata schema creation.
|
||||
- Unit tests for warning aggregation.
|
||||
- MinerU adapter contract tests with mocked outputs.
|
||||
- CLI tests for single PDF, directory input, overwrite behavior, and metadata output.
|
||||
|
||||
Fixture categories:
|
||||
|
||||
- Small digital PDF with simple text and math.
|
||||
- Math-heavy academic paper page.
|
||||
- Multi-column paper page.
|
||||
- Table with formulas.
|
||||
- Figure with caption.
|
||||
|
||||
Acceptance checks:
|
||||
|
||||
- Markdown exists after conversion.
|
||||
- Metadata exists when requested.
|
||||
- Quality report exists after conversion.
|
||||
- Asset links resolve.
|
||||
- Inline/display math delimiters match Obsidian expectations.
|
||||
- Math render checks report failures instead of silently passing.
|
||||
- No cloud calls are made.
|
||||
- Warnings do not stop conversion unless MinerU cannot produce output.
|
||||
- MinerU failure produces a clear error/warning and does not silently switch to another backend.
|
||||
|
||||
## 15. Release Criteria for v1
|
||||
|
||||
v1 is acceptable when:
|
||||
|
||||
- `pdf2md convert paper.pdf --out out --metadata` works on a representative digital academic PDF.
|
||||
- `pdf2md convert pdfs --out out --recursive --metadata` works on a small folder.
|
||||
- `pdf2md doctor` reports MinerU/GPU status clearly.
|
||||
- The default output opens in Obsidian with math blocks rendered.
|
||||
- Metadata links pages, blocks, warnings, and assets to the source PDF.
|
||||
- `<stem>.report.md` summarizes warnings, formulas, assets, and render/link check results.
|
||||
- The README or setup docs explain local-only behavior and GPU expectations.
|
||||
+437
@@ -0,0 +1,437 @@
|
||||
# PROGRESS.md
|
||||
|
||||
This file records actual progress for agents. Read it before starting work, then update it after meaningful changes.
|
||||
|
||||
## Current Status
|
||||
|
||||
- Project direction is documented.
|
||||
- MinerU 3.1.0 is fixed as the only conversion engine.
|
||||
- `PRD.md`, `ARCHITECTURE.md`, `AGENTS.md`, and `docs/KNOWLEDGEBASE.md` exist.
|
||||
- `samples/` exists locally and is untracked by git.
|
||||
- Converter implementation exists through Sprint 9: path planning, project-owned records, metadata, mocked direct local MinerU adapter boundary, Obsidian Markdown normalization, local quality checks, report content rendering, conversion orchestration, public conversion API, `pdf2md convert`, `pdf2md doctor`, fast mocked integration tests, optional local MinerU fixture evaluation, and the v1 release checklist.
|
||||
- Default conversion now requests `cuda:0`; the MinerU adapter sets local GPU-related environment for the MinerU subprocess.
|
||||
- Project-local Codex workflow assets now live under `.codex/`.
|
||||
- `docs/V1IMPLEMENTATIONPLAN.md` now defines the v1 implementation sequence.
|
||||
- `docs/Sprints/SPRINT0CONTRACT.md` now defines the Sprint 0 contract.
|
||||
- `docs/Sprints/SPRINT1CONTRACT.md` now defines the Sprint 1 scaffold contract.
|
||||
- `docs/Sprints/SPRINT2CONTRACT.md` now defines the Sprint 2 path planning contract.
|
||||
- `docs/Sprints/SPRINT3CONTRACT.md` now defines the Sprint 3 domain records and metadata contract.
|
||||
- `docs/Sprints/SPRINT4CONTRACT.md` now defines the Sprint 4 mocked MinerU adapter contract.
|
||||
- `docs/Sprints/SPRINT5CONTRACT.md` now defines the Sprint 5 Obsidian Markdown normalization and asset link contract.
|
||||
- `docs/Sprints/SPRINT6CONTRACT.md` now defines the Sprint 6 quality checks and report generation contract.
|
||||
- `docs/Sprints/SPRINT7CONTRACT.md` now defines the Sprint 7 conversion orchestration, CLI, and Python API contract.
|
||||
- `docs/Sprints/SPRINT8CONTRACT.md` now defines the Sprint 8 doctor and setup documentation contract.
|
||||
- `docs/Sprints/SPRINT9CONTRACT.md` now defines the Sprint 9 local fixture evaluation and v1 release gate contract.
|
||||
- Relevant `.codex/agents/*.toml` files now reference the v1 plan and sprint contract paths directly.
|
||||
- Sprint 10 is implemented with opt-in pre-conversion PDF chunking, temporary chunk PDF cleanup, chunk metadata/report context, and mocked tests.
|
||||
- Sprint 0 source, environment, license, privacy, and contract verification is complete with a `go-with-risks` recommendation.
|
||||
- Sprint 1 is complete with a minimal Python package, CLI placeholder, and fast pytest loop.
|
||||
- Sprint 4 is implemented with a mock-tested direct local MinerU CLI adapter.
|
||||
- Sprint 5 is implemented with a pure Markdown normalizer and local-only unit tests.
|
||||
- Sprint 6 is implemented with local quality checks and report string rendering.
|
||||
- Sprint 7 is implemented with `convert_pdf`, `convert_input`, output writing, metadata/report writing, local asset copying, batch conversion, and `pdf2md convert`.
|
||||
- Sprint 7 is tested with fake-adapter CLI/API tests.
|
||||
- Sprint 8 is implemented and committed.
|
||||
- Sprint 9 is implemented, independently evaluated, and committed.
|
||||
- The project `.venv` has been rebuilt with CUDA-enabled PyTorch and MinerU 3.1.0.
|
||||
- Latest `samples/MITC공부.pdf` conversion completed on GPU and wrote Markdown, metadata JSON, report Markdown, and assets under ignored `outputs/MITC공부/`.
|
||||
- `docs/MATHJAXCHECKERPLAN.md` now documents the local MathJax render checker plan and implementation status.
|
||||
- Local MathJax render checker code now exists with optional local Node.js/`mathjax` setup, default conversion integration, and `doctor` diagnostics.
|
||||
- `docs/Sprints/SPRINT10CONTRACT.md` now documents the implemented long-PDF pre-conversion chunking sprint.
|
||||
|
||||
## Environment Notes
|
||||
|
||||
- OS/workspace: Windows PowerShell in `D:\Work\Repos\AICoding\ConvertPDFToMD`.
|
||||
- Python target: 3.12.
|
||||
- Local Python observed during Sprint 0: 3.12.7.
|
||||
- `uv` observed during Sprint 0: not available on PATH.
|
||||
- `uv` installed during Sprint 1: 0.11.11 at `C:\Users\user\.local\bin`.
|
||||
- If a new shell cannot find `uv`, restart the shell or add `C:\Users\user\.local\bin` to PATH.
|
||||
- GPU target: GTX 1070 Ti 8GB.
|
||||
- Local GPU observed during Sprint 0: NVIDIA GeForce GTX 1070 Ti, driver 577.00, 8192 MiB VRAM, WDDM.
|
||||
- Sample PDFs are in `samples/` and include Korean filenames.
|
||||
- MinerU execution mode: direct local CLI only.
|
||||
- MinerU 3.1.0 CLI-internal temporary local `mineru-api` is allowed when CLI runs without `--api-url`.
|
||||
- Strict-local prohibits `--api-url`, remote APIs, router mode, HTTP client backends, and remote OpenAI-compatible backends.
|
||||
- MinerU planning pin: `mineru[core]==3.1.0` unless Sprint 1 or Sprint 8 proves another 3.1.0 extra is required.
|
||||
- MinerU 3.1.0 was installed in the local `.venv` with `uv pip install "mineru[core]==3.1.0"` for real CLI probing.
|
||||
- Current `pdf2md doctor` status is WARN: MinerU CLI is present, GTX 1070 Ti is visible with Pascal/pre-Turing risk, PyTorch is `2.6.0+cu126` with CUDA available, local MinerU model config is detected, local MathJax checker passes after `npm install`, and strict-local policy passes.
|
||||
- User-level environment variable `MINERU_MODEL_SOURCE=local` is set so MinerU uses the downloaded local model paths in `C:\Users\user\mineru.json`.
|
||||
|
||||
## Completed Work
|
||||
|
||||
- Created initial project documents.
|
||||
- Originally selected MinerU 2.5, then changed the fixed engine target to MinerU 3.1.0 after user approval.
|
||||
- Split architecture details into `ARCHITECTURE.md`.
|
||||
- Aligned documents with `Project Guidelines`.
|
||||
- Added this shared planning/progress workflow.
|
||||
- Decided MinerU failures must produce clear warnings/errors without silent fallback.
|
||||
- Decided every conversion should produce metadata JSON and a human-readable `.report.md`.
|
||||
- Created custom agent specs for research, requirements, MinerU integration, Obsidian Markdown, metadata, evaluation, local setup, and license/privacy work.
|
||||
- Created project prompt commands for startup, MinerU research, document review, integration planning, and quality evaluation planning.
|
||||
- Created project skills for MinerU research, math Markdown review, and fixture evaluation.
|
||||
- Created project hooks for startup context, pre-tool policy checks, and stop-time completion reminders.
|
||||
- Read Anthropic's long-running harness design article and adapted its planner/generator/evaluator pattern for this repository.
|
||||
- Added `harness-planner-agent` and `feature-generator-agent`.
|
||||
- Strengthened `evaluation-agent` as an independent contract reviewer and skeptical QA evaluator.
|
||||
- Added long-running harness workflow guidance to `AGENTS.md`.
|
||||
- Created `docs/V1IMPLEMENTATIONPLAN.md` with v1 sprint sequencing, contracts, verification gates, and agent ownership.
|
||||
- Created `docs/Sprints/SPRINT0CONTRACT.md` for source and environment verification before implementation.
|
||||
- Added direct `docs/V1IMPLEMENTATIONPLAN.md` and `docs/Sprints/SPRINT0CONTRACT.md` references to the agents that need them.
|
||||
- Completed Sprint 0 contract evaluation; result was PASS.
|
||||
- Completed the original Sprint 0 verification of the MinerU 2.5.4 package, CLI shape, output layout, model/cache, and strict-local risk from primary sources.
|
||||
- Verified local Python, `uv`, and GPU facts using the allowed Sprint 0 commands.
|
||||
- Verified MinerU/model license and privacy posture for personal/research local use versus redistribution.
|
||||
- Updated `docs/KNOWLEDGEBASE.md`, `docs/V1IMPLEMENTATIONPLAN.md`, and `docs/Sprints/SPRINT0CONTRACT.md` with Sprint 0 findings.
|
||||
- Completed post-output Sprint 0 evaluation. The only missing acceptance item at review time was the final commit.
|
||||
- Redefined strict-local policy for MinerU 3.1.0: allow direct `mineru` CLI and CLI-internal temporary local `mineru-api`; prohibit `--api-url`, remote APIs, router mode, HTTP client backends, and remote OpenAI-compatible backends.
|
||||
- Updated core project documents and `.codex` workflow assets to reflect MinerU 3.1.0 and the redefined strict-local policy.
|
||||
- Checked MinerU 3.1.0 sources: PyPI 3.1.0 metadata, MinerU release notes, quick usage docs, CLI tools docs, output file docs, and model source docs.
|
||||
- Created `docs/Sprints/SPRINT1CONTRACT.md` for project scaffold and fast test loop planning.
|
||||
- Added direct `docs/Sprints/SPRINT1CONTRACT.md` references to the agents that need Sprint 1 scaffold context.
|
||||
- Started Sprint 1 implementation and amended the contract to include `uv.lock`, which is generated by `uv sync`.
|
||||
- Installed `uv` per-user using the official Astral installer.
|
||||
- Created Sprint 1 scaffold files: `pyproject.toml`, `uv.lock`, `.gitignore`, `README.md`, `src/pdf2md/__init__.py`, `src/pdf2md/cli.py`, `tests/test_package.py`, and `tests/test_cli.py`.
|
||||
- Verified `uv sync` with a temporary project environment outside the repo.
|
||||
- Verified `uv run pytest` passes with 4 tests.
|
||||
- Verified `uv run pdf2md --version` prints `pdf2md 0.1.0`.
|
||||
- Verified `git diff --check` passes.
|
||||
- Checked the scaffold for disallowed MinerU, remote API, router, HTTP, or OpenAI backend references.
|
||||
- Completed the first independent Sprint 1 evaluation; the only scope failure was that `PLAN.md` was updated for shared workflow coordination before the Sprint 1 contract listed it as an allowed touched surface.
|
||||
- Amended the Sprint 1 contract to allow minimal `PLAN.md` current-goal coordination updates.
|
||||
- Completed the final independent Sprint 1 evaluation; result was PASS.
|
||||
- Created `docs/Sprints/SPRINT2CONTRACT.md` for paths, input discovery, and overwrite planning.
|
||||
- Added direct `docs/Sprints/SPRINT2CONTRACT.md` references to the agents that need Sprint 2 path planning context.
|
||||
- Updated `docs/V1IMPLEMENTATIONPLAN.md` to point Sprint 2 at the new contract and current scaffold state.
|
||||
- Verified the Sprint 2 contract documentation change with `git diff --check` and `uv run pytest` passing 4 tests.
|
||||
- Started Sprint 2 implementation after user approval and pre-implementation contract review PASS.
|
||||
- Added `src/pdf2md/paths.py` for input discovery, output path planning, overwrite conflict detection, duplicate output detection, and output-root escape prevention.
|
||||
- Added `tests/test_paths.py` with temporary-file coverage for single PDF discovery, directory discovery, recursive discovery, deterministic ordering, Korean filenames, output path planning, overwrite behavior, duplicate planned outputs, and output-root escape prevention.
|
||||
- Completed the first independent Sprint 2 evaluation; the only hard failure was a Windows rooted/drive-relative `relative_parent` escape case.
|
||||
- Fixed output-root escape prevention by rejecting absolute, rooted, drive-qualified, and `..` relative parents and validating resolved planned outputs stay under the output root.
|
||||
- Verified `uv run pytest tests/test_paths.py` passes 17 tests.
|
||||
- Verified `uv run pytest` passes 21 tests.
|
||||
- Verified `git diff --check` passes for the Sprint 2 implementation.
|
||||
- Checked the implementation for disallowed MinerU, remote API, router, HTTP, OpenAI backend, or network client references.
|
||||
- Completed the final independent Sprint 2 evaluation; result was PASS.
|
||||
- Created `docs/Sprints/SPRINT3CONTRACT.md` for domain records, metadata construction, and warning aggregation planning.
|
||||
- Added direct `docs/Sprints/SPRINT3CONTRACT.md` references to the agents that need Sprint 3 metadata context.
|
||||
- Updated `docs/V1IMPLEMENTATIONPLAN.md` to point Sprint 3 at the new contract and current path-planning state.
|
||||
- Verified the Sprint 3 contract documentation change with `git diff --check` and `uv run pytest` passing 21 tests.
|
||||
- Started Sprint 3 implementation after user approval and pre-implementation contract review PASS.
|
||||
- Added `src/pdf2md/ir.py` for project-owned document, page, block, asset, and warning records with stable block types, warning codes, and severities.
|
||||
- Added `src/pdf2md/metadata.py` for JSON-serializable metadata construction and summary counts from project-owned records.
|
||||
- Added `tests/test_ir.py` and `tests/test_metadata.py` covering record serialization, optional field preservation/omission, invalid enum/severity validation, metadata top-level fields, summary counts, warning order, JSON serializability, and required input validation.
|
||||
- Verified `uv run pytest tests/test_ir.py tests/test_metadata.py` passes 25 tests.
|
||||
- Verified `uv run pytest` passes 46 tests.
|
||||
- Verified `git diff --check` passes for the Sprint 3 implementation.
|
||||
- Checked the implementation for disallowed remote API, router, HTTP, OpenAI backend, network client, MinerU adapter, and doctor references.
|
||||
- Completed the final independent Sprint 3 evaluation; result was PASS.
|
||||
- Created `docs/Sprints/SPRINT4CONTRACT.md` for direct local MinerU CLI adapter boundary planning with mocked subprocess/output tests.
|
||||
- Added direct `docs/Sprints/SPRINT4CONTRACT.md` references to the agents that need Sprint 4 MinerU adapter context.
|
||||
- Updated `docs/V1IMPLEMENTATIONPLAN.md` to point Sprint 4 at the new contract and current metadata-model state.
|
||||
- Verified the Sprint 4 contract documentation change with `git diff --check` and `uv run pytest` passing 46 tests.
|
||||
- Started Sprint 4 implementation after user approval and pre-implementation contract review PASS.
|
||||
- Added `src/pdf2md/mineru_adapter.py` for the direct local MinerU CLI adapter boundary, mockable availability/version checks, deterministic command construction, subprocess result capture, strict-local option validation, optional mocked-output parsing, and adapter warning mapping.
|
||||
- Added `tests/test_mineru_adapter.py` with fake-runner coverage for availability, missing MinerU, version success/failure/empty output, fixed command shape, custom executable rejection, strict-local rejection, mocked success, non-zero exit, missing output, and invalid JSON.
|
||||
- Fixed an independent evaluation finding that a caller-controlled executable could bypass strict-local policy; v1 now accepts only the direct `mineru` executable name, and user-exposed `mineru-api` execution is rejected.
|
||||
- Verified `uv sync` passes.
|
||||
- Verified `uv run pytest tests/test_mineru_adapter.py` passes 26 tests.
|
||||
- Verified `uv run pytest` passes 72 tests.
|
||||
- Verified `git diff --check` passes for the Sprint 4 implementation.
|
||||
- Checked the implementation for network client imports; none were found.
|
||||
- Checked strict-local prohibited tokens in `src/pdf2md`; matches are limited to deliberate validation literals in `mineru_adapter.py`.
|
||||
- Completed the final independent Sprint 4 evaluation; result was PASS.
|
||||
- Created `docs/Sprints/SPRINT5CONTRACT.md` for Obsidian Markdown normalization, math delimiter handling, asset link normalization, and conservative table fallback planning.
|
||||
- Added direct `docs/Sprints/SPRINT5CONTRACT.md` references to the agents that need Sprint 5 Markdown, warning, implementation, planning, or evaluation context.
|
||||
- Updated `docs/V1IMPLEMENTATIONPLAN.md` to point Sprint 5 at the new contract and current Sprint 4 implementation state.
|
||||
- Verified the Sprint 5 contract documentation change with agent TOML parsing, `git diff --check`, and `uv run pytest` passing 72 tests.
|
||||
- Started Sprint 5 implementation after user approval and pre-implementation contract review PASS.
|
||||
- Added `src/pdf2md/markdown.py` for project-owned Obsidian Markdown normalization, inline/display math delimiter handling, code fence and inline code protection, relative asset link normalization, local asset warning behavior, and conservative table fallback warnings.
|
||||
- Added `tests/test_markdown.py` covering inline math, display math spacing, idempotency, math body preservation, code protection, asset path normalization, invalid/missing/remote asset warnings, simple table preservation, and complex HTML table fallback warnings.
|
||||
- Added narrow warning codes `ASSET_LINK_INVALID` and `TABLE_FALLBACK` to `src/pdf2md/ir.py`.
|
||||
- Verified `uv sync` passes.
|
||||
- Verified `uv run pytest tests/test_markdown.py tests/test_ir.py` passes 30 tests.
|
||||
- Verified `uv run pytest` passes 89 tests.
|
||||
- Verified `git diff --check` passes for the Sprint 5 implementation.
|
||||
- Checked the implementation for network client imports; none were found.
|
||||
- Checked the implementation for conversion orchestration, metadata writing, report generation, and CLI convert behavior; no Sprint 5 code introduced those paths.
|
||||
- Completed the final independent Sprint 5 evaluation; result was PASS.
|
||||
- Created `docs/Sprints/SPRINT6CONTRACT.md` for local quality checks, math renderability boundary, metadata summary extensions, report content rendering, and final status planning.
|
||||
- Added direct `docs/Sprints/SPRINT6CONTRACT.md` references to the agents that need Sprint 6 quality, reporting, metadata, math renderability, implementation, planning, or evaluation context.
|
||||
- Updated `docs/V1IMPLEMENTATIONPLAN.md` to point Sprint 6 at the new contract and current Sprint 5 implementation state.
|
||||
- Verified the Sprint 6 contract documentation change with agent TOML parsing, `git diff --check`, and `uv run pytest` passing 89 tests.
|
||||
- Started Sprint 6 implementation after user approval and pre-implementation contract review PASS.
|
||||
- Added `src/pdf2md/quality.py` for local asset-link checks, math renderability checker boundaries, nonfatal checker-unavailable behavior, and quality result aggregation.
|
||||
- Added `src/pdf2md/report.py` for human-readable quality report content rendering from metadata and quality results, pages-with-warnings derivation, and final status calculation.
|
||||
- Added `tests/test_quality.py` covering missing/invalid asset links, code-block exclusions, fake math checker failures, checker-unavailable behavior, and quality result merging.
|
||||
- Added `tests/test_report.py` covering required report content, optional path handling, pages-with-warnings, final status policy, metadata/quality count use, and no report-file creation.
|
||||
- Verified `uv sync` passes.
|
||||
- Verified `uv run pytest tests/test_quality.py tests/test_report.py tests/test_metadata.py` passes 26 tests.
|
||||
- Verified `uv run pytest` passes 103 tests.
|
||||
- Verified `git diff --check` passes for the Sprint 6 implementation.
|
||||
- Checked the implementation for network client imports; none were found.
|
||||
- Checked the implementation for conversion orchestration, final output writing, metadata JSON writing, `.report.md` file writing, real MinerU invocation, setup scripts, and CLI convert behavior; no Sprint 6 code introduced those paths.
|
||||
- Completed the final independent Sprint 6 evaluation; result was PASS.
|
||||
- Completed the final independent Sprint 7 evaluation after fixing math renderability metadata counts; result was PASS.
|
||||
- Started Sprint 8 implementation after user approval.
|
||||
- Added `src/pdf2md/doctor.py` for mockable setup diagnostics covering Python 3.12, `uv`, MinerU availability/version, NVIDIA GPU visibility, PyTorch CUDA visibility, local model/cache/config detection, and strict-local policy reporting.
|
||||
- Added `pdf2md doctor` CLI integration without changing `pdf2md convert` or `pdf2md --version` behavior.
|
||||
- Updated `README.md` with Windows PowerShell setup, `uv`, MinerU 3.1.0 direct CLI expectations, model/cache environment notes, GTX 1070 Ti risk, and strict-local runtime policy.
|
||||
- Added mocked doctor and CLI tests for success, warning-only success, hard dependency failure, missing `uv`, missing MinerU, MinerU version warnings, missing GPU/PyTorch warnings, GTX 1070 Ti/Pascal risk, and missing model/cache warnings.
|
||||
- Verified `uv run pytest tests/test_doctor.py tests/test_cli.py` passes 22 tests.
|
||||
- Verified `uv sync` passes.
|
||||
- Verified `uv run pytest` passes 133 tests.
|
||||
- Verified `uv run pdf2md --version` prints `pdf2md 0.1.0`.
|
||||
- Verified local `uv run pdf2md doctor` returns exit code 1 because MinerU is not installed; it reports Python and `uv` pass, GTX 1070 Ti/Pascal risk warning, PyTorch missing warning, model/cache missing warning, and strict-local pass.
|
||||
- Completed independent Sprint 8 evaluation; result was PASS.
|
||||
- Committed Sprint 8 implementation as `7d965e3 feat: implement sprint 8 doctor diagnostics`.
|
||||
- Created `docs/Sprints/SPRINT9CONTRACT.md` for local fixture evaluation and the v1 release gate.
|
||||
- Added direct `docs/Sprints/SPRINT9CONTRACT.md` references to the agents that need Sprint 9 fixture evaluation, release gate, strict-local, or implementation context.
|
||||
- Updated `docs/V1IMPLEMENTATIONPLAN.md` to point Sprint 9 at the new contract and current Sprint 8 completion state.
|
||||
- Started Sprint 9 implementation after user approval and pre-implementation contract review PASS.
|
||||
- Added fast mocked v1 release-gate integration tests in `tests/integration/test_v1_fast_release_gate.py`.
|
||||
- Added explicit opt-in local MinerU fixture evaluation in `tests/integration/test_optional_mineru_fixtures.py`, gated by `PDF2MD_RUN_MINERU_FIXTURES=1`.
|
||||
- Added `docs/V1RELEASECHECKLIST.md` with default fast gates, strict-local release gates, doctor hard-failure handling, optional sample gates, fixture coverage notes, and no-sample-commit checks.
|
||||
- Updated `README.md` to point at the v1 release checklist and optional fixture evaluation gate.
|
||||
- Verified `uv run pytest tests/integration tests/test_conversion.py tests/test_cli.py` passes 24 tests with 1 optional skip.
|
||||
- Verified `uv run pytest tests/integration` passes 3 fast tests with 1 optional skip.
|
||||
- Verified opt-in `PDF2MD_RUN_MINERU_FIXTURES=1 uv run pytest -rs tests/integration/test_optional_mineru_fixtures.py` is skipped with a clear doctor blocker because MinerU is not installed.
|
||||
- Verified `uv run pytest` passes 136 tests with 1 optional skip.
|
||||
- Completed independent Sprint 9 evaluation; result was PASS.
|
||||
- Committed Sprint 9 implementation as `466abcf feat: implement sprint 9 release gate`.
|
||||
- Attempted `samples/MITC공부.pdf` conversion after installing MinerU; the run did not produce a successful conversion and was stopped after the user observed CPU-bound execution.
|
||||
- Added `outputs/` to `.gitignore` and removed the leftover generated output directory from the stopped sample run.
|
||||
- Updated default conversion behavior so `convert_pdf`, `convert_input`, and `pdf2md convert` default to `cuda:0`.
|
||||
- Updated the MinerU adapter to map CUDA requests to the MinerU subprocess environment with `MINERU_DEVICE_MODE` and `CUDA_VISIBLE_DEVICES`, preserving strict-local direct CLI execution.
|
||||
- Updated README, PRD, and architecture docs to document GPU default behavior and the remaining CUDA/PyTorch requirement.
|
||||
- Verified the GPU-default change with targeted tests, full tests, `git diff --check`, CLI help, and `pdf2md doctor`.
|
||||
- Re-ran `uv run pdf2md convert samples\MITC공부.pdf --out outputs\MITC공부 --overwrite`; the CLI reported `converted: 0`, `failed: 1`, `warnings: 1`, and wrote no Markdown, metadata JSON, or `.report.md`.
|
||||
- Confirmed the failure with a direct adapter probe using an ASCII work directory: MinerU 3.1.0 started its allowed temporary local `mineru-api`, used `hybrid-auto-engine`, attempted to load the VLM model on CUDA, and failed with `AssertionError: Torch not compiled with CUDA enabled`.
|
||||
- Left the product conversion stdout/stderr logs under ignored `outputs/MITC공부.logs`; removed temporary diagnostic probe directories.
|
||||
- Removed and recreated the project `.venv` with `uv sync`.
|
||||
- Installed CUDA-enabled PyTorch runtime: `torch==2.6.0+cu126` and `torchvision==0.21.0+cu126`.
|
||||
- Verified CUDA with an actual tensor operation on `NVIDIA GeForce GTX 1070 Ti`, compute capability `6.1`.
|
||||
- Installed `mineru[core]==3.1.0`; verified `mineru, version 3.1.0`.
|
||||
- Downloaded MinerU pipeline and VLM models with `uv run mineru-models-download -s huggingface -m all`; MinerU wrote model paths to `C:\Users\user\mineru.json`.
|
||||
- Set `MINERU_MODEL_SOURCE=local` at user scope and current process scope.
|
||||
- Verified `uv run pdf2md doctor` reports PyTorch CUDA available and model config detected; remaining WARN status is the intentional Pascal/pre-Turing GPU risk warning.
|
||||
- Verified `uv run pytest` passes 138 tests with 1 optional skip in the rebuilt environment.
|
||||
- Fixed real MinerU nested `images/...` asset-link rewriting so copied assets under `<stem>.assets/.../images/` resolve from the final Markdown.
|
||||
- Fixed page-count extraction for MinerU-style structured lists with `page_idx` values.
|
||||
- Verified `uv run pytest tests/test_conversion.py` passes 12 tests.
|
||||
- Verified `uv run pytest` passes 139 tests with 1 optional skip after the asset/page-count fix.
|
||||
- Re-ran `samples/MITC공부.pdf` conversion with `MINERU_MODEL_SOURCE=local`; MinerU used GPU via the direct local CLI and CLI-internal temporary local `mineru-api`.
|
||||
- The final sample outputs are `outputs/MITC공부/MITC공부.md`, `outputs/MITC공부/MITC공부.metadata.json`, `outputs/MITC공부/MITC공부.report.md`, and `outputs/MITC공부/MITC공부.assets/`.
|
||||
- The final sample report status is `partial` only because the local math render checker is unavailable; asset link checks pass with 0 missing and 0 invalid links.
|
||||
- Sample summary: 13 pages processed, 107 assets, 23 inline formulas, 103 display formulas, 1 info warning.
|
||||
- Added `docs/MATHJAXCHECKERPLAN.md` with the local MathJax checker objective, touched surfaces, Node helper contract, Python wrapper behavior, tests, acceptance criteria, and open implementation decisions.
|
||||
- Implemented the local MathJax render checker with `MathExpression` extraction, a local Node.js helper, a Python wrapper, default conversion integration, `doctor` diagnostics, setup documentation, and mocked tests.
|
||||
- Verified `npm run mathjax-checker:health` returns `{"ok":true}` after local `npm install`.
|
||||
- Verified direct helper JSON stdin reports one valid expression as ok and a malformed display formula as `Missing close brace`.
|
||||
- Verified `create_default_math_checker()` finds the local checker and records one render failure for malformed display math.
|
||||
- Verified targeted tests pass: `uv run pytest tests/test_quality.py tests/test_math_render.py tests/test_conversion.py tests/test_doctor.py tests/test_cli.py`.
|
||||
- Verified full tests pass: `uv run pytest` passed 150 tests with 1 optional skip.
|
||||
- Verified `git diff --check` passes.
|
||||
- Researched local PDF chunking packages and MinerU page-range behavior for Sprint 10.
|
||||
- Created `docs/Sprints/SPRINT10CONTRACT.md` recommending `pypdf>=6.10.2,<7` for 20-page local chunk PDFs, with chunk outputs converted independently and no Markdown merge.
|
||||
- Implemented Sprint 10 with `pypdf>=6.10.2,<7`, `src/pdf2md/pdf_splitter.py`, `--chunk-pages [PAGES]`, chunk-aware conversion orchestration, and chunk report context.
|
||||
- `--chunk-pages` is opt-in; when present without a value it uses 20 pages.
|
||||
- `convert_pdf()` returns `BatchConversionResult` when `chunk_pages` is set and keeps returning `ConversionResult` when chunking is unset.
|
||||
- Temporary chunk PDFs are deleted after conversion completes, including when raw MinerU output is retained.
|
||||
- Verified targeted Sprint 10 tests: `uv run pytest tests/test_pdf_splitter.py tests/test_conversion.py tests/test_cli.py tests/test_report.py` passed 42 tests.
|
||||
- Verified full default test suite: `uv run pytest` passed 163 tests with 1 optional skip.
|
||||
- Verified `git diff --check` passed with line-ending warnings only.
|
||||
|
||||
## In Progress
|
||||
|
||||
- No active implementation chunk.
|
||||
|
||||
## Blockers
|
||||
|
||||
- No active blocker for the completed `samples/MITC공부.pdf` conversion.
|
||||
- GTX 1070 Ti remains an 8GB Pascal GPU; larger PDFs may still hit VRAM or model compatibility limits even though this sample completed.
|
||||
|
||||
## Next Actions
|
||||
|
||||
1. Review the generated `outputs/MITC공부/MITC공부.md` in Obsidian if visual quality needs manual assessment.
|
||||
2. Run optional real local chunked conversion on a long sample only if requested.
|
||||
3. Run `npm install` and `npm run mathjax-checker:health` when real local MathJax checker validation is desired.
|
||||
4. Preserve the strict-local rule: setup downloads may be explicit, but runtime conversion must use local model paths, direct CLI execution, and no user-specified API or remote backend.
|
||||
|
||||
## Sprint 9 Handoff
|
||||
|
||||
- Files changed: `tests/integration/test_v1_fast_release_gate.py`, `tests/integration/test_optional_mineru_fixtures.py`, `docs/V1RELEASECHECKLIST.md`, `README.md`, `PLAN.md`, `PROGRESS.md`, `docs/V1IMPLEMENTATIONPLAN.md`, and `docs/Sprints/SPRINT9CONTRACT.md`.
|
||||
- Commands run: `uv run pytest tests/integration tests/test_conversion.py tests/test_cli.py`, `uv run pytest tests/integration`, `PDF2MD_RUN_MINERU_FIXTURES=1 uv run pytest -rs tests/integration/test_optional_mineru_fixtures.py`, `uv run pytest`, `git diff --check`, and `git status --short --untracked-files=all`.
|
||||
- Tests passed: targeted integration/CLI/conversion run passed 24 tests with 1 optional skip; integration-only run passed 3 fast tests with 1 optional skip; full `uv run pytest` passed 136 tests with 1 optional skip.
|
||||
- Tests blocked: optional real MinerU fixture conversion is blocked by `pdf2md doctor` because the `mineru` CLI is not installed on PATH.
|
||||
- Optional local MinerU status: explicitly gated by `PDF2MD_RUN_MINERU_FIXTURES=1`; current opt-in run skips with doctor blocker instead of pretending real validation passed.
|
||||
- Fixture coverage: release checklist maps local samples to math-heavy, table/formula, figures/assets, reading-order, and Korean filename/path risk categories; simple one-page, table-dominant, and figure-heavy known-baseline gaps remain.
|
||||
- Generated output locations: none persisted; optional output path uses pytest `tmp_path`.
|
||||
- Known failures: local doctor fails on missing MinerU CLI.
|
||||
- Independent evaluation: PASS.
|
||||
- Residual risks: no real MinerU output has been validated yet; GTX 1070 Ti/PyTorch acceleration and model/cache setup remain unproven; optional fixture quality still requires local MinerU setup.
|
||||
- User decisions needed: decide whether to install/configure MinerU 3.1.0 and run optional fixture validation.
|
||||
- V1 release recommendation: default fast gates are healthy, but full real-MinerU v1 validation is blocked until doctor passes or the user records a waiver.
|
||||
- Go/no-go recommendation for next sprint: go only for real setup/fixture validation if the user wants to proceed with local MinerU installation.
|
||||
- Next action: commit Sprint 9 implementation.
|
||||
|
||||
## Sprint 9 Contract Handoff
|
||||
|
||||
- Files changed: `docs/Sprints/SPRINT9CONTRACT.md`, `docs/V1IMPLEMENTATIONPLAN.md`, relevant `.codex/agents/*.toml`, `PLAN.md`, and `PROGRESS.md`.
|
||||
- Commands run: `uv --version`, `uv sync`, agent TOML parse check, `uv run pytest`, `git diff --check`, `git status --short --untracked-files=all`, and local sample filename listing.
|
||||
- Tests passed: `uv run pytest` passed 133 tests.
|
||||
- Tests blocked: None expected for the contract-only change.
|
||||
- Known failures: local `pdf2md doctor` still fails until MinerU is installed on PATH.
|
||||
- Residual risks at that time: Sprint 9 was contract-only; fast mocked integration tests, optional local MinerU fixture harness, fixture coverage manifest, and release checklist were not implemented yet. Superseded by the Sprint 9 implementation handoff above.
|
||||
- User decisions needed: None before Sprint 9 pre-implementation review.
|
||||
- Go/no-go recommendation for Sprint 9 implementation: review the contract first, then go if the user explicitly requests implementation.
|
||||
- Next action: verify and commit the Sprint 9 contract update.
|
||||
|
||||
## Sprint 8 Handoff
|
||||
|
||||
- Files changed: `src/pdf2md/doctor.py`, `src/pdf2md/cli.py`, `tests/test_doctor.py`, `tests/test_cli.py`, `README.md`, `PLAN.md`, `PROGRESS.md`, `docs/V1IMPLEMENTATIONPLAN.md`, and `docs/Sprints/SPRINT8CONTRACT.md`.
|
||||
- Commands run: `uv --version`, `uv sync`, `uv run pytest tests/test_doctor.py tests/test_cli.py`, `uv run pytest tests/test_doctor.py`, `uv run pytest`, `uv run pdf2md --version`, `uv run pdf2md doctor`, `git diff --check`, `git status --short --untracked-files=all`, and PowerShell strict-local/network pattern checks.
|
||||
- Tests passed: `uv run pytest tests/test_doctor.py tests/test_cli.py` passed 22 tests; `uv run pytest tests/test_doctor.py` passed 11 tests; `uv run pytest` passed 133 tests.
|
||||
- Tests blocked: None.
|
||||
- Known failures: local `uv run pdf2md doctor` correctly fails because the `mineru` CLI is not installed on PATH.
|
||||
- Known warnings: `uv` ignored Miniforge's invalid `SSL_CERT_DIR` path during sync/test commands, but the commands completed successfully; local doctor warns for GTX 1070 Ti/Pascal risk, missing PyTorch, and missing MinerU model/cache/config path.
|
||||
- Independent evaluation: PASS.
|
||||
- Residual risks: Sprint 8 does not install MinerU, download models, validate real MinerU output, run sample PDFs, or prove GTX 1070 Ti PyTorch acceleration. Those remain Sprint 9/local setup work.
|
||||
- User decisions needed: None for Sprint 8.
|
||||
- Go/no-go recommendation for Sprint 9: go after a Sprint 9 contract is written and reviewed.
|
||||
- Next action at completion: prepare the Sprint 9 contract when requested.
|
||||
|
||||
## Sprint 8 Contract Handoff
|
||||
|
||||
Historical note: this contract-only handoff was superseded by the implemented Sprint 8 handoff above.
|
||||
|
||||
- Files changed: `docs/Sprints/SPRINT8CONTRACT.md`, `docs/V1IMPLEMENTATIONPLAN.md`, relevant `.codex/agents/*.toml`, `PLAN.md`, and `PROGRESS.md`.
|
||||
- Commands run: `uv --version`, agent TOML parse check, `uv sync`, `uv run pytest`, `git diff --check`, `git status --short --untracked-files=all`, and PowerShell reference checks.
|
||||
- Tests passed: `uv run pytest` passed 119 tests.
|
||||
- Tests blocked: None.
|
||||
- Known failures: none.
|
||||
- Known warnings: `uv` ignored Miniforge's invalid `SSL_CERT_DIR` path during sync/test commands, but the commands completed successfully.
|
||||
- Residual risks: Sprint 8 is contract-only; `pdf2md doctor`, doctor diagnostics, setup docs, and setup helper scripts are not implemented yet.
|
||||
- User decisions needed: None before Sprint 8 pre-implementation review.
|
||||
- Go/no-go recommendation for Sprint 8 implementation: review the contract first, then go if the user explicitly requests implementation.
|
||||
- Next action: commit the contract update, then wait for an explicit Sprint 8 implementation request.
|
||||
|
||||
## Sprint 7 Handoff
|
||||
|
||||
- Files changed: `src/pdf2md/conversion.py`, `src/pdf2md/cli.py`, `src/pdf2md/__init__.py`, `tests/test_conversion.py`, `tests/test_cli.py`, `tests/test_package.py`, `PLAN.md`, `PROGRESS.md`, `docs/V1IMPLEMENTATIONPLAN.md`, and `docs/Sprints/SPRINT7CONTRACT.md`.
|
||||
- Commands run: `uv --version`, `uv sync`, `uv run pytest tests/test_conversion.py tests/test_cli.py`, `uv run pytest tests/test_conversion.py tests/test_cli.py tests/test_package.py`, `uv run pytest tests/test_conversion.py tests/test_metadata.py tests/test_report.py`, `uv run pytest`, `git diff --check`, `git status --short --untracked-files=all`, and PowerShell strict-local/network pattern checks.
|
||||
- Tests passed: `uv run pytest tests/test_conversion.py tests/test_cli.py` passed 18 tests; `uv run pytest tests/test_conversion.py tests/test_cli.py tests/test_package.py` passed 16 tests before the math renderability fix; `uv run pytest tests/test_conversion.py tests/test_metadata.py tests/test_report.py` passed 29 tests; `uv run pytest` passed 119 tests after the metadata math count fix.
|
||||
- Tests blocked: None.
|
||||
- Known failures: none.
|
||||
- Known warnings: `uv` ignored Miniforge's invalid `SSL_CERT_DIR` path during sync/test commands, but the commands completed successfully.
|
||||
- Independent evaluation: PASS after fixing math renderability metadata counts.
|
||||
- Residual risks: Sprint 7 uses fake adapters in default tests; it does not run real MinerU, probe real MinerU output, implement `pdf2md doctor`, validate CUDA/GPU, install models, or run sample PDFs.
|
||||
- User decisions needed: None for Sprint 7.
|
||||
- Go/no-go recommendation for Sprint 8: go.
|
||||
- Next action: prepare Sprint 8 contract when requested.
|
||||
|
||||
## Sprint 7 Contract Handoff (Historical)
|
||||
|
||||
- Files changed: `docs/Sprints/SPRINT7CONTRACT.md`, `docs/V1IMPLEMENTATIONPLAN.md`, `.codex/agents/feature-generator-agent.toml`, `.codex/agents/evaluation-agent.toml`, `.codex/agents/requirements-guard-agent.toml`, `.codex/agents/harness-planner-agent.toml`, `.codex/agents/mineru-integration-agent.toml`, `.codex/agents/metadata-agent.toml`, `.codex/agents/obsidian-markdown-agent.toml`, `PLAN.md`, and `PROGRESS.md`.
|
||||
- Commands run: `uv --version`, agent TOML parse check, `uv sync`, `uv run pytest`, `git diff --check`, and `git status --short --untracked-files=all`.
|
||||
- Tests passed: `uv run pytest` passed 103 tests.
|
||||
- Tests blocked: None.
|
||||
- Known failures: none.
|
||||
- Known warnings: `uv` ignored Miniforge's invalid `SSL_CERT_DIR` path during sync/test commands, but the commands completed successfully.
|
||||
- Residual risks at that time: Sprint 7 was contract-only before implementation. Superseded by the Sprint 7 Handoff above.
|
||||
- User decisions needed at that time: None before Sprint 7 pre-implementation review.
|
||||
- Go/no-go recommendation at that time: review the contract first, then go if the user explicitly requests implementation.
|
||||
- Next action at that time: commit the contract update, then wait for an explicit Sprint 7 implementation request.
|
||||
|
||||
## Sprint 6 Handoff
|
||||
|
||||
- Files changed: `src/pdf2md/quality.py`, `src/pdf2md/report.py`, `tests/test_quality.py`, `tests/test_report.py`, `PLAN.md`, `PROGRESS.md`, `docs/V1IMPLEMENTATIONPLAN.md`, and `docs/Sprints/SPRINT6CONTRACT.md`.
|
||||
- Commands run: `uv --version`, `uv sync`, `uv run pytest tests/test_quality.py tests/test_report.py tests/test_metadata.py`, `uv run pytest`, `git diff --check`, `git status --short --untracked-files=all`, and PowerShell file/pattern checks.
|
||||
- Tests passed: `uv run pytest tests/test_quality.py tests/test_report.py tests/test_metadata.py` passed 26 tests; `uv run pytest` passed 103 tests.
|
||||
- Tests blocked: None.
|
||||
- Known failures: none in Sprint 6 implementation.
|
||||
- Known warnings: `uv` ignored Miniforge's invalid `SSL_CERT_DIR` path during sync/test commands, but the commands completed successfully.
|
||||
- Residual risks: Sprint 6 intentionally does not run real MinerU, run a real math renderer, parse PDFs, write final Markdown files, copy assets, write metadata JSON files, write `.report.md` files, expose working `convert`, or implement `doctor`.
|
||||
- User decisions needed: None for Sprint 6.
|
||||
- Go/no-go recommendation for Sprint 7: go.
|
||||
- Next action: prepare Sprint 7 contract when requested.
|
||||
|
||||
## Sprint 5 Handoff
|
||||
|
||||
- Files changed: `src/pdf2md/markdown.py`, `src/pdf2md/ir.py`, `tests/test_markdown.py`, `PLAN.md`, `PROGRESS.md`, `docs/V1IMPLEMENTATIONPLAN.md`, and `docs/Sprints/SPRINT5CONTRACT.md`.
|
||||
- Commands run: `uv --version`, `uv sync`, `uv run pytest tests/test_markdown.py tests/test_ir.py`, `uv run pytest`, `git diff --check`, `git status --short --untracked-files=all`, and PowerShell file/pattern checks.
|
||||
- Tests passed: `uv run pytest tests/test_markdown.py tests/test_ir.py` passed 30 tests; `uv run pytest` passed 89 tests.
|
||||
- Tests blocked: None.
|
||||
- Known failures: none in Sprint 5 implementation.
|
||||
- Known warnings: `uv` ignored Miniforge's invalid `SSL_CERT_DIR` path during sync/test commands, but the commands completed successfully.
|
||||
- Residual risks: Sprint 5 intentionally does not run real MinerU, probe real MinerU Markdown, parse PDFs, write final Markdown files, copy assets, write metadata JSON, generate `.report.md`, expose working `convert`, or implement `doctor`.
|
||||
- User decisions needed: None for Sprint 5.
|
||||
- Go/no-go recommendation for Sprint 6: go.
|
||||
- Next action: prepare Sprint 6 contract when requested.
|
||||
|
||||
## Sprint 4 Handoff
|
||||
|
||||
- Files changed: `src/pdf2md/mineru_adapter.py`, `tests/test_mineru_adapter.py`, `PLAN.md`, and `PROGRESS.md`.
|
||||
- Commands run: `uv --version`, `uv sync`, `uv run pytest tests/test_mineru_adapter.py`, `uv run pytest`, `git diff --check`, `git status --short --untracked-files=all`, and PowerShell file/pattern checks.
|
||||
- Tests passed: `uv run pytest tests/test_mineru_adapter.py` passed 26 tests; `uv run pytest` passed 72 tests.
|
||||
- Tests blocked: None.
|
||||
- Known failures: none in Sprint 4 implementation after fixing the independent evaluation finding.
|
||||
- Known warnings: `uv` ignored Miniforge's invalid `SSL_CERT_DIR` path during sync/test commands, but the commands completed successfully.
|
||||
- Residual risks: Sprint 4 intentionally does not run real MinerU, install models, probe real MinerU output layout, parse PDFs, normalize Markdown, write metadata JSON, generate `.report.md`, expose working `convert`, or implement `doctor`.
|
||||
- User decisions needed: None for Sprint 4.
|
||||
- Go/no-go recommendation for Sprint 5: go.
|
||||
- Next action: prepare Sprint 5 contract when requested.
|
||||
|
||||
## Sprint 3 Handoff
|
||||
|
||||
- Files changed: `src/pdf2md/ir.py`, `src/pdf2md/metadata.py`, `tests/test_ir.py`, `tests/test_metadata.py`, `PLAN.md`, `PROGRESS.md`, and `docs/Sprints/SPRINT3CONTRACT.md`.
|
||||
- Commands run: `uv --version`, `uv sync`, `uv run pytest tests/test_ir.py tests/test_metadata.py`, `uv run pytest`, `git diff --check`, `git status --short`, and PowerShell file/pattern checks.
|
||||
- Tests passed: `uv run pytest tests/test_ir.py tests/test_metadata.py` passed 25 tests; `uv run pytest` passed 46 tests.
|
||||
- Tests blocked: None.
|
||||
- Known failures: none in Sprint 3 implementation.
|
||||
- Known warnings: `uv` ignored Miniforge's invalid `SSL_CERT_DIR` path during sync/test commands, but the commands completed successfully.
|
||||
- Residual risks: Sprint 3 intentionally does not parse PDFs, compute SHA-256, invoke MinerU, write conversion outputs, normalize Markdown, create full report content, run quality checks, or expose working `convert` or `doctor` commands.
|
||||
- User decisions needed: None for Sprint 3.
|
||||
- Go/no-go recommendation for Sprint 4: go.
|
||||
- Next action: prepare Sprint 4 contract when requested.
|
||||
|
||||
## Sprint 2 Handoff
|
||||
|
||||
- Files changed: `src/pdf2md/paths.py`, `tests/test_paths.py`, `PLAN.md`, and `PROGRESS.md`.
|
||||
- Commands run: `uv --version`, `uv sync`, `uv run pytest tests/test_paths.py`, `uv run pytest`, `git diff --check`, `git status --short`, and PowerShell file/pattern checks.
|
||||
- Tests passed: `uv run pytest tests/test_paths.py` passed 17 tests; `uv run pytest` passed 21 tests.
|
||||
- Tests blocked: None.
|
||||
- Known failures: none in Sprint 2 implementation.
|
||||
- Known warnings: `uv` ignored Miniforge's invalid `SSL_CERT_DIR` path during sync/test commands, but the commands completed successfully.
|
||||
- Residual risks: Sprint 2 intentionally does not parse PDFs, compute SHA-256, invoke MinerU, write conversion outputs, normalize Markdown, create metadata/report content, or expose a working `convert` command.
|
||||
- User decisions needed: None for Sprint 2.
|
||||
- Go/no-go recommendation for Sprint 3: go.
|
||||
- Next action: prepare Sprint 3 contract when requested.
|
||||
|
||||
## Sprint 1 Handoff
|
||||
|
||||
- Files changed: `pyproject.toml`, `uv.lock`, `.gitignore`, `README.md`, `src/pdf2md/__init__.py`, `src/pdf2md/cli.py`, `tests/test_package.py`, `tests/test_cli.py`, `PROGRESS.md`, `PLAN.md`, and `docs/Sprints/SPRINT1CONTRACT.md`.
|
||||
- Commands run: `uv --version`, `uv sync`, `uv run pytest`, `uv run pdf2md --version`, `git diff --check`, `git status --short`, and PowerShell file/pattern checks after `rg.exe` returned access denied.
|
||||
- Tests passed: `uv run pytest` passed 4 tests.
|
||||
- Tests blocked: None.
|
||||
- Known failures: `uv` may not be visible to a newly opened shell until PATH is refreshed; `rg.exe` returned access denied in this environment, so PowerShell checks were used instead.
|
||||
- Known warnings: `uv` ignored Miniforge's invalid `SSL_CERT_DIR` path during sync/test commands, but the commands completed successfully.
|
||||
- Residual risks: the scaffold intentionally does not validate MinerU, CUDA, model paths, conversion output, metadata, or quality reports.
|
||||
- User decisions needed: None for Sprint 1.
|
||||
- Go/no-go recommendation for Sprint 2: go.
|
||||
- Next action: prepare Sprint 2 contract when requested.
|
||||
|
||||
## Sprint 0 Handoff
|
||||
|
||||
Superseded note: the following Sprint 0 facts describe the completed 2.5.4 verification pass. The current engine decision is MinerU 3.1.0.
|
||||
|
||||
- Files changed: `docs/KNOWLEDGEBASE.md`, `docs/Sprints/SPRINT0CONTRACT.md`, `docs/V1IMPLEMENTATIONPLAN.md`, `PROGRESS.md`.
|
||||
- Sources checked: MinerU 2.5.4 PyPI, MinerU 2.5.4 tag files, MinerU output/model docs, Python/uv/PyTorch/NVIDIA docs, MinerU/model license sources.
|
||||
- Local commands run: `python --version`, `uv --version`, `nvidia-smi`.
|
||||
- Facts confirmed at that time: Python 3.12.7 is present; `uv` is missing; GTX 1070 Ti 8GB is visible; MinerU 2.5.4 direct CLI path is source-verified; MinerU/model 2.5-era licenses should be treated as AGPL-3.0.
|
||||
- Inferences made at that time: v1 should pin MinerU to `mineru[core]==2.5.4`; strict-local runtime should require local model source configuration; current MinerU 3.x docs should not drive the older 2.5 adapter behavior.
|
||||
- Known failures: `uv --version` failed because `uv` is not on PATH.
|
||||
- Residual risks: GTX 1070 Ti/PyTorch CUDA compatibility, real MinerU output layout until local probe, AGPL redistribution obligations, setup-download versus runtime-local separation.
|
||||
- Go/no-go recommendation: `go-with-risks`.
|
||||
- Next action: resolve `uv` availability or include bootstrap handling in Sprint 1, then create the Sprint 1 contract.
|
||||
@@ -0,0 +1,111 @@
|
||||
# ConvertPDFToMD
|
||||
|
||||
Local-only PDF-to-Markdown converter for math-heavy digital documents.
|
||||
|
||||
## Status
|
||||
|
||||
The project currently provides a Python package, `pdf2md convert`, metadata/report output, mocked MinerU adapter tests, `pdf2md doctor` setup diagnostics, and Sprint 9 release-gate documentation. Real local MinerU sample validation remains optional and may be blocked until MinerU 3.1.0 and local model/cache setup are available.
|
||||
|
||||
## Setup
|
||||
|
||||
Use Windows PowerShell with Python 3.12. If `uv` is installed but a new shell cannot find it, add the per-user install directory to PATH for the current session:
|
||||
|
||||
```powershell
|
||||
$env:Path = "C:\Users\user\.local\bin;$env:Path"
|
||||
```
|
||||
|
||||
Sync the project and run the fast local test loop:
|
||||
|
||||
```powershell
|
||||
uv sync
|
||||
uv run pytest
|
||||
uv run pdf2md --version
|
||||
```
|
||||
|
||||
For the local GTX 1070 Ti runtime, install CUDA-enabled PyTorch before installing MinerU so MinerU does not resolve to a CPU-only torch wheel:
|
||||
|
||||
```powershell
|
||||
uv sync
|
||||
uv pip install --index-url https://download.pytorch.org/whl/cu126 torch==2.6.0 torchvision==0.21.0
|
||||
uv pip install "mineru[core]==3.1.0"
|
||||
uv run mineru-models-download -s huggingface -m all
|
||||
[Environment]::SetEnvironmentVariable("MINERU_MODEL_SOURCE", "local", "User")
|
||||
$env:MINERU_MODEL_SOURCE = "local"
|
||||
uv run pdf2md doctor
|
||||
```
|
||||
|
||||
Run `uv sync` before the runtime install commands. If you run `uv sync` again later, repeat the runtime install commands: `uv sync` removes packages that are not in the project lockfile, and MinerU and CUDA PyTorch are intentionally not part of the default fast test dependency set.
|
||||
|
||||
Install the optional local MathJax checker when you want formula renderability counts to reflect real MathJax parsing instead of the nonfatal "checker unavailable" warning:
|
||||
|
||||
```powershell
|
||||
npm install
|
||||
npm run mathjax-checker:health
|
||||
uv run pdf2md doctor
|
||||
```
|
||||
|
||||
The checker runs through local Node.js and the local `mathjax` package only. It never uses a CDN or hosted renderer, and conversion still completes if Node.js or MathJax is missing.
|
||||
|
||||
For release checks, see [docs/V1RELEASECHECKLIST.md](docs/V1RELEASECHECKLIST.md). It separates the default fast gates from optional local MinerU/GPU/sample fixture evaluation. Optional fixture runs are enabled with `PDF2MD_RUN_MINERU_FIXTURES=1`. They should use only local PDFs, write generated outputs to a temporary or ignored local directory, and count a sample conversion as successful only when Markdown, metadata JSON, and `.report.md` outputs all exist.
|
||||
|
||||
Install MinerU 3.1.0 as an explicit local setup step so the `mineru` executable is available on PATH. This project calls MinerU only through the direct local CLI shape:
|
||||
|
||||
```powershell
|
||||
mineru -p <input_path> -o <output_path>
|
||||
```
|
||||
|
||||
`pdf2md convert` requests GPU execution by default with `--gpu cuda:0`. The adapter maps that to the `MINERU_DEVICE_MODE=cuda` and `CUDA_VISIBLE_DEVICES=0` environment variables for the MinerU subprocess. Actual GPU execution still requires a CUDA-capable local PyTorch/MinerU stack; `doctor` reports when PyTorch is CPU-only or CUDA is unavailable.
|
||||
|
||||
Run setup diagnostics before conversion:
|
||||
|
||||
```powershell
|
||||
uv run pdf2md doctor
|
||||
```
|
||||
|
||||
`doctor` checks Python 3.12, `uv`, the MinerU CLI and version, NVIDIA GPU visibility through `nvidia-smi`, PyTorch CUDA visibility when PyTorch is installed, local model/cache/config paths, local MathJax checker availability, and the strict-local runtime policy. It does not install packages, download models, run conversions, or inspect `samples/`.
|
||||
|
||||
The model/cache check reads these environment variables when they are set:
|
||||
|
||||
- `MINERU_MODEL_SOURCE`
|
||||
- `MINERU_MODEL_DIR`
|
||||
- `MINERU_CACHE_DIR`
|
||||
- `MINERU_TOOLS_CONFIG_JSON`
|
||||
- `HF_HOME`
|
||||
- `HUGGINGFACE_HUB_CACHE`
|
||||
- `MODELSCOPE_CACHE`
|
||||
|
||||
It also checks for `%USERPROFILE%\mineru.json`, which MinerU documents as its default user config location. Missing model/cache paths are warnings because model download and cache population must be explicit setup actions.
|
||||
|
||||
## Runtime Policy
|
||||
|
||||
Runtime conversion is strict-local. Allowed: direct `mineru` CLI execution and the CLI-internal temporary local `mineru-api` that MinerU starts when `--api-url` is omitted. Prohibited: `--api-url`, remote APIs, router mode, HTTP client backends, remote OpenAI-compatible backends, hosted renderers, and cloud fallbacks.
|
||||
|
||||
Setup may require explicit user-initiated package or model downloads. Those setup downloads are separate from runtime conversion; `pdf2md doctor`, `pdf2md convert`, imports, and default tests must not download packages or models.
|
||||
|
||||
The target GPU is NVIDIA GTX 1070 Ti 8GB. `doctor` warns for GTX 1070 Ti/Pascal/pre-Turing GPUs because local CUDA/PyTorch compatibility and VRAM pressure must be validated on the actual machine before relying on acceleration.
|
||||
|
||||
## Long PDFs
|
||||
|
||||
Chunking is opt-in for long PDFs. Use `--chunk-pages` with no value to split into 20-page chunks, or pass an explicit positive page count:
|
||||
|
||||
```powershell
|
||||
uv run pdf2md convert samples/long.pdf --out outputs --chunk-pages
|
||||
uv run pdf2md convert samples/long.pdf --out outputs --chunk-pages 20
|
||||
```
|
||||
|
||||
Chunk PDFs are written to a temporary local directory before each MinerU run and are deleted after conversion completes. The generated Markdown files are not merged; each chunk gets its own Markdown, metadata JSON, report Markdown, and assets directory named with the original page range.
|
||||
|
||||
The Python API keeps non-chunked behavior unchanged. `convert_pdf(..., chunk_pages=20)` returns a `BatchConversionResult` with one `ConversionResult` per chunk.
|
||||
|
||||
## References
|
||||
|
||||
Sources checked on 2026-05-08:
|
||||
|
||||
- MinerU Quick Usage: https://opendatalab.github.io/MinerU/usage/quick_usage/
|
||||
- MinerU CLI Tools: https://opendatalab.github.io/MinerU/usage/cli_tools/
|
||||
- MinerU Model Source: https://opendatalab.github.io/MinerU/usage/model_source/
|
||||
- MinerU GitHub README/release notes: https://github.com/opendatalab/MinerU
|
||||
- uv project sync documentation: https://docs.astral.sh/uv/concepts/projects/sync/
|
||||
- PyTorch previous versions: https://docs.pytorch.org/get-started/previous-versions/
|
||||
- PyTorch CUDA architecture support update: https://dev-discuss.pytorch.org/t/cuda-toolkit-version-and-architecture-support-update-maxwell-and-pascal-architecture-support-removed-in-cuda-12-8-and-12-9-builds/3128
|
||||
- PyTorch CUDA availability API: https://docs.pytorch.org/docs/2.11/generated/torch.cuda.is_available.html
|
||||
@@ -0,0 +1,282 @@
|
||||
# Knowledge Base: Local PDF-to-Markdown Converter for Math-Heavy Documents
|
||||
|
||||
Last updated: 2026-05-07
|
||||
|
||||
## 1. Product Direction
|
||||
|
||||
This project will build a local-first PDF-to-Markdown converter for math-heavy academic PDFs and books. The v1 target is intentionally narrow:
|
||||
|
||||
- Processing policy: local-only. Do not send user PDFs to cloud OCR or external AI APIs.
|
||||
- Primary interface: CLI plus Python library.
|
||||
- Primary output: Obsidian-friendly Markdown.
|
||||
- Main conversion engine: MinerU 3.1.0.
|
||||
- Math output: inline math as `$...$`, display math as `$$...$$`.
|
||||
- Hardware target: NVIDIA GPU.
|
||||
- PDF scope: digital PDFs with an existing text layer first. Scanned books and poor-quality scans are out of scope for v1 optimization.
|
||||
- Quality workflow: fully automatic conversion. Low-confidence regions should be logged and represented in metadata, but conversion should not stop.
|
||||
- Install target: Python with `uv`; scripts should document local model download/setup.
|
||||
- License posture: personal use. License terms are not a v1 blocker for this local project, but document MinerU and transitive model/package licenses before any redistribution.
|
||||
|
||||
The rest of this document records the research basis and implementation implications for these decisions.
|
||||
|
||||
This file is background research, not a second requirements source. Use `PRD.md` for product requirements and `ARCHITECTURE.md` for implementation structure.
|
||||
|
||||
## 2. Why Math PDF Conversion Is Hard
|
||||
|
||||
PDF is a visual/page-description format, not a semantic source format. In scientific PDFs, the displayed equation often survives visually while the source-level LaTeX structure is lost. The Nougat paper states the core issue clearly: scientific knowledge is frequently stored in PDFs, and the PDF format loses semantic information, especially for mathematical expressions. Source: [Nougat: Neural Optical Understanding for Academic Documents](https://arxiv.org/abs/2308.13418).
|
||||
|
||||
For this project, equation conversion must be treated as a document understanding and formula recognition problem, not just text extraction. A robust converter needs to recover:
|
||||
|
||||
- Reading order across multi-column layouts.
|
||||
- Inline and display equation boundaries.
|
||||
- LaTeX syntax that renders in common Markdown math renderers.
|
||||
- Tables, captions, figures, references, footnotes, and image assets.
|
||||
- Page-level provenance for debugging and downstream correction.
|
||||
|
||||
Digital PDFs make the problem easier because embedded text can be extracted directly, but formulas may still appear as glyph streams, vector paths, images, or fragmented positioned characters. A v1 product should therefore use the PDF text layer where reliable and use document parsing/OCR models for layout and math reconstruction.
|
||||
|
||||
## 3. MinerU 3.1.0 Engine Strategy
|
||||
|
||||
MinerU 3.1.0 is the fixed local parser for v1.
|
||||
|
||||
Relevant source facts:
|
||||
|
||||
- MinerU 3.1.0 is published on PyPI, requires Python `>=3.10,<3.14`, and describes support for PDF, images, DOCX, PPTX, and XLSX to Markdown and JSON. Source: [PyPI mineru 3.1.0](https://pypi.org/project/mineru/3.1.0/).
|
||||
- MinerU 3.1.0 release notes describe a license move to the MinerU Open Source License, a VLM main model upgrade to `MinerU2.5-Pro-2604-1.2B`, improved image/chart parsing, truncated paragraph merging, cross-page table merging, table-internal image recognition, and native PPTX/XLSX parsing. Source: [MinerU releases](https://github.com/opendatalab/MinerU/releases).
|
||||
- MinerU's current quick usage documents the CLI shape as `mineru -p <input_path> -o <output_path>` and states that without `--api-url`, the CLI launches a temporary local `mineru-api`. Source: [MinerU quick usage docs](https://opendatalab.github.io/MinerU/usage/quick_usage/).
|
||||
- Starting with MinerU 3.0, the `mineru` command is an orchestration client on top of `mineru-api`. Source: [MinerU usage guide](https://opendatalab.github.io/MinerU/usage/).
|
||||
- MinerU model source configuration uses `MINERU_MODEL_SOURCE`, and local models are enabled with `MINERU_MODEL_SOURCE=local`. Source: [MinerU model source docs](https://opendatalab.github.io/MinerU/usage/model_source/).
|
||||
|
||||
Implementation implications:
|
||||
|
||||
- Wrap MinerU behind a project-owned adapter rather than binding the codebase to its CLI output.
|
||||
- Preserve both Markdown and structured JSON when available.
|
||||
- Treat MinerU output as the first-pass parse, then normalize for Obsidian.
|
||||
- Keep engine name/version, command/options, page ids, and warnings in metadata.
|
||||
- Expect model downloads and GPU runtime setup to be heavy; document this clearly.
|
||||
- Treat MinerU 3.1.0's CLI-internal temporary local `mineru-api` as allowed local orchestration.
|
||||
- Reject `--api-url`, remote APIs, router mode, HTTP client backends, and remote OpenAI-compatible backends in strict-local mode.
|
||||
- Do not implement runtime engine selection in v1.
|
||||
|
||||
Risks:
|
||||
|
||||
- MinerU version behavior may change quickly.
|
||||
- GTX 1070 Ti is below the current documented GPU acceleration recommendation for some MinerU 3.x paths; CPU-only or limited-GPU behavior of the `pipeline` backend must be validated locally.
|
||||
- Some licenses are custom or changed over time; check the exact dependency license before redistribution if the project scope changes.
|
||||
- Output Markdown may require post-processing to match Obsidian math and asset path expectations.
|
||||
|
||||
## 4. Output Standard: Obsidian-Friendly Markdown
|
||||
|
||||
The final Markdown should be optimized for Obsidian rendering and long-term note usage.
|
||||
|
||||
Rules:
|
||||
|
||||
- Inline math: `$...$`.
|
||||
- Display math: `$$...$$` on separate lines.
|
||||
- Store extracted images in a sibling assets directory, for example `paper.assets/page-003-figure-01.png`.
|
||||
- Use relative links from the Markdown file to assets.
|
||||
- Preserve page boundaries in metadata, not by noisy visible page markers in the main Markdown.
|
||||
- Prefer normal Markdown tables for simple tables.
|
||||
- Use HTML tables only when Markdown tables would destroy merged cells or mathematical structure.
|
||||
- Do not silently drop formulas, figures, captions, or references. If exact conversion fails, keep a best-effort representation and log a warning.
|
||||
|
||||
Post-processing should normalize:
|
||||
|
||||
- Math delimiters: convert `\(...\)` to `$...$` and `\[...\]` to `$$...$$` where safe.
|
||||
- Display math spacing: ensure blank lines around display equations.
|
||||
- Escaping: avoid over-escaping underscores inside math.
|
||||
- Asset links: convert absolute/generated paths to stable relative paths.
|
||||
- Heading levels: avoid treating running headers, footers, and page numbers as section headings.
|
||||
- Line artifacts: repair repeated hyphenation and stray line breaks introduced by PDF extraction.
|
||||
|
||||
## 5. Metadata and Provenance
|
||||
|
||||
Metadata is necessary because fully automatic conversion can produce imperfect formulas and reading order. The converter must keep enough provenance to identify the source page, source region, engine version, warnings, and emitted assets for each result.
|
||||
|
||||
The metadata schema is defined in `ARCHITECTURE.md`.
|
||||
|
||||
## 6. Evaluation Strategy
|
||||
|
||||
Do not rely only on raw text edit distance. The project should evaluate multiple dimensions.
|
||||
|
||||
### Benchmarks to Learn From
|
||||
|
||||
OmniDocBench:
|
||||
|
||||
- Evaluates document parsing across text, formulas, tables, and reading order.
|
||||
- End-to-end scoring combines text edit distance, table TEDS, and formula CDM.
|
||||
- Provides formula recognition evaluation for display and inline formulas. Source: [OmniDocBench](https://github.com/opendatalab/OmniDocBench).
|
||||
|
||||
Unit-test-style document parsing checks:
|
||||
|
||||
- Uses simple, unambiguous, machine-checkable page facts instead of only soft edit-distance comparisons.
|
||||
- Includes arXiv math, old scans math, tables, headers/footers, multi-column layouts, long/tiny text, and base cases.
|
||||
- Explicitly notes that small equation symbol swaps can be critical even when edit distance is small.
|
||||
|
||||
ParseBench:
|
||||
|
||||
- Emphasizes semantic correctness for agentic document parsing: table structure, chart data, formatting, visual grounding, and content faithfulness. Source: [ParseBench](https://arxiv.org/abs/2604.08538).
|
||||
|
||||
### Project Acceptance Metrics
|
||||
|
||||
For v1, use a small project fixture suite rather than trying to run every public benchmark.
|
||||
|
||||
Required fixture categories:
|
||||
|
||||
- A short digital PDF with simple inline and display math.
|
||||
- A math-heavy academic paper with multi-column layout.
|
||||
- A PDF page containing a table with formulas.
|
||||
- A PDF with figures, captions, references, and page numbers.
|
||||
|
||||
Required checks:
|
||||
|
||||
- Markdown file is generated.
|
||||
- Metadata JSON is generated when requested.
|
||||
- All pages have provenance records.
|
||||
- Inline math and display math pass a KaTeX/MathJax-compatible render check.
|
||||
- Asset links resolve.
|
||||
- No cloud network calls occur during conversion.
|
||||
- MinerU failures produce warnings and a best-effort output, not a hard crash, unless the input file cannot be opened or no output can be produced.
|
||||
|
||||
Recommended quantitative checks:
|
||||
|
||||
- Count display math blocks detected.
|
||||
- Count math render failures.
|
||||
- Count missing asset links.
|
||||
- Count pages with warnings.
|
||||
- Compare selected known formulas against expected LaTeX or normalized render output.
|
||||
|
||||
## 7. Implementation Source of Truth
|
||||
|
||||
Do not implement directly from this research note.
|
||||
|
||||
- Use `PRD.md` for CLI, API, scope, tests, and release criteria.
|
||||
- Use `ARCHITECTURE.md` for the conversion pipeline, MinerU boundary, intermediate representation, metadata schema, and strict-local enforcement.
|
||||
|
||||
## 8. Implementation Risks
|
||||
|
||||
- Formula correctness cannot be guaranteed purely by text extraction.
|
||||
- MinerU behavior may change across versions; adapter tests should pin expected behavior.
|
||||
- GPU dependencies can be difficult on Windows; `doctor` should detect CUDA, PyTorch, model paths, and engine availability.
|
||||
- Obsidian math rendering may differ from GitHub or Pandoc.
|
||||
- Licensing must be reviewed before packaging or redistributing models/tools.
|
||||
- Fully automatic mode means users may receive imperfect formulas; warnings and metadata are essential.
|
||||
|
||||
## 9. Sprint 0 Verification And Engine Update (2026-05-07)
|
||||
|
||||
Sprint 0 originally verified MinerU 2.5.4 assumptions before implementation. After Sprint 0, the project owner changed the v1 engine target to MinerU 3.1.0 and redefined strict-local execution. The current implementation target is therefore MinerU 3.1.0, not the earlier 2.5.4 pin.
|
||||
|
||||
### 9.1 Recommendation
|
||||
|
||||
Recommendation: `go-with-risks` for personal-use v1.
|
||||
|
||||
The project can proceed to Sprint 1 once the `uv` workflow is available locally, or once Sprint 1 explicitly handles the bootstrap gap. Use `mineru[core]==3.1.0` or another explicitly reviewed 3.1.0 installation path until a later sprint contract changes it.
|
||||
|
||||
Current strict-local policy:
|
||||
|
||||
- Allowed: direct `mineru` CLI execution.
|
||||
- Allowed: the temporary local `mineru-api` process that MinerU 3.1.0 starts internally when the CLI runs without `--api-url`.
|
||||
- Prohibited: `--api-url`, remote APIs, router mode, HTTP client backends, and remote OpenAI-compatible backends.
|
||||
- Setup may download models only when the user explicitly runs setup commands.
|
||||
- Runtime conversion should use local model paths, for example with `MINERU_MODEL_SOURCE=local`.
|
||||
|
||||
### 9.2 Evidence Policy Applied
|
||||
|
||||
Sprint 0 claims use official or primary sources where available. Each claim below is marked as either:
|
||||
|
||||
- Direct fact: stated by a source or observed from an allowed local command.
|
||||
- Project inference: a project decision derived from source facts.
|
||||
|
||||
Web research was allowed for documentation verification. Runtime converter design remains local-only.
|
||||
|
||||
### 9.3 MinerU 3.1.0 Facts
|
||||
|
||||
| Area | Confirmed fact | Evidence type | Source | Implementation implication |
|
||||
| --- | --- | --- | --- | --- |
|
||||
| Package pin | PyPI lists MinerU `3.1.0`, released 2026-04-17, with Python `>=3.10,<3.14` and extras including `core`, `pipeline`, `vlm`, `vllm`, `lmdeploy`, `mlx`, `gradio`, and `all`. | Direct fact | [PyPI mineru 3.1.0](https://pypi.org/project/mineru/3.1.0/) | Pin v1 setup to MinerU 3.1.0 until changed by a later contract. |
|
||||
| Install path | Current MinerU docs show `uv pip install -U "mineru[all]"`; PyPI also supports extras including `core`. | Direct fact plus project inference | [PyPI mineru 3.1.0](https://pypi.org/project/mineru/3.1.0/) | Start with `mineru[core]==3.1.0` for the wrapper unless Sprint 1 or Sprint 8 proves that `all` is required. |
|
||||
| CLI shape | Current docs show `mineru -p <input_path> -o <output_path>`. | Direct fact | [MinerU quick usage docs](https://opendatalab.github.io/MinerU/usage/quick_usage/) | The adapter still calls the `mineru` CLI directly. |
|
||||
| Local temporary API | MinerU docs state that without `--api-url`, the CLI launches a temporary local `mineru-api`; with `--api-url`, the CLI connects to an existing local or remote FastAPI service. | Direct fact | [MinerU quick usage docs](https://opendatalab.github.io/MinerU/usage/quick_usage/), [MinerU CLI tools docs](https://opendatalab.github.io/MinerU/usage/cli_tools/) | Allow only the CLI-internal temporary local `mineru-api`; reject `--api-url` and user-managed API endpoints. |
|
||||
| Backend risk | Current docs describe `pipeline`, `vlm`, and `hybrid` paths plus HTTP-client variants for OpenAI-compatible servers. | Direct fact | [PyPI mineru 3.1.0](https://pypi.org/project/mineru/3.1.0/), [MinerU CLI tools docs](https://opendatalab.github.io/MinerU/usage/cli_tools/) | Strict-local validation must reject HTTP client backends and remote OpenAI-compatible backends. |
|
||||
| Output layout | MinerU output docs list Markdown plus visual debugging files and structured files such as `model.json`, `middle.json`, and `content_list.json`; the exact set depends on backend and input type. | Direct fact | [MinerU output files docs](https://opendatalab.github.io/MinerU/reference/output_files/) | Adapter parsing must tolerate optional files and backend-specific structured output. Keep raw output optional through `--keep-raw`. |
|
||||
| Model/cache behavior | MinerU uses Hugging Face and ModelScope by default and switches source through `MINERU_MODEL_SOURCE`; local parsing uses `MINERU_MODEL_SOURCE=local` after models are downloaded. | Direct fact | [MinerU model source docs](https://opendatalab.github.io/MinerU/usage/model_source/) | Setup scripts may download models; runtime conversion should require local model paths under strict-local mode. |
|
||||
| 3.1.0 capability update | Release notes say 3.1.0 upgrades the main VLM model to `MinerU2.5-Pro-2604-1.2B` and improves image/chart parsing, truncated paragraph merging, cross-page table merging, and image recognition inside tables. | Direct fact | [MinerU releases](https://github.com/opendatalab/MinerU/releases) | 3.1.0 is a better target for math-heavy and complex-layout documents, pending local hardware validation. |
|
||||
|
||||
Adapter-facing fields that must remain optional until a local MinerU output probe is run:
|
||||
|
||||
- Backend/parse method directory names.
|
||||
- Presence of `content_list`, `middle`, `model`, `layout`, `span`, and `origin` files.
|
||||
- Page/block confidence and bbox fields in structured output.
|
||||
- Asset naming and relative paths.
|
||||
- Exact stdout/stderr warning text.
|
||||
|
||||
### 9.4 Local Environment Facts
|
||||
|
||||
Allowed local commands run during Sprint 0:
|
||||
|
||||
```powershell
|
||||
python --version
|
||||
uv --version
|
||||
nvidia-smi
|
||||
```
|
||||
|
||||
Results:
|
||||
|
||||
| Area | Result | Evidence type | Source or command | Impact |
|
||||
| --- | --- | --- | --- | --- |
|
||||
| Python | The local interpreter reports `Python 3.12.7`. The Python 3.12 target falls within MinerU 3.1.0's declared `>=3.10,<3.14` range. | Direct fact plus project inference | `python --version`; [PyPI mineru 3.1.0](https://pypi.org/project/mineru/3.1.0/) | Continue targeting Python 3.12. Prefer the current 3.12 patch release in setup docs, but do not require a patch bump for v1. |
|
||||
| `uv` | `uv` is not on PATH. PowerShell reported that `uv` is not recognized. | Direct fact | `uv --version`; [uv installation docs](https://docs.astral.sh/uv/getting-started/installation/) | Sprint 1 cannot honestly claim `uv sync` works until `uv` is installed or the Sprint 1 contract includes bootstrap instructions. |
|
||||
| GPU | `nvidia-smi` detects NVIDIA GeForce GTX 1070 Ti, driver `577.00`, WDDM, 8192 MiB VRAM, about 5034 MiB free at check time. | Direct fact | `nvidia-smi` | GPU is visible, but available VRAM is tight for model workloads and must be reported by `doctor`. |
|
||||
| Compute capability | GTX 1070 Ti is Pascal and listed as CUDA compute capability `6.1`. | Direct fact | [NVIDIA legacy CUDA GPU table](https://developer.nvidia.com/cuda/gpus/legacy), [NVIDIA GTX 10-series specs](https://www.nvidia.com/en-us/geforce/10-series/10-series-specs/) | Warn on pre-Turing GPUs. Do not assume modern CUDA 12.8/12.9 PyTorch wheels work. |
|
||||
| PyTorch/CUDA risk | PyTorch project discussion states CUDA 12.8/12.9 builds remove Maxwell/Pascal support. The CUDA version reported by `nvidia-smi` is a driver capability ceiling, not proof that PyTorch CUDA works. | Direct fact plus project inference | [PyTorch CUDA architecture support discussion](https://dev-discuss.pytorch.org/t/cuda-toolkit-version-and-architecture-support-update-maxwell-and-pascal-architecture-support-removed-in-cuda-12-8-and-12-9-builds/3128), [PyTorch install docs](https://pytorch.org/get-started/locally/) | `pdf2md doctor` must test actual PyTorch import, CUDA runtime, device name, compute capability, and free memory. |
|
||||
|
||||
Future `pdf2md doctor` checks:
|
||||
|
||||
- `python --version`, requiring `>=3.12,<3.13`.
|
||||
- `uv --version`, failing clearly when unavailable.
|
||||
- PowerShell version and edition on Windows.
|
||||
- `nvidia-smi` GPU name, driver, WDDM/TCC mode, total/free VRAM.
|
||||
- GPU compute capability warning for `<7.5`, especially Pascal `6.1`.
|
||||
- `torch` import, `torch.__version__`, `torch.version.cuda`, `torch.cuda.is_available()`, device name, compute capability, and free memory.
|
||||
- MinerU CLI availability, installed package version, and model/cache configuration.
|
||||
- Strict-local validation that runtime uses direct CLI, allows only CLI-internal temporary local `mineru-api`, uses local model paths, and rejects `--api-url`, router mode, HTTP client backends, remote APIs, and remote OpenAI-compatible backends.
|
||||
|
||||
### 9.5 License And Privacy Facts
|
||||
|
||||
| Area | Confirmed fact | Evidence type | Source | Implementation implication |
|
||||
| --- | --- | --- | --- | --- |
|
||||
| MinerU 3.1.0 license | PyPI identifies MinerU 3.1.0 as `LicenseRef-MinerU-Open-Source-License`; release notes state MinerU moved from AGPLv3 to a custom license based on Apache 2.0. | Direct fact | [PyPI mineru 3.1.0](https://pypi.org/project/mineru/3.1.0/), [MinerU releases](https://github.com/opendatalab/MinerU/releases) | License is not a blocker for personal v1. Redistribution still needs review. |
|
||||
| Runtime privacy | MinerU docs expose remote-capable paths, including `--api-url`, router usage, HTTP client backends, and OpenAI-compatible backend URLs. | Direct fact | [MinerU usage guide](https://opendatalab.github.io/MinerU/usage/), [MinerU CLI tools docs](https://opendatalab.github.io/MinerU/usage/cli_tools/) | The wrapper must never upload PDFs, page images, extracted text, or intermediates. Setup downloads are separate from runtime conversion. |
|
||||
| Model downloads | `mineru-models-download` must use a remote model source for real downloads, while runtime local parsing uses `MINERU_MODEL_SOURCE=local`. | Direct fact | [MinerU model source docs](https://opendatalab.github.io/MinerU/usage/model_source/) | Download scripts are setup-only. Conversion runtime must not download or call remote inference. |
|
||||
|
||||
### 9.6 Go-With-Risks Register
|
||||
|
||||
| Risk | Later sprint that must absorb it | Required handling |
|
||||
| --- | --- | --- |
|
||||
| `uv` is missing locally. | Sprint 1 and Sprint 8 | Install `uv` before claiming scaffold success, or make bootstrap documentation part of Sprint 1. `doctor` must report missing `uv`. |
|
||||
| GTX 1070 Ti is Pascal compute capability 6.1 and is below current documented GPU acceleration recommendations for some MinerU 3.x paths. | Sprint 8 | `doctor` must verify actual PyTorch/CUDA/MinerU backend availability and warn on pre-Turing GPUs. Setup docs should avoid assuming CUDA 12.8/12.9 PyTorch works on this GPU. |
|
||||
| MinerU 3.1.0 output layout is source-verified but not locally probed. | Sprint 4 and Sprint 9 | Adapter tests should mock optional outputs first. A later local MinerU probe should confirm real output paths before release. |
|
||||
| CLI-internal local `mineru-api` is allowed, but user-specified API paths are prohibited. | Sprint 4 and Sprint 8 | Adapter command validation must allow `mineru -p ... -o ...` without `--api-url`, while rejecting `--api-url`, router mode, HTTP client backends, remote APIs, and remote OpenAI-compatible backends. |
|
||||
| Runtime must be local-only while setup may download packages/models. | Sprint 4 and Sprint 8 | Separate setup/download commands from conversion runtime. Enforce `MINERU_MODEL_SOURCE=local` or equivalent local model configuration in strict-local mode. |
|
||||
|
||||
None of the hard-failure criteria are currently triggered. Direct local MinerU 3.1.0 CLI execution appears suitable for v1 under the redefined strict-local policy, Python 3.12 is compatible with the package metadata, local-only design remains viable, and license posture does not block personal use.
|
||||
|
||||
## 10. Source Index
|
||||
|
||||
- [Nougat paper](https://arxiv.org/abs/2308.13418)
|
||||
- [MinerU docs](https://opendatalab.github.io/MinerU/zh/)
|
||||
- [MinerU GitHub](https://github.com/opendatalab/mineru)
|
||||
- [PyPI mineru 3.1.0](https://pypi.org/project/mineru/3.1.0/)
|
||||
- [MinerU releases](https://github.com/opendatalab/MinerU/releases)
|
||||
- [MinerU usage guide](https://opendatalab.github.io/MinerU/usage/)
|
||||
- [MinerU quick usage docs](https://opendatalab.github.io/MinerU/usage/quick_usage/)
|
||||
- [MinerU CLI tools docs](https://opendatalab.github.io/MinerU/usage/cli_tools/)
|
||||
- [MinerU output files docs](https://opendatalab.github.io/MinerU/reference/output_files/)
|
||||
- [MinerU model source docs](https://opendatalab.github.io/MinerU/usage/model_source/)
|
||||
- [uv installation docs](https://docs.astral.sh/uv/getting-started/installation/)
|
||||
- [PyTorch install docs](https://pytorch.org/get-started/locally/)
|
||||
- [PyTorch CUDA architecture support discussion](https://dev-discuss.pytorch.org/t/cuda-toolkit-version-and-architecture-support-update-maxwell-and-pascal-architecture-support-removed-in-cuda-12-8-and-12-9-builds/3128)
|
||||
- [NVIDIA legacy CUDA GPU table](https://developer.nvidia.com/cuda/gpus/legacy)
|
||||
- [NVIDIA GTX 10-series specs](https://www.nvidia.com/en-us/geforce/10-series/10-series-specs/)
|
||||
- [OmniDocBench](https://github.com/opendatalab/OmniDocBench)
|
||||
- [ParseBench](https://arxiv.org/abs/2604.08538)
|
||||
@@ -0,0 +1,298 @@
|
||||
# MathJax Local Render Checker Implementation Plan
|
||||
|
||||
## Purpose
|
||||
|
||||
Add a local MathJax-based render checker so the converter can validate whether extracted LaTeX formulas are likely to render in Obsidian. The checker must remain a quality signal only: failed formulas produce structured warnings, metadata counts, and report entries, but they do not stop conversion when Markdown output can still be produced.
|
||||
|
||||
This plan is implementation planning only. It does not add a second PDF conversion engine, cloud service, remote API, or manual review workflow.
|
||||
|
||||
Implementation status: implemented on 2026-05-08 with a local Node.js helper, Python MathJax wrapper, conversion integration, doctor diagnostics, setup documentation, and mocked default tests. Real checker execution uses the official local `mathjax` package and requires `npm install` to populate local `node_modules/`.
|
||||
|
||||
## Product Context
|
||||
|
||||
The project already normalizes inline math to `$...$` and display math to `$$...$$`. `src/pdf2md/quality.py` already has a math renderability boundary through `check_math_renderability()` and `MathCheckerUnavailable`, but conversions currently record an info-level warning when no checker is injected.
|
||||
|
||||
The next implementation should replace that unavailable-checker path with a real local MathJax check when local Node.js and MathJax are available.
|
||||
|
||||
Relevant existing behavior:
|
||||
|
||||
- Conversion remains local-only.
|
||||
- MinerU 3.1.0 remains the only PDF conversion engine.
|
||||
- Quality warnings are non-fatal unless no usable output can be produced.
|
||||
- Metadata and `.report.md` already include `math_render_error_count`.
|
||||
- Default tests must not require real MinerU, GPU, Node.js, MathJax, network, Obsidian, or sample PDFs.
|
||||
|
||||
## References
|
||||
|
||||
- Obsidian documents math expressions as MathJax/LaTeX notation: https://help.obsidian.md/advanced-syntax
|
||||
- MathJax supports Node/server-side use through components: https://docs.mathjax.org/en/v4.1/server/components.html
|
||||
- MathJax can convert TeX strings to SVG, including `tex2svgPromise()`: https://docs.mathjax.org/en/latest/web/convert.html
|
||||
|
||||
## Design Decisions
|
||||
|
||||
1. Use MathJax, not KaTeX, as the primary checker.
|
||||
- Obsidian compatibility is the output standard.
|
||||
- Obsidian uses MathJax for math rendering.
|
||||
- KaTeX can remain a future fast preflight option, but it should not define v1 pass/fail behavior.
|
||||
|
||||
2. Run MathJax locally through Node.js.
|
||||
- Do not use a CDN.
|
||||
- Do not fetch packages at conversion time.
|
||||
- Do not call remote render APIs.
|
||||
|
||||
3. Batch formulas in one Node process per conversion.
|
||||
- Spawning one process per formula would be too slow for math-heavy papers.
|
||||
- `samples/MITC공부.pdf` produced 126 math expressions, so batch checking is the practical baseline.
|
||||
|
||||
4. Treat unavailable tooling differently from invalid math.
|
||||
- Missing Node.js, missing MathJax, bad helper path, timeout, or invalid helper JSON should produce an info-level unavailable-checker warning.
|
||||
- A MathJax parse/render failure for a specific expression should produce a warning-level `MATH_RENDER_FAILED` record and increment `math_render_error_count`.
|
||||
|
||||
5. Preserve conversion continuity.
|
||||
- Math render failures never remove formulas from the Markdown.
|
||||
- Math render failures do not trigger fallback engines.
|
||||
- The generated report remains derived from metadata and local checks.
|
||||
|
||||
## Proposed Touched Surfaces
|
||||
|
||||
- `src/pdf2md/quality.py`
|
||||
- Replace body-only math iteration with expression records carrying body, display mode, index, and Markdown span.
|
||||
- Keep code fence and inline code protection.
|
||||
- Keep unavailable-checker behavior non-fatal.
|
||||
|
||||
- `src/pdf2md/math_render.py`
|
||||
- Add the Python wrapper for local MathJax checking.
|
||||
- Probe Node.js availability.
|
||||
- Execute the Node helper with JSON stdin.
|
||||
- Parse JSON stdout into project-owned check results.
|
||||
- Convert helper failures into `MathCheckerUnavailable`.
|
||||
|
||||
- `tools/mathjax-checker/check.mjs`
|
||||
- Add the Node helper.
|
||||
- Load local MathJax components.
|
||||
- Accept JSON input with expressions.
|
||||
- Return JSON results only.
|
||||
|
||||
- `package.json` and lockfile, or equivalent local setup documentation
|
||||
- Add a local MathJax Node dependency if the project chooses a committed Node package manifest.
|
||||
- The implementation should not install npm dependencies during conversion.
|
||||
|
||||
- `src/pdf2md/conversion.py`
|
||||
- Wire the default local checker when available, while preserving dependency injection for tests.
|
||||
- Keep the public Python API compatible unless a later sprint explicitly changes it.
|
||||
|
||||
- `src/pdf2md/doctor.py`
|
||||
- Add diagnostic checks for Node.js and local MathJax checker availability.
|
||||
- Report missing MathJax as a warning, not a hard failure, unless the project later decides the checker is mandatory.
|
||||
|
||||
- `README.md` or setup documentation
|
||||
- Document local MathJax checker setup.
|
||||
- Explain that missing MathJax does not block conversion but leaves renderability unvalidated.
|
||||
|
||||
- `PROGRESS.md`
|
||||
- Record the implementation and verification outcome after completion.
|
||||
|
||||
## Data Model Plan
|
||||
|
||||
Add a small expression record for quality checking:
|
||||
|
||||
```python
|
||||
@dataclass(frozen=True)
|
||||
class MathExpression:
|
||||
index: int
|
||||
body: str
|
||||
display: bool
|
||||
markdown_span: tuple[int, int]
|
||||
```
|
||||
|
||||
The checker output should be project-owned and independent of MathJax internals:
|
||||
|
||||
```python
|
||||
@dataclass(frozen=True)
|
||||
class MathCheckResult:
|
||||
ok: bool
|
||||
message: str = ""
|
||||
```
|
||||
|
||||
If per-expression metadata is later needed, extend warning messages first. Do not expose raw MathJax objects in metadata or public API return values.
|
||||
|
||||
## Node Helper Contract
|
||||
|
||||
Input over stdin:
|
||||
|
||||
```json
|
||||
{
|
||||
"expressions": [
|
||||
{"index": 0, "body": "x^2", "display": false},
|
||||
{"index": 1, "body": "\\frac{1}{2}", "display": true}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
Output over stdout:
|
||||
|
||||
```json
|
||||
{
|
||||
"results": [
|
||||
{"index": 0, "ok": true},
|
||||
{"index": 1, "ok": false, "message": "MathJax error message"}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
stderr is reserved for diagnostics only. The Python wrapper should not depend on stderr format.
|
||||
|
||||
## Python Wrapper Behavior
|
||||
|
||||
The wrapper should:
|
||||
|
||||
1. Locate `node`.
|
||||
2. Locate the helper script.
|
||||
3. Send all expressions through JSON stdin.
|
||||
4. Set a deterministic timeout.
|
||||
5. Require valid JSON stdout.
|
||||
6. Map each result by expression index.
|
||||
7. Return a `MathCheckResult` for every expression; report expression-level failures as `ok=False` results with a message rather than raising.
|
||||
8. Raise `MathCheckerUnavailable` for tool-level failures.
|
||||
|
||||
Recommended default timeout policy:
|
||||
|
||||
- Start with one conversion-level MathJax timeout.
|
||||
- Use a conservative default such as 60 seconds for a document batch.
|
||||
- Make the timeout test-injectable.
|
||||
- Do not add CLI flags for timeout unless the user explicitly asks for configuration.
|
||||
|
||||
## Integration Plan
|
||||
|
||||
1. Refactor math extraction in `quality.py`.
|
||||
- Add expression records.
|
||||
- Preserve existing code-block exclusions.
|
||||
- Preserve inline/display count behavior.
|
||||
- Add tests for display mode and spans.
|
||||
|
||||
2. Add mocked MathJax wrapper tests.
|
||||
- Fake successful Node JSON response.
|
||||
- Fake per-expression failure.
|
||||
- Fake missing `node`.
|
||||
- Fake timeout.
|
||||
- Fake invalid JSON.
|
||||
- Fake mismatched expression indexes.
|
||||
|
||||
3. Add the Node helper.
|
||||
- Keep stdout as JSON only.
|
||||
- Ensure local package resolution.
|
||||
- Avoid remote imports or CDN URLs.
|
||||
|
||||
4. Wire the checker into conversion.
|
||||
- If a test injects `math_checker`, use the injected checker.
|
||||
- Otherwise, build a default local MathJax checker when available.
|
||||
- If unavailable, keep the current info warning behavior.
|
||||
|
||||
5. Extend doctor.
|
||||
- Report Node.js availability.
|
||||
- Report local MathJax package/helper availability.
|
||||
- Keep missing MathJax as WARN.
|
||||
|
||||
6. Update setup docs.
|
||||
- Explain how to install local MathJax dependencies.
|
||||
- Explain expected report behavior with and without MathJax.
|
||||
|
||||
7. Run optional real fixture validation.
|
||||
- Re-run `samples/MITC공부.pdf` only under an explicit local fixture gate or direct user request.
|
||||
- Confirm that the `MATH_RENDER_FAILED` unavailable-checker warning disappears when MathJax is installed.
|
||||
|
||||
## Test Plan
|
||||
|
||||
Default fast tests, no real Node or MathJax required:
|
||||
|
||||
- `uv run pytest tests/test_quality.py`
|
||||
- `uv run pytest tests/test_conversion.py`
|
||||
- `uv run pytest tests/test_doctor.py tests/test_cli.py`
|
||||
- `uv run pytest`
|
||||
|
||||
New required tests:
|
||||
|
||||
- Extract inline and display expressions with correct `display` values.
|
||||
- Ignore math-like text inside fenced code and inline code.
|
||||
- Count failures from injected checker results.
|
||||
- Preserve conversion success when some formulas fail render checks.
|
||||
- Preserve info warning when the checker is unavailable.
|
||||
- Validate Python wrapper command construction and JSON stdin/stdout handling with a fake runner.
|
||||
- Validate timeout and invalid JSON handling.
|
||||
- Validate doctor warning output when Node.js or MathJax is missing.
|
||||
|
||||
Optional local tests:
|
||||
|
||||
- Run the Node helper against a small expression list.
|
||||
- Run the converter on `samples/MITC공부.pdf`.
|
||||
- Confirm report fields:
|
||||
- `Math render error count` is the actual failure count.
|
||||
- Missing checker info warning is absent when MathJax is available.
|
||||
- Asset link counts remain 0 missing and 0 invalid for the sample.
|
||||
|
||||
## Acceptance Criteria
|
||||
|
||||
- Default test suite passes without Node.js or MathJax.
|
||||
- Local-only policy is preserved: no CDN, remote API, or document upload path.
|
||||
- `pdf2md doctor` reports MathJax checker availability clearly.
|
||||
- Conversion still succeeds when MathJax is unavailable, with an info warning.
|
||||
- Conversion still succeeds when individual formulas fail, with warning records.
|
||||
- `.metadata.json` and `.report.md` show actual math render failure counts when MathJax is available.
|
||||
- The generated Markdown is not changed by the checker.
|
||||
|
||||
## Hard Failure Criteria
|
||||
|
||||
- The checker blocks conversion when Markdown output exists.
|
||||
- The checker uses a remote service or CDN at runtime.
|
||||
- Default tests require Node.js, MathJax, MinerU, GPU, network, Obsidian, or sample PDFs.
|
||||
- Raw MathJax output objects become public API return types.
|
||||
- The report records renderability as successful when the checker did not actually run.
|
||||
|
||||
## Open Decisions Before Implementation
|
||||
|
||||
1. Dependency packaging:
|
||||
- Use committed `package.json` and lockfile for a local MathJax package, or document a manual local npm setup.
|
||||
- Recommended: commit `package.json` and lockfile so setup is reproducible.
|
||||
|
||||
2. Default checker activation:
|
||||
- Recommended: auto-attempt the local checker when available; otherwise emit the existing unavailable-checker info warning.
|
||||
|
||||
3. Timeout value:
|
||||
- Recommended initial default: 60 seconds per document batch, test-injectable and documented.
|
||||
|
||||
4. Doctor severity:
|
||||
- Recommended: missing MathJax checker is WARN, not FAIL, because conversion can still produce useful output.
|
||||
|
||||
5. Real fixture gate:
|
||||
- Recommended: keep real sample conversion explicit and opt-in for tests, but allow direct user-requested runs.
|
||||
|
||||
## Suggested Implementation Contract
|
||||
|
||||
Objective:
|
||||
|
||||
- Implement a local MathJax render checker that validates normalized Markdown math expressions and records failures in metadata/report output without changing conversion continuity.
|
||||
|
||||
Expected outputs:
|
||||
|
||||
- Python MathJax checker wrapper.
|
||||
- Node MathJax helper.
|
||||
- Updated quality extraction for display/inline expression records.
|
||||
- Doctor warning coverage for missing checker dependencies.
|
||||
- Setup documentation.
|
||||
- Fast mocked tests and optional real local checker validation.
|
||||
|
||||
Non-goals:
|
||||
|
||||
- No cloud rendering.
|
||||
- No Obsidian app automation.
|
||||
- No full LaTeX engine.
|
||||
- No manual review queue.
|
||||
- No runtime engine selection.
|
||||
- No correction or rewriting of failed formulas.
|
||||
|
||||
Verification:
|
||||
|
||||
- `uv run pytest`
|
||||
- `git diff --check`
|
||||
- Optional local Node helper smoke test.
|
||||
- Optional `samples/MITC공부.pdf` conversion after local MathJax setup.
|
||||
@@ -0,0 +1,239 @@
|
||||
# Sprint 0 Contract: Source And Environment Verification
|
||||
|
||||
Status: Completed
|
||||
Last updated: 2026-05-07
|
||||
Result: PASS, go-with-risks
|
||||
|
||||
## Objective
|
||||
|
||||
Verify the external facts and local environment assumptions needed before any converter implementation starts.
|
||||
|
||||
Current amendment: the project target changed after the original Sprint 0 pass from MinerU 2.5 to MinerU 3.1.0. Read MinerU version constraints in this contract as applying to MinerU 3.1.0 for future work.
|
||||
|
||||
Sprint 0 must answer whether the planned v1 implementation can proceed with:
|
||||
|
||||
- MinerU 3.1.0 through direct local CLI execution only.
|
||||
- Python 3.12 and `uv`.
|
||||
- Windows PowerShell on the current workspace.
|
||||
- NVIDIA GTX 1070 Ti 8GB as the target GPU.
|
||||
- Local-only processing with no cloud OCR, remote LLM/VLM, hosted parser, `--api-url`, remote APIs, router mode, HTTP client backends, or remote OpenAI-compatible backends.
|
||||
- The temporary local `mineru-api` process that the MinerU 3.1.0 CLI starts internally when it runs without `--api-url`; this process is allowed.
|
||||
|
||||
## Touched Surfaces
|
||||
|
||||
Allowed:
|
||||
|
||||
- `docs/KNOWLEDGEBASE.md`
|
||||
- `docs/V1IMPLEMENTATIONPLAN.md` only if the sprint sequence or constraints need adjustment
|
||||
- `docs/Sprints/SPRINT0CONTRACT.md`
|
||||
- `PROGRESS.md`
|
||||
|
||||
Not allowed:
|
||||
|
||||
- `pyproject.toml`
|
||||
- `src/`
|
||||
- `tests/`
|
||||
- `scripts/`
|
||||
- Any converter implementation code
|
||||
- Any committed file under `samples/`
|
||||
|
||||
## Expected Outputs
|
||||
|
||||
Sprint 0 should produce a concise source-backed handoff in `docs/KNOWLEDGEBASE.md` under the heading `## 9. Sprint 0 Verification (2026-05-07)`, plus a short status and handoff entry in `PROGRESS.md`.
|
||||
|
||||
Evidence requirements:
|
||||
|
||||
- Prefer official or primary sources.
|
||||
- Use non-official sources only when official documentation is incomplete and the source is directly relevant.
|
||||
- For volatile implementation claims, record source URL, access date, and whether the claim is a direct fact or a project inference.
|
||||
- Record failures from allowed local commands as environment facts. Do not fix them during Sprint 0.
|
||||
- Web research is allowed for documentation verification. Runtime converter design must remain local-only.
|
||||
|
||||
The handoff must cover:
|
||||
|
||||
1. MinerU 3.1.0 local CLI facts
|
||||
- Install command or supported install path.
|
||||
- Version command or reliable version detection path.
|
||||
- Direct local CLI invocation shape for PDF conversion.
|
||||
- Supported output locations for Markdown, JSON/structured data, assets, logs, and raw diagnostics.
|
||||
- Whether local execution can be kept free of router/API/HTTP endpoint modes.
|
||||
|
||||
2. Python and package workflow facts
|
||||
- Python 3.12 compatibility status for the planned dependency stack.
|
||||
- `uv` setup implications.
|
||||
- Any packaging constraints that affect Sprint 1 scaffolding.
|
||||
|
||||
3. GPU and runtime facts
|
||||
- CUDA/PyTorch expectations relevant to GTX 1070 Ti 8GB.
|
||||
- Known GPU memory risks or CPU fallback/error-message implications.
|
||||
- Commands that future `pdf2md doctor` should check.
|
||||
|
||||
4. License and privacy facts
|
||||
- MinerU license status.
|
||||
- Model-weight license status when identifiable.
|
||||
- Transitive package/license risks that must be reviewed before redistribution.
|
||||
- Confirmation that v1 runtime design remains local-only.
|
||||
|
||||
5. Implementation go/no-go recommendation
|
||||
- `go`: proceed to Sprint 1 with listed assumptions.
|
||||
- `go-with-risks`: proceed but carry specific risks into later sprint contracts. Each risk must name the later sprint that must absorb it.
|
||||
- `blocked`: stop and ask the user for a requirement change.
|
||||
|
||||
## Non-Goals
|
||||
|
||||
- Do not scaffold the Python project.
|
||||
- Do not install MinerU, CUDA, PyTorch, or model weights.
|
||||
- Do not run full PDF conversion.
|
||||
- Do not edit `samples/` or commit sample files.
|
||||
- Do not introduce candidate engine comparisons.
|
||||
- Do not decide a new conversion engine.
|
||||
- Do not add implementation abstractions, config systems, or CLI flags.
|
||||
|
||||
## Work Packages
|
||||
|
||||
### WP0.1: MinerU Source Review
|
||||
|
||||
Owner:
|
||||
|
||||
- `research-agent`
|
||||
- `mineru-research` skill
|
||||
|
||||
Actions:
|
||||
|
||||
- Review official MinerU documentation, MinerU GitHub, release notes/tags, and relevant license files.
|
||||
- Verify current install, CLI, version, output, model/cache, and local execution facts.
|
||||
- Record source URLs and access date for durable claims.
|
||||
|
||||
Output:
|
||||
|
||||
- Source-backed MinerU fact table in `docs/KNOWLEDGEBASE.md` under `## 9. Sprint 0 Verification (2026-05-07)`.
|
||||
- Any adapter-impacting uncertainty listed in `PROGRESS.md`.
|
||||
|
||||
### WP0.2: Local Environment Review
|
||||
|
||||
Owner:
|
||||
|
||||
- `local-setup-agent`
|
||||
|
||||
Actions:
|
||||
|
||||
- Check non-invasive local facts, such as Python version, `uv` availability, and GPU visibility when available.
|
||||
- Review official Python, `uv`, PyTorch/CUDA, and NVIDIA-relevant documentation for compatibility constraints.
|
||||
- Identify future `pdf2md doctor` checks.
|
||||
|
||||
Output:
|
||||
|
||||
- Environment compatibility notes and doctor-check requirements.
|
||||
- Explicit GTX 1070 Ti 8GB risks.
|
||||
|
||||
Allowed local commands:
|
||||
|
||||
```powershell
|
||||
python --version
|
||||
uv --version
|
||||
nvidia-smi
|
||||
```
|
||||
|
||||
Only run these commands if they are useful for the active research pass. Do not install or modify system packages.
|
||||
|
||||
### WP0.3: Output Layout Probe Plan
|
||||
|
||||
Owner:
|
||||
|
||||
- `mineru-integration-agent`
|
||||
|
||||
Actions:
|
||||
|
||||
- Define what must be observed from MinerU before implementing the adapter.
|
||||
- If MinerU is already installed locally, a later user-approved probe may inspect command help or run against a disposable file. Sprint 0 does not require conversion execution.
|
||||
|
||||
Output:
|
||||
|
||||
- Adapter-facing output layout assumptions.
|
||||
- List of fields that must remain optional until observed from real MinerU output.
|
||||
|
||||
### WP0.4: License And Privacy Review
|
||||
|
||||
Owner:
|
||||
|
||||
- `license-privacy-agent`
|
||||
|
||||
Actions:
|
||||
|
||||
- Review MinerU and model/package license sources.
|
||||
- Distinguish personal/research use from redistribution.
|
||||
- Check that no planned runtime path uploads PDFs, page images, extracted text, or model intermediates.
|
||||
|
||||
Output:
|
||||
|
||||
- License/privacy summary with unresolved obligations.
|
||||
- Blockers if redistribution assumptions are unsafe.
|
||||
|
||||
### WP0.5: Contract Evaluation
|
||||
|
||||
Owner:
|
||||
|
||||
- `evaluation-agent`
|
||||
|
||||
Actions:
|
||||
|
||||
- Review this contract before Sprint 0 research starts.
|
||||
- Require concrete evidence expectations and failure thresholds.
|
||||
- After Sprint 0 research, independently verify that each expected output is present and source-backed.
|
||||
|
||||
Output:
|
||||
|
||||
- Pass/fail evaluation notes.
|
||||
- Specific follow-up findings if the contract or results are incomplete.
|
||||
|
||||
## Verification Checks
|
||||
|
||||
Required:
|
||||
|
||||
- Every volatile implementation fact has an official or primary source URL.
|
||||
- Source-backed claims distinguish direct fact from project inference.
|
||||
- `docs/KNOWLEDGEBASE.md` remains consistent with `PRD.md` and `ARCHITECTURE.md`.
|
||||
- No candidate engine comparison is reintroduced.
|
||||
- No converter implementation code is created.
|
||||
- `samples/` remains untracked.
|
||||
- `git diff --check` passes for documentation changes.
|
||||
|
||||
Recommended:
|
||||
|
||||
- Check all newly added links manually or with a lightweight local link check if available.
|
||||
- Have `evaluation-agent` review the completed Sprint 0 outputs before proceeding to Sprint 1.
|
||||
|
||||
## Hard Failure Criteria
|
||||
|
||||
Sprint 0 fails and must stop for a user decision if any of these are true:
|
||||
|
||||
- MinerU 3.1.0 cannot be invoked through a direct local CLI path suitable for v1.
|
||||
- MinerU's required v1 path requires `--api-url`, router mode, HTTP client backends, remote APIs, or remote OpenAI-compatible backend behavior.
|
||||
- Python 3.12 is incompatible with the required implementation stack.
|
||||
- Local-only processing cannot be maintained.
|
||||
- License terms clearly block the intended personal/research use.
|
||||
- Required evidence cannot be verified from official or primary sources.
|
||||
|
||||
## Acceptance Criteria
|
||||
|
||||
Sprint 0 is complete when:
|
||||
|
||||
- `docs/KNOWLEDGEBASE.md` contains updated Sprint 0 facts with sources.
|
||||
- `PROGRESS.md` records checks performed, unresolved risks, and go/no-go recommendation.
|
||||
- Future Sprint 1 can proceed without guessing install, CLI, environment, or licensing assumptions.
|
||||
- The evaluator review is complete.
|
||||
- The completed documentation change is committed.
|
||||
|
||||
## Handoff Fields
|
||||
|
||||
Use these fields when Sprint 0 completes:
|
||||
|
||||
- Files changed:
|
||||
- Sources checked:
|
||||
- Local commands run:
|
||||
- Facts confirmed:
|
||||
- Inferences made:
|
||||
- Known failures:
|
||||
- Residual risks:
|
||||
- Go/no-go recommendation:
|
||||
- Next action:
|
||||
@@ -0,0 +1,355 @@
|
||||
# Sprint 10 Contract: Pre-Conversion PDF Page Chunking
|
||||
|
||||
Status: Implemented
|
||||
Last updated: 2026-05-08
|
||||
|
||||
## Objective
|
||||
|
||||
Add an opt-in pre-conversion workflow for long PDFs:
|
||||
|
||||
1. Split each source PDF into fixed-size chunk PDFs of 20 pages.
|
||||
2. Convert each chunk PDF independently through the existing MinerU conversion pipeline.
|
||||
3. Do not merge the generated Markdown files.
|
||||
|
||||
The feature is intended to reduce long-document memory/runtime pressure and make partial progress usable when one chunk fails. It must preserve strict-local execution, keep MinerU 3.1.0 as the only conversion engine, and keep default tests independent of real MinerU, GPU, CUDA, model files, network access, Obsidian, LaTeX tooling, and `samples/`.
|
||||
|
||||
## Research Summary
|
||||
|
||||
Sources checked on 2026-05-08:
|
||||
|
||||
- [pypdf PyPI](https://pypi.org/project/pypdf/): current release observed as `6.10.2`, uploaded 2026-04-15; metadata lists `BSD-3-Clause`, Python `>=3.9`, Python 3.12 support, and describes pypdf as a pure-Python PDF library capable of splitting, merging, cropping, and transforming PDF pages.
|
||||
- [pypdf merging docs](https://pypdf.readthedocs.io/en/stable/user/merging-pdfs.html): `PdfWriter.append()` can append a complete or partial source PDF; examples use zero-based page ranges such as `(0, 10)`. The docs recommend `append` or `merge` over low-level `add_page` / `insert_page`.
|
||||
- [pypdf streaming docs](https://pypdf.readthedocs.io/en/latest/user/streaming-data.html): `PdfReader` and `PdfWriter` support file-like objects, but the project should write chunk PDFs to local disk because MinerU accepts local file paths.
|
||||
- [pypdf PdfWriter docs](https://pypdf.readthedocs.io/en/stable/modules/PdfWriter.html): writer operations clone/copy PDF objects into the destination. The docs warn that cloning linked objects can copy more than just the visible page object in some cases, so chunk output size must be checked in tests.
|
||||
- [MinerU CLI tools docs](https://opendatalab.github.io/MinerU/usage/cli_tools/): the direct `mineru` CLI accepts `-p/--path`, `-o/--output`, `-s/--start`, and `-e/--end`; without `--api-url`, it launches a temporary local `mineru-api`.
|
||||
- [PyMuPDF PyPI](https://pypi.org/project/PyMuPDF/): PyMuPDF is fast and local, but PyPI lists dual licensing under GNU AGPL v3 or an Artifex commercial license.
|
||||
- [pikepdf page assembly docs](https://pikepdf.readthedocs.io/en/latest/topics/pages.html): pikepdf can split pages and transfer page-associated data; it is a capable fallback candidate but adds a QPDF-backed dependency and is not needed for a first implementation.
|
||||
|
||||
## Recommended Package Decision
|
||||
|
||||
Use `pypdf` for Sprint 10.
|
||||
|
||||
Rationale:
|
||||
|
||||
- It is pure Python and fits the current Python 3.12 + `uv` workflow.
|
||||
- It has permissive `BSD-3-Clause` metadata on PyPI.
|
||||
- It directly supports page-level PDF assembly with `PdfReader` / `PdfWriter`.
|
||||
- It avoids adding PyMuPDF's AGPL/commercial licensing considerations for a simple split-only feature.
|
||||
- It avoids adding pikepdf/QPDF native dependency complexity before there is evidence that pypdf cannot handle the project samples.
|
||||
|
||||
Recommended dependency range for implementation:
|
||||
|
||||
```toml
|
||||
dependencies = [
|
||||
"pypdf>=6.10.2,<7",
|
||||
]
|
||||
```
|
||||
|
||||
The implementation adds this dependency to `pyproject.toml` and `uv.lock`.
|
||||
|
||||
## Current Precondition
|
||||
|
||||
- `pdf2md convert` already converts one PDF or a directory of PDFs.
|
||||
- Existing conversion output per input includes:
|
||||
- Markdown
|
||||
- optional metadata JSON, enabled by default
|
||||
- `<stem>.report.md`
|
||||
- assets directory
|
||||
- optional raw MinerU output
|
||||
- `plan_outputs()` already enforces overwrite and output-root safety.
|
||||
- `convert_input()` already handles directory batches and continues after per-file failures.
|
||||
- The MinerU adapter accepts one PDF path at a time and runs direct local `mineru` CLI.
|
||||
- `samples/` is local and untracked; do not commit sample PDFs or generated outputs.
|
||||
|
||||
## Touched Surfaces
|
||||
|
||||
Allowed during implementation:
|
||||
|
||||
- `pyproject.toml`
|
||||
- `uv.lock`
|
||||
- `src/pdf2md/pdf_splitter.py`
|
||||
- `src/pdf2md/paths.py`
|
||||
- `src/pdf2md/conversion.py`
|
||||
- `src/pdf2md/cli.py`
|
||||
- `src/pdf2md/ir.py` only if new warning codes or chunk provenance records are required
|
||||
- `src/pdf2md/metadata.py` only for chunk provenance fields
|
||||
- `src/pdf2md/report.py` only to expose chunk provenance in reports
|
||||
- `tests/test_pdf_splitter.py`
|
||||
- `tests/test_conversion.py`
|
||||
- `tests/test_cli.py`
|
||||
- `tests/test_paths.py`
|
||||
- `tests/test_metadata.py`
|
||||
- `tests/integration/` for mocked chunk workflow coverage
|
||||
- `README.md`
|
||||
- `docs/V1IMPLEMENTATIONPLAN.md`
|
||||
- `docs/Sprints/SPRINT10CONTRACT.md`
|
||||
- `PLAN.md`
|
||||
- `PROGRESS.md`
|
||||
|
||||
Not allowed:
|
||||
|
||||
- Runtime engine selection or alternate conversion engines.
|
||||
- Use of cloud OCR, remote LLM/VLM, hosted renderers, hosted document parsers, `--api-url`, router mode, HTTP client backends, remote APIs, or remote OpenAI-compatible backends.
|
||||
- Mandatory default tests requiring real MinerU, GPU, CUDA, model files, network, Obsidian, LaTeX tooling, or `samples/`.
|
||||
- Committed files under `samples/`.
|
||||
- Committed generated conversion outputs.
|
||||
- Automatic model or package downloads triggered at import time or by `doctor`, `convert`, or tests.
|
||||
- Markdown merge behavior for chunk outputs.
|
||||
- Claims that chunking improves formula correctness; it is only a processing-control feature.
|
||||
|
||||
## Product Behavior
|
||||
|
||||
Activation:
|
||||
|
||||
- Chunking is opt-in and existing conversion behavior is unchanged when `chunk_pages` is unset.
|
||||
- CLI: `pdf2md convert INPUT --out OUTPUT_DIR --chunk-pages` uses the default chunk size of 20 pages.
|
||||
- CLI: `pdf2md convert INPUT --out OUTPUT_DIR --chunk-pages 20` uses an explicit positive chunk size.
|
||||
- Python API: `convert_pdf(..., chunk_pages=20)` and `convert_input(..., chunk_pages=20)`.
|
||||
- `convert_pdf()` returns `ConversionResult` without chunking and `BatchConversionResult` when chunk mode is active.
|
||||
- `chunk_pages` must be `None` or a positive integer.
|
||||
|
||||
Chunking behavior:
|
||||
|
||||
- If `chunk_pages` is unset, current behavior remains unchanged.
|
||||
- If `chunk_pages=20` and a PDF has 20 or fewer pages, conversion may either:
|
||||
- convert the original PDF directly, or
|
||||
- create one chunk PDF and convert that chunk.
|
||||
- Recommended: convert the original directly when `total_pages <= chunk_pages` to avoid unnecessary intermediate files.
|
||||
- If a PDF has more than 20 pages, split it into chunk PDFs with ranges:
|
||||
- chunk 1: source pages 1-20
|
||||
- chunk 2: source pages 21-40
|
||||
- chunk N: remaining pages
|
||||
- Convert chunk PDFs sequentially, not in parallel. GTX 1070 Ti 8GB memory pressure makes sequential conversion the safer default.
|
||||
- If one chunk conversion fails, continue with later chunks and report the failed chunk clearly.
|
||||
- Do not merge Markdown outputs.
|
||||
|
||||
Recommended chunk output naming:
|
||||
|
||||
```text
|
||||
<stem>.part-001.pages-001-020.md
|
||||
<stem>.part-001.pages-001-020.metadata.json
|
||||
<stem>.part-001.pages-001-020.report.md
|
||||
<stem>.part-001.pages-001-020.assets/
|
||||
|
||||
<stem>.part-002.pages-021-040.md
|
||||
...
|
||||
```
|
||||
|
||||
Recommended chunk PDF staging:
|
||||
|
||||
- Use a temporary working directory.
|
||||
- Delete temporary chunk PDFs after conversion completes, including when `--keep-raw` is enabled.
|
||||
- Do not add a separate `--keep-chunks` flag in Sprint 10.
|
||||
|
||||
## Provenance Requirements
|
||||
|
||||
Each chunk conversion must preserve original-source context.
|
||||
|
||||
Required chunk fields in metadata or engine options:
|
||||
|
||||
- original source PDF path
|
||||
- original source SHA-256
|
||||
- chunk PDF path when retained, or chunk PDF filename when temporary
|
||||
- chunk index, 1-based
|
||||
- total chunk count
|
||||
- source page start, 1-based inclusive
|
||||
- source page end, 1-based inclusive
|
||||
- chunk page count
|
||||
|
||||
Page provenance must distinguish:
|
||||
|
||||
- chunk-local page index, starting at 0 for MinerU output
|
||||
- original source page number, starting at 1 for user-facing reports
|
||||
|
||||
The report should include a short chunk context line, for example:
|
||||
|
||||
```text
|
||||
- Chunk: 2/5, source pages: 21-40
|
||||
```
|
||||
|
||||
## Architecture Plan
|
||||
|
||||
### WP10.1: PDF Splitter Module
|
||||
|
||||
Owner:
|
||||
|
||||
- `feature-generator-agent`
|
||||
- `mineru-integration-agent`
|
||||
|
||||
Actions:
|
||||
|
||||
- Add `src/pdf2md/pdf_splitter.py`.
|
||||
- Define project-owned `PdfChunkPlan`.
|
||||
- Implement page counting with `pypdf.PdfReader`.
|
||||
- Implement chunk planning without writing files.
|
||||
- Implement chunk writing with `pypdf.PdfWriter.append(source, (start, end))` or an equivalent tested `PdfReader`/`PdfWriter` path.
|
||||
- Use zero-based half-open page ranges internally and one-based inclusive ranges for filenames and reports.
|
||||
- Reject invalid chunk sizes with clear `ValueError`.
|
||||
- Fail clearly on encrypted/password-protected PDFs unless a later sprint adds password handling.
|
||||
|
||||
Expected output:
|
||||
|
||||
- Deterministic chunk plans and local chunk PDFs suitable for the existing MinerU adapter.
|
||||
|
||||
### WP10.2: Chunk-Aware Path Planning
|
||||
|
||||
Owner:
|
||||
|
||||
- `feature-generator-agent`
|
||||
|
||||
Actions:
|
||||
|
||||
- Extend output planning so chunk outputs are deterministic and conflict-checked before conversion starts.
|
||||
- Avoid collisions between original-output stems and chunk-output stems.
|
||||
- Preserve output-root escape prevention.
|
||||
- Respect `--overwrite`.
|
||||
- Keep Korean and non-ASCII source stems working.
|
||||
|
||||
Expected output:
|
||||
|
||||
- A long PDF can produce multiple planned Markdown/metadata/report/assets outputs without overwriting another chunk.
|
||||
|
||||
### WP10.3: Conversion Orchestration
|
||||
|
||||
Owner:
|
||||
|
||||
- `feature-generator-agent`
|
||||
- `mineru-integration-agent`
|
||||
|
||||
Actions:
|
||||
|
||||
- Add chunk mode to `convert_pdf()` and `convert_input()`.
|
||||
- When chunk mode is active, split before calling the MinerU adapter.
|
||||
- Reuse existing per-PDF conversion path for each chunk PDF rather than creating a second conversion pipeline.
|
||||
- Continue conversion after a chunk-level failure and aggregate a batch-like result for the source.
|
||||
- Ensure temporary chunk directories are cleaned up after conversion completes, including when `--keep-raw` is enabled.
|
||||
- Keep strict-local validation unchanged.
|
||||
|
||||
Expected output:
|
||||
|
||||
- Long PDF conversion yields separate Markdown/metadata/report/assets outputs per chunk.
|
||||
|
||||
### WP10.4: Metadata And Report Chunk Provenance
|
||||
|
||||
Owner:
|
||||
|
||||
- `metadata-agent`
|
||||
- `obsidian-markdown-agent`
|
||||
|
||||
Actions:
|
||||
|
||||
- Add chunk provenance fields without exposing raw pypdf objects.
|
||||
- Keep existing required metadata fields valid.
|
||||
- Keep original source provenance visible even though MinerU sees a chunk PDF as input.
|
||||
- Ensure chunk reports are readable without opening JSON metadata.
|
||||
|
||||
Expected output:
|
||||
|
||||
- Users can map each output file back to original source pages.
|
||||
|
||||
### WP10.5: CLI And API Surface
|
||||
|
||||
Owner:
|
||||
|
||||
- `feature-generator-agent`
|
||||
- `requirements-guard-agent`
|
||||
|
||||
Actions:
|
||||
|
||||
- Add `--chunk-pages [INTEGER]` to `pdf2md convert`.
|
||||
- Keep chunking disabled unless the option is present.
|
||||
- Use 20 pages when `--chunk-pages` is present without an explicit value.
|
||||
- Validate positive integer input.
|
||||
- Keep `--out`, `--metadata`, `--keep-raw`, `--recursive`, `--overwrite`, `--gpu`, and strict-local behavior unchanged.
|
||||
- Update README with the long-PDF workflow.
|
||||
|
||||
Expected output:
|
||||
|
||||
```powershell
|
||||
uv run pdf2md convert samples/long.pdf --out outputs --chunk-pages 20
|
||||
```
|
||||
|
||||
### WP10.6: Tests
|
||||
|
||||
Owner:
|
||||
|
||||
- `feature-generator-agent`
|
||||
- `evaluation-agent`
|
||||
|
||||
Default tests must not require real MinerU or sample PDFs.
|
||||
|
||||
Required tests:
|
||||
|
||||
- Build small local PDF fixtures using `pypdf` blank pages or minimal test PDFs.
|
||||
- Page count detection for 1, 20, 21, 40, and 41 pages.
|
||||
- Chunk planning produces expected 1-based filenames and page ranges.
|
||||
- Chunk writing produces PDFs with the expected page counts.
|
||||
- Non-positive chunk size is rejected.
|
||||
- Existing conversion without `--chunk-pages` is unchanged.
|
||||
- Chunked conversion calls the fake adapter once per chunk.
|
||||
- Chunked conversion writes separate Markdown, metadata JSON, report Markdown, and assets per chunk.
|
||||
- `--overwrite` and conflict detection work for all planned chunk outputs.
|
||||
- A failed chunk does not silently fall back and does not prevent later chunks from being attempted.
|
||||
- Metadata/report contain original source PDF and source page range.
|
||||
- CLI validates `--chunk-pages` and prints a useful summary.
|
||||
|
||||
Optional local validation:
|
||||
|
||||
- Run chunked conversion on a local `samples/` PDF only by explicit user request or opt-in gate.
|
||||
- Do not commit generated chunk PDFs or outputs.
|
||||
|
||||
## Acceptance Criteria
|
||||
|
||||
- Sprint 10 implementation can split a PDF into 20-page chunk PDFs before MinerU conversion.
|
||||
- Chunk PDFs are converted one by one using the existing direct local MinerU CLI adapter.
|
||||
- Markdown outputs are separate and not merged.
|
||||
- Metadata/report files show chunk index and original page range.
|
||||
- Default test suite passes without real MinerU, GPU, CUDA, model files, network, Obsidian, LaTeX tooling, or `samples/`.
|
||||
- Strict-local policy remains unchanged.
|
||||
- Existing non-chunked conversion behavior remains backward-compatible.
|
||||
|
||||
## Hard Failure Criteria
|
||||
|
||||
- Chunking uses a remote PDF service or uploads document content.
|
||||
- Chunking introduces an alternate Markdown conversion engine.
|
||||
- Default tests require real MinerU, GPU, CUDA, model files, network, Obsidian, LaTeX tooling, or `samples/`.
|
||||
- Chunk outputs overwrite each other or overwrite non-chunk outputs without `--overwrite`.
|
||||
- Chunk metadata loses original source page provenance.
|
||||
- The implementation merges Markdown despite this contract's non-merge requirement.
|
||||
- The implementation silently skips failed chunks without warnings.
|
||||
|
||||
## Resolved Decisions
|
||||
|
||||
- Activation mode: opt-in with `--chunk-pages`; the option defaults to 20 pages when no value is supplied.
|
||||
- Chunk PDF retention: temporary chunk PDFs only; they are deleted after conversion completes.
|
||||
- API return type: `convert_pdf()` returns a `BatchConversionResult` when chunk mode is active.
|
||||
|
||||
## Verification Commands For Implementation
|
||||
|
||||
```powershell
|
||||
uv sync
|
||||
uv run pytest tests/test_pdf_splitter.py tests/test_conversion.py tests/test_cli.py tests/test_paths.py tests/test_metadata.py
|
||||
uv run pytest
|
||||
git diff --check
|
||||
git status --short --untracked-files=all
|
||||
```
|
||||
|
||||
Optional local command after implementation and explicit user approval:
|
||||
|
||||
```powershell
|
||||
uv run pdf2md convert samples/MITC공부.pdf --out outputs --chunk-pages 20 --overwrite
|
||||
```
|
||||
|
||||
## Handoff Requirements
|
||||
|
||||
After implementation:
|
||||
|
||||
- Update `PROGRESS.md` with files changed, commands run, tests passed, optional sample status, known failures, residual risks, and next action.
|
||||
- Do not mark the sprint implemented until independent evaluation or equivalent focused review verifies the acceptance criteria.
|
||||
- Commit the completed change without including `samples/` or generated outputs.
|
||||
|
||||
## Implementation Handoff
|
||||
|
||||
- Files changed: `pyproject.toml`, `uv.lock`, `src/pdf2md/pdf_splitter.py`, `src/pdf2md/conversion.py`, `src/pdf2md/cli.py`, `src/pdf2md/__init__.py`, `src/pdf2md/report.py`, tests, README, `docs/V1IMPLEMENTATIONPLAN.md`, `PLAN.md`, and `PROGRESS.md`.
|
||||
- Verification status: 42 targeted unit tests passed; the full local test suite passed with 163 tests and 1 optional skip; `git diff --check` passed with line-ending warnings only.
|
||||
- Optional local sample conversion remains out of scope unless explicitly requested.
|
||||
@@ -0,0 +1,258 @@
|
||||
# Sprint 1 Contract: Project Scaffold And Fast Test Loop
|
||||
|
||||
Status: Completed
|
||||
Last updated: 2026-05-07
|
||||
|
||||
## Objective
|
||||
|
||||
Create the minimal Python project scaffold and fast local test loop for the PDF-to-Markdown converter.
|
||||
|
||||
Sprint 1 must establish:
|
||||
|
||||
- A `uv`-managed Python 3.12 project.
|
||||
- A source package importable as `pdf2md`.
|
||||
- A reserved `pdf2md` CLI entry point that does not implement conversion yet.
|
||||
- A fast test command that runs without MinerU, model downloads, GPU access, sample PDFs, or network access.
|
||||
|
||||
Sprint 1 is scaffolding only. It must not implement PDF conversion, MinerU execution, Markdown normalization, metadata generation, or report generation.
|
||||
|
||||
## Current Precondition
|
||||
|
||||
Sprint 0 found that `uv` was not available on PATH in the current local environment.
|
||||
|
||||
Sprint 1 resolved this by installing `uv` per-user at `C:\Users\user\.local\bin`.
|
||||
|
||||
Before Sprint 1 can be accepted, one of these must happen:
|
||||
|
||||
- `uv` is installed and `uv --version` succeeds.
|
||||
- The user explicitly approves including `uv` bootstrap documentation or setup handling as part of Sprint 1, and the contract result records that `uv sync` could not be run locally.
|
||||
|
||||
Do not silently replace `uv` with another package manager.
|
||||
|
||||
## Touched Surfaces
|
||||
|
||||
Allowed:
|
||||
|
||||
- `pyproject.toml`
|
||||
- `uv.lock`
|
||||
- `.gitignore`
|
||||
- `src/pdf2md/__init__.py`
|
||||
- `src/pdf2md/cli.py` only for a minimal placeholder CLI if needed for entry point verification
|
||||
- `tests/`
|
||||
- `README.md` only for minimal setup/test instructions if needed
|
||||
- `PLAN.md` only for current-goal coordination updates required by the shared agent workflow
|
||||
- `PROGRESS.md`
|
||||
- `docs/V1IMPLEMENTATIONPLAN.md` only if sequencing or constraints need adjustment
|
||||
- `docs/Sprints/SPRINT1CONTRACT.md`
|
||||
|
||||
Not allowed:
|
||||
|
||||
- `src/pdf2md/conversion.py`
|
||||
- `src/pdf2md/mineru_adapter.py`
|
||||
- `src/pdf2md/paths.py`
|
||||
- `src/pdf2md/ir.py`
|
||||
- `src/pdf2md/markdown.py`
|
||||
- `src/pdf2md/metadata.py`
|
||||
- `src/pdf2md/quality.py`
|
||||
- `src/pdf2md/report.py`
|
||||
- `src/pdf2md/doctor.py`
|
||||
- `scripts/`
|
||||
- Any real MinerU invocation
|
||||
- Any model download or install script
|
||||
- Any committed file under `samples/`
|
||||
|
||||
## Expected Outputs
|
||||
|
||||
Sprint 1 should produce:
|
||||
|
||||
1. Project package scaffold
|
||||
- `pyproject.toml` with project metadata.
|
||||
- Python requirement constrained to Python 3.12.
|
||||
- Build configuration suitable for a `src/` layout.
|
||||
- `uv.lock` generated by `uv sync`.
|
||||
- `.gitignore` entries for local virtual environments, pytest cache, and Python bytecode.
|
||||
- Minimal test dependency configuration.
|
||||
- CLI entry point name reserved as `pdf2md`.
|
||||
|
||||
2. Minimal source package
|
||||
- `src/pdf2md/__init__.py`.
|
||||
- A stable package import surface.
|
||||
- Optional minimal `src/pdf2md/cli.py` placeholder that exits clearly and does not imply conversion is implemented.
|
||||
|
||||
3. Fast test loop
|
||||
- A minimal test suite that verifies the package imports.
|
||||
- If a CLI placeholder is added, a smoke test that verifies the CLI entry point is wired without invoking conversion.
|
||||
- Tests must not require MinerU, CUDA, GPU, model files, `samples/`, or network.
|
||||
|
||||
4. Developer workflow
|
||||
- `uv sync` should work when `uv` is installed.
|
||||
- `uv run pytest` should work when `uv` is installed.
|
||||
- If `uv` is still missing locally, record the failure explicitly in `PROGRESS.md` and do not mark Sprint 1 complete.
|
||||
|
||||
5. Handoff
|
||||
- `PROGRESS.md` records changed files, commands run, tests passed or blocked, known failures, residual risks, and next action.
|
||||
|
||||
## Non-Goals
|
||||
|
||||
- Do not implement PDF discovery.
|
||||
- Do not implement conversion orchestration.
|
||||
- Do not implement the MinerU adapter.
|
||||
- Do not run MinerU.
|
||||
- Do not install MinerU 3.1.0.
|
||||
- Do not download MinerU models.
|
||||
- Do not implement Markdown normalization.
|
||||
- Do not implement metadata JSON or `.report.md` output.
|
||||
- Do not implement `pdf2md doctor`; a CLI placeholder may mention future commands, but it must not create a doctor module.
|
||||
- Do not add runtime engine selection.
|
||||
- Do not add alternate conversion engines.
|
||||
- Do not add cloud, remote API, router, HTTP client backend, or remote OpenAI-compatible backend support.
|
||||
|
||||
## Work Packages
|
||||
|
||||
### WP1.1: Scaffold Metadata
|
||||
|
||||
Owner:
|
||||
|
||||
- `feature-generator-agent`
|
||||
|
||||
Actions:
|
||||
|
||||
- Create the minimal `pyproject.toml`.
|
||||
- Use Python 3.12 constraints.
|
||||
- Configure a `src/` package layout.
|
||||
- Configure pytest as the fast local test runner.
|
||||
- Reserve the `pdf2md` console script.
|
||||
|
||||
Output:
|
||||
|
||||
- A minimal, maintainable scaffold without speculative dependencies.
|
||||
|
||||
### WP1.2: Package Import Surface
|
||||
|
||||
Owner:
|
||||
|
||||
- `feature-generator-agent`
|
||||
|
||||
Actions:
|
||||
|
||||
- Create `src/pdf2md/__init__.py`.
|
||||
- Expose only a minimal version/import surface.
|
||||
- Avoid public API promises beyond what Sprint 1 verifies.
|
||||
|
||||
Output:
|
||||
|
||||
- `import pdf2md` succeeds.
|
||||
|
||||
### WP1.3: CLI Placeholder
|
||||
|
||||
Owner:
|
||||
|
||||
- `feature-generator-agent`
|
||||
|
||||
Actions:
|
||||
|
||||
- If needed for console script verification, create `src/pdf2md/cli.py`.
|
||||
- The placeholder may expose a help message or a clear "not implemented yet" command.
|
||||
- It must not create conversion flags beyond the reserved command shape unless tests need them.
|
||||
|
||||
Output:
|
||||
|
||||
- `pdf2md` entry point is wired without implying conversion works.
|
||||
|
||||
### WP1.4: Fast Tests
|
||||
|
||||
Owner:
|
||||
|
||||
- `feature-generator-agent`
|
||||
- `evaluation-agent`
|
||||
|
||||
Actions:
|
||||
|
||||
- Add minimal tests for package import and optional CLI placeholder behavior.
|
||||
- Ensure tests are local, fast, and independent of MinerU/model/GPU/network state.
|
||||
|
||||
Output:
|
||||
|
||||
- `uv run pytest` passes when `uv` is available.
|
||||
|
||||
### WP1.5: Independent Evaluation
|
||||
|
||||
Owner:
|
||||
|
||||
- `evaluation-agent`
|
||||
|
||||
Actions:
|
||||
|
||||
- Review the completed scaffold against this contract.
|
||||
- Verify no converter implementation was added.
|
||||
- Verify `samples/` remains untracked and unstaged.
|
||||
- Verify no runtime remote path or alternate engine was introduced.
|
||||
|
||||
Output:
|
||||
|
||||
- PASS/FAIL notes with any missing acceptance criteria.
|
||||
|
||||
## Verification Checks
|
||||
|
||||
Required:
|
||||
|
||||
- `git status --short` before staging confirms `samples/` remains untracked.
|
||||
- `uv --version` is run and the result is recorded.
|
||||
- `uv sync` passes if `uv` is available.
|
||||
- `uv run pytest` passes if `uv` is available.
|
||||
- If `uv` is unavailable, Sprint 1 is marked blocked rather than complete.
|
||||
- Import test passes through the configured test command.
|
||||
- No real MinerU dependency is required for default tests.
|
||||
- No model downloads occur.
|
||||
- No network calls are required.
|
||||
- No candidate engine comparison is reintroduced.
|
||||
- No conversion behavior is implemented.
|
||||
- `git diff --check` passes.
|
||||
|
||||
Recommended:
|
||||
|
||||
- Keep `pyproject.toml` dependency list minimal.
|
||||
- Avoid adding README content beyond setup/test instructions needed for the scaffold.
|
||||
- Use `requirements-guard-agent` to check document consistency if the scaffold reveals a sequencing issue.
|
||||
|
||||
## Hard Failure Criteria
|
||||
|
||||
Sprint 1 fails and must stop for a user decision if any of these are true:
|
||||
|
||||
- `uv` remains unavailable and the user has not approved bootstrap handling.
|
||||
- The project cannot be installed as a Python 3.12 package.
|
||||
- The package cannot be imported as `pdf2md`.
|
||||
- Default tests require MinerU, model downloads, GPU access, sample PDFs, or network access.
|
||||
- The scaffold introduces conversion logic outside Sprint 1 scope.
|
||||
- The scaffold introduces alternate engines or runtime engine selection.
|
||||
- The scaffold introduces `--api-url`, remote APIs, router mode, HTTP client backends, or remote OpenAI-compatible backends.
|
||||
- `samples/` is staged or committed.
|
||||
|
||||
## Acceptance Criteria
|
||||
|
||||
Sprint 1 is complete when:
|
||||
|
||||
- `pyproject.toml` exists and defines a minimal Python 3.12 `uv` project.
|
||||
- `src/pdf2md/__init__.py` exists and `import pdf2md` works through the project environment.
|
||||
- `uv sync` passes.
|
||||
- `uv run pytest` passes.
|
||||
- The `pdf2md` CLI entry point is reserved and does not imply conversion is implemented.
|
||||
- No converter implementation code beyond the allowed placeholder exists.
|
||||
- No default test depends on MinerU, GPU, model files, network, or `samples/`.
|
||||
- `PROGRESS.md` records checks performed and residual risks.
|
||||
- Independent evaluation is complete.
|
||||
- The completed change is committed.
|
||||
|
||||
## Handoff Fields
|
||||
|
||||
Use these fields when Sprint 1 completes:
|
||||
|
||||
- Files changed:
|
||||
- Commands run:
|
||||
- Tests passed:
|
||||
- Tests blocked:
|
||||
- Known failures:
|
||||
- Residual risks:
|
||||
- User decisions needed:
|
||||
- Go/no-go recommendation for Sprint 2:
|
||||
- Next action:
|
||||
@@ -0,0 +1,274 @@
|
||||
# Sprint 2 Contract: Paths, Input Discovery, And Overwrite Planning
|
||||
|
||||
Status: Completed
|
||||
Last updated: 2026-05-07
|
||||
|
||||
## Objective
|
||||
|
||||
Implement deterministic input discovery and output path planning before any PDF conversion logic exists.
|
||||
|
||||
Sprint 2 must establish:
|
||||
|
||||
- A project-owned path planning module for local PDF inputs.
|
||||
- Deterministic discovery for a single PDF, a directory, and optional recursive directory traversal.
|
||||
- Deterministic planned output paths for Markdown, assets, metadata JSON, quality report, and optional raw MinerU output.
|
||||
- Preflight overwrite conflict detection that prevents accidental replacement unless overwrite is explicitly allowed.
|
||||
- Fast unit tests using generated temporary files, including non-ASCII filenames.
|
||||
|
||||
Sprint 2 is path planning only. It must not run MinerU, parse PDFs, write conversion outputs, normalize Markdown, create metadata content, or add the real `convert` command behavior.
|
||||
|
||||
## Current Precondition
|
||||
|
||||
Sprint 1 is complete:
|
||||
|
||||
- `uv` is installed per-user at `C:\Users\user\.local\bin`.
|
||||
- `pyproject.toml`, `uv.lock`, the `pdf2md` package, CLI placeholder, and fast pytest loop exist.
|
||||
- `uv sync` and `uv run pytest` passed.
|
||||
|
||||
If a new shell cannot find `uv`, prepend `C:\Users\user\.local\bin` to PATH for verification commands and record that in `PROGRESS.md`.
|
||||
|
||||
## Touched Surfaces
|
||||
|
||||
Allowed:
|
||||
|
||||
- `src/pdf2md/paths.py`
|
||||
- `src/pdf2md/conversion.py` only for a minimal type boundary if path planning cannot be tested cleanly without it
|
||||
- `src/pdf2md/cli.py` only if a minimal parser hook is needed for path-planning tests; do not expose working conversion behavior
|
||||
- `tests/test_paths.py` or `tests/unit/test_paths.py`
|
||||
- `tests/test_cli.py` only for path-planning parser coverage if `cli.py` changes
|
||||
- `README.md` only if setup/test instructions need a small update
|
||||
- `PLAN.md` only for current-goal coordination updates required by the shared agent workflow
|
||||
- `PROGRESS.md`
|
||||
- `docs/V1IMPLEMENTATIONPLAN.md` only if sequencing or constraints need adjustment
|
||||
- `docs/Sprints/SPRINT2CONTRACT.md`
|
||||
|
||||
Not allowed:
|
||||
|
||||
- `src/pdf2md/mineru_adapter.py`
|
||||
- `src/pdf2md/ir.py`
|
||||
- `src/pdf2md/markdown.py`
|
||||
- `src/pdf2md/metadata.py`
|
||||
- `src/pdf2md/quality.py`
|
||||
- `src/pdf2md/report.py`
|
||||
- `src/pdf2md/doctor.py`
|
||||
- `scripts/`
|
||||
- Any real MinerU invocation
|
||||
- Any model download or install script
|
||||
- Any file parsing beyond local filesystem path and extension checks
|
||||
- Any conversion output writing beyond temporary files created by tests
|
||||
- Any committed file under `samples/`
|
||||
|
||||
## Expected Outputs
|
||||
|
||||
Sprint 2 should produce:
|
||||
|
||||
1. Input discovery
|
||||
- Accept a local path that is either a PDF file or a directory.
|
||||
- Treat `.pdf` extension matching as case-insensitive.
|
||||
- Reject a non-existent path with a clear project-owned error.
|
||||
- Reject a non-PDF file with a clear project-owned error.
|
||||
- Reject a directory with no discovered PDFs with a clear project-owned error.
|
||||
- Discover only direct child PDFs for directory input unless recursive traversal is requested.
|
||||
- Discover nested PDFs only when recursive traversal is requested.
|
||||
- Return discovered PDFs in a deterministic order.
|
||||
|
||||
2. Output path plan
|
||||
- For each discovered PDF, plan:
|
||||
- Markdown path: `<output-root>/<relative-parent>/<stem>.md`.
|
||||
- Assets directory: `<output-root>/<relative-parent>/<stem>.assets`.
|
||||
- Metadata path when metadata is enabled: `<output-root>/<relative-parent>/<stem>.metadata.json`.
|
||||
- Quality report path: `<output-root>/<relative-parent>/<stem>.report.md`.
|
||||
- Raw MinerU directory when raw output is kept: `<output-root>/<relative-parent>/<stem>.raw`.
|
||||
- For a single PDF input, `relative-parent` is empty unless the implementation has a tested reason to preserve more context.
|
||||
- For recursive directory input, preserve the source-relative subdirectory under the output root to avoid filename collisions.
|
||||
- Keep planned paths local filesystem paths. Do not introduce URI, URL, cloud, or remote storage handling.
|
||||
|
||||
3. Overwrite preflight
|
||||
- Detect existing planned file or directory outputs before conversion writes occur.
|
||||
- Report all detected conflicts in one project-owned error instead of failing on the first conflict.
|
||||
- Allow conflicts only when overwrite is explicitly enabled.
|
||||
- Do not delete or replace files in Sprint 2.
|
||||
|
||||
4. Tests
|
||||
- Unit tests for single PDF discovery.
|
||||
- Unit tests for non-recursive directory discovery.
|
||||
- Unit tests for recursive directory discovery.
|
||||
- Unit tests for deterministic ordering.
|
||||
- Unit tests for non-ASCII filenames, including Korean filenames, using temporary files.
|
||||
- Unit tests for invalid input errors.
|
||||
- Unit tests for planned Markdown, assets, metadata, report, and raw output paths.
|
||||
- Unit tests for overwrite conflict detection.
|
||||
|
||||
5. Handoff
|
||||
- `PROGRESS.md` records changed files, commands run, tests passed or blocked, known failures, residual risks, and next action.
|
||||
|
||||
## Non-Goals
|
||||
|
||||
- Do not implement PDF conversion.
|
||||
- Do not implement conversion orchestration.
|
||||
- Do not implement the MinerU adapter.
|
||||
- Do not run MinerU.
|
||||
- Do not install MinerU 3.1.0.
|
||||
- Do not download MinerU models.
|
||||
- Do not parse PDF contents.
|
||||
- Do not compute source SHA-256.
|
||||
- Do not implement Markdown normalization.
|
||||
- Do not implement metadata JSON content.
|
||||
- Do not implement `.report.md` content.
|
||||
- Do not implement `pdf2md convert` as a working command.
|
||||
- Do not implement `pdf2md doctor`.
|
||||
- Do not add runtime engine selection.
|
||||
- Do not add alternate conversion engines.
|
||||
- Do not add cloud, remote API, router, HTTP client backend, or remote OpenAI-compatible backend support.
|
||||
|
||||
## Work Packages
|
||||
|
||||
### WP2.1: Path Planning Types And Errors
|
||||
|
||||
Owner:
|
||||
|
||||
- `feature-generator-agent`
|
||||
|
||||
Actions:
|
||||
|
||||
- Add the smallest project-owned types needed to represent discovered inputs, planned outputs, and overwrite conflicts.
|
||||
- Add clear project-owned exceptions or error result types for invalid inputs and conflicts.
|
||||
- Avoid public API promises beyond what Sprint 2 tests verify.
|
||||
|
||||
Output:
|
||||
|
||||
- Path planning can be tested without converter execution.
|
||||
|
||||
### WP2.2: Input Discovery
|
||||
|
||||
Owner:
|
||||
|
||||
- `feature-generator-agent`
|
||||
|
||||
Actions:
|
||||
|
||||
- Implement single PDF and directory discovery.
|
||||
- Require explicit recursive mode for subdirectory traversal.
|
||||
- Sort results deterministically.
|
||||
- Preserve local `Path` objects rather than converting to strings early.
|
||||
|
||||
Output:
|
||||
|
||||
- Discovery behavior matches the PRD's directory and recursive-traversal requirements.
|
||||
|
||||
### WP2.3: Output Planning
|
||||
|
||||
Owner:
|
||||
|
||||
- `feature-generator-agent`
|
||||
|
||||
Actions:
|
||||
|
||||
- Plan Markdown, assets, metadata, report, and optional raw output paths.
|
||||
- Preserve relative subdirectories for recursive directory input.
|
||||
- Keep all planned outputs under the requested output root.
|
||||
|
||||
Output:
|
||||
|
||||
- Later conversion code can write outputs without rediscovering naming rules.
|
||||
|
||||
### WP2.4: Overwrite Conflict Detection
|
||||
|
||||
Owner:
|
||||
|
||||
- `feature-generator-agent`
|
||||
|
||||
Actions:
|
||||
|
||||
- Check whether any planned output already exists.
|
||||
- Return or raise a structured conflict list when overwrite is not enabled.
|
||||
- Permit the plan when overwrite is enabled without deleting anything.
|
||||
|
||||
Output:
|
||||
|
||||
- Existing user files are protected before conversion starts.
|
||||
|
||||
### WP2.5: Independent Evaluation
|
||||
|
||||
Owner:
|
||||
|
||||
- `evaluation-agent`
|
||||
|
||||
Actions:
|
||||
|
||||
- Review the completed path planning implementation against this contract.
|
||||
- Verify no conversion behavior, MinerU execution, remote runtime path, or alternate engine was added.
|
||||
- Verify `samples/` remains untracked and unstaged.
|
||||
- Verify tests use temporary files, not committed sample PDFs.
|
||||
|
||||
Output:
|
||||
|
||||
- PASS/FAIL notes with any missing acceptance criteria.
|
||||
|
||||
## Verification Checks
|
||||
|
||||
Required:
|
||||
|
||||
- `git status --short` before staging confirms `samples/` remains untracked.
|
||||
- `uv --version` is run and the result is recorded.
|
||||
- `uv sync` passes.
|
||||
- `uv run pytest` passes.
|
||||
- Targeted path planning tests pass.
|
||||
- Tests do not require MinerU, CUDA, GPU, model files, `samples/`, or network.
|
||||
- No real MinerU dependency is required for default tests.
|
||||
- No model downloads occur.
|
||||
- No network calls are required.
|
||||
- No candidate engine comparison is reintroduced.
|
||||
- No conversion behavior is implemented.
|
||||
- No output files are written outside temporary test directories.
|
||||
- `git diff --check` passes.
|
||||
|
||||
Recommended:
|
||||
|
||||
- Use temporary directories for all filesystem tests.
|
||||
- Include Windows-relevant path behavior without hard-coding Windows-only separators in assertions.
|
||||
- Use `requirements-guard-agent` if path planning reveals a contradiction in PRD or architecture wording.
|
||||
|
||||
## Hard Failure Criteria
|
||||
|
||||
Sprint 2 fails and must stop for a user decision if any of these are true:
|
||||
|
||||
- Directory discovery descends recursively without explicit recursive intent.
|
||||
- Existing planned outputs can be overwritten without explicit overwrite intent.
|
||||
- Planned output paths can escape the requested output root.
|
||||
- Default tests require MinerU, CUDA, GPU, model files, network, or `samples/`.
|
||||
- The implementation parses PDF contents or invokes conversion behavior.
|
||||
- The implementation introduces alternate engines or runtime engine selection.
|
||||
- The implementation introduces `--api-url`, remote APIs, router mode, HTTP client backends, or remote OpenAI-compatible backends.
|
||||
- `samples/` is staged or committed.
|
||||
|
||||
## Acceptance Criteria
|
||||
|
||||
Sprint 2 is complete when:
|
||||
|
||||
- `src/pdf2md/paths.py` exists and owns input discovery plus output path planning.
|
||||
- Single PDF discovery is tested.
|
||||
- Non-recursive and recursive directory discovery are tested.
|
||||
- Non-ASCII PDF filenames are tested with generated temporary files.
|
||||
- Markdown, assets, metadata JSON, report Markdown, and optional raw output paths are tested.
|
||||
- Existing-output conflict detection is tested with and without overwrite enabled.
|
||||
- No conversion, MinerU, Markdown normalization, metadata content, report content, or doctor behavior is implemented.
|
||||
- `uv sync` passes.
|
||||
- `uv run pytest` passes.
|
||||
- `PROGRESS.md` records checks performed and residual risks.
|
||||
- Independent evaluation is complete.
|
||||
- The completed change is committed.
|
||||
|
||||
## Handoff Fields
|
||||
|
||||
Use these fields when Sprint 2 completes:
|
||||
|
||||
- Files changed:
|
||||
- Commands run:
|
||||
- Tests passed:
|
||||
- Tests blocked:
|
||||
- Known failures:
|
||||
- Residual risks:
|
||||
- User decisions needed:
|
||||
- Go/no-go recommendation for Sprint 3:
|
||||
- Next action:
|
||||
@@ -0,0 +1,303 @@
|
||||
# Sprint 3 Contract: Domain Records, Metadata, And Warning Model
|
||||
|
||||
Status: Completed
|
||||
Last updated: 2026-05-07
|
||||
|
||||
## Objective
|
||||
|
||||
Define project-owned domain records, warning records, and metadata JSON construction before binding the system to MinerU output.
|
||||
|
||||
Sprint 3 must establish:
|
||||
|
||||
- Internal records for documents, pages, blocks, assets, warnings, and conversion outputs.
|
||||
- Stable warning code and severity definitions aligned with `ARCHITECTURE.md`.
|
||||
- A metadata builder that produces the required v1 top-level and summary fields.
|
||||
- Warning aggregation behavior that later report generation can consume.
|
||||
- Fast unit tests that do not require MinerU, model files, GPU, sample PDFs, or network.
|
||||
|
||||
Sprint 3 is schema and metadata modeling only. It must not run MinerU, parse PDFs, normalize Markdown, generate final report Markdown content, expose a working `convert` command, or add remote/runtime engine behavior.
|
||||
|
||||
## Current Precondition
|
||||
|
||||
Sprint 2 is complete:
|
||||
|
||||
- `src/pdf2md/paths.py` owns input discovery and output path planning.
|
||||
- `tests/test_paths.py` verifies directory recursion, non-ASCII filenames, overwrite conflict detection, duplicate planned outputs, and output-root escape prevention.
|
||||
- `uv run pytest` passed 21 tests.
|
||||
|
||||
Sprint 3 may use path planning records as context, but it should not depend on actual conversion output.
|
||||
|
||||
## Touched Surfaces
|
||||
|
||||
Allowed:
|
||||
|
||||
- `src/pdf2md/ir.py`
|
||||
- `src/pdf2md/metadata.py`
|
||||
- `src/pdf2md/report.py` only for a minimal type boundary if metadata/report handoff cannot be expressed cleanly without it
|
||||
- `src/pdf2md/__init__.py` only if exporting a minimal stable type is necessary and tested
|
||||
- `tests/test_ir.py` or `tests/unit/test_ir.py`
|
||||
- `tests/test_metadata.py` or `tests/unit/test_metadata.py`
|
||||
- `PLAN.md` only for current-goal coordination updates required by the shared agent workflow
|
||||
- `PROGRESS.md`
|
||||
- `docs/V1IMPLEMENTATIONPLAN.md` only if sequencing or constraints need adjustment
|
||||
- `docs/Sprints/SPRINT3CONTRACT.md`
|
||||
|
||||
Not allowed:
|
||||
|
||||
- `src/pdf2md/mineru_adapter.py`
|
||||
- `src/pdf2md/markdown.py`
|
||||
- `src/pdf2md/quality.py`
|
||||
- `src/pdf2md/doctor.py`
|
||||
- `scripts/`
|
||||
- Any real MinerU invocation
|
||||
- Any model download or install script
|
||||
- Any PDF content parsing
|
||||
- Any Markdown normalization behavior
|
||||
- Any `.report.md` content generation beyond a minimal handoff type if absolutely needed
|
||||
- Any working `pdf2md convert` or `pdf2md doctor` behavior
|
||||
- Any committed file under `samples/`
|
||||
|
||||
## Expected Outputs
|
||||
|
||||
Sprint 3 should produce:
|
||||
|
||||
1. Domain records
|
||||
- `DocumentRecord` or equivalent project-owned record.
|
||||
- `PageRecord` or equivalent with page index and optional page dimensions.
|
||||
- `BlockRecord` or equivalent with block type, optional page index, optional bbox, optional confidence, and optional Markdown character span.
|
||||
- `AssetRecord` or equivalent with stable relative path and optional source page/provenance.
|
||||
- `WarningRecord` or equivalent with code, severity, message, optional page index, and optional bbox.
|
||||
- `ConversionOutputRecord` or equivalent only if useful for connecting metadata to later orchestration; it must not invoke conversion.
|
||||
|
||||
2. Stable enums or constants
|
||||
- Block types aligned with `ARCHITECTURE.md`: `heading`, `paragraph`, `inline_formula`, `display_formula`, `table`, `figure`, `caption`, `footnote`, `reference`, and `unknown`.
|
||||
- Warning codes aligned with `ARCHITECTURE.md`, including at least:
|
||||
- `ENGINE_MISSING`
|
||||
- `GPU_UNAVAILABLE`
|
||||
- `LOW_CONFIDENCE_FORMULA`
|
||||
- `MATH_RENDER_FAILED`
|
||||
- `ASSET_LINK_MISSING`
|
||||
- `READING_ORDER_UNCERTAIN`
|
||||
- `STRICT_LOCAL_VIOLATION`
|
||||
- `MINERU_CLI_FAILED`
|
||||
- Warning severity values sufficient for v1 metadata and report summaries, such as `info`, `warning`, and `error`.
|
||||
|
||||
3. Metadata builder
|
||||
- Build a JSON-serializable metadata object with required top-level fields:
|
||||
- `source_pdf`
|
||||
- `source_sha256`
|
||||
- `created_at`
|
||||
- `engine`
|
||||
- `engine_version`
|
||||
- `engine_options`
|
||||
- `pages`
|
||||
- `assets`
|
||||
- `warnings`
|
||||
- `summary`
|
||||
- Build required summary fields:
|
||||
- `pages_processed`
|
||||
- `warning_count`
|
||||
- `asset_count`
|
||||
- `display_formula_count`
|
||||
- `inline_formula_count`
|
||||
- `math_render_error_count`
|
||||
- Preserve optional fields such as bbox and confidence only when present.
|
||||
- Require `source_sha256` as an input value. Sprint 3 should not compute hashes by reading PDFs unless the contract is explicitly amended.
|
||||
- Produce only plain Python data structures that `json.dumps` can serialize without custom encoders.
|
||||
|
||||
4. Warning aggregation
|
||||
- Count warnings.
|
||||
- Count math render failures from `MATH_RENDER_FAILED`.
|
||||
- Preserve warning order unless there is a tested reason to sort.
|
||||
- Preserve page-level warning data when available.
|
||||
|
||||
5. Tests
|
||||
- Unit tests for domain record serialization.
|
||||
- Unit tests for metadata schema creation with all required top-level fields.
|
||||
- Unit tests for summary counts.
|
||||
- Unit tests for warning aggregation.
|
||||
- Unit tests that optional bbox and confidence fields are preserved only when present.
|
||||
- Unit tests that metadata is JSON serializable.
|
||||
- Unit tests that metadata requires source PDF, source SHA-256, engine, engine version, and page records.
|
||||
|
||||
6. Handoff
|
||||
- `PROGRESS.md` records changed files, commands run, tests passed or blocked, known failures, residual risks, and next action.
|
||||
|
||||
## Non-Goals
|
||||
|
||||
- Do not implement PDF conversion.
|
||||
- Do not implement conversion orchestration.
|
||||
- Do not implement the MinerU adapter.
|
||||
- Do not run MinerU.
|
||||
- Do not install MinerU 3.1.0.
|
||||
- Do not download MinerU models.
|
||||
- Do not parse PDF contents.
|
||||
- Do not compute source SHA-256 by reading files unless this contract is explicitly amended.
|
||||
- Do not implement Markdown normalization.
|
||||
- Do not implement asset link checking.
|
||||
- Do not implement math renderability checking.
|
||||
- Do not implement full `.report.md` content generation.
|
||||
- Do not implement `pdf2md convert` as a working command.
|
||||
- Do not implement `pdf2md doctor`.
|
||||
- Do not add runtime engine selection.
|
||||
- Do not add alternate conversion engines.
|
||||
- Do not add cloud, remote API, router, HTTP client backend, or remote OpenAI-compatible backend support.
|
||||
|
||||
## Work Packages
|
||||
|
||||
### WP3.1: Domain Record Types
|
||||
|
||||
Owner:
|
||||
|
||||
- `metadata-agent`
|
||||
- `feature-generator-agent`
|
||||
|
||||
Actions:
|
||||
|
||||
- Define small project-owned records for document/page/block/asset/warning concepts.
|
||||
- Use simple, typed Python structures that are easy to serialize and test.
|
||||
- Keep MinerU-specific raw objects out of public and required fields.
|
||||
|
||||
Output:
|
||||
|
||||
- `ir.py` contains the minimal domain model needed by metadata construction.
|
||||
|
||||
### WP3.2: Warning Codes And Severities
|
||||
|
||||
Owner:
|
||||
|
||||
- `metadata-agent`
|
||||
- `feature-generator-agent`
|
||||
|
||||
Actions:
|
||||
|
||||
- Define stable warning codes from `ARCHITECTURE.md`.
|
||||
- Define severity values and validate warning records against them.
|
||||
- Avoid inventing speculative warning categories beyond the known v1 set unless needed by tests.
|
||||
|
||||
Output:
|
||||
|
||||
- Warnings are structured, countable, and stable across later sprints.
|
||||
|
||||
### WP3.3: Metadata Builder
|
||||
|
||||
Owner:
|
||||
|
||||
- `metadata-agent`
|
||||
- `feature-generator-agent`
|
||||
|
||||
Actions:
|
||||
|
||||
- Build required metadata JSON data from project-owned records.
|
||||
- Preserve optional provenance fields only when present.
|
||||
- Require source PDF path, source SHA-256, engine, engine version, pages, assets, warnings, and engine options as explicit inputs.
|
||||
|
||||
Output:
|
||||
|
||||
- `metadata.py` produces the required v1 metadata object without MinerU execution.
|
||||
|
||||
### WP3.4: Metadata And Warning Tests
|
||||
|
||||
Owner:
|
||||
|
||||
- `feature-generator-agent`
|
||||
- `evaluation-agent`
|
||||
|
||||
Actions:
|
||||
|
||||
- Add focused unit tests for schema, counts, optional fields, JSON serialization, and validation failures.
|
||||
- Use in-memory records and temporary paths only.
|
||||
|
||||
Output:
|
||||
|
||||
- `uv run pytest` verifies metadata behavior without external dependencies.
|
||||
|
||||
### WP3.5: Independent Evaluation
|
||||
|
||||
Owner:
|
||||
|
||||
- `evaluation-agent`
|
||||
|
||||
Actions:
|
||||
|
||||
- Review the completed records and metadata builder against this contract.
|
||||
- Verify no conversion behavior, MinerU execution, remote runtime path, alternate engine, Markdown normalization, quality checks, or report content generation was added.
|
||||
- Verify `samples/` remains untracked and unstaged.
|
||||
|
||||
Output:
|
||||
|
||||
- PASS/FAIL notes with any missing acceptance criteria.
|
||||
|
||||
## Verification Checks
|
||||
|
||||
Required:
|
||||
|
||||
- `git status --short` before staging confirms `samples/` remains untracked.
|
||||
- `uv --version` is run and the result is recorded.
|
||||
- `uv sync` passes.
|
||||
- `uv run pytest` passes.
|
||||
- Targeted IR/metadata tests pass.
|
||||
- Metadata output is JSON serializable through `json.dumps`.
|
||||
- Tests do not require MinerU, CUDA, GPU, model files, `samples/`, or network.
|
||||
- No real MinerU dependency is required for default tests.
|
||||
- No model downloads occur.
|
||||
- No network calls are required.
|
||||
- No candidate engine comparison is reintroduced.
|
||||
- No conversion behavior is implemented.
|
||||
- No Markdown normalization behavior is implemented.
|
||||
- No full `.report.md` content generation is implemented.
|
||||
- `git diff --check` passes.
|
||||
|
||||
Recommended:
|
||||
|
||||
- Keep dataclass or enum APIs small and explicit.
|
||||
- Prefer one serialization function per record over ad hoc dict mutation in tests.
|
||||
- Include tests that fail if a required metadata top-level field is omitted.
|
||||
- Use `requirements-guard-agent` if metadata requirements conflict between `PRD.md` and `ARCHITECTURE.md`.
|
||||
|
||||
## Hard Failure Criteria
|
||||
|
||||
Sprint 3 fails and must stop for a user decision if any of these are true:
|
||||
|
||||
- Metadata omits source PDF, source SHA-256, engine, engine version, pages, warnings, assets, or summary.
|
||||
- Summary omits pages processed, warning count, asset count, display formula count, inline formula count, or math render error count.
|
||||
- Public or required metadata fields require raw MinerU objects.
|
||||
- Optional bbox, confidence, or page provenance is dropped when provided.
|
||||
- Optional bbox, confidence, or page provenance is invented when absent.
|
||||
- Default tests require MinerU, CUDA, GPU, model files, network, or `samples/`.
|
||||
- The implementation parses PDF contents, invokes conversion behavior, normalizes Markdown, or generates full report Markdown content.
|
||||
- The implementation introduces alternate engines or runtime engine selection.
|
||||
- The implementation introduces `--api-url`, remote APIs, router mode, HTTP client backends, or remote OpenAI-compatible backends.
|
||||
- `samples/` is staged or committed.
|
||||
|
||||
## Acceptance Criteria
|
||||
|
||||
Sprint 3 is complete when:
|
||||
|
||||
- `src/pdf2md/ir.py` exists and owns project domain records.
|
||||
- `src/pdf2md/metadata.py` exists and builds required metadata JSON data from project-owned records.
|
||||
- Stable block types and warning codes are defined and tested.
|
||||
- Metadata top-level fields and summary fields are tested.
|
||||
- Warning aggregation is tested.
|
||||
- Optional bbox and confidence preservation is tested.
|
||||
- Metadata JSON serializability is tested.
|
||||
- No conversion, MinerU, Markdown normalization, quality check, full report generation, or doctor behavior is implemented.
|
||||
- `uv sync` passes.
|
||||
- `uv run pytest` passes.
|
||||
- `PROGRESS.md` records checks performed and residual risks.
|
||||
- Independent evaluation is complete.
|
||||
- The completed change is committed.
|
||||
|
||||
## Handoff Fields
|
||||
|
||||
Use these fields when Sprint 3 completes:
|
||||
|
||||
- Files changed:
|
||||
- Commands run:
|
||||
- Tests passed:
|
||||
- Tests blocked:
|
||||
- Known failures:
|
||||
- Residual risks:
|
||||
- User decisions needed:
|
||||
- Go/no-go recommendation for Sprint 4:
|
||||
- Next action:
|
||||
@@ -0,0 +1,316 @@
|
||||
# Sprint 4 Contract: MinerU Adapter With Mocked Contract
|
||||
|
||||
Status: Implemented
|
||||
Last updated: 2026-05-07
|
||||
|
||||
## Objective
|
||||
|
||||
Build the direct local MinerU 3.1.0 adapter boundary using mocked subprocess results and fake output directories first.
|
||||
|
||||
Sprint 4 must establish:
|
||||
|
||||
- A project-owned adapter module that is the only boundary for MinerU CLI interaction.
|
||||
- Deterministic command construction for direct local MinerU CLI execution.
|
||||
- Strict-local validation that rejects prohibited remote/API/router/backend options.
|
||||
- Subprocess execution wrapping that captures stdout, stderr, exit code, command, and generated paths.
|
||||
- Optional-file parsing for mocked MinerU output directories.
|
||||
- Clear adapter result and warning records for missing MinerU, failed CLI execution, missing output, and strict-local violations.
|
||||
- Fast unit tests that do not require real MinerU, model files, GPU, sample PDFs, or network.
|
||||
|
||||
Sprint 4 is an adapter contract sprint. It must not connect the adapter to real conversion orchestration, Markdown normalization, metadata writing, report generation, or a working `pdf2md convert` command.
|
||||
|
||||
## Current Precondition
|
||||
|
||||
Sprint 3 is complete:
|
||||
|
||||
- `src/pdf2md/paths.py` owns input discovery and output path planning.
|
||||
- `src/pdf2md/ir.py` owns project records, block types, warning codes, and warning severities.
|
||||
- `src/pdf2md/metadata.py` builds JSON-serializable metadata and summary counts from project-owned records.
|
||||
- `uv run pytest` passed 46 tests.
|
||||
|
||||
Sprint 4 may import project-owned warning records from `ir.py`, but it must not require raw MinerU objects as public or required return fields.
|
||||
|
||||
## Touched Surfaces
|
||||
|
||||
Allowed:
|
||||
|
||||
- `src/pdf2md/mineru_adapter.py`
|
||||
- `src/pdf2md/doctor.py` only for minimal adapter availability/version check types if needed; do not implement full `pdf2md doctor`
|
||||
- `src/pdf2md/ir.py` only for narrowly required warning/result fields discovered while implementing the adapter contract
|
||||
- `tests/test_mineru_adapter.py` or `tests/unit/test_mineru_adapter.py`
|
||||
- `tests/test_doctor.py` only if `doctor.py` is touched for adapter availability/version checks
|
||||
- `README.md` only if a small note is needed to clarify mocked adapter tests versus real MinerU setup
|
||||
- `PLAN.md` only for current-goal coordination updates required by the shared agent workflow
|
||||
- `PROGRESS.md`
|
||||
- `docs/V1IMPLEMENTATIONPLAN.md` only if sequencing or constraints need adjustment
|
||||
- `docs/Sprints/SPRINT4CONTRACT.md`
|
||||
|
||||
Not allowed:
|
||||
|
||||
- `src/pdf2md/conversion.py`
|
||||
- `src/pdf2md/markdown.py`
|
||||
- `src/pdf2md/quality.py`
|
||||
- `src/pdf2md/report.py`
|
||||
- Working `pdf2md convert` behavior
|
||||
- Full `pdf2md doctor` behavior
|
||||
- `scripts/`
|
||||
- Any real MinerU invocation in default tests
|
||||
- Any MinerU/model installation or download script
|
||||
- Any PDF content parsing
|
||||
- Any Markdown normalization behavior
|
||||
- Any metadata JSON file writing
|
||||
- Any `.report.md` content generation
|
||||
- Any runtime engine selection or alternate engine support
|
||||
- Any committed file under `samples/`
|
||||
|
||||
## Expected Outputs
|
||||
|
||||
Sprint 4 should produce:
|
||||
|
||||
1. Adapter records and options
|
||||
- A small adapter options record for v1-local MinerU execution.
|
||||
- A result record containing at least:
|
||||
- command arguments
|
||||
- input PDF path
|
||||
- work/output directory path
|
||||
- raw Markdown when found
|
||||
- raw structured data when found
|
||||
- asset paths when found
|
||||
- warnings
|
||||
- engine name fixed to MinerU
|
||||
- engine version when known
|
||||
- engine options
|
||||
- exit code
|
||||
- stdout
|
||||
- stderr
|
||||
- No public or required field should expose raw MinerU-specific Python objects.
|
||||
|
||||
2. Availability and version checks
|
||||
- Check whether `mineru` is available using a mockable local mechanism such as `shutil.which`.
|
||||
- Check MinerU version using a mockable subprocess call.
|
||||
- Missing MinerU should produce a clear `ENGINE_MISSING` warning or project-owned adapter error.
|
||||
- Version command failure should be explicit and testable.
|
||||
|
||||
3. Direct local command construction
|
||||
- Baseline conversion command shape:
|
||||
|
||||
```text
|
||||
mineru -p <input-pdf> -o <work-dir>
|
||||
```
|
||||
|
||||
- The command must not include `--api-url`.
|
||||
- The command must not include router mode, HTTP client backend flags, remote API URLs, remote OpenAI-compatible backend settings, or runtime engine selection.
|
||||
- GPU/device options may be represented only if they are strict-local and do not introduce remote/backend choices. If the exact MinerU 3.1.0 flag is uncertain, store the requested option without passing a speculative CLI flag until a later source-verified sprint.
|
||||
|
||||
4. Strict-local validation
|
||||
- Reject prohibited CLI args and options before subprocess execution.
|
||||
- Reject any value that looks like a remote URL in user-controlled adapter options.
|
||||
- Allow only direct local `mineru` CLI execution.
|
||||
- Allow the CLI-internal temporary local `mineru-api` process that MinerU 3.1.0 may start when the CLI runs without `--api-url`.
|
||||
- Do not implement or call an HTTP client backend.
|
||||
|
||||
5. Subprocess wrapper
|
||||
- Use dependency injection or a small runner boundary so tests can fake subprocess behavior.
|
||||
- Capture stdout, stderr, and exit code.
|
||||
- Convert non-zero exit into an adapter result or project-owned error with a `MINERU_CLI_FAILED` warning.
|
||||
- Do not silently fall back to another engine.
|
||||
|
||||
6. Mocked output parsing
|
||||
- Parse fake output directories using optional-file behavior.
|
||||
- Raw Markdown is optional and should be read only from mocked local files created by tests.
|
||||
- Raw structured output is optional and may be represented as a JSON-serializable object or raw text, depending on the fake file extension used in tests.
|
||||
- Assets are optional and should be collected as local relative or absolute paths according to the adapter result design.
|
||||
- Missing expected output should produce a clear warning or failed result instead of being silently ignored.
|
||||
- The adapter must not assume the real MinerU output layout is fully known until a later local probe confirms it.
|
||||
|
||||
7. Tests
|
||||
- Unit tests for availability check success and missing MinerU.
|
||||
- Unit tests for version check success and version command failure.
|
||||
- Unit tests for command construction.
|
||||
- Unit tests that prohibited `--api-url`, remote URLs, router mode, HTTP backend, and OpenAI-compatible backend options are rejected.
|
||||
- Unit tests for mocked successful MinerU output.
|
||||
- Unit tests for mocked non-zero exit.
|
||||
- Unit tests for mocked missing output.
|
||||
- Unit tests proving no real MinerU binary, model files, GPU, `samples/`, or network are required by default.
|
||||
|
||||
8. Handoff
|
||||
- `PROGRESS.md` records changed files, commands run, tests passed or blocked, known failures, residual risks, and next action.
|
||||
|
||||
## Non-Goals
|
||||
|
||||
- Do not implement conversion orchestration.
|
||||
- Do not implement `convert_pdf`.
|
||||
- Do not implement `pdf2md convert`.
|
||||
- Do not implement full `pdf2md doctor`.
|
||||
- Do not install MinerU 3.1.0.
|
||||
- Do not download MinerU models.
|
||||
- Do not run real MinerU in default tests.
|
||||
- Do not parse real PDFs.
|
||||
- Do not normalize Markdown.
|
||||
- Do not write final Markdown, metadata JSON, assets, or report files as product behavior.
|
||||
- Do not compute source SHA-256.
|
||||
- Do not implement asset link checking.
|
||||
- Do not implement math renderability checking.
|
||||
- Do not implement alternate engines or runtime engine selection.
|
||||
- Do not add cloud, remote API, router, HTTP client backend, or remote OpenAI-compatible backend support.
|
||||
|
||||
## Work Packages
|
||||
|
||||
### WP4.1: Adapter Types And Strict-Local Options
|
||||
|
||||
Owner:
|
||||
|
||||
- `mineru-integration-agent`
|
||||
- `feature-generator-agent`
|
||||
|
||||
Actions:
|
||||
|
||||
- Define minimal adapter options and result records.
|
||||
- Encode strict-local defaults.
|
||||
- Reject prohibited remote/API/backend options before command execution.
|
||||
|
||||
Output:
|
||||
|
||||
- The adapter boundary can be tested without invoking MinerU.
|
||||
|
||||
### WP4.2: Availability, Version, And Command Builder
|
||||
|
||||
Owner:
|
||||
|
||||
- `mineru-integration-agent`
|
||||
- `feature-generator-agent`
|
||||
|
||||
Actions:
|
||||
|
||||
- Implement mockable `is_available` and `version` checks.
|
||||
- Build the direct local command shape `mineru -p <input-pdf> -o <work-dir>`.
|
||||
- Keep command construction deterministic and easy to inspect in tests.
|
||||
|
||||
Output:
|
||||
|
||||
- Later orchestration can call the adapter without knowing MinerU CLI details.
|
||||
|
||||
### WP4.3: Subprocess Runner Boundary
|
||||
|
||||
Owner:
|
||||
|
||||
- `feature-generator-agent`
|
||||
|
||||
Actions:
|
||||
|
||||
- Add a small runner interface or callable boundary for subprocess execution.
|
||||
- Capture command, stdout, stderr, exit code, and return values.
|
||||
- Map non-zero exits to structured adapter warnings/errors.
|
||||
|
||||
Output:
|
||||
|
||||
- Default tests can fake all MinerU behavior.
|
||||
|
||||
### WP4.4: Mocked Output Parser
|
||||
|
||||
Owner:
|
||||
|
||||
- `mineru-integration-agent`
|
||||
- `feature-generator-agent`
|
||||
|
||||
Actions:
|
||||
|
||||
- Parse fake Markdown, JSON/structured, asset, and diagnostic outputs from test-created directories.
|
||||
- Treat all raw MinerU output files as optional until real local output is probed.
|
||||
- Emit clear warnings for missing usable output.
|
||||
|
||||
Output:
|
||||
|
||||
- Adapter result objects can carry raw output into later IR/normalization sprints without binding to a guessed full MinerU layout.
|
||||
|
||||
### WP4.5: Independent Evaluation
|
||||
|
||||
Owner:
|
||||
|
||||
- `evaluation-agent`
|
||||
|
||||
Actions:
|
||||
|
||||
- Review the completed adapter against this contract.
|
||||
- Verify no conversion orchestration, real MinerU dependency in default tests, remote runtime path, alternate engine, Markdown normalization, metadata writing, report generation, or working CLI command was added.
|
||||
- Verify `samples/` remains untracked and unstaged.
|
||||
|
||||
Output:
|
||||
|
||||
- PASS/FAIL notes with any missing acceptance criteria.
|
||||
|
||||
## Verification Checks
|
||||
|
||||
Required:
|
||||
|
||||
- `git status --short` before staging confirms `samples/` remains untracked.
|
||||
- `uv --version` is run and result is recorded.
|
||||
- `uv sync` passes.
|
||||
- `uv run pytest` passes.
|
||||
- Targeted MinerU adapter tests pass.
|
||||
- Tests do not require real MinerU, CUDA, GPU, model files, `samples/`, or network.
|
||||
- No model downloads occur.
|
||||
- No network calls are required.
|
||||
- No candidate engine comparison is reintroduced.
|
||||
- No conversion orchestration is implemented.
|
||||
- No Markdown normalization behavior is implemented.
|
||||
- No metadata JSON writing or full report generation is implemented.
|
||||
- No working `pdf2md convert` or full `pdf2md doctor` behavior is implemented.
|
||||
- Strict-local rejection tests cover `--api-url`, remote URL values, router mode, HTTP backend, and remote OpenAI-compatible backend options.
|
||||
- `git diff --check` passes.
|
||||
|
||||
Recommended:
|
||||
|
||||
- Use fake runner classes or functions rather than monkeypatching global subprocess calls everywhere.
|
||||
- Keep adapter result data JSON-friendly where practical, but do not force metadata schema generation in Sprint 4.
|
||||
- Include a test that no prohibited remote/API flag appears in the successful command args.
|
||||
- Use `requirements-guard-agent` if command flags or strict-local wording conflict across documents.
|
||||
|
||||
## Hard Failure Criteria
|
||||
|
||||
Sprint 4 fails and must stop for a user decision if any of these are true:
|
||||
|
||||
- The adapter passes `--api-url`.
|
||||
- The adapter uses router mode.
|
||||
- The adapter uses an HTTP client backend.
|
||||
- The adapter accepts a remote API URL or remote OpenAI-compatible backend for runtime conversion.
|
||||
- The adapter falls back to another engine after MinerU failure.
|
||||
- Default tests require real MinerU, CUDA, GPU, model files, network, or `samples/`.
|
||||
- The implementation installs MinerU or downloads models.
|
||||
- The implementation connects the adapter to a working conversion CLI/API.
|
||||
- The implementation adds Markdown normalization, metadata file writing, full report generation, or quality checks.
|
||||
- The implementation assumes the real MinerU output layout is fully known before a later local probe has confirmed it.
|
||||
- `samples/` is staged or committed.
|
||||
|
||||
## Acceptance Criteria
|
||||
|
||||
Sprint 4 is complete when:
|
||||
|
||||
- `src/pdf2md/mineru_adapter.py` exists and owns direct local MinerU CLI adapter behavior.
|
||||
- Availability and version checks are mock-tested.
|
||||
- Conversion command construction is mock-tested and uses `mineru -p <input-pdf> -o <work-dir>`.
|
||||
- Strict-local validation rejects prohibited remote/API/router/backend options.
|
||||
- Mocked successful MinerU output produces an adapter result with raw Markdown, raw structured data when available, assets when available, engine info, command args, stdout, stderr, and exit code.
|
||||
- Mocked non-zero exit produces a clear failure result or project-owned error with a `MINERU_CLI_FAILED` warning.
|
||||
- Mocked missing MinerU produces a clear `ENGINE_MISSING` warning or project-owned adapter error.
|
||||
- Default tests do not require MinerU, GPU, model files, network, or `samples/`.
|
||||
- No conversion orchestration, Markdown normalization, metadata file writing, full report generation, or working CLI behavior is implemented.
|
||||
- `uv sync` passes.
|
||||
- `uv run pytest` passes.
|
||||
- `PROGRESS.md` records checks performed and residual risks.
|
||||
- Independent evaluation is complete.
|
||||
- The completed change is committed.
|
||||
|
||||
## Handoff Fields
|
||||
|
||||
Use these fields when Sprint 4 completes:
|
||||
|
||||
- Files changed:
|
||||
- Commands run:
|
||||
- Tests passed:
|
||||
- Tests blocked:
|
||||
- Known failures:
|
||||
- Residual risks:
|
||||
- User decisions needed:
|
||||
- Go/no-go recommendation for Sprint 5:
|
||||
- Next action:
|
||||
@@ -0,0 +1,311 @@
|
||||
# Sprint 5 Contract: Obsidian Markdown Normalization And Asset Links
|
||||
|
||||
Status: Implemented
|
||||
Last updated: 2026-05-07
|
||||
|
||||
## Objective
|
||||
|
||||
Build the project-owned Markdown normalization boundary for Obsidian output, using deterministic unit tests before it is connected to conversion orchestration.
|
||||
|
||||
Sprint 5 must establish:
|
||||
|
||||
- A small Markdown normalization module that accepts local raw Markdown-like text and returns normalized Markdown plus project-owned warnings.
|
||||
- Obsidian math delimiter normalization for inline and display math.
|
||||
- Stable relative asset link normalization without copying files or writing final outputs.
|
||||
- Limited local asset link validation where useful for warnings.
|
||||
- Table preservation and clear warning behavior when a table cannot be safely simplified.
|
||||
- Fast unit tests that do not require real MinerU, model files, GPU, sample PDFs, network, or Obsidian itself.
|
||||
|
||||
Sprint 5 is a normalization contract sprint. It must not connect normalization to the CLI, conversion orchestration, metadata writing, report generation, real MinerU execution, or end-to-end output writing.
|
||||
|
||||
## Current Precondition
|
||||
|
||||
Sprint 4 is complete:
|
||||
|
||||
- `src/pdf2md/paths.py` owns input discovery and output path planning.
|
||||
- `src/pdf2md/ir.py` owns project records, block types, warning codes, and warning severities.
|
||||
- `src/pdf2md/metadata.py` builds JSON-serializable metadata and summary counts from project-owned records.
|
||||
- `src/pdf2md/mineru_adapter.py` owns the mocked direct local MinerU CLI adapter boundary.
|
||||
- `uv run pytest` passed 72 tests.
|
||||
|
||||
Sprint 5 may use `WarningRecord`, `WarningCode`, and `WarningSeverity` from `ir.py`, but it must not require raw MinerU-specific Python objects as public or required inputs.
|
||||
|
||||
## Touched Surfaces
|
||||
|
||||
Allowed:
|
||||
|
||||
- `src/pdf2md/markdown.py`
|
||||
- `src/pdf2md/quality.py` only for minimal local asset link check helpers if that boundary is cleaner than placing them in `markdown.py`
|
||||
- `src/pdf2md/ir.py` only for narrowly required warning codes discovered while implementing table or asset fallback warnings
|
||||
- `tests/test_markdown.py` or `tests/unit/test_markdown.py`
|
||||
- `tests/test_quality.py` only if `quality.py` is touched for asset link checks
|
||||
- `README.md` only if a small note is needed to clarify that normalization tests are mocked/local and not full conversion behavior
|
||||
- `PLAN.md` only for current-goal coordination updates required by the shared agent workflow
|
||||
- `PROGRESS.md`
|
||||
- `docs/V1IMPLEMENTATIONPLAN.md` only if sequencing or constraints need adjustment
|
||||
- `docs/Sprints/SPRINT5CONTRACT.md`
|
||||
|
||||
Not allowed:
|
||||
|
||||
- `src/pdf2md/conversion.py`
|
||||
- `src/pdf2md/cli.py`
|
||||
- `src/pdf2md/mineru_adapter.py`
|
||||
- `src/pdf2md/report.py`
|
||||
- Working `pdf2md convert` behavior
|
||||
- Full `pdf2md doctor` behavior
|
||||
- `scripts/`
|
||||
- Any real MinerU invocation in default tests
|
||||
- Any MinerU/model installation or download script
|
||||
- Any PDF content parsing
|
||||
- Any metadata JSON file writing
|
||||
- Any `.report.md` content generation
|
||||
- Any runtime engine selection or alternate engine support
|
||||
- Any remote asset fetch, HTTP client, or cloud/API integration
|
||||
- Any committed file under `samples/`
|
||||
|
||||
## Expected Outputs
|
||||
|
||||
Sprint 5 should produce:
|
||||
|
||||
1. Normalization records and API
|
||||
- A small result record or equivalent project-owned return type containing at least:
|
||||
- normalized Markdown
|
||||
- warnings
|
||||
- asset links discovered or normalized when available
|
||||
- A normalization function with a narrow input surface, such as raw Markdown text plus optional output/assets context.
|
||||
- No public or required field should expose raw MinerU-specific Python objects.
|
||||
- The API should be usable by later orchestration without knowing how MinerU represented the original Markdown.
|
||||
|
||||
2. Inline math delimiter normalization
|
||||
- Normalize safe inline math forms to `$...$`.
|
||||
- Preserve already valid `$...$` inline math.
|
||||
- Preserve the exact LaTeX body inside inline math except delimiter changes.
|
||||
- Do not escape or rewrite underscores, carets, braces, or backslashes inside math.
|
||||
- Do not normalize math delimiters inside fenced code blocks or inline code spans.
|
||||
- Avoid converting ambiguous dollar signs that look like currency or prose punctuation.
|
||||
|
||||
3. Display math delimiter normalization
|
||||
- Normalize safe display math forms to `$$...$$`.
|
||||
- Ensure display math delimiters sit on their own lines.
|
||||
- Keep a blank line before and after each display math block.
|
||||
- Preserve the exact LaTeX body inside display math except delimiter and surrounding whitespace normalization.
|
||||
- Preserve LaTeX environments such as `equation`, `align`, or `gather` rather than rewriting their semantics.
|
||||
- Make normalization idempotent: running the normalizer twice should produce the same Markdown.
|
||||
|
||||
4. Asset link normalization
|
||||
- Normalize local image/asset links to stable relative POSIX-style Markdown paths.
|
||||
- Keep relative links relative; do not turn them into absolute paths.
|
||||
- Reject or warn on absolute asset links that cannot be represented relative to the planned output/assets context.
|
||||
- Reject or warn on links that escape the output/assets directory with `..`.
|
||||
- Do not fetch remote URLs, copy assets, or write files.
|
||||
- Preserve alt text when rewriting Markdown image links.
|
||||
|
||||
5. Table preservation and fallback warnings
|
||||
- Preserve simple Markdown pipe tables without destructive formatting changes.
|
||||
- Preserve HTML tables when Markdown would lose row spans, column spans, nested content, or other complex structure.
|
||||
- Emit a project-owned warning when complex table fallback behavior is detected or when table simplification is intentionally skipped.
|
||||
- Do not attempt broad table reflow or OCR-style table reconstruction in Sprint 5.
|
||||
|
||||
6. Tests
|
||||
- Unit tests for inline math delimiter normalization.
|
||||
- Unit tests for display math delimiter normalization and blank-line spacing.
|
||||
- Unit tests proving underscores and carets inside math are preserved.
|
||||
- Unit tests proving fenced code blocks and inline code are not normalized.
|
||||
- Unit tests for idempotency.
|
||||
- Unit tests for relative asset link normalization.
|
||||
- Unit tests for missing or escaping asset link warnings when asset checking is implemented.
|
||||
- Unit tests for simple table preservation.
|
||||
- Unit tests for complex table fallback warning behavior.
|
||||
- Unit tests proving no real MinerU binary, model files, GPU, `samples/`, Obsidian installation, or network are required by default.
|
||||
|
||||
7. Handoff
|
||||
- `PROGRESS.md` records changed files, commands run, tests passed or blocked, known failures, residual risks, and next action.
|
||||
|
||||
## Non-Goals
|
||||
|
||||
- Do not implement conversion orchestration.
|
||||
- Do not implement `convert_pdf`.
|
||||
- Do not implement `pdf2md convert`.
|
||||
- Do not implement full `pdf2md doctor`.
|
||||
- Do not invoke MinerU.
|
||||
- Do not install MinerU 3.1.0.
|
||||
- Do not download MinerU models.
|
||||
- Do not parse real PDFs.
|
||||
- Do not write final Markdown files as product behavior.
|
||||
- Do not copy or move assets as product behavior.
|
||||
- Do not write metadata JSON.
|
||||
- Do not generate `.report.md`.
|
||||
- Do not compute source SHA-256.
|
||||
- Do not implement math renderability checks beyond a future-facing warning interface if needed.
|
||||
- Do not implement full quality report checks.
|
||||
- Do not implement alternate engines or runtime engine selection.
|
||||
- Do not add cloud, remote API, router, HTTP client backend, remote OpenAI-compatible backend, or remote asset-fetching support.
|
||||
|
||||
## Work Packages
|
||||
|
||||
### WP5.1: Normalization Types And Safe Boundaries
|
||||
|
||||
Owner:
|
||||
|
||||
- `obsidian-markdown-agent`
|
||||
- `feature-generator-agent`
|
||||
|
||||
Actions:
|
||||
|
||||
- Define a small Markdown normalization result type.
|
||||
- Define a focused normalization function.
|
||||
- Keep warnings project-owned through `WarningRecord`.
|
||||
- Keep the API independent of raw MinerU objects.
|
||||
|
||||
Output:
|
||||
|
||||
- Later orchestration can normalize adapter Markdown without knowing MinerU internals.
|
||||
|
||||
### WP5.2: Math Delimiter Normalization
|
||||
|
||||
Owner:
|
||||
|
||||
- `obsidian-markdown-agent`
|
||||
- `feature-generator-agent`
|
||||
|
||||
Actions:
|
||||
|
||||
- Normalize safe inline math delimiters to `$...$`.
|
||||
- Normalize safe display math delimiters to `$$...$$` with stable surrounding blank lines.
|
||||
- Preserve LaTeX bodies exactly.
|
||||
- Protect code fences and inline code spans.
|
||||
- Add idempotency tests.
|
||||
|
||||
Output:
|
||||
|
||||
- Obsidian-friendly math delimiter behavior is deterministic and covered by unit tests.
|
||||
|
||||
### WP5.3: Asset Link Normalization
|
||||
|
||||
Owner:
|
||||
|
||||
- `obsidian-markdown-agent`
|
||||
- `feature-generator-agent`
|
||||
|
||||
Actions:
|
||||
|
||||
- Normalize local image/asset links to stable relative POSIX-style paths.
|
||||
- Preserve alt text.
|
||||
- Warn on missing, absolute, escaping, or non-local asset links when the helper has enough local context to judge them.
|
||||
- Do not fetch or copy assets.
|
||||
|
||||
Output:
|
||||
|
||||
- Later conversion can produce Markdown links that are stable relative to planned output paths.
|
||||
|
||||
### WP5.4: Table Preservation And Fallback Warning
|
||||
|
||||
Owner:
|
||||
|
||||
- `obsidian-markdown-agent`
|
||||
- `feature-generator-agent`
|
||||
|
||||
Actions:
|
||||
|
||||
- Preserve simple Markdown pipe tables.
|
||||
- Preserve complex HTML tables without simplifying them destructively.
|
||||
- Emit a project-owned warning for complex table fallback behavior.
|
||||
|
||||
Output:
|
||||
|
||||
- Table handling is conservative and traceable instead of silently lossy.
|
||||
|
||||
### WP5.5: Independent Evaluation
|
||||
|
||||
Owner:
|
||||
|
||||
- `evaluation-agent`
|
||||
|
||||
Actions:
|
||||
|
||||
- Review the completed normalizer against this contract.
|
||||
- Verify no conversion orchestration, real MinerU dependency in default tests, remote runtime path, alternate engine, metadata writing, report generation, file-copying behavior, or working CLI command was added.
|
||||
- Verify `samples/` remains untracked and unstaged.
|
||||
|
||||
Output:
|
||||
|
||||
- PASS/FAIL notes with any missing acceptance criteria.
|
||||
|
||||
## Verification Checks
|
||||
|
||||
Required:
|
||||
|
||||
- `git status --short` before staging confirms `samples/` remains untracked.
|
||||
- `uv --version` is run and result is recorded.
|
||||
- `uv sync` passes.
|
||||
- `uv run pytest` passes.
|
||||
- Targeted Markdown normalization tests pass.
|
||||
- Tests do not require real MinerU, CUDA, GPU, model files, Obsidian, `samples/`, or network.
|
||||
- No model downloads occur.
|
||||
- No network calls are required.
|
||||
- No candidate engine comparison is reintroduced.
|
||||
- No conversion orchestration is implemented.
|
||||
- No metadata JSON writing or full report generation is implemented.
|
||||
- No working `pdf2md convert` or full `pdf2md doctor` behavior is implemented.
|
||||
- No final output files are written as product behavior.
|
||||
- No remote asset fetching is implemented.
|
||||
- Math delimiter normalization is idempotent.
|
||||
- Asset paths in normalized Markdown are relative when they are rewritten.
|
||||
- `git diff --check` passes.
|
||||
|
||||
Recommended:
|
||||
|
||||
- Prefer a small tokenizer or state-machine approach over broad regular-expression rewrites for math/code boundary handling.
|
||||
- Keep normalization helpers pure and deterministic.
|
||||
- Treat complex tables conservatively: preserve content and warn rather than flattening structure.
|
||||
- Use `requirements-guard-agent` if warning codes or output behavior conflict across documents.
|
||||
|
||||
## Hard Failure Criteria
|
||||
|
||||
Sprint 5 fails and must stop for a user decision if any of these are true:
|
||||
|
||||
- The normalizer rewrites LaTeX math bodies beyond delimiter and whitespace normalization without deterministic tests.
|
||||
- The normalizer changes underscores, carets, braces, or backslashes inside math content.
|
||||
- The normalizer rewrites code fences or inline code spans as math.
|
||||
- The normalizer produces absolute asset links where relative links are required.
|
||||
- The normalizer accepts asset links that escape the output/assets context without warning.
|
||||
- The implementation fetches remote assets or adds any HTTP/network client path.
|
||||
- The implementation connects normalization to a working conversion CLI/API.
|
||||
- The implementation adds metadata file writing, full report generation, real MinerU execution, model downloads, or setup scripts.
|
||||
- Default tests require real MinerU, CUDA, GPU, model files, network, Obsidian, or `samples/`.
|
||||
- `samples/` is staged or committed.
|
||||
|
||||
## Acceptance Criteria
|
||||
|
||||
Sprint 5 is complete when:
|
||||
|
||||
- `src/pdf2md/markdown.py` exists and owns Obsidian Markdown normalization behavior.
|
||||
- Inline math delimiter normalization is unit-tested.
|
||||
- Display math delimiter normalization and blank-line spacing are unit-tested.
|
||||
- Tests prove underscores and carets inside math are preserved.
|
||||
- Tests prove fenced code blocks and inline code are not normalized.
|
||||
- Normalization idempotency is unit-tested.
|
||||
- Relative asset link normalization is unit-tested.
|
||||
- Asset warning behavior is unit-tested when missing, absolute, escaping, or non-local links are in scope.
|
||||
- Simple table preservation and complex table fallback warning behavior are unit-tested.
|
||||
- Default tests do not require MinerU, GPU, model files, network, Obsidian, or `samples/`.
|
||||
- No conversion orchestration, metadata file writing, full report generation, file-copying behavior, or working CLI behavior is implemented.
|
||||
- `uv sync` passes.
|
||||
- `uv run pytest` passes.
|
||||
- `PROGRESS.md` records checks performed and residual risks.
|
||||
- Independent evaluation is complete.
|
||||
- The completed change is committed.
|
||||
|
||||
## Handoff Fields
|
||||
|
||||
Use these fields when Sprint 5 completes:
|
||||
|
||||
- Files changed:
|
||||
- Commands run:
|
||||
- Tests passed:
|
||||
- Tests blocked:
|
||||
- Known failures:
|
||||
- Residual risks:
|
||||
- User decisions needed:
|
||||
- Go/no-go recommendation for Sprint 6:
|
||||
- Next action:
|
||||
@@ -0,0 +1,334 @@
|
||||
# Sprint 6 Contract: Quality Checks And Report Generation
|
||||
|
||||
Status: Implemented
|
||||
Last updated: 2026-05-08
|
||||
|
||||
## Objective
|
||||
|
||||
Build local quality-check and human-readable report generation boundaries from project-owned metadata and normalized Markdown, before they are connected to conversion orchestration.
|
||||
|
||||
Sprint 6 must establish:
|
||||
|
||||
- A project-owned quality module for local asset-link and math-renderability signals.
|
||||
- A report module that renders `<stem>.report.md` content from metadata and quality results.
|
||||
- Deterministic final status calculation: `success`, `partial`, or `failed`.
|
||||
- Summary fields needed by reports, including missing asset links and math render failures.
|
||||
- Fast unit tests that do not require real MinerU, model files, GPU, sample PDFs, Obsidian, LaTeX tooling, network, or a working conversion CLI.
|
||||
|
||||
Sprint 6 is a quality/report contract sprint. It may generate report Markdown content as a string, but it must not be wired into the CLI or conversion orchestration, and it must not add real MinerU execution, output file writing, setup scripts, or end-to-end conversion.
|
||||
|
||||
## Current Precondition
|
||||
|
||||
Sprint 5 is complete:
|
||||
|
||||
- `src/pdf2md/paths.py` owns input discovery and output path planning.
|
||||
- `src/pdf2md/ir.py` owns project records, block types, warning codes, and warning severities.
|
||||
- `src/pdf2md/metadata.py` builds JSON-serializable metadata and summary counts from project-owned records.
|
||||
- `src/pdf2md/mineru_adapter.py` owns the mocked direct local MinerU CLI adapter boundary.
|
||||
- `src/pdf2md/markdown.py` owns Obsidian Markdown normalization, asset link warnings, and table fallback warnings.
|
||||
- `uv run pytest` passed 89 tests.
|
||||
|
||||
Sprint 6 may use metadata dictionaries produced by `build_metadata`, project-owned `WarningRecord` values, and normalized Markdown text. It must not require raw MinerU-specific Python objects as public or required inputs.
|
||||
|
||||
## Touched Surfaces
|
||||
|
||||
Allowed:
|
||||
|
||||
- `src/pdf2md/quality.py`
|
||||
- `src/pdf2md/report.py`
|
||||
- `src/pdf2md/metadata.py` only for narrowly required summary fields or helper functions that keep metadata/report consistency
|
||||
- `src/pdf2md/ir.py` only for narrowly required warning codes discovered while implementing quality checks
|
||||
- `tests/test_quality.py`
|
||||
- `tests/test_report.py`
|
||||
- `tests/test_metadata.py` only if `metadata.py` changes
|
||||
- `README.md` only if a small note is needed to clarify mocked/local quality and report behavior
|
||||
- `PLAN.md` only for current-goal coordination updates required by the shared agent workflow
|
||||
- `PROGRESS.md`
|
||||
- `docs/V1IMPLEMENTATIONPLAN.md` only if sequencing or constraints need adjustment
|
||||
- `docs/Sprints/SPRINT6CONTRACT.md`
|
||||
|
||||
Not allowed:
|
||||
|
||||
- `src/pdf2md/conversion.py`
|
||||
- `src/pdf2md/cli.py`
|
||||
- `src/pdf2md/mineru_adapter.py`
|
||||
- Working `pdf2md convert` behavior
|
||||
- Full `pdf2md doctor` behavior
|
||||
- `scripts/`
|
||||
- Any real MinerU invocation in default tests
|
||||
- Any MinerU/model installation or download script
|
||||
- Any PDF content parsing
|
||||
- Any final Markdown file writing
|
||||
- Any metadata JSON file writing
|
||||
- Any `.report.md` file writing as product behavior
|
||||
- Any asset copying or moving
|
||||
- Any runtime engine selection or alternate engine support
|
||||
- Any remote asset fetch, HTTP client, cloud/API integration, hosted renderer, or remote math-render service
|
||||
- Any committed file under `samples/`
|
||||
|
||||
## Expected Outputs
|
||||
|
||||
Sprint 6 should produce:
|
||||
|
||||
1. Quality result records and API
|
||||
- A small project-owned quality result type containing at least:
|
||||
- missing asset link count
|
||||
- invalid asset link count when available
|
||||
- math render error count
|
||||
- warnings produced by quality checks
|
||||
- A local asset-link check function that accepts normalized Markdown and local asset context without writing files.
|
||||
- A math renderability check interface that accepts a local checker callable or reports tool-unavailable behavior gracefully.
|
||||
- No public or required field should expose raw MinerU-specific Python objects.
|
||||
|
||||
2. Asset-link quality checks
|
||||
- Count missing local asset links in Markdown.
|
||||
- Count invalid links that are absolute, parent-escaping, remote, or otherwise non-local according to project policy.
|
||||
- Produce project-owned warnings for missing or invalid asset links.
|
||||
- Keep all checks local and deterministic.
|
||||
- Do not fetch remote URLs, copy assets, move assets, or write files.
|
||||
|
||||
3. Math renderability checks
|
||||
- Provide a boundary for local math renderability checking.
|
||||
- Default tests must use fake/local checker callables.
|
||||
- Tool-unavailable behavior must be explicit and non-fatal.
|
||||
- Render failures must produce `MATH_RENDER_FAILED` warnings and count toward the math render error count shown in the report.
|
||||
- The checker must not call network services or require a LaTeX/Obsidian install in default tests.
|
||||
|
||||
4. Metadata summary consistency
|
||||
- Preserve existing required metadata summary fields.
|
||||
- Add or derive report-needed counts without breaking existing metadata tests:
|
||||
- missing asset link count
|
||||
- invalid asset link count
|
||||
- math render error count
|
||||
- Warning order and warning counts must remain deterministic.
|
||||
- Reports must be derived from metadata and quality results, not independently duplicated state.
|
||||
|
||||
5. Report Markdown generation
|
||||
- Render a human-readable `<stem>.report.md` content string from metadata and quality results.
|
||||
- Include at least:
|
||||
- source PDF path
|
||||
- output Markdown path when provided
|
||||
- metadata path when provided
|
||||
- report path when provided
|
||||
- MinerU engine/version and execution mode/options
|
||||
- pages processed
|
||||
- warning count
|
||||
- asset count
|
||||
- missing asset link count
|
||||
- inline formula count
|
||||
- display formula count
|
||||
- math render error count
|
||||
- pages with warnings
|
||||
- final status: `success`, `partial`, or `failed`
|
||||
- The report must not invent facts that are absent from metadata; absent optional paths should be omitted or clearly shown as unavailable.
|
||||
- The report generator must not write files in Sprint 6.
|
||||
|
||||
6. Final status policy
|
||||
- `failed`: metadata or quality warnings contain at least one `error` severity warning.
|
||||
- `partial`: no `error`-severity warnings, but other warnings or quality failures exist.
|
||||
- `success`: no warnings and no quality failures.
|
||||
- The status function must be unit-tested and reusable by later orchestration.
|
||||
|
||||
7. Tests
|
||||
- Unit tests for missing asset link counting.
|
||||
- Unit tests for invalid/remote/escaping asset link warnings.
|
||||
- Unit tests for math render failure aggregation with a fake checker.
|
||||
- Unit tests for math checker unavailable behavior.
|
||||
- Unit tests for report content and required sections.
|
||||
- Unit tests proving report content is derived from metadata and quality results.
|
||||
- Unit tests for pages-with-warnings summary.
|
||||
- Unit tests for final status calculation.
|
||||
- Unit tests proving that no real MinerU binary, model files, GPU, `samples/`, Obsidian, LaTeX install, or network access is required by default.
|
||||
|
||||
8. Handoff
|
||||
- `PROGRESS.md` records changed files, commands run, tests passed or blocked, known failures, residual risks, and next action.
|
||||
|
||||
## Non-Goals
|
||||
|
||||
- Do not implement conversion orchestration.
|
||||
- Do not implement `convert_pdf`.
|
||||
- Do not implement `pdf2md convert`.
|
||||
- Do not implement full `pdf2md doctor`.
|
||||
- Do not invoke MinerU.
|
||||
- Do not install MinerU 3.1.0.
|
||||
- Do not download MinerU models.
|
||||
- Do not parse real PDFs.
|
||||
- Do not write final Markdown files.
|
||||
- Do not copy or move assets.
|
||||
- Do not write metadata JSON files.
|
||||
- Do not write `.report.md` files as product behavior.
|
||||
- Do not compute source SHA-256.
|
||||
- Do not implement real LaTeX, KaTeX, MathJax, or Obsidian rendering in default tests.
|
||||
- Do not add setup scripts.
|
||||
- Do not implement full local environment diagnostics.
|
||||
- Do not implement alternate engines or runtime engine selection.
|
||||
- Do not add cloud, remote API, router, HTTP client backend, remote OpenAI-compatible backend, hosted renderer, or remote asset-fetching support.
|
||||
|
||||
## Work Packages
|
||||
|
||||
### WP6.1: Quality Result Types And Asset Checks
|
||||
|
||||
Owner:
|
||||
|
||||
- `metadata-agent`
|
||||
- `feature-generator-agent`
|
||||
|
||||
Actions:
|
||||
|
||||
- Define a small project-owned quality result type.
|
||||
- Add deterministic local asset link checks over normalized Markdown.
|
||||
- Count missing, invalid, escaping, absolute, and remote asset references.
|
||||
- Return project-owned warnings without writing files.
|
||||
|
||||
Output:
|
||||
|
||||
- Later orchestration can add local quality results to metadata/report flow without duplicating asset-link logic.
|
||||
|
||||
### WP6.2: Math Renderability Boundary
|
||||
|
||||
Owner:
|
||||
|
||||
- `obsidian-markdown-agent`
|
||||
- `metadata-agent`
|
||||
- `feature-generator-agent`
|
||||
|
||||
Actions:
|
||||
|
||||
- Define a local math render checker interface.
|
||||
- Support fake checkers in tests.
|
||||
- Treat checker-unavailable behavior as an explicit, non-fatal warning or info-level signal, according to the implementation design.
|
||||
- Treat render failures as `MATH_RENDER_FAILED` warnings and count them.
|
||||
|
||||
Output:
|
||||
|
||||
- Math renderability is represented as a local, testable boundary without external dependencies.
|
||||
|
||||
### WP6.3: Metadata Summary Extensions
|
||||
|
||||
Owner:
|
||||
|
||||
- `metadata-agent`
|
||||
- `feature-generator-agent`
|
||||
|
||||
Actions:
|
||||
|
||||
- Preserve existing required metadata summary fields.
|
||||
- Add or derive counts needed by reports in a backward-compatible way.
|
||||
- Keep metadata JSON serializable and deterministic.
|
||||
|
||||
Output:
|
||||
|
||||
- Metadata remains the source of truth for report counts and warning summaries.
|
||||
|
||||
### WP6.4: Report Markdown Rendering
|
||||
|
||||
Owner:
|
||||
|
||||
- `metadata-agent`
|
||||
- `feature-generator-agent`
|
||||
|
||||
Actions:
|
||||
|
||||
- Implement report content rendering from metadata plus quality results.
|
||||
- Include required report sections and final status.
|
||||
- Generate content only; do not write files.
|
||||
|
||||
Output:
|
||||
|
||||
- Later orchestration can write `<stem>.report.md` by using the tested report renderer.
|
||||
|
||||
### WP6.5: Independent Evaluation
|
||||
|
||||
Owner:
|
||||
|
||||
- `evaluation-agent`
|
||||
|
||||
Actions:
|
||||
|
||||
- Review completed quality/report behavior against this contract.
|
||||
- Verify no conversion orchestration, real MinerU dependency in default tests, remote runtime path, alternate engine, final output writing, CLI behavior, or sample dependency was added.
|
||||
- Verify `samples/` remains untracked and unstaged.
|
||||
|
||||
Output:
|
||||
|
||||
- PASS/FAIL notes with any missing acceptance criteria.
|
||||
|
||||
## Verification Checks
|
||||
|
||||
Required:
|
||||
|
||||
- `git status --short` before staging confirms `samples/` remains untracked and unstaged.
|
||||
- `uv --version` is run and result is recorded.
|
||||
- `uv sync` passes.
|
||||
- `uv run pytest` passes.
|
||||
- Targeted quality/report tests pass.
|
||||
- Tests do not require real MinerU, CUDA, GPU, model files, Obsidian, LaTeX tooling, `samples/`, or network.
|
||||
- No model downloads occur.
|
||||
- No network calls are required.
|
||||
- No candidate engine comparison is reintroduced.
|
||||
- No conversion orchestration is implemented.
|
||||
- No working `pdf2md convert` or full `pdf2md doctor` behavior is implemented.
|
||||
- No final Markdown, metadata JSON, or `.report.md` files are written as product behavior.
|
||||
- No remote asset fetching is implemented.
|
||||
- No real math renderer dependency is required by default tests.
|
||||
- Report counts match metadata and quality results.
|
||||
- Report generation does not re-run MinerU.
|
||||
- `git diff --check` passes.
|
||||
|
||||
Recommended:
|
||||
|
||||
- Keep quality helpers pure and deterministic.
|
||||
- Use fake checkers for math renderability tests.
|
||||
- Keep report rendering stable enough for snapshot-like unit assertions.
|
||||
- Use `requirements-guard-agent` if warning codes, summary fields, or report wording conflict across documents.
|
||||
|
||||
## Hard Failure Criteria
|
||||
|
||||
Sprint 6 fails and must stop for a user decision if any of these are true:
|
||||
|
||||
- Report content diverges from metadata or quality result counts.
|
||||
- Math render failures are silently ignored.
|
||||
- Quality checks require network access.
|
||||
- The implementation fetches remote assets or adds any HTTP/network client path.
|
||||
- The implementation requires a real LaTeX/Obsidian/MathJax/KaTeX install in default tests.
|
||||
- The implementation connects quality/report behavior to a working conversion CLI/API.
|
||||
- The implementation writes final Markdown, metadata JSON, `.report.md`, or copied assets as product behavior.
|
||||
- The implementation invokes MinerU, downloads models, adds setup scripts, or parses real PDFs.
|
||||
- Default tests require real MinerU, CUDA, GPU, model files, network, Obsidian, LaTeX tooling, or `samples/`.
|
||||
- `samples/` is staged or committed.
|
||||
|
||||
## Acceptance Criteria
|
||||
|
||||
Sprint 6 is complete when:
|
||||
|
||||
- `src/pdf2md/quality.py` exists and owns local quality-check behavior.
|
||||
- `src/pdf2md/report.py` exists and owns human-readable report content rendering.
|
||||
- Missing asset link counting is unit-tested.
|
||||
- Invalid, escaping, absolute, or remote asset link warning behavior is unit-tested.
|
||||
- Math render failure aggregation is unit-tested with fake checkers.
|
||||
- Math checker unavailable behavior is unit-tested and non-fatal.
|
||||
- Report content includes the required sections and counts.
|
||||
- Pages-with-warnings summary is unit-tested.
|
||||
- Final status calculation is unit-tested.
|
||||
- Report generation is proven not to write files or re-run MinerU.
|
||||
- Default tests do not require MinerU, GPU, model files, network, Obsidian, LaTeX tooling, or `samples/`.
|
||||
- No conversion orchestration, final output file writing, working CLI behavior, real MinerU execution, or setup script is implemented.
|
||||
- `uv sync` passes.
|
||||
- `uv run pytest` passes.
|
||||
- `PROGRESS.md` records checks performed and residual risks.
|
||||
- Independent evaluation is complete.
|
||||
- The completed change is committed.
|
||||
|
||||
## Handoff Fields
|
||||
|
||||
Use these fields when Sprint 6 completes:
|
||||
|
||||
- Files changed:
|
||||
- Commands run:
|
||||
- Tests passed:
|
||||
- Tests blocked:
|
||||
- Known failures:
|
||||
- Residual risks:
|
||||
- User decisions needed:
|
||||
- Go/no-go recommendation for Sprint 7:
|
||||
- Next action:
|
||||
@@ -0,0 +1,360 @@
|
||||
# Sprint 7 Contract: Conversion Orchestrator, CLI, And Python API
|
||||
|
||||
Status: Implemented
|
||||
Last updated: 2026-05-08
|
||||
|
||||
## Objective
|
||||
|
||||
Connect the existing project-owned boundaries into a working conversion orchestration layer, public Python API, and `pdf2md convert` CLI path.
|
||||
|
||||
Sprint 7 must establish:
|
||||
|
||||
- A public `convert_pdf` API for one local PDF.
|
||||
- A batch conversion API or helper for directory inputs.
|
||||
- A `pdf2md convert INPUT --out OUTPUT_DIR` command.
|
||||
- Product behavior that writes Markdown, optional metadata JSON, and `<stem>.report.md`.
|
||||
- Local asset materialization for adapter-provided asset files.
|
||||
- CLI summaries that surface success, failure, and warning counts.
|
||||
- Fast tests that use fake adapter outputs and do not require real MinerU, model files, GPU, sample PDFs, network, Obsidian, or LaTeX tooling.
|
||||
|
||||
Sprint 7 is an orchestration sprint. It may call the real `MinerUAdapter` in the normal production path, but the default test suite must use injected fake adapters and must not execute MinerU.
|
||||
|
||||
## Current Precondition
|
||||
|
||||
Sprint 6 is complete:
|
||||
|
||||
- `src/pdf2md/paths.py` owns input discovery and output path planning.
|
||||
- `src/pdf2md/ir.py` owns project records, block types, warning codes, and warning severities.
|
||||
- `src/pdf2md/metadata.py` builds JSON-serializable metadata and summary counts from project-owned records.
|
||||
- `src/pdf2md/mineru_adapter.py` owns the mocked direct local MinerU CLI adapter boundary.
|
||||
- `src/pdf2md/markdown.py` owns Obsidian Markdown normalization, asset link warnings, and table fallback warnings.
|
||||
- `src/pdf2md/quality.py` owns local quality checks over normalized Markdown and asset context.
|
||||
- `src/pdf2md/report.py` owns report content rendering and final status calculation.
|
||||
- `uv run pytest` passed 103 tests.
|
||||
|
||||
Sprint 7 may compute source SHA-256, create conversion output directories, copy local adapter-provided asset files, write final Markdown, write metadata JSON when requested, and write `<stem>.report.md`. It must keep public return types project-owned and must not require raw MinerU-specific Python objects from callers.
|
||||
|
||||
## Touched Surfaces
|
||||
|
||||
Allowed:
|
||||
|
||||
- `src/pdf2md/conversion.py`
|
||||
- `src/pdf2md/cli.py`
|
||||
- `src/pdf2md/__init__.py`
|
||||
- `src/pdf2md/paths.py` only for narrowly required path/output helper compatibility
|
||||
- `src/pdf2md/mineru_adapter.py` only for narrowly required adapter protocol or result compatibility
|
||||
- `src/pdf2md/metadata.py` only for narrowly required output-location or summary compatibility
|
||||
- `src/pdf2md/markdown.py` only for narrowly required orchestration compatibility
|
||||
- `src/pdf2md/quality.py` only for narrowly required orchestration compatibility
|
||||
- `src/pdf2md/report.py` only for narrowly required orchestration compatibility
|
||||
- `tests/test_conversion.py`
|
||||
- `tests/test_cli.py`
|
||||
- Existing focused unit tests only if a touched module requires compatibility updates
|
||||
- `README.md` only if a short usage note is needed for `pdf2md convert`
|
||||
- `PLAN.md`
|
||||
- `PROGRESS.md`
|
||||
- `docs/V1IMPLEMENTATIONPLAN.md`
|
||||
- `docs/Sprints/SPRINT7CONTRACT.md`
|
||||
|
||||
Not allowed:
|
||||
|
||||
- `src/pdf2md/doctor.py`
|
||||
- Working `pdf2md doctor` behavior
|
||||
- `scripts/`
|
||||
- MinerU/model installation or download scripts
|
||||
- Real MinerU invocation in default tests
|
||||
- Real GPU/CUDA checks
|
||||
- Real PDF content parsing outside adapter output handling
|
||||
- Runtime engine selection or alternate engine support
|
||||
- Cloud OCR, remote LLM/VLM, hosted renderer, remote document parser, remote asset fetching, HTTP client backend, router mode, `--api-url`, remote APIs, or remote OpenAI-compatible backend support
|
||||
- A CLI flag that disables strict-local policy
|
||||
- Committed files under `samples/`
|
||||
|
||||
## Expected Outputs
|
||||
|
||||
Sprint 7 should produce:
|
||||
|
||||
1. Public conversion records and API
|
||||
- A project-owned conversion result type containing at least:
|
||||
- source PDF path
|
||||
- Markdown output path
|
||||
- metadata JSON path when written
|
||||
- report path
|
||||
- assets directory
|
||||
- raw output directory when kept
|
||||
- engine name and version
|
||||
- final status
|
||||
- warning count
|
||||
- warnings
|
||||
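- `convert_pdf(input_path, output_dir, metadata=True, keep_raw=False, overwrite=False, gpu=None, strict_local=True, adapter=None, clock=None)` or an equivalently small API. The `strict_local` parameter must not provide a supported way to disable strict-local policy in v1.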
|
||||
- The default API path uses the direct local MinerU adapter.
|
||||
- Tests can inject a fake adapter and deterministic clock.
|
||||
- The public return type must not expose raw MinerU-specific Python objects as required fields.
|
||||
|
||||
2. Single-PDF orchestration
|
||||
- Discover and plan the single PDF using existing path helpers.
|
||||
- Create required output directories only after preflight path checks pass.
|
||||
- Run the adapter into a planned temporary or raw work directory.
|
||||
- Stop the individual conversion on adapter hard failure and return explicit warnings/status.
|
||||
- Normalize adapter Markdown into Obsidian-friendly Markdown.
|
||||
- Copy local adapter-provided asset files into the planned assets directory when needed.
|
||||
- Compute source SHA-256 with local file reads.
|
||||
- Build metadata from project-owned records.
|
||||
- Run local quality checks over normalized Markdown and asset context.
|
||||
- Render report Markdown from metadata and quality results.
|
||||
- Write final Markdown, optional metadata JSON, and report Markdown.
|
||||
|
||||
3. Output writing and overwrite behavior
|
||||
- Never write outside the planned output root.
|
||||
- Respect existing-output conflicts unless `overwrite=True`.
|
||||
- Keep writes deterministic and UTF-8 encoded for text outputs.
|
||||
- Preserve `--metadata` behavior: metadata JSON is written when enabled and omitted when disabled.
|
||||
- Always write `<stem>.report.md`.
|
||||
- `--keep-raw` preserves raw MinerU output in the planned raw directory.
|
||||
- Without `--keep-raw`, temporary raw work must be cleaned up when the conversion completes, while preserving enough failure context in metadata/report/warnings.
|
||||
|
||||
4. Asset materialization
|
||||
- Copy only local files returned by the adapter.
|
||||
- Do not fetch remote assets.
|
||||
- Do not follow asset paths that escape the adapter work directory or the planned output root.
|
||||
- Handle missing or invalid adapter asset paths with project-owned warnings.
|
||||
- Normalize final Markdown asset links to stable relative paths.
|
||||
|
||||
5. CLI convert command
|
||||
- `pdf2md convert INPUT --out OUTPUT_DIR`.
|
||||
- Options:
|
||||
- `--metadata`
|
||||
- `--keep-raw`
|
||||
- `--recursive`
|
||||
- `--overwrite`
|
||||
- `--gpu GPU_DEVICE`
|
||||
- `--strict-local`
|
||||
- Strict-local must remain enabled in v1; the CLI must not add a supported way to disable it.
|
||||
- Single PDF conversion returns exit code `0` on success or partial success, and non-zero when a hard error prevents conversion.
|
||||
- Directory conversion handles multiple PDFs deterministically and prints a summary.
|
||||
- Batch conversion should continue to the next file when one PDF fails after planning, then return a non-zero exit code if any PDF failed.
|
||||
- CLI output must include converted count, failed count, and warning count.
|
||||
|
||||
6. Batch conversion
|
||||
- Directory inputs use existing non-recursive discovery by default.
|
||||
- Recursive discovery occurs only with `--recursive`.
|
||||
- Output paths preserve relative subdirectories from the input root.
|
||||
- Duplicate planned outputs and overwrite conflicts fail before conversion starts.
|
||||
- Results are deterministic and ordered by discovery/path planning order.
|
||||
|
||||
7. Failure and warning behavior
|
||||
- MinerU failure must be clear and must not trigger fallback to any other engine.
|
||||
- Strict-local violations must be hard failures.
|
||||
- Per-file failures must include project-owned warnings.
|
||||
- CLI summaries must not suppress warning counts.
|
||||
- Metadata/report content must reflect warnings emitted during adapter, normalization, asset, and quality steps.
|
||||
|
||||
8. Tests
|
||||
- API test for one successful conversion with a fake adapter.
|
||||
- API test for adapter failure with no fallback.
|
||||
- API test for output conflict and overwrite behavior.
|
||||
- API test for metadata disabled behavior.
|
||||
- API test for local asset copying and relative Markdown links.
|
||||
- API test for `keep_raw` behavior.
|
||||
- CLI test for single PDF conversion with a fake adapter.
|
||||
- CLI test for directory conversion with deterministic summary output.
|
||||
- CLI test for recursive behavior.
|
||||
- CLI test for failure summary and non-zero exit code.
|
||||
- Tests proving default checks do not require real MinerU, GPU, models, network, `samples/`, Obsidian, or LaTeX tooling.
|
||||
|
||||
9. Handoff
|
||||
- `PROGRESS.md` records changed files, commands run, tests passed or blocked, known failures, residual risks, and next action.
|
||||
|
||||
## Non-Goals
|
||||
|
||||
- Do not implement `pdf2md doctor`.
|
||||
- Do not implement environment diagnostics.
|
||||
- Do not install MinerU 3.1.0.
|
||||
- Do not download MinerU models.
|
||||
- Do not probe real MinerU output with local sample PDFs.
|
||||
- Do not add setup scripts.
|
||||
- Do not implement runtime engine selection.
|
||||
- Do not add alternate engines.
|
||||
- Do not add cloud, remote API, router, HTTP client backend, remote OpenAI-compatible backend, hosted renderer, or remote asset-fetching support.
|
||||
- Do not add a CLI flag or API option that disables strict-local policy.
|
||||
- Do not require real MinerU, CUDA, GPU, model files, network, Obsidian, LaTeX tooling, or `samples/` in default tests.
|
||||
- Do not implement real math rendering; use the Sprint 6 local checker boundary.
|
||||
- Do not commit generated conversion outputs or sample PDFs.
|
||||
|
||||
## Work Packages
|
||||
|
||||
### WP7.1: Public API And Result Records
|
||||
|
||||
Owner:
|
||||
|
||||
- `feature-generator-agent`
|
||||
- `requirements-guard-agent`
|
||||
|
||||
Actions:
|
||||
|
||||
- Add `conversion.py`.
|
||||
- Define project-owned conversion result records.
|
||||
- Expose `convert_pdf` from the library surface.
|
||||
- Support fake adapter and deterministic clock injection for tests.
|
||||
|
||||
Output:
|
||||
|
||||
- Callers can run one conversion without depending on raw MinerU objects.
|
||||
|
||||
### WP7.2: Single-PDF Orchestration And Output Writing
|
||||
|
||||
Owner:
|
||||
|
||||
- `feature-generator-agent`
|
||||
- `mineru-integration-agent`
|
||||
- `metadata-agent`
|
||||
- `obsidian-markdown-agent`
|
||||
|
||||
Actions:
|
||||
|
||||
- Connect path planning, adapter execution, Markdown normalization, metadata building, quality checks, and report rendering.
|
||||
- Write final Markdown, optional metadata JSON, and report Markdown.
|
||||
- Compute source SHA-256.
|
||||
- Preserve strict-local behavior and no-fallback behavior.
|
||||
|
||||
Output:
|
||||
|
||||
- One PDF can be converted through mocked adapter outputs in tests and through the real adapter in normal use.
|
||||
|
||||
### WP7.3: Asset And Raw Output Handling
|
||||
|
||||
Owner:
|
||||
|
||||
- `feature-generator-agent`
|
||||
- `obsidian-markdown-agent`
|
||||
- `metadata-agent`
|
||||
|
||||
Actions:
|
||||
|
||||
- Copy local adapter-provided assets into the planned assets directory.
|
||||
- Normalize Markdown links relative to the final Markdown file.
|
||||
- Preserve raw output only when requested.
|
||||
- Clean temporary work when raw output is not requested.
|
||||
|
||||
Output:
|
||||
|
||||
- Markdown, assets, metadata, and report paths are stable and local-only.
|
||||
|
||||
### WP7.4: CLI Convert And Batch Summary
|
||||
|
||||
Owner:
|
||||
|
||||
- `feature-generator-agent`
|
||||
- `requirements-guard-agent`
|
||||
|
||||
Actions:
|
||||
|
||||
- Replace the placeholder CLI with `convert` while keeping `--version`.
|
||||
- Add only the agreed v1 options.
|
||||
- Print deterministic summaries with converted, failed, and warning counts.
|
||||
- Return non-zero exit code when hard failures occur.
|
||||
|
||||
Output:
|
||||
|
||||
- Users can run `pdf2md convert` for one PDF or a directory.
|
||||
|
||||
### WP7.5: Independent Evaluation
|
||||
|
||||
Owner:
|
||||
|
||||
- `evaluation-agent`
|
||||
|
||||
Actions:
|
||||
|
||||
- Review completed orchestration behavior against this contract.
|
||||
- Verify no default test executes real MinerU, uses GPU, downloads models, uses network, or requires `samples/`.
|
||||
- Verify no runtime remote/API path or alternate engine is introduced.
|
||||
- Verify `samples/` remains untracked and unstaged.
|
||||
|
||||
Output:
|
||||
|
||||
- PASS/FAIL notes with any missing acceptance criteria.
|
||||
|
||||
## Verification Checks
|
||||
|
||||
Required:
|
||||
|
||||
- `git status --short --untracked-files=all` before staging confirms `samples/` remains untracked and unstaged.
|
||||
- `uv --version` is run and result is recorded.
|
||||
- `uv sync` passes.
|
||||
- `uv run pytest tests/test_conversion.py tests/test_cli.py` passes.
|
||||
- `uv run pytest` passes.
|
||||
- `git diff --check` passes.
|
||||
- Default tests do not require real MinerU, CUDA, GPU, model files, network, Obsidian, LaTeX tooling, or `samples/`.
|
||||
- No model downloads occur.
|
||||
- No network calls are required.
|
||||
- No candidate engine comparison is reintroduced.
|
||||
- No alternate engine or runtime engine selection is added.
|
||||
- No CLI/API option disables strict-local policy.
|
||||
- No `--api-url`, router mode, HTTP client backend, remote API, or remote OpenAI-compatible backend support is added.
|
||||
- Adapter failures produce explicit failed results and no fallback conversion.
|
||||
- Output files are written only after path preflight succeeds.
|
||||
- Existing outputs are protected unless overwrite is enabled.
|
||||
- CLI summaries include warning counts.
|
||||
- Metadata/report paths and counts match the files written.
|
||||
|
||||
Recommended:
|
||||
|
||||
- Keep conversion orchestration small and dependency-injected.
|
||||
- Prefer local temporary directories from the standard library for raw work when `keep_raw` is disabled.
|
||||
- Keep batch conversion a thin loop over single-file conversion.
|
||||
- Keep CLI formatting simple and stable enough for tests.
|
||||
- Use fake adapter records in tests rather than monkeypatching subprocess behavior at the CLI layer.
|
||||
|
||||
## Hard Failure Criteria
|
||||
|
||||
Sprint 7 fails and must stop for a user decision if any of these are true:
|
||||
|
||||
- Public API requires or exposes raw MinerU-specific Python objects as required return fields.
|
||||
- The implementation silently falls back to another engine after MinerU failure.
|
||||
- A CLI/API option disables strict-local policy.
|
||||
- The implementation adds or permits `--api-url`, remote APIs, router mode, HTTP client backends, or remote OpenAI-compatible backends.
|
||||
- Default tests execute real MinerU, require GPU/CUDA, download models, use network, require Obsidian/LaTeX tooling, or require `samples/`.
|
||||
- Output writing can escape the planned output root.
|
||||
- Existing files are overwritten without explicit overwrite intent.
|
||||
- CLI writes final outputs after a preflight hard failure.
|
||||
- CLI summaries suppress warning counts or failed counts.
|
||||
- Metadata/report content omits warnings emitted during adapter, normalization, asset, or quality steps.
|
||||
- `samples/` is staged or committed.
|
||||
|
||||
## Acceptance Criteria
|
||||
|
||||
Sprint 7 is complete when:
|
||||
|
||||
- `src/pdf2md/conversion.py` exists and owns conversion orchestration.
|
||||
- `convert_pdf` is available from the public Python package.
|
||||
- `pdf2md convert INPUT --out OUTPUT_DIR` exists.
|
||||
- Single-PDF conversion is tested with a fake adapter.
|
||||
- Directory and recursive conversion behavior is tested with fake adapters.
|
||||
- Output conflict and overwrite behavior is tested.
|
||||
- Adapter failure produces a clear failed result and no fallback.
|
||||
- Final Markdown, metadata JSON when enabled, and report Markdown are written by product behavior.
|
||||
- Local adapter-provided assets are copied or warned about deterministically.
|
||||
- `--keep-raw` behavior is tested.
|
||||
- CLI summaries include converted, failed, and warning counts.
|
||||
- Default tests do not require real MinerU, GPU, model files, network, Obsidian, LaTeX tooling, or `samples/`.
|
||||
- `uv sync` passes.
|
||||
- Targeted conversion/CLI tests pass.
|
||||
- `uv run pytest` passes.
|
||||
- `PROGRESS.md` records checks performed and residual risks.
|
||||
- Independent evaluation is complete.
|
||||
- The completed change is committed.
|
||||
|
||||
## Handoff Fields
|
||||
|
||||
Use these fields when Sprint 7 completes:
|
||||
|
||||
- Files changed:
|
||||
- Commands run:
|
||||
- Tests passed:
|
||||
- Tests blocked:
|
||||
- Known failures:
|
||||
- Residual risks:
|
||||
- User decisions needed:
|
||||
- Go/no-go recommendation for Sprint 8:
|
||||
- Next action:
|
||||
@@ -0,0 +1,339 @@
|
||||
# Sprint 8 Contract: Doctor And Setup Documentation
|
||||
|
||||
Status: Implemented
|
||||
Last updated: 2026-05-08
|
||||
|
||||
## Objective
|
||||
|
||||
Make local setup failures explicit before users run conversions by adding a mockable `pdf2md doctor` diagnostic path and setup documentation for Windows PowerShell, Python 3.12, `uv`, MinerU 3.1.0, local model/cache expectations, NVIDIA GPU/CUDA visibility, and strict-local runtime behavior.
|
||||
|
||||
Sprint 8 must establish:
|
||||
|
||||
- A project-owned doctor module for local environment diagnostics.
|
||||
- A `pdf2md doctor` CLI command with deterministic exit codes.
|
||||
- Clear reporting for Python, `uv`, MinerU CLI, MinerU version, GPU/CUDA/PyTorch visibility, and model/cache path detection.
|
||||
- Documentation that explains setup steps and local-only runtime constraints without introducing cloud/API fallback paths.
|
||||
- Fast tests that mock environment checks and do not require real MinerU, CUDA, GPU, model files, network, `samples/`, Obsidian, LaTeX tooling, or package/model downloads.
|
||||
|
||||
Sprint 8 is a diagnostics and setup-documentation sprint. It must not change conversion output behavior, run real conversions, probe local sample PDFs, or make runtime conversion depend on network access.
|
||||
|
||||
## Current Precondition
|
||||
|
||||
Sprint 7 is complete:
|
||||
|
||||
- `src/pdf2md/paths.py` owns input discovery and output path planning.
|
||||
- `src/pdf2md/ir.py` owns project records, block types, warning codes, and warning severities.
|
||||
- `src/pdf2md/metadata.py` builds JSON-serializable metadata and summary counts from project-owned records.
|
||||
- `src/pdf2md/mineru_adapter.py` owns the mocked direct local MinerU CLI adapter boundary.
|
||||
- `src/pdf2md/markdown.py` owns Obsidian Markdown normalization.
|
||||
- `src/pdf2md/quality.py` owns local quality checks, including math checker unavailable behavior.
|
||||
- `src/pdf2md/report.py` owns report content rendering and final status calculation.
|
||||
- `src/pdf2md/conversion.py` owns conversion orchestration and output writing.
|
||||
- `src/pdf2md/cli.py` owns `pdf2md convert`.
|
||||
- `uv run pytest` passed 119 tests.
|
||||
|
||||
Sprint 8 may add doctor diagnostics and setup documentation. It must not require a real successful local MinerU/GPU/model setup in the default test loop.
|
||||
|
||||
## Touched Surfaces
|
||||
|
||||
Allowed:
|
||||
|
||||
- `src/pdf2md/doctor.py`
|
||||
- `src/pdf2md/cli.py`
|
||||
- `src/pdf2md/mineru_adapter.py` only for narrowly required availability/version helper compatibility
|
||||
- `README.md`
|
||||
- `scripts/install-mineru.ps1` only if implemented as an explicit user-invoked setup helper
|
||||
- `scripts/install-models.py` only if implemented as an explicit user-invoked setup helper
|
||||
- `tests/test_doctor.py`
|
||||
- `tests/test_cli.py`
|
||||
- `tests/test_mineru_adapter.py` only if adapter helper compatibility changes
|
||||
- `PLAN.md`
|
||||
- `PROGRESS.md`
|
||||
- `docs/V1IMPLEMENTATIONPLAN.md`
|
||||
- `docs/Sprints/SPRINT8CONTRACT.md`
|
||||
|
||||
Not allowed:
|
||||
|
||||
- Changes to `src/pdf2md/conversion.py` unless a doctor/CLI regression forces a narrow compatibility fix
|
||||
- Changes to Markdown normalization, metadata schema, report rendering, or path planning unrelated to doctor behavior
|
||||
- Real PDF conversion in default tests
|
||||
- Real MinerU execution in default tests
|
||||
- Real CUDA/GPU dependency in default tests
|
||||
- Model downloads in default tests
|
||||
- Any setup download triggered by `pdf2md doctor`, `pdf2md convert`, import time, or tests
|
||||
- Runtime engine selection or alternate engine support
|
||||
- Cloud OCR, remote LLM/VLM, hosted renderer, remote document parser, remote asset fetching, HTTP client backend, router mode, `--api-url`, remote APIs, or remote OpenAI-compatible backend support
|
||||
- A CLI/API option that disables strict-local policy
|
||||
- Committed files under `samples/`
|
||||
- Generated conversion outputs committed to git
|
||||
|
||||
## Expected Outputs
|
||||
|
||||
Sprint 8 should produce:
|
||||
|
||||
1. Doctor result records and API
|
||||
- A small project-owned doctor result type containing at least:
|
||||
- check name
|
||||
- status: `pass`, `warn`, or `fail`
|
||||
- human-readable message
|
||||
- optional details
|
||||
- A doctor report type containing ordered checks and an overall status.
|
||||
- Mockable checker dependencies for subprocess calls, executable discovery, Python version, imports, environment variables, and filesystem paths.
|
||||
- No public or required field should expose raw subprocess or third-party objects.
|
||||
|
||||
2. Required checks
|
||||
- Python version check:
|
||||
- Pass on Python 3.12.
|
||||
- Fail outside the supported project range.
|
||||
- `uv` check:
|
||||
- Detect executable availability.
|
||||
- Report version text when available.
|
||||
- Fail clearly when missing.
|
||||
- MinerU check:
|
||||
- Detect direct local `mineru` CLI availability through the existing adapter boundary where possible.
|
||||
- Report MinerU version when available.
|
||||
- Fail clearly when the CLI is missing.
|
||||
- Warn or fail clearly when version detection fails or the detected version is not MinerU 3.1.0.
|
||||
- GPU/CUDA/PyTorch visibility:
|
||||
- Report whether an NVIDIA GPU is visible when detectable.
|
||||
- Report CUDA/PyTorch visibility without requiring PyTorch in default tests.
|
||||
- Warn clearly when GPU/CUDA/PyTorch acceleration is unavailable.
|
||||
- Warn clearly when the detected GPU is GTX 1070 Ti or another Pascal/pre-Turing class GPU with likely CUDA/PyTorch compatibility risk.
|
||||
- Model/cache paths:
|
||||
- Report detectable local model/cache paths from documented environment variables or known local config locations.
|
||||
- Warn when model/cache paths cannot be detected.
|
||||
- Do not download, install, or validate large model files in default tests.
|
||||
- Local-only policy:
|
||||
- Report that runtime conversion allows only direct local `mineru` CLI execution and CLI-internal temporary local `mineru-api`.
|
||||
- Report that `--api-url`, remote APIs, router mode, HTTP client backends, and remote OpenAI-compatible backends are prohibited.
|
||||
|
||||
3. CLI command
|
||||
- `pdf2md doctor` exists.
|
||||
- `pdf2md --version` remains unchanged.
|
||||
- `pdf2md convert` behavior remains covered and unchanged except for parser integration.
|
||||
- Exit code policy:
|
||||
- `0` when all checks pass or only warnings exist.
|
||||
- Non-zero when any required check fails.
|
||||
- CLI output must be concise, deterministic, and testable.
|
||||
- CLI output must distinguish warnings from failures.
|
||||
|
||||
4. Setup documentation
|
||||
- README or setup section explains:
|
||||
- Windows PowerShell workflow.
|
||||
- Python 3.12 requirement.
|
||||
- `uv` usage and PATH note for `C:\Users\user\.local\bin`.
|
||||
- MinerU 3.1.0 local CLI expectation.
|
||||
- Model/cache setup expectations and where doctor looks.
|
||||
- NVIDIA GPU expectations and GTX 1070 Ti 8GB risk.
|
||||
- Strict-local runtime policy.
|
||||
- Difference between explicit setup downloads and runtime conversion, which must stay local-only.
|
||||
- If setup helper scripts are added, they must be explicit user-invoked helpers, not imported by package code, not called by `doctor`, and not used by default tests.
|
||||
|
||||
5. Tests
|
||||
- Unit tests for doctor success with mocked checks.
|
||||
- Unit tests for an unsupported Python version.
|
||||
- Unit tests for missing `uv`.
|
||||
- Unit tests for missing MinerU.
|
||||
- Unit tests for MinerU version detection failure or non-3.1.0 version.
|
||||
- Unit tests for missing GPU/CUDA/PyTorch warning behavior.
|
||||
- Unit tests for GTX 1070 Ti/Pascal risk warning behavior.
|
||||
- Unit tests for missing model/cache warning behavior.
|
||||
- CLI tests for `pdf2md doctor` success, warning-only success, and hard failure exit code.
|
||||
- Regression tests proving `pdf2md convert` and `pdf2md --version` still work.
|
||||
- Tests proving default checks do not require real MinerU, GPU, CUDA, PyTorch, model files, network, `samples/`, Obsidian, or LaTeX tooling.
|
||||
|
||||
6. Handoff
|
||||
- `PROGRESS.md` records changed files, commands run, tests passed or blocked, known failures, residual risks, and next action.
|
||||
|
||||
## Non-Goals
|
||||
|
||||
- Do not run real MinerU in default tests.
|
||||
- Do not install MinerU 3.1.0.
|
||||
- Do not download MinerU models.
|
||||
- Do not run real model setup during doctor or tests.
|
||||
- Do not parse, convert, or inspect sample PDFs.
|
||||
- Do not implement local fixture evaluation.
|
||||
- Do not change conversion orchestration behavior except for unavoidable CLI parser integration.
|
||||
- Do not add runtime engine selection.
|
||||
- Do not add alternate engines.
|
||||
- Do not add cloud, remote API, router, HTTP client backend, remote OpenAI-compatible backend, hosted renderer, or remote asset-fetching support.
|
||||
- Do not add a CLI/API option that disables strict-local policy.
|
||||
- Do not require real CUDA, GPU, PyTorch, MinerU, model files, network, Obsidian, LaTeX tooling, or `samples/` in default tests.
|
||||
- Do not claim that GTX 1070 Ti CUDA/PyTorch acceleration is guaranteed until local validation proves it.
|
||||
- Do not claim perfect setup automation.
|
||||
|
||||
## Work Packages
|
||||
|
||||
### WP8.1: Doctor Records And Mockable Check Boundaries
|
||||
|
||||
Owner:
|
||||
|
||||
- `local-setup-agent`
|
||||
- `feature-generator-agent`
|
||||
|
||||
Actions:
|
||||
|
||||
- Add `doctor.py`.
|
||||
- Define doctor check/result records.
|
||||
- Keep all external probes injectable or mockable.
|
||||
- Implement deterministic aggregation and exit status policy.
|
||||
|
||||
Output:
|
||||
|
||||
- The CLI can report setup health without depending on real local tools in tests.
|
||||
|
||||
### WP8.2: Environment And MinerU Diagnostics
|
||||
|
||||
Owner:
|
||||
|
||||
- `local-setup-agent`
|
||||
- `mineru-integration-agent`
|
||||
- `feature-generator-agent`
|
||||
|
||||
Actions:
|
||||
|
||||
- Add Python, `uv`, MinerU availability, MinerU version, GPU/CUDA/PyTorch, and model/cache checks.
|
||||
- Reuse the direct local MinerU adapter boundary where it fits.
|
||||
- Keep MinerU 3.1.0 as the only accepted engine target.
|
||||
|
||||
Output:
|
||||
|
||||
- Users get clear local setup failures before conversion.
|
||||
|
||||
### WP8.3: CLI Doctor Command
|
||||
|
||||
Owner:
|
||||
|
||||
- `feature-generator-agent`
|
||||
- `requirements-guard-agent`
|
||||
|
||||
Actions:
|
||||
|
||||
- Add `pdf2md doctor` without breaking `pdf2md convert` or `--version`.
|
||||
- Print deterministic pass/warn/fail lines and an overall status.
|
||||
- Return non-zero when required checks fail.
|
||||
|
||||
Output:
|
||||
|
||||
- The command-line workflow can diagnose setup state.
|
||||
|
||||
### WP8.4: Setup Documentation And Optional Explicit Helpers
|
||||
|
||||
Owner:
|
||||
|
||||
- `local-setup-agent`
|
||||
- `license-privacy-agent`
|
||||
- `requirements-guard-agent`
|
||||
|
||||
Actions:
|
||||
|
||||
- Update README setup docs for Windows PowerShell, Python 3.12, `uv`, MinerU 3.1.0, model/cache, GPU, and local-only runtime policy.
|
||||
- Verify volatile install/setup claims against official docs before editing.
|
||||
- If adding scripts, keep them explicit, local setup-only, and never called by doctor, convert, import time, or default tests.
|
||||
|
||||
Output:
|
||||
|
||||
- Setup instructions are clear without weakening strict-local runtime policy.
|
||||
|
||||
### WP8.5: Independent Evaluation
|
||||
|
||||
Owner:
|
||||
|
||||
- `evaluation-agent`
|
||||
|
||||
Actions:
|
||||
|
||||
- Review completed doctor behavior and docs against this contract.
|
||||
- Verify no default test executes real MinerU, uses GPU/CUDA, downloads models, uses network, or requires `samples/`.
|
||||
- Verify no runtime remote/API path or alternate engine is introduced.
|
||||
- Verify `samples/` remains untracked and unstaged.
|
||||
|
||||
Output:
|
||||
|
||||
- PASS/FAIL notes with any missing acceptance criteria.
|
||||
|
||||
## Verification Checks
|
||||
|
||||
Required:
|
||||
|
||||
- `git status --short --untracked-files=all` before staging confirms `samples/` remains untracked and unstaged.
|
||||
- `uv --version` is run and its result is recorded.
|
||||
- `uv sync` passes.
|
||||
- `uv run pytest tests/test_doctor.py tests/test_cli.py` passes.
|
||||
- `uv run pytest` passes.
|
||||
- `git diff --check` passes.
|
||||
- Default tests do not require real MinerU, CUDA, GPU, PyTorch, model files, network, Obsidian, LaTeX tooling, or `samples/`.
|
||||
- No model downloads occur.
|
||||
- No setup downloads occur from doctor, convert, imports, or tests.
|
||||
- No network calls are required in default tests.
|
||||
- No candidate engine comparison is reintroduced.
|
||||
- No alternate engine or runtime engine selection is added.
|
||||
- No CLI/API option disables strict-local policy.
|
||||
- No `--api-url`, router mode, HTTP client backend, remote API, or remote OpenAI-compatible backend support is added.
|
||||
- Doctor fails clearly when required dependencies are missing.
|
||||
- Doctor does not report the environment as healthy when MinerU is missing.
|
||||
- Doctor warnings are clear for GPU/CUDA/PyTorch/model-cache risk.
|
||||
- `pdf2md convert` tests still pass.
|
||||
- `pdf2md --version` still works.
|
||||
|
||||
Recommended:
|
||||
|
||||
- Keep doctor checks small, ordered, and deterministic.
|
||||
- Keep human-readable output stable enough for unit tests.
|
||||
- Use dependency injection rather than monkeypatching global process state where possible.
|
||||
- Treat real local GPU/MinerU/model probes as optional manual verification outside the default test suite.
|
||||
- Use `requirements-guard-agent` if setup wording risks weakening strict-local policy.
|
||||
- Use `research-agent` or `local-setup-agent` with live web verification before changing volatile installation commands or model-cache documentation.
|
||||
|
||||
## Hard Failure Criteria
|
||||
|
||||
Sprint 8 fails and must stop for a user decision if any of these are true:
|
||||
|
||||
- Doctor reports a healthy environment when MinerU is missing.
|
||||
- Doctor says cloud/API fallback is supported.
|
||||
- Doctor, import time, or default tests install packages, download models, call network services, or run model setup.
|
||||
- Default tests require real MinerU, CUDA, GPU, PyTorch, model files, network, Obsidian, LaTeX tooling, or `samples/`.
|
||||
- The implementation adds or permits `--api-url`, remote APIs, router mode, HTTP client backends, or remote OpenAI-compatible backends.
|
||||
- The implementation adds runtime engine selection or alternate engines.
|
||||
- `pdf2md doctor` breaks `pdf2md convert` or `pdf2md --version`.
|
||||
- The README or scripts imply runtime conversion can upload PDFs, page images, extracted text, or intermediates to remote services.
|
||||
- Setup helper scripts are invoked automatically by doctor, convert, import time, or tests.
|
||||
- `samples/` is staged or committed.
|
||||
|
||||
## Acceptance Criteria
|
||||
|
||||
Sprint 8 is complete when:
|
||||
|
||||
- `src/pdf2md/doctor.py` exists and owns local setup diagnostics.
|
||||
- `pdf2md doctor` exists.
|
||||
- Doctor returns a project-owned report with ordered checks and overall status.
|
||||
- Python 3.12, `uv`, MinerU availability/version, GPU/CUDA/PyTorch visibility, and model/cache path checks are implemented with mocked tests.
|
||||
- Missing `uv` is tested and produces a failure.
|
||||
- Missing MinerU is tested and produces a failure.
|
||||
- Missing GPU/CUDA/PyTorch is tested and produces clear warning behavior.
|
||||
- GTX 1070 Ti/Pascal risk warning behavior is tested.
|
||||
- Missing model/cache path warning behavior is tested.
|
||||
- `pdf2md doctor` exit code behavior is tested for success, warning-only success, and failure.
|
||||
- `pdf2md convert` and `pdf2md --version` regression tests still pass.
|
||||
- Setup docs explain local-only runtime behavior and do not imply cloud/API fallback.
|
||||
- Default tests do not require real MinerU, GPU, CUDA, PyTorch, model files, network, Obsidian, LaTeX tooling, or `samples/`.
|
||||
- `uv sync` passes.
|
||||
- Targeted doctor/CLI tests pass.
|
||||
- `uv run pytest` passes.
|
||||
- `PROGRESS.md` records checks performed and residual risks.
|
||||
- Independent evaluation is complete.
|
||||
- The completed change is committed.
|
||||
|
||||
## Handoff Fields
|
||||
|
||||
Use these fields when Sprint 8 completes:
|
||||
|
||||
- Files changed:
|
||||
- Commands run:
|
||||
- Tests passed:
|
||||
- Tests blocked:
|
||||
- Known failures:
|
||||
- Residual risks:
|
||||
- User decisions needed:
|
||||
- Go/no-go recommendation for Sprint 9:
|
||||
- Next action:
|
||||
@@ -0,0 +1,320 @@
|
||||
# Sprint 9 Contract: Local Fixture Evaluation And V1 Release Gate
|
||||
|
||||
Status: Implemented
|
||||
Last updated: 2026-05-08
|
||||
|
||||
## Objective
|
||||
|
||||
Validate the v1 converter against local fixture workflows without committing sample PDFs or making the default test loop depend on MinerU models, GPU, CUDA, network access, Obsidian, or LaTeX tooling.
|
||||
|
||||
Sprint 9 must establish:
|
||||
|
||||
- A fast mocked integration suite that exercises the public conversion path end to end.
|
||||
- An optional, explicitly enabled local MinerU fixture evaluation path for `samples/`.
|
||||
- A fixture coverage manifest or checklist that records which local PDFs cover math, tables, figures/assets, reading order, Korean filenames, and metadata/report risks.
|
||||
- Release-gate documentation that distinguishes default automated checks from optional local MinerU/GPU checks.
|
||||
- Clear `PROGRESS.md` notes for local fixture coverage, skipped/blocked optional checks, known quality risks, and the v1 go/no-go recommendation.
|
||||
|
||||
Sprint 9 is an evaluation and release-gate sprint. It may add tests, local-only evaluation helpers, fixture manifests, and narrow compatibility fixes only when needed to evaluate the current v1 behavior. It must not add alternate engines, cloud/API paths, runtime engine selection, or automatic model downloads.
|
||||
|
||||
## Current Precondition
|
||||
|
||||
Sprint 8 is complete:
|
||||
|
||||
- `pdf2md doctor` exists and reports Python, `uv`, MinerU CLI/version, GPU, PyTorch, model/cache, and strict-local policy status.
|
||||
- Local `pdf2md doctor` currently fails because the `mineru` CLI is not installed on PATH.
|
||||
- `pdf2md convert` exists and writes Markdown, metadata JSON, and `<stem>.report.md` with fake-adapter test coverage.
|
||||
- Default tests pass without real MinerU, CUDA, GPU, model files, network, Obsidian, LaTeX tooling, or `samples/`.
|
||||
- `samples/` exists locally and is untracked. Observed local fixture files include:
|
||||
- `samples/FourNodeQuadrilateralShellElementMITC4.pdf`
|
||||
- `samples/MITC공부.pdf`
|
||||
- `samples/2007쉘구조물의유한요소해석에대하여.pdf`
|
||||
- `samples/유한요소해석법을이용한쉘구조물의동적좌굴해석.pdf`
|
||||
- `samples/metadata.json`
|
||||
|
||||
Sprint 9 must preserve the untracked status of `samples/` unless the user explicitly requests otherwise.
|
||||
|
||||
## Touched Surfaces
|
||||
|
||||
Allowed:
|
||||
|
||||
- `tests/integration/`
|
||||
- `tests/test_conversion.py`
|
||||
- `tests/test_cli.py`
|
||||
- `tests/test_report.py`
|
||||
- `tests/test_metadata.py`
|
||||
- `tests/test_quality.py`
|
||||
- `tests/conftest.py` only for markers or opt-in fixture controls
|
||||
- `src/pdf2md/mineru_adapter.py` only for narrow compatibility fixes backed by mocked or optional local MinerU output evidence
|
||||
- `src/pdf2md/conversion.py` only for narrow release-gate defects found by integration tests
|
||||
- `src/pdf2md/quality.py` only for local quality metric defects found by integration tests
|
||||
- `src/pdf2md/report.py` only for report defects found by integration tests
|
||||
- `README.md`
|
||||
- `docs/V1RELEASECHECKLIST.md`
|
||||
- `docs/V1IMPLEMENTATIONPLAN.md`
|
||||
- `docs/Sprints/SPRINT9CONTRACT.md`
|
||||
- `PLAN.md`
|
||||
- `PROGRESS.md`
|
||||
|
||||
Not allowed:
|
||||
|
||||
- Committed files under `samples/`
|
||||
- Committed generated conversion outputs from local sample PDFs
|
||||
- Mandatory tests that require real MinerU, GPU, CUDA, PyTorch, model files, network, Obsidian, LaTeX tooling, or `samples/`
|
||||
- Automatic package installs or model downloads from tests, import time, doctor, convert, or helpers
|
||||
- Runtime engine selection or alternate conversion engines
|
||||
- Cloud OCR, remote LLM/VLM, hosted renderer, remote document parser, remote asset fetching, `--api-url`, router mode, HTTP client backends, remote APIs, or remote OpenAI-compatible backends
|
||||
- CLI/API options that disable strict-local policy
|
||||
- Claims that v1 perfectly reconstructs LaTeX, tables, or reading order
|
||||
|
||||
## Expected Outputs
|
||||
|
||||
1. Fast mocked integration suite
|
||||
- Exercises `convert_pdf` and/or `pdf2md convert` with a fake MinerU adapter through the real orchestration path.
|
||||
- Verifies Markdown, metadata JSON, and `<stem>.report.md` are all written.
|
||||
- Verifies output paths, asset links, warning counts, and report status stay consistent.
|
||||
- Verifies failures produce metadata/report warnings when possible and do not silently fall back.
|
||||
- Runs as part of `uv run pytest` without real MinerU, models, GPU, network, Obsidian, LaTeX tooling, or `samples/`.
|
||||
|
||||
2. Optional local MinerU fixture evaluation
|
||||
- Provides an explicit opt-in command or pytest marker/environment gate for real local MinerU sample evaluation.
|
||||
- Skips or reports a clear local blocker when `pdf2md doctor` fails because MinerU, model/cache paths, or GPU/PyTorch acceleration are unavailable.
|
||||
- Reads sample PDFs only from `samples/` or a user-provided local sample directory.
|
||||
- Writes generated outputs to a temporary or ignored output directory, never to tracked fixture paths.
|
||||
- Produces or records, for each attempted sample:
|
||||
- source filename
|
||||
- command run
|
||||
- exit code
|
||||
- generated Markdown path
|
||||
- generated metadata JSON path
|
||||
- generated `.report.md` path
|
||||
- warning count
|
||||
- math renderability or checker-unavailable count
|
||||
- table fallback/degradation count when available
|
||||
- missing or broken asset link count
|
||||
- page coverage when available
|
||||
- Does not mark optional evaluation as passed when MinerU is missing; it records the blocker.
|
||||
|
||||
3. Fixture coverage manifest or checklist
|
||||
- Maps local sample files to risk categories:
|
||||
- simple digital PDF
|
||||
- math-heavy PDF
|
||||
- multi-column or complex reading order
|
||||
- table with formulas
|
||||
- figure/caption/assets
|
||||
- Korean filename/path handling
|
||||
- May store only relative sample names, categories, and notes; it must not embed sample PDFs or generated outputs.
|
||||
- Records coverage gaps that need additional user-provided samples.
|
||||
|
||||
4. V1 release checklist
|
||||
- Defines default release gates:
|
||||
- `uv sync`
|
||||
- `uv run pytest`
|
||||
- `uv run pdf2md --version`
|
||||
- `uv run pdf2md doctor`
|
||||
- `git diff --check`
|
||||
- `git status --short --untracked-files=all`
|
||||
- Defines optional local MinerU release gates separately from default gates.
|
||||
- Requires Markdown, metadata JSON, and `.report.md` to exist before any sample conversion is considered successful.
|
||||
- Requires warnings and residual risks to be recorded in `PROGRESS.md`.
|
||||
- Makes local-only and no-sample-commit checks explicit.
|
||||
|
||||
5. Documentation
|
||||
- README or release checklist explains how to run default checks and optional local fixture checks.
|
||||
- Documentation states that optional fixture checks may be skipped or blocked until MinerU 3.1.0 and model/cache setup are available.
|
||||
- Documentation does not instruct users to use `--api-url`, router mode, HTTP client backends, remote APIs, or remote OpenAI-compatible backends.
|
||||
|
||||
6. Handoff
|
||||
- `PROGRESS.md` records changed files, commands run, tests passed or blocked, local fixture status, generated output location if any, known failures, residual risks, and next action.
|
||||
|
||||
## Non-Goals
|
||||
|
||||
- Do not install MinerU.
|
||||
- Do not download MinerU models.
|
||||
- Do not run model setup automatically.
|
||||
- Do not require the local GTX 1070 Ti to pass CUDA/PyTorch checks in the default test loop.
|
||||
- Do not improve OCR/model accuracy.
|
||||
- Do not introduce a manual review UI or web UI.
|
||||
- Do not add alternate conversion engines or fallback engines.
|
||||
- Do not benchmark against cloud OCR/API services.
|
||||
- Do not commit sample PDFs, sample-derived outputs, or large binary fixtures.
|
||||
- Do not make text edit distance the only quality criterion.
|
||||
- Do not claim v1 is release-ready if metadata JSON or `.report.md` generation is missing.
|
||||
|
||||
## Work Packages
|
||||
|
||||
### WP9.1: Fast Mocked Integration Checks
|
||||
|
||||
Owner:
|
||||
|
||||
- `feature-generator-agent`
|
||||
- `evaluation-agent`
|
||||
|
||||
Actions:
|
||||
|
||||
- Add integration-level tests that use fake adapter output but run the public conversion orchestration and CLI paths.
|
||||
- Assert generated Markdown, metadata JSON, `.report.md`, assets, warnings, and summaries are mutually consistent.
|
||||
- Keep tests deterministic and independent of real samples.
|
||||
|
||||
Output:
|
||||
|
||||
- `uv run pytest` covers v1 file-output behavior without model or GPU dependencies.
|
||||
|
||||
### WP9.2: Optional MinerU Sample Evaluation Harness
|
||||
|
||||
Owner:
|
||||
|
||||
- `mineru-integration-agent`
|
||||
- `local-setup-agent`
|
||||
- `evaluation-agent`
|
||||
|
||||
Actions:
|
||||
|
||||
- Add an explicit opt-in local fixture command/test path.
|
||||
- Gate real MinerU execution behind an environment variable, marker, or explicit command documented in README/checklist.
|
||||
- Run `pdf2md doctor` or equivalent preflight before optional local MinerU evaluation.
|
||||
- Use temporary or ignored output directories.
|
||||
- Record blocked status clearly when MinerU/model/cache setup is missing.
|
||||
|
||||
Output:
|
||||
|
||||
- Local users can run real sample evaluation when setup is ready, while default tests stay fast and local.
|
||||
|
||||
### WP9.3: Fixture Coverage And Metrics
|
||||
|
||||
Owner:
|
||||
|
||||
- `evaluation-agent`
|
||||
- `obsidian-markdown-agent`
|
||||
- `metadata-agent`
|
||||
|
||||
Actions:
|
||||
|
||||
- Define fixture categories and expected risk coverage.
|
||||
- Track math delimiter/renderability, tables, reading order, assets, page coverage, metadata fields, warning counts, and report usefulness.
|
||||
- Avoid scoring quality only by plain-text edit distance.
|
||||
|
||||
Output:
|
||||
|
||||
- Fixture coverage is explicit and gaps are visible.
|
||||
|
||||
### WP9.4: V1 Release Gate Documentation
|
||||
|
||||
Owner:
|
||||
|
||||
- `requirements-guard-agent`
|
||||
- `evaluation-agent`
|
||||
|
||||
Actions:
|
||||
|
||||
- Add or update release checklist documentation.
|
||||
- Separate default release gates from optional local MinerU/GPU gates.
|
||||
- Keep strict-local wording consistent with `ARCHITECTURE.md`, `PRD.md`, and `README.md`.
|
||||
- Update `PLAN.md` and `PROGRESS.md` with the next action and release readiness state.
|
||||
|
||||
Output:
|
||||
|
||||
- A future agent can determine whether v1 is blocked, partial, or ready without relying on conversation history.
|
||||
|
||||
### WP9.5: Independent Evaluation
|
||||
|
||||
Owner:
|
||||
|
||||
- `evaluation-agent`
|
||||
|
||||
Actions:
|
||||
|
||||
- Review completed Sprint 9 work against this contract.
|
||||
- Verify default tests do not require real MinerU, GPU, CUDA, PyTorch, model files, network, Obsidian, LaTeX tooling, or `samples/`.
|
||||
- Verify optional local MinerU evaluation is clearly gated.
|
||||
- Verify generated sample outputs and sample PDFs are not staged.
|
||||
- Verify release checklist cannot pass without Markdown, metadata JSON, and `.report.md`.
|
||||
|
||||
Output:
|
||||
|
||||
- PASS/FAIL notes with actionable findings and residual risk.
|
||||
|
||||
## Verification Checks
|
||||
|
||||
Required:
|
||||
|
||||
- `git status --short --untracked-files=all` before staging confirms `samples/` remains untracked and unstaged.
|
||||
- `uv --version` is run and its result is recorded.
|
||||
- `uv sync` passes.
|
||||
- `uv run pytest` passes.
|
||||
- Targeted integration tests pass.
|
||||
- `uv run pdf2md --version` passes.
|
||||
- `uv run pdf2md doctor` is run and its result is recorded as pass, warn, or blocked/fail.
|
||||
- `git diff --check` passes.
|
||||
- Default tests do not require real MinerU, CUDA, GPU, PyTorch, model files, network, Obsidian, LaTeX tooling, or `samples/`.
|
||||
- No model downloads occur.
|
||||
- No setup downloads occur from tests, import time, doctor, convert, or helper scripts.
|
||||
- No network calls are required in default tests.
|
||||
- No candidate engine comparison is reintroduced.
|
||||
- No alternate engine or runtime engine selection is added.
|
||||
- No CLI/API option disables strict-local policy.
|
||||
- No `--api-url`, router mode, HTTP client backend, remote API, or remote OpenAI-compatible backend support is added.
|
||||
- Optional local MinerU checks are skipped or blocked clearly when setup is unavailable.
|
||||
- Sample PDFs and generated sample outputs are not staged or committed.
|
||||
- `PROGRESS.md` records local fixture coverage status and release readiness.
|
||||
|
||||
Recommended:
|
||||
|
||||
- Add a pytest marker or environment variable for optional local MinerU tests.
|
||||
- Keep optional output under a temporary directory or an ignored local output root.
|
||||
- Include at least one Korean filename/path check in fast mocked tests.
|
||||
- Include one fake output with math, one with a table warning, and one with an asset link.
|
||||
- Record source-to-output paths in release checklist examples.
|
||||
- Treat local doctor failure as a release blocker for real MinerU validation but not for the default fast test loop.
|
||||
|
||||
## Hard Failure Criteria
|
||||
|
||||
Sprint 9 fails and must stop for a user decision if any of these are true:
|
||||
|
||||
- Default tests require real MinerU, GPU, CUDA, PyTorch, model files, network, Obsidian, LaTeX tooling, or `samples/`.
|
||||
- Sample PDFs or generated sample outputs are staged or committed.
|
||||
- Optional real MinerU evaluation runs without an explicit opt-in gate.
|
||||
- Optional real MinerU evaluation writes generated output into tracked fixture paths.
|
||||
- V1 release checklist can pass without generated Markdown, metadata JSON, or `.report.md`.
|
||||
- Release status is marked ready when `pdf2md doctor` has a hard failure and no explicit user waiver is recorded.
|
||||
- The implementation adds runtime engine selection or alternate engines.
|
||||
- The implementation adds or permits `--api-url`, remote APIs, router mode, HTTP client backends, or remote OpenAI-compatible backends.
|
||||
- The implementation uses cloud/API fallback for any fixture evaluation.
|
||||
- The implementation hides MinerU failure or silently falls back to another engine.
|
||||
- Quality criteria ignore math, tables, reading order, assets, metadata, or report quality.
|
||||
|
||||
## Acceptance Criteria
|
||||
|
||||
Sprint 9 is complete when:
|
||||
|
||||
- `docs/Sprints/SPRINT9CONTRACT.md` exists and is referenced by relevant agents.
|
||||
- Fast mocked integration tests exist and pass under `uv run pytest`.
|
||||
- Optional local MinerU fixture evaluation is documented and explicitly gated.
|
||||
- Local fixture coverage categories and gaps are recorded.
|
||||
- Release checklist documentation exists or is updated.
|
||||
- `PROGRESS.md` records optional local MinerU status, including skipped/blocked reasons when applicable.
|
||||
- Default tests do not require real MinerU, GPU, CUDA, PyTorch, model files, network, Obsidian, LaTeX tooling, or `samples/`.
|
||||
- No sample PDF or generated sample output is staged or committed.
|
||||
- `uv sync` passes.
|
||||
- `uv run pytest` passes.
|
||||
- `git diff --check` passes.
|
||||
- Independent evaluation is complete.
|
||||
- The completed change is committed.
|
||||
|
||||
## Handoff Fields
|
||||
|
||||
Use these fields when Sprint 9 completes:
|
||||
|
||||
- Files changed:
|
||||
- Commands run:
|
||||
- Tests passed:
|
||||
- Tests blocked:
|
||||
- Optional local MinerU status:
|
||||
- Fixture coverage:
|
||||
- Generated output locations:
|
||||
- Known failures:
|
||||
- Residual risks:
|
||||
- User decisions needed:
|
||||
- V1 release recommendation:
|
||||
- Go/no-go recommendation for next sprint:
|
||||
- Next action:
|
||||
@@ -0,0 +1,661 @@
|
||||
# V1 Implementation Plan: Local PDF-to-Markdown Converter
|
||||
|
||||
Last updated: 2026-05-08
|
||||
|
||||
This document is the implementation plan for v1. It does not replace `PRD.md` or `ARCHITECTURE.md`; use those files as the source of product requirements and system design. This plan explains the order of work, sprint contracts, verification gates, and agent ownership for implementing the converter.
|
||||
|
||||
Sprint 1 created the Python package scaffold and CLI placeholder. Sprint 2 created path planning. Sprint 3 created project-owned records and metadata construction. Sprint 4 created the mocked direct local MinerU adapter boundary. Sprint 5 created the Obsidian Markdown normalization boundary. Sprint 6 created local quality-check and report-rendering boundaries. Sprint 7 implemented conversion orchestration, the public conversion API, and the `pdf2md convert` CLI path with fake-adapter tests. Sprint 8 implemented mockable doctor diagnostics, the `pdf2md doctor` CLI path, and setup documentation. Sprint 9 implemented fast mocked integration tests, explicit opt-in local MinerU fixture evaluation, and the v1 release checklist. Sprint 10 implemented opt-in pre-conversion PDF chunking for long documents.
|
||||
|
||||
## 1. V1 Outcome
|
||||
|
||||
v1 is complete when a local user can run:
|
||||
|
||||
```bash
|
||||
uv run pdf2md doctor
|
||||
uv run pdf2md convert paper.pdf --out out --metadata
|
||||
uv run pdf2md convert pdfs --out out --recursive --metadata
|
||||
```
|
||||
|
||||
and receive, for each PDF:
|
||||
|
||||
- Obsidian-friendly Markdown.
|
||||
- A stable sibling assets directory when assets exist.
|
||||
- `<stem>.metadata.json`.
|
||||
- `<stem>.report.md`.
|
||||
- Clear warnings when math, tables, assets, reading order, GPU availability, or MinerU execution are uncertain.
|
||||
|
||||
Long PDFs can be chunked explicitly:
|
||||
|
||||
```bash
|
||||
uv run pdf2md convert paper.pdf --out out --chunk-pages
|
||||
uv run pdf2md convert paper.pdf --out out --chunk-pages 20
|
||||
```
|
||||
|
||||
Chunked conversion writes separate outputs per chunk and does not merge Markdown files.
|
||||
|
||||
The converter must use MinerU 3.1.0 through direct local CLI execution only. It must not silently fall back to another engine.
|
||||
|
||||
## 2. Non-Negotiable Constraints
|
||||
|
||||
- Python 3.12 and `uv`.
|
||||
- MinerU 3.1.0 is the only conversion engine.
|
||||
- Direct local MinerU CLI execution only.
|
||||
- MinerU 3.1.0 may launch a temporary local `mineru-api` internally when the CLI runs without `--api-url`.
|
||||
- No cloud OCR, hosted LLM/VLM, remote document parser, `--api-url`, remote APIs, router mode, HTTP client backends, or remote OpenAI-compatible backends.
|
||||
- Target hardware: NVIDIA GTX 1070 Ti 8GB.
|
||||
- Digital PDFs with text layers are the v1 priority.
|
||||
- `samples/` is local fixture context and must not be committed unless explicitly requested.
|
||||
- Every substantial implementation chunk needs a sprint contract and independent evaluation.
|
||||
|
||||
## 3. Harness Operating Model
|
||||
|
||||
Use the project long-running harness only for substantial implementation work.
|
||||
|
||||
1. `harness-planner-agent` turns the next user request into a sprint contract.
|
||||
2. `evaluation-agent` reviews the contract before code changes start.
|
||||
3. `feature-generator-agent` implements one approved contract at a time.
|
||||
4. `feature-generator-agent` runs self-checks and records residual risks.
|
||||
5. `evaluation-agent` independently verifies the result against the contract.
|
||||
6. The parent agent updates `PROGRESS.md`, commits the completed change, and leaves a handoff.
|
||||
|
||||
Each sprint contract must include:
|
||||
|
||||
- Objective.
|
||||
- Touched surfaces.
|
||||
- Expected outputs.
|
||||
- Non-goals.
|
||||
- Verification checks.
|
||||
- Hard failure criteria.
|
||||
- Handoff fields.
|
||||
|
||||
## 4. Proposed Repository Layout
|
||||
|
||||
Create this layout incrementally; do not scaffold unused modules before a sprint needs them.
|
||||
|
||||
```text
|
||||
pyproject.toml
|
||||
README.md
|
||||
src/
|
||||
pdf2md/
|
||||
__init__.py
|
||||
cli.py
|
||||
conversion.py
|
||||
pdf_splitter.py
|
||||
paths.py
|
||||
mineru_adapter.py
|
||||
ir.py
|
||||
markdown.py
|
||||
metadata.py
|
||||
quality.py
|
||||
report.py
|
||||
doctor.py
|
||||
tests/
|
||||
unit/
|
||||
integration/
|
||||
fixtures/
|
||||
scripts/
|
||||
install-mineru.ps1
|
||||
install-models.py
|
||||
```
|
||||
|
||||
Planned module responsibilities:
|
||||
|
||||
- `cli.py`: command parsing, CLI summaries, exit codes.
|
||||
- `conversion.py`: orchestration for one PDF and batch input.
|
||||
- `paths.py`: input discovery, output path planning, overwrite checks.
|
||||
- `mineru_adapter.py`: direct local MinerU CLI boundary.
|
||||
- `ir.py`: project-owned document/page/block/asset/warning records.
|
||||
- `markdown.py`: Obsidian Markdown normalization.
|
||||
- `metadata.py`: metadata schema creation and warning aggregation.
|
||||
- `quality.py`: local checks for assets, math renderability, and output sanity.
|
||||
- `report.py`: `<stem>.report.md` generation from metadata.
|
||||
- `doctor.py`: environment, dependency, CUDA/GPU, MinerU, and cache diagnostics.
|
||||
|
||||
## 5. Sprint Sequence
|
||||
|
||||
### Sprint 0: Source And Environment Verification
|
||||
|
||||
Active contract:
|
||||
|
||||
- `docs/Sprints/SPRINT0CONTRACT.md`
|
||||
|
||||
Objective:
|
||||
|
||||
- Verify the facts needed before implementation starts.
|
||||
|
||||
Touched surfaces:
|
||||
|
||||
- `docs/KNOWLEDGEBASE.md`
|
||||
- `docs/V1IMPLEMENTATIONPLAN.md` if sequencing changes
|
||||
- `PROGRESS.md`
|
||||
|
||||
Expected outputs:
|
||||
|
||||
- Confirmed MinerU 3.1.0 install command, CLI invocation shape, version command, output paths, and local execution behavior.
|
||||
- Confirmed Python 3.12, `uv`, CUDA/PyTorch, and GTX 1070 Ti 8GB risks.
|
||||
- Confirmed license notes needed before redistribution.
|
||||
|
||||
Verification checks:
|
||||
|
||||
- All volatile facts cite official MinerU, Python, uv, PyTorch/CUDA, or license sources.
|
||||
- No candidate engine comparison is reintroduced.
|
||||
- No implementation code is created.
|
||||
|
||||
Hard failure criteria:
|
||||
|
||||
- MinerU 3.1.0 cannot be reasonably invoked through a direct local CLI on the target environment.
|
||||
- Python 3.12 compatibility is not viable without changing project requirements.
|
||||
|
||||
Primary agents:
|
||||
|
||||
- `research-agent`
|
||||
- `local-setup-agent`
|
||||
- `license-privacy-agent`
|
||||
|
||||
### Sprint 1: Project Scaffold And Fast Test Loop
|
||||
|
||||
Active contract:
|
||||
|
||||
- `docs/Sprints/SPRINT1CONTRACT.md`
|
||||
|
||||
Objective:
|
||||
|
||||
- Create the minimal Python project structure and a fast local test loop.
|
||||
|
||||
Touched surfaces:
|
||||
|
||||
- `pyproject.toml`
|
||||
- `src/pdf2md/__init__.py`
|
||||
- `tests/`
|
||||
- Development documentation if needed
|
||||
|
||||
Expected outputs:
|
||||
|
||||
- `uv sync` works.
|
||||
- `uv run pytest` works.
|
||||
- Project package imports as `pdf2md`.
|
||||
- CLI entry point name `pdf2md` is reserved but may initially expose only `doctor` or a clear placeholder until later sprints.
|
||||
- If `uv` is still unavailable locally, Sprint 1 records that blocker and is not marked complete.
|
||||
|
||||
Verification checks:
|
||||
|
||||
- Import test passes.
|
||||
- Empty test suite or initial scaffold tests pass.
|
||||
- No runtime network dependency is introduced.
|
||||
|
||||
Hard failure criteria:
|
||||
|
||||
- Project cannot be installed with `uv`.
|
||||
- Scaffolding adds speculative config systems, extra engines, or unused abstractions.
|
||||
|
||||
Primary agents:
|
||||
|
||||
- `harness-planner-agent`
|
||||
- `feature-generator-agent`
|
||||
- `evaluation-agent`
|
||||
|
||||
### Sprint 2: Paths, Input Discovery, And Overwrite Planning
|
||||
|
||||
Active contract:
|
||||
|
||||
- `docs/Sprints/SPRINT2CONTRACT.md`
|
||||
|
||||
Objective:
|
||||
|
||||
- Implement deterministic input and output planning before conversion logic exists.
|
||||
|
||||
Touched surfaces:
|
||||
|
||||
- `paths.py`
|
||||
- `conversion.py` skeleton if needed
|
||||
- CLI path handling tests
|
||||
|
||||
Expected outputs:
|
||||
|
||||
- Single PDF discovery.
|
||||
- Directory PDF discovery.
|
||||
- Recursive traversal only when requested.
|
||||
- Deterministic output paths for Markdown, assets, metadata JSON, report, and optional raw MinerU output.
|
||||
- Existing-output protection unless `--overwrite` is passed.
|
||||
|
||||
Verification checks:
|
||||
|
||||
- Unit tests for single PDF path planning.
|
||||
- Unit tests for directory and recursive discovery.
|
||||
- Unit tests for overwrite behavior.
|
||||
- Tests include Korean or non-ASCII filename handling using generated temporary files, not committed sample PDFs.
|
||||
|
||||
Hard failure criteria:
|
||||
|
||||
- Output planning can overwrite user files without explicit overwrite intent.
|
||||
- Directory conversion descends recursively without `--recursive`.
|
||||
|
||||
Primary agents:
|
||||
|
||||
- `feature-generator-agent`
|
||||
- `evaluation-agent`
|
||||
|
||||
### Sprint 3: Domain Records, Metadata, And Warning Model
|
||||
|
||||
Active contract:
|
||||
|
||||
- `docs/Sprints/SPRINT3CONTRACT.md`
|
||||
|
||||
Objective:
|
||||
|
||||
- Define project-owned records before binding to MinerU output.
|
||||
|
||||
Touched surfaces:
|
||||
|
||||
- `ir.py`
|
||||
- `metadata.py`
|
||||
- `report.py` skeleton if needed
|
||||
- Unit tests
|
||||
|
||||
Expected outputs:
|
||||
|
||||
- Document, page, block, asset, and warning records.
|
||||
- Stable warning codes from `ARCHITECTURE.md`.
|
||||
- Metadata JSON builder with required top-level and summary fields.
|
||||
- Warning aggregation logic.
|
||||
|
||||
Verification checks:
|
||||
|
||||
- Unit tests for metadata schema creation.
|
||||
- Unit tests for warning aggregation.
|
||||
- Unit tests confirming that optional fields such as bbox and confidence are preserved only when present.
|
||||
|
||||
Hard failure criteria:
|
||||
|
||||
- Public API requires raw MinerU objects.
|
||||
- Metadata omits source PDF, SHA-256, engine, pages, warnings, assets, or summary.
|
||||
|
||||
Primary agents:
|
||||
|
||||
- `metadata-agent`
|
||||
- `feature-generator-agent`
|
||||
- `evaluation-agent`
|
||||
|
||||
### Sprint 4: MinerU Adapter With Mocked Contract
|
||||
|
||||
Active contract:
|
||||
|
||||
- `docs/Sprints/SPRINT4CONTRACT.md`
|
||||
|
||||
Objective:
|
||||
|
||||
- Build the direct local MinerU adapter boundary with mocked outputs first.
|
||||
|
||||
Touched surfaces:
|
||||
|
||||
- `mineru_adapter.py`
|
||||
- `doctor.py` partial checks
|
||||
- Adapter tests with fake subprocess results and fake output directories
|
||||
|
||||
Expected outputs:
|
||||
|
||||
- Adapter availability check.
|
||||
- Version check.
|
||||
- Direct CLI command construction.
|
||||
- Strict-local command validation.
|
||||
- Subprocess execution wrapper capturing stdout, stderr, exit code, and paths.
|
||||
- Parsed adapter result object with raw Markdown, raw structured data when available, assets, warnings, engine, engine version, options, exit code, and stderr.
|
||||
- Baseline command shape based on MinerU 3.1.0 direct local CLI: `mineru -p <input> -o <output>`.
|
||||
- Strict-local validation allows CLI-internal temporary local `mineru-api` orchestration, while rejecting `--api-url`, remote APIs, router mode, HTTP client backends, and remote OpenAI-compatible backends.
|
||||
|
||||
Verification checks:
|
||||
|
||||
- Mocked successful MinerU output test.
|
||||
- Mocked missing MinerU test.
|
||||
- Mocked non-zero exit test.
|
||||
- Test that prohibited remote/API flags cannot be introduced.
|
||||
- No real MinerU/model dependency in default tests.
|
||||
|
||||
Hard failure criteria:
|
||||
|
||||
- Adapter passes `--api-url`, uses router mode, uses an HTTP client backend, or connects to a remote API or remote OpenAI-compatible backend.
|
||||
- Adapter falls back to another engine after MinerU failure.
|
||||
- Tests require model downloads by default.
|
||||
|
||||
Primary agents:
|
||||
|
||||
- `mineru-integration-agent`
|
||||
- `feature-generator-agent`
|
||||
- `evaluation-agent`
|
||||
|
||||
### Sprint 5: Obsidian Markdown Normalization And Assets
|
||||
|
||||
Active contract:
|
||||
|
||||
- `docs/Sprints/SPRINT5CONTRACT.md`
|
||||
|
||||
Objective:
|
||||
|
||||
- Normalize MinerU/project IR output into Obsidian-friendly Markdown.
|
||||
|
||||
Touched surfaces:
|
||||
|
||||
- `markdown.py`
|
||||
- `quality.py` partial asset link checks
|
||||
- Unit tests
|
||||
|
||||
Expected outputs:
|
||||
|
||||
- Inline math delimiter normalization to `$...$`.
|
||||
- Display math delimiter normalization to `$$...$$`.
|
||||
- Blank-line normalization around display math.
|
||||
- Relative asset link normalization.
|
||||
- Simple table preservation and complex table fallback warnings.
|
||||
- No visible page markers by default.
|
||||
|
||||
Verification checks:
|
||||
|
||||
- Unit tests for inline math.
|
||||
- Unit tests for display math spacing.
|
||||
- Unit tests for underscores/carets inside math.
|
||||
- Unit tests for relative asset links.
|
||||
- Unit tests for table fallback warning behavior.
|
||||
|
||||
Hard failure criteria:
|
||||
|
||||
- Normalization rewrites LaTeX semantics without deterministic tests.
|
||||
- Generated links are absolute when relative links are required.
|
||||
- Page provenance is only visible in Markdown and missing from metadata.
|
||||
|
||||
Primary agents:
|
||||
|
||||
- `obsidian-markdown-agent`
|
||||
- `feature-generator-agent`
|
||||
- `evaluation-agent`
|
||||
|
||||
### Sprint 6: Quality Checks And Report Generation
|
||||
|
||||
Active contract:
|
||||
|
||||
- `docs/Sprints/SPRINT6CONTRACT.md`
|
||||
|
||||
Objective:
|
||||
|
||||
- Produce local quality signals and human-readable reports from metadata.
|
||||
|
||||
Touched surfaces:
|
||||
|
||||
- `quality.py`
|
||||
- `report.py`
|
||||
- `metadata.py`
|
||||
- Unit tests
|
||||
|
||||
Expected outputs:
|
||||
|
||||
- Missing asset link count.
|
||||
- Math renderability check interface with graceful unavailable-tool handling.
|
||||
- Pages-with-warnings summary.
|
||||
- `<stem>.report.md` generated from metadata.
|
||||
- Final status: `success`, `partial`, or `failed`.
|
||||
|
||||
Verification checks:
|
||||
|
||||
- Unit tests for report content.
|
||||
- Unit tests for missing asset link count.
|
||||
- Unit tests for math render failure aggregation.
|
||||
- Report generation does not re-run MinerU.
|
||||
|
||||
Hard failure criteria:
|
||||
|
||||
- Report diverges from JSON metadata.
|
||||
- Math render failures are silently ignored.
|
||||
- Quality checks require network access.
|
||||
|
||||
Primary agents:
|
||||
|
||||
- `metadata-agent`
|
||||
- `evaluation-agent`
|
||||
- `feature-generator-agent`
|
||||
|
||||
### Sprint 7: Conversion Orchestrator, CLI, And Python API
|
||||
|
||||
Active contract:
|
||||
|
||||
- `docs/Sprints/SPRINT7CONTRACT.md`
|
||||
|
||||
Objective:
|
||||
|
||||
- Connect path planning, MinerU adapter, normalization, metadata, report, and summaries.
|
||||
|
||||
Touched surfaces:
|
||||
|
||||
- `conversion.py`
|
||||
- `cli.py`
|
||||
- `__init__.py`
|
||||
- CLI and API tests
|
||||
|
||||
Expected outputs:
|
||||
|
||||
- `convert_pdf(input_path, output_dir, metadata=True)` public API.
|
||||
- `pdf2md convert INPUT --out OUTPUT_DIR`.
|
||||
- `--metadata`, `--keep-raw`, `--recursive`, `--overwrite`, `--gpu`, and `--strict-local` behavior.
|
||||
- Batch conversion for directories.
|
||||
- CLI summary with warning counts.
|
||||
|
||||
Verification checks:
|
||||
|
||||
- API test with mocked MinerU adapter.
|
||||
- CLI single PDF test with mocked MinerU adapter.
|
||||
- CLI directory test with mocked MinerU adapter.
|
||||
- Existing output test.
|
||||
- Failure summary test.
|
||||
|
||||
Hard failure criteria:
|
||||
|
||||
- Public API exposes raw MinerU objects as required return fields.
|
||||
- CLI writes outputs after a hard failure that should stop conversion.
|
||||
- CLI suppresses warning counts.
|
||||
|
||||
Primary agents:
|
||||
|
||||
- `feature-generator-agent`
|
||||
- `requirements-guard-agent`
|
||||
- `evaluation-agent`
|
||||
|
||||
### Sprint 8: Doctor And Setup Documentation
|
||||
|
||||
Active contract:
|
||||
|
||||
- `docs/Sprints/SPRINT8CONTRACT.md`
|
||||
|
||||
Status:
|
||||
|
||||
- Implemented.
|
||||
|
||||
Objective:
|
||||
|
||||
- Make local setup failures explicit before users run conversions.
|
||||
|
||||
Touched surfaces:
|
||||
|
||||
- `doctor.py`
|
||||
- `cli.py`
|
||||
- `README.md`
|
||||
- `scripts/install-mineru.ps1`
|
||||
- `scripts/install-models.py`
|
||||
- Tests for mocked environment checks
|
||||
|
||||
Expected outputs:
|
||||
|
||||
- `pdf2md doctor` reports Python version, `uv`, CUDA/PyTorch GPU visibility, MinerU availability, MinerU version, and detectable model/cache paths.
|
||||
- GPU unavailable warning is clear.
|
||||
- Missing `uv` is reported clearly.
|
||||
- Pre-Turing/Pascal GPU risk is reported clearly for GTX 1070 Ti compute capability 6.1.
|
||||
- Missing required dependency causes doctor failure.
|
||||
- Setup docs explain Windows PowerShell, Python 3.12, `uv`, MinerU, models, GPU expectations, and local-only behavior.
|
||||
|
||||
Verification checks:
|
||||
|
||||
- Mocked doctor tests for success, missing MinerU, missing GPU, and missing dependency.
|
||||
- Documentation review for no cloud/API runtime path.
|
||||
|
||||
Hard failure criteria:
|
||||
|
||||
- Doctor says the environment is healthy when MinerU is missing.
|
||||
- Doctor implies cloud/API fallback is supported.
|
||||
|
||||
Primary agents:
|
||||
|
||||
- `local-setup-agent`
|
||||
- `license-privacy-agent`
|
||||
- `evaluation-agent`
|
||||
|
||||
### Sprint 9: Local Fixture Evaluation And V1 Release Gate
|
||||
|
||||
Active contract:
|
||||
|
||||
- `docs/Sprints/SPRINT9CONTRACT.md`
|
||||
|
||||
Status:
|
||||
|
||||
- Implemented.
|
||||
|
||||
Objective:
|
||||
|
||||
- Validate the end-to-end v1 behavior against local samples without committing samples.
|
||||
|
||||
Touched surfaces:
|
||||
|
||||
- `tests/integration/`
|
||||
- Optional local-only fixture manifest that does not include sample PDFs
|
||||
- `README.md`
|
||||
- `PROGRESS.md`
|
||||
|
||||
Expected outputs:
|
||||
|
||||
- Fast mocked integration suite.
|
||||
- Optional MinerU-dependent local test command.
|
||||
- Local sample coverage notes in `PROGRESS.md`.
|
||||
- V1 release checklist status.
|
||||
|
||||
Verification checks:
|
||||
|
||||
- `uv run pytest` passes without model downloads.
|
||||
- Optional MinerU test is clearly marked and skipped unless explicitly enabled.
|
||||
- Representative sample produces Markdown, metadata JSON, report Markdown, and asset paths.
|
||||
- Obsidian math delimiter expectations are met.
|
||||
- No sample PDFs are staged.
|
||||
|
||||
Hard failure criteria:
|
||||
|
||||
- Default tests require GPU, MinerU models, or network access.
|
||||
- Sample files are added to git unintentionally.
|
||||
- V1 release checklist passes without metadata/report generation.
|
||||
|
||||
Primary agents and skills:
|
||||
|
||||
- `evaluation-agent`
|
||||
- `requirements-guard-agent`
|
||||
- `fixture-evaluation` skill
|
||||
|
||||
### Sprint 10: Pre-Conversion PDF Page Chunking
|
||||
|
||||
Active contract:
|
||||
|
||||
- `docs/Sprints/SPRINT10CONTRACT.md`
|
||||
|
||||
Status:
|
||||
|
||||
- Implemented.
|
||||
|
||||
Objective:
|
||||
|
||||
- Split long PDFs into temporary fixed-size page chunks before MinerU conversion.
|
||||
|
||||
Touched surfaces:
|
||||
|
||||
- `pdf_splitter.py`
|
||||
- `conversion.py`
|
||||
- `cli.py`
|
||||
- `report.py`
|
||||
- README and Sprint 10 documentation
|
||||
- Unit tests for splitter, conversion, CLI, and report behavior
|
||||
|
||||
Expected outputs:
|
||||
|
||||
- `pdf2md convert INPUT --out OUTPUT --chunk-pages` (no value) enables chunking with the default 20-page chunk size.
|
||||
- `pdf2md convert INPUT --out OUTPUT --chunk-pages N` enables custom positive chunk size.
|
||||
- `convert_pdf(..., chunk_pages=N)` returns a `BatchConversionResult` in chunk mode.
|
||||
- Temporary chunk PDFs are deleted after conversion completes.
|
||||
- Chunk Markdown files are separate and named with original page ranges.
|
||||
- Metadata and report content expose original source path and chunk page ranges.
|
||||
|
||||
Verification checks:
|
||||
|
||||
- pypdf-based local blank PDF tests cover page counts, chunk ranges, and written chunk page counts.
|
||||
- Mocked conversion tests verify one adapter call per chunk, failed-chunk continuation, chunk metadata/report context, and temporary chunk cleanup.
|
||||
- CLI tests verify `--chunk-pages` without a value uses 20 pages.
|
||||
|
||||
Hard failure criteria:
|
||||
|
||||
- Chunking uploads document content or uses another conversion engine.
|
||||
- Chunk outputs are merged.
|
||||
- Default tests require real MinerU, GPU, model files, network, Obsidian, LaTeX tooling, or `samples/`.
|
||||
|
||||
## 6. Cross-Cutting Acceptance Criteria
|
||||
|
||||
Every implementation sprint must preserve these acceptance criteria:
|
||||
|
||||
- No runtime remote document processing path exists.
|
||||
- MinerU is the only conversion engine.
|
||||
- Failures are explicit and traceable.
|
||||
- Warnings are structured and countable.
|
||||
- Markdown and metadata can be traced back to source pages where available.
|
||||
- Reports are generated from metadata.
|
||||
- Default tests are fast and local.
|
||||
- `samples/` remains untracked unless explicitly requested.
|
||||
|
||||
## 7. First Implementation Request Contract Template
|
||||
|
||||
Use this template when implementation begins.
|
||||
|
||||
```markdown
|
||||
## Sprint Contract
|
||||
|
||||
Objective:
|
||||
|
||||
Touched surfaces:
|
||||
|
||||
Expected outputs:
|
||||
|
||||
Non-goals:
|
||||
|
||||
Verification checks:
|
||||
|
||||
Hard failure criteria:
|
||||
|
||||
Handoff fields:
|
||||
- Files changed:
|
||||
- Commands run:
|
||||
- Tests passed:
|
||||
- Known failures:
|
||||
- Residual risks:
|
||||
- Next action:
|
||||
```
|
||||
|
||||
## 8. Open Risks
|
||||
|
||||
- MinerU 3.1.0 install and CLI behavior are source-verified, but real local output still needs a later local probe before release.
|
||||
- GTX 1070 Ti 8GB is visible locally, but it is Pascal compute capability 6.1; `doctor` and setup docs must make CUDA/PyTorch limits clear.
|
||||
- `uv` is installed per-user at `C:\Users\user\.local\bin`, but a new shell may need a PATH refresh before `uv` is visible.
|
||||
- Formula renderability checks need a local tool choice; the implementation should start with an interface and graceful unavailable-tool warning if needed.
|
||||
- Some PDFs will have tables or formulas that cannot be faithfully represented in Markdown; metadata and `.report.md` must surface this instead of hiding it.
|
||||
- Redistribution license obligations must be reviewed before packaging, redistribution, or bundling model weights.
|
||||
|
||||
## 9. Recommended Next Step
|
||||
|
||||
Run optional real local MinerU validation on a long sample only when requested. Default verification should continue to use mocked adapters and generated temporary PDFs so it remains independent of MinerU, GPU, model files, network access, and `samples/`.
|
||||
|
||||
Facts carried forward from Sprint 0:
|
||||
|
||||
- MinerU is fixed to version 3.1.0.
|
||||
- Direct local CLI command shape is `mineru -p <input> -o <output>`.
|
||||
- MinerU output layout should be treated as optional-file based until locally probed.
|
||||
- Python 3.12 is compatible with the pinned MinerU package range.
|
||||
- GTX 1070 Ti CUDA/PyTorch support needs explicit doctor validation.
|
||||
- MinerU/model license posture is acceptable for personal local use. Redistribution remains out of scope until reviewed.
|
||||
@@ -0,0 +1,146 @@
|
||||
# V1 Release Checklist
|
||||
|
||||
Use this checklist from the repository root when deciding whether v1 is ready for local use. It separates the default fast gates from optional MinerU/GPU/sample validation so the normal loop stays independent of real models, GPU, network access, Obsidian, LaTeX tooling, and `samples/`.
|
||||
|
||||
## Release Status Rules
|
||||
|
||||
- Default fast gates passing means the repository is healthy under mocked/local checks.
|
||||
- Optional local MinerU fixture gates passing means real local sample conversion has also been validated.
|
||||
- If `pdf2md doctor` reports a hard failure, v1 release status is blocked unless the user records an explicit waiver with the exact reason and scope.
|
||||
- Do not mark v1 fully validated when optional MinerU/sample checks are skipped or blocked. Record the blocked reason and release recommendation in `PROGRESS.md`.
|
||||
- Do not claim perfect LaTeX, table, or reading-order reconstruction. The v1 guarantee is best-effort local conversion with warnings, metadata, and a human-readable report.
|
||||
|
||||
## Default Fast Gates
|
||||
|
||||
These gates should be runnable without real MinerU execution, sample PDFs, model files, GPU acceleration, network access, Obsidian, or LaTeX tooling. They must not install packages or download models during imports, tests, `doctor`, or `convert`.
|
||||
|
||||
| Gate | Pass condition | Release handling |
|
||||
| --- | --- | --- |
|
||||
| `uv sync` | Exits 0. | Blocks if project dependencies cannot sync. |
|
||||
| `uv run pytest` | Exits 0 using mocked/local tests only. | Blocks if default tests require real MinerU, GPU, CUDA, PyTorch, model files, network, Obsidian, LaTeX tooling, or `samples/`. |
|
||||
| `uv run pdf2md --version` | Exits 0 and prints the installed CLI version. | Blocks if the CLI entry point is unavailable. |
|
||||
| `uv run pdf2md doctor` | Runs to completion and reports pass, warning-only, or hard-fail status. | A hard failure blocks release readiness unless explicitly waived by the user and recorded. Warning-only status can continue, but warnings must be recorded. |
|
||||
| `git diff --check` | Exits 0. | Blocks on whitespace or patch formatting errors. |
|
||||
| `git status --short --untracked-files=all` | Shows no staged sample PDFs or generated sample outputs. | Blocks if any `samples/` file or sample-derived output is staged or committed unintentionally. |
|
||||
|
||||
## Strict-Local Gate
|
||||
|
||||
Before calling v1 ready, verify the release candidate still follows the strict-local policy:
|
||||
|
||||
- Allowed runtime path: direct local `mineru` CLI execution.
|
||||
- Allowed MinerU internal behavior: MinerU 3.1.0 may start a temporary local `mineru-api` when the CLI runs without `--api-url`.
|
||||
- Prohibited runtime paths: `--api-url`, remote APIs, router mode, HTTP client backends, remote OpenAI-compatible backends, hosted renderers, cloud OCR, remote LLM/VLM calls, remote document parsers, alternate engines, runtime engine selection, and silent fallback after MinerU failure.
|
||||
- Setup downloads must be explicit user actions and remain separate from runtime conversion. Default tests, imports, `doctor`, `convert`, and helper checks must not download packages or models.
|
||||
|
||||
## Doctor Hard-Failure Handling
|
||||
|
||||
Treat a non-zero `pdf2md doctor` exit code as a release blocker for real v1 readiness. Common hard failures include missing required local dependencies, missing `mineru` on PATH, or a strict-local policy failure. Warning-only doctor results can continue, but the warnings must be recorded.
|
||||
|
||||
When `doctor` hard-fails:
|
||||
|
||||
- Do not run optional sample conversion as a passing release gate.
|
||||
- Do not mark optional MinerU/GPU/sample validation as skipped-pass. Mark it blocked.
|
||||
- Do not use a cloud/API fallback or alternate converter to bypass the failure.
|
||||
- Record the failing check, exit code, and next action in `PROGRESS.md`.
|
||||
- If the user chooses to proceed anyway, record the waiver and report the release as waived or risk-accepted, not fully validated.
|
||||
|
||||
## Optional Local MinerU, GPU, And Sample Gates
|
||||
|
||||
Run these only by explicit opt-in after the default gates. They are intended for a local workstation with MinerU 3.1.0, model/cache setup, and GPU/PyTorch expectations already checked by `doctor`.
|
||||
|
||||
Preconditions:
|
||||
|
||||
- `uv run pdf2md doctor` has no hard failures, or the user has recorded an explicit waiver.
|
||||
- Source PDFs come from local `samples/` or another user-provided local directory.
|
||||
- Generated outputs go to a temporary directory or another ignored local output root, never to tracked fixture paths.
|
||||
- Sample PDFs, `samples/metadata.json`, and generated sample outputs remain unstaged and uncommitted.
|
||||
|
||||
Manual single-sample shape:
|
||||
|
||||
```powershell
|
||||
$sample = "samples\FourNodeQuadrilateralShellElementMITC4.pdf"
|
||||
$out = Join-Path $env:TEMP ("pdf2md-fixture-" + [guid]::NewGuid().ToString())
|
||||
uv run pdf2md convert $sample --out $out --metadata --overwrite
|
||||
```
|
||||
|
||||
Optional pytest fixture evaluation:
|
||||
|
||||
```powershell
|
||||
$env:PDF2MD_RUN_MINERU_FIXTURES = "1"
|
||||
uv run pytest tests/integration/test_optional_mineru_fixtures.py
|
||||
Remove-Item Env:PDF2MD_RUN_MINERU_FIXTURES
|
||||
```
|
||||
|
||||
This optional pytest path runs `pdf2md doctor` first. If `doctor` reports a hard failure, the fixture test is skipped with a blocker message; record the result as blocked rather than counting it as a passing real-MinerU validation.
|
||||
|
||||
A sample conversion is successful only when all of these are true:
|
||||
|
||||
- The command exits 0.
|
||||
- The planned Markdown file exists: `<output>\<stem>.md`.
|
||||
- The planned metadata JSON exists: `<output>\<stem>.metadata.json`.
|
||||
- The planned quality report exists: `<output>\<stem>.report.md`.
|
||||
- Metadata and report warning counts are consistent enough to explain math, table, reading-order, asset, MinerU, and checker-unavailable risks.
|
||||
- Any Markdown image links resolve relative to the Markdown file, or missing/broken links are reported as warnings.
|
||||
|
||||
Missing Markdown, metadata JSON, or `.report.md` means the sample failed or is blocked. Do not count it as a partial success for release gating.
|
||||
|
||||
For each attempted sample, record at least:
|
||||
|
||||
- Source filename.
|
||||
- Command run.
|
||||
- Exit code.
|
||||
- Generated Markdown path.
|
||||
- Generated metadata JSON path.
|
||||
- Generated `.report.md` path.
|
||||
- Warning count and final status.
|
||||
- Math renderability failures or checker-unavailable count.
|
||||
- Table fallback or degradation count when available.
|
||||
- Missing or broken asset link count.
|
||||
- Page coverage when available.
|
||||
- Doctor status and any GPU/PyTorch/model/cache warnings.
|
||||
|
||||
## Fixture Coverage Notes
|
||||
|
||||
Local fixture coverage should include these risk categories where samples are available:
|
||||
|
||||
- Simple digital PDF with a text layer.
|
||||
- Math-heavy PDF.
|
||||
- Multi-column or complex reading order.
|
||||
- Table with formulas.
|
||||
- Figure, caption, or extracted asset links.
|
||||
- Korean or non-ASCII filename/path handling.
|
||||
|
||||
Observed local fixture map as of 2026-05-08:
|
||||
|
||||
| Local sample | Fixture risks covered | Notes |
|
||||
| --- | --- | --- |
|
||||
| `samples/FourNodeQuadrilateralShellElementMITC4.pdf` | simple digital PDF, math-heavy engineering content, table/formula risk | Small sample suitable for first optional MinerU smoke validation. |
|
||||
| `samples/MITC공부.pdf` | Korean filename/path handling, math-heavy notes, table/formula risk | Useful for non-ASCII path and shell-element notation checks. |
|
||||
| `samples/2007쉘구조물의유한요소해석에대하여.pdf` | Korean filename/path handling, longer academic layout, math-heavy content, reading-order risk | Larger sample; use after the small smoke sample passes. |
|
||||
| `samples/유한요소해석법을이용한쉘구조물의동적좌굴해석.pdf` | Korean filename/path handling, math-heavy content, figures/assets, reading-order risk | Larger sample for report and warning quality review. |
|
||||
| `samples/metadata.json` | fixture notes only | Local untracked context; do not treat as generated v1 metadata unless manually confirmed. |
|
||||
|
||||
Coverage gaps to keep visible:
|
||||
|
||||
- A deliberately simple one-page digital PDF fixture is still useful for release smoke checks.
|
||||
- A table-dominant sample with known formula cells would make table degradation easier to judge.
|
||||
- A figure-heavy sample with expected extracted assets would make asset link validation easier to judge.
|
||||
|
||||
Do not score fixture quality only by plain-text edit distance. Include math delimiter/renderability behavior, tables, reading order, assets, metadata fields, warning usefulness, and `.report.md` usefulness.
|
||||
|
||||
## No-Sample-Commit Check
|
||||
|
||||
Before staging or committing any release-gate work:
|
||||
|
||||
```powershell
|
||||
git status --short --untracked-files=all
|
||||
```
|
||||
|
||||
Confirm:
|
||||
|
||||
- `samples/` files are not staged.
|
||||
- Generated sample outputs are not staged.
|
||||
- No sample PDF or sample-derived binary was copied into tracked tests or docs.
|
||||
- Any temporary fixture output inside the repository was removed unless the user explicitly approved keeping it and it is ignored.
|
||||
|
||||
`samples/` may appear as untracked local context. That is expected; do not add it unless the user explicitly requests it.
|
||||
Generated
+30
@@ -0,0 +1,30 @@
|
||||
{
|
||||
"name": "convert-pdf-to-md",
|
||||
"version": "0.1.0",
|
||||
"lockfileVersion": 3,
|
||||
"requires": true,
|
||||
"packages": {
|
||||
"": {
|
||||
"name": "convert-pdf-to-md",
|
||||
"version": "0.1.0",
|
||||
"dependencies": {
|
||||
"mathjax": "4.1.2"
|
||||
}
|
||||
},
|
||||
"node_modules/@mathjax/mathjax-newcm-font": {
|
||||
"version": "4.1.2",
|
||||
"resolved": "https://registry.npmjs.org/@mathjax/mathjax-newcm-font/-/mathjax-newcm-font-4.1.2.tgz",
|
||||
"integrity": "sha512-lZHMjNP2XbABHA3kVn40rbse5ERUeMEmrGH03qLkCwxq4/5Z/eNLr0akw1MmQcqTwCbvkx1BFcmJ7RCfbRlw3Q==",
|
||||
"license": "Apache-2.0"
|
||||
},
|
||||
"node_modules/mathjax": {
|
||||
"version": "4.1.2",
|
||||
"resolved": "https://registry.npmjs.org/mathjax/-/mathjax-4.1.2.tgz",
|
||||
"integrity": "sha512-EQDS8xBpVg179BXoLeZ9JlwUFftOC5qylw20UlAMDhrTuooENigOocY79aNkkFSyvj/AST/89ZAo12+r5bPI4w==",
|
||||
"license": "Apache-2.0",
|
||||
"dependencies": {
|
||||
"@mathjax/mathjax-newcm-font": "^4.1.2"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,12 @@
|
||||
{
|
||||
"name": "convert-pdf-to-md",
|
||||
"version": "0.1.0",
|
||||
"private": true,
|
||||
"type": "module",
|
||||
"scripts": {
|
||||
"mathjax-checker:health": "node tools/mathjax-checker/check.mjs --health"
|
||||
},
|
||||
"dependencies": {
|
||||
"mathjax": "4.1.2"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,24 @@
|
||||
[build-system]
|
||||
requires = ["hatchling>=1.27"]
|
||||
build-backend = "hatchling.build"
|
||||
|
||||
[project]
|
||||
name = "convert-pdf-to-md"
|
||||
version = "0.1.0"
|
||||
description = "Local PDF-to-Markdown converter scaffold for math-heavy documents."
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.12,<3.13"
|
||||
dependencies = [
|
||||
"pypdf>=6.10.2,<7",
|
||||
]
|
||||
|
||||
[project.scripts]
|
||||
pdf2md = "pdf2md.cli:main"
|
||||
|
||||
[dependency-groups]
|
||||
dev = [
|
||||
"pytest>=8.3",
|
||||
]
|
||||
|
||||
[tool.hatch.build.targets.wheel]
|
||||
packages = ["src/pdf2md"]
|
||||
File diff suppressed because one or more lines are too long
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,91 @@
|
||||
{
|
||||
"samples": [
|
||||
{
|
||||
"page_count": 13,
|
||||
"path": "samples/2007쉘구조물의유한요소해석에대하여.pdf",
|
||||
"traits": {
|
||||
"figure_density": "moderate",
|
||||
"formula_density": "moderate",
|
||||
"has_korean_path": true,
|
||||
"layout_risk": "moderate",
|
||||
"mixed_scanned_text_pages": false,
|
||||
"scanned_pages": [],
|
||||
"table_density": "low",
|
||||
"target_regression_focus": [
|
||||
"korean_path",
|
||||
"text_layer",
|
||||
"formulas",
|
||||
"figures"
|
||||
],
|
||||
"text_layer_quality": "good"
|
||||
}
|
||||
},
|
||||
{
|
||||
"page_count": 7,
|
||||
"path": "samples/FourNodeQuadrilateralShellElementMITC4.pdf",
|
||||
"traits": {
|
||||
"figure_density": "low",
|
||||
"formula_density": "high",
|
||||
"has_korean_path": false,
|
||||
"layout_risk": "moderate",
|
||||
"mixed_scanned_text_pages": false,
|
||||
"scanned_pages": [],
|
||||
"table_density": "low",
|
||||
"target_regression_focus": [
|
||||
"english_path",
|
||||
"text_layer",
|
||||
"formulas",
|
||||
"layout"
|
||||
],
|
||||
"text_layer_quality": "good"
|
||||
}
|
||||
},
|
||||
{
|
||||
"page_count": 13,
|
||||
"path": "samples/MITC공부.pdf",
|
||||
"traits": {
|
||||
"figure_density": "high",
|
||||
"formula_density": "moderate",
|
||||
"has_korean_path": true,
|
||||
"layout_risk": "high",
|
||||
"mixed_scanned_text_pages": false,
|
||||
"scanned_pages": [],
|
||||
"table_density": "moderate",
|
||||
"target_regression_focus": [
|
||||
"korean_path",
|
||||
"figures",
|
||||
"tables",
|
||||
"layout"
|
||||
],
|
||||
"text_layer_quality": "good"
|
||||
}
|
||||
},
|
||||
{
|
||||
"page_count": 76,
|
||||
"path": "samples/유한요소해석법을이용한쉘구조물의동적좌굴해석.pdf",
|
||||
"traits": {
|
||||
"figure_density": "high",
|
||||
"formula_density": "moderate",
|
||||
"has_korean_path": true,
|
||||
"layout_risk": "high",
|
||||
"mixed_scanned_text_pages": true,
|
||||
"scanned_pages": [
|
||||
4,
|
||||
12,
|
||||
64,
|
||||
70
|
||||
],
|
||||
"table_density": "moderate",
|
||||
"target_regression_focus": [
|
||||
"korean_path",
|
||||
"mixed_text_quality",
|
||||
"long_document",
|
||||
"figures",
|
||||
"tables"
|
||||
],
|
||||
"text_layer_quality": "mixed"
|
||||
}
|
||||
}
|
||||
],
|
||||
"schema_version": 1
|
||||
}
|
||||
Binary file not shown.
@@ -0,0 +1,22 @@
|
||||
"""Project package for the local PDF-to-Markdown converter."""
|
||||
|
||||
from pdf2md.conversion import (
|
||||
DEFAULT_CHUNK_PAGES,
|
||||
DEFAULT_GPU_DEVICE,
|
||||
BatchConversionResult,
|
||||
ConversionResult,
|
||||
convert_input,
|
||||
convert_pdf,
|
||||
)
|
||||
|
||||
__version__ = "0.1.0"
|
||||
|
||||
__all__ = [
|
||||
"BatchConversionResult",
|
||||
"ConversionResult",
|
||||
"DEFAULT_CHUNK_PAGES",
|
||||
"DEFAULT_GPU_DEVICE",
|
||||
"__version__",
|
||||
"convert_input",
|
||||
"convert_pdf",
|
||||
]
|
||||
@@ -0,0 +1,119 @@
|
||||
"""Command line interface for local PDF-to-Markdown conversion."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from collections.abc import Sequence
|
||||
|
||||
from pdf2md import __version__
|
||||
from pdf2md.conversion import DEFAULT_CHUNK_PAGES, DEFAULT_GPU_DEVICE, ConversionAdapter, convert_input
|
||||
from pdf2md.doctor import DoctorReport, format_doctor_report, run_doctor
|
||||
from pdf2md.mineru_adapter import StrictLocalViolationError
|
||||
from pdf2md.paths import PathPlanningError
|
||||
|
||||
|
||||
def _build_parser() -> argparse.ArgumentParser:
    """Build the ``pdf2md`` argument parser with its ``doctor`` and ``convert`` subcommands."""
    parser = argparse.ArgumentParser(
        prog="pdf2md",
        description="Local PDF-to-Markdown converter.",
    )
    parser.add_argument(
        "--version",
        action="store_true",
        help="Show the installed pdf2md version.",
    )
    subparsers = parser.add_subparsers(dest="command")
    subparsers.add_parser("doctor", help="Check local setup requirements.")

    convert_parser = subparsers.add_parser("convert", help="Convert a PDF or directory of PDFs.")
    convert_parser.add_argument("input", help="Input PDF file or directory.")
    convert_parser.add_argument("--out", required=True, help="Output directory.")
    # store_true with default=True: the flag is accepted but cannot switch metadata off.
    convert_parser.add_argument("--metadata", action="store_true", default=True, help="Write metadata JSON. Enabled by default.")
    for flag, help_text in (
        ("--keep-raw", "Keep raw MinerU output."),
        ("--recursive", "Recursively discover PDFs in directories."),
        ("--overwrite", "Overwrite planned outputs."),
    ):
        convert_parser.add_argument(flag, action="store_true", help=help_text)
    # nargs="?" + const: a bare ``--chunk-pages`` selects DEFAULT_CHUNK_PAGES;
    # omitting the flag entirely leaves chunking disabled (None).
    convert_parser.add_argument(
        "--chunk-pages",
        nargs="?",
        const=DEFAULT_CHUNK_PAGES,
        default=None,
        type=_positive_int,
        metavar="PAGES",
        help=(
            "Opt in to pre-conversion PDF chunking. If PAGES is omitted, "
            f"{DEFAULT_CHUNK_PAGES} pages per chunk is used."
        ),
    )
    convert_parser.add_argument(
        "--gpu",
        default=DEFAULT_GPU_DEVICE,
        help=f"CUDA device. Defaults to {DEFAULT_GPU_DEVICE}.",
    )
    convert_parser.add_argument(
        "--strict-local",
        action="store_true",
        default=True,
        help="Keep strict-local conversion policy enabled. Enabled by default.",
    )
    return parser


def main(
    argv: Sequence[str] | None = None,
    *,
    adapter: ConversionAdapter | None = None,
    clock=None,
    doctor_runner=None,
) -> int:
    """Run the ``pdf2md`` command line interface and return a process exit code.

    Args:
        argv: Arguments to parse; ``None`` lets argparse fall back to ``sys.argv``.
        adapter: Optional conversion adapter forwarded to ``convert_input``.
        clock: Optional clock callable forwarded to ``convert_input``.
        doctor_runner: Optional zero-argument callable used in place of ``run_doctor``.

    Returns:
        ``0`` for ``--version``, for the help fallback, and for a conversion
        batch without failures; the doctor report's ``exit_code`` for
        ``doctor``; ``2`` when conversion raises a handled planning,
        strict-local, or value error; ``1`` when any conversion failed.
    """
    parser = _build_parser()
    args = parser.parse_args(argv)

    if args.version:
        print(f"pdf2md {__version__}")
        return 0

    if args.command == "doctor":
        runner = run_doctor if doctor_runner is None else doctor_runner
        report: DoctorReport = runner()
        print(format_doctor_report(report))
        return report.exit_code

    if args.command != "convert":
        # No subcommand given: show usage rather than failing.
        parser.print_help()
        return 0

    try:
        batch = convert_input(
            args.input,
            args.out,
            metadata=args.metadata,
            keep_raw=args.keep_raw,
            recursive=args.recursive,
            overwrite=args.overwrite,
            chunk_pages=args.chunk_pages,
            gpu=args.gpu,
            strict_local=args.strict_local,
            adapter=adapter,
            clock=clock,
        )
    except (PathPlanningError, StrictLocalViolationError, ValueError) as error:
        print(f"error: {error}", file=sys.stderr)
        return 2

    for result in batch.results:
        label = "converted" if result.succeeded else "failed"
        print(f"{label}: {result.source_pdf} -> {result.markdown_path} ({result.warning_count} warnings)")
    print(f"converted: {batch.converted_count}")
    print(f"failed: {batch.failed_count}")
    print(f"warnings: {batch.warning_count}")
    return 1 if batch.failed_count else 0
|
||||
|
||||
|
||||
def _positive_int(value: str) -> int:
|
||||
try:
|
||||
parsed = int(value)
|
||||
except ValueError as error:
|
||||
raise argparse.ArgumentTypeError("must be a positive integer") from error
|
||||
if parsed < 1:
|
||||
raise argparse.ArgumentTypeError("must be a positive integer")
|
||||
return parsed
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
@@ -0,0 +1,785 @@
|
||||
"""Conversion orchestration for local PDF-to-Markdown output."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import re
|
||||
import shutil
|
||||
import tempfile
|
||||
from collections.abc import Callable
|
||||
from dataclasses import dataclass, replace
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path, PurePosixPath
|
||||
from typing import Protocol
|
||||
|
||||
from pdf2md.ir import (
|
||||
AssetRecord,
|
||||
BlockRecord,
|
||||
BlockType,
|
||||
DocumentRecord,
|
||||
PageRecord,
|
||||
WarningCode,
|
||||
WarningRecord,
|
||||
WarningSeverity,
|
||||
)
|
||||
from pdf2md.markdown import normalize_markdown
|
||||
from pdf2md.math_render import create_default_math_checker
|
||||
from pdf2md.metadata import build_metadata
|
||||
from pdf2md.mineru_adapter import (
|
||||
ENGINE_NAME,
|
||||
MinerUAdapter,
|
||||
MinerUAdapterResult,
|
||||
MinerUOptions,
|
||||
StrictLocalViolationError,
|
||||
)
|
||||
from pdf2md.paths import DiscoveredPdf, PathLike, PlannedOutput, discover_pdfs, plan_outputs
|
||||
from pdf2md.pdf_splitter import PdfChunkPlan, plan_pdf_chunks, write_pdf_chunk
|
||||
from pdf2md.quality import MathChecker, QualityResult, check_asset_links, check_math_renderability, merge_quality_results
|
||||
from pdf2md.report import FinalStatus, determine_final_status, render_report
|
||||
|
||||
|
||||
Clock = Callable[[], datetime]
|
||||
DEFAULT_GPU_DEVICE = "cuda:0"
|
||||
DEFAULT_CHUNK_PAGES = 20
|
||||
|
||||
|
||||
class ConversionAdapter(Protocol):
    """Structural type for conversion engines accepted by the orchestration functions."""

    def convert(
        self,
        input_pdf: PathLike,
        work_dir: PathLike,
        options: MinerUOptions | None = None,
    ) -> MinerUAdapterResult:
        """Run the conversion engine into a local work directory."""
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ConversionResult:
    """Immutable record describing the outcome of converting one PDF (or one chunk)."""

    # Input and planned output locations.
    source_pdf: Path
    markdown_path: Path
    metadata_path: Path | None
    report_path: Path
    assets_dir: Path
    raw_dir: Path | None
    # Engine identification.
    engine: str
    engine_version: str
    # Outcome summary.
    final_status: FinalStatus
    warning_count: int
    warnings: tuple[WarningRecord, ...]
    pages_processed: int

    @property
    def succeeded(self) -> bool:
        """True unless ``final_status`` equals ``"failed"``."""
        return not (self.final_status == "failed")
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class BatchConversionResult:
    """Immutable collection of per-document results with aggregate counters."""

    results: tuple[ConversionResult, ...]

    @property
    def converted_count(self) -> int:
        """Number of results whose ``succeeded`` flag is true."""
        return sum(1 for result in self.results if result.succeeded)

    @property
    def failed_count(self) -> int:
        """Number of results whose ``succeeded`` flag is false."""
        return sum(1 for result in self.results if not result.succeeded)

    @property
    def warning_count(self) -> int:
        """Sum of ``warning_count`` over all results."""
        total = 0
        for result in self.results:
            total += result.warning_count
        return total
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class _AssetMaterialization:
|
||||
records: tuple[AssetRecord, ...]
|
||||
warnings: tuple[WarningRecord, ...]
|
||||
link_map: dict[str, str]
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class _ConversionTask:
|
||||
output_plan: PlannedOutput
|
||||
chunk_plan: PdfChunkPlan | None = None
|
||||
original_source_pdf: Path | None = None
|
||||
original_source_sha256: str | None = None
|
||||
|
||||
|
||||
_IMAGE_LINK_RE = re.compile(r"!\[(?P<alt>[^\]\n]*)\]\((?P<target>[^)\n]+)\)")
|
||||
_DISPLAY_MATH_RE = re.compile(r"(?<!\\)\$\$(?P<body>.*?)(?<!\\)\$\$", re.DOTALL)
|
||||
_INLINE_MATH_RE = re.compile(r"(?<!\\)\$(?P<body>[^\n$]+?)(?<!\\)\$")
|
||||
|
||||
|
||||
def convert_pdf(
    input_path: PathLike,
    output_dir: PathLike,
    *,
    metadata: bool = True,
    keep_raw: bool = False,
    overwrite: bool = False,
    gpu: str | None = DEFAULT_GPU_DEVICE,
    strict_local: bool = True,
    adapter: ConversionAdapter | None = None,
    math_checker: MathChecker | None = None,
    chunk_pages: int | None = None,
    clock: Clock | None = None,
) -> ConversionResult | BatchConversionResult:
    """Convert one local PDF into Markdown, metadata, and report outputs.

    Without ``chunk_pages`` a single ``ConversionResult`` is returned. With
    ``chunk_pages`` set, the input goes through chunk planning and a
    ``BatchConversionResult`` is returned instead.

    Raises:
        ValueError: if ``input_path`` exists but is not a file, or if PDF
            discovery does not yield exactly one PDF.
    """
    _raise_if_strict_local_disabled(strict_local)

    candidate = Path(input_path).expanduser()
    if candidate.exists() and not candidate.is_file():
        raise ValueError("convert_pdf requires a PDF file input")
    discovered = discover_pdfs(input_path, recursive=False)
    if len(discovered) != 1:
        raise ValueError("convert_pdf requires a single PDF input")

    engine = adapter or MinerUAdapter()
    now = clock or _utc_now
    planning_options = {"metadata": metadata, "keep_raw": keep_raw, "overwrite": overwrite}
    run_options = {
        "adapter": engine,
        "clock": now,
        "metadata_enabled": metadata,
        "keep_raw": keep_raw,
        "overwrite": overwrite,
        "gpu": gpu,
        "strict_local": strict_local,
        "math_checker": math_checker,
    }

    if chunk_pages is None:
        (plan, *_rest) = plan_outputs(discovered, output_dir, **planning_options)
        return _convert_plan(plan, **run_options)

    tasks = _plan_conversion_tasks(
        discovered,
        output_dir,
        chunk_pages=chunk_pages,
        **planning_options,
    )
    return BatchConversionResult(_convert_tasks(tasks, **run_options))
|
||||
|
||||
|
||||
def convert_input(
    input_path: PathLike,
    output_dir: PathLike,
    *,
    metadata: bool = True,
    keep_raw: bool = False,
    recursive: bool = False,
    overwrite: bool = False,
    gpu: str | None = DEFAULT_GPU_DEVICE,
    strict_local: bool = True,
    adapter: ConversionAdapter | None = None,
    math_checker: MathChecker | None = None,
    chunk_pages: int | None = None,
    clock: Clock | None = None,
) -> BatchConversionResult:
    """Convert a local PDF or directory of PDFs.

    Discovery and planning run before an engine is created, so errors raised
    by those steps surface without instantiating ``MinerUAdapter``.
    """
    _raise_if_strict_local_disabled(strict_local)

    tasks = _plan_conversion_tasks(
        discover_pdfs(input_path, recursive=recursive),
        output_dir,
        metadata=metadata,
        keep_raw=keep_raw,
        overwrite=overwrite,
        chunk_pages=chunk_pages,
    )
    engine = adapter or MinerUAdapter()
    now = clock or _utc_now
    results = _convert_tasks(
        tasks,
        adapter=engine,
        clock=now,
        metadata_enabled=metadata,
        keep_raw=keep_raw,
        overwrite=overwrite,
        gpu=gpu,
        strict_local=strict_local,
        math_checker=math_checker,
    )
    return BatchConversionResult(results)
|
||||
|
||||
|
||||
def _plan_conversion_tasks(
|
||||
discovered: tuple[DiscoveredPdf, ...],
|
||||
output_dir: PathLike,
|
||||
*,
|
||||
metadata: bool,
|
||||
keep_raw: bool,
|
||||
overwrite: bool,
|
||||
chunk_pages: int | None,
|
||||
) -> tuple[_ConversionTask, ...]:
|
||||
if chunk_pages is None:
|
||||
plans = plan_outputs(discovered, output_dir, metadata=metadata, keep_raw=keep_raw, overwrite=overwrite)
|
||||
return tuple(_ConversionTask(output_plan=plan) for plan in plans)
|
||||
if not isinstance(chunk_pages, int) or chunk_pages < 1:
|
||||
raise ValueError("chunk_pages must be a positive integer")
|
||||
|
||||
planned_inputs: list[DiscoveredPdf] = []
|
||||
chunk_plans: list[PdfChunkPlan | None] = []
|
||||
original_sources: list[Path | None] = []
|
||||
source_hashes: dict[Path, str] = {}
|
||||
for item in discovered:
|
||||
chunks = plan_pdf_chunks(item.source_path, chunk_pages=chunk_pages)
|
||||
if len(chunks) == 1:
|
||||
planned_inputs.append(item)
|
||||
chunk_plans.append(None)
|
||||
original_sources.append(None)
|
||||
continue
|
||||
|
||||
source_hashes[item.source_path] = _sha256(item.source_path)
|
||||
for chunk in chunks:
|
||||
planned_inputs.append(
|
||||
DiscoveredPdf(
|
||||
source_path=item.source_path.with_name(chunk.output_filename),
|
||||
relative_parent=item.relative_parent,
|
||||
)
|
||||
)
|
||||
chunk_plans.append(chunk)
|
||||
original_sources.append(item.source_path)
|
||||
|
||||
plans = plan_outputs(planned_inputs, output_dir, metadata=metadata, keep_raw=keep_raw, overwrite=overwrite)
|
||||
return tuple(
|
||||
_ConversionTask(
|
||||
output_plan=plan,
|
||||
chunk_plan=chunk,
|
||||
original_source_pdf=original,
|
||||
original_source_sha256=source_hashes[original] if original is not None else None,
|
||||
)
|
||||
for plan, chunk, original in zip(plans, chunk_plans, original_sources, strict=True)
|
||||
)
|
||||
|
||||
|
||||
def _convert_tasks(
|
||||
tasks: tuple[_ConversionTask, ...],
|
||||
*,
|
||||
adapter: ConversionAdapter,
|
||||
clock: Clock,
|
||||
metadata_enabled: bool,
|
||||
keep_raw: bool,
|
||||
overwrite: bool,
|
||||
gpu: str | None,
|
||||
strict_local: bool,
|
||||
math_checker: MathChecker | None,
|
||||
) -> tuple[ConversionResult, ...]:
|
||||
if any(task.chunk_plan is not None for task in tasks):
|
||||
with tempfile.TemporaryDirectory(prefix="pdf2md.chunks.") as chunk_directory:
|
||||
return tuple(
|
||||
_convert_task(
|
||||
task,
|
||||
chunk_directory=Path(chunk_directory),
|
||||
adapter=adapter,
|
||||
clock=clock,
|
||||
metadata_enabled=metadata_enabled,
|
||||
keep_raw=keep_raw,
|
||||
overwrite=overwrite,
|
||||
gpu=gpu,
|
||||
strict_local=strict_local,
|
||||
math_checker=math_checker,
|
||||
)
|
||||
for task in tasks
|
||||
)
|
||||
|
||||
return tuple(
|
||||
_convert_task(
|
||||
task,
|
||||
chunk_directory=None,
|
||||
adapter=adapter,
|
||||
clock=clock,
|
||||
metadata_enabled=metadata_enabled,
|
||||
keep_raw=keep_raw,
|
||||
overwrite=overwrite,
|
||||
gpu=gpu,
|
||||
strict_local=strict_local,
|
||||
math_checker=math_checker,
|
||||
)
|
||||
for task in tasks
|
||||
)
|
||||
|
||||
|
||||
def _convert_task(
    task: _ConversionTask,
    *,
    chunk_directory: Path | None,
    adapter: ConversionAdapter,
    clock: Clock,
    metadata_enabled: bool,
    keep_raw: bool,
    overwrite: bool,
    gpu: str | None,
    strict_local: bool,
    math_checker: MathChecker | None,
) -> ConversionResult:
    """Convert a single task, writing its chunk PDF first when it has a chunk plan.

    Raises:
        ValueError: if the task has a chunk plan but ``chunk_directory`` is None.
    """
    shared = {
        "adapter": adapter,
        "clock": clock,
        "metadata_enabled": metadata_enabled,
        "keep_raw": keep_raw,
        "overwrite": overwrite,
        "gpu": gpu,
        "strict_local": strict_local,
        "math_checker": math_checker,
    }
    chunk = task.chunk_plan
    if chunk is None:
        return _convert_plan(task.output_plan, **shared)

    if chunk_directory is None:
        raise ValueError("chunk directory is required for chunked conversion")

    chunk_pdf = write_pdf_chunk(chunk, chunk_directory / chunk.output_filename)
    # The engine reads the chunk PDF, while results and metadata keep
    # pointing at the original source document.
    return _convert_plan(
        replace(task.output_plan, source_pdf=chunk_pdf),
        **shared,
        result_source_pdf=task.original_source_pdf,
        metadata_source_pdf=task.original_source_pdf,
        metadata_source_sha256=task.original_source_sha256,
        engine_options_extra={"chunk": chunk.metadata()},
    )
|
||||
|
||||
|
||||
def _convert_plan(
    plan: PlannedOutput,
    *,
    adapter: ConversionAdapter,
    clock: Clock,
    metadata_enabled: bool,
    keep_raw: bool,
    overwrite: bool,
    gpu: str | None,
    strict_local: bool,
    math_checker: MathChecker | None,
    result_source_pdf: Path | None = None,
    metadata_source_pdf: Path | None = None,
    metadata_source_sha256: str | None = None,
    engine_options_extra: dict[str, object] | None = None,
) -> ConversionResult:
    """Prepare output locations for ``plan`` and run the conversion in a work directory.

    With ``keep_raw`` the plan's raw directory is the work directory and is
    left in place; otherwise a temporary directory next to the Markdown output
    is used and removed afterwards.

    Raises:
        ValueError: if ``keep_raw`` is set but the plan has no raw directory.
    """
    if overwrite:
        _clear_planned_outputs(plan)

    output_parent = plan.markdown_path.parent
    output_parent.mkdir(parents=True, exist_ok=True)
    options = MinerUOptions(strict_local=strict_local, gpu_device=gpu)

    def run_in(work_dir: Path) -> ConversionResult:
        return _convert_in_work_dir(
            plan,
            work_dir,
            adapter,
            options,
            clock,
            metadata_enabled,
            math_checker,
            result_source_pdf=result_source_pdf,
            metadata_source_pdf=metadata_source_pdf,
            metadata_source_sha256=metadata_source_sha256,
            engine_options_extra=engine_options_extra,
        )

    if not keep_raw:
        with tempfile.TemporaryDirectory(prefix=f"{plan.source_pdf.stem}.", dir=output_parent) as temporary_dir:
            return run_in(Path(temporary_dir))

    if plan.raw_dir is None:
        raise ValueError("raw output directory is required when keep_raw is enabled")
    plan.raw_dir.mkdir(parents=True, exist_ok=True)
    return run_in(plan.raw_dir)
|
||||
|
||||
|
||||
def _convert_in_work_dir(
    plan: PlannedOutput,
    work_dir: Path,
    adapter: ConversionAdapter,
    options: MinerUOptions,
    clock: Clock,
    metadata_enabled: bool,
    math_checker: MathChecker | None,
    result_source_pdf: Path | None = None,
    metadata_source_pdf: Path | None = None,
    metadata_source_sha256: str | None = None,
    engine_options_extra: dict[str, object] | None = None,
) -> ConversionResult:
    """Run the adapter in ``work_dir`` and write Markdown, metadata, and report files.

    A strict-local violation, an unsuccessful adapter result, or a result
    without Markdown is turned into a failed result via ``_failed_result``.
    Otherwise assets are copied, links rewritten, Markdown normalized,
    quality checks run, and the output files written.
    """
    result_source = result_source_pdf or plan.source_pdf
    metadata_source = metadata_source_pdf or result_source

    try:
        outcome = adapter.convert(plan.source_pdf, work_dir, options)
    except StrictLocalViolationError as error:
        return _failed_result(plan, warnings=(error.warning,), source_pdf=result_source)

    engine = outcome.engine or ENGINE_NAME
    engine_version = outcome.engine_version or "unknown"

    def fail(failure_warnings):
        return _failed_result(
            plan,
            warnings=failure_warnings,
            engine=engine,
            engine_version=engine_version,
            source_pdf=result_source,
        )

    if not outcome.succeeded:
        return fail(outcome.warnings)
    if outcome.raw_markdown is None:
        missing_markdown = WarningRecord(
            WarningCode.MINERU_CLI_FAILED,
            WarningSeverity.ERROR,
            "MinerU produced structured output but no Markdown; no fallback engine was used.",
        )
        return fail(outcome.warnings + (missing_markdown,))

    markdown_dir = plan.markdown_path.parent
    assets = _materialize_assets(outcome.asset_paths, work_dir, plan.assets_dir)
    normalized = normalize_markdown(
        _rewrite_asset_links(outcome.raw_markdown, assets.link_map),
        markdown_dir=markdown_dir,
        asset_root=plan.assets_dir,
        check_assets=False,
    )
    quality = _run_quality_checks(
        normalized.markdown,
        markdown_dir=markdown_dir,
        asset_root=plan.assets_dir,
        math_checker=math_checker,
    )
    warnings = outcome.warnings + assets.warnings + normalized.warnings + quality.warnings

    document = _build_document(
        source_pdf=metadata_source,
        markdown=normalized.markdown,
        assets=assets.records,
        warnings=warnings,
        raw_structured=outcome.raw_structured,
    )
    engine_options = dict(outcome.engine_options)
    if engine_options_extra:
        engine_options.update(engine_options_extra)
    metadata_data = build_metadata(
        document=document,
        source_sha256=metadata_source_sha256 or _sha256(metadata_source),
        created_at=_format_timestamp(clock()),
        engine=engine,
        engine_version=engine_version,
        engine_options=engine_options,
    )

    # Only the asset link counters are handed to the report and status logic.
    report_quality = QualityResult(
        missing_asset_link_count=quality.missing_asset_link_count,
        invalid_asset_link_count=quality.invalid_asset_link_count,
    )
    reported_metadata_path = plan.metadata_path if metadata_enabled else None
    report_text = render_report(
        metadata_data,
        quality=report_quality,
        markdown_path=plan.markdown_path,
        metadata_path=reported_metadata_path,
        report_path=plan.report_path,
    )
    final_status = determine_final_status(metadata_data, report_quality)

    _write_text(plan.markdown_path, normalized.markdown)
    if reported_metadata_path is not None:
        metadata_json = json.dumps(metadata_data, indent=2, ensure_ascii=False, sort_keys=True)
        _write_text(reported_metadata_path, metadata_json + "\n")
    _write_text(plan.report_path, report_text)

    return ConversionResult(
        source_pdf=result_source,
        markdown_path=plan.markdown_path,
        metadata_path=reported_metadata_path,
        report_path=plan.report_path,
        assets_dir=plan.assets_dir,
        raw_dir=plan.raw_dir,
        engine=engine,
        engine_version=engine_version,
        final_status=final_status,
        warning_count=len(warnings),
        warnings=warnings,
        pages_processed=int(metadata_data["summary"]["pages_processed"]),
    )
|
||||
|
||||
|
||||
def _materialize_assets(asset_paths: tuple[Path, ...], work_dir: Path, assets_dir: Path) -> _AssetMaterialization:
    """Copy adapter assets from ``work_dir`` into ``assets_dir`` and build the link map.

    An asset is skipped with a warning when its file is missing, when it lies
    outside ``work_dir``, when its destination would fall outside
    ``assets_dir``, or when another asset already claimed the same destination.
    """
    records: list[AssetRecord] = []
    warnings: list[WarningRecord] = []
    link_map: dict[str, str] = {}
    claimed_destinations: set[str] = set()
    work_root = work_dir.resolve()

    def skip(code, message: str) -> None:
        warnings.append(_warning(code, message))

    for source in asset_paths:
        source_path = Path(source)
        if not (source_path.exists() and source_path.is_file()):
            skip(WarningCode.ASSET_LINK_MISSING, f"Adapter asset file does not exist: {source_path}")
            continue

        try:
            source_relative = source_path.resolve().relative_to(work_root)
        except ValueError:
            skip(WarningCode.ASSET_LINK_INVALID, f"Adapter asset path is outside the work directory: {source_path}")
            continue

        destination_relative = _destination_asset_relative(source_relative)
        destination = assets_dir / destination_relative
        try:
            # Containment check only; the computed relative path is not used.
            destination.resolve(strict=False).relative_to(assets_dir.resolve(strict=False))
        except ValueError:
            skip(WarningCode.ASSET_LINK_INVALID, f"Adapter asset destination is outside the assets directory: {source_path}")
            continue

        destination_key = destination_relative.as_posix()
        if destination_key in claimed_destinations:
            skip(WarningCode.ASSET_LINK_INVALID, f"Duplicate adapter asset destination was skipped: {destination_key}")
            continue

        destination.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(source_path, destination)
        claimed_destinations.add(destination_key)

        final_link = PurePosixPath(assets_dir.name, destination_relative).as_posix()
        records.append(AssetRecord(final_link))
        _add_asset_link_keys(link_map, source_path, source_relative, destination_relative, final_link)

    return _AssetMaterialization(records=tuple(records), warnings=tuple(warnings), link_map=link_map)
|
||||
|
||||
|
||||
def _destination_asset_relative(source_relative: Path) -> PurePosixPath:
|
||||
parts = PurePosixPath(source_relative.as_posix()).parts
|
||||
if len(parts) > 1 and parts[0].casefold() in {"asset", "assets", "image", "images"}:
|
||||
parts = parts[1:]
|
||||
return PurePosixPath(*parts)
|
||||
|
||||
|
||||
def _add_asset_link_keys(
    link_map: dict[str, str],
    source_path: Path,
    source_relative: Path,
    destination_relative: PurePosixPath,
    final_link: str,
) -> None:
    """Register every recognised spelling of one asset's path in ``link_map``.

    Keys are the relative source and destination paths, the bare file name,
    the source path as given, and the asset/image-folder suffixes of both
    relative paths. Backslashes in keys are normalised to forward slashes and
    every key maps to ``final_link``; ``link_map`` is updated in place.
    """
    spellings = {
        source_relative.as_posix(),
        destination_relative.as_posix(),
        source_path.name,
        str(source_path),
        source_path.as_posix(),
    }
    spellings |= _asset_link_suffixes(source_relative)
    spellings |= _asset_link_suffixes(destination_relative)
    link_map.update({spelling.replace("\\", "/"): final_link for spelling in spellings})
|
||||
|
||||
|
||||
def _asset_link_suffixes(path: Path | PurePosixPath) -> set[str]:
|
||||
parts = PurePosixPath(path.as_posix()).parts
|
||||
suffixes: set[str] = set()
|
||||
for index, part in enumerate(parts):
|
||||
if part.casefold() in {"asset", "assets", "image", "images"} and index + 1 < len(parts):
|
||||
suffixes.add(PurePosixPath(*parts[index:]).as_posix())
|
||||
return suffixes
|
||||
|
||||
|
||||
def _rewrite_asset_links(markdown: str, link_map: dict[str, str]) -> str:
|
||||
if not link_map:
|
||||
return markdown
|
||||
|
||||
def replace(match: re.Match[str]) -> str:
|
||||
alt = match.group("alt")
|
||||
target = match.group("target").strip()
|
||||
unwrapped = _unwrap_angle_target(target).replace("\\", "/")
|
||||
replacement = link_map.get(unwrapped)
|
||||
if replacement is None:
|
||||
return match.group(0)
|
||||
return f""
|
||||
|
||||
return _IMAGE_LINK_RE.sub(replace, markdown)
|
||||
|
||||
|
||||
def _build_document(
    *,
    source_pdf: Path,
    markdown: str,
    assets: tuple[AssetRecord, ...],
    warnings: tuple[WarningRecord, ...],
    raw_structured: object | None,
) -> DocumentRecord:
    """Assemble the ``DocumentRecord`` for one converted document.

    One ``PageRecord`` is created per page counted by ``_page_count``. All
    blocks found by ``_formula_blocks`` are attached to page index 0; the
    remaining pages receive an empty block tuple.
    """
    page_total = _page_count(raw_structured)
    formula_blocks = _formula_blocks(markdown)
    pages = tuple(
        PageRecord(page_index=index, blocks=() if index != 0 else formula_blocks)
        for index in range(page_total)
    )
    return DocumentRecord(source_pdf=source_pdf, pages=pages, assets=assets, warnings=warnings)
|
||||
|
||||
|
||||
def _run_quality_checks(
    markdown: str,
    *,
    markdown_dir: Path,
    asset_root: Path,
    math_checker: MathChecker | None,
) -> QualityResult:
    """Check asset links, and math renderability when the Markdown contains math.

    The default math checker is only created when math is present and no
    ``math_checker`` was supplied.
    """
    asset_quality = check_asset_links(markdown, markdown_dir=markdown_dir, asset_root=asset_root)
    if _has_math(markdown):
        checker = create_default_math_checker() if math_checker is None else math_checker
        math_quality = check_math_renderability(markdown, checker)
        return merge_quality_results(asset_quality, math_quality)
    return asset_quality
|
||||
|
||||
|
||||
def _has_math(markdown: str) -> bool:
    """Return True when the Markdown matches the display or inline math pattern."""
    for pattern in (_DISPLAY_MATH_RE, _INLINE_MATH_RE):
        if pattern.search(markdown) is not None:
            return True
    return False
|
||||
|
||||
|
||||
def _formula_blocks(markdown: str) -> tuple[BlockRecord, ...]:
    """Build block records for the formulas found in ``markdown``.

    Display formulas are recorded first. Inline formulas are then searched for
    only in the text between display formulas, and an inline match is skipped
    when its stripped body is empty or starts with a digit. All records use
    page index 0. When nothing is found, a single paragraph record is returned.
    """
    display_spans = [found.span() for found in _DISPLAY_MATH_RE.finditer(markdown)]
    records: list[BlockRecord] = [
        BlockRecord(BlockType.DISPLAY_FORMULA, page_index=0, markdown_span=span)
        for span in display_spans
    ]

    # Slice the text outside display formulas, remembering each slice's offset
    # so inline spans can be reported relative to the full Markdown.
    gaps: list[tuple[int, str]] = []
    position = 0
    for span_start, span_end in display_spans:
        gaps.append((position, markdown[position:span_start]))
        position = span_end
    gaps.append((position, markdown[position:]))

    for base, text in gaps:
        for found in _INLINE_MATH_RE.finditer(text):
            content = found.group("body").strip()
            if not content or content[0].isdigit():
                continue
            span = (base + found.start(), base + found.end())
            records.append(BlockRecord(BlockType.INLINE_FORMULA, page_index=0, markdown_span=span))

    if records:
        return tuple(records)
    return (BlockRecord(BlockType.PARAGRAPH, page_index=0),)
|
||||
|
||||
|
||||
def _page_count(raw_structured: object | None) -> int:
|
||||
if isinstance(raw_structured, dict):
|
||||
pages = raw_structured.get("pages")
|
||||
if isinstance(pages, list):
|
||||
return max(1, len(pages))
|
||||
if isinstance(pages, int):
|
||||
return max(1, pages)
|
||||
if isinstance(pages, dict):
|
||||
return max(1, len(pages))
|
||||
pdf_info = raw_structured.get("pdf_info")
|
||||
if isinstance(pdf_info, list):
|
||||
return max(1, len(pdf_info))
|
||||
page_info = raw_structured.get("page_info")
|
||||
if isinstance(page_info, list):
|
||||
return max(1, len(page_info))
|
||||
page_indexes = tuple(_page_indexes(raw_structured))
|
||||
if page_indexes:
|
||||
return max(1, max(page_indexes) + 1)
|
||||
return 1
|
||||
|
||||
|
||||
def _page_indexes(value: object) -> tuple[int, ...]:
|
||||
indexes: list[int] = []
|
||||
if isinstance(value, dict):
|
||||
for key in ("page_idx", "page_index"):
|
||||
page_value = value.get(key)
|
||||
if isinstance(page_value, int) and page_value >= 0:
|
||||
indexes.append(page_value)
|
||||
for item in value.values():
|
||||
indexes.extend(_page_indexes(item))
|
||||
elif isinstance(value, list):
|
||||
for item in value:
|
||||
indexes.extend(_page_indexes(item))
|
||||
return tuple(indexes)
|
||||
|
||||
|
||||
def _failed_result(
    plan: PlannedOutput,
    *,
    warnings: tuple[WarningRecord, ...],
    engine: str = ENGINE_NAME,
    engine_version: str = "unknown",
    source_pdf: Path | None = None,
) -> ConversionResult:
    """Build the result for a conversion that failed.

    Output locations are copied from ``plan``. The status is ``"failed"``, the
    processed page count is 0, and the source PDF falls back to the planned
    one when no explicit ``source_pdf`` is given.
    """
    resolved_source = source_pdf if source_pdf else plan.source_pdf
    fields = {
        "source_pdf": resolved_source,
        "markdown_path": plan.markdown_path,
        "metadata_path": plan.metadata_path,
        "report_path": plan.report_path,
        "assets_dir": plan.assets_dir,
        "raw_dir": plan.raw_dir,
        "engine": engine,
        "engine_version": engine_version,
        "final_status": "failed",
        "warning_count": len(warnings),
        "warnings": warnings,
        "pages_processed": 0,
    }
    return ConversionResult(**fields)
|
||||
|
||||
|
||||
def _clear_planned_outputs(plan: PlannedOutput) -> None:
|
||||
for path in plan.planned_paths():
|
||||
if path.is_dir():
|
||||
shutil.rmtree(path)
|
||||
elif path.exists():
|
||||
path.unlink()
|
||||
|
||||
|
||||
def _write_text(path: Path, text: str) -> None:
|
||||
path.parent.mkdir(parents=True, exist_ok=True)
|
||||
path.write_text(text, encoding="utf-8")
|
||||
|
||||
|
||||
def _sha256(path: Path) -> str:
|
||||
digest = hashlib.sha256()
|
||||
with path.open("rb") as file:
|
||||
for chunk in iter(lambda: file.read(1024 * 1024), b""):
|
||||
digest.update(chunk)
|
||||
return digest.hexdigest()
|
||||
|
||||
|
||||
def _format_timestamp(value: datetime) -> str:
|
||||
if value.tzinfo is None:
|
||||
value = value.replace(tzinfo=timezone.utc)
|
||||
return value.astimezone(timezone.utc).isoformat().replace("+00:00", "Z")
|
||||
|
||||
|
||||
def _utc_now() -> datetime:
|
||||
return datetime.now(timezone.utc)
|
||||
|
||||
|
||||
def _unwrap_angle_target(target: str) -> str:
|
||||
if target.startswith("<") and target.endswith(">"):
|
||||
return target[1:-1].strip()
|
||||
return target
|
||||
|
||||
|
||||
def _warning(code: WarningCode, message: str) -> WarningRecord:
    """Create a warning record with ``WARNING`` severity."""
    severity = WarningSeverity.WARNING
    return WarningRecord(code, severity, message)
|
||||
|
||||
|
||||
def _raise_if_strict_local_disabled(strict_local: bool) -> None:
|
||||
if not strict_local:
|
||||
raise StrictLocalViolationError("strict-local execution cannot be disabled in v1.")
|
||||
@@ -0,0 +1,469 @@
|
||||
"""Local setup diagnostics for pdf2md."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import importlib
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
from collections.abc import Callable, Mapping
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Any, Literal, Protocol
|
||||
|
||||
from pdf2md.math_render import default_mathjax_helper_path
|
||||
from pdf2md.mineru_adapter import CommandResult, MinerUAdapter, MinerUVersionResult
|
||||
|
||||
|
||||
DoctorStatus = Literal["pass", "warn", "fail"]
|
||||
CommandRunner = Callable[[tuple[str, ...]], "DoctorCommandResult"]
|
||||
Which = Callable[[str], str | None]
|
||||
ImportModule = Callable[[str], Any]
|
||||
PathExists = Callable[[Path], bool]
|
||||
|
||||
TARGET_PYTHON = (3, 12)
|
||||
TARGET_MINERU_VERSION = "3.1.0"
|
||||
MODEL_CACHE_ENV_VARS = (
|
||||
"MINERU_MODEL_SOURCE",
|
||||
"MINERU_MODEL_DIR",
|
||||
"MINERU_CACHE_DIR",
|
||||
"MINERU_TOOLS_CONFIG_JSON",
|
||||
"HF_HOME",
|
||||
"HUGGINGFACE_HUB_CACHE",
|
||||
"MODELSCOPE_CACHE",
|
||||
)
|
||||
|
||||
|
||||
class MinerUProbe(Protocol):
    """Structural type for objects that can report the MinerU CLI version."""

    def version(self) -> MinerUVersionResult:
        """Return the direct local MinerU CLI version result."""
        ...
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class DoctorCommandResult:
    """Immutable record of one command run by the doctor."""

    # Argument vector that was executed.
    command: tuple[str, ...]
    # Exit status of the command.
    exit_code: int
    # Captured output streams; both default to empty text.
    stdout: str = ""
    stderr: str = ""
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class DoctorCheck:
    """Immutable outcome of one doctor check."""

    # Short identifier of the check, shown in the formatted report.
    name: str
    # "pass", "warn" or "fail".
    status: DoctorStatus
    # One-line summary of the outcome.
    message: str
    # Optional extra lines listed under the summary.
    details: tuple[str, ...] = ()
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class DoctorReport:
    """Immutable collection of check outcomes with an aggregate status."""

    # Check outcomes, in the order they were run.
    checks: tuple[DoctorCheck, ...]

    @property
    def status(self) -> DoctorStatus:
        """Worst status among the checks: fail beats warn, warn beats pass."""
        observed = {check.status for check in self.checks}
        for level in ("fail", "warn"):
            if level in observed:
                return level
        return "pass"

    @property
    def exit_code(self) -> int:
        """Process exit code: 1 when the report status is "fail", otherwise 0."""
        return int(self.status == "fail")
|
||||
|
||||
|
||||
def run_doctor(
    *,
    python_version: tuple[int, int, int] | None = None,
    which: Which = shutil.which,
    run_command: CommandRunner | None = None,
    import_module: ImportModule = importlib.import_module,
    env: Mapping[str, str] | None = None,
    path_exists: PathExists | None = None,
    home: Path | None = None,
    mineru_probe: MinerUProbe | None = None,
) -> DoctorReport:
    """Run ordered local setup checks without installing or downloading anything."""

    def _default_exists(path: Path) -> bool:
        return path.exists()

    # Every collaborator can be injected; fall back to the real one otherwise.
    runner = run_command or _run_command
    environment = env if env is not None else os.environ
    exists = path_exists or _default_exists
    version = python_version or sys.version_info[:3]
    home_path = Path.home() if home is None else home
    probe = mineru_probe or _default_mineru_probe(which, runner)

    python_check = _check_python(version)
    uv_check = _check_uv(which, runner)
    mineru_check = _check_mineru(probe)
    gpu_check = _check_gpu(which, runner)
    pytorch_check = _check_pytorch(import_module)
    models_check = _check_model_cache(environment, exists, home_path)
    mathjax_check = _check_mathjax_checker(which, runner, exists)
    policy_check = _check_local_only_policy()

    return DoctorReport(
        checks=(
            python_check,
            uv_check,
            mineru_check,
            gpu_check,
            pytorch_check,
            models_check,
            mathjax_check,
            policy_check,
        )
    )
|
||||
|
||||
|
||||
def format_doctor_report(report: DoctorReport) -> str:
    """Render the report as plain text.

    The first line carries the overall status. Each check follows on its own
    line, with its details listed underneath as dash-prefixed lines.
    """
    rendered = [f"Doctor status: {report.status.upper()}"]
    for entry in report.checks:
        rendered.append(f"[{entry.status.upper()}] {entry.name}: {entry.message}")
        rendered.extend(f" - {note}" for note in entry.details)
    return "\n".join(rendered)
|
||||
|
||||
|
||||
def _check_python(version: tuple[int, int, int]) -> DoctorCheck:
    """Pass when the major/minor version equals ``TARGET_PYTHON``, else fail."""
    rendered = ".".join(map(str, version))
    if version[:2] != TARGET_PYTHON:
        return DoctorCheck(
            "python",
            "fail",
            f"Python {rendered} is unsupported; use Python 3.12.x.",
        )
    return DoctorCheck("python", "pass", f"Python {rendered} is supported.")
|
||||
|
||||
|
||||
def _check_uv(which: Which, run_command: CommandRunner) -> DoctorCheck:
    """Check that ``uv`` is on PATH and that ``uv --version`` reports a version.

    A missing executable is a failure. A failing version command, or one that
    prints nothing, is a warning.
    """
    located = which("uv")
    if located is None:
        return DoctorCheck(
            "uv",
            "fail",
            "uv executable was not found on PATH.",
            ("Windows per-user uv installs commonly use C:\\Users\\user\\.local\\bin.",),
        )

    outcome = run_command(("uv", "--version"))
    # The version may be printed on either stream; stdout is preferred.
    reported = _first_non_empty_line(outcome.stdout) or _first_non_empty_line(outcome.stderr)
    path_detail = f"path: {located}"
    if outcome.exit_code != 0:
        return DoctorCheck(
            "uv",
            "warn",
            "uv was found, but `uv --version` failed.",
            (path_detail, f"exit code: {outcome.exit_code}", _trim_detail(outcome.stderr)),
        )
    if reported is None:
        return DoctorCheck("uv", "warn", "uv was found, but no version text was reported.", (path_detail,))
    return DoctorCheck("uv", "pass", reported, (path_detail,))
|
||||
|
||||
|
||||
def _check_mineru(probe: MinerUProbe) -> DoctorCheck:
    """Check MinerU CLI availability and compare its version with the target.

    An unavailable CLI is a failure. An undetectable version, or one that does
    not contain ``TARGET_MINERU_VERSION``, is a warning.
    """
    outcome = probe.version()
    command_detail = f"command: {' '.join(outcome.command)}"
    if not outcome.available:
        return DoctorCheck("mineru", "fail", "MinerU CLI executable was not found.", (command_detail,))

    warning_details = tuple(item.message for item in outcome.warnings)
    detected = outcome.version
    if detected is None:
        return DoctorCheck(
            "mineru",
            "warn",
            "MinerU CLI is available, but version could not be detected.",
            (command_detail, f"exit code: {outcome.exit_code}", *warning_details, _trim_detail(outcome.stderr)),
        )

    if _has_target_mineru_version(detected):
        return DoctorCheck("mineru", "pass", f"MinerU {detected} CLI detected.", (command_detail,))
    return DoctorCheck(
        "mineru",
        "warn",
        f"MinerU version is `{detected}`; project target is {TARGET_MINERU_VERSION}.",
        (command_detail,),
    )
|
||||
|
||||
|
||||
def _check_gpu(which: Which, run_command: CommandRunner) -> DoctorCheck:
    """Query ``nvidia-smi`` for visible GPUs.

    Every problem is reported as a warning: a missing ``nvidia-smi``, a failed
    query, an empty GPU list, or a GPU line that ``_is_pascal_or_pre_turing``
    flags. The check passes only when GPUs are listed and none is flagged.
    """
    located = which("nvidia-smi")
    if located is None:
        return DoctorCheck("gpu", "warn", "nvidia-smi was not found; NVIDIA GPU visibility could not be confirmed.")

    path_detail = f"path: {located}"
    query = (
        "nvidia-smi",
        "--query-gpu=name,memory.total,driver_version",
        "--format=csv,noheader",
    )
    outcome = run_command(query)
    if outcome.exit_code != 0:
        return DoctorCheck(
            "gpu",
            "warn",
            "nvidia-smi was found, but GPU query failed.",
            (path_detail, f"exit code: {outcome.exit_code}", _trim_detail(outcome.stderr)),
        )

    # One CSV line per GPU; blank lines are ignored.
    stripped_lines = (raw.strip() for raw in outcome.stdout.splitlines())
    gpu_lines = tuple(entry for entry in stripped_lines if entry)
    if not gpu_lines:
        return DoctorCheck("gpu", "warn", "nvidia-smi reported no visible NVIDIA GPU.", (path_detail,))

    flagged = tuple(entry for entry in gpu_lines if _is_pascal_or_pre_turing(entry))
    if flagged:
        return DoctorCheck(
            "gpu",
            "warn",
            "NVIDIA GPU is visible, but Pascal/pre-Turing compatibility risk was detected.",
            (path_detail, *flagged),
        )

    return DoctorCheck("gpu", "pass", "NVIDIA GPU is visible.", (path_detail, *gpu_lines))
|
||||
|
||||
|
||||
def _check_pytorch(import_module: ImportModule) -> DoctorCheck:
    """Check whether PyTorch imports and reports CUDA as available.

    Every problem is a warning: a missing or broken torch install, no CUDA
    availability API, CUDA unavailable, or a device flagged by name or by a
    compute capability below 7.0. Device details are collected best-effort,
    and a failing per-device query is recorded as "unavailable".
    """
    try:
        torch = import_module("torch")
    except ImportError:
        return DoctorCheck("pytorch", "warn", "PyTorch is not installed; CUDA visibility through torch cannot be checked.")
    except Exception as error:  # pragma: no cover - defensive for broken local torch installs.
        return DoctorCheck("pytorch", "warn", f"PyTorch import failed: {error}")

    version = str(getattr(torch, "__version__", "unknown"))
    cuda = getattr(torch, "cuda", None)
    if cuda is None or not hasattr(cuda, "is_available"):
        return DoctorCheck("pytorch", "warn", f"PyTorch {version} has no CUDA availability API.")

    try:
        cuda_ready = bool(cuda.is_available())
    except Exception as error:  # pragma: no cover - defensive for broken CUDA runtimes.
        return DoctorCheck("pytorch", "warn", f"PyTorch CUDA availability check failed: {error}", (f"torch: {version}",))

    if not cuda_ready:
        return DoctorCheck("pytorch", "warn", f"PyTorch {version} reports CUDA unavailable.")

    details = [f"torch: {version}"]
    cuda_version = getattr(getattr(torch, "version", None), "cuda", None)
    if cuda_version:
        details.append(f"torch cuda: {cuda_version}")

    flagged: list[str] = []
    device_total = _safe_int_call(getattr(cuda, "device_count", None))
    if device_total is not None:
        details.append(f"cuda devices: {device_total}")
        name_of = getattr(cuda, "get_device_name", None)
        capability_of = getattr(cuda, "get_device_capability", None)

        if callable(name_of):
            for slot in range(device_total):
                try:
                    label = str(name_of(slot))
                    details.append(f"device {slot}: {label}")
                    if _is_pascal_or_pre_turing(label):
                        flagged.append(f"device {slot}: {label}")
                except Exception:
                    details.append(f"device {slot}: name unavailable")

        if callable(capability_of):
            for slot in range(device_total):
                try:
                    capability = tuple(int(part) for part in capability_of(slot))
                    details.append(f"device {slot} capability: {capability[0]}.{capability[1]}")
                    if capability < (7, 0):
                        flagged.append(f"device {slot}: compute capability {capability[0]}.{capability[1]}")
                except Exception:
                    details.append(f"device {slot} capability: unavailable")

    if not flagged:
        return DoctorCheck("pytorch", "pass", f"PyTorch {version} reports CUDA available.", tuple(details))
    return DoctorCheck(
        "pytorch",
        "warn",
        f"PyTorch {version} reports CUDA available, but Pascal/pre-Turing compatibility risk was detected.",
        tuple(details + flagged),
    )
|
||||
|
||||
|
||||
def _check_model_cache(env: Mapping[str, str], path_exists: PathExists, home: Path) -> DoctorCheck:
    """Look for a local MinerU model, cache or config location.

    Each variable in ``MODEL_CACHE_ENV_VARS`` is read from ``env``.
    ``MINERU_MODEL_SOURCE`` is recorded as a plain value; every other non-empty
    variable is expanded to a path and tested with ``path_exists``. The file
    ``mineru.json`` under ``home`` is tested as well. The check passes when at
    least one path exists and warns otherwise.
    """
    configured: list[str] = []
    present: list[str] = []
    absent: list[str] = []

    for variable in MODEL_CACHE_ENV_VARS:
        raw_value = env.get(variable, "").strip()
        if not raw_value:
            continue
        if variable == "MINERU_MODEL_SOURCE":
            # A source selector, not a filesystem path.
            configured.append(f"{variable}={raw_value}")
            continue
        location = _expand_path(raw_value)
        entry = f"{variable}={location}"
        configured.append(entry)
        bucket = present if path_exists(location) else absent
        bucket.append(entry)

    user_config = home / "mineru.json"
    if path_exists(user_config):
        present.append(f"user config={user_config}")

    if present:
        remainder = [entry for entry in configured if entry not in present]
        return DoctorCheck(
            "models",
            "pass",
            "Local MinerU model/cache/config path was detected.",
            tuple(present + remainder),
        )
    if absent:
        remainder = [entry for entry in configured if entry not in absent]
        return DoctorCheck(
            "models",
            "warn",
            "MinerU model/cache environment variables are set, but their paths were not found.",
            tuple(absent + remainder),
        )
    if configured:
        return DoctorCheck(
            "models",
            "warn",
            "MinerU model source/config is set, but no local model/cache path was detected.",
            tuple(configured),
        )
    return DoctorCheck(
        "models",
        "warn",
        "No MinerU model/cache/config path was detected; run explicit local MinerU model setup before offline conversion.",
        (f"checked env: {', '.join(MODEL_CACHE_ENV_VARS)}", f"checked config: {user_config}"),
    )
|
||||
|
||||
|
||||
def _check_mathjax_checker(which: Which, run_command: CommandRunner, path_exists: PathExists) -> DoctorCheck:
    """Check that the local MathJax render helper can run under Node.js.

    Warns when Node.js is missing, the helper script is missing,
    ``node --version`` fails, or the helper's ``--health`` run fails. Passes
    otherwise, listing the node path, the helper path and the node version
    when one was reported.
    """
    node_path = which("node")
    helper_path = default_mathjax_helper_path()
    if node_path is None:
        return DoctorCheck(
            "mathjax",
            "warn",
            "Node.js executable was not found; MathJax render checker is unavailable.",
        )

    node_detail = f"node: {node_path}"
    if not path_exists(helper_path):
        return DoctorCheck(
            "mathjax",
            "warn",
            "MathJax helper script was not found.",
            (f"expected: {helper_path}", node_detail),
        )

    helper_detail = f"helper: {helper_path}"
    version_run = run_command((node_path, "--version"))
    if version_run.exit_code != 0:
        return DoctorCheck(
            "mathjax",
            "warn",
            "Node.js was found, but `node --version` failed.",
            (node_detail, f"exit code: {version_run.exit_code}", _trim_detail(version_run.stderr)),
        )

    health_run = run_command((node_path, str(helper_path), "--health"))
    if health_run.exit_code != 0:
        return DoctorCheck(
            "mathjax",
            "warn",
            "Local MathJax render checker is unavailable.",
            (
                node_detail,
                helper_detail,
                f"exit code: {health_run.exit_code}",
                _trim_detail(health_run.stderr),
            ),
        )

    summary = [node_detail, helper_detail]
    node_version = _first_non_empty_line(version_run.stdout) or _first_non_empty_line(version_run.stderr)
    if node_version is not None:
        summary.append(f"node version: {node_version}")
    return DoctorCheck("mathjax", "pass", "Local MathJax render checker is available.", tuple(summary))
|
||||
|
||||
|
||||
def _check_local_only_policy() -> DoctorCheck:
    """Return the always-passing check that states the local-only runtime policy."""
    allowed = "allowed: mineru CLI without --api-url, including its temporary local mineru-api process"
    prohibited = "prohibited: --api-url, remote APIs, router mode, HTTP client backends, remote OpenAI-compatible backends"
    return DoctorCheck(
        "local-only",
        "pass",
        "Runtime conversion is restricted to direct local mineru CLI execution.",
        (allowed, prohibited),
    )
|
||||
|
||||
|
||||
def _default_mineru_probe(which: Which, run_command: CommandRunner) -> MinerUAdapter:
    """Build a MinerU adapter that runs commands through the doctor's runner.

    The doctor runner's result is copied field by field into the adapter's own
    ``CommandResult`` type.
    """

    def _bridge(command: tuple[str, ...]) -> CommandResult:
        outcome = run_command(command)
        return CommandResult(
            command=outcome.command,
            exit_code=outcome.exit_code,
            stdout=outcome.stdout,
            stderr=outcome.stderr,
        )

    return MinerUAdapter(which=which, runner=_bridge)
|
||||
|
||||
|
||||
def _decode_partial_output(value: object) -> str:
    """Return captured output as text, decoding bytes and mapping None to ""."""
    if isinstance(value, bytes):
        return value.decode(errors="replace")
    return value if isinstance(value, str) else ""


def _run_command(command: tuple[str, ...]) -> DoctorCommandResult:
    """Run ``command`` with a 20 second timeout and capture its output.

    Failures are reported through the result instead of being raised, so one
    broken tool cannot abort the whole doctor run.

    Args:
        command: Argument vector to execute (no shell is involved).

    Returns:
        The command's exit code and captured text. Exit code 127 means the
        executable was not found, 126 means it could not be started for any
        other OS-level reason, and 124 means it timed out.
    """
    try:
        completed = subprocess.run(
            command,
            check=False,
            capture_output=True,
            text=True,
            # Tool output is not guaranteed to match the locale encoding;
            # replace undecodable bytes rather than raising.
            errors="replace",
            timeout=20,
        )
    except FileNotFoundError as error:
        return DoctorCommandResult(command=command, exit_code=127, stderr=str(error))
    except OSError as error:
        # For example a permission problem or a file that is not executable.
        return DoctorCommandResult(command=command, exit_code=126, stderr=str(error))
    except subprocess.TimeoutExpired as error:
        # Output captured before a timeout is delivered as bytes even when
        # text=True, so it has to be decoded here to be kept.
        stdout = _decode_partial_output(error.stdout)
        stderr = _decode_partial_output(error.stderr)
        return DoctorCommandResult(command=command, exit_code=124, stdout=stdout, stderr=stderr or "command timed out")

    return DoctorCommandResult(
        command=command,
        exit_code=completed.returncode,
        stdout=completed.stdout,
        stderr=completed.stderr,
    )
|
||||
|
||||
|
||||
def _first_non_empty_line(value: str) -> str | None:
|
||||
for line in value.splitlines():
|
||||
stripped = line.strip()
|
||||
if stripped:
|
||||
return stripped
|
||||
return None
|
||||
|
||||
|
||||
def _has_target_mineru_version(value: str) -> bool:
    """Tell whether ``value`` contains the target MinerU version.

    The version must not be directly preceded or followed by another digit.
    """
    pattern = rf"(?<!\d){re.escape(TARGET_MINERU_VERSION)}(?!\d)"
    return bool(re.search(pattern, value))
|
||||
|
||||
|
||||
def _trim_detail(value: str) -> str:
|
||||
stripped = " ".join(value.split())
|
||||
if not stripped:
|
||||
return "stderr: <empty>"
|
||||
return f"stderr: {stripped[:240]}"
|
||||
|
||||
|
||||
def _is_pascal_or_pre_turing(value: str) -> bool:
|
||||
normalized = value.casefold()
|
||||
risky_tokens = (
|
||||
"gtx 10",
|
||||
"gtx 9",
|
||||
"gtx 8",
|
||||
"gtx 7",
|
||||
"gtx 6",
|
||||
"gtx 5",
|
||||
"tesla p",
|
||||
"quadro p",
|
||||
"pascal",
|
||||
)
|
||||
return any(token in normalized for token in risky_tokens)
|
||||
|
||||
|
||||
def _safe_int_call(function: object) -> int | None:
|
||||
if not callable(function):
|
||||
return None
|
||||
try:
|
||||
return int(function())
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def _expand_path(value: str) -> Path:
|
||||
return Path(os.path.expandvars(value)).expanduser()
|
||||
@@ -0,0 +1,234 @@
|
||||
"""Project-owned domain records for documents, blocks, assets, and warnings."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from dataclasses import dataclass, field
|
||||
from enum import StrEnum
|
||||
from pathlib import Path
|
||||
from typing import Iterable, TypeVar
|
||||
|
||||
|
||||
BBox = tuple[float, float, float, float]
|
||||
MarkdownSpan = tuple[int, int]
|
||||
PathLike = str | os.PathLike[str]
|
||||
TStrEnum = TypeVar("TStrEnum", bound=StrEnum)
|
||||
|
||||
|
||||
class BlockType(StrEnum):
|
||||
HEADING = "heading"
|
||||
PARAGRAPH = "paragraph"
|
||||
INLINE_FORMULA = "inline_formula"
|
||||
DISPLAY_FORMULA = "display_formula"
|
||||
TABLE = "table"
|
||||
FIGURE = "figure"
|
||||
CAPTION = "caption"
|
||||
FOOTNOTE = "footnote"
|
||||
REFERENCE = "reference"
|
||||
UNKNOWN = "unknown"
|
||||
|
||||
|
||||
class WarningCode(StrEnum):
|
||||
ENGINE_MISSING = "ENGINE_MISSING"
|
||||
GPU_UNAVAILABLE = "GPU_UNAVAILABLE"
|
||||
LOW_CONFIDENCE_FORMULA = "LOW_CONFIDENCE_FORMULA"
|
||||
MATH_RENDER_FAILED = "MATH_RENDER_FAILED"
|
||||
ASSET_LINK_MISSING = "ASSET_LINK_MISSING"
|
||||
ASSET_LINK_INVALID = "ASSET_LINK_INVALID"
|
||||
READING_ORDER_UNCERTAIN = "READING_ORDER_UNCERTAIN"
|
||||
STRICT_LOCAL_VIOLATION = "STRICT_LOCAL_VIOLATION"
|
||||
MINERU_CLI_FAILED = "MINERU_CLI_FAILED"
|
||||
TABLE_FALLBACK = "TABLE_FALLBACK"
|
||||
|
||||
|
||||
class WarningSeverity(StrEnum):
|
||||
INFO = "info"
|
||||
WARNING = "warning"
|
||||
ERROR = "error"
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class BlockRecord:
    """Immutable description of one content block."""

    # Kind of block; a plain string value is coerced to BlockType on creation.
    block_type: BlockType
    # Optional location and quality information.
    page_index: int | None = None
    bbox: BBox | None = None
    confidence: float | None = None
    markdown_span: MarkdownSpan | None = None

    def __post_init__(self) -> None:
        """Coerce ``block_type`` and validate every optional field."""
        coerced = _coerce_enum(BlockType, self.block_type, "block_type")
        # The dataclass is frozen, so the coerced value is written via object.
        object.__setattr__(self, "block_type", coerced)
        _validate_optional_page_index(self.page_index)
        _validate_optional_bbox(self.bbox)
        _validate_optional_confidence(self.confidence)
        _validate_optional_markdown_span(self.markdown_span)

    def to_dict(self) -> dict[str, object]:
        """Return a plain dict; fields that are None are left out."""
        bbox = None if self.bbox is None else list(self.bbox)
        span = None if self.markdown_span is None else list(self.markdown_span)
        payload: dict[str, object] = {"type": self.block_type.value}
        optional_fields = (
            ("page_index", self.page_index),
            ("bbox", bbox),
            ("confidence", self.confidence),
            ("markdown_span", span),
        )
        for key, value in optional_fields:
            _add_optional(payload, key, value)
        return payload
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class PageRecord:
    """Immutable description of one page and the blocks on it."""

    # Non-negative page index.
    page_index: int
    # Optional page dimensions; must be positive when given.
    width: float | None = None
    height: float | None = None
    # Blocks on the page; any iterable is normalized to a tuple on creation.
    blocks: tuple[BlockRecord, ...] = field(default_factory=tuple)

    def __post_init__(self) -> None:
        """Validate the index and dimensions and normalize ``blocks``."""
        _validate_page_index(self.page_index)
        _validate_optional_positive_number(self.width, "width")
        _validate_optional_positive_number(self.height, "height")
        normalized = _tuple_of(BlockRecord, self.blocks, "blocks")
        object.__setattr__(self, "blocks", normalized)

    def to_dict(self) -> dict[str, object]:
        """Return a plain dict; width and height are left out when None."""
        payload: dict[str, object] = {"page_index": self.page_index}
        payload["blocks"] = [block.to_dict() for block in self.blocks]
        for key, value in (("width", self.width), ("height", self.height)):
            _add_optional(payload, key, value)
        return payload
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class AssetRecord:
    """Immutable description of one extracted asset file."""

    # Path of the asset; validated by _validate_relative_asset_path.
    relative_path: str
    # Optional location of the asset in the source document.
    page_index: int | None = None
    bbox: BBox | None = None

    def __post_init__(self) -> None:
        """Validate the path, page index and bounding box."""
        _validate_relative_asset_path(self.relative_path)
        _validate_optional_page_index(self.page_index)
        _validate_optional_bbox(self.bbox)

    def to_dict(self) -> dict[str, object]:
        """Return a plain dict; fields that are None are left out."""
        bbox = None if self.bbox is None else list(self.bbox)
        payload: dict[str, object] = {"relative_path": self.relative_path}
        for key, value in (("page_index", self.page_index), ("bbox", bbox)):
            _add_optional(payload, key, value)
        return payload
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class WarningRecord:
    """Immutable warning raised while processing a document."""

    # Code and severity; plain string values are coerced to the enums.
    code: WarningCode
    severity: WarningSeverity
    # Human-readable text; must not be empty.
    message: str
    # Optional location the warning refers to.
    page_index: int | None = None
    bbox: BBox | None = None

    def __post_init__(self) -> None:
        """Coerce the enum fields and validate the remaining ones.

        Raises:
            ValueError: If the message is empty or a field is invalid.
        """
        for attribute, enum_type in (("code", WarningCode), ("severity", WarningSeverity)):
            coerced = _coerce_enum(enum_type, getattr(self, attribute), attribute)
            object.__setattr__(self, attribute, coerced)
        if not self.message:
            raise ValueError("message is required")
        _validate_optional_page_index(self.page_index)
        _validate_optional_bbox(self.bbox)

    def to_dict(self) -> dict[str, object]:
        """Return a plain dict; page_index and bbox are left out when None."""
        bbox = None if self.bbox is None else list(self.bbox)
        payload: dict[str, object] = {
            "code": self.code.value,
            "severity": self.severity.value,
            "message": self.message,
        }
        for key, value in (("page_index", self.page_index), ("bbox", bbox)):
            _add_optional(payload, key, value)
        return payload
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class DocumentRecord:
    """Immutable description of a converted document."""

    # Source PDF location; stored as a Path after creation.
    source_pdf: PathLike
    # Page records; at least one is required.
    pages: tuple[PageRecord, ...]
    # Assets and warnings; both default to empty.
    assets: tuple[AssetRecord, ...] = field(default_factory=tuple)
    warnings: tuple[WarningRecord, ...] = field(default_factory=tuple)

    def __post_init__(self) -> None:
        """Normalize the fields and check their element types.

        Raises:
            ValueError: If ``source_pdf`` is empty or there are no pages.
            TypeError: If a collection holds an item of the wrong type.
        """
        if not str(self.source_pdf):
            raise ValueError("source_pdf is required")
        object.__setattr__(self, "source_pdf", Path(self.source_pdf))

        page_records = _tuple_of(PageRecord, self.pages, "pages")
        if not page_records:
            raise ValueError("at least one page record is required")
        object.__setattr__(self, "pages", page_records)

        asset_records = _tuple_of(AssetRecord, self.assets, "assets")
        object.__setattr__(self, "assets", asset_records)
        warning_records = _tuple_of(WarningRecord, self.warnings, "warnings")
        object.__setattr__(self, "warnings", warning_records)

    def to_dict(self) -> dict[str, object]:
        """Return a plain dict with the source path as a string."""
        payload: dict[str, object] = {"source_pdf": str(self.source_pdf)}
        payload["pages"] = [page.to_dict() for page in self.pages]
        payload["assets"] = [asset.to_dict() for asset in self.assets]
        payload["warnings"] = [warning.to_dict() for warning in self.warnings]
        return payload
|
||||
|
||||
|
||||
def iter_blocks(pages: Iterable[PageRecord]) -> Iterable[BlockRecord]:
    """Yield every block of every page, in page order."""
    for page in pages:
        for block in page.blocks:
            yield block
|
||||
|
||||
|
||||
def _coerce_enum(enum_type: type[TStrEnum], value: object, field_name: str) -> TStrEnum:
|
||||
try:
|
||||
return enum_type(value)
|
||||
except ValueError as error:
|
||||
allowed = ", ".join(member.value for member in enum_type)
|
||||
raise ValueError(f"invalid {field_name}: {value!r}; expected one of: {allowed}") from error
|
||||
|
||||
|
||||
def _add_optional(data: dict[str, object], key: str, value: object | None) -> None:
|
||||
if value is not None:
|
||||
data[key] = value
|
||||
|
||||
|
||||
def _tuple_of(expected_type: type[object], values: Iterable[object], field_name: str) -> tuple:
|
||||
items = tuple(values)
|
||||
if any(not isinstance(item, expected_type) for item in items):
|
||||
raise TypeError(f"{field_name} must contain only {expected_type.__name__} values")
|
||||
return items
|
||||
|
||||
|
||||
def _validate_page_index(page_index: int) -> None:
|
||||
if not isinstance(page_index, int) or page_index < 0:
|
||||
raise ValueError("page_index must be a non-negative integer")
|
||||
|
||||
|
||||
def _validate_optional_page_index(page_index: int | None) -> None:
|
||||
if page_index is not None:
|
||||
_validate_page_index(page_index)
|
||||
|
||||
|
||||
def _validate_optional_bbox(bbox: BBox | None) -> None:
|
||||
if bbox is None:
|
||||
return
|
||||
if len(bbox) != 4 or any(not isinstance(value, int | float) for value in bbox):
|
||||
raise ValueError("bbox must contain four numeric values")
|
||||
|
||||
|
||||
def _validate_optional_confidence(confidence: float | None) -> None:
|
||||
if confidence is not None and not 0.0 <= confidence <= 1.0:
|
||||
raise ValueError("confidence must be between 0.0 and 1.0")
|
||||
|
||||
|
||||
def _validate_optional_markdown_span(markdown_span: MarkdownSpan | None) -> None:
|
||||
if markdown_span is None:
|
||||
return
|
||||
if len(markdown_span) != 2:
|
||||
raise ValueError("markdown_span must contain start and end offsets")
|
||||
start, end = markdown_span
|
||||
if not isinstance(start, int) or not isinstance(end, int) or start < 0 or end < start:
|
||||
raise ValueError("markdown_span must be non-negative and ordered")
|
||||
|
||||
|
||||
def _validate_optional_positive_number(value: float | None, field_name: str) -> None:
|
||||
if value is not None and value <= 0:
|
||||
raise ValueError(f"{field_name} must be positive when provided")
|
||||
|
||||
|
||||
def _validate_relative_asset_path(path: str) -> None:
|
||||
candidate = Path(path)
|
||||
if not path or candidate.is_absolute() or candidate.drive or candidate.root or ".." in candidate.parts:
|
||||
raise ValueError(f"asset path must be relative and stay within the assets directory: {path}")
|
||||
@@ -0,0 +1,422 @@
|
||||
"""Obsidian Markdown normalization helpers."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path, PurePosixPath, PureWindowsPath
|
||||
|
||||
from pdf2md.ir import PathLike, WarningCode, WarningRecord, WarningSeverity
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class MarkdownNormalizationResult:
    """Immutable result of normalizing one Markdown text."""

    # The normalized Markdown.
    markdown: str
    # Warnings produced during normalization; empty by default.
    warnings: tuple[WarningRecord, ...] = ()
    # Asset links collected during normalization; empty by default.
    asset_links: tuple[str, ...] = ()
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class _Segment:
|
||||
text: str
|
||||
protected: bool
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class _AssetContext:
    """Filesystem settings shared by the asset-link normalization helpers."""

    # Directory that relative link targets are resolved against, if known.
    markdown_dir: Path | None
    # Directory that resolved assets are expected to stay inside, if known.
    asset_root: Path | None
    # When True, link targets are checked on disk.
    check_assets: bool
|
||||
|
||||
|
||||
# Opening code fence: up to three leading spaces, then 3+ backticks or tildes.
_FENCE_START_RE = re.compile(r"^(?P<indent> {0,3})(?P<fence>`{3,}|~{3,}).*$")

# TeX inline math written as \( ... \); the body may not span lines.
_INLINE_MATH_RE = re.compile(r"\\\((?P<body>[^\n]*?)\\\)")

# TeX display math written as \[ ... \]; the body may span lines.
_DISPLAY_BRACKET_RE = re.compile(r"\\\[(?P<body>.*?)\\\]", re.DOTALL)

# Display math written as $$ ... $$ where neither delimiter is backslash-escaped.
_DISPLAY_DOLLAR_RE = re.compile(r"(?<!\\)\$\$(?P<body>.*?)(?<!\\)\$\$", re.DOTALL)

# Single-line Markdown image: ![alt](target).
_IMAGE_LINK_RE = re.compile(r"!\[(?P<alt>[^\]\n]*)\]\((?P<target>[^)\n]+)\)")

# A whole <table>...</table> element, matched case-insensitively across lines.
_HTML_TABLE_RE = re.compile(r"<table\b.*?</table>", re.IGNORECASE | re.DOTALL)

# A leading URI-scheme-like prefix such as "https:" or "data:".
_SCHEME_RE = re.compile(r"^[A-Za-z][A-Za-z0-9+.-]*:")
|
||||
|
||||
|
||||
def normalize_markdown(
    raw_markdown: str,
    *,
    markdown_dir: PathLike | None = None,
    asset_root: PathLike | None = None,
    check_assets: bool = False,
) -> MarkdownNormalizationResult:
    """Normalize local Markdown-like text for Obsidian without writing files.

    Fenced code blocks and inline code spans are copied through unchanged;
    all remaining text is passed to ``_normalize_plain_segment``.

    Args:
        raw_markdown: The Markdown text to normalize.
        markdown_dir: Optional directory of the Markdown file, used when
            handling asset links.
        asset_root: Optional directory that assets are expected to live under.
        check_assets: Forwarded to the asset helpers to enable on-disk checks.

    Returns:
        The normalized text plus the warnings and asset links collected.

    Raises:
        TypeError: if ``raw_markdown`` is not a string.
    """
    if not isinstance(raw_markdown, str):
        raise TypeError("raw_markdown must be a string")

    context = _AssetContext(
        markdown_dir=None if markdown_dir is None else Path(markdown_dir),
        asset_root=None if asset_root is None else Path(asset_root),
        check_assets=check_assets,
    )
    pieces: list[str] = []
    collected_warnings: list[WarningRecord] = []
    collected_assets: list[str] = []

    for block in _split_fenced_code(raw_markdown):
        # A fenced block is one protected span; anything else is split further
        # so inline code spans can be protected too.
        spans = (block,) if block.protected else _split_inline_code(block.text)
        for span in spans:
            if span.protected:
                pieces.append(span.text)
                continue
            rewritten, span_warnings, span_assets = _normalize_plain_segment(span.text, context)
            pieces.append(rewritten)
            collected_warnings.extend(span_warnings)
            collected_assets.extend(span_assets)

    return MarkdownNormalizationResult(
        markdown="".join(pieces),
        warnings=tuple(collected_warnings),
        asset_links=tuple(collected_assets),
    )
|
||||
|
||||
|
||||
def _normalize_plain_segment(text: str, context: _AssetContext) -> tuple[str, tuple[WarningRecord, ...], tuple[str, ...]]:
    """Normalize text that lies outside code fences and inline code spans.

    Table segments are copied through unchanged and only inspected for
    warnings. Every other segment has its math normalized first and its
    image links second.

    Returns:
        ``(normalized_text, warnings, asset_links)``.
    """
    pieces: list[str] = []
    collected_warnings: list[WarningRecord] = []
    collected_assets: list[str] = []

    for part in _split_tables(text):
        if part.protected:
            pieces.append(part.text)
            collected_warnings.extend(_table_warnings(part.text))
            continue
        with_math = _normalize_math(part.text)
        rewritten, link_warnings, links = _normalize_asset_links(with_math, context)
        pieces.append(rewritten)
        collected_warnings.extend(link_warnings)
        collected_assets.extend(links)

    return "".join(pieces), tuple(collected_warnings), tuple(collected_assets)
|
||||
|
||||
|
||||
def _split_fenced_code(text: str) -> tuple[_Segment, ...]:
    """Split text into fenced code blocks (protected) and everything else.

    Lines are scanned in order. A line matching ``_FENCE_START_RE`` opens a
    fence, and the fence runs until ``_is_closing_fence`` accepts a line. A
    fence that is never closed is protected through to the end of the text.
    Line terminators are kept, so joining the segment texts restores the input.
    """
    segments: list[_Segment] = []
    buffer: list[str] = []
    # (fence character, fence length) while inside a fence, otherwise None.
    open_fence: tuple[str, int] | None = None

    for line in text.splitlines(keepends=True):
        content = line.rstrip("\r\n")

        if open_fence is not None:
            buffer.append(line)
            if _is_closing_fence(content, *open_fence):
                segments.append(_Segment("".join(buffer), protected=True))
                buffer = []
                open_fence = None
            continue

        opener = _FENCE_START_RE.match(content)
        if opener is None:
            buffer.append(line)
            continue

        # A fence starts here: emit the plain text gathered so far.
        if buffer:
            segments.append(_Segment("".join(buffer), protected=False))
        marker = opener.group("fence")
        open_fence = (marker[0], len(marker))
        buffer = [line]

    if buffer:
        # Leftover lines belong to an unclosed fence or to trailing plain text.
        segments.append(_Segment("".join(buffer), protected=open_fence is not None))
    return tuple(segments)
|
||||
|
||||
|
||||
def _is_closing_fence(line: str, fence_char: str, fence_length: int) -> bool:
    """Report whether ``line`` closes a fence opened with ``fence_char``.

    A closing fence has at most three leading spaces, then a single unbroken
    run of at least ``fence_length`` fence characters, then optional trailing
    whitespace and nothing else.

    Args:
        line: The line without its terminator.
        fence_char: The backtick or tilde that opened the fence.
        fence_length: Length of the opening fence run.
    """
    stripped = line.lstrip(" ")
    if len(line) - len(stripped) > 3:
        return False
    # Require one contiguous run. Stripping fence characters from both ends
    # used to accept lines such as "``` ```", which are ordinary code content.
    run = stripped.rstrip()
    return len(run) >= fence_length and run == fence_char * len(run)
|
||||
|
||||
|
||||
def _split_inline_code(text: str) -> tuple[_Segment, ...]:
    """Split text into inline code spans (protected) and plain text.

    A span starts at a run of backticks and ends at the next occurrence of a
    backtick string of the same length. An opening run with no such closer is
    left as plain text.
    """
    segments: list[_Segment] = []
    size = len(text)
    cursor = 0
    pending_plain = 0  # start of plain text not yet emitted

    while cursor < size:
        if text[cursor] != "`":
            cursor += 1
            continue

        ticks = _count_run(text, cursor, "`")
        close_at = text.find("`" * ticks, cursor + ticks)
        if close_at < 0:
            # Unmatched opener: skip the run and keep scanning.
            cursor += ticks
            continue

        if cursor > pending_plain:
            segments.append(_Segment(text[pending_plain:cursor], protected=False))
        span_end = close_at + ticks
        segments.append(_Segment(text[cursor:span_end], protected=True))
        cursor = pending_plain = span_end

    if pending_plain < size:
        segments.append(_Segment(text[pending_plain:], protected=False))
    return tuple(segments)
|
||||
|
||||
|
||||
def _count_run(text: str, start: int, char: str) -> int:
    """Count consecutive occurrences of ``char`` in ``text`` from ``start``."""
    length = 0
    for position in range(start, len(text)):
        if text[position] != char:
            break
        length += 1
    return length
|
||||
|
||||
|
||||
def _normalize_math(text: str) -> str:
    """Rewrite TeX math delimiters and tidy the spacing around display math.

    The substitutions run in a fixed order: ``\\[...\\]`` blocks, then
    ``$$...$$`` blocks, then ``\\(...\\)`` inline math. The result is then
    passed through ``_normalize_display_math_spacing``.
    """

    def as_display(match: re.Match[str]) -> str:
        return _display_block(match.group("body"))

    def as_inline(match: re.Match[str]) -> str:
        return _inline_math(match.group("body"))

    result = text
    for pattern, render in (
        (_DISPLAY_BRACKET_RE, as_display),
        (_DISPLAY_DOLLAR_RE, as_display),
        (_INLINE_MATH_RE, as_inline),
    ):
        result = pattern.sub(render, result)
    return _normalize_display_math_spacing(result)
|
||||
|
||||
|
||||
def _inline_math(body: str) -> str:
    """Wrap a math body in single dollars; an empty body stays as ``\\(\\)``."""
    return f"${body}$" if body else r"\(\)"
|
||||
|
||||
|
||||
def _display_block(body: str) -> str:
    """Render a display-math body as a ``$$`` block framed by blank lines.

    Surrounding whitespace is stripped from the body, and each ``$$``
    delimiter is placed on a line of its own.
    """
    return "".join(("\n\n$$\n", body.strip(), "\n$$\n\n"))
|
||||
|
||||
|
||||
def _normalize_display_math_spacing(text: str) -> str:
    """Put exactly one blank line around each ``$$`` display-math block.

    Only lines consisting solely of ``$$`` (ignoring surrounding whitespace)
    count as delimiters, and they alternate between opening and closing.
    Blank lines before an opener collapse to one, or to none at the start of
    the text. Blank lines after a closer collapse to one, or to none when
    nothing else follows. Text without ``$$`` is returned unchanged.
    """
    if "$$" not in text:
        return text

    # NOTE(review): splitlines() drops the line terminators and the final
    # join uses "\n", so a trailing newline on the input is not reproduced
    # and other terminators become "\n". Confirm callers expect this.
    result: list[str] = []
    inside_math = False
    after_closer = False  # True while skipping blank lines after a closing $$

    for line in text.splitlines():
        if after_closer:
            if line.strip() == "":
                continue
            # More content follows the block: separate it with one blank line.
            result.append("")
            after_closer = False

        if line.strip() != "$$":
            result.append(line)
            continue

        if inside_math:
            result.append("$$")
            inside_math = False
            after_closer = True
            continue

        # Opening delimiter: collapse any blank lines gathered before it.
        while result and result[-1].strip() == "":
            result.pop()
        if result:
            result.append("")
        result.append("$$")
        inside_math = True

    return "\n".join(result)
|
||||
|
||||
|
||||
def _split_tables(text: str) -> tuple[_Segment, ...]:
    """Split text into table segments (protected) and other text.

    Each HTML ``<table>`` element becomes one protected segment. The text
    before, between and after them goes to ``_split_pipe_tables``.
    """
    segments: list[_Segment] = []
    consumed = 0

    for table in _HTML_TABLE_RE.finditer(text):
        table_start, table_end = table.span()
        if table_start > consumed:
            segments.extend(_split_pipe_tables(text[consumed:table_start]))
        segments.append(_Segment(table.group(0), protected=True))
        consumed = table_end

    if consumed < len(text):
        segments.extend(_split_pipe_tables(text[consumed:]))
    return tuple(segments)
|
||||
|
||||
|
||||
def _split_pipe_tables(text: str) -> tuple[_Segment, ...]:
    """Split text into pipe-table segments (protected) and plain text.

    A table starts where ``_starts_pipe_table`` says so and extends over every
    following line that contains a ``|``. Line terminators are preserved.
    """
    lines = text.splitlines(keepends=True)
    total = len(lines)
    segments: list[_Segment] = []
    pending: list[str] = []
    position = 0

    while position < total:
        if not _starts_pipe_table(lines, position):
            pending.append(lines[position])
            position += 1
            continue

        if pending:
            segments.append(_Segment("".join(pending), protected=False))
            pending = []

        table_end = position
        while table_end < total and "|" in lines[table_end]:
            table_end += 1
        segments.append(_Segment("".join(lines[position:table_end]), protected=True))
        position = table_end

    if pending:
        segments.append(_Segment("".join(pending), protected=False))
    return tuple(segments)
|
||||
|
||||
|
||||
def _starts_pipe_table(lines: list[str], index: int) -> bool:
    """Report whether ``lines[index]`` is a pipe-table header row.

    That requires a ``|`` on the line itself and a separator row directly
    after it.
    """
    following = index + 1
    if following >= len(lines):
        return False
    if "|" not in lines[index]:
        return False
    return _is_pipe_separator(lines[following])
|
||||
|
||||
|
||||
def _is_pipe_separator(line: str) -> bool:
    """Report whether a line is a pipe-table separator row such as ``|---|:-:|``.

    After trimming, the line must contain a ``|`` and a ``-`` and consist only
    of pipes, colons, hyphens and spaces.
    """
    candidate = line.strip()
    if "|" in candidate and "-" in candidate:
        return set(candidate) <= set("|:- ")
    return False
|
||||
|
||||
|
||||
def _normalize_asset_links(text: str, context: _AssetContext) -> tuple[str, tuple[WarningRecord, ...], tuple[str, ...]]:
    """Rewrite the target of every Markdown image link in ``text``.

    Each ``![alt](target)`` match has its target passed through
    ``_normalize_asset_target`` and is re-emitted with the same alt text and
    the normalized target.

    Returns:
        ``(rewritten_text, warnings, asset_links)``, where ``asset_links``
        lists the normalized targets in the order they were found.
    """
    warnings: list[WarningRecord] = []
    asset_links: list[str] = []

    def replace(match: re.Match[str]) -> str:
        alt = match.group("alt")
        normalized_target, target_warnings = _normalize_asset_target(match.group("target"), context)
        warnings.extend(target_warnings)
        asset_links.append(normalized_target)
        # Re-emit the image. Returning an empty string here deleted every
        # image from the output while still reporting it in asset_links.
        return f"![{alt}]({normalized_target})"

    return _IMAGE_LINK_RE.sub(replace, text), tuple(warnings), tuple(asset_links)
|
||||
|
||||
|
||||
def _normalize_asset_target(target: str, context: _AssetContext) -> tuple[str, tuple[WarningRecord, ...]]:
    """Turn one image link target into a safe relative path where possible.

    The cases are checked in this order:

    * a remote target (URI scheme) is returned as written, with a warning;
    * an absolute or rooted path is rewritten as relative, with a warning;
    * a path containing ``..`` is reduced to its file name, with a warning;
    * anything else only has its backslashes converted to forward slashes.

    Every case except the remote one also runs the missing-asset check.

    Returns:
        ``(normalized_target, warnings)``.
    """
    issues: list[WarningRecord] = []
    original = target.strip()
    bare = _unwrap_angle_target(original)

    def flag_invalid(message: str) -> None:
        issues.append(_warning(WarningCode.ASSET_LINK_INVALID, message))

    if _is_remote_target(bare):
        flag_invalid(f"Remote asset link was left unchanged: {bare}")
        return original, tuple(issues)

    if _is_absolute_or_rooted_path(bare):
        rewritten = _absolute_target_to_relative(bare, context)
        flag_invalid(f"Absolute asset link was rewritten as relative: {bare}")
        _append_missing_asset_warning(rewritten, context, issues)
        return rewritten, tuple(issues)

    forward = bare.replace("\\", "/")
    if _escapes_parent(forward):
        file_name = _path_name(forward)
        flag_invalid(f"Escaping asset link was rewritten as relative: {bare}")
        _append_missing_asset_warning(file_name, context, issues)
        return file_name, tuple(issues)

    _append_missing_asset_warning(forward, context, issues)
    return forward, tuple(issues)
|
||||
|
||||
|
||||
def _unwrap_angle_target(target: str) -> str:
    """Strip a surrounding ``<...>`` pair and trim the text inside it.

    A target that is not wrapped in angle brackets is returned unchanged.
    """
    wrapped = target.startswith("<") and target.endswith(">")
    return target[1:-1].strip() if wrapped else target
|
||||
|
||||
|
||||
def _is_remote_target(target: str) -> bool:
    """Report whether a target starts with a URI scheme.

    A Windows drive prefix such as ``C:`` also looks like a scheme, so targets
    that parse with a Windows drive are ruled out first.
    """
    return not PureWindowsPath(target).drive and _SCHEME_RE.match(target) is not None
|
||||
|
||||
|
||||
def _is_absolute_or_rooted_path(target: str) -> bool:
    """Report whether a target has a drive or root under Windows or POSIX rules.

    Both path flavours are checked, so the answer does not depend on the
    platform running the code.
    """
    as_windows = PureWindowsPath(target)
    as_posix = PurePosixPath(target)
    return any((as_windows.drive, as_windows.root, as_posix.root))
|
||||
|
||||
|
||||
def _absolute_target_to_relative(target: str, context: _AssetContext) -> str:
    """Express an absolute target relative to the Markdown directory.

    The bare file name is returned instead when no Markdown directory is
    configured, when ``os.path.relpath`` raises ``ValueError``, or when the
    relative path would contain ``..`` or still be rooted.
    """
    base_dir = context.markdown_dir
    if base_dir is None:
        return _path_name(target)

    try:
        candidate = os.path.relpath(Path(target), base_dir)
        stays_inside = not _escapes_parent(candidate) and not _is_absolute_or_rooted_path(candidate)
        if stays_inside:
            return candidate.replace("\\", "/")
    except ValueError:
        # No relative path could be computed; use the file name below.
        pass
    return _path_name(target)
|
||||
|
||||
|
||||
def _path_name(target: str) -> str:
    """Return the final path component of ``target``.

    Windows parsing is tried first, then POSIX. If both give an empty name the
    placeholder ``"asset"`` is returned.
    """
    for flavour in (PureWindowsPath, PurePosixPath):
        name = flavour(target).name
        if name:
            return name
    return "asset"
|
||||
|
||||
|
||||
def _escapes_parent(target: str) -> bool:
    """Report whether any component of ``target`` is ``..``.

    Backslashes are treated as separators before the path is split.
    """
    unified = target.replace("\\", "/")
    return any(part == ".." for part in PurePosixPath(unified).parts)
|
||||
|
||||
|
||||
def _append_missing_asset_warning(target: str, context: _AssetContext, warnings: list[WarningRecord]) -> None:
    """Append a warning when a link target is outside the asset root or missing.

    Nothing happens unless ``check_assets`` is on and a Markdown directory is
    configured. The target is resolved against that directory. A target
    outside the asset root is reported as invalid and not checked further;
    otherwise a target that does not exist is reported as missing.

    Args:
        target: The relative link target.
        context: Directories and the ``check_assets`` switch.
        warnings: List that receives the warning; mutated in place.
    """
    base_dir = context.markdown_dir
    if base_dir is None or not context.check_assets:
        return

    resolved = (base_dir / target).resolve()
    root = context.asset_root
    if root is not None and not _is_relative_to(resolved, root.resolve()):
        warnings.append(_warning(WarningCode.ASSET_LINK_INVALID, f"Asset link is outside the asset root: {target}"))
        return
    if not resolved.exists():
        warnings.append(_warning(WarningCode.ASSET_LINK_MISSING, f"Asset link target does not exist: {target}"))
|
||||
|
||||
|
||||
def _is_relative_to(path: Path, root: Path) -> bool:
    """Report whether ``path`` is ``root`` itself or lies beneath it."""
    try:
        path.relative_to(root)
    except ValueError:
        inside = False
    else:
        inside = True
    return inside
|
||||
|
||||
|
||||
def _table_warnings(text: str) -> tuple[WarningRecord, ...]:
    """Return one fallback warning per complex HTML table found in ``text``.

    Whether a table is complex is decided by ``_is_complex_html_table``.
    """
    return tuple(
        _warning(
            WarningCode.TABLE_FALLBACK,
            "Complex HTML table was preserved instead of simplified to Markdown.",
        )
        for found in _HTML_TABLE_RE.finditer(text)
        if _is_complex_html_table(found.group(0))
    )
|
||||
|
||||
|
||||
def _is_complex_html_table(table: str) -> bool:
    """Report whether an HTML table uses features treated as complex.

    A table is complex if it has a ``rowspan`` or ``colspan`` (any case),
    contains more than one ``<table`` opening, or contains a math marker
    (``\\(``, ``\\[`` or ``$``).
    """
    lowered = table.casefold()
    if "rowspan" in lowered or "colspan" in lowered:
        return True
    if lowered.count("<table") > 1:
        return True
    return any(marker in table for marker in ("\\(", "\\[", "$"))
|
||||
|
||||
|
||||
def _warning(code: WarningCode, message: str) -> WarningRecord:
    """Build a warning-severity ``WarningRecord`` for ``code`` and ``message``."""
    severity = WarningSeverity.WARNING
    return WarningRecord(code, severity, message)
|
||||
@@ -0,0 +1,162 @@
|
||||
"""Local MathJax renderability checker."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import shutil
|
||||
import subprocess
|
||||
from collections.abc import Callable
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
from pdf2md.quality import MathCheckerUnavailable, MathCheckResult, MathExpression
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class MathJaxCommandResult:
    """Immutable record of one helper-process invocation."""

    # The argument vector that was run.
    command: tuple[str, ...]
    # The exit status reported for the run.
    exit_code: int
    # Captured standard output; empty by default.
    stdout: str = ""
    # Captured standard error; empty by default.
    stderr: str = ""
|
||||
|
||||
|
||||
# Runs a command: (argv, stdin text, timeout in seconds) -> captured result.
MathJaxRunner = Callable[[tuple[str, ...], str, int], MathJaxCommandResult]
# Looks up an executable by name; returns its path or None (shutil.which shape).
Which = Callable[[str], str | None]

# Default timeout, in seconds, passed to the runner for one batch.
DEFAULT_TIMEOUT_SECONDS = 60
|
||||
|
||||
|
||||
class MathJaxRenderChecker:
    """Batch-check TeX expressions with a local Node.js MathJax helper."""

    def __init__(
        self,
        *,
        node_executable: str = "node",
        helper_path: Path | None = None,
        which: Which = shutil.which,
        runner: MathJaxRunner | None = None,
        timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS,
    ) -> None:
        """Configure the checker.

        Args:
            node_executable: Name or path handed to ``which`` to find Node.js.
            helper_path: Helper script to run; defaults to
                ``default_mathjax_helper_path()``.
            which: Executable lookup function.
            runner: Function that runs the helper; defaults to
                ``_run_node_helper``.
            timeout_seconds: Timeout handed to the runner.
        """
        self.node_executable = node_executable
        self.helper_path = helper_path or default_mathjax_helper_path()
        self.which = which
        self.runner = runner or _run_node_helper
        self.timeout_seconds = timeout_seconds

    def __call__(self, body: str) -> MathCheckResult:
        """Check one TeX body, submitted as an inline (non-display) expression."""
        single = MathExpression(index=0, body=body, display=False, markdown_span=(0, 0))
        (outcome,) = self.check_expressions((single,))[:1]
        return outcome

    def is_available(self) -> bool:
        """Return True when Node.js is found and the helper script exists."""
        if self._node_path() is None:
            return False
        return self.helper_path.exists()

    def check_expressions(self, expressions: tuple[MathExpression, ...]) -> tuple[MathCheckResult, ...]:
        """Check a batch of expressions with a single helper invocation.

        The batch is sent to the helper as a JSON document on standard input.

        Returns:
            One result per expression, in the order given. An empty batch
            returns an empty tuple without running anything.

        Raises:
            MathCheckerUnavailable: if Node.js or the helper script is
                missing, the helper exits non-zero, or its output cannot be
                parsed.
        """
        if not expressions:
            return ()

        node_path = self._node_path()
        if node_path is None:
            raise MathCheckerUnavailable("Node.js executable was not found")
        if not self.helper_path.exists():
            raise MathCheckerUnavailable(f"MathJax helper script was not found: {self.helper_path}")

        requests = [
            {"index": expression.index, "body": expression.body, "display": expression.display}
            for expression in expressions
        ]
        payload = json.dumps({"expressions": requests}, ensure_ascii=False)
        command = (node_path, str(self.helper_path))
        outcome = self.runner(command, payload, self.timeout_seconds)

        if outcome.exit_code != 0:
            # Prefer stderr, then stdout, then the bare exit code.
            detail = _trim_detail(outcome.stderr) or _trim_detail(outcome.stdout) or f"exit code {outcome.exit_code}"
            raise MathCheckerUnavailable(f"MathJax helper failed: {detail}")

        return _parse_results(outcome.stdout, expressions)

    def _node_path(self) -> str | None:
        """Look up the configured Node.js executable."""
        return self.which(self.node_executable)
|
||||
|
||||
|
||||
def create_default_math_checker() -> MathJaxRenderChecker | None:
    """Return a default local MathJax checker when Node and the helper are present."""
    candidate = MathJaxRenderChecker()
    if candidate.is_available():
        return candidate
    return None
|
||||
|
||||
|
||||
def default_mathjax_helper_path() -> Path:
    """Return the default location of the MathJax helper script.

    The path is ``tools/mathjax-checker/check.mjs`` under the directory two
    levels above the one containing this module.
    """
    base_dir = Path(__file__).resolve().parents[2]
    return base_dir.joinpath("tools", "mathjax-checker", "check.mjs")
|
||||
|
||||
|
||||
def _run_node_helper(command: tuple[str, ...], stdin: str, timeout_seconds: int) -> MathJaxCommandResult:
    """Run the helper process and capture its output as UTF-8 text.

    Failures to start or finish are reported through the exit code rather
    than raised: 127 when the executable is not found and 124 on timeout.

    Args:
        command: Argument vector to execute.
        stdin: Text written to the process's standard input.
        timeout_seconds: Maximum run time before the process is abandoned.
    """
    try:
        completed = subprocess.run(
            command,
            input=stdin,
            check=False,
            capture_output=True,
            text=True,
            encoding="utf-8",
            timeout=timeout_seconds,
        )
    except FileNotFoundError as error:
        return MathJaxCommandResult(command=command, exit_code=127, stderr=str(error))
    except subprocess.TimeoutExpired as error:
        # Partial output is kept only when it is already text; anything else
        # is dropped.
        partial_out, partial_err = (
            stream if isinstance(stream, str) else "" for stream in (error.stdout, error.stderr)
        )
        return MathJaxCommandResult(
            command=command,
            exit_code=124,
            stdout=partial_out,
            stderr=partial_err or "MathJax helper timed out",
        )

    return MathJaxCommandResult(
        command=command,
        exit_code=completed.returncode,
        stdout=completed.stdout,
        stderr=completed.stderr,
    )
|
||||
|
||||
|
||||
def _parse_results(stdout: str, expressions: tuple[MathExpression, ...]) -> tuple[MathCheckResult, ...]:
    """Decode the helper's JSON output into one result per expression.

    The output must be a JSON object with a ``results`` list. Each item needs
    an integer ``index`` and a boolean ``ok``, and may carry a string
    ``message``. The set of returned indexes must equal the set requested.

    Returns:
        Results ordered to match ``expressions``.

    Raises:
        MathCheckerUnavailable: if the output is malformed or incomplete.
    """
    try:
        payload = json.loads(stdout)
    except json.JSONDecodeError as error:
        raise MathCheckerUnavailable(f"MathJax helper returned invalid JSON: {error}") from error

    if not isinstance(payload, dict):
        raise MathCheckerUnavailable("MathJax helper returned a non-object JSON payload")
    raw_results = payload.get("results")
    if not isinstance(raw_results, list):
        raise MathCheckerUnavailable("MathJax helper JSON did not include a results list")

    results_by_index: dict[int, MathCheckResult] = {}
    for entry in raw_results:
        if not isinstance(entry, dict):
            raise MathCheckerUnavailable("MathJax helper returned a non-object result item")
        index = entry.get("index")
        if not isinstance(index, int):
            raise MathCheckerUnavailable("MathJax helper result is missing an integer index")
        ok = entry.get("ok")
        if not isinstance(ok, bool):
            raise MathCheckerUnavailable("MathJax helper result is missing a boolean ok field")
        message = entry.get("message", "")
        if not isinstance(message, str):
            raise MathCheckerUnavailable("MathJax helper result message must be a string")
        results_by_index[index] = MathCheckResult(ok=ok, message=message)

    requested = [expression.index for expression in expressions]
    if set(results_by_index) != set(requested):
        raise MathCheckerUnavailable("MathJax helper result indexes did not match the requested expressions")

    return tuple(results_by_index[index] for index in requested)
|
||||
|
||||
|
||||
def _trim_detail(value: str) -> str:
    """Collapse whitespace runs to single spaces and cap the text at 240 characters."""
    words = value.split()
    return " ".join(words)[:240]
|
||||
@@ -0,0 +1,84 @@
|
||||
"""Metadata JSON construction from project-owned records."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from collections.abc import Mapping
|
||||
from typing import Any
|
||||
|
||||
from pdf2md.ir import BlockType, DocumentRecord, WarningCode, WarningRecord, WarningSeverity, iter_blocks
|
||||
|
||||
|
||||
# A JSON object represented as plain Python data: string keys, any values.
JsonObject = dict[str, Any]
|
||||
|
||||
|
||||
class MetadataInputError(ValueError):
    """Signals that the supplied records cannot be turned into metadata.

    It subclasses ``ValueError``, so handlers for ``ValueError`` also catch it.
    """
|
||||
|
||||
|
||||
def build_metadata(
    *,
    document: DocumentRecord | None,
    source_sha256: str | None,
    created_at: str | None,
    engine: str | None,
    engine_version: str | None,
    engine_options: Mapping[str, Any] | None = None,
) -> JsonObject:
    """Build the v1 metadata JSON object as plain Python data.

    Args:
        document: The converted document record; required.
        source_sha256: Hash string stored as ``source_sha256``; required.
        created_at: Timestamp string stored as ``created_at``; required.
        engine: Engine name; required.
        engine_version: Engine version string; required.
        engine_options: Optional engine settings, copied into a new dict.

    Returns:
        A dict holding the source details, engine details, pages, assets,
        warnings and summary counts.

    Raises:
        MetadataInputError: if a required value is missing or empty, or the
            assembled metadata is not JSON serializable.
    """
    if document is None:
        raise MetadataInputError("document is required")

    # Checked in this order, so the first missing field is the one reported.
    required_text = (
        ("source_sha256", source_sha256),
        ("created_at", created_at),
        ("engine", engine),
        ("engine_version", engine_version),
    )
    for field_name, value in required_text:
        _require_text(value, field_name)

    metadata: JsonObject = {
        "source_pdf": str(document.source_pdf),
        "source_sha256": source_sha256,
        "created_at": created_at,
        "engine": engine,
        "engine_version": engine_version,
        "engine_options": dict(engine_options or {}),
        "pages": [page.to_dict() for page in document.pages],
        "assets": [asset.to_dict() for asset in document.assets],
        "warnings": [warning.to_dict() for warning in document.warnings],
        "summary": build_summary(document),
    }
    _ensure_json_serializable(metadata)
    return metadata
|
||||
|
||||
|
||||
def build_summary(document: DocumentRecord) -> JsonObject:
    """Build required summary counts for metadata and later reports.

    The counts cover pages, warnings, assets, display and inline formula
    blocks, and math-render failures whose severity is not informational.
    """
    blocks = tuple(iter_blocks(document.pages))

    def count_blocks(kind: BlockType) -> int:
        return sum(block.block_type == kind for block in blocks)

    return {
        "pages_processed": len(document.pages),
        "warning_count": len(document.warnings),
        "asset_count": len(document.assets),
        "display_formula_count": count_blocks(BlockType.DISPLAY_FORMULA),
        "inline_formula_count": count_blocks(BlockType.INLINE_FORMULA),
        "math_render_error_count": count_non_info_warnings(document.warnings, WarningCode.MATH_RENDER_FAILED),
    }
|
||||
|
||||
|
||||
def count_warnings(warnings: tuple[WarningRecord, ...], code: WarningCode) -> int:
    """Count the warnings whose ``code`` equals ``code``."""
    matching = [warning for warning in warnings if warning.code == code]
    return len(matching)
|
||||
|
||||
|
||||
def count_non_info_warnings(warnings: tuple[WarningRecord, ...], code: WarningCode) -> int:
    """Count warnings with the given code whose severity is not ``INFO``."""
    matching = [
        warning
        for warning in warnings
        if warning.code == code and warning.severity != WarningSeverity.INFO
    ]
    return len(matching)
|
||||
|
||||
|
||||
def _require_text(value: str | None, field_name: str) -> None:
    """Raise ``MetadataInputError`` when ``value`` is ``None`` or empty.

    Args:
        value: The text to check.
        field_name: Name used in the error message.
    """
    if value:
        return
    raise MetadataInputError(f"{field_name} is required")
|
||||
|
||||
|
||||
def _ensure_json_serializable(value: JsonObject) -> None:
    """Verify that ``value`` can be encoded with ``json.dumps``.

    The encoded text is discarded; only success or failure matters.

    Raises:
        MetadataInputError: if encoding fails, either because a value has an
            unsupported type or because the data contains a circular
            reference.
    """
    try:
        json.dumps(value)
    except (TypeError, ValueError) as error:
        # json.dumps reports unsupported types as TypeError and circular
        # references as ValueError; both mean the metadata cannot be written.
        raise MetadataInputError("metadata must be JSON serializable") from error
|
||||
@@ -0,0 +1,452 @@
|
||||
"""Direct local MinerU CLI adapter boundary."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
from collections.abc import Callable, Mapping
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
|
||||
from pdf2md.ir import WarningCode, WarningRecord, WarningSeverity
|
||||
|
||||
|
||||
# Anything accepted as a filesystem path: a string or an os.PathLike object.
PathLike = str | os.PathLike[str]
# Runs an argument vector and returns the captured result.
Runner = Callable[[tuple[str, ...]], "CommandResult"]
# Looks up an executable by name; returns its path or None (shutil.which shape).
Which = Callable[[str], str | None]

# Engine name used in adapter results.
ENGINE_NAME = "MinerU"
# The only executable name the adapter accepts.
DEFAULT_EXECUTABLE = "mineru"
|
||||
|
||||
|
||||
class MinerUAdapterError(Exception):
    """Root of the exception hierarchy raised by the MinerU adapter."""
|
||||
|
||||
|
||||
class StrictLocalViolationError(MinerUAdapterError):
    """Raised when adapter options would violate strict-local execution.

    The same message is also exposed as an error-severity ``WarningRecord``
    on the ``warning`` attribute.
    """

    def __init__(self, message: str) -> None:
        super().__init__(message)
        self.warning = WarningRecord(
            WarningCode.STRICT_LOCAL_VIOLATION,
            WarningSeverity.ERROR,
            message,
        )
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class CommandResult:
    """Immutable record of one finished command invocation."""

    # The argument vector that was run.
    command: tuple[str, ...]
    # The exit status reported for the run.
    exit_code: int
    # Captured standard output; empty by default.
    stdout: str = ""
    # Captured standard error; empty by default.
    stderr: str = ""
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class MinerUOptions:
    """Immutable settings for one MinerU conversion."""

    # Whether strict-local execution is requested; on by default.
    strict_local: bool = True
    # Optional GPU device value, reported under "gpu_device" when set.
    gpu_device: str | None = None
    # Optional engine version string.
    engine_version: str | None = None
    # Extra command-line arguments; empty by default.
    extra_cli_args: tuple[str, ...] = ()
    # Additional option values merged into the reported engine options.
    engine_options: Mapping[str, object] = field(default_factory=dict)

    def to_engine_options(self) -> dict[str, object]:
        """Return the options as a new dict for reporting.

        ``strict_local`` comes first and ``gpu_device`` is added only when it
        is set. Entries from ``engine_options`` are merged last, so they
        override the earlier keys on a name clash.
        """
        gpu = {} if self.gpu_device is None else {"gpu_device": self.gpu_device}
        return {"strict_local": self.strict_local, **gpu, **dict(self.engine_options)}
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class MinerUVersionResult:
    """Immutable outcome of a MinerU version query."""

    # Whether the executable was found.
    available: bool
    # The reported version text, or None when it could not be determined.
    version: str | None
    # The version command's argument vector.
    command: tuple[str, ...]
    # Exit status of the command, or None when it was not run.
    exit_code: int | None
    # Captured standard output.
    stdout: str
    # Captured standard error.
    stderr: str
    # Problems found while querying; empty by default.
    warnings: tuple[WarningRecord, ...] = ()
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class MinerUAdapterResult:
    """Immutable outcome of one MinerU conversion attempt."""

    # Whether the attempt counts as successful.
    succeeded: bool
    # The conversion command's argument vector.
    command: tuple[str, ...]
    # The input PDF path.
    input_pdf: Path
    # The directory MinerU was told to write into.
    work_dir: Path
    # Markdown text read from the output, if any.
    raw_markdown: str | None
    # Structured output read from the output, if any.
    raw_structured: object | None
    # Asset files found in the output.
    asset_paths: tuple[Path, ...]
    # Problems recorded during the attempt.
    warnings: tuple[WarningRecord, ...]
    # Engine name.
    engine: str
    # Engine version, if known.
    engine_version: str | None
    # Option values reported for the run.
    engine_options: dict[str, object]
    # Exit status of the command, or None when it was not run.
    exit_code: int | None
    # Captured standard output.
    stdout: str
    # Captured standard error.
    stderr: str
|
||||
|
||||
|
||||
class MinerUAdapter:
    """Runs the local ``mineru`` CLI and collects what it wrote.

    The executable lookup and the command runner are injectable.
    """

    def __init__(
        self,
        *,
        executable: str = DEFAULT_EXECUTABLE,
        which: Which = shutil.which,
        runner: Runner | None = None,
    ) -> None:
        """Configure the adapter.

        Args:
            executable: Must equal ``DEFAULT_EXECUTABLE``.
            which: Executable lookup function.
            runner: Function that runs a command; defaults to ``_run_command``.

        Raises:
            StrictLocalViolationError: if ``executable`` is any other value.
        """
        if executable != DEFAULT_EXECUTABLE:
            raise StrictLocalViolationError("v1 strict-local execution only allows the direct mineru CLI executable.")
        self.executable = executable
        self._which = which
        self._runner = runner or _run_command

    def is_available(self) -> bool:
        """Return True when the executable lookup finds the MinerU CLI."""
        return self._which(self.executable) is not None

    def version(self) -> MinerUVersionResult:
        """Query the MinerU version with ``--version``.

        The version is the first non-empty line of stdout, or of stderr when
        stdout has none. Every failure is reported through ``warnings`` on
        the returned result.
        """
        command = (self.executable, "--version")
        if not self.is_available():
            missing = _warning(WarningCode.ENGINE_MISSING, WarningSeverity.ERROR, "MinerU CLI executable was not found.")
            return MinerUVersionResult(
                available=False,
                version=None,
                command=command,
                exit_code=None,
                stdout="",
                stderr="",
                warnings=(missing,),
            )

        outcome = self._runner(command)
        reported = _first_non_empty_line(outcome.stdout) or _first_non_empty_line(outcome.stderr)

        problems: tuple[WarningRecord, ...]
        if outcome.exit_code != 0:
            version = None
            problems = (
                _warning(
                    WarningCode.MINERU_CLI_FAILED,
                    WarningSeverity.ERROR,
                    "MinerU version command failed.",
                ),
            )
        elif reported is None:
            version = None
            problems = (
                _warning(
                    WarningCode.MINERU_CLI_FAILED,
                    WarningSeverity.WARNING,
                    "MinerU version command produced no version text.",
                ),
            )
        else:
            version = reported
            problems = ()

        return MinerUVersionResult(
            available=True,
            version=version,
            command=outcome.command,
            exit_code=outcome.exit_code,
            stdout=outcome.stdout,
            stderr=outcome.stderr,
            warnings=problems,
        )

    def build_command(
        self,
        input_pdf: PathLike,
        work_dir: PathLike,
        options: MinerUOptions | None = None,
    ) -> tuple[str, ...]:
        """Return the argument vector for converting ``input_pdf`` into ``work_dir``.

        The options are validated with ``validate_strict_local_options``
        first, so that function's errors propagate from here.
        """
        validate_strict_local_options(options or MinerUOptions())
        source = str(Path(input_pdf))
        destination = str(Path(work_dir))
        return (self.executable, "-p", source, "-o", destination)

    def convert(
        self,
        input_pdf: PathLike,
        work_dir: PathLike,
        options: MinerUOptions | None = None,
    ) -> MinerUAdapterResult:
        """Run MinerU on one PDF and gather its output.

        The result is unsuccessful when the executable is missing, the CLI
        exits non-zero, or the run leaves neither Markdown nor structured
        output behind. Each of those cases adds an error warning.
        """
        options = options or MinerUOptions()
        input_path = Path(input_pdf)
        output_dir = Path(work_dir)
        command = self.build_command(input_path, output_dir, options)
        # Arguments that are the same on every exit path.
        shared = {"input_pdf": input_path, "work_dir": output_dir, "options": options}

        if not self.is_available():
            missing = _warning(WarningCode.ENGINE_MISSING, WarningSeverity.ERROR, "MinerU CLI executable was not found.")
            return _result(succeeded=False, command=command, exit_code=None, warnings=(missing,), **shared)

        completed = _run_with_environment(self._runner, command, _mineru_environment(options))
        process_output = {
            "exit_code": completed.exit_code,
            "stdout": completed.stdout,
            "stderr": completed.stderr,
        }

        if completed.exit_code != 0:
            failed = _warning(
                WarningCode.MINERU_CLI_FAILED,
                WarningSeverity.ERROR,
                "MinerU CLI failed and no fallback engine was used.",
            )
            return _result(
                succeeded=False,
                command=completed.command,
                warnings=(failed,),
                **shared,
                **process_output,
            )

        parsed = _parse_output_dir(output_dir)
        collected = parsed.warnings
        usable = parsed.raw_markdown is not None or parsed.raw_structured is not None
        if not usable:
            collected = collected + (
                _warning(
                    WarningCode.MINERU_CLI_FAILED,
                    WarningSeverity.ERROR,
                    "MinerU completed but produced no usable Markdown or structured output.",
                ),
            )

        return _result(
            succeeded=usable,
            command=completed.command,
            raw_markdown=parsed.raw_markdown,
            raw_structured=parsed.raw_structured,
            asset_paths=parsed.asset_paths,
            warnings=collected,
            **shared,
            **process_output,
        )
|
||||
|
||||
|
||||
def validate_strict_local_options(options: MinerUOptions) -> None:
    """Reject options that the v1 strict-local adapter does not allow.

    The checks run in this order: strict-local must be enabled; the GPU
    device, each extra CLI argument and the engine options are passed to
    ``_reject_prohibited_value``; and finally no extra CLI arguments may be
    present.

    Raises:
        StrictLocalViolationError: if strict-local is disabled or extra CLI
            arguments are supplied.
    """
    if not options.strict_local:
        raise StrictLocalViolationError("strict-local execution cannot be disabled in v1.")

    for candidate in (options.gpu_device, *options.extra_cli_args, options.engine_options):
        _reject_prohibited_value(candidate)

    if options.extra_cli_args:
        raise StrictLocalViolationError("extra MinerU CLI arguments are not supported by the v1 strict-local adapter.")
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class _ParsedOutput:
    """Immutable summary of what was found in a MinerU output directory."""

    # Markdown text that was read, or None when no Markdown file was found.
    raw_markdown: str | None
    # Structured output that was read, or None when no JSON file was found.
    raw_structured: object | None
    # Asset files found under the directory.
    asset_paths: tuple[Path, ...]
    # Problems noticed while reading the directory.
    warnings: tuple[WarningRecord, ...]
|
||||
|
||||
|
||||
def _parse_output_dir(work_dir: Path) -> _ParsedOutput:
    """Read Markdown, structured output, and asset paths from ``work_dir``.

    A missing directory yields an empty result. Structured output that is not
    valid JSON is kept as raw text and reported through a warning.
    """
    if not work_dir.exists():
        return _ParsedOutput(raw_markdown=None, raw_structured=None, asset_paths=(), warnings=())

    markdown_file = _first_file(work_dir, "*.md")
    structured_file = _first_file(work_dir, "*.json")

    raw_markdown: str | None = None
    if markdown_file is not None:
        raw_markdown = markdown_file.read_text(encoding="utf-8")

    raw_structured: object | None = None
    collected: list[WarningRecord] = []
    if structured_file is not None:
        payload = structured_file.read_text(encoding="utf-8")
        try:
            raw_structured = json.loads(payload)
        except json.JSONDecodeError:
            # Keep the undecodable text so callers can still inspect it.
            raw_structured = payload
            collected.append(
                _warning(
                    WarningCode.MINERU_CLI_FAILED,
                    WarningSeverity.WARNING,
                    f"MinerU structured output was not valid JSON: {structured_file}",
                )
            )

    assets = [entry for entry in work_dir.rglob("*") if entry.is_file() and _is_asset_file(entry)]
    # Case-insensitive POSIX ordering keeps the asset list deterministic.
    assets.sort(key=lambda entry: entry.as_posix().casefold())

    return _ParsedOutput(
        raw_markdown=raw_markdown,
        raw_structured=raw_structured,
        asset_paths=tuple(assets),
        warnings=tuple(collected),
    )
|
||||
|
||||
|
||||
def _run_command(command: tuple[str, ...]) -> CommandResult:
    """Run ``command`` as a subprocess and capture its exit code and text output.

    A non-zero exit code does not raise; it is returned in the result.
    """
    process = subprocess.run(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        check=False,
    )
    return CommandResult(
        command=command,
        exit_code=process.returncode,
        stdout=process.stdout,
        stderr=process.stderr,
    )
|
||||
|
||||
|
||||
def _run_with_environment(
    runner: Runner,
    command: tuple[str, ...],
    environment: Mapping[str, str],
) -> CommandResult:
    """Call ``runner(command)`` with ``environment`` temporarily applied to ``os.environ``.

    Every overridden variable is restored to its previous value, or removed if
    it was previously unset, even when the runner raises.
    """
    if not environment:
        return runner(command)

    saved: dict[str, str | None] = {}
    for name in environment:
        saved[name] = os.environ.get(name)

    try:
        os.environ.update(environment)
        return runner(command)
    finally:
        for name, original in saved.items():
            if original is not None:
                os.environ[name] = original
            else:
                os.environ.pop(name, None)
|
||||
|
||||
|
||||
def _mineru_environment(options: MinerUOptions) -> dict[str, str]:
    """Translate ``options.gpu_device`` into environment variables.

    Returns an empty mapping when no device is requested. A bare decimal index
    is treated as ``cuda:<index>``; ``cuda:<index>`` becomes device mode
    ``cuda`` plus ``CUDA_VISIBLE_DEVICES``. Anything else is passed through as
    ``MINERU_DEVICE_MODE`` unchanged.
    """
    requested = options.gpu_device
    device = requested.strip() if requested is not None else ""
    if not device:
        return {}

    if device.isdecimal():
        device = f"cuda:{device}"

    prefix, separator, remainder = device.partition(":")
    index = remainder.strip()
    if prefix == "cuda" and separator and index:
        return {"MINERU_DEVICE_MODE": "cuda", "CUDA_VISIBLE_DEVICES": index}
    return {"MINERU_DEVICE_MODE": device}
|
||||
|
||||
|
||||
def _first_file(root: Path, pattern: str) -> Path | None:
    """Return the first regular file under ``root`` matching ``pattern``.

    Candidates are ordered by case-folded POSIX path so the choice is
    deterministic. Directories whose names match the pattern are ignored:
    callers read the returned path as text, which would fail on a directory.
    Returns None when nothing matches.
    """
    files = (path for path in root.rglob(pattern) if path.is_file())
    return min(files, key=lambda path: path.as_posix().casefold(), default=None)
|
||||
|
||||
|
||||
def _first_non_empty_line(value: str) -> str | None:
    """Return the first line of ``value`` that is non-blank after stripping, or None."""
    trimmed_lines = (line.strip() for line in value.splitlines())
    return next((text for text in trimmed_lines if text), None)
|
||||
|
||||
|
||||
def _is_asset_file(path: Path) -> bool:
    """Return True unless the suffix is .json, .log, .md or .txt (case-insensitive)."""
    non_asset_suffixes = (".json", ".log", ".md", ".txt")
    suffix = path.suffix.casefold()
    return suffix not in non_asset_suffixes
|
||||
|
||||
|
||||
def _result(
    *,
    succeeded: bool,
    command: tuple[str, ...],
    input_pdf: Path,
    work_dir: Path,
    options: MinerUOptions,
    raw_markdown: str | None = None,
    raw_structured: object | None = None,
    asset_paths: tuple[Path, ...] = (),
    exit_code: int | None,
    stdout: str = "",
    stderr: str = "",
    warnings: tuple[WarningRecord, ...] = (),
) -> MinerUAdapterResult:
    """Build a ``MinerUAdapterResult``.

    The engine name comes from ``ENGINE_NAME``; the engine version and engine
    options are taken from ``options``. All other fields are passed through.
    """
    engine_fields = {
        "engine": ENGINE_NAME,
        "engine_version": options.engine_version,
        "engine_options": options.to_engine_options(),
    }
    return MinerUAdapterResult(
        succeeded=succeeded,
        command=command,
        input_pdf=input_pdf,
        work_dir=work_dir,
        raw_markdown=raw_markdown,
        raw_structured=raw_structured,
        asset_paths=asset_paths,
        warnings=warnings,
        exit_code=exit_code,
        stdout=stdout,
        stderr=stderr,
        **engine_fields,
    )
|
||||
|
||||
|
||||
def _warning(code: WarningCode, severity: WarningSeverity, message: str) -> WarningRecord:
    """Create a ``WarningRecord`` from a code, a severity and a message."""
    record = WarningRecord(code, severity, message)
    return record
|
||||
|
||||
|
||||
def _reject_prohibited_value(value: object) -> None:
    """Recursively scan ``value`` for prohibited text.

    None is ignored. Mapping keys are checked as text and mapping values are
    scanned recursively. Lists, tuples and sets are scanned item by item. Any
    other value is checked through its ``str()`` form.
    """
    if value is None:
        return

    if isinstance(value, Mapping):
        for key, item in value.items():
            _reject_prohibited_text(str(key))
            _reject_prohibited_value(item)
    elif isinstance(value, (list, tuple, set)):
        for item in value:
            _reject_prohibited_value(item)
    else:
        _reject_prohibited_text(str(value))
|
||||
|
||||
|
||||
def _reject_prohibited_text(value: str) -> None:
    """Raise ``StrictLocalViolationError`` if ``value`` contains prohibited text.

    Matching is done on the case-folded text and is substring-based: an
    ``http://`` or ``https://`` prefix anywhere in the text, or any of the
    prohibited tokens below, is rejected.
    """
    prohibited_tokens = (
        "--api-url",
        "api_url",
        "api-url",
        "base_url",
        "base-url",
        "router",
        "http_backend",
        "http-backend",
        "openai",
        "openai-compatible",
        "endpoint",
        "backend",
        "mineru-api",
    )
    lowered = value.casefold()
    mentions_url = "http://" in lowered or "https://" in lowered
    if mentions_url or any(token in lowered for token in prohibited_tokens):
        raise StrictLocalViolationError(f"strict-local MinerU adapter rejected prohibited option: {value}")
|
||||
@@ -0,0 +1,223 @@
|
||||
"""Input discovery and output path planning for local PDF conversions."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Iterable
|
||||
|
||||
|
||||
PathLike = str | os.PathLike[str]
|
||||
|
||||
|
||||
class PathPlanningError(Exception):
    """Common base class for path planning failures raised by this module."""


class InputDiscoveryError(PathPlanningError):
    """Raised when an input path yields no valid local PDF."""


class OutputRootError(PathPlanningError):
    """Raised when the output root cannot hold planned outputs."""


class OutputPathError(PathPlanningError):
    """Raised when a planned output path would land outside the output root."""


class DuplicateOutputPathError(PathPlanningError):
    """Raised when two inputs would map to the same planned output.

    The offending paths are available as the ``duplicates`` tuple.
    """

    def __init__(self, duplicates: Iterable[Path]) -> None:
        self.duplicates = tuple(duplicates)
        listing = ", ".join(map(str, self.duplicates))
        super().__init__(f"planned output paths are duplicated: {listing}")


class OutputConflictError(PathPlanningError):
    """Raised when planned outputs already exist and overwrite is disabled.

    The existing paths are available as the ``conflicts`` tuple.
    """

    def __init__(self, conflicts: Iterable[Path]) -> None:
        self.conflicts = tuple(conflicts)
        listing = ", ".join(map(str, self.conflicts))
        super().__init__(f"planned outputs already exist: {listing}")
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class DiscoveredPdf:
    """One local PDF chosen for a later conversion."""

    # Path of the PDF file itself.
    source_path: Path
    # Directory of the PDF relative to the discovery root; empty by default.
    relative_parent: Path = Path()
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class PlannedOutput:
    """The set of filesystem paths reserved for one future conversion."""

    source_pdf: Path
    markdown_path: Path
    assets_dir: Path
    # None when no metadata file is planned.
    metadata_path: Path | None
    report_path: Path
    # None when no raw output directory is planned.
    raw_dir: Path | None

    def planned_paths(self) -> tuple[Path, ...]:
        """Return every planned path: Markdown, assets, report, then any optional ones."""
        always = (self.markdown_path, self.assets_dir, self.report_path)
        optional = tuple(
            candidate
            for candidate in (self.metadata_path, self.raw_dir)
            if candidate is not None
        )
        return always + optional
|
||||
|
||||
|
||||
def discover_pdfs(input_path: PathLike, *, recursive: bool = False) -> tuple[DiscoveredPdf, ...]:
    """Discover local PDFs from a file or directory path.

    Args:
        input_path: A PDF file, or a directory to search.
        recursive: When True, search the directory tree; otherwise only the
            directory's direct children.

    Returns:
        Discovered PDFs with resolved source paths, sorted by case-folded
        relative path.

    Raises:
        InputDiscoveryError: if the path does not exist, is a non-PDF file, is
            neither a file nor a directory, or the directory holds no PDFs.
    """
    root = Path(input_path).expanduser()
    if not root.exists():
        raise InputDiscoveryError(f"input path does not exist: {root}")

    if root.is_file():
        if not _is_pdf(root):
            raise InputDiscoveryError(f"input file is not a PDF: {root}")
        return (DiscoveredPdf(source_path=root.resolve()),)

    if not root.is_dir():
        raise InputDiscoveryError(f"input path is not a file or directory: {root}")

    root = root.resolve()
    candidates = root.rglob("*") if recursive else root.iterdir()
    pdfs: list[DiscoveredPdf] = []
    for path in candidates:
        if not (path.is_file() and _is_pdf(path)):
            continue
        resolved = path.resolve()
        try:
            relative_parent = resolved.relative_to(root).parent
        except ValueError:
            # A symlink inside the root whose target lives elsewhere: the
            # resolved path is not under root, so mirror the link's own
            # location instead of failing with a bare ValueError.
            relative_parent = path.relative_to(root).parent
        pdfs.append(DiscoveredPdf(source_path=resolved, relative_parent=relative_parent))
    pdfs.sort(key=lambda item: _sort_key(item.relative_parent / item.source_path.name))

    if not pdfs:
        raise InputDiscoveryError(f"no PDF files discovered in directory: {root}")

    return tuple(pdfs)
|
||||
|
||||
|
||||
def plan_outputs(
    discovered_pdfs: Iterable[DiscoveredPdf],
    output_root: PathLike,
    *,
    metadata: bool = True,
    keep_raw: bool = False,
    overwrite: bool = False,
) -> tuple[PlannedOutput, ...]:
    """Plan output paths for each discovered PDF and preflight conflicts.

    Raises:
        OutputRootError: if ``output_root`` exists but is not a directory.
        DuplicateOutputPathError: if two planned paths collide.
        OutputConflictError: if a planned path already exists and
            ``overwrite`` is False.
    """
    base = Path(output_root).expanduser()
    if base.exists() and not base.is_dir():
        raise OutputRootError(f"output root exists and is not a directory: {base}")
    base = base.resolve(strict=False)

    planned: list[PlannedOutput] = []
    for pdf in discovered_pdfs:
        planned.append(_plan_one(pdf, base, metadata=metadata, keep_raw=keep_raw))
    plans = tuple(planned)
    _raise_if_duplicate_outputs(plans)

    if overwrite:
        return plans

    existing = tuple(
        target
        for plan in plans
        for target in plan.planned_paths()
        if target.exists()
    )
    if existing:
        raise OutputConflictError(existing)
    return plans
|
||||
|
||||
|
||||
def plan_pdf_outputs(
    input_path: PathLike,
    output_root: PathLike,
    *,
    recursive: bool = False,
    metadata: bool = True,
    keep_raw: bool = False,
    overwrite: bool = False,
) -> tuple[PlannedOutput, ...]:
    """Run ``discover_pdfs`` followed by ``plan_outputs`` in a single call."""
    planning_options = {
        "metadata": metadata,
        "keep_raw": keep_raw,
        "overwrite": overwrite,
    }
    return plan_outputs(
        discover_pdfs(input_path, recursive=recursive),
        output_root,
        **planning_options,
    )
|
||||
|
||||
|
||||
def _plan_one(
    discovered_pdf: DiscoveredPdf,
    output_root: Path,
    *,
    metadata: bool,
    keep_raw: bool,
) -> PlannedOutput:
    """Derive the output paths for one PDF beneath ``output_root``.

    Every output is named after the source PDF's stem and placed in the PDF's
    relative parent directory. The plan is checked against the root before it
    is returned.
    """
    target_dir = output_root / _safe_relative_parent(discovered_pdf.relative_parent)
    stem = discovered_pdf.source_path.stem

    metadata_path = target_dir / f"{stem}.metadata.json" if metadata else None
    raw_dir = target_dir / f"{stem}.raw" if keep_raw else None

    plan = PlannedOutput(
        source_pdf=discovered_pdf.source_path,
        markdown_path=target_dir / f"{stem}.md",
        assets_dir=target_dir / f"{stem}.assets",
        metadata_path=metadata_path,
        report_path=target_dir / f"{stem}.report.md",
        raw_dir=raw_dir,
    )
    _raise_if_plan_escapes_root(plan, output_root)
    return plan
|
||||
|
||||
|
||||
def _raise_if_duplicate_outputs(plans: Iterable[PlannedOutput]) -> None:
    """Raise ``DuplicateOutputPathError`` if any planned path repeats.

    Paths are compared through ``_path_key``. Each repeated occurrence is
    reported; the first occurrence of a path is not.
    """
    seen_keys: set[str] = set()
    repeated: list[Path] = []
    all_paths = (path for plan in plans for path in plan.planned_paths())
    for path in all_paths:
        key = _path_key(path)
        if key in seen_keys:
            repeated.append(path)
            continue
        seen_keys.add(key)

    if repeated:
        raise DuplicateOutputPathError(repeated)
|
||||
|
||||
|
||||
def _is_pdf(path: Path) -> bool:
    """Return True when the path's suffix is ``.pdf``, ignoring case."""
    suffix = path.suffix.casefold()
    return suffix == ".pdf"
|
||||
|
||||
|
||||
def _safe_relative_parent(path: Path) -> Path:
    """Return ``path`` unchanged if it is a purely relative, non-escaping path.

    Raises:
        OutputPathError: if the path is absolute, has a drive or root, or
            contains a ``..`` component.
    """
    is_anchored = path.is_absolute() or bool(path.drive) or bool(path.root)
    climbs_up = ".." in path.parts
    if is_anchored or climbs_up:
        raise OutputPathError(f"relative parent would escape the output root: {path}")
    return path
|
||||
|
||||
|
||||
def _raise_if_plan_escapes_root(plan: PlannedOutput, output_root: Path) -> None:
    """Raise ``OutputPathError`` if any planned path resolves outside ``output_root``."""
    resolved_root = output_root.resolve(strict=False)
    for planned in plan.planned_paths():
        resolved = planned.resolve(strict=False)
        try:
            # relative_to raises ValueError when resolved is not under the root.
            resolved.relative_to(resolved_root)
        except ValueError as error:
            raise OutputPathError(f"planned path would escape the output root: {planned}") from error
|
||||
|
||||
|
||||
def _sort_key(path: Path) -> str:
    """Return the case-folded POSIX form of ``path`` for use as a sort key."""
    posix_form = path.as_posix()
    return posix_form.casefold()
|
||||
|
||||
|
||||
def _path_key(path: Path) -> str:
    """Return a comparison key: the resolved path, normalised by ``os.path``."""
    resolved = path.resolve(strict=False)
    normalised = os.path.normpath(str(resolved))
    return os.path.normcase(normalised)
|
||||
@@ -0,0 +1,128 @@
|
||||
"""Local PDF page chunk planning and writing."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
from pypdf import PdfReader, PdfWriter
|
||||
|
||||
from pdf2md.paths import PathLike
|
||||
|
||||
|
||||
class PdfChunkError(ValueError):
    """Error raised when a source PDF cannot be split into chunks."""
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class PdfChunkPlan:
    """Plan for one chunk of a source PDF.

    ``start_page_index`` and ``end_page_index`` form a zero-based, half-open
    page range; the ``source_page_*`` properties expose it as one-based,
    inclusive page numbers.
    """

    source_pdf: Path
    chunk_index: int
    total_chunks: int
    start_page_index: int
    end_page_index: int
    # Zero-padding width for page numbers in the output name.
    page_number_width: int = 3

    @property
    def source_page_start(self) -> int:
        """One-based number of the first page in the chunk."""
        return self.start_page_index + 1

    @property
    def source_page_end(self) -> int:
        """One-based number of the last page in the chunk."""
        return self.end_page_index

    @property
    def page_count(self) -> int:
        """Number of pages covered by the chunk."""
        return self.end_page_index - self.start_page_index

    @property
    def output_stem(self) -> str:
        """File stem encoding the source stem, chunk index and page range."""
        width = self.page_number_width
        first_page = f"{self.source_page_start:0{width}d}"
        last_page = f"{self.source_page_end:0{width}d}"
        return f"{self.source_pdf.stem}.part-{self.chunk_index:03d}.pages-{first_page}-{last_page}"

    @property
    def output_filename(self) -> str:
        """``output_stem`` with a ``.pdf`` extension."""
        return f"{self.output_stem}.pdf"

    def metadata(self, chunk_pdf: Path | None = None) -> dict[str, object]:
        """Describe the chunk as a plain dict.

        When ``chunk_pdf`` is given it is recorded under ``chunk_pdf``;
        otherwise the planned filename is recorded under ``chunk_pdf_name``.
        """
        data: dict[str, object] = dict(
            original_source_pdf=str(self.source_pdf),
            chunk_index=self.chunk_index,
            total_chunks=self.total_chunks,
            source_page_start=self.source_page_start,
            source_page_end=self.source_page_end,
            chunk_page_count=self.page_count,
        )
        if chunk_pdf is None:
            data["chunk_pdf_name"] = self.output_filename
        else:
            data["chunk_pdf"] = str(chunk_pdf)
        return data
|
||||
|
||||
|
||||
def count_pdf_pages(source_pdf: PathLike) -> int:
    """Return how many pages the local PDF at ``source_pdf`` contains."""
    pages = _reader(source_pdf).pages
    return len(pages)
|
||||
|
||||
|
||||
def plan_pdf_chunks(source_pdf: PathLike, *, chunk_pages: int) -> tuple[PdfChunkPlan, ...]:
    """Plan fixed-size page chunks as zero-based, half-open page ranges.

    The last chunk may be shorter than ``chunk_pages``. Chunk indexes start at
    1, and the page number width is at least 3 digits.

    Raises:
        PdfChunkError: if ``chunk_pages`` is invalid or the PDF has no pages.
    """
    size = _validate_chunk_pages(chunk_pages)
    source = Path(source_pdf).expanduser().resolve()
    page_total = count_pdf_pages(source)
    if page_total < 1:
        raise PdfChunkError(f"PDF has no pages: {source}")

    chunk_starts = range(0, page_total, size)
    chunk_total = len(chunk_starts)
    width = max(3, len(str(page_total)))

    plans: list[PdfChunkPlan] = []
    for number, first_index in enumerate(chunk_starts, start=1):
        plans.append(
            PdfChunkPlan(
                source_pdf=source,
                chunk_index=number,
                total_chunks=chunk_total,
                start_page_index=first_index,
                end_page_index=min(first_index + size, page_total),
                page_number_width=width,
            )
        )
    return tuple(plans)
|
||||
|
||||
|
||||
def write_pdf_chunk(plan: PdfChunkPlan, destination: PathLike) -> Path:
    """Write the pages described by ``plan`` to ``destination`` and return that path.

    Missing parent directories are created. The source outline is not imported.
    """
    target = Path(destination)
    target.parent.mkdir(parents=True, exist_ok=True)

    page_range = (plan.start_page_index, plan.end_page_index)
    chunk_writer = PdfWriter()
    chunk_writer.append(plan.source_pdf, pages=page_range, import_outline=False)

    with open(target, "wb") as stream:
        chunk_writer.write(stream)
    return target
|
||||
|
||||
|
||||
def _reader(source_pdf: PathLike) -> PdfReader:
    """Open ``source_pdf`` with ``PdfReader``.

    Raises:
        PdfChunkError: if the reader cannot be created, or the PDF is encrypted.
    """
    source = Path(source_pdf).expanduser()
    try:
        opened = PdfReader(source)
    except Exception as error:
        # Any reader failure is reported as a chunking error, with the cause chained.
        raise PdfChunkError(f"PDF cannot be opened for chunking: {source}") from error

    if not opened.is_encrypted:
        return opened
    raise PdfChunkError(f"Encrypted PDFs cannot be chunked without a password: {source}")
|
||||
|
||||
|
||||
def _validate_chunk_pages(chunk_pages: int) -> int:
    """Return ``chunk_pages`` if it is a positive integer.

    Raises:
        PdfChunkError: if the value is not an int, is a bool, or is below 1.
    """
    # bool is a subclass of int, so True would otherwise pass as a chunk size of 1.
    is_real_int = isinstance(chunk_pages, int) and not isinstance(chunk_pages, bool)
    if not is_real_int or chunk_pages < 1:
        raise PdfChunkError("chunk_pages must be a positive integer")
    return chunk_pages
|
||||
@@ -0,0 +1,374 @@
|
||||
"""Local quality checks for normalized Markdown output."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from collections.abc import Callable
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path, PurePosixPath, PureWindowsPath
|
||||
|
||||
from pdf2md.ir import PathLike, WarningCode, WarningRecord, WarningSeverity
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class MathCheckResult:
    """Outcome of checking a single math expression."""

    # True when the expression passed the check.
    ok: bool
    # Optional detail text; empty by default.
    message: str = ""
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class MathExpression:
    """A math expression located in a Markdown document."""

    # Position of the expression among all expressions found.
    index: int
    # Expression text.
    body: str
    # True for display math, False for inline math.
    display: bool
    # (start, end) offsets of the expression in the Markdown text.
    markdown_span: tuple[int, int]
|
||||
|
||||
|
||||
MathChecker = Callable[[str], bool | MathCheckResult]
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class QualityResult:
    """Counts and warnings produced by the quality checks."""

    missing_asset_link_count: int = 0
    invalid_asset_link_count: int = 0
    math_render_error_count: int = 0
    warnings: tuple[WarningRecord, ...] = ()

    @property
    def failure_count(self) -> int:
        """Sum of the three failure counters."""
        counters = (
            self.missing_asset_link_count,
            self.invalid_asset_link_count,
            self.math_render_error_count,
        )
        return sum(counters)
|
||||
|
||||
|
||||
class MathCheckerUnavailable(RuntimeError):
    """Signals that a local math checker could not check renderability."""
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class _Segment:
    """A slice of Markdown text, flagged as protected (code) or not."""

    text: str
    # True for code that the checks must skip.
    protected: bool
    # Offset of ``text`` within the original Markdown.
    start: int = 0
|
||||
|
||||
|
||||
_FENCE_START_RE = re.compile(r"^(?P<indent> {0,3})(?P<fence>`{3,}|~{3,}).*$")
|
||||
_IMAGE_LINK_RE = re.compile(r"!\[(?P<alt>[^\]\n]*)\]\((?P<target>[^)\n]+)\)")
|
||||
_DISPLAY_MATH_RE = re.compile(r"(?<!\\)\$\$(?P<body>.*?)(?<!\\)\$\$", re.DOTALL)
|
||||
_INLINE_MATH_RE = re.compile(r"(?<!\\)\$(?P<body>[^\n$]+?)(?<!\\)\$")
|
||||
_SCHEME_RE = re.compile(r"^[A-Za-z][A-Za-z0-9+.-]*:")
|
||||
|
||||
|
||||
def check_asset_links(
    markdown: str,
    *,
    markdown_dir: PathLike,
    asset_root: PathLike | None = None,
) -> QualityResult:
    """Classify the Markdown image links found outside code.

    Each link target is classified relative to ``markdown_dir`` and, when
    given, ``asset_root``. Invalid and missing targets are counted and
    reported as warnings; nothing is fetched, copied or written.

    Raises:
        TypeError: if ``markdown`` is not a string.
    """
    if not isinstance(markdown, str):
        raise TypeError("markdown must be a string")

    base_dir = Path(markdown_dir).resolve()
    permitted_root = None if asset_root is None else Path(asset_root).resolve()

    code_for_state = {
        "invalid": WarningCode.ASSET_LINK_INVALID,
        "missing": WarningCode.ASSET_LINK_MISSING,
    }
    tally = {"invalid": 0, "missing": 0}
    records: list[WarningRecord] = []

    for text in _iter_unprotected_segments(markdown):
        for link in _IMAGE_LINK_RE.finditer(text):
            target = _unwrap_angle_target(link.group("target").strip())
            state, message = _classify_asset_target(target, base_dir, permitted_root)
            if state not in tally:
                continue
            tally[state] += 1
            records.append(_warning(code_for_state[state], message))

    return QualityResult(
        missing_asset_link_count=tally["missing"],
        invalid_asset_link_count=tally["invalid"],
        warnings=tuple(records),
    )
|
||||
|
||||
|
||||
def check_math_renderability(markdown: str, checker: MathChecker | None = None) -> QualityResult:
    """Check each math expression in ``markdown`` with an injected checker.

    With no expressions the result is empty. With no checker, or when the
    checker raises ``MathCheckerUnavailable``, a single INFO warning is
    returned and no failures are counted. Otherwise each failing expression is
    counted and reported as a warning.

    Raises:
        TypeError: if ``markdown`` is not a string.
    """
    if not isinstance(markdown, str):
        raise TypeError("markdown must be a string")

    expressions = extract_math_expressions(markdown)
    if not expressions:
        return QualityResult()

    def not_validated(message: str) -> QualityResult:
        # Unavailability is informational; it is not counted as a render failure.
        notice = WarningRecord(WarningCode.MATH_RENDER_FAILED, WarningSeverity.INFO, message)
        return QualityResult(warnings=(notice,))

    if checker is None:
        return not_validated("Math render checker is unavailable; renderability was not validated.")

    try:
        outcomes = _check_expressions(expressions, checker)
    except MathCheckerUnavailable as error:
        return not_validated(f"Math render checker is unavailable: {error}")

    failures: list[WarningRecord] = []
    for expression, outcome in zip(expressions, outcomes, strict=True):
        if outcome.ok:
            continue
        details = f": {outcome.message}" if outcome.message else ""
        kind = "display" if expression.display else "inline"
        failures.append(
            _warning(
                WarningCode.MATH_RENDER_FAILED,
                f"Math expression {expression.index} ({kind}) failed to render{details}",
            )
        )

    return QualityResult(math_render_error_count=len(failures), warnings=tuple(failures))
|
||||
|
||||
|
||||
def merge_quality_results(*results: QualityResult) -> QualityResult:
    """Merge quality results by summing counters and concatenating warnings in order."""
    missing = 0
    invalid = 0
    math_errors = 0
    merged_warnings: list[WarningRecord] = []
    for result in results:
        missing += result.missing_asset_link_count
        invalid += result.invalid_asset_link_count
        math_errors += result.math_render_error_count
        merged_warnings.extend(result.warnings)

    return QualityResult(
        missing_asset_link_count=missing,
        invalid_asset_link_count=invalid,
        math_render_error_count=math_errors,
        warnings=tuple(merged_warnings),
    )
|
||||
|
||||
|
||||
def extract_math_expressions(markdown: str) -> tuple[MathExpression, ...]:
    """Collect display and inline math found outside code, in document order.

    Display math is matched first; inline math is then searched only in the
    text between display matches. Empty bodies are skipped, as are inline
    bodies that look like currency. Spans are offsets into ``markdown``.
    """
    hits: list[tuple[str, bool, tuple[int, int]]] = []
    for segment in _iter_unprotected_segment_ranges(markdown):
        base = segment.start
        text = segment.text

        # Text between display matches, each paired with its offset in the segment.
        gaps: list[tuple[int, str]] = []
        gap_start = 0
        for display in _DISPLAY_MATH_RE.finditer(text):
            begin, finish = display.span()
            gaps.append((gap_start, text[gap_start:begin]))
            gap_start = finish
            content = display.group("body").strip()
            if content:
                hits.append((content, True, (base + begin, base + finish)))
        gaps.append((gap_start, text[gap_start:]))

        for shift, chunk in gaps:
            for inline in _INLINE_MATH_RE.finditer(chunk):
                content = inline.group("body").strip()
                if not content or _looks_like_currency(content):
                    continue
                origin = base + shift
                hits.append((content, False, (origin + inline.start(), origin + inline.end())))

    hits.sort(key=lambda hit: hit[2][0])
    expressions: list[MathExpression] = []
    for position, (content, is_display, span) in enumerate(hits):
        expressions.append(
            MathExpression(index=position, body=content, display=is_display, markdown_span=span)
        )
    return tuple(expressions)
|
||||
|
||||
|
||||
def _iter_math_bodies(markdown: str) -> tuple[str, ...]:
    """Return only the body text of each extracted math expression."""
    bodies = [expression.body for expression in extract_math_expressions(markdown)]
    return tuple(bodies)
|
||||
|
||||
|
||||
def _check_expressions(expressions: tuple[MathExpression, ...], checker: MathChecker) -> tuple[MathCheckResult, ...]:
    """Run ``checker`` over ``expressions`` and normalise its results.

    A checker with a callable ``check_expressions`` attribute is called once
    with all expressions; otherwise it is called once per expression body.
    """
    batch = getattr(checker, "check_expressions", None)
    if not callable(batch):
        per_body = [_coerce_check_result(checker(expression.body)) for expression in expressions]
        return tuple(per_body)
    return _coerce_batch_results(expressions, batch(expressions))
|
||||
|
||||
|
||||
def _coerce_batch_results(
    expressions: tuple[MathExpression, ...],
    results: object,
) -> tuple[MathCheckResult, ...]:
    """Validate a batch checker's return value and normalise each item.

    Raises:
        MathCheckerUnavailable: if ``results`` is not a list or tuple, if an
            item cannot be coerced, or if the item count does not match the
            expression count.
    """
    if not isinstance(results, (tuple, list)):
        raise MathCheckerUnavailable("checker returned an invalid batch result")

    normalised = tuple(_coerce_check_result(item) for item in results)
    if len(normalised) == len(expressions):
        return normalised
    raise MathCheckerUnavailable("checker returned a result count that does not match the expression count")
|
||||
|
||||
|
||||
def _coerce_check_result(result: bool | MathCheckResult) -> MathCheckResult:
    """Normalise a checker return value to ``MathCheckResult``.

    Raises:
        MathCheckerUnavailable: if ``result`` is neither a bool nor a ``MathCheckResult``.
    """
    if isinstance(result, MathCheckResult):
        return result
    if isinstance(result, bool):
        return MathCheckResult(ok=result)
    raise MathCheckerUnavailable("checker returned an invalid result")
|
||||
|
||||
|
||||
def _looks_like_currency(body: str) -> bool:
    """Return True when the stripped body starts with a digit."""
    leading = body.strip()[:1]
    # An empty slice is not a digit, so blank bodies return False.
    return leading.isdigit()
|
||||
|
||||
|
||||
def _classify_asset_target(target: str, markdown_dir: Path, asset_root: Path | None) -> tuple[str, str]:
    """Classify an image link target as "ok", "missing" or "invalid".

    Returns:
        A ``(state, message)`` pair; the message is empty for "ok".
        Remote, absolute, parent-escaping, outside-asset-root and non-file
        targets are "invalid". Targets that do not exist are "missing".
    """
    if _is_remote_target(target):
        return "invalid", f"Remote asset link is not local: {target}"
    if _is_absolute_or_rooted_path(target):
        return "invalid", f"Absolute asset link is not allowed: {target}"

    normalized = target.replace("\\", "/")
    if _escapes_parent(normalized):
        return "invalid", f"Escaping asset link is not allowed: {target}"

    candidate = (markdown_dir / normalized).resolve()
    if asset_root is not None and not _is_relative_to(candidate, asset_root):
        return "invalid", f"Asset link is outside the asset root: {target}"
    if not candidate.exists():
        return "missing", f"Asset link target does not exist: {target}"
    if not candidate.is_file():
        # An existing directory is not an asset. This also catches an empty
        # target, which resolves to the Markdown directory itself.
        return "invalid", f"Asset link target is not a file: {target}"
    return "ok", ""
|
||||
|
||||
|
||||
def _iter_unprotected_segments(markdown: str) -> tuple[str, ...]:
    """Return the text of every unprotected segment, without offsets."""
    texts = [segment.text for segment in _iter_unprotected_segment_ranges(markdown)]
    return tuple(texts)
|
||||
|
||||
|
||||
def _iter_unprotected_segment_ranges(markdown: str) -> tuple[_Segment, ...]:
    """Return segments that lie outside both fenced code blocks and inline code."""
    unprotected = [
        inline_piece
        for fenced_piece in _split_fenced_code(markdown)
        if not fenced_piece.protected
        for inline_piece in _split_inline_code(fenced_piece.text, start=fenced_piece.start)
        if not inline_piece.protected
    ]
    return tuple(unprotected)
|
||||
|
||||
|
||||
def _split_fenced_code(text: str) -> tuple[_Segment, ...]:
    """Split ``text`` into plain segments and protected fenced-code segments.

    A protected segment runs from the opening fence line through the closing
    fence line. A fence that is never closed protects the rest of the text.
    """
    pieces: list[_Segment] = []
    # (fence character, fence length, start offset) while inside a fence.
    open_fence: tuple[str, int, int] | None = None
    plain_from = 0
    cursor = 0

    for raw_line in text.splitlines(keepends=True):
        begin, cursor = cursor, cursor + len(raw_line)
        content = raw_line.rstrip("\r\n")

        if open_fence is not None:
            char, length, fence_from = open_fence
            if _is_closing_fence(content, char, length):
                pieces.append(_Segment(text[fence_from:cursor], protected=True, start=fence_from))
                open_fence = None
                plain_from = cursor
            continue

        opening = _FENCE_START_RE.match(content)
        if opening is None:
            continue
        if plain_from < begin:
            pieces.append(_Segment(text[plain_from:begin], protected=False, start=plain_from))
        marker = opening.group("fence")
        open_fence = (marker[0], len(marker), begin)

    if open_fence is not None:
        fence_from = open_fence[2]
        pieces.append(_Segment(text[fence_from:], protected=True, start=fence_from))
    elif plain_from < len(text):
        pieces.append(_Segment(text[plain_from:], protected=False, start=plain_from))
    return tuple(pieces)
|
||||
|
||||
|
||||
def _is_closing_fence(line: str, fence_char: str, fence_length: int) -> bool:
    """Return True when ``line`` closes a fence opened with ``fence_char``.

    A closing fence is indented by at most three spaces, consists only of at
    least ``fence_length`` fence characters, and may be followed by spaces or
    tabs only. A line such as "``` ```" is therefore not a closing fence.
    """
    stripped = line.lstrip(" ")
    if len(line) - len(stripped) > 3:
        return False
    marker = stripped.rstrip(" \t")
    return len(marker) >= fence_length and marker == fence_char * len(marker)
|
||||
|
||||
|
||||
def _split_inline_code(text: str, *, start: int = 0) -> tuple[_Segment, ...]:
    """Split ``text`` into plain segments and protected inline-code spans.

    A backtick run opens a span that ends at the next occurrence of the same
    number of backticks. A run with no such match stays in the plain text.
    Segment offsets are shifted by ``start``.
    """
    pieces: list[_Segment] = []
    cursor = 0
    plain_from = 0

    while True:
        opener = text.find("`", cursor)
        if opener == -1:
            break

        run = _count_run(text, opener, "`")
        closer = text.find("`" * run, opener + run)
        if closer == -1:
            # Unmatched run: step over it and keep scanning.
            cursor = opener + run
            continue

        if plain_from < opener:
            pieces.append(_Segment(text[plain_from:opener], protected=False, start=start + plain_from))
        stop = closer + run
        pieces.append(_Segment(text[opener:stop], protected=True, start=start + opener))
        cursor = plain_from = stop

    if plain_from < len(text):
        pieces.append(_Segment(text[plain_from:], protected=False, start=start + plain_from))
    return tuple(pieces)
|
||||
|
||||
|
||||
def _count_run(text: str, start: int, char: str) -> int:
    """Return how many consecutive ``char`` characters begin at ``text[start]``."""
    for position in range(start, len(text)):
        if text[position] != char:
            return position - start
    # The run reached the end of the text, or start was already past the end.
    return max(len(text) - start, 0)
|
||||
|
||||
|
||||
def _unwrap_angle_target(target: str) -> str:
    """Remove a surrounding ``<...>`` pair and trim the inner text; otherwise return as-is."""
    is_wrapped = target[:1] == "<" and target[-1:] == ">"
    if not is_wrapped:
        return target
    return target[1:-1].strip()
|
||||
|
||||
|
||||
def _is_remote_target(target: str) -> bool:
    """Return True when ``target`` starts with a URI scheme.

    A target that parses with a Windows drive (for example ``C:``) is never
    treated as remote, even though it would match the scheme pattern.
    """
    has_windows_drive = bool(PureWindowsPath(target).drive)
    return (not has_windows_drive) and _SCHEME_RE.match(target) is not None
|
||||
|
||||
|
||||
def _is_absolute_or_rooted_path(target: str) -> bool:
    """Return True if ``target`` has a drive or root under Windows or POSIX path rules."""
    as_windows = PureWindowsPath(target)
    if as_windows.drive or as_windows.root:
        return True
    return bool(PurePosixPath(target).root)
|
||||
|
||||
|
||||
def _escapes_parent(target: str) -> bool:
    """Return True when any component of ``target`` is ``..`` (backslashes count as separators)."""
    components = PurePosixPath(target.replace("\\", "/")).parts
    return any(component == ".." for component in components)
|
||||
|
||||
|
||||
def _is_relative_to(path: Path, root: Path) -> bool:
    """Return True when ``path`` is ``root`` itself or lies beneath it."""
    return path.is_relative_to(root)
|
||||
|
||||
|
||||
def _warning(code: WarningCode, message: str) -> WarningRecord:
    """Create a WARNING-severity ``WarningRecord`` for ``code`` and ``message``."""
    severity = WarningSeverity.WARNING
    return WarningRecord(code, severity, message)
|
||||
@@ -0,0 +1,186 @@
|
||||
"""Human-readable quality report rendering."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Any, Literal
|
||||
|
||||
from pdf2md.ir import PathLike, WarningRecord, WarningSeverity
|
||||
from pdf2md.quality import QualityResult
|
||||
|
||||
|
||||
FinalStatus = Literal["success", "partial", "failed"]
|
||||
JsonObject = dict[str, Any]
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ReportPaths:
    """Optional output paths that a report can list; each defaults to None."""

    markdown_path: Path | None = None
    metadata_path: Path | None = None
    report_path: Path | None = None
|
||||
|
||||
|
||||
def render_report(
    metadata: JsonObject,
    *,
    quality: QualityResult | None = None,
    markdown_path: PathLike | None = None,
    metadata_path: PathLike | None = None,
    report_path: PathLike | None = None,
) -> str:
    """Render report Markdown content without writing files.

    The report contains, in order, the sections "Status", "Source And Outputs",
    "Engine", "Summary", "Pages With Warnings" and "Warnings". Counts from the
    metadata summary are combined with the counts carried by ``quality``.

    Args:
        metadata: Metadata mapping; ``summary``, ``warnings``, ``source_pdf``,
            ``engine``, ``engine_version`` and ``engine_options`` are read.
        quality: Quality-check result; a default ``QualityResult()`` is used
            when omitted.
        markdown_path: Optional Markdown output path listed in the report.
        metadata_path: Optional metadata JSON path listed in the report.
        report_path: Optional report path listed in the report.

    Returns:
        The report as a newline-terminated Markdown string.
    """
    if not quality:
        quality = QualityResult()

    # ReportPaths fields are declared in the same order as this tuple.
    paths = ReportPaths(
        *(None if raw is None else Path(raw) for raw in (markdown_path, metadata_path, report_path))
    )
    summary = _summary(metadata)
    status = determine_final_status(metadata, quality)
    flagged_pages = pages_with_warnings(metadata, quality)
    total_warning_count = _int(summary.get("warning_count")) + len(quality.warnings)
    total_math_render_errors = _int(summary.get("math_render_error_count")) + quality.math_render_error_count

    report_lines: list[str] = [
        "# PDF-to-Markdown Quality Report",
        "",
        "## Status",
        "",
        f"- Final status: `{status}`",
        "",
        "## Source And Outputs",
        "",
        f"- Source PDF: {_text(metadata.get('source_pdf'))}",
    ]

    chunk_line = _chunk_line(metadata.get("engine_options", {}))
    if chunk_line is not None:
        report_lines.append(chunk_line)

    labelled_paths = (
        ("Output Markdown", paths.markdown_path),
        ("Metadata JSON", paths.metadata_path),
        ("Report Markdown", paths.report_path),
    )
    for label, location in labelled_paths:
        _append_optional_path(report_lines, label, location)

    report_lines += [
        "",
        "## Engine",
        "",
        f"- Engine: {_text(metadata.get('engine'))}",
        f"- Engine version: {_text(metadata.get('engine_version'))}",
        f"- Engine options: `{_json(metadata.get('engine_options', {}))}`",
        "",
        "## Summary",
        "",
        f"- Pages processed: {_int(summary.get('pages_processed'))}",
        f"- Warning count: {total_warning_count}",
        f"- Asset count: {_int(summary.get('asset_count'))}",
        f"- Missing asset link count: {quality.missing_asset_link_count}",
        f"- Invalid asset link count: {quality.invalid_asset_link_count}",
        f"- Inline formula count: {_int(summary.get('inline_formula_count'))}",
        f"- Display formula count: {_int(summary.get('display_formula_count'))}",
        f"- Math render error count: {total_math_render_errors}",
        "",
        "## Pages With Warnings",
        "",
    ]

    if flagged_pages:
        report_lines += [f"- Page {page_index}" for page_index in flagged_pages]
    else:
        report_lines.append("- None")

    report_lines += ["", "## Warnings", ""]
    report_lines += _warning_lines(metadata, quality) or ["- None"]
    return "\n".join(report_lines) + "\n"
|
||||
|
||||
|
||||
def determine_final_status(metadata: JsonObject, quality: QualityResult | None = None) -> FinalStatus:
    """Classify a conversion as ``"success"``, ``"partial"`` or ``"failed"``.

    Metadata warnings and quality warnings are considered together:
    any warning whose severity equals ``WarningSeverity.ERROR.value`` yields
    ``"failed"``; otherwise any warning, or a truthy ``quality.failure_count``,
    yields ``"partial"``; otherwise the result is ``"success"``.
    """
    quality = quality or QualityResult()
    combined = _metadata_warnings(metadata) + tuple(record.to_dict() for record in quality.warnings)

    for entry in combined:
        if _text(entry.get("severity")) == WarningSeverity.ERROR.value:
            return "failed"

    if combined or quality.failure_count:
        return "partial"
    return "success"
|
||||
|
||||
|
||||
def pages_with_warnings(metadata: JsonObject, quality: QualityResult | None = None) -> tuple[int, ...]:
    """Return the sorted, de-duplicated page indexes that carry at least one warning.

    Page indexes are collected from the metadata warnings and from
    ``quality.warnings``. Metadata entries whose ``page_index`` is not an
    integer are ignored.

    Args:
        metadata: Metadata mapping whose ``warnings`` entries are inspected.
        quality: Quality-check result; a default ``QualityResult()`` is used
            when omitted.

    Returns:
        Ascending tuple of page indexes.
    """
    quality = quality or QualityResult()
    pages: set[int] = set()
    for warning in _metadata_warnings(metadata):
        page_index = warning.get("page_index")
        # bool is a subclass of int; without the explicit exclusion a JSON
        # ``true``/``false`` would be listed as "Page True"/"Page False".
        if isinstance(page_index, int) and not isinstance(page_index, bool):
            pages.add(page_index)
    for warning in quality.warnings:
        if warning.page_index is not None:
            pages.add(warning.page_index)
    return tuple(sorted(pages))
|
||||
|
||||
|
||||
def _warning_lines(metadata: JsonObject, quality: QualityResult) -> list[str]:
    """Format every warning as a Markdown bullet, metadata warnings first.

    Each bullet has the shape ``- `severity` `code`[ page N]: message``.
    """
    bullets = [
        f"- `{_text(entry.get('severity'))}` `{_text(entry.get('code'))}`"
        f"{_format_page(entry.get('page_index'))}: {_text(entry.get('message'))}"
        for entry in _metadata_warnings(metadata)
    ]
    bullets.extend(
        f"- `{record.severity.value}` `{record.code.value}`{_format_page(record.page_index)}: {record.message}"
        for record in quality.warnings
    )
    return bullets
|
||||
|
||||
|
||||
def _metadata_warnings(metadata: JsonObject) -> tuple[JsonObject, ...]:
|
||||
warnings = metadata.get("warnings", ())
|
||||
if not isinstance(warnings, list):
|
||||
return ()
|
||||
return tuple(warning for warning in warnings if isinstance(warning, dict))
|
||||
|
||||
|
||||
def _summary(metadata: JsonObject) -> JsonObject:
|
||||
summary = metadata.get("summary", {})
|
||||
return summary if isinstance(summary, dict) else {}
|
||||
|
||||
|
||||
def _chunk_line(engine_options: object) -> str | None:
|
||||
if not isinstance(engine_options, dict):
|
||||
return None
|
||||
chunk = engine_options.get("chunk")
|
||||
if not isinstance(chunk, dict):
|
||||
return None
|
||||
|
||||
chunk_index = chunk.get("chunk_index")
|
||||
total_chunks = chunk.get("total_chunks")
|
||||
page_start = chunk.get("source_page_start")
|
||||
page_end = chunk.get("source_page_end")
|
||||
if not all(isinstance(value, int) for value in (chunk_index, total_chunks, page_start, page_end)):
|
||||
return None
|
||||
return f"- Chunk: {chunk_index}/{total_chunks}, source pages: {page_start}-{page_end}"
|
||||
|
||||
|
||||
def _append_optional_path(lines: list[str], label: str, path: Path | None) -> None:
|
||||
if path is not None:
|
||||
lines.append(f"- {label}: {path}")
|
||||
|
||||
|
||||
def _format_page(page_index: object) -> str:
|
||||
return f" page {page_index}" if isinstance(page_index, int) else ""
|
||||
|
||||
|
||||
def _text(value: object) -> str:
|
||||
if value is None or value == "":
|
||||
return "unavailable"
|
||||
return str(value)
|
||||
|
||||
|
||||
def _int(value: object) -> int:
|
||||
return value if isinstance(value, int) else 0
|
||||
|
||||
|
||||
def _json(value: object) -> str:
|
||||
try:
|
||||
return json.dumps(value, sort_keys=True)
|
||||
except TypeError:
|
||||
return json.dumps(str(value))
|
||||
@@ -0,0 +1,118 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
# Opt-in gate: the whole module is skipped unless the environment variable
# PDF2MD_RUN_MINERU_FIXTURES is set to exactly "1".
_fixtures_enabled = os.environ.get("PDF2MD_RUN_MINERU_FIXTURES") == "1"
if not _fixtures_enabled:
    pytest.skip(
        "optional local MinerU fixture evaluation is disabled; set PDF2MD_RUN_MINERU_FIXTURES=1 to run",
        allow_module_level=True,
    )


# Two directory levels above the directory containing this file.
REPO_ROOT = Path(__file__).resolve().parents[2]
# Directory globbed for local sample PDFs.
SAMPLES_DIR = REPO_ROOT.joinpath("samples")
|
||||
|
||||
|
||||
def test_optional_local_mineru_samples_produce_release_outputs(tmp_path: Path) -> None:
    """Convert every local sample PDF through the CLI and check the release outputs.

    The test skips when ``pdf2md.cli doctor`` exits non-zero or when no
    ``*.pdf`` files exist under ``SAMPLES_DIR``. For each sample it asserts a
    zero exit code, the presence of the Markdown, metadata and report files,
    and key metadata/report contents, then writes a ``fixture-evaluation.json``
    record of all attempts.
    """
    cli_prefix = [sys.executable, "-m", "pdf2md.cli"]

    doctor = subprocess.run(
        [*cli_prefix, "doctor"],
        cwd=REPO_ROOT,
        check=False,
        capture_output=True,
        text=True,
    )
    if doctor.returncode != 0:
        pytest.skip(f"local MinerU fixture evaluation blocked by doctor:\n{doctor.stdout}\n{doctor.stderr}")

    # Case-insensitive name ordering keeps the attempt record stable.
    sample_pdfs = tuple(sorted(SAMPLES_DIR.glob("*.pdf"), key=lambda path: path.name.casefold()))
    if not sample_pdfs:
        pytest.skip(f"no local sample PDFs found under {SAMPLES_DIR}")

    output_root = tmp_path / "mineru-fixture-output"
    attempts: list[dict[str, object]] = []
    for pdf in sample_pdfs:
        sample_output = output_root / pdf.stem
        # Built once so the recorded command is exactly the one executed.
        command = [*cli_prefix, "convert", str(pdf), "--out", str(sample_output)]
        completed = subprocess.run(
            command,
            cwd=REPO_ROOT,
            check=False,
            capture_output=True,
            text=True,
            timeout=1800,
        )
        attempt: dict[str, object] = {
            "source": str(pdf.relative_to(REPO_ROOT)),
            "command": " ".join(command),
            "exit_code": completed.returncode,
            "stdout": completed.stdout,
            "stderr": completed.stderr,
        }
        attempts.append(attempt)
        assert completed.returncode == 0, json.dumps(attempt, ensure_ascii=False, indent=2)

        markdown_path = sample_output / f"{pdf.stem}.md"
        metadata_path = sample_output / f"{pdf.stem}.metadata.json"
        report_path = sample_output / f"{pdf.stem}.report.md"
        assert markdown_path.exists()
        assert metadata_path.exists()
        assert report_path.exists()

        metadata = json.loads(metadata_path.read_text(encoding="utf-8"))
        summary = metadata["summary"]
        assert metadata["engine"] == "MinerU"
        assert summary["pages_processed"] >= 1
        for required_key in ("warning_count", "math_render_error_count", "asset_count"):
            assert required_key in summary

        report = report_path.read_text(encoding="utf-8")
        for label in ("Output Markdown:", "Metadata JSON:", "Report Markdown:"):
            assert label in report

        attempt.update(
            markdown_path=str(markdown_path),
            metadata_path=str(metadata_path),
            report_path=str(report_path),
            warning_count=summary["warning_count"],
            final_status=_report_final_status(report),
            math_render_error_count=summary["math_render_error_count"],
            asset_count=summary["asset_count"],
            pages_processed=summary["pages_processed"],
        )

    record_path = output_root / "fixture-evaluation.json"
    record_path.write_text(json.dumps({"attempts": attempts}, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
    assert record_path.exists()
|
||||
|
||||
|
||||
def _report_final_status(report: str) -> str:
|
||||
match = re.search(r"^- Final status: `(?P<status>[^`]+)`$", report, re.MULTILINE)
|
||||
return match.group("status") if match else "unavailable"
|
||||
@@ -0,0 +1,152 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
from pdf2md.cli import main
|
||||
from pdf2md.conversion import convert_pdf
|
||||
from pdf2md.ir import WarningCode, WarningRecord, WarningSeverity
|
||||
from pdf2md.mineru_adapter import MinerUAdapterResult
|
||||
|
||||
|
||||
class FixtureAdapter:
    """Adapter test double that returns a canned ``MinerUAdapterResult``.

    It records every ``convert`` call in ``calls`` as ``(input_path, work_dir)``
    and, when ``asset_name`` is set, writes a fake asset file under
    ``<work_dir>/assets``.
    """

    def __init__(
        self,
        *,
        raw_markdown: str,
        raw_structured: object | None = None,
        succeeded: bool = True,
        asset_name: str | None = None,
        warnings: tuple[WarningRecord, ...] = (),
    ) -> None:
        self.raw_markdown = raw_markdown
        self.raw_structured = raw_structured
        self.succeeded = succeeded
        self.asset_name = asset_name
        self.warnings = warnings
        self.calls: list[tuple[Path, Path]] = []

    def convert(self, input_pdf, work_dir, options=None) -> MinerUAdapterResult:
        """Create ``work_dir``, record the call, and return the configured result.

        On failure the result carries no Markdown, exit code 2, and a single
        ``MINERU_CLI_FAILED`` error warning in place of the configured warnings.
        """
        source = Path(input_pdf)
        work = Path(work_dir)
        work.mkdir(parents=True, exist_ok=True)
        self.calls.append((source, work))

        written_assets: list[Path] = []
        if self.asset_name is not None:
            asset_file = work / "assets" / self.asset_name
            asset_file.parent.mkdir(parents=True, exist_ok=True)
            asset_file.write_bytes(b"fake image")
            written_assets.append(asset_file)

        failure = WarningRecord(WarningCode.MINERU_CLI_FAILED, WarningSeverity.ERROR, "MinerU failed.")
        ok = self.succeeded
        engine_options = {"strict_local": True} if options is None else options.to_engine_options()
        return MinerUAdapterResult(
            succeeded=ok,
            command=("mineru", "-p", str(source), "-o", str(work)),
            input_pdf=source,
            work_dir=work,
            raw_markdown=self.raw_markdown if ok else None,
            raw_structured=self.raw_structured,
            asset_paths=tuple(written_assets),
            warnings=self.warnings if ok else (failure,),
            engine="MinerU",
            engine_version="3.1.0",
            engine_options=engine_options,
            exit_code=0 if ok else 2,
            stdout="",
            stderr="",
        )
|
||||
|
||||
|
||||
def fixed_clock() -> datetime:
    """Return the fixed UTC timestamp 2026-05-08T00:00:00 used by these tests."""
    return datetime(year=2026, month=5, day=8, tzinfo=timezone.utc)
|
||||
|
||||
|
||||
def make_pdf(directory: Path, name: str) -> Path:
    """Write a small placeholder PDF named ``name`` under ``directory`` and return its path.

    Missing parent directories are created first.
    """
    pdf_path = directory.joinpath(name)
    pdf_path.parent.mkdir(parents=True, exist_ok=True)
    pdf_path.write_bytes(b"%PDF-1.7\nfast integration fixture\n")
    return pdf_path
|
||||
|
||||
|
||||
def test_v1_fast_conversion_writes_markdown_metadata_report_assets_and_quality_counts(tmp_path: Path) -> None:
    """A fixture conversion with math, a table and an asset yields consistent outputs.

    Checks the final status, the three output files, the copied asset, the
    normalized math delimiters, the metadata summary counts and the report text.
    """
    pdf = make_pdf(tmp_path, "쉘구조_math.pdf")
    out_dir = tmp_path / "out"
    # NOTE(review): the last fragment below is a bare "\n"; it looks like an
    # image link may have been lost from this literal — confirm against the repository.
    raw_markdown = (
        "# Shell Element\n\n"
        "Inline \\(u_i\\) and display:\n\n"
        "\\[\nK u = f\n\\]\n\n"
        '<table><tr><td rowspan="2">\\(N_i\\)</td><td>stress</td></tr></table>\n\n'
        "\n"
    )
    adapter = FixtureAdapter(
        raw_markdown=raw_markdown,
        raw_structured={"pages": [{}, {}, {}]},
        asset_name="mesh.png",
    )

    result = convert_pdf(pdf, out_dir, adapter=adapter, math_checker=lambda _: True, clock=fixed_clock)

    assert result.final_status == "partial"
    assert result.markdown_path.exists()
    assert result.metadata_path is not None and result.metadata_path.exists()
    assert result.report_path.exists()
    assert (out_dir / "쉘구조_math.assets" / "mesh.png").read_bytes() == b"fake image"

    markdown = result.markdown_path.read_text(encoding="utf-8")
    assert "$u_i$" in markdown
    assert "$$\nK u = f\n$$" in markdown
    # NOTE(review): an empty-string membership check is always true; presumably
    # this asserted a rewritten asset link — confirm the intended literal.
    assert "" in markdown

    metadata = json.loads(result.metadata_path.read_text(encoding="utf-8"))
    assert metadata["engine"] == "MinerU"
    assert metadata["engine_version"] == "3.1.0"
    summary = metadata["summary"]
    expected_summary = {
        "pages_processed": 3,
        "asset_count": 1,
        "inline_formula_count": 1,
        "display_formula_count": 1,
        "math_render_error_count": 0,
        "warning_count": 1,
    }
    for key, expected in expected_summary.items():
        assert summary[key] == expected
    assert metadata["warnings"][0]["code"] == "TABLE_FALLBACK"
    assert metadata["assets"] == [{"relative_path": "쉘구조_math.assets/mesh.png"}]

    report = result.report_path.read_text(encoding="utf-8")
    expected_fragments = (
        "- Final status: `partial`",
        "- Output Markdown:",
        "- Metadata JSON:",
        "- Report Markdown:",
        "- Math render error count: 0",
        "`TABLE_FALLBACK`",
    )
    for fragment in expected_fragments:
        assert fragment in report
|
||||
|
||||
|
||||
def test_v1_fast_failure_records_no_fallback_and_writes_no_release_outputs(tmp_path: Path) -> None:
    """A failing adapter gives a ``failed`` result with one warning and no output files."""
    source_pdf = make_pdf(tmp_path, "failed.pdf")
    failing_adapter = FixtureAdapter(raw_markdown="", succeeded=False)

    outcome = convert_pdf(source_pdf, tmp_path / "out", adapter=failing_adapter, clock=fixed_clock)

    assert outcome.final_status == "failed"
    assert outcome.warning_count == 1
    assert outcome.warnings[0].code == WarningCode.MINERU_CLI_FAILED
    assert not outcome.markdown_path.exists()
    assert not outcome.report_path.exists()
    assert outcome.metadata_path is not None and not outcome.metadata_path.exists()
|
||||
|
||||
|
||||
def test_v1_fast_cli_batch_summary_matches_generated_outputs(tmp_path: Path, capsys) -> None:
    """A directory conversion via the CLI reports two conversions and writes all outputs."""
    source_dir = tmp_path / "pdfs"
    out_dir = tmp_path / "out"
    first = make_pdf(source_dir, "a.pdf")
    second = make_pdf(source_dir, "한글.pdf")
    adapter = FixtureAdapter(raw_markdown="# Batch\n\nNo formulas.\n", raw_structured={"pages": 1})

    exit_code = main(["convert", str(source_dir), "--out", str(out_dir)], adapter=adapter, clock=fixed_clock)

    stdout = capsys.readouterr().out
    assert exit_code == 0
    assert [call[0] for call in adapter.calls] == [first.resolve(), second.resolve()]
    for fragment in ("converted: 2", "failed: 0", "warnings: 0"):
        assert fragment in stdout
    for stem in ("a", "한글"):
        for suffix in (".md", ".metadata.json", ".report.md"):
            assert (out_dir / f"{stem}{suffix}").exists()
|
||||
@@ -0,0 +1,232 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import subprocess
|
||||
import sys
|
||||
from datetime import datetime, timezone
|
||||
from importlib.metadata import entry_points
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from pypdf import PdfWriter
|
||||
|
||||
from pdf2md.cli import main
|
||||
from pdf2md.doctor import DoctorCheck, DoctorReport
|
||||
from pdf2md.ir import WarningCode, WarningRecord, WarningSeverity
|
||||
from pdf2md.mineru_adapter import MinerUAdapterResult
|
||||
|
||||
|
||||
class FakeAdapter:
    """Adapter test double for CLI tests.

    Records the input path and the options of every ``convert`` call and
    returns a canned ``MinerUAdapterResult`` that succeeds or fails according
    to ``succeeded``.
    """

    def __init__(self, *, succeeded: bool = True) -> None:
        self.succeeded = succeeded
        self.calls: list[Path] = []
        self.options: list[object] = []

    def convert(self, input_pdf, work_dir, options=None) -> MinerUAdapterResult:
        """Create ``work_dir``, record the call, and return the canned result."""
        source = Path(input_pdf)
        work = Path(work_dir)
        work.mkdir(parents=True, exist_ok=True)
        self.calls.append(source)
        self.options.append(options)

        failure = WarningRecord(WarningCode.MINERU_CLI_FAILED, WarningSeverity.ERROR, "MinerU failed.")
        ok = self.succeeded
        engine_options = {"strict_local": True} if options is None else options.to_engine_options()
        return MinerUAdapterResult(
            succeeded=ok,
            command=("mineru", "-p", str(source), "-o", str(work)),
            input_pdf=source,
            work_dir=work,
            raw_markdown=f"# {source.stem}\n" if ok else None,
            raw_structured={"pages": 1},
            asset_paths=(),
            warnings=() if ok else (failure,),
            engine="MinerU",
            engine_version="3.1.0",
            engine_options=engine_options,
            exit_code=0 if ok else 2,
            stdout="",
            stderr="",
        )
|
||||
|
||||
|
||||
def fixed_clock() -> datetime:
    """Return a constant UTC datetime (2026-05-08 00:00:00) so outputs are reproducible."""
    frozen_now = datetime(2026, 5, 8, tzinfo=timezone.utc)
    return frozen_now
|
||||
|
||||
|
||||
def make_pdf(directory: Path, name: str) -> Path:
    """Create ``directory/name`` containing only a PDF header line and return its path.

    Missing parent directories are created first.
    """
    pdf_path = directory.joinpath(name)
    pdf_path.parent.mkdir(parents=True, exist_ok=True)
    pdf_path.write_bytes(b"%PDF-1.7\n")
    return pdf_path
|
||||
|
||||
|
||||
def make_pdf_with_pages(directory: Path, name: str, page_count: int) -> Path:
    """Write a PDF with ``page_count`` blank 72x72 pages to ``directory/name`` and return its path.

    Missing parent directories are created first.
    """
    pdf_path = directory / name
    pdf_path.parent.mkdir(parents=True, exist_ok=True)

    writer = PdfWriter()
    remaining = page_count
    while remaining > 0:
        writer.add_blank_page(width=72, height=72)
        remaining -= 1

    with pdf_path.open("wb") as stream:
        writer.write(stream)
    return pdf_path
|
||||
|
||||
|
||||
def test_console_script_entry_point_is_reserved() -> None:
    """The ``pdf2md`` console script must point at ``pdf2md.cli:main``."""
    targets_by_name = {}
    for entry_point in entry_points(group="console_scripts"):
        targets_by_name[entry_point.name] = entry_point

    assert targets_by_name["pdf2md"].value == "pdf2md.cli:main"
|
||||
|
||||
|
||||
def test_cli_no_args_prints_help(capsys) -> None:
    """With no arguments the CLI exits 0 and prints usage without ``--no-strict-local``."""
    exit_code = main([])
    assert exit_code == 0

    stdout = capsys.readouterr().out
    assert "usage: pdf2md" in stdout
    assert "convert" in stdout
    assert "--no-strict-local" not in stdout
|
||||
|
||||
|
||||
def test_cli_version_module_execution() -> None:
    """Running ``python -m pdf2md.cli --version`` exits 0 and prints ``pdf2md 0.1.0``."""
    command = [sys.executable, "-m", "pdf2md.cli", "--version"]
    completed = subprocess.run(command, check=False, capture_output=True, text=True)

    assert completed.returncode == 0
    assert completed.stdout.strip() == "pdf2md 0.1.0"
|
||||
|
||||
|
||||
def test_cli_doctor_success_returns_zero(capsys) -> None:
    """A doctor report containing only a passing check exits 0 and prints PASS lines."""

    def passing_runner():
        return DoctorReport((DoctorCheck("python", "pass", "ok"),))

    exit_code = main(["doctor"], doctor_runner=passing_runner)

    stdout = capsys.readouterr().out
    assert exit_code == 0
    assert "Doctor status: PASS" in stdout
    assert "[PASS] python: ok" in stdout
|
||||
|
||||
|
||||
def test_cli_doctor_warning_only_returns_zero(capsys) -> None:
    """A doctor report containing only a warning still exits 0 and prints WARN lines."""

    def warning_runner():
        return DoctorReport((DoctorCheck("gpu", "warn", "missing"),))

    exit_code = main(["doctor"], doctor_runner=warning_runner)

    stdout = capsys.readouterr().out
    assert exit_code == 0
    assert "Doctor status: WARN" in stdout
    assert "[WARN] gpu: missing" in stdout
|
||||
|
||||
|
||||
def test_cli_doctor_failure_returns_nonzero(capsys) -> None:
    """A doctor report containing a failing check exits 1 and prints FAIL lines."""

    def failing_runner():
        return DoctorReport((DoctorCheck("mineru", "fail", "missing"),))

    exit_code = main(["doctor"], doctor_runner=failing_runner)

    stdout = capsys.readouterr().out
    assert exit_code == 1
    assert "Doctor status: FAIL" in stdout
    assert "[FAIL] mineru: missing" in stdout
|
||||
|
||||
|
||||
def test_cli_convert_single_pdf_writes_outputs_and_summary(tmp_path: Path, capsys) -> None:
    """Converting one PDF exits 0, prints the summary counts and writes all three outputs."""
    source_pdf = make_pdf(tmp_path, "paper.pdf")
    out_dir = tmp_path / "out"
    fake = FakeAdapter()

    exit_code = main(["convert", str(source_pdf), "--out", str(out_dir)], adapter=fake, clock=fixed_clock)

    stdout = capsys.readouterr().out
    assert exit_code == 0
    for fragment in ("converted: 1", "failed: 0", "warnings: 0"):
        assert fragment in stdout
    for output_name in ("paper.md", "paper.metadata.json", "paper.report.md"):
        assert (out_dir / output_name).exists()
    assert fake.calls == [source_pdf.resolve()]
    assert fake.options[0].to_engine_options() == {"strict_local": True, "gpu_device": "cuda:0"}
|
||||
|
||||
|
||||
def test_cli_convert_directory_is_deterministic(tmp_path: Path, capsys) -> None:
    """PDFs created as b, a are converted and reported in the order a, b."""
    source_dir = tmp_path / "pdfs"
    for file_name in ("b.pdf", "a.pdf"):
        make_pdf(source_dir, file_name)
    fake = FakeAdapter()

    exit_code = main(["convert", str(source_dir), "--out", str(tmp_path / "out")], adapter=fake, clock=fixed_clock)

    stdout = capsys.readouterr().out
    assert exit_code == 0
    assert [path.name for path in fake.calls] == ["a.pdf", "b.pdf"]
    assert "converted: 2" in stdout
    assert stdout.index("a.pdf") < stdout.index("b.pdf")
|
||||
|
||||
|
||||
def test_cli_convert_recursive_only_when_requested(tmp_path: Path, capsys) -> None:
    """With ``--recursive`` the nested PDF is converted and its output keeps the sub-directory."""
    source_dir = tmp_path / "pdfs"
    out_dir = tmp_path / "out"
    make_pdf(source_dir, "top.pdf")
    make_pdf(source_dir / "nested", "child.pdf")
    fake = FakeAdapter()

    argv = ["convert", str(source_dir), "--out", str(out_dir), "--recursive"]
    exit_code = main(argv, adapter=fake, clock=fixed_clock)

    stdout = capsys.readouterr().out
    assert exit_code == 0
    assert [path.name for path in fake.calls] == ["child.pdf", "top.pdf"]
    assert "converted: 2" in stdout
    assert (out_dir / "nested" / "child.md").exists()
|
||||
|
||||
|
||||
def test_cli_failure_summary_returns_nonzero(tmp_path: Path, capsys) -> None:
    """A failing adapter makes the CLI exit 1, report the failure, and write no Markdown."""
    source_pdf = make_pdf(tmp_path, "paper.pdf")
    out_dir = tmp_path / "out"
    failing = FakeAdapter(succeeded=False)

    exit_code = main(["convert", str(source_pdf), "--out", str(out_dir)], adapter=failing, clock=fixed_clock)

    stdout = capsys.readouterr().out
    assert exit_code == 1
    assert "failed: 1" in stdout
    assert "warnings: 1" in stdout
    assert not (out_dir / "paper.md").exists()
|
||||
|
||||
|
||||
def test_cli_preflight_conflict_fails_before_conversion(tmp_path: Path, capsys) -> None:
    """An existing planned output makes the CLI exit 2 without calling the adapter."""
    source_pdf = make_pdf(tmp_path, "paper.pdf")
    out_dir = tmp_path / "out"
    out_dir.mkdir()
    existing_output = out_dir / "paper.md"
    existing_output.write_text("old", encoding="utf-8")
    fake = FakeAdapter()

    exit_code = main(["convert", str(source_pdf), "--out", str(out_dir)], adapter=fake, clock=fixed_clock)

    stderr = capsys.readouterr().err
    assert exit_code == 2
    assert "planned outputs already exist" in stderr
    assert fake.calls == []
|
||||
|
||||
|
||||
def test_cli_convert_chunk_pages_flag_uses_default_twenty_pages(tmp_path: Path, capsys) -> None:
    """A bare ``--chunk-pages`` splits a 21-page PDF into a 20-page part and a 1-page part."""
    source_pdf = make_pdf_with_pages(tmp_path, "long.pdf", 21)
    out_dir = tmp_path / "out"
    fake = FakeAdapter()

    argv = ["convert", str(source_pdf), "--out", str(out_dir), "--chunk-pages"]
    exit_code = main(argv, adapter=fake, clock=fixed_clock)

    stdout = capsys.readouterr().out
    assert exit_code == 0
    assert "converted: 2" in stdout
    expected_parts = ["long.part-001.pages-001-020", "long.part-002.pages-021-021"]
    assert [path.name for path in fake.calls] == [f"{part}.pdf" for part in expected_parts]
    for part in expected_parts:
        assert (out_dir / f"{part}.md").exists()
|
||||
|
||||
|
||||
def test_cli_convert_rejects_non_positive_chunk_pages(tmp_path: Path, capsys) -> None:
    """``--chunk-pages 0`` raises ``SystemExit`` with code 2 and an explanatory message on stderr."""
    source_pdf = make_pdf(tmp_path, "paper.pdf")
    argv = ["convert", str(source_pdf), "--out", str(tmp_path / "out"), "--chunk-pages", "0"]

    with pytest.raises(SystemExit) as error:
        main(argv)

    stderr = capsys.readouterr().err
    assert error.value.code == 2
    assert "must be a positive integer" in stderr
|
||||
@@ -0,0 +1,418 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from pypdf import PdfWriter
|
||||
|
||||
import pdf2md.conversion as conversion_module
|
||||
from pdf2md.conversion import BatchConversionResult, convert_input, convert_pdf
|
||||
from pdf2md.ir import WarningCode, WarningRecord, WarningSeverity
|
||||
from pdf2md.mineru_adapter import MinerUAdapterResult, StrictLocalViolationError
|
||||
from pdf2md.paths import OutputConflictError
|
||||
|
||||
|
||||
class FakeAdapter:
    """Configurable adapter test double for conversion tests.

    Each ``convert`` call writes a ``raw.log`` file into the work directory,
    records ``(input_path, work_dir, options)`` in ``calls`` and, when
    ``asset_name`` is set, writes an asset file under ``<work_dir>/assets``.
    The configured ``warnings`` are returned on success and on failure alike.
    """

    def __init__(
        self,
        *,
        raw_markdown: str = "# Title\n",
        raw_structured: object | None = None,
        succeeded: bool = True,
        warnings: tuple[WarningRecord, ...] = (),
        asset_name: str | None = None,
    ) -> None:
        self.raw_markdown = raw_markdown
        self.raw_structured = raw_structured
        self.succeeded = succeeded
        self.warnings = warnings
        self.asset_name = asset_name
        self.calls: list[tuple[Path, Path, object]] = []

    def convert(self, input_pdf, work_dir, options=None) -> MinerUAdapterResult:
        """Create ``work_dir``, leave a raw log in it, and return the configured result."""
        source = Path(input_pdf)
        work = Path(work_dir)
        work.mkdir(parents=True, exist_ok=True)
        (work / "raw.log").write_text("raw output", encoding="utf-8")
        self.calls.append((source, work, options))

        written_assets: list[Path] = []
        if self.asset_name is not None:
            asset_file = work / "assets" / self.asset_name
            asset_file.parent.mkdir(parents=True, exist_ok=True)
            asset_file.write_bytes(b"asset")
            written_assets.append(asset_file)

        ok = self.succeeded
        engine_options = {"strict_local": True} if options is None else options.to_engine_options()
        return MinerUAdapterResult(
            succeeded=ok,
            command=("mineru", "-p", str(source), "-o", str(work)),
            input_pdf=source,
            work_dir=work,
            raw_markdown=self.raw_markdown if ok else None,
            raw_structured=self.raw_structured,
            asset_paths=tuple(written_assets),
            warnings=self.warnings,
            engine="MinerU",
            engine_version="3.1.0",
            engine_options=engine_options,
            exit_code=0 if ok else 2,
            stdout="",
            stderr="",
        )
|
||||
|
||||
|
||||
class SequencedAdapter:
    """Adapter test double whose calls succeed or fail according to a fixed sequence.

    Each ``convert`` call consumes the next entry of ``outcomes`` from the
    front and records the input path in ``calls``.
    """

    def __init__(self, outcomes: tuple[bool, ...]) -> None:
        self.outcomes = list(outcomes)
        self.calls: list[Path] = []

    def convert(self, input_pdf, work_dir, options=None) -> MinerUAdapterResult:
        """Create ``work_dir`` and return a result matching the next queued outcome."""
        source = Path(input_pdf)
        work = Path(work_dir)
        work.mkdir(parents=True, exist_ok=True)
        self.calls.append(source)

        ok = self.outcomes.pop(0)
        failure = WarningRecord(WarningCode.MINERU_CLI_FAILED, WarningSeverity.ERROR, "MinerU failed.")
        engine_options = {"strict_local": True} if options is None else options.to_engine_options()
        return MinerUAdapterResult(
            succeeded=ok,
            command=("mineru", "-p", str(source), "-o", str(work)),
            input_pdf=source,
            work_dir=work,
            raw_markdown=f"# {source.stem}\n" if ok else None,
            raw_structured={"pages": 1},
            asset_paths=(),
            warnings=() if ok else (failure,),
            engine="MinerU",
            engine_version="3.1.0",
            engine_options=engine_options,
            exit_code=0 if ok else 2,
            stdout="",
            stderr="",
        )
|
||||
|
||||
|
||||
class NestedMinerUAssetAdapter:
    """Adapter test double that places its asset in a nested ``paper/hybrid_auto/images`` directory."""

    def convert(self, input_pdf, work_dir, options=None) -> MinerUAdapterResult:
        """Write ``fig.png`` under the nested image directory and return a successful result."""
        source = Path(input_pdf)
        work = Path(work_dir)
        nested_asset = work / "paper" / "hybrid_auto" / "images" / "fig.png"
        nested_asset.parent.mkdir(parents=True, exist_ok=True)
        nested_asset.write_bytes(b"nested asset")

        engine_options = {"strict_local": True} if options is None else options.to_engine_options()
        return MinerUAdapterResult(
            succeeded=True,
            command=("mineru", "-p", str(source), "-o", str(work)),
            input_pdf=source,
            work_dir=work,
            # NOTE(review): the leading "\n\n" looks like the remainder of an
            # image link lost from this literal — confirm against the repository.
            raw_markdown="\n\n\\[x^2\\]\n",
            raw_structured=[{"page_idx": 0}, {"page_idx": 12}],
            asset_paths=(nested_asset,),
            warnings=(),
            engine="MinerU",
            engine_version="3.1.0",
            engine_options=engine_options,
            exit_code=0,
            stdout="",
            stderr="",
        )
|
||||
|
||||
|
||||
def fixed_clock() -> datetime:
    """Return midnight UTC on 2026-05-08, the frozen time used by the conversion tests."""
    return datetime(year=2026, month=5, day=8, hour=0, tzinfo=timezone.utc)
|
||||
|
||||
|
||||
def make_pdf(tmp_path: Path, name: str = "paper.pdf") -> Path:
    """Write a small placeholder PDF to ``tmp_path/name`` and return its path.

    Missing parent directories are created first.
    """
    pdf_path = tmp_path.joinpath(name)
    pdf_path.parent.mkdir(parents=True, exist_ok=True)
    pdf_path.write_bytes(b"%PDF-1.7\nlocal fixture\n")
    return pdf_path
|
||||
|
||||
|
||||
def make_pdf_with_pages(tmp_path: Path, page_count: int, name: str = "paper.pdf") -> Path:
    """Write a PDF with ``page_count`` blank 72x72 pages to ``tmp_path/name`` and return its path.

    Missing parent directories are created first.
    """
    pdf_path = tmp_path / name
    pdf_path.parent.mkdir(parents=True, exist_ok=True)

    writer = PdfWriter()
    remaining = page_count
    while remaining > 0:
        writer.add_blank_page(width=72, height=72)
        remaining -= 1

    with pdf_path.open("wb") as stream:
        writer.write(stream)
    return pdf_path
|
||||
|
||||
|
||||
def test_convert_pdf_writes_markdown_metadata_report_and_assets(tmp_path: Path) -> None:
    """A successful conversion writes Markdown, metadata, report and assets, then removes the work dir."""
    pdf = make_pdf(tmp_path)
    out_dir = tmp_path / "out"
    # NOTE(review): the trailing blank lines in this literal (and in the expected
    # Markdown below) look like the remainder of an image link lost from the
    # text — confirm against the repository.
    adapter = FakeAdapter(
        raw_markdown="# Title\n\nInline \\(x_i\\)\n\n\n",
        raw_structured={"pages": [{}, {}]},
        asset_name="fig.png",
    )

    result = convert_pdf(pdf, out_dir, adapter=adapter, math_checker=lambda _: True, clock=fixed_clock)

    assert result.succeeded is True
    assert result.final_status == "success"
    assert result.pages_processed == 2
    assert result.warning_count == 0
    assert result.engine == "MinerU"
    assert result.engine_version == "3.1.0"
    assert result.markdown_path.read_text(encoding="utf-8") == "# Title\n\nInline $x_i$\n\n\n"
    assert (out_dir / "paper.assets" / "fig.png").read_bytes() == b"asset"
    assert result.report_path.exists()

    metadata = json.loads(result.metadata_path.read_text(encoding="utf-8"))
    assert metadata["source_sha256"] == hashlib.sha256(pdf.read_bytes()).hexdigest()
    assert metadata["created_at"] == "2026-05-08T00:00:00Z"
    summary = metadata["summary"]
    assert summary["pages_processed"] == 2
    assert summary["inline_formula_count"] == 1
    assert summary["asset_count"] == 1
    assert metadata["assets"] == [{"relative_path": "paper.assets/fig.png"}]
    assert "- Final status: `success`" in result.report_path.read_text(encoding="utf-8")

    # The adapter's work directory must no longer exist after conversion.
    work_dir = adapter.calls[0][1]
    assert not work_dir.exists()
|
||||
|
||||
|
||||
def test_convert_pdf_adapter_failure_returns_failed_result_without_fallback_or_outputs(tmp_path: Path) -> None:
    """A failing adapter gives a failed result, is called once, and leaves no Markdown or report."""
    source_pdf = make_pdf(tmp_path)
    failure = WarningRecord(WarningCode.MINERU_CLI_FAILED, WarningSeverity.ERROR, "MinerU failed.")
    failing_adapter = FakeAdapter(succeeded=False, warnings=(failure,))

    outcome = convert_pdf(source_pdf, tmp_path / "out", adapter=failing_adapter, clock=fixed_clock)

    assert outcome.succeeded is False
    assert outcome.final_status == "failed"
    assert outcome.warnings == (failure,)
    # Exactly one adapter call was recorded.
    assert len(failing_adapter.calls) == 1
    for written in (outcome.markdown_path, outcome.report_path):
        assert not written.exists()
|
||||
|
||||
|
||||
def test_convert_pdf_respects_output_conflicts_and_overwrite(tmp_path: Path) -> None:
    """An existing output raises OutputConflictError unless ``overwrite=True`` is passed."""
    source_pdf = make_pdf(tmp_path)
    out_dir = tmp_path / "out"
    out_dir.mkdir()
    stale_markdown = out_dir / "paper.md"
    stale_markdown.write_text("old", encoding="utf-8")

    with pytest.raises(OutputConflictError):
        convert_pdf(source_pdf, out_dir, adapter=FakeAdapter(), clock=fixed_clock)

    replacement = FakeAdapter(raw_markdown="new\n")
    outcome = convert_pdf(source_pdf, out_dir, adapter=replacement, clock=fixed_clock, overwrite=True)

    assert outcome.succeeded is True
    assert outcome.markdown_path.read_text(encoding="utf-8") == "new\n"
|
||||
|
||||
|
||||
def test_convert_pdf_can_skip_metadata_json_but_still_writes_report(tmp_path: Path) -> None:
    """With ``metadata=False`` no metadata JSON is written, but Markdown and report are."""
    out_dir = tmp_path / "out"
    source_pdf = make_pdf(tmp_path)

    outcome = convert_pdf(source_pdf, out_dir, metadata=False, adapter=FakeAdapter(), clock=fixed_clock)

    assert outcome.metadata_path is None
    assert outcome.markdown_path.exists()
    assert outcome.report_path.exists()
    assert not (out_dir / "paper.metadata.json").exists()

    report_text = outcome.report_path.read_text(encoding="utf-8")
    assert "Metadata JSON:" not in report_text
    assert "Report Markdown:" in report_text
|
||||
|
||||
|
||||
def test_convert_pdf_records_math_checker_failures_in_metadata_and_report(tmp_path: Path) -> None:
    """A math checker that rejects an expression gives a partial result recorded in metadata and report."""
    source_pdf = make_pdf(tmp_path)
    fake = FakeAdapter(raw_markdown="Inline \\(bad_math\\)\n")

    def reject_everything(_expression):
        return False

    outcome = convert_pdf(source_pdf, tmp_path / "out", adapter=fake, math_checker=reject_everything, clock=fixed_clock)

    assert outcome.final_status == "partial"
    codes = [item.code for item in outcome.warnings]
    assert codes == [WarningCode.MATH_RENDER_FAILED]

    meta = json.loads(outcome.metadata_path.read_text(encoding="utf-8"))
    assert meta["summary"]["math_render_error_count"] == 1
    assert meta["warnings"][0]["code"] == "MATH_RENDER_FAILED"

    report_text = outcome.report_path.read_text(encoding="utf-8")
    assert "- Math render error count: 1" in report_text
    assert "`MATH_RENDER_FAILED`" in report_text
|
||||
|
||||
|
||||
def test_convert_pdf_records_unavailable_math_checker_for_math_output(tmp_path: Path, monkeypatch) -> None:
    """With no default math checker, math output gives one INFO warning and zero render errors."""
    monkeypatch.setattr(conversion_module, "create_default_math_checker", lambda: None)
    fake = FakeAdapter(raw_markdown="Inline \\(x\\)\n")
    source_pdf = make_pdf(tmp_path)

    outcome = convert_pdf(source_pdf, tmp_path / "out", adapter=fake, clock=fixed_clock)

    assert outcome.final_status == "partial"
    first_warning = outcome.warnings[0]
    assert first_warning.code == WarningCode.MATH_RENDER_FAILED
    assert first_warning.severity == WarningSeverity.INFO

    summary = json.loads(outcome.metadata_path.read_text(encoding="utf-8"))["summary"]
    assert summary["warning_count"] == 1
    assert summary["math_render_error_count"] == 0

    report_text = outcome.report_path.read_text(encoding="utf-8")
    assert "unavailable" in report_text
    assert "- Math render error count: 0" in report_text
|
||||
|
||||
|
||||
def test_convert_pdf_uses_default_math_checker_when_available(tmp_path: Path, monkeypatch) -> None:
    """Without an explicit ``math_checker``, the patched default factory's checker receives the expressions."""

    class DefaultChecker:
        """Records the bodies of the expressions it is asked to check and accepts them."""

        def __init__(self) -> None:
            self.bodies: list[str] = []

        def check_expressions(self, expressions):
            collected = []
            for expression in expressions:
                collected.append(expression.body)
            self.bodies = collected
            return (True,)

    recording_checker = DefaultChecker()
    source_pdf = make_pdf(tmp_path)
    fake = FakeAdapter(raw_markdown="Inline \\(x\\)\n")
    monkeypatch.setattr(conversion_module, "create_default_math_checker", lambda: recording_checker)

    outcome = convert_pdf(source_pdf, tmp_path / "out", adapter=fake, clock=fixed_clock)

    assert outcome.final_status == "success"
    assert outcome.warning_count == 0
    assert recording_checker.bodies == ["x"]
|
||||
|
||||
|
||||
def test_convert_pdf_keep_raw_preserves_adapter_work_directory(tmp_path: Path) -> None:
    """``keep_raw=True`` exposes ``<out>/paper.raw`` containing the adapter's raw log."""
    out_dir = tmp_path / "out"
    source_pdf = make_pdf(tmp_path)

    outcome = convert_pdf(source_pdf, out_dir, keep_raw=True, adapter=FakeAdapter(), clock=fixed_clock)

    assert outcome.raw_dir == out_dir / "paper.raw"
    raw_log = outcome.raw_dir / "raw.log"
    assert raw_log.read_text(encoding="utf-8") == "raw output"
|
||||
|
||||
|
||||
def test_convert_pdf_rejects_disabling_strict_local(tmp_path: Path) -> None:
    """Passing ``strict_local=False`` raises StrictLocalViolationError."""
    source_pdf = make_pdf(tmp_path)
    out_dir = tmp_path / "out"

    with pytest.raises(StrictLocalViolationError):
        convert_pdf(source_pdf, out_dir, strict_local=False, adapter=FakeAdapter(), clock=fixed_clock)
|
||||
|
||||
|
||||
def test_convert_pdf_passes_gpu_device_to_strict_local_options(tmp_path: Path) -> None:
    """An explicit ``gpu`` value shows up in the engine options handed to the adapter."""
    fake = FakeAdapter()
    source_pdf = make_pdf(tmp_path)

    convert_pdf(source_pdf, tmp_path / "out", gpu="cuda:0", adapter=fake, clock=fixed_clock)

    # Third element of the first recorded call is the options object.
    options = fake.calls[0][2]
    assert options.to_engine_options() == {"strict_local": True, "gpu_device": "cuda:0"}
|
||||
|
||||
|
||||
def test_convert_pdf_defaults_to_cuda_zero(tmp_path: Path) -> None:
    """When ``gpu`` is not passed, the engine options report ``cuda:0``."""
    fake = FakeAdapter()
    source_pdf = make_pdf(tmp_path)

    convert_pdf(source_pdf, tmp_path / "out", adapter=fake, clock=fixed_clock)

    expected_options = {"strict_local": True, "gpu_device": "cuda:0"}
    options = fake.calls[0][2]
    assert options.to_engine_options() == expected_options
|
||||
|
||||
|
||||
def test_convert_pdf_rewrites_nested_mineru_image_links_and_page_indexes(tmp_path: Path) -> None:
    """Nested MinerU image links are rewritten to the copied asset and pages are counted.

    Uses ``NestedMinerUAssetAdapter`` and checks that the original
    ``images/fig.png`` link no longer appears, that the asset is copied under
    ``paper.assets/paper/hybrid_auto/images/``, and that 13 pages are reported
    in both the result and the metadata summary.
    """
    pdf = make_pdf(tmp_path)
    out_dir = tmp_path / "out"

    result = convert_pdf(
        pdf,
        out_dir,
        adapter=NestedMinerUAssetAdapter(),
        math_checker=lambda _: True,
        clock=fixed_clock,
    )

    assert result.final_status == "success"
    assert result.pages_processed == 13

    markdown = result.markdown_path.read_text(encoding="utf-8")
    # NOTE(review): the original positive assertion was `"" in markdown`, which is
    # always true (the image literal appears to have been lost). The expected
    # target below is derived from the copied-asset location asserted further
    # down, relative to the output directory; confirm the exact link text.
    assert "](paper.assets/paper/hybrid_auto/images/fig.png)" in markdown
    assert "](images/fig.png)" not in markdown

    copied_asset = out_dir / "paper.assets" / "paper" / "hybrid_auto" / "images" / "fig.png"
    assert copied_asset.read_bytes() == b"nested asset"

    metadata = json.loads(result.metadata_path.read_text(encoding="utf-8"))
    assert metadata["summary"]["pages_processed"] == 13
    assert metadata["summary"]["warning_count"] == 0
|
||||
|
||||
|
||||
def test_convert_input_batch_continues_after_per_file_failure(tmp_path: Path) -> None:
    """A failure on the middle PDF of a directory does not stop the remaining conversions."""
    pdf_dir = tmp_path / "pdfs"
    for file_name in ("a.pdf", "b.pdf", "c.pdf"):
        make_pdf(pdf_dir, file_name)
    sequenced = SequencedAdapter((True, False, True))
    out_dir = tmp_path / "out"

    batch = convert_input(pdf_dir, out_dir, adapter=sequenced, clock=fixed_clock)

    called_names = [called.name for called in sequenced.calls]
    assert called_names == ["a.pdf", "b.pdf", "c.pdf"]
    assert batch.converted_count == 2
    assert batch.failed_count == 1
    assert (out_dir / "a.md").exists()
    assert not (out_dir / "b.md").exists()
    assert (out_dir / "c.md").exists()
|
||||
|
||||
|
||||
def test_convert_pdf_chunk_mode_returns_batch_and_deletes_temporary_chunk_pdfs(tmp_path: Path) -> None:
    """A 41-page PDF with ``chunk_pages=20`` gives three chunk results and no leftover chunk PDFs.

    Also checks that each result points back at the original PDF and that the
    second chunk's metadata and report describe its page range.
    """
    source_pdf = make_pdf_with_pages(tmp_path, 41, "thesis.pdf")
    out_dir = tmp_path / "out"
    fake = FakeAdapter(raw_structured={"pages": 1})
    chunk_stems = [
        "thesis.part-001.pages-001-020",
        "thesis.part-002.pages-021-040",
        "thesis.part-003.pages-041-041",
    ]
    second_stem = chunk_stems[1]

    batch = convert_pdf(
        source_pdf,
        out_dir,
        adapter=fake,
        math_checker=lambda _: True,
        chunk_pages=20,
        clock=fixed_clock,
    )

    assert isinstance(batch, BatchConversionResult)
    assert batch.converted_count == 3
    assert [item.markdown_path.name for item in batch.results] == [stem + ".md" for stem in chunk_stems]
    assert [chunk_pdf.name for chunk_pdf, _, _ in fake.calls] == [stem + ".pdf" for stem in chunk_stems]

    resolved_source = source_pdf.resolve()
    assert all(item.source_pdf == resolved_source for item in batch.results)
    # The PDFs handed to the adapter must be gone after the run.
    assert all(not chunk_pdf.exists() for chunk_pdf, _, _ in fake.calls)

    meta = json.loads((out_dir / (second_stem + ".metadata.json")).read_text(encoding="utf-8"))
    assert meta["source_pdf"] == str(resolved_source)
    assert meta["source_sha256"] == hashlib.sha256(source_pdf.read_bytes()).hexdigest()
    assert meta["engine_options"]["chunk"] == {
        "chunk_index": 2,
        "chunk_page_count": 20,
        "chunk_pdf_name": second_stem + ".pdf",
        "original_source_pdf": str(resolved_source),
        "source_page_end": 40,
        "source_page_start": 21,
        "total_chunks": 3,
    }

    report_text = (out_dir / (second_stem + ".report.md")).read_text(encoding="utf-8")
    assert "- Chunk: 2/3, source pages: 21-40" in report_text
|
||||
|
||||
|
||||
def test_convert_pdf_chunk_mode_keeps_short_pdf_as_single_batch_result(tmp_path: Path) -> None:
    """A 3-page PDF with ``chunk_pages=20`` is converted as one result from the original file."""
    source_pdf = make_pdf_with_pages(tmp_path, 3, "short.pdf")
    fake = FakeAdapter(raw_structured={"pages": 3})

    batch = convert_pdf(source_pdf, tmp_path / "out", adapter=fake, chunk_pages=20, clock=fixed_clock)

    assert isinstance(batch, BatchConversionResult)
    assert batch.converted_count == 1
    only_result = batch.results[0]
    assert only_result.markdown_path.name == "short.md"

    # The adapter received the original PDF, which still exists afterwards.
    pdf_given_to_adapter = fake.calls[0][0]
    assert pdf_given_to_adapter == source_pdf.resolve()
    assert fake.calls[0][0].exists()
|
||||
|
||||
|
||||
def test_convert_input_chunk_mode_continues_after_failed_chunk(tmp_path: Path) -> None:
    """A failing middle chunk does not prevent the first and last chunks from being written."""
    source_pdf = make_pdf_with_pages(tmp_path, 41, "paper.pdf")
    sequenced = SequencedAdapter((True, False, True))
    out_dir = tmp_path / "out"
    first, second, third = (
        "paper.part-001.pages-001-020",
        "paper.part-002.pages-021-040",
        "paper.part-003.pages-041-041",
    )

    batch = convert_input(source_pdf, out_dir, adapter=sequenced, chunk_pages=20, clock=fixed_clock)

    assert batch.converted_count == 2
    assert batch.failed_count == 1
    assert [called.name for called in sequenced.calls] == [first + ".pdf", second + ".pdf", third + ".pdf"]
    assert (out_dir / (first + ".md")).exists()
    assert not (out_dir / (second + ".md")).exists()
    assert (out_dir / (third + ".md")).exists()
|
||||
@@ -0,0 +1,311 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from pdf2md.doctor import DoctorCommandResult, DoctorReport, format_doctor_report, run_doctor
|
||||
from pdf2md.ir import WarningCode, WarningRecord, WarningSeverity
|
||||
from pdf2md.math_render import default_mathjax_helper_path
|
||||
from pdf2md.mineru_adapter import MinerUVersionResult
|
||||
|
||||
|
||||
class FakeMinerUProbe:
    """Test double whose ``version()`` returns the result given at construction."""

    def __init__(self, result: MinerUVersionResult) -> None:
        # Stored on the public ``result`` attribute and handed back unchanged.
        self.result = result

    def version(self) -> MinerUVersionResult:
        """Return the canned version result."""
        canned = self.result
        return canned
|
||||
|
||||
|
||||
class FakeCuda:
    """Stand-in exposing the CUDA query methods used by these tests.

    ``devices`` and ``capabilities`` are tuples indexed by device index;
    ``device_count()`` is the length of ``devices``.
    """

    def __init__(
        self,
        *,
        available: bool = True,
        devices: tuple[str, ...] = ("NVIDIA RTX 4060",),
        capabilities: tuple[tuple[int, int], ...] = ((8, 9),),
    ) -> None:
        self._available, self._devices, self._capabilities = available, devices, capabilities

    def is_available(self) -> bool:
        """Return the availability flag given at construction."""
        return self._available

    def device_count(self) -> int:
        """Return how many device names were configured."""
        names = self._devices
        return len(names)

    def get_device_name(self, index: int) -> str:
        """Return the configured name at ``index``."""
        names = self._devices
        return names[index]

    def get_device_capability(self, index: int) -> tuple[int, int]:
        """Return the configured capability pair at ``index``."""
        pairs = self._capabilities
        return pairs[index]
|
||||
|
||||
|
||||
class FakeTorchVersion:
    """Holds the CUDA version string exposed as ``FakeTorch.version.cuda``."""

    cuda = "12.8"
|
||||
|
||||
|
||||
class FakeTorch:
    """Module-like test double with ``__version__``, ``version`` and a ``cuda`` attribute."""

    # Class-level constants shared by every instance.
    __version__ = "2.8.0+cu128"
    version = FakeTorchVersion()

    def __init__(self, cuda: FakeCuda) -> None:
        # The CUDA double is per instance so each test can configure its own.
        self.cuda = cuda
|
||||
|
||||
|
||||
def test_doctor_all_checks_pass_with_mocked_tools(tmp_path: Path) -> None:
    """With every tool mocked as present the report passes and lists the checks in a fixed order."""
    hf_home = tmp_path / "hf"
    expected_names = ["python", "uv", "mineru", "gpu", "pytorch", "models", "mathjax", "local-only"]

    report = make_report(tmp_path, env={"HF_HOME": str(hf_home)}, existing_paths={hf_home})

    assert report.status == "pass"
    assert report.exit_code == 0
    actual_names = [check.name for check in report.checks]
    assert actual_names == expected_names
|
||||
|
||||
|
||||
def test_doctor_fails_outside_python_312(tmp_path: Path) -> None:
    """Python 3.11.9 makes the python check, and the whole report, fail."""
    old_python = (3, 11, 9)

    report = make_report(tmp_path, python_version=old_python)
    check = find_check(report, "python")

    assert report.status == "fail"
    assert check.status == "fail"
    assert "use Python 3.12.x" in check.message
|
||||
|
||||
|
||||
def test_doctor_fails_when_uv_is_missing(tmp_path: Path) -> None:
    """A tool map without ``uv`` fails the uv check and the report."""
    tools_without_uv = {"nvidia-smi": "C:/Windows/System32/nvidia-smi.exe"}

    report = make_report(tmp_path, available_tools=tools_without_uv)
    check = find_check(report, "uv")

    assert report.status == "fail"
    assert check.status == "fail"
    assert "uv executable was not found" in check.message
|
||||
|
||||
|
||||
def test_doctor_fails_when_mineru_is_missing(tmp_path: Path) -> None:
    """An unavailable MinerU probe result fails the mineru check with exit code 1."""
    unavailable = MinerUVersionResult(
        available=False,
        version=None,
        command=("mineru", "--version"),
        exit_code=None,
        stdout="",
        stderr="",
    )

    report = make_report(tmp_path, mineru_result=unavailable)
    check = find_check(report, "mineru")

    assert report.status == "fail"
    assert report.exit_code == 1
    assert check.status == "fail"
    assert "MinerU CLI executable was not found" in check.message
|
||||
|
||||
|
||||
def test_doctor_warns_when_mineru_version_command_fails(tmp_path: Path) -> None:
    """An available MinerU whose version command exits 2 produces a warning, not a failure."""
    cli_failure = WarningRecord(WarningCode.MINERU_CLI_FAILED, WarningSeverity.ERROR, "MinerU version command failed.")
    broken_version = MinerUVersionResult(
        available=True,
        version=None,
        command=("mineru", "--version"),
        exit_code=2,
        stdout="",
        stderr="boom",
        warnings=(cli_failure,),
    )

    report = make_report(tmp_path, mineru_result=broken_version)
    check = find_check(report, "mineru")

    assert report.status == "warn"
    assert check.status == "warn"
    assert "version could not be detected" in check.message
|
||||
|
||||
|
||||
def test_doctor_warns_when_mineru_version_is_not_target(tmp_path: Path) -> None:
    """MinerU 3.1.8 is reported with a warning that mentions the 3.1.0 project target."""
    version_line = "mineru, version 3.1.8"
    other_version = MinerUVersionResult(
        available=True,
        version=version_line,
        command=("mineru", "--version"),
        exit_code=0,
        stdout=version_line,
        stderr="",
    )

    report = make_report(tmp_path, mineru_result=other_version)
    check = find_check(report, "mineru")

    assert report.status == "warn"
    assert check.status == "warn"
    assert "project target is 3.1.0" in check.message
|
||||
|
||||
|
||||
def test_doctor_warns_when_gpu_and_pytorch_are_missing(tmp_path: Path) -> None:
    """Only ``uv`` on the tool map and a failing torch import warn on both gpu and pytorch."""
    only_uv = {"uv": "C:/Users/user/.local/bin/uv.exe"}

    report = make_report(tmp_path, available_tools=only_uv, import_module=missing_torch)

    assert report.status == "warn"
    for check_name in ("gpu", "pytorch"):
        assert find_check(report, check_name).status == "warn"
|
||||
|
||||
|
||||
def test_doctor_warns_for_gtx_1070_ti_pascal_risk(tmp_path: Path) -> None:
    """A GTX 1070 Ti in the nvidia-smi output triggers the Pascal/pre-Turing warning."""
    smi_output = "NVIDIA GeForce GTX 1070 Ti, 8192 MiB, 551.86\n"

    report = make_report(tmp_path, gpu_stdout=smi_output)
    check = find_check(report, "gpu")

    assert report.status == "warn"
    assert check.status == "warn"
    assert "Pascal/pre-Turing compatibility risk" in check.message
    mentions_card = ["GTX 1070 Ti" in detail for detail in check.details]
    assert any(mentions_card)
|
||||
|
||||
|
||||
def test_doctor_warns_for_pytorch_pre_turing_capability(tmp_path: Path) -> None:
    """A torch double reporting compute capability 6.1 triggers the pytorch warning."""

    def fake_pascal_torch(name: str) -> FakeTorch:
        assert name == "torch"
        pascal_cuda = FakeCuda(devices=("NVIDIA GeForce GTX 1070 Ti",), capabilities=((6, 1),))
        return FakeTorch(pascal_cuda)

    report = make_report(
        tmp_path,
        gpu_stdout="NVIDIA RTX 4060, 8192 MiB, 551.86\n",
        import_module=fake_pascal_torch,
    )
    check = find_check(report, "pytorch")

    assert report.status == "warn"
    assert check.status == "warn"
    assert "Pascal/pre-Turing compatibility risk" in check.message
    mentions_capability = ["compute capability 6.1" in detail for detail in check.details]
    assert any(mentions_capability)
|
||||
|
||||
|
||||
def test_doctor_warns_when_model_cache_is_not_detected(tmp_path: Path) -> None:
    """An empty environment and no existing paths warn on the models check."""
    no_env: dict[str, str] = {}
    no_paths: set[Path] = set()

    report = make_report(tmp_path, env=no_env, existing_paths=no_paths)
    check = find_check(report, "models")

    assert report.status == "warn"
    assert check.status == "warn"
    assert "No MinerU model/cache/config path" in check.message
|
||||
|
||||
|
||||
def test_doctor_warns_when_mathjax_node_is_missing(tmp_path: Path) -> None:
    """A tool map with no ``node`` entry warns on the mathjax check."""
    tools_without_node = {
        "uv": "C:/Users/user/.local/bin/uv.exe",
        "nvidia-smi": "C:/Windows/System32/nvidia-smi.exe",
    }

    report = make_report(tmp_path, available_tools=tools_without_node)
    check = find_check(report, "mathjax")

    assert report.status == "warn"
    assert check.status == "warn"
    assert "Node.js executable was not found" in check.message
|
||||
|
||||
|
||||
def test_doctor_warns_when_mathjax_health_fails(tmp_path: Path) -> None:
    """A ``--health`` command exiting 1 warns on the mathjax check and surfaces stderr in details."""
    healthy_runner = command_runner("NVIDIA RTX 4060, 8192 MiB, 551.86\n")

    def failing_runner(command: tuple[str, ...]) -> DoctorCommandResult:
        # Only the health probe fails; every other command gets the normal canned reply.
        if command[-1] != "--health":
            return healthy_runner(command)
        return DoctorCommandResult(command, 1, stderr="Cannot find package 'mathjax'")

    report = make_report(tmp_path, run_command=failing_runner)
    check = find_check(report, "mathjax")

    assert report.status == "warn"
    assert check.status == "warn"
    assert "unavailable" in check.message
    assert any("mathjax" in detail for detail in check.details)
|
||||
|
||||
|
||||
def test_format_doctor_report_is_stable(tmp_path: Path) -> None:
    """The formatted report starts with the overall status line and tags each check."""
    report = make_report(tmp_path, gpu_stdout="NVIDIA GeForce GTX 1070 Ti, 8192 MiB, 551.86\n")

    text = format_doctor_report(report)

    assert text.startswith("Doctor status: WARN\n")
    for expected_fragment in ("[WARN] gpu:", "[PASS] local-only:"):
        assert expected_fragment in text
|
||||
|
||||
|
||||
def make_report(
    tmp_path: Path,
    *,
    python_version: tuple[int, int, int] = (3, 12, 7),
    available_tools: dict[str, str] | None = None,
    mineru_result: MinerUVersionResult | None = None,
    gpu_stdout: str = "NVIDIA RTX 4060, 8192 MiB, 551.86\n",
    env: dict[str, str] | None = None,
    existing_paths: set[Path] | None = None,
    import_module=None,
    run_command=None,
) -> DoctorReport:
    """Run ``run_doctor`` with every external dependency replaced by a fake.

    Each keyword overrides one fake; anything left as ``None`` gets a default
    that makes the corresponding check pass.

    Args:
        tmp_path: Used as the fake home directory and as the parent of the
            default ``hf`` cache path.
        python_version: Version tuple reported to the doctor.
        available_tools: Executable name -> path map backing ``which``. An
            explicitly empty mapping means "no tools found".
        mineru_result: Result returned by the fake MinerU probe.
        gpu_stdout: Output returned for ``nvidia-smi`` by the default runner.
        env: Environment mapping; defaults to ``HF_HOME`` under ``tmp_path``.
        existing_paths: Paths that ``path_exists`` reports as present. The
            default MathJax helper path is always added.
        import_module: Importer callable; defaults to ``fake_torch``.
        run_command: Command runner; defaults to ``command_runner(gpu_stdout)``.

    Returns:
        The ``DoctorReport`` produced by ``run_doctor``.
    """
    # Use `is None` rather than truthiness so an explicit empty mapping is
    # honoured, matching how `env` and `existing_paths` are handled below.
    if available_tools is None:
        tools = {
            "uv": "C:/Users/user/.local/bin/uv.exe",
            "nvidia-smi": "C:/Windows/System32/nvidia-smi.exe",
            "node": "C:/Program Files/nodejs/node.exe",
        }
    else:
        tools = available_tools

    if mineru_result is None:
        result = MinerUVersionResult(
            available=True,
            version="mineru, version 3.1.0",
            command=("mineru", "--version"),
            exit_code=0,
            stdout="mineru, version 3.1.0",
            stderr="",
        )
    else:
        result = mineru_result

    environment = env if env is not None else {"HF_HOME": str(tmp_path / "hf")}
    # Copy into a new set so the caller's set is not mutated by the add() below.
    paths = set(existing_paths if existing_paths is not None else {tmp_path / "hf"})
    paths.add(default_mathjax_helper_path())

    return run_doctor(
        python_version=python_version,
        which=lambda executable: tools.get(executable),
        run_command=run_command if run_command is not None else command_runner(gpu_stdout),
        import_module=import_module if import_module is not None else fake_torch,
        env=environment,
        path_exists=lambda path: path in paths,
        home=tmp_path,
        mineru_probe=FakeMinerUProbe(result),
    )
|
||||
|
||||
|
||||
def command_runner(gpu_stdout: str):
    """Build a fake command runner that returns canned ``DoctorCommandResult`` values.

    Recognised commands exit 0: ``("uv", "--version")``, anything starting
    with ``nvidia-smi`` (stdout is ``gpu_stdout``), a two-element
    ``<...node.exe> --version`` command, and any command ending in
    ``--health``. Everything else exits 127 with ``"not found"`` on stderr.
    """

    def run(command: tuple[str, ...]) -> DoctorCommandResult:
        is_node_version = len(command) == 2 and command[1] == "--version" and command[0].endswith("node.exe")

        if command == ("uv", "--version"):
            exit_code, streams = 0, {"stdout": "uv 0.8.13\n"}
        elif command and command[0] == "nvidia-smi":
            exit_code, streams = 0, {"stdout": gpu_stdout}
        elif is_node_version:
            exit_code, streams = 0, {"stdout": "v24.13.0\n"}
        elif command and command[-1] == "--health":
            exit_code, streams = 0, {"stdout": '{"ok":true}\n'}
        else:
            exit_code, streams = 127, {"stderr": "not found"}
        return DoctorCommandResult(command, exit_code, **streams)

    return run
|
||||
|
||||
|
||||
def fake_torch(name: str) -> FakeTorch:
    """Importer double: asserts ``name`` is ``"torch"`` and returns a default ``FakeTorch``."""
    assert name == "torch"
    default_cuda = FakeCuda()
    return FakeTorch(default_cuda)
|
||||
|
||||
|
||||
def missing_torch(name: str):
    """Importer double that always raises ``ImportError(name)`` after checking the name is ``"torch"``."""
    assert name == "torch"
    error = ImportError(name)
    raise error
|
||||
|
||||
|
||||
def find_check(report: DoctorReport, name: str):
    """Return the first check in ``report.checks`` whose ``name`` matches.

    Raises ``StopIteration`` when no check has that name.
    """
    matching = (candidate for candidate in report.checks if candidate.name == name)
    return next(matching)
|
||||
@@ -0,0 +1,136 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from pdf2md.ir import (
|
||||
AssetRecord,
|
||||
BlockRecord,
|
||||
BlockType,
|
||||
DocumentRecord,
|
||||
PageRecord,
|
||||
WarningCode,
|
||||
WarningRecord,
|
||||
WarningSeverity,
|
||||
)
|
||||
|
||||
|
||||
def test_record_serialization_preserves_present_optional_fields(tmp_path: Path) -> None:
    """Optional fields that are set appear in ``to_dict()`` output, which is JSON-serialisable."""
    formula_bbox = (1.0, 2.0, 3.0, 4.0)
    pdf_path = tmp_path / "paper.pdf"
    block = BlockRecord(
        BlockType.INLINE_FORMULA,
        page_index=1,
        bbox=formula_bbox,
        confidence=0.92,
        markdown_span=(10, 20),
    )
    page = PageRecord(page_index=1, width=612, height=792, blocks=(block,))
    asset = AssetRecord("paper.assets/image.png", page_index=1, bbox=(5.0, 6.0, 7.0, 8.0))
    warning = WarningRecord(
        WarningCode.LOW_CONFIDENCE_FORMULA,
        WarningSeverity.WARNING,
        "Formula confidence is low.",
        page_index=1,
        bbox=formula_bbox,
    )
    document = DocumentRecord(pdf_path, pages=(page,), assets=(asset,), warnings=(warning,))

    data = document.to_dict()
    first_page = data["pages"][0]
    first_block = first_page["blocks"][0]

    assert data["source_pdf"] == str(pdf_path)
    assert first_page["width"] == 612
    assert first_page["height"] == 792
    assert first_block["bbox"] == [1.0, 2.0, 3.0, 4.0]
    assert first_block["confidence"] == 0.92
    assert first_block["markdown_span"] == [10, 20]
    assert data["assets"][0]["bbox"] == [5.0, 6.0, 7.0, 8.0]
    assert data["warnings"][0]["bbox"] == [1.0, 2.0, 3.0, 4.0]
    # Must not raise: the dictionary has to be JSON-serialisable.
    json.dumps(data)
|
||||
|
||||
|
||||
def test_record_serialization_omits_absent_optional_fields(tmp_path: Path) -> None:
    """Optional fields left unset are absent from the serialised block and page."""
    bare_block = BlockRecord(BlockType.PARAGRAPH)
    bare_page = PageRecord(page_index=0, blocks=(bare_block,))
    document = DocumentRecord(tmp_path / "paper.pdf", pages=(bare_page,))

    block_data = document.to_dict()["pages"][0]["blocks"][0]
    page_data = document.to_dict()["pages"][0]

    for absent_key in ("page_index", "bbox", "confidence", "markdown_span"):
        assert absent_key not in block_data
    for absent_key in ("width", "height"):
        assert absent_key not in page_data
|
||||
|
||||
|
||||
def test_block_types_and_warning_codes_match_architecture_set() -> None:
    """``BlockType`` has exactly the expected values; ``WarningCode`` includes at least the required ones."""
    expected_block_types = {
        "heading",
        "paragraph",
        "inline_formula",
        "display_formula",
        "table",
        "figure",
        "caption",
        "footnote",
        "reference",
        "unknown",
    }
    required_warning_codes = {
        "ENGINE_MISSING",
        "GPU_UNAVAILABLE",
        "LOW_CONFIDENCE_FORMULA",
        "MATH_RENDER_FAILED",
        "ASSET_LINK_MISSING",
        "READING_ORDER_UNCERTAIN",
        "STRICT_LOCAL_VIOLATION",
        "MINERU_CLI_FAILED",
    }

    actual_block_types = {member.value for member in BlockType}
    assert actual_block_types == expected_block_types

    # Superset check: extra warning codes are allowed.
    actual_warning_codes = {member.value for member in WarningCode}
    assert actual_warning_codes >= required_warning_codes
|
||||
|
||||
|
||||
@pytest.mark.parametrize("invalid_block_type", ["formula", "image"])
def test_invalid_block_type_fails_predictably(invalid_block_type: str) -> None:
    """A plain string that is not a ``BlockType`` raises ``ValueError`` mentioning ``invalid block_type``."""
    expected_error = pytest.raises(ValueError, match="invalid block_type")
    with expected_error:
        BlockRecord(invalid_block_type)  # type: ignore[arg-type]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("invalid_code", ["REMOTE_API_USED", "UNKNOWN_WARNING"])
def test_invalid_warning_code_fails_predictably(invalid_code: str) -> None:
    """An unknown warning code string raises ``ValueError`` mentioning ``invalid code``."""
    expected_error = pytest.raises(ValueError, match="invalid code")
    with expected_error:
        WarningRecord(invalid_code, WarningSeverity.WARNING, "message")  # type: ignore[arg-type]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("invalid_severity", ["fatal", "warn"])
def test_invalid_warning_severity_fails_predictably(invalid_severity: str) -> None:
    """An unknown severity string raises ``ValueError`` mentioning ``invalid severity``."""
    expected_error = pytest.raises(ValueError, match="invalid severity")
    with expected_error:
        WarningRecord(WarningCode.MATH_RENDER_FAILED, invalid_severity, "message")  # type: ignore[arg-type]
|
||||
|
||||
|
||||
def test_empty_pages_are_rejected(tmp_path: Path) -> None:
    """A document with an empty ``pages`` tuple raises ``ValueError``."""
    pdf_path = tmp_path / "paper.pdf"
    no_pages = ()
    with pytest.raises(ValueError, match="at least one page"):
        DocumentRecord(pdf_path, pages=no_pages)
|
||||
|
||||
|
||||
def test_empty_source_pdf_is_rejected() -> None:
    """An empty ``source_pdf`` raises ``ValueError`` even when pages are valid."""
    with pytest.raises(ValueError, match="source_pdf"):
        # The page is built inside the block, as in the original call expression.
        one_page = (PageRecord(page_index=0),)
        DocumentRecord("", pages=one_page)
|
||||
|
||||
|
||||
def test_invalid_optional_fields_are_rejected() -> None:
    """Each invalid optional ``BlockRecord`` field raises ``ValueError`` naming that field."""
    paragraph = BlockType.PARAGRAPH

    # Negative page index.
    with pytest.raises(ValueError, match="page_index"):
        BlockRecord(paragraph, page_index=-1)

    # Bounding box with three numbers instead of four.
    with pytest.raises(ValueError, match="bbox"):
        BlockRecord(paragraph, bbox=(1.0, 2.0, 3.0))  # type: ignore[arg-type]

    # Confidence of 1.2.
    with pytest.raises(ValueError, match="confidence"):
        BlockRecord(paragraph, confidence=1.2)

    # Span whose end precedes its start.
    with pytest.raises(ValueError, match="markdown_span"):
        BlockRecord(paragraph, markdown_span=(5, 3))
|
||||
|
||||
|
||||
def test_asset_paths_must_be_relative() -> None:
    """Absolute and parent-escaping asset paths both raise ``ValueError`` mentioning ``relative``."""
    absolute_path = "/absolute/image.png"
    escaping_path = "../outside.png"

    with pytest.raises(ValueError, match="relative"):
        AssetRecord(absolute_path)

    with pytest.raises(ValueError, match="relative"):
        AssetRecord(escaping_path)
|
||||
@@ -0,0 +1,159 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from pdf2md.ir import WarningCode
|
||||
from pdf2md.markdown import normalize_markdown
|
||||
|
||||
|
||||
def test_inline_parentheses_math_becomes_obsidian_dollars() -> None:
    r"""Inline ``\( ... \)`` math is rewritten to ``$ ... $`` without warnings."""
    source = r"Area is \(x_i^2 + y^{2}\)."
    expected = r"Area is $x_i^2 + y^{2}$."

    result = normalize_markdown(source)

    assert result.markdown == expected
    assert result.warnings == ()
|
||||
|
||||
|
||||
def test_existing_dollar_math_and_currency_are_not_rewritten() -> None:
    """Text mixing dollar amounts with ``$...$`` math comes back unchanged."""
    original_text = r"Cost is $5 and $10, while math $x_i^2$ stays."

    normalized = normalize_markdown(original_text).markdown

    assert normalized == original_text
|
||||
|
||||
|
||||
def test_display_bracket_math_gets_own_delimiter_lines_and_blank_lines() -> None:
    r"""Display ``\[ ... \]`` math becomes a ``$$`` block separated by blank lines."""
    source = "Before\n\\[\na_i^2 + b^2\n\\]\nAfter"
    expected = "Before\n\n$$\na_i^2 + b^2\n$$\n\nAfter"

    normalized = normalize_markdown(source).markdown

    assert normalized == expected
|
||||
|
||||
|
||||
def test_display_environment_body_is_preserved_inside_delimiters() -> None:
    r"""An ``align`` environment inside ``\[ ... \]`` is kept verbatim between ``$$`` lines."""
    environment = "\\begin{align}\na_i &= b^2\n\\end{align}"
    source = "\\[" + environment + "\\]"

    normalized = normalize_markdown(source).markdown

    assert normalized == "$$\n" + environment + "\n$$"
|
||||
|
||||
|
||||
def test_existing_display_math_spacing_is_idempotent() -> None:
    """Normalising an existing ``$$`` block adds blank lines once; a second pass changes nothing."""
    source = "Before\n$$\nx_i^2\n$$\nAfter"

    first_pass = normalize_markdown(source).markdown
    second_pass = normalize_markdown(first_pass).markdown

    assert first_pass == "Before\n\n$$\nx_i^2\n$$\n\nAfter"
    assert second_pass == first_pass
|
||||
|
||||
|
||||
def test_underscores_carets_braces_and_backslashes_inside_math_are_preserved() -> None:
    """Only the delimiters change; the math body is left untouched."""
    body = r"\frac{x_i^{2}}{\alpha_beta}"
    source = "\\(" + body + "\\)"

    normalized = normalize_markdown(source).markdown

    assert normalized == "$" + body + "$"
|
||||
|
||||
|
||||
def test_fenced_code_blocks_are_not_normalized() -> None:
    """Math delimiters inside a fenced code block are left alone; math after the fence is converted."""
    fenced_part = "Text\n```md\n\\(x_i\\)\n\\[y\\]\n\n```\n"
    source = fenced_part + "\\(z\\)"

    normalized = normalize_markdown(source).markdown

    assert normalized == fenced_part + "$z$"
|
||||
|
||||
|
||||
def test_inline_code_spans_are_not_normalized() -> None:
    """Math delimiters inside backticks are preserved while math outside is converted."""
    source = r"Keep `\(x_i\)` and convert \(y_i\)."
    expected = r"Keep `\(x_i\)` and convert $y_i$."

    normalized = normalize_markdown(source).markdown

    assert normalized == expected
|
||||
|
||||
|
||||
def test_normalization_is_idempotent_for_mixed_content(tmp_path: Path) -> None:
    """Re-normalising already-normalised output yields the same Markdown and warnings."""
    asset_root = tmp_path / "assets"
    asset_root.mkdir()
    (asset_root / "fig 1.png").write_bytes(b"image")
    source = "Before \\(x_i\\)\n\\[y^2\\]\n"
    options = {"markdown_dir": tmp_path, "asset_root": asset_root, "check_assets": True}

    first_pass = normalize_markdown(source, **options)
    second_pass = normalize_markdown(first_pass.markdown, **options)

    assert second_pass.markdown == first_pass.markdown
    assert second_pass.warnings == first_pass.warnings
|
||||
|
||||
|
||||
def test_relative_asset_links_use_posix_paths_and_preserve_alt_text() -> None:
    """A backslash-separated relative image link is emitted with forward slashes and its alt text kept.

    NOTE(review): the image Markdown literals in this test were empty in the
    captured source, which contradicts the asserted asset link. They are
    reconstructed from ``asset_links == ("assets/fig 1.png",)`` and the raw
    string prefix; confirm the alt text and exact output form against the
    repository.
    """
    result = normalize_markdown(r"![Figure 1](assets\fig 1.png)")

    assert result.markdown == "![Figure 1](assets/fig 1.png)"
    assert result.asset_links == ("assets/fig 1.png",)
    assert result.warnings == ()
|
||||
|
||||
|
||||
def test_missing_asset_link_emits_warning_when_checking_is_enabled(tmp_path: Path) -> None:
    """With ``check_assets=True`` a link to a non-existent asset is kept and one warning is emitted.

    NOTE(review): the image Markdown literal was empty in the captured
    source; an empty document cannot yield ``ASSET_LINK_MISSING``. The link
    below is a reconstruction pointing at a file that is never created;
    confirm the original alt text and file name.
    """
    asset_root = tmp_path / "assets"
    asset_root.mkdir()
    source = "![Missing](assets/missing.png)"

    result = normalize_markdown(
        source,
        markdown_dir=tmp_path,
        asset_root=asset_root,
        check_assets=True,
    )

    # The link text is left as written; only a warning is added.
    assert result.markdown == source
    assert [warning.code for warning in result.warnings] == [WarningCode.ASSET_LINK_MISSING]
|
||||
|
||||
|
||||
# NOTE(review): both parametrized sources were empty strings in the captured
# source, so `endswith("(fig.png)")` could never hold. They are reconstructed
# from the expected link names and the raw-string prefix of the first case
# (an absolute Windows-style path and a parent-escaping path); confirm the
# exact original inputs.
@pytest.mark.parametrize(
    ("source", "expected_link"),
    [
        (r"![Figure](C:\tmp\fig.png)", "fig.png"),
        ("![Outside](../outside.png)", "outside.png"),
    ],
)
def test_invalid_local_asset_links_are_rewritten_as_relative_with_warning(source: str, expected_link: str) -> None:
    """An invalid local image target is rewritten to ``expected_link`` with one ASSET_LINK_INVALID warning."""
    result = normalize_markdown(source)

    assert result.markdown.endswith(f"({expected_link})")
    assert [warning.code for warning in result.warnings] == [WarningCode.ASSET_LINK_INVALID]
|
||||
|
||||
|
||||
def test_remote_asset_link_is_warned_and_not_fetched_or_rewritten() -> None:
    """A remote image URL is left untouched, listed in ``asset_links``, and flagged as invalid.

    NOTE(review): the source literal was empty in the captured source; the
    URL is taken from the ``asset_links`` assertion below, the alt text is a
    placeholder to confirm.
    """
    source = "![Remote](https://example.test/fig.png)"

    result = normalize_markdown(source)

    assert result.markdown == source
    assert result.asset_links == ("https://example.test/fig.png",)
    assert [warning.code for warning in result.warnings] == [WarningCode.ASSET_LINK_INVALID]
|
||||
|
||||
|
||||
def test_absolute_asset_under_markdown_dir_can_be_rewritten_relative_with_warning(tmp_path: Path) -> None:
    """Create a real asset file, normalize with asset checking, and expect ASSET_LINK_INVALID."""
    asset_dir = tmp_path / "assets"
    asset_dir.mkdir()
    asset = asset_dir / "fig.png"
    asset.write_bytes(b"image")

    # NOTE(review): the f-string has no placeholder and the expected Markdown is
    # empty in this copy; presumably a link built from `asset` was lost in
    # extraction -- confirm against the repository.
    normalized = normalize_markdown(
        f"",
        markdown_dir=tmp_path,
        asset_root=asset_dir,
        check_assets=True,
    )

    assert normalized.markdown == ""
    emitted_codes = [item.code for item in normalized.warnings]
    assert emitted_codes == [WarningCode.ASSET_LINK_INVALID]
|
||||
|
||||
|
||||
def test_simple_pipe_table_is_preserved() -> None:
    """A three-line pipe table must come back unchanged and without warnings."""
    table = "| A | B |\n|---|---|\n| \\(x\\) | y |"

    normalized = normalize_markdown(table)

    assert normalized.markdown == table
    assert normalized.warnings == ()
|
||||
|
||||
|
||||
def test_complex_html_table_is_preserved_with_fallback_warning() -> None:
    """An HTML table with a rowspan must come back unchanged with one TABLE_FALLBACK warning."""
    html_table = '<table><tr><td rowspan="2">\\(x_i\\)</td><td>y</td></tr></table>'

    normalized = normalize_markdown(html_table)

    assert normalized.markdown == html_table
    emitted_codes = [item.code for item in normalized.warnings]
    assert emitted_codes == [WarningCode.TABLE_FALLBACK]
|
||||
@@ -0,0 +1,118 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from pdf2md.math_render import MathJaxCommandResult, MathJaxRenderChecker
|
||||
from pdf2md.quality import MathCheckerUnavailable, MathExpression
|
||||
|
||||
|
||||
def test_mathjax_checker_batches_expressions_as_json(tmp_path: Path) -> None:
    """Run two expressions through a fake runner; check the JSON request and the parsed results."""
    helper = make_helper(tmp_path)
    recorded_calls = []
    # Canned helper reply: index 0 is ok, index 1 fails with a message.
    canned_stdout = json.dumps(
        {
            "results": [
                {"index": 0, "ok": True},
                {"index": 1, "ok": False, "message": "Undefined control sequence"},
            ]
        }
    )

    def runner(command: tuple[str, ...], stdin: str, timeout_seconds: int) -> MathJaxCommandResult:
        # stdin is decoded so the request can be compared as a dict below.
        recorded_calls.append((command, json.loads(stdin), timeout_seconds))
        return MathJaxCommandResult(command, 0, stdout=canned_stdout)

    def which(executable):
        # Only "node" resolves; every other executable is reported as absent.
        if executable == "node":
            return "C:/node/node.exe"
        return None

    checker = MathJaxRenderChecker(helper_path=helper, which=which, runner=runner, timeout_seconds=7)
    first_expression = MathExpression(0, "x_i^2", False, (0, 7))
    second_expression = MathExpression(1, "\\bad", True, (9, 18))

    outcomes = checker.check_expressions((first_expression, second_expression))

    assert [outcome.ok for outcome in outcomes] == [True, False]
    assert outcomes[1].message == "Undefined control sequence"
    expected_command = ("C:/node/node.exe", str(helper))
    expected_request = {
        "expressions": [
            {"index": 0, "body": "x_i^2", "display": False},
            {"index": 1, "body": "\\bad", "display": True},
        ]
    }
    assert recorded_calls == [(expected_command, expected_request, 7)]
|
||||
|
||||
|
||||
def test_mathjax_checker_reports_missing_node_as_unavailable(tmp_path: Path) -> None:
    """When `which` finds nothing, checking must raise MathCheckerUnavailable matching "Node.js"."""

    def nothing_found(_):
        return None

    helper = make_helper(tmp_path)
    checker = MathJaxRenderChecker(helper_path=helper, which=nothing_found)

    with pytest.raises(MathCheckerUnavailable, match="Node.js"):
        checker.check_expressions((MathExpression(0, "x", False, (0, 3)),))
|
||||
|
||||
|
||||
def test_mathjax_checker_reports_helper_failure_as_unavailable(tmp_path: Path) -> None:
    """A runner result with exit code 124 and a stderr message must surface as MathCheckerUnavailable."""
    helper = make_helper(tmp_path)

    def runner(command: tuple[str, ...], stdin: str, timeout_seconds: int) -> MathJaxCommandResult:
        failure = MathJaxCommandResult(command, 124, stderr="MathJax helper timed out")
        return failure

    def which(_):
        return "node"

    checker = MathJaxRenderChecker(helper_path=helper, which=which, runner=runner)

    with pytest.raises(MathCheckerUnavailable, match="timed out"):
        checker.check_expressions((MathExpression(0, "x", False, (0, 3)),))
|
||||
|
||||
|
||||
def test_mathjax_checker_reports_invalid_json_as_unavailable(tmp_path: Path) -> None:
    """A zero exit code with non-JSON stdout must surface as MathCheckerUnavailable."""
    helper = make_helper(tmp_path)

    def runner(command: tuple[str, ...], stdin: str, timeout_seconds: int) -> MathJaxCommandResult:
        garbage = MathJaxCommandResult(command, 0, stdout="not json")
        return garbage

    def which(_):
        return "node"

    checker = MathJaxRenderChecker(helper_path=helper, which=which, runner=runner)

    with pytest.raises(MathCheckerUnavailable, match="invalid JSON"):
        checker.check_expressions((MathExpression(0, "x", False, (0, 3)),))
|
||||
|
||||
|
||||
def test_mathjax_checker_rejects_mismatched_result_indexes(tmp_path: Path) -> None:
    """A reply whose only result has index 99 must be rejected for a request with index 0."""
    helper = make_helper(tmp_path)

    def runner(command: tuple[str, ...], stdin: str, timeout_seconds: int) -> MathJaxCommandResult:
        reply = json.dumps({"results": [{"index": 99, "ok": True}]})
        return MathJaxCommandResult(command, 0, stdout=reply)

    def which(_):
        return "node"

    checker = MathJaxRenderChecker(helper_path=helper, which=which, runner=runner)

    with pytest.raises(MathCheckerUnavailable, match="indexes"):
        checker.check_expressions((MathExpression(0, "x", False, (0, 3)),))
|
||||
|
||||
|
||||
def make_helper(tmp_path: Path) -> Path:
    """Write a placeholder ``check.mjs`` file into ``tmp_path`` and return its path."""
    script_path = tmp_path / "check.mjs"
    script_path.write_text("// fake helper", encoding="utf-8")
    return script_path
|
||||
@@ -0,0 +1,173 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from pdf2md.ir import (
|
||||
AssetRecord,
|
||||
BlockRecord,
|
||||
BlockType,
|
||||
DocumentRecord,
|
||||
PageRecord,
|
||||
WarningCode,
|
||||
WarningRecord,
|
||||
WarningSeverity,
|
||||
)
|
||||
from pdf2md.metadata import MetadataInputError, build_metadata, build_summary
|
||||
|
||||
|
||||
def make_document(tmp_path: Path) -> DocumentRecord:
    """Build a two-page DocumentRecord fixture with one asset and two warnings."""
    first_page_blocks = (
        BlockRecord(BlockType.HEADING, page_index=0),
        BlockRecord(BlockType.INLINE_FORMULA, page_index=0, confidence=0.98),
        BlockRecord(BlockType.DISPLAY_FORMULA, page_index=0, bbox=(1.0, 2.0, 3.0, 4.0)),
    )
    second_page_blocks = (
        BlockRecord(BlockType.PARAGRAPH, page_index=1),
        BlockRecord(BlockType.DISPLAY_FORMULA, page_index=1),
    )
    pages = (
        PageRecord(page_index=0, blocks=first_page_blocks),
        PageRecord(page_index=1, blocks=second_page_blocks),
    )

    figure = AssetRecord("paper.assets/figure.png", page_index=1)
    # The warning on page 1 is listed before the warning on page 0.
    reading_order_warning = WarningRecord(
        WarningCode.READING_ORDER_UNCERTAIN,
        WarningSeverity.WARNING,
        "Check reading order.",
        page_index=1,
    )
    render_failure_warning = WarningRecord(
        WarningCode.MATH_RENDER_FAILED,
        WarningSeverity.ERROR,
        "Math failed to render.",
        page_index=0,
    )

    return DocumentRecord(
        source_pdf=tmp_path / "paper.pdf",
        pages=pages,
        assets=(figure,),
        warnings=(reading_order_warning, render_failure_warning),
    )
|
||||
|
||||
|
||||
def build_test_metadata(tmp_path: Path) -> dict[str, object]:
    """Call ``build_metadata`` on the ``make_document`` fixture with fixed engine details."""
    document = make_document(tmp_path)
    all_zero_digest = "0" * 64
    return build_metadata(
        document=document,
        source_sha256=all_zero_digest,
        created_at="2026-05-07T00:00:00Z",
        engine="MinerU",
        engine_version="3.1.0",
        engine_options={"strict_local": True},
    )
|
||||
|
||||
|
||||
def test_metadata_has_required_top_level_fields(tmp_path: Path) -> None:
    """The metadata mapping must expose exactly the ten expected top-level keys."""
    expected_keys = {
        "source_pdf",
        "source_sha256",
        "created_at",
        "engine",
        "engine_version",
        "engine_options",
        "pages",
        "assets",
        "warnings",
        "summary",
    }

    metadata = build_test_metadata(tmp_path)

    assert set(metadata) == expected_keys
|
||||
|
||||
|
||||
def test_metadata_summary_counts_from_records(tmp_path: Path) -> None:
    """The summary must match the page, warning, asset, and formula counts of the fixture."""
    expected_summary = {
        "pages_processed": 2,
        "warning_count": 2,
        "asset_count": 1,
        "display_formula_count": 2,
        "inline_formula_count": 1,
        "math_render_error_count": 1,
    }

    metadata = build_test_metadata(tmp_path)

    assert metadata["summary"] == expected_summary
|
||||
|
||||
|
||||
def test_warning_order_and_page_provenance_are_preserved(tmp_path: Path) -> None:
    """Serialized warnings must keep the fixture's order and page indexes."""
    metadata = build_test_metadata(tmp_path)

    serialized = metadata["warnings"]
    codes_in_order = [entry["code"] for entry in serialized]
    assert codes_in_order == ["READING_ORDER_UNCERTAIN", "MATH_RENDER_FAILED"]
    first_warning, second_warning = serialized[0], serialized[1]
    assert first_warning["page_index"] == 1
    assert second_warning["page_index"] == 0
|
||||
|
||||
|
||||
def test_optional_bbox_and_confidence_are_preserved_only_when_present(tmp_path: Path) -> None:
    """Serialized blocks must omit bbox/confidence keys unless the record set them."""
    metadata = build_test_metadata(tmp_path)
    first_page_blocks = metadata["pages"][0]["blocks"]

    # Block 0 has neither field, block 1 only a confidence, block 2 only a bbox.
    assert "confidence" not in first_page_blocks[0]
    assert first_page_blocks[1]["confidence"] == 0.98
    assert "bbox" not in first_page_blocks[1]
    # The bbox tuple is expected as a list in the serialized form.
    assert first_page_blocks[2]["bbox"] == [1.0, 2.0, 3.0, 4.0]
|
||||
|
||||
|
||||
def test_metadata_is_json_serializable(tmp_path: Path) -> None:
    """``json.dumps`` must accept the built metadata without raising."""
    metadata = build_test_metadata(tmp_path)
    json.dumps(metadata)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ("field_name", "kwargs"),
    [
        ("document", {"document": None}),
        ("source_sha256", {"source_sha256": ""}),
        ("created_at", {"created_at": ""}),
        ("engine", {"engine": ""}),
        ("engine_version", {"engine_version": ""}),
    ],
)
def test_metadata_requires_core_inputs(tmp_path: Path, field_name: str, kwargs: dict[str, object]) -> None:
    """Blanking one core input must raise MetadataInputError that names the field."""
    valid_inputs: dict[str, object] = {
        "document": make_document(tmp_path),
        "source_sha256": "0" * 64,
        "created_at": "2026-05-07T00:00:00Z",
        "engine": "MinerU",
        "engine_version": "3.1.0",
    }
    # Overlay the single invalid value on top of the otherwise valid inputs.
    call_inputs: dict[str, object] = {**valid_inputs, **kwargs}

    with pytest.raises(MetadataInputError, match=field_name):
        build_metadata(**call_inputs)
|
||||
|
||||
|
||||
def test_engine_options_must_be_json_serializable(tmp_path: Path) -> None:
    """Passing a ``Path`` inside ``engine_options`` must raise MetadataInputError."""
    with pytest.raises(MetadataInputError, match="JSON serializable"):
        # The Path value under "path" is the non-serializable option.
        build_metadata(
            document=make_document(tmp_path),
            source_sha256="0" * 64,
            created_at="2026-05-07T00:00:00Z",
            engine="MinerU",
            engine_version="3.1.0",
            engine_options={"path": tmp_path},
        )
|
||||
|
||||
|
||||
def test_formula_counts_come_from_block_types_not_markdown_text(tmp_path: Path) -> None:
    """A page holding only PARAGRAPH and UNKNOWN blocks must yield zero formula counts."""
    non_formula_blocks = (BlockRecord(BlockType.PARAGRAPH), BlockRecord(BlockType.UNKNOWN))
    only_page = PageRecord(page_index=0, blocks=non_formula_blocks)
    document = DocumentRecord(source_pdf=tmp_path / "paper.pdf", pages=(only_page,))

    counts = build_summary(document)

    assert counts["inline_formula_count"] == 0
    assert counts["display_formula_count"] == 0
|
||||
|
||||
|
||||
def test_info_math_render_warning_is_not_counted_as_render_error(tmp_path: Path) -> None:
    """An INFO-severity MATH_RENDER_FAILED warning counts as a warning but not as a render error."""
    only_page = PageRecord(page_index=0, blocks=(BlockRecord(BlockType.INLINE_FORMULA),))
    info_warning = WarningRecord(WarningCode.MATH_RENDER_FAILED, WarningSeverity.INFO, "Checker unavailable.")
    document = DocumentRecord(
        source_pdf=tmp_path / "paper.pdf",
        pages=(only_page,),
        warnings=(info_warning,),
    )

    counts = build_summary(document)

    assert counts["warning_count"] == 1
    assert counts["math_render_error_count"] == 0
|
||||
@@ -0,0 +1,264 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from pdf2md.ir import WarningCode
|
||||
from pdf2md.mineru_adapter import (
|
||||
CommandResult,
|
||||
MinerUAdapter,
|
||||
MinerUOptions,
|
||||
StrictLocalViolationError,
|
||||
)
|
||||
|
||||
|
||||
class FakeRunner:
    """Callable test double that records commands and replays queued results in order."""

    def __init__(self, *results: CommandResult) -> None:
        # Results are handed out first-in, first-out by __call__.
        self.results = list(results)
        # Every command passed to __call__, in call order.
        self.commands: list[tuple[str, ...]] = []

    def __call__(self, command: tuple[str, ...]) -> CommandResult:
        """Record ``command`` and return the next queued result re-bound to that command.

        Raises:
            AssertionError: if no queued result is left.
        """
        self.commands.append(command)
        if len(self.results) == 0:
            raise AssertionError("fake runner was called without a queued result")
        queued = self.results.pop(0)
        # Copy the queued fields but report the command that was actually run.
        return CommandResult(
            command=command,
            exit_code=queued.exit_code,
            stdout=queued.stdout,
            stderr=queued.stderr,
        )
|
||||
|
||||
|
||||
class EnvironmentRunner:
    """Callable test double that snapshots two environment variables and writes a Markdown output."""

    def __init__(self) -> None:
        # Values observed at call time; None until called or if the variable is unset.
        self.mineru_device_mode: str | None = None
        self.cuda_visible_devices: str | None = None

    def __call__(self, command: tuple[str, ...]) -> CommandResult:
        """Capture the environment, create ``paper.md`` in the ``-o`` directory, and report exit code 0."""
        environment = os.environ
        self.mineru_device_mode = environment.get("MINERU_DEVICE_MODE")
        self.cuda_visible_devices = environment.get("CUDA_VISIBLE_DEVICES")

        # The output directory is the argument that follows "-o" in the command.
        output_flag_position = command.index("-o")
        work_dir = Path(command[output_flag_position + 1])
        work_dir.mkdir(parents=True, exist_ok=True)
        markdown_file = work_dir / "paper.md"
        markdown_file.write_text("# Title\n", encoding="utf-8")
        return CommandResult(command=command, exit_code=0)
|
||||
|
||||
|
||||
def available(_: str) -> str:
    """Stand-in ``which`` that returns the same executable path for any argument."""
    resolved_path = "C:/local/bin/mineru.exe"
    return resolved_path
|
||||
|
||||
|
||||
def missing(_: str) -> None:
    """Stand-in ``which`` that returns ``None`` for any argument."""
    not_found = None
    return not_found
|
||||
|
||||
|
||||
def test_availability_check_uses_mockable_which() -> None:
    """``is_available`` must be True with the ``available`` stub and False with ``missing``."""
    adapter_with_engine = MinerUAdapter(which=available, runner=FakeRunner())
    assert adapter_with_engine.is_available() is True

    adapter_without_engine = MinerUAdapter(which=missing, runner=FakeRunner())
    assert adapter_without_engine.is_available() is False
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "executable",
    ["mineru-api", "python", "C:/tools/mineru.exe"],
)
def test_custom_executable_is_rejected(executable: str) -> None:
    """Constructing the adapter with any of these executables must raise StrictLocalViolationError."""
    with pytest.raises(StrictLocalViolationError):
        MinerUAdapter(executable=executable, which=available, runner=FakeRunner())
|
||||
|
||||
|
||||
def test_missing_mineru_does_not_call_runner(tmp_path: Path) -> None:
    """With no executable found, ``convert`` must fail with ENGINE_MISSING and never call the runner."""
    fake_runner = FakeRunner()
    adapter = MinerUAdapter(which=missing, runner=fake_runner)

    outcome = adapter.convert(tmp_path / "paper.pdf", tmp_path / "work")

    assert outcome.succeeded is False
    assert outcome.exit_code is None
    assert fake_runner.commands == []
    emitted_codes = [item.code for item in outcome.warnings]
    assert emitted_codes == [WarningCode.ENGINE_MISSING]
|
||||
|
||||
|
||||
def test_missing_mineru_version_does_not_call_runner() -> None:
    """With no executable found, ``version`` must report ENGINE_MISSING and never call the runner."""
    fake_runner = FakeRunner()
    adapter = MinerUAdapter(which=missing, runner=fake_runner)

    outcome = adapter.version()

    assert outcome.available is False
    assert outcome.exit_code is None
    assert fake_runner.commands == []
    emitted_codes = [item.code for item in outcome.warnings]
    assert emitted_codes == [WarningCode.ENGINE_MISSING]
|
||||
|
||||
|
||||
def test_version_success_uses_stdout() -> None:
    """A zero-exit version call must report stdout without its trailing newline."""
    queued = CommandResult((), 0, stdout="MinerU 3.1.0\n")
    fake_runner = FakeRunner(queued)
    adapter = MinerUAdapter(which=available, runner=fake_runner)

    outcome = adapter.version()

    expected_command = ("mineru", "--version")
    assert outcome.available is True
    assert outcome.version == "MinerU 3.1.0"
    assert outcome.command == expected_command
    assert fake_runner.commands == [expected_command]
|
||||
|
||||
|
||||
def test_version_success_can_use_stderr() -> None:
    """When only stderr carries text, the version must be taken from it."""
    queued = CommandResult((), 0, stderr="MinerU 3.1.0\n")
    adapter = MinerUAdapter(which=available, runner=FakeRunner(queued))

    outcome = adapter.version()

    assert outcome.version == "MinerU 3.1.0"
|
||||
|
||||
|
||||
def test_version_failure_is_explicit() -> None:
    """A non-zero exit must give no version, keep the exit code, and emit MINERU_CLI_FAILED."""
    queued = CommandResult((), 2, stdout="", stderr="bad version")
    adapter = MinerUAdapter(which=available, runner=FakeRunner(queued))

    outcome = adapter.version()

    assert outcome.version is None
    assert outcome.exit_code == 2
    emitted_codes = [item.code for item in outcome.warnings]
    assert emitted_codes == [WarningCode.MINERU_CLI_FAILED]
|
||||
|
||||
|
||||
def test_version_empty_output_is_explicit() -> None:
    """A zero exit with empty stdout and stderr must give no version and emit MINERU_CLI_FAILED."""
    queued = CommandResult((), 0, stdout="", stderr="")
    adapter = MinerUAdapter(which=available, runner=FakeRunner(queued))

    outcome = adapter.version()

    assert outcome.available is True
    assert outcome.version is None
    assert outcome.exit_code == 0
    emitted_codes = [item.code for item in outcome.warnings]
    assert emitted_codes == [WarningCode.MINERU_CLI_FAILED]
|
||||
|
||||
|
||||
def test_build_command_is_list_based_and_deterministic(tmp_path: Path) -> None:
    """Paths containing spaces and non-ASCII characters must appear as single command arguments."""
    input_pdf = tmp_path / "논문 with spaces.pdf"
    work_dir = tmp_path / "work output"
    adapter = MinerUAdapter(which=available, runner=FakeRunner())

    built = adapter.build_command(input_pdf, work_dir)

    expected = ("mineru", "-p", str(input_pdf), "-o", str(work_dir))
    assert built == expected
    assert "--api-url" not in built
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "options",
    [
        # Extra CLI arguments.
        MinerUOptions(extra_cli_args=("--api-url", "http://example.test")),
        # Engine option mappings.
        MinerUOptions(engine_options={"api_url": "http://example.test"}),
        MinerUOptions(engine_options={"base_url": "http://example.test"}),
        MinerUOptions(engine_options={"mode": "router"}),
        MinerUOptions(engine_options={"backend": "http"}),
        MinerUOptions(engine_options={"openai_base_url": "http://example.test/v1"}),
        MinerUOptions(engine_options={"endpoint": "https://example.test"}),
        MinerUOptions(engine_options={"nested": {"url": "local http://example.test"}}),
        MinerUOptions(engine_options={"process": "mineru-api"}),
        # Other fields.
        MinerUOptions(gpu_device="https://example.test/gpu"),
        MinerUOptions(strict_local=False),
    ],
)
def test_strict_local_rejects_remote_router_and_backend_options(tmp_path: Path, options: MinerUOptions) -> None:
    """``build_command`` must raise StrictLocalViolationError for each parametrized option set."""
    adapter = MinerUAdapter(which=available, runner=FakeRunner())
    input_pdf = tmp_path / "paper.pdf"
    work_dir = tmp_path / "work"

    with pytest.raises(StrictLocalViolationError):
        adapter.build_command(input_pdf, work_dir, options)
|
||||
|
||||
|
||||
def test_successful_mocked_output_parses_markdown_json_and_assets(tmp_path: Path) -> None:
    """Pre-populate a work directory, run a mocked conversion, and check every parsed field."""
    work_dir = tmp_path / "work"
    assets_dir = work_dir / "assets"
    input_pdf = tmp_path / "paper.pdf"

    # Primary outputs.
    (work_dir / "nested").mkdir(parents=True)
    (work_dir / "paper.md").write_text("# Title\n", encoding="utf-8")
    (work_dir / "structured.json").write_text('{"pages": 1}', encoding="utf-8")
    # Asset files, created out of alphabetical order on purpose: the expected
    # asset list below is sorted.
    assets_dir.mkdir()
    (assets_dir / "z.png").write_bytes(b"z")
    (assets_dir / "a.png").write_bytes(b"a")
    (assets_dir / "nested").mkdir()
    (assets_dir / "nested" / "b.png").write_bytes(b"b")
    # Extra files that are not expected among the reported assets.
    (work_dir / "zz_extra.md").write_text("not an asset", encoding="utf-8")
    (work_dir / "zz_extra.json").write_text("{}", encoding="utf-8")
    (work_dir / "run.log").write_text("diagnostic", encoding="utf-8")

    fake_runner = FakeRunner(CommandResult((), 0, stdout="ok", stderr="warn"))
    adapter = MinerUAdapter(which=available, runner=fake_runner)
    options = MinerUOptions(engine_version="3.1.0", gpu_device="cuda:0")

    outcome = adapter.convert(input_pdf, work_dir, options)

    assert outcome.succeeded is True
    assert outcome.command == ("mineru", "-p", str(tmp_path / "paper.pdf"), "-o", str(work_dir))
    assert outcome.raw_markdown == "# Title\n"
    assert outcome.raw_structured == {"pages": 1}
    relative_assets = [found.relative_to(work_dir).as_posix() for found in outcome.asset_paths]
    assert relative_assets == ["assets/a.png", "assets/nested/b.png", "assets/z.png"]
    assert outcome.engine == "MinerU"
    assert outcome.engine_version == "3.1.0"
    assert outcome.engine_options == {"strict_local": True, "gpu_device": "cuda:0"}
    assert outcome.exit_code == 0
    assert outcome.stdout == "ok"
    assert outcome.stderr == "warn"
|
||||
|
||||
|
||||
def test_gpu_option_sets_mineru_environment_and_restores_previous_values(tmp_path: Path, monkeypatch) -> None:
    """The runner must see GPU settings for "cuda:0"; afterwards the prior values must be back."""
    # Values that must be in place again once the conversion has finished.
    monkeypatch.setenv("MINERU_DEVICE_MODE", "cpu")
    monkeypatch.setenv("CUDA_VISIBLE_DEVICES", "7")
    env_runner = EnvironmentRunner()
    adapter = MinerUAdapter(which=available, runner=env_runner)
    options = MinerUOptions(gpu_device="cuda:0")

    outcome = adapter.convert(tmp_path / "paper.pdf", tmp_path / "work", options)

    assert outcome.succeeded is True
    # Values the runner observed while it was being called.
    assert env_runner.mineru_device_mode == "cuda"
    assert env_runner.cuda_visible_devices == "0"
    # Values in the process environment after the call.
    assert os.environ["MINERU_DEVICE_MODE"] == "cpu"
    assert os.environ["CUDA_VISIBLE_DEVICES"] == "7"
|
||||
|
||||
|
||||
def test_nonzero_exit_does_not_parse_existing_outputs_or_fallback(tmp_path: Path) -> None:
    """On exit code 3 an already-present ``paper.md`` must not be reported as output."""
    work_dir = tmp_path / "work"
    work_dir.mkdir()
    stale_markdown = work_dir / "paper.md"
    stale_markdown.write_text("existing output", encoding="utf-8")
    failing_runner = FakeRunner(CommandResult((), 3, stdout="out", stderr="failed"))
    adapter = MinerUAdapter(which=available, runner=failing_runner)

    outcome = adapter.convert(tmp_path / "paper.pdf", work_dir)

    assert outcome.succeeded is False
    assert outcome.raw_markdown is None
    assert outcome.asset_paths == ()
    emitted_codes = [item.code for item in outcome.warnings]
    assert emitted_codes == [WarningCode.MINERU_CLI_FAILED]
|
||||
|
||||
|
||||
def test_exit_zero_with_no_usable_output_warns(tmp_path: Path) -> None:
    """A zero exit with an empty work directory must fail with a "no usable" warning."""
    empty_work_dir = tmp_path / "work"
    empty_work_dir.mkdir()
    adapter = MinerUAdapter(which=available, runner=FakeRunner(CommandResult((), 0)))

    outcome = adapter.convert(tmp_path / "paper.pdf", empty_work_dir)

    assert outcome.succeeded is False
    emitted_codes = [item.code for item in outcome.warnings]
    assert emitted_codes == [WarningCode.MINERU_CLI_FAILED]
    first_message = outcome.warnings[0].message
    assert "no usable" in first_message
|
||||
|
||||
|
||||
def test_invalid_json_is_preserved_as_text_with_warning(tmp_path: Path) -> None:
    """Malformed ``structured.json`` must come back as raw text with one MINERU_CLI_FAILED warning."""
    work_dir = tmp_path / "work"
    work_dir.mkdir()
    (work_dir / "paper.md").write_text("markdown", encoding="utf-8")
    broken_json = "{not json"
    (work_dir / "structured.json").write_text(broken_json, encoding="utf-8")
    adapter = MinerUAdapter(which=available, runner=FakeRunner(CommandResult((), 0)))

    outcome = adapter.convert(tmp_path / "paper.pdf", work_dir)

    assert outcome.succeeded is True
    assert outcome.raw_structured == "{not json"
    emitted_codes = [item.code for item in outcome.warnings]
    assert emitted_codes == [WarningCode.MINERU_CLI_FAILED]
|
||||
@@ -0,0 +1,8 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import pdf2md
|
||||
|
||||
|
||||
def test_package_imports() -> None:
    """The package must expose version "0.1.0" and a callable ``convert_pdf``."""
    reported_version = pdf2md.__version__
    assert reported_version == "0.1.0"
    entry_point = pdf2md.convert_pdf
    assert callable(entry_point)
|
||||
@@ -0,0 +1,188 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from pdf2md.paths import (
|
||||
DiscoveredPdf,
|
||||
DuplicateOutputPathError,
|
||||
InputDiscoveryError,
|
||||
OutputConflictError,
|
||||
OutputPathError,
|
||||
OutputRootError,
|
||||
discover_pdfs,
|
||||
plan_outputs,
|
||||
plan_pdf_outputs,
|
||||
)
|
||||
|
||||
|
||||
def touch(path: Path) -> Path:
    """Create an empty file at ``path``, making parent directories as needed, and return ``path``."""
    parent_dir = path.parent
    parent_dir.mkdir(parents=True, exist_ok=True)
    # Writing zero bytes creates the file, or empties it if it already exists.
    path.write_bytes(b"")
    return path
|
||||
|
||||
|
||||
def test_discovers_single_pdf_case_insensitive(tmp_path: Path) -> None:
    """A file with an upper-case ``.PDF`` suffix must be discovered under its resolved path."""
    upper_case_pdf = touch(tmp_path / "Paper.PDF")

    found = discover_pdfs(upper_case_pdf)

    expected = (DiscoveredPdf(source_path=upper_case_pdf.resolve()),)
    assert found == expected
|
||||
|
||||
|
||||
def test_rejects_nonexistent_and_non_pdf_inputs(tmp_path: Path) -> None:
    """A missing path and an existing non-PDF file must each raise InputDiscoveryError."""
    absent_pdf = tmp_path / "missing.pdf"
    with pytest.raises(InputDiscoveryError, match="does not exist"):
        discover_pdfs(absent_pdf)

    notes_file = touch(tmp_path / "notes.txt")
    with pytest.raises(InputDiscoveryError, match="not a PDF"):
        discover_pdfs(notes_file)
|
||||
|
||||
|
||||
def test_discovers_directory_non_recursive_only(tmp_path: Path) -> None:
    """Non-recursive discovery must return the top-level PDF and leave the nested one out."""
    top_level_pdf = touch(tmp_path / "root.pdf")
    nested_pdf = touch(tmp_path / "nested" / "child.pdf")

    found = discover_pdfs(tmp_path, recursive=False)

    found_paths = [entry.source_path for entry in found]
    assert found_paths == [top_level_pdf.resolve()]
    assert nested_pdf.resolve() not in {entry.source_path for entry in found}
|
||||
|
||||
|
||||
def test_non_recursive_directory_with_only_nested_pdfs_fails(tmp_path: Path) -> None:
    """A directory whose only PDF sits in a subdirectory must fail non-recursive discovery."""
    nested_only = tmp_path / "nested" / "child.pdf"
    touch(nested_only)

    with pytest.raises(InputDiscoveryError, match="no PDF files"):
        discover_pdfs(tmp_path, recursive=False)
|
||||
|
||||
|
||||
def test_discovers_directory_recursive_with_relative_parents(tmp_path: Path) -> None:
    """Recursive discovery must return each PDF with its parent relative to the input directory."""
    top_level_pdf = touch(tmp_path / "root.pdf")
    nested_pdf = touch(tmp_path / "nested" / "child.pdf")
    deepest_pdf = touch(tmp_path / "nested" / "deeper" / "leaf.PdF")

    found = discover_pdfs(tmp_path, recursive=True)

    # Expected order: the two nested files first, the top-level file last.
    expected = [
        (nested_pdf.resolve(), Path("nested")),
        (deepest_pdf.resolve(), Path("nested") / "deeper"),
        (top_level_pdf.resolve(), Path()),
    ]
    observed = [(entry.source_path, entry.relative_parent) for entry in found]
    assert observed == expected
|
||||
|
||||
|
||||
def test_discovery_order_is_deterministic_for_non_ascii_names(tmp_path: Path) -> None:
    """Two discovery runs must agree and list the names in the expected order."""
    # Files are created in a different order from the one expected back.
    for file_name in ("한글.pdf", "Alpha.pdf", "beta.PDF"):
        touch(tmp_path / file_name)

    first_run = discover_pdfs(tmp_path)
    second_run = discover_pdfs(tmp_path)

    names_in_order = [entry.source_path.name for entry in first_run]
    assert names_in_order == ["Alpha.pdf", "beta.PDF", "한글.pdf"]
    assert first_run == second_run
|
||||
|
||||
|
||||
def test_plans_all_default_output_paths_for_single_pdf(tmp_path: Path) -> None:
    """With default settings, one PDF must yield Markdown, assets, metadata, and report paths but no raw dir."""
    source = touch(tmp_path / "입력.pdf")
    output_root = tmp_path / "out"

    # Tuple unpacking also asserts that exactly one plan was produced.
    (single_plan,) = plan_pdf_outputs(source, output_root)

    assert single_plan.source_pdf == source.resolve()
    assert single_plan.markdown_path == output_root.resolve() / "입력.md"
    assert single_plan.assets_dir == output_root.resolve() / "입력.assets"
    assert single_plan.metadata_path == output_root.resolve() / "입력.metadata.json"
    assert single_plan.report_path == output_root.resolve() / "입력.report.md"
    assert single_plan.raw_dir is None
|
||||
|
||||
|
||||
def test_plans_optional_metadata_and_raw_outputs(tmp_path: Path) -> None:
    """``metadata=False`` must drop the metadata path; ``keep_raw=True`` must add a raw directory."""
    source = touch(tmp_path / "paper.pdf")

    (plan_without_metadata,) = plan_pdf_outputs(source, tmp_path / "out", metadata=False)
    (plan_with_raw,) = plan_pdf_outputs(source, tmp_path / "out", keep_raw=True)

    assert plan_without_metadata.metadata_path is None
    # The report path is still planned when metadata is switched off.
    assert plan_without_metadata.report_path == (tmp_path / "out").resolve() / "paper.report.md"
    assert plan_with_raw.raw_dir == (tmp_path / "out").resolve() / "paper.raw"
|
||||
|
||||
|
||||
def test_recursive_planning_preserves_relative_subdirectories(tmp_path: Path) -> None:
    """Same-named PDFs in different subdirectories must get distinct Markdown paths under the output root."""
    input_root = tmp_path / "pdfs"
    touch(input_root / "same.pdf")
    touch(input_root / "nested" / "same.pdf")

    planned = plan_pdf_outputs(input_root, tmp_path / "out", recursive=True)

    relative_markdown = [
        entry.markdown_path.relative_to((tmp_path / "out").resolve()) for entry in planned
    ]
    assert relative_markdown == [Path("nested") / "same.md", Path("same.md")]
|
||||
|
||||
|
||||
def test_non_recursive_duplicate_output_paths_fail(tmp_path: Path) -> None:
    """Two same-named PDFs given without distinct relative parents must raise DuplicateOutputPathError."""
    first_pdf = touch(tmp_path / "first" / "same.pdf")
    second_pdf = touch(tmp_path / "second" / "same.pdf")
    colliding = tuple(
        DiscoveredPdf(source_path=pdf.resolve()) for pdf in (first_pdf, second_pdf)
    )

    with pytest.raises(DuplicateOutputPathError, match="duplicated"):
        plan_outputs(colliding, tmp_path / "out")
|
||||
|
||||
|
||||
def test_output_conflicts_report_all_existing_paths(tmp_path: Path) -> None:
    """Every pre-existing planned path must be listed on the raised OutputConflictError."""
    source = touch(tmp_path / "paper.pdf")
    output_root = tmp_path / "out"
    # Three planned outputs already exist: two as directories, one as a file.
    (output_root / "paper.assets").mkdir(parents=True)
    (output_root / "paper.md").mkdir()
    touch(output_root / "paper.metadata.json")

    with pytest.raises(OutputConflictError) as raised:
        plan_pdf_outputs(source, output_root)

    reported_names = {conflict.name for conflict in raised.value.conflicts}
    assert reported_names == {"paper.assets", "paper.md", "paper.metadata.json"}
|
||||
|
||||
|
||||
def test_overwrite_allows_existing_paths_without_deleting(tmp_path: Path) -> None:
    """With ``overwrite=True`` planning must succeed and leave the existing file in place."""
    source = touch(tmp_path / "paper.pdf")
    output_root = tmp_path / "out"
    existing_markdown = touch(output_root / "paper.md")

    (single_plan,) = plan_pdf_outputs(source, output_root, overwrite=True)

    assert single_plan.markdown_path == existing_markdown.resolve()
    assert existing_markdown.exists()
|
||||
|
||||
|
||||
def test_output_root_cannot_be_existing_file(tmp_path: Path) -> None:
    """Using an existing regular file as the output root must raise OutputRootError."""
    source = touch(tmp_path / "paper.pdf")
    # "out" is created as a file, not as a directory.
    file_as_root = touch(tmp_path / "out")

    with pytest.raises(OutputRootError, match="not a directory"):
        plan_pdf_outputs(source, file_as_root)
|
||||
|
||||
|
||||
def test_planned_paths_cannot_escape_output_root(tmp_path: Path) -> None:
    """A relative parent of ``..`` must raise OutputPathError."""
    source = touch(tmp_path / "paper.pdf")
    escaping_entry = DiscoveredPdf(source_path=source.resolve(), relative_parent=Path(".."))

    with pytest.raises(OutputPathError, match="escape"):
        plan_outputs((escaping_entry,), tmp_path / "out")
|
||||
|
||||
|
||||
@pytest.mark.skipif(os.name != "nt", reason="Windows rooted path behavior")
@pytest.mark.parametrize(
    "relative_parent",
    [Path("\\outside"), Path("/outside"), Path("C:outside")],
)
def test_windows_rooted_relative_parents_cannot_escape_output_root(
    tmp_path: Path,
    relative_parent: Path,
) -> None:
    """On Windows, each parametrized relative parent must raise OutputPathError."""
    source = touch(tmp_path / "paper.pdf")
    rooted_entry = DiscoveredPdf(source_path=source.resolve(), relative_parent=relative_parent)

    with pytest.raises(OutputPathError, match="escape"):
        plan_outputs((rooted_entry,), tmp_path / "out")
|
||||
@@ -0,0 +1,62 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from pypdf import PdfReader, PdfWriter
|
||||
|
||||
from pdf2md.pdf_splitter import PdfChunkError, count_pdf_pages, plan_pdf_chunks, write_pdf_chunk
|
||||
|
||||
|
||||
def make_blank_pdf(path: Path, page_count: int) -> Path:
    """Write a PDF of ``page_count`` blank 72x72 pages to ``path`` and return ``path``."""
    path.parent.mkdir(parents=True, exist_ok=True)

    pdf_writer = PdfWriter()
    for _page_number in range(page_count):
        pdf_writer.add_blank_page(width=72, height=72)

    with path.open("wb") as destination:
        pdf_writer.write(destination)
    return path
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ("page_count", "expected_ranges"),
    [
        (1, [(1, 1)]),
        (20, [(1, 20)]),
        (21, [(1, 20), (21, 21)]),
        (40, [(1, 20), (21, 40)]),
        (41, [(1, 20), (21, 40), (41, 41)]),
    ],
)
def test_plan_pdf_chunks_uses_one_based_ranges_and_names(
    tmp_path: Path,
    page_count: int,
    expected_ranges: list[tuple[int, int]],
) -> None:
    """Chunks of 20 pages must cover the expected 1-based ranges and carry matching file names."""
    source = make_blank_pdf(tmp_path / "paper.pdf", page_count)

    planned = plan_pdf_chunks(source, chunk_pages=20)

    assert count_pdf_pages(source) == page_count
    planned_ranges = [(part.source_page_start, part.source_page_end) for part in planned]
    assert planned_ranges == expected_ranges

    # Part numbers start at 1; all three numbers in the name are zero-padded to three digits.
    expected_names = []
    for part_number, (first_page, last_page) in enumerate(expected_ranges, start=1):
        expected_names.append(f"paper.part-{part_number:03d}.pages-{first_page:03d}-{last_page:03d}.pdf")
    assert [part.output_filename for part in planned] == expected_names
|
||||
|
||||
|
||||
def test_write_pdf_chunk_writes_expected_page_count(tmp_path: Path) -> None:
    """Writing the second chunk of a 41-page PDF must produce a 20-page file."""
    source = make_blank_pdf(tmp_path / "paper.pdf", 41)
    planned = plan_pdf_chunks(source, chunk_pages=20)
    second_chunk = planned[1]
    destination = tmp_path / "chunks" / second_chunk.output_filename

    written = write_pdf_chunk(second_chunk, destination)

    assert written.exists()
    written_pages = PdfReader(written).pages
    assert len(written_pages) == 20
|
||||
|
||||
|
||||
def test_plan_pdf_chunks_rejects_non_positive_chunk_size(tmp_path: Path) -> None:
    """A chunk size of 0 must raise PdfChunkError."""
    one_page_pdf = make_blank_pdf(tmp_path / "paper.pdf", 1)

    with pytest.raises(PdfChunkError, match="positive integer"):
        plan_pdf_chunks(one_page_pdf, chunk_pages=0)
|
||||
@@ -0,0 +1,144 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from pdf2md.ir import WarningCode, WarningSeverity
|
||||
from pdf2md.quality import (
|
||||
MathCheckerUnavailable,
|
||||
MathCheckResult,
|
||||
check_asset_links,
|
||||
check_math_renderability,
|
||||
extract_math_expressions,
|
||||
merge_quality_results,
|
||||
)
|
||||
|
||||
|
||||
def test_missing_asset_link_is_counted(tmp_path: Path) -> None:
    """One missing-asset failure and one ASSET_LINK_MISSING warning are reported."""
    assets_dir = tmp_path / "assets"
    assets_dir.mkdir()

    # NOTE(review): the Markdown literal is empty here although one missing link
    # is expected -- presumably an image link was lost from this copy; confirm.
    outcome = check_asset_links("", markdown_dir=tmp_path, asset_root=assets_dir)

    assert outcome.missing_asset_link_count == 1
    assert outcome.invalid_asset_link_count == 0
    reported_codes = [item.code for item in outcome.warnings]
    assert reported_codes == [WarningCode.ASSET_LINK_MISSING]
|
||||
|
||||
|
||||
def test_existing_asset_link_passes_without_warning(tmp_path: Path) -> None:
    """With ``assets/fig.png`` present on disk, the check reports no failures or warnings."""
    assets_dir = tmp_path / "assets"
    assets_dir.mkdir()
    figure = assets_dir / "fig.png"
    figure.write_bytes(b"image")

    # NOTE(review): the Markdown literal is empty in this copy -- presumably it
    # originally linked to the figure created above; confirm.
    outcome = check_asset_links("", markdown_dir=tmp_path, asset_root=assets_dir)

    assert outcome.failure_count == 0
    assert outcome.warnings == ()
|
||||
|
||||
|
||||
def test_invalid_asset_links_are_counted_without_fetching(tmp_path: Path) -> None:
    """Three invalid links produce three ASSET_LINK_INVALID warnings and no missing-link count."""
    # NOTE(review): all three entries are empty strings in this copy although
    # three invalid links are expected -- presumably the link text was lost; confirm.
    lines = [
        "",
        "",
        r"",
    ]
    markdown = "\n".join(lines)

    outcome = check_asset_links(markdown, markdown_dir=tmp_path, asset_root=tmp_path / "assets")

    assert outcome.invalid_asset_link_count == 3
    assert outcome.missing_asset_link_count == 0
    reported_codes = [item.code for item in outcome.warnings]
    assert reported_codes == [WarningCode.ASSET_LINK_INVALID] * 3
|
||||
|
||||
|
||||
def test_asset_links_inside_code_are_ignored(tmp_path: Path) -> None:
    """Markdown consisting of a fenced block and an inline code span yields no failures."""
    # NOTE(review): the fenced block and the inline span are empty in this copy --
    # presumably each originally wrapped an asset link; confirm.
    markdown = "```md\n\n```\n``"
    assets_dir = tmp_path / "assets"

    outcome = check_asset_links(markdown, markdown_dir=tmp_path, asset_root=assets_dir)

    assert outcome.failure_count == 0
    assert outcome.warnings == ()
|
||||
|
||||
|
||||
def test_math_render_failures_are_aggregated_with_fake_checker() -> None:
    """A checker that fails bodies containing "bad" yields exactly one render error."""

    def fake_checker(body: str) -> MathCheckResult:
        # Accept everything except bodies that contain the marker "bad".
        accepted = "bad" not in body
        return MathCheckResult(ok=accepted, message=f"{body} failed")

    markdown = "$x_i^2$\n\n$$\nbad_math\n$$"
    outcome = check_math_renderability(markdown, fake_checker)

    assert outcome.math_render_error_count == 1
    assert [item.code for item in outcome.warnings] == [WarningCode.MATH_RENDER_FAILED]
    first_warning = outcome.warnings[0]
    assert "bad_math failed" in first_warning.message
|
||||
|
||||
|
||||
def test_math_extraction_records_display_mode_and_markdown_spans() -> None:
    """Extraction returns index, body, display flag and a span that slices back to the source text."""
    markdown = "Inline $x_i^2$ before\n\n$$\n\\frac{1}{2}\n$$\n"

    found = extract_math_expressions(markdown)

    summary = [(item.index, item.body, item.display) for item in found]
    assert summary == [
        (0, "x_i^2", False),
        (1, "\\frac{1}{2}", True),
    ]

    # Each span is a (start, end) pair that indexes into the original Markdown.
    snippets = []
    for item in found:
        start, end = item.markdown_span
        snippets.append(markdown[start:end])
    assert snippets == [
        "$x_i^2$",
        "$$\n\\frac{1}{2}\n$$",
    ]
|
||||
|
||||
|
||||
def test_math_extraction_ignores_code_and_currency_like_text() -> None:
    """Only ``$z$`` is extracted; math inside code and the ``$12.00$`` price are skipped."""
    markdown = "```tex\n$x$\n```\n`$y$`\nPrice $12.00$ and real $z$."

    found = extract_math_expressions(markdown)

    summary = [(item.body, item.display) for item in found]
    assert summary == [("z", False)]
|
||||
|
||||
|
||||
def test_batch_math_checker_receives_expression_records() -> None:
    """A checker exposing ``check_expressions`` is handed the extracted expression records."""

    class BatchChecker:
        """Records the batch it receives and passes only display-mode expressions."""

        def __init__(self) -> None:
            self.expressions = ()

        def check_expressions(self, expressions):
            self.expressions = expressions
            outcomes = [MathCheckResult(ok=item.display) for item in expressions]
            return tuple(outcomes)

    checker = BatchChecker()
    markdown = "$inline$\n\n$$\ndisplay\n$$"

    outcome = check_math_renderability(markdown, checker)

    received_bodies = [item.body for item in checker.expressions]
    assert received_bodies == ["inline", "display"]
    assert outcome.math_render_error_count == 1
    assert "inline" in outcome.warnings[0].message
|
||||
|
||||
|
||||
def test_math_checker_unavailable_is_nonfatal() -> None:
    """A checker raising MathCheckerUnavailable gives an INFO warning and zero render errors."""

    def unavailable_checker(_: str) -> bool:
        raise MathCheckerUnavailable("local renderer missing")

    outcome = check_math_renderability("$x$", unavailable_checker)

    assert outcome.math_render_error_count == 0
    first_warning = outcome.warnings[0]
    assert first_warning.code == WarningCode.MATH_RENDER_FAILED
    assert first_warning.severity == WarningSeverity.INFO
|
||||
|
||||
|
||||
def test_missing_math_checker_is_explicit_and_nonfatal() -> None:
    """Calling without a checker gives an INFO-level MATH_RENDER_FAILED warning and zero errors."""
    outcome = check_math_renderability("$x$")

    assert outcome.math_render_error_count == 0
    first_warning = outcome.warnings[0]
    assert first_warning.code == WarningCode.MATH_RENDER_FAILED
    assert first_warning.severity == WarningSeverity.INFO
|
||||
|
||||
|
||||
def test_merge_quality_results_combines_counts_and_warning_order(tmp_path: Path) -> None:
    """Merging keeps both counts and lists the asset warning before the math warning."""
    # NOTE(review): the Markdown literal is empty in this copy although one missing
    # link is expected -- presumably an image link was lost; confirm.
    from_assets = check_asset_links("", markdown_dir=tmp_path)
    from_math = check_math_renderability("$x$", lambda _: False)

    merged = merge_quality_results(from_assets, from_math)

    assert merged.missing_asset_link_count == 1
    assert merged.math_render_error_count == 1
    expected_order = [
        WarningCode.ASSET_LINK_MISSING,
        WarningCode.MATH_RENDER_FAILED,
    ]
    assert [item.code for item in merged.warnings] == expected_order
|
||||
@@ -0,0 +1,163 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from pdf2md.ir import (
|
||||
AssetRecord,
|
||||
BlockRecord,
|
||||
BlockType,
|
||||
DocumentRecord,
|
||||
PageRecord,
|
||||
WarningCode,
|
||||
WarningRecord,
|
||||
WarningSeverity,
|
||||
)
|
||||
from pdf2md.metadata import build_metadata
|
||||
from pdf2md.quality import QualityResult
|
||||
from pdf2md.report import determine_final_status, pages_with_warnings, render_report
|
||||
|
||||
|
||||
def make_metadata(tmp_path: Path, *, warnings: tuple[WarningRecord, ...] = ()) -> dict[str, object]:
    """Build metadata for a fixed two-page document fixture.

    Page 0 holds one inline and one display formula block, page 1 holds a
    paragraph block, and a single asset is attached to page 1.

    Args:
        tmp_path: Directory used to form the ``paper.pdf`` source path.
        warnings: Warning records to attach to the document.

    Returns:
        Whatever ``build_metadata`` produces for the fixture document.
    """
    formula_page = PageRecord(
        page_index=0,
        blocks=(
            BlockRecord(BlockType.INLINE_FORMULA, page_index=0),
            BlockRecord(BlockType.DISPLAY_FORMULA, page_index=0),
        ),
    )
    paragraph_page = PageRecord(
        page_index=1,
        blocks=(BlockRecord(BlockType.PARAGRAPH, page_index=1),),
    )
    figure = AssetRecord("paper.assets/fig.png", page_index=1)

    fixture = DocumentRecord(
        source_pdf=tmp_path / "paper.pdf",
        pages=(formula_page, paragraph_page),
        assets=(figure,),
        warnings=warnings,
    )

    return build_metadata(
        document=fixture,
        source_sha256="0" * 64,
        created_at="2026-05-08T00:00:00Z",
        engine="MinerU",
        engine_version="3.1.0",
        engine_options={"strict_local": True},
    )
|
||||
|
||||
|
||||
def test_final_status_success_partial_and_failed(tmp_path: Path) -> None:
    """Final status is success, partial or failed depending on warnings and quality counts."""
    review_warning = WarningRecord(
        WarningCode.READING_ORDER_UNCERTAIN,
        WarningSeverity.WARNING,
        "Review page.",
        page_index=1,
    )
    engine_error = WarningRecord(WarningCode.MINERU_CLI_FAILED, WarningSeverity.ERROR, "MinerU failed.")

    clean = make_metadata(tmp_path)
    with_warning = make_metadata(tmp_path, warnings=(review_warning,))
    with_error = make_metadata(tmp_path, warnings=(engine_error,))

    assert determine_final_status(clean) == "success"
    assert determine_final_status(with_warning) == "partial"
    # Clean metadata combined with a quality failure is reported as partial.
    assert determine_final_status(clean, QualityResult(missing_asset_link_count=1)) == "partial"
    assert determine_final_status(with_error) == "failed"
|
||||
|
||||
|
||||
def test_pages_with_warnings_are_sorted_and_derived_from_metadata_and_quality(tmp_path: Path) -> None:
    """Page 1 from the metadata warning and page 0 from the quality warning come back as (0, 1)."""
    metadata_warning = WarningRecord(
        WarningCode.READING_ORDER_UNCERTAIN,
        WarningSeverity.WARNING,
        "Review page.",
        page_index=1,
    )
    quality_warning = WarningRecord(
        WarningCode.MATH_RENDER_FAILED,
        WarningSeverity.WARNING,
        "Math failed.",
        page_index=0,
    )

    metadata = make_metadata(tmp_path, warnings=(metadata_warning,))
    quality = QualityResult(warnings=(quality_warning,))

    assert pages_with_warnings(metadata, quality) == (0, 1)
|
||||
|
||||
|
||||
def test_report_content_includes_required_sections_and_counts(tmp_path: Path) -> None:
    """The rendered report contains the title, paths, engine details and every count line."""
    review_warning = WarningRecord(
        WarningCode.READING_ORDER_UNCERTAIN,
        WarningSeverity.WARNING,
        "Review page.",
        page_index=1,
    )
    asset_warning = WarningRecord(WarningCode.ASSET_LINK_MISSING, WarningSeverity.WARNING, "Missing asset.")

    metadata = make_metadata(tmp_path, warnings=(review_warning,))
    quality = QualityResult(
        missing_asset_link_count=2,
        invalid_asset_link_count=1,
        math_render_error_count=3,
        warnings=(asset_warning,),
    )

    report = render_report(
        metadata,
        quality=quality,
        markdown_path=tmp_path / "paper.md",
        metadata_path=tmp_path / "paper.metadata.json",
        report_path=tmp_path / "paper.report.md",
    )

    # Checked in this order so the first missing fragment is the one reported.
    expected_fragments = (
        "# PDF-to-Markdown Quality Report",
        "- Final status: `partial`",
        f"- Source PDF: {tmp_path / 'paper.pdf'}",
        f"- Output Markdown: {tmp_path / 'paper.md'}",
        "- Engine: MinerU",
        "- Engine version: 3.1.0",
        '- Engine options: `{"strict_local": true}`',
        "- Pages processed: 2",
        "- Warning count: 2",
        "- Asset count: 1",
        "- Missing asset link count: 2",
        "- Invalid asset link count: 1",
        "- Inline formula count: 1",
        "- Display formula count: 1",
        "- Math render error count: 3",
        "- Page 1",
        "`ASSET_LINK_MISSING`",
    )
    for fragment in expected_fragments:
        assert fragment in report
|
||||
|
||||
|
||||
def test_report_omits_absent_optional_paths_and_does_not_write_files(tmp_path: Path) -> None:
    """Rendering with metadata only omits the optional path lines and creates no report file."""
    metadata = make_metadata(tmp_path)
    unwritten_report = tmp_path / "paper.report.md"

    report = render_report(metadata)

    for absent_label in ("Output Markdown:", "Metadata JSON:", "Report Markdown:"):
        assert absent_label not in report
    assert not unwritten_report.exists()
|
||||
|
||||
|
||||
def test_report_failed_status_comes_from_error_severity_warning(tmp_path: Path) -> None:
    """An ERROR-severity warning in the metadata makes the report show a failed status."""
    engine_error = WarningRecord(WarningCode.MINERU_CLI_FAILED, WarningSeverity.ERROR, "MinerU failed.")
    metadata = make_metadata(tmp_path, warnings=(engine_error,))

    report = render_report(metadata)

    assert "- Final status: `failed`" in report
|
||||
|
||||
|
||||
def test_report_uses_metadata_math_render_count_plus_quality_count(tmp_path: Path) -> None:
    """One metadata math warning plus a quality count of two is reported as three."""
    metadata_math_failure = WarningRecord(
        WarningCode.MATH_RENDER_FAILED,
        WarningSeverity.ERROR,
        "Metadata math failed.",
    )
    metadata = make_metadata(tmp_path, warnings=(metadata_math_failure,))
    quality = QualityResult(math_render_error_count=2)

    report = render_report(metadata, quality=quality)

    assert "- Math render error count: 3" in report
|
||||
|
||||
|
||||
def test_report_includes_chunk_context_when_metadata_has_chunk_options(tmp_path: Path) -> None:
    """Chunk details placed under ``engine_options`` appear as a "Chunk" line in the report."""
    chunk_details = {
        "chunk_index": 2,
        "total_chunks": 3,
        "source_page_start": 21,
        "source_page_end": 40,
    }
    metadata = make_metadata(tmp_path)
    # Replace the fixture's engine options with ones that carry chunk context.
    metadata["engine_options"] = {"strict_local": True, "chunk": chunk_details}

    report = render_report(metadata)

    assert "- Chunk: 2/3, source pages: 21-40" in report
|
||||
@@ -0,0 +1,72 @@
|
||||
import MathJax from "mathjax";
|
||||
import process from "node:process";
|
||||
|
||||
// Maximum length of any error message this script reports; errorMessage()
// and renderError() both truncate their result to this many characters.
const MAX_MESSAGE_LENGTH = 500;
|
||||
|
||||
/**
 * Turn a thrown value into a bounded message string.
 *
 * Uses `error.message` when it is a string, otherwise the value's string
 * form, and truncates the result to MAX_MESSAGE_LENGTH.
 *
 * @param {unknown} error - The caught value.
 * @returns {string} The truncated message.
 */
function errorMessage(error) {
  let text;
  if (error && typeof error.message === "string") {
    text = error.message;
  } else {
    text = String(error);
  }
  return text.slice(0, MAX_MESSAGE_LENGTH);
}
|
||||
|
||||
/**
 * Extract the value of the first `data-mjx-error` attribute from serialized
 * output.
 *
 * @param {string} serialized - Serialized markup to inspect.
 * @returns {string} The attribute value truncated to MAX_MESSAGE_LENGTH, or
 *   an empty string when no non-empty attribute is found.
 */
function renderError(serialized) {
  const found = serialized.match(/data-mjx-error="([^"]+)"/);
  if (!found) {
    return "";
  }
  const [, attributeValue] = found;
  return attributeValue.slice(0, MAX_MESSAGE_LENGTH);
}
|
||||
|
||||
/**
 * Read all of standard input and return it as one string.
 *
 * The stream is switched to UTF-8 string mode before reading. Without that,
 * chunks arrive as Buffers and are stringified one at a time, so a multi-byte
 * character that straddles two chunks would be decoded as replacement
 * characters.
 *
 * @returns {Promise<string>} Everything received on stdin ("" when empty).
 */
async function readStdin() {
  // Decode at the stream level so chunk boundaries cannot split a character.
  process.stdin.setEncoding("utf8");

  let input = "";
  for await (const chunk of process.stdin) {
    input += chunk;
  }
  return input;
}
|
||||
|
||||
/**
 * Parse the JSON request and check that it carries an `expressions` array.
 *
 * Empty input is parsed as `{}` and therefore fails the array check.
 *
 * @param {string} input - Raw JSON text.
 * @returns {object} The parsed payload.
 * @throws {SyntaxError} When the text is not valid JSON.
 * @throws {Error} When the payload has no `expressions` array.
 */
function parsePayload(input) {
  const parsed = JSON.parse(input || "{}");
  const hasExpressionList = Boolean(parsed) && Array.isArray(parsed.expressions);
  if (hasExpressionList) {
    return parsed;
  }
  throw new Error("Input JSON must contain an expressions array.");
}
|
||||
|
||||
/**
 * Render one expression with MathJax and report whether it succeeded.
 *
 * @param {object} expression - Record with `index`, `body` and `display`.
 * @returns {Promise<{index: number, ok: boolean, message?: string}>} `ok` is
 *   false when rendering throws or the serialized output carries a
 *   `data-mjx-error` attribute; `message` is only present on failure.
 */
async function checkExpression(expression) {
  const index = Number(expression.index);
  const tex = String(expression.body ?? "");
  const isDisplay = Boolean(expression.display);

  try {
    const node = await MathJax.tex2svgPromise(tex, { display: isDisplay });
    const markup = MathJax.startup.adaptor.serializeXML(node);
    // An error is detected by looking for the data-mjx-error attribute in
    // the serialized output, in addition to catching thrown errors below.
    const problem = renderError(markup);
    return problem ? { index, ok: false, message: problem } : { index, ok: true };
  } catch (failure) {
    return { index, ok: false, message: errorMessage(failure) };
  }
}
|
||||
|
||||
/**
 * Entry point: initialise MathJax, then either answer a health probe or check
 * every expression supplied on stdin.
 *
 * With `--health` on the command line a trivial expression is rendered and
 * `{"ok": true}` is printed. Otherwise the stdin payload is parsed and one
 * result per expression is printed as `{"results": [...]}`.
 *
 * @returns {Promise<void>}
 */
async function main() {
  await MathJax.init({
    loader: { load: ["input/tex", "output/svg"] },
  });

  const healthOnly = process.argv.includes("--health");
  if (healthOnly) {
    await MathJax.tex2svgPromise("x", { display: false });
    process.stdout.write(`${JSON.stringify({ ok: true })}\n`);
    return;
  }

  const { expressions } = parsePayload(await readStdin());

  // Checked one at a time, in input order, so results line up with the request.
  const results = [];
  for (const item of expressions) {
    const outcome = await checkExpression(item);
    results.push(outcome);
  }

  process.stdout.write(`${JSON.stringify({ results })}\n`);
}
|
||||
|
||||
// Run the script; on any unhandled failure print the bounded message to
// stderr and make the process exit with status 1.
main().catch((failure) => {
  const text = errorMessage(failure);
  process.stderr.write(`${text}\n`);
  process.exitCode = 1;
});
|
||||
@@ -0,0 +1,92 @@
|
||||
version = 1
|
||||
revision = 3
|
||||
requires-python = "==3.12.*"
|
||||
|
||||
[[package]]
|
||||
name = "colorama"
|
||||
version = "0.4.6"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697, upload-time = "2022-10-25T02:36:22.414Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "convert-pdf-to-md"
|
||||
version = "0.1.0"
|
||||
source = { editable = "." }
|
||||
dependencies = [
|
||||
{ name = "pypdf" },
|
||||
]
|
||||
|
||||
[package.dev-dependencies]
|
||||
dev = [
|
||||
{ name = "pytest" },
|
||||
]
|
||||
|
||||
[package.metadata]
|
||||
requires-dist = [{ name = "pypdf", specifier = ">=6.10.2,<7" }]
|
||||
|
||||
[package.metadata.requires-dev]
|
||||
dev = [{ name = "pytest", specifier = ">=8.3" }]
|
||||
|
||||
[[package]]
|
||||
name = "iniconfig"
|
||||
version = "2.3.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/72/34/14ca021ce8e5dfedc35312d08ba8bf51fdd999c576889fc2c24cb97f4f10/iniconfig-2.3.0.tar.gz", hash = "sha256:c76315c77db068650d49c5b56314774a7804df16fee4402c1f19d6d15d8c4730", size = 20503, upload-time = "2025-10-18T21:55:43.219Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12", size = 7484, upload-time = "2025-10-18T21:55:41.639Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "packaging"
|
||||
version = "26.2"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/d7/f1/e7a6dd94a8d4a5626c03e4e99c87f241ba9e350cd9e6d75123f992427270/packaging-26.2.tar.gz", hash = "sha256:ff452ff5a3e828ce110190feff1178bb1f2ea2281fa2075aadb987c2fb221661", size = 228134, upload-time = "2026-04-24T20:15:23.917Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/df/b2/87e62e8c3e2f4b32e5fe99e0b86d576da1312593b39f47d8ceef365e95ed/packaging-26.2-py3-none-any.whl", hash = "sha256:5fc45236b9446107ff2415ce77c807cee2862cb6fac22b8a73826d0693b0980e", size = 100195, upload-time = "2026-04-24T20:15:22.081Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pluggy"
|
||||
version = "1.6.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/f9/e2/3e91f31a7d2b083fe6ef3fa267035b518369d9511ffab804f839851d2779/pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3", size = 69412, upload-time = "2025-05-15T12:30:07.975Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538, upload-time = "2025-05-15T12:30:06.134Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pygments"
|
||||
version = "2.20.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/c3/b2/bc9c9196916376152d655522fdcebac55e66de6603a76a02bca1b6414f6c/pygments-2.20.0.tar.gz", hash = "sha256:6757cd03768053ff99f3039c1a36d6c0aa0b263438fcab17520b30a303a82b5f", size = 4955991, upload-time = "2026-03-29T13:29:33.898Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/f4/7e/a72dd26f3b0f4f2bf1dd8923c85f7ceb43172af56d63c7383eb62b332364/pygments-2.20.0-py3-none-any.whl", hash = "sha256:81a9e26dd42fd28a23a2d169d86d7ac03b46e2f8b59ed4698fb4785f946d0176", size = 1231151, upload-time = "2026-03-29T13:29:30.038Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pypdf"
|
||||
version = "6.10.2"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/7b/3f/9f2167401c2e94833ca3b69535bad89e533b5de75fefe4197a2c224baec2/pypdf-6.10.2.tar.gz", hash = "sha256:7d09ce108eff6bf67465d461b6ef352dcb8d84f7a91befc02f904455c6eea11d", size = 5315679, upload-time = "2026-04-15T16:37:36.978Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/0c/d6/1d5c60cc17bbdf37c1552d9c03862fc6d32c5836732a0415b2d637edc2d0/pypdf-6.10.2-py3-none-any.whl", hash = "sha256:aa53be9826655b51c96741e5d7983ca224d898ac0a77896e64636810517624aa", size = 336308, upload-time = "2026-04-15T16:37:34.851Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pytest"
|
||||
version = "9.0.3"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "colorama", marker = "sys_platform == 'win32'" },
|
||||
{ name = "iniconfig" },
|
||||
{ name = "packaging" },
|
||||
{ name = "pluggy" },
|
||||
{ name = "pygments" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/7d/0d/549bd94f1a0a402dc8cf64563a117c0f3765662e2e668477624baeec44d5/pytest-9.0.3.tar.gz", hash = "sha256:b86ada508af81d19edeb213c681b1d48246c1a91d304c6c81a427674c17eb91c", size = 1572165, upload-time = "2026-04-07T17:16:18.027Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/d4/24/a372aaf5c9b7208e7112038812994107bc65a84cd00e0354a88c2c77a617/pytest-9.0.3-py3-none-any.whl", hash = "sha256:2c5efc453d45394fdd706ade797c0a81091eccd1d6e4bccfcd476e2b8e0ab5d9", size = 375249, upload-time = "2026-04-07T17:16:16.13Z" },
|
||||
]
|
||||
Reference in New Issue
Block a user