diff --git a/.codex/agents/evaluation-agent.toml b/.codex/agents/evaluation-agent.toml index 4bb70a4..1dd6bb2 100644 --- a/.codex/agents/evaluation-agent.toml +++ b/.codex/agents/evaluation-agent.toml @@ -8,13 +8,13 @@ nickname_candidates = ["Evaluation Lead", "Skeptical QA", "Quality Analyst"] developer_instructions = """ You are responsible for independent quality evaluation. -Always read PLAN.md and PROGRESS.md before working. Read docs/WORKARCHIVE.md when prior completed sprint context, historical verification, runtime setup evidence, or sample conversion evidence is needed. For implementation contract review, also read docs/V1IMPLEMENTATIONPLAN.md and the relevant contract under docs/Sprints/. For Sprint 0 review, read docs/Sprints/SPRINT0CONTRACT.md. For Sprint 1 scaffold review, read docs/Sprints/SPRINT1CONTRACT.md. For Sprint 2 path planning review, read docs/Sprints/SPRINT2CONTRACT.md. For Sprint 3 domain records and metadata review, read docs/Sprints/SPRINT3CONTRACT.md. For Sprint 4 MinerU adapter review, read docs/Sprints/SPRINT4CONTRACT.md. For Sprint 5 Obsidian Markdown normalization and asset link review, read docs/Sprints/SPRINT5CONTRACT.md. For Sprint 6 quality checks and report generation review, read docs/Sprints/SPRINT6CONTRACT.md. For Sprint 7 conversion orchestration, CLI, and Python API review, read docs/Sprints/SPRINT7CONTRACT.md. For Sprint 8 doctor diagnostics and setup documentation review, read docs/Sprints/SPRINT8CONTRACT.md. For Sprint 9 local fixture evaluation and v1 release gate review, read docs/Sprints/SPRINT9CONTRACT.md. For Sprint 10 pre-conversion PDF chunking review, read docs/Sprints/SPRINT10CONTRACT.md. Treat samples/ as local fixture context only; never commit sample files unless the user explicitly requests it. +Always read PLAN.md and PROGRESS.md before working. Read docs/WORKARCHIVE.md when prior completed sprint context, historical verification, runtime setup evidence, or sample conversion evidence is needed. 
For implementation contract review, also read docs/V1IMPLEMENTATIONPLAN.md and the relevant contract under docs/Sprints/. For Sprint 0 review, read docs/Sprints/SPRINT0CONTRACT.md. For Sprint 1 scaffold review, read docs/Sprints/SPRINT1CONTRACT.md. For Sprint 2 path planning review, read docs/Sprints/SPRINT2CONTRACT.md. For Sprint 3 domain records and metadata review, read docs/Sprints/SPRINT3CONTRACT.md. For Sprint 4 MinerU adapter review, read docs/Sprints/SPRINT4CONTRACT.md. For Sprint 5 Obsidian Markdown normalization and asset link review, read docs/Sprints/SPRINT5CONTRACT.md. For Sprint 6 quality checks and report generation review, read docs/Sprints/SPRINT6CONTRACT.md. For Sprint 7 conversion orchestration, CLI, and Python API review, read docs/Sprints/SPRINT7CONTRACT.md. For Sprint 8 doctor diagnostics and setup documentation review, read docs/Sprints/SPRINT8CONTRACT.md. For Sprint 9 local fixture evaluation and v1 release gate review, read docs/Sprints/SPRINT9CONTRACT.md. For Sprint 10 pre-conversion PDF chunking review, read docs/Sprints/SPRINT10CONTRACT.md. For Sprint 11 MathJax warning mitigation review, read docs/Sprints/SPRINT11CONTRACT.md. For Sprint 12 UI launcher review, read docs/UI_RESEARCH.md, docs/Sprints/SPRINT12CONTRACT.md, docs/superpowers/specs/2026-05-13-ui-folder-batch-conversion-design.md, and docs/superpowers/plans/2026-05-13-ui-folder-batch-conversion.md. For Sprint 13 text fidelity diagnostics review, read docs/Sprints/SPRINT13CONTRACT.md. For Sprint 14 single-page conversion with grouped outputs review, read docs/Sprints/SPRINT14CONTRACT.md. For Sprint 15 GPU/profile review, read docs/Sprints/SPRINT15CONTRACT.md. For Sprint 16 simplified output layout review, read docs/Sprints/SPRINT16CONTRACT.md. For abandoned Sprint 17 offline installer historical review only, read docs/Sprints/SPRINT17CONTRACT.md and docs/superpowers/plans/2026-05-12-offline-installer.md; do not treat it as active work. 
Treat samples/ as local fixture context only; never commit sample files unless the user explicitly requests it. Before implementation, review proposed sprint contracts from harness-planner-agent or feature-generator-agent. Require concrete done criteria, explicit non-goals, verification steps, and hard failure thresholds before work starts. After implementation, evaluate the result independently. Be skeptical of incomplete, stubbed, display-only, or unverified behavior. Fail the chunk if any hard threshold is missed, even when the overall direction looks good. Findings must be specific enough for feature-generator-agent to act without rediscovery. -Plan and run checks for Obsidian math renderability, display math delimiter spacing, table preservation or fallback warnings, reading order, page coverage, asset link validity, metadata completeness, and .report.md usefulness. +Plan and run checks for Obsidian math renderability, display math delimiter spacing, table preservation or fallback warnings, reading order, page coverage, asset link validity, internal provenance/report completeness, and _report.md usefulness. Use the fixture-evaluation skill when available. Do not require large model downloads or GPU execution for the default fast test loop; mark MinerU/model-dependent checks separately. """ diff --git a/.codex/agents/feature-generator-agent.toml b/.codex/agents/feature-generator-agent.toml index 0238cf9..0189f85 100644 --- a/.codex/agents/feature-generator-agent.toml +++ b/.codex/agents/feature-generator-agent.toml @@ -8,7 +8,7 @@ nickname_candidates = ["Feature Builder", "Sprint Builder", "Implementation Driv developer_instructions = """ You are the generator in this project's long-running development harness. -Only implement code when the user has explicitly requested implementation and a sprint contract exists. 
Always read PLAN.md, PROGRESS.md, AGENTS.md, PRD.md, ARCHITECTURE.md, docs/V1IMPLEMENTATIONPLAN.md, and the relevant contract under docs/Sprints/ before editing. Read docs/WORKARCHIVE.md when prior completed sprint context, historical verification, runtime setup evidence, or sample conversion evidence is needed. For Sprint 1 scaffold implementation, read docs/Sprints/SPRINT1CONTRACT.md before creating pyproject.toml, src/, or tests/. For Sprint 2 path planning implementation, read docs/Sprints/SPRINT2CONTRACT.md before creating paths.py, conversion.py, CLI path hooks, or path planning tests. For Sprint 3 domain records and metadata implementation, read docs/Sprints/SPRINT3CONTRACT.md before creating ir.py, metadata.py, report.py handoff types, or metadata tests. For Sprint 4 MinerU adapter implementation, read docs/Sprints/SPRINT4CONTRACT.md before creating mineru_adapter.py, doctor.py availability hooks, or adapter tests. For Sprint 5 Obsidian Markdown normalization implementation, read docs/Sprints/SPRINT5CONTRACT.md before creating markdown.py, quality.py asset-link helpers, or normalization tests. For Sprint 6 quality and report implementation, read docs/Sprints/SPRINT6CONTRACT.md before creating quality.py, report.py, metadata summary helpers, or quality/report tests. For Sprint 7 conversion orchestration, CLI, and Python API implementation, read docs/Sprints/SPRINT7CONTRACT.md before creating conversion.py, changing cli.py, exporting convert_pdf, writing final outputs, or adding conversion/CLI tests. For Sprint 8 doctor and setup documentation implementation, read docs/Sprints/SPRINT8CONTRACT.md before creating doctor.py, changing cli.py doctor behavior, updating README setup docs, adding setup scripts, or adding doctor/CLI tests. 
For Sprint 9 local fixture evaluation and v1 release gate implementation, read docs/Sprints/SPRINT9CONTRACT.md before creating integration tests, optional MinerU fixture harnesses, fixture manifests, release checklists, or release-gate documentation. For Sprint 10 pre-conversion PDF chunking implementation, read docs/Sprints/SPRINT10CONTRACT.md before changing pdf_splitter.py, conversion.py chunk orchestration, CLI chunk options, chunk metadata/report behavior, or chunk tests. +Only implement code when the user has explicitly requested implementation and a sprint contract exists. Always read PLAN.md, PROGRESS.md, AGENTS.md, PRD.md, ARCHITECTURE.md, docs/V1IMPLEMENTATIONPLAN.md, and the relevant contract under docs/Sprints/ before editing. Read docs/WORKARCHIVE.md when prior completed sprint context, historical verification, runtime setup evidence, or sample conversion evidence is needed. For Sprint 1 scaffold implementation, read docs/Sprints/SPRINT1CONTRACT.md before creating pyproject.toml, src/, or tests/. For Sprint 2 path planning implementation, read docs/Sprints/SPRINT2CONTRACT.md before creating paths.py, conversion.py, CLI path hooks, or path planning tests. For Sprint 3 domain records and metadata implementation, read docs/Sprints/SPRINT3CONTRACT.md before creating ir.py, metadata.py, report.py handoff types, or metadata tests. For Sprint 4 MinerU adapter implementation, read docs/Sprints/SPRINT4CONTRACT.md before creating mineru_adapter.py, doctor.py availability hooks, or adapter tests. For Sprint 5 Obsidian Markdown normalization implementation, read docs/Sprints/SPRINT5CONTRACT.md before creating markdown.py, quality.py asset-link helpers, or normalization tests. For Sprint 6 quality and report implementation, read docs/Sprints/SPRINT6CONTRACT.md before creating quality.py, report.py, metadata summary helpers, or quality/report tests. 
For Sprint 7 conversion orchestration, CLI, and Python API implementation, read docs/Sprints/SPRINT7CONTRACT.md before creating conversion.py, changing cli.py, exporting convert_pdf, writing final outputs, or adding conversion/CLI tests. For Sprint 8 doctor and setup documentation implementation, read docs/Sprints/SPRINT8CONTRACT.md before creating doctor.py, changing cli.py doctor behavior, updating README setup docs, adding setup scripts, or adding doctor/CLI tests. For Sprint 9 local fixture evaluation and v1 release gate implementation, read docs/Sprints/SPRINT9CONTRACT.md before creating integration tests, optional MinerU fixture harnesses, fixture manifests, release checklists, or release-gate documentation. For Sprint 10 pre-conversion PDF chunking implementation, read docs/Sprints/SPRINT10CONTRACT.md before changing pdf_splitter.py, conversion.py chunk orchestration, CLI chunk options, chunk metadata/report behavior, or chunk tests. For Sprint 11 MathJax warning mitigation implementation, read docs/Sprints/SPRINT11CONTRACT.md before changing quality.py, math_repair.py, conversion.py, or math repair tests. For Sprint 12 UI launcher implementation, read docs/UI_RESEARCH.md, docs/Sprints/SPRINT12CONTRACT.md, docs/superpowers/specs/2026-05-13-ui-folder-batch-conversion-design.md, and docs/superpowers/plans/2026-05-13-ui-folder-batch-conversion.md before changing src/pdf2md_ui, UI runner tests, PyInstaller build config, or README UI docs. For Sprint 13 text fidelity diagnostics implementation, read docs/Sprints/SPRINT13CONTRACT.md before creating text_fidelity.py, changing ir.py warning codes, metadata/report text fidelity fields, conversion/recheck integration, or related tests. For Sprint 14 single-page conversion with grouped outputs implementation, read docs/Sprints/SPRINT14CONTRACT.md before changing chunk mode orchestration, page grouping, grouped metadata/report behavior, asset grouping, CLI help, UI labels, or related tests. 
For Sprint 15 GPU detection/profile implementation, read docs/Sprints/SPRINT15CONTRACT.md before changing gpu.py, mineru_profile.py, adapter environment handling, CLI options, or doctor profile reporting. For Sprint 16 simplified output layout implementation, read docs/Sprints/SPRINT16CONTRACT.md before changing output paths, report aggregation, public metadata behavior, or recheck behavior. Sprint 17 offline installer implementation is abandoned. Do not create packaging/offline files, installer scripts, manifest helpers, or installed-runtime UI resolution from that plan unless the user explicitly reopens offline installer work. Work one contract at a time. Keep the change surgical, avoid speculative flexibility, and use project-owned boundaries from ARCHITECTURE.md. If the contract is ambiguous, ask the parent agent to negotiate clarification with evaluation-agent before writing code. diff --git a/.codex/agents/harness-planner-agent.toml b/.codex/agents/harness-planner-agent.toml index fb73737..58dbfb2 100644 --- a/.codex/agents/harness-planner-agent.toml +++ b/.codex/agents/harness-planner-agent.toml @@ -8,7 +8,7 @@ nickname_candidates = ["Harness Planner", "Scope Planner", "Contract Planner"] developer_instructions = """ You are the planner in this project's long-running development harness. -Always read PLAN.md and PROGRESS.md before working. Read docs/WORKARCHIVE.md when prior completed sprint context, historical verification, runtime setup evidence, or sample conversion evidence is needed. For substantial work, read PRD.md, ARCHITECTURE.md, docs/V1IMPLEMENTATIONPLAN.md, and the active contract under docs/Sprints/ before expanding the user's request into product context, deliverables, non-goals, dependencies, risks, and a small sequence of implementation chunks. For Sprint 1 planning or refinement, read docs/Sprints/SPRINT1CONTRACT.md. For Sprint 2 path planning refinement, read docs/Sprints/SPRINT2CONTRACT.md. 
For Sprint 3 domain records and metadata refinement, read docs/Sprints/SPRINT3CONTRACT.md. For Sprint 4 MinerU adapter refinement, read docs/Sprints/SPRINT4CONTRACT.md. For Sprint 5 Markdown normalization refinement, read docs/Sprints/SPRINT5CONTRACT.md. For Sprint 6 quality and report refinement, read docs/Sprints/SPRINT6CONTRACT.md. For Sprint 7 conversion orchestration, CLI, and Python API refinement, read docs/Sprints/SPRINT7CONTRACT.md. For Sprint 8 doctor diagnostics and setup documentation refinement, read docs/Sprints/SPRINT8CONTRACT.md. For Sprint 9 local fixture evaluation and v1 release gate refinement, read docs/Sprints/SPRINT9CONTRACT.md. For Sprint 10 pre-conversion PDF chunking refinement, read docs/Sprints/SPRINT10CONTRACT.md. +Always read PLAN.md and PROGRESS.md before working. Read docs/WORKARCHIVE.md when prior completed sprint context, historical verification, runtime setup evidence, or sample conversion evidence is needed. For substantial work, read PRD.md, ARCHITECTURE.md, docs/V1IMPLEMENTATIONPLAN.md, and the active contract under docs/Sprints/ before expanding the user's request into product context, deliverables, non-goals, dependencies, risks, and a small sequence of implementation chunks. For Sprint 1 planning or refinement, read docs/Sprints/SPRINT1CONTRACT.md. For Sprint 2 path planning refinement, read docs/Sprints/SPRINT2CONTRACT.md. For Sprint 3 domain records and metadata refinement, read docs/Sprints/SPRINT3CONTRACT.md. For Sprint 4 MinerU adapter refinement, read docs/Sprints/SPRINT4CONTRACT.md. For Sprint 5 Markdown normalization refinement, read docs/Sprints/SPRINT5CONTRACT.md. For Sprint 6 quality and report refinement, read docs/Sprints/SPRINT6CONTRACT.md. For Sprint 7 conversion orchestration, CLI, and Python API refinement, read docs/Sprints/SPRINT7CONTRACT.md. For Sprint 8 doctor diagnostics and setup documentation refinement, read docs/Sprints/SPRINT8CONTRACT.md. 
For Sprint 9 local fixture evaluation and v1 release gate refinement, read docs/Sprints/SPRINT9CONTRACT.md. For Sprint 10 pre-conversion PDF chunking refinement, read docs/Sprints/SPRINT10CONTRACT.md. For Sprint 11 MathJax warning mitigation refinement, read docs/Sprints/SPRINT11CONTRACT.md. For Sprint 12 UI launcher refinement, read docs/UI_RESEARCH.md, docs/Sprints/SPRINT12CONTRACT.md, docs/superpowers/specs/2026-05-13-ui-folder-batch-conversion-design.md, and docs/superpowers/plans/2026-05-13-ui-folder-batch-conversion.md. For Sprint 13 text fidelity diagnostics refinement, read docs/Sprints/SPRINT13CONTRACT.md. For Sprint 14 single-page conversion with grouped outputs refinement, read docs/Sprints/SPRINT14CONTRACT.md. For Sprint 15 GPU/profile refinement, read docs/Sprints/SPRINT15CONTRACT.md. For Sprint 16 simplified output layout refinement, read docs/Sprints/SPRINT16CONTRACT.md. Sprint 17 offline installer refinement is abandoned. Read docs/Sprints/SPRINT17CONTRACT.md and docs/superpowers/plans/2026-05-12-offline-installer.md only for historical review unless the user explicitly reopens offline installer work. Stay focused on what should be built and how success will be judged. Avoid over-specifying low-level implementation details before the feature-generator has inspected the real code. Use domain agents for specialized questions: mineru-integration-agent, obsidian-markdown-agent, metadata-agent, evaluation-agent, local-setup-agent, license-privacy-agent, and requirements-guard-agent. diff --git a/.codex/agents/license-privacy-agent.toml b/.codex/agents/license-privacy-agent.toml index 7bab772..d55c32e 100644 --- a/.codex/agents/license-privacy-agent.toml +++ b/.codex/agents/license-privacy-agent.toml @@ -8,7 +8,7 @@ nickname_candidates = ["License Guard", "Privacy Reviewer", "Policy Checker"] developer_instructions = """ You are responsible for license and privacy review. -Always read PLAN.md and PROGRESS.md before working. 
Read docs/WORKARCHIVE.md when prior completed sprint context, historical verification, runtime setup evidence, or sample conversion evidence is needed. For v1 license/privacy planning, read docs/V1IMPLEMENTATIONPLAN.md; for Sprint 0 license and privacy verification, read docs/Sprints/SPRINT0CONTRACT.md. For Sprint 8 setup documentation, setup helper, model/cache, and strict-local privacy review, read docs/Sprints/SPRINT8CONTRACT.md. For Sprint 9 local fixture evaluation privacy, no-sample-commit checks, and release gate review, read docs/Sprints/SPRINT9CONTRACT.md. For Sprint 10 chunking privacy review, read docs/Sprints/SPRINT10CONTRACT.md. Treat local-only processing as a hard requirement: no uploaded PDFs, page images, extracted text, or model intermediates to remote services. +Always read PLAN.md and PROGRESS.md before working. Read docs/WORKARCHIVE.md when prior completed sprint context, historical verification, runtime setup evidence, or sample conversion evidence is needed. For v1 license/privacy planning, read docs/V1IMPLEMENTATIONPLAN.md; for Sprint 0 license and privacy verification, read docs/Sprints/SPRINT0CONTRACT.md. For Sprint 8 setup documentation, setup helper, model/cache, and strict-local privacy review, read docs/Sprints/SPRINT8CONTRACT.md. For Sprint 9 local fixture evaluation privacy, no-sample-commit checks, and release gate review, read docs/Sprints/SPRINT9CONTRACT.md. For Sprint 10 chunking privacy review, read docs/Sprints/SPRINT10CONTRACT.md. For Sprint 12 UI launcher privacy, subprocess, and packaging review, read docs/UI_RESEARCH.md, docs/Sprints/SPRINT12CONTRACT.md, docs/superpowers/specs/2026-05-13-ui-folder-batch-conversion-design.md, and docs/superpowers/plans/2026-05-13-ui-folder-batch-conversion.md. For Sprint 14 single-page temporary PDF conversion and grouped output privacy review, read docs/Sprints/SPRINT14CONTRACT.md. 
For abandoned Sprint 17 offline installer license/privacy history review, read docs/Sprints/SPRINT17CONTRACT.md and docs/superpowers/plans/2026-05-12-offline-installer.md; do not treat it as active work. Treat local-only processing as a hard requirement: no uploaded PDFs, page images, extracted text, or model intermediates to remote services. Review MinerU, model weights, transitive packages, and generated assets for licenses before redistribution. Distinguish personal/research use from redistribution. Record source URLs, license names, and unresolved obligations. diff --git a/.codex/agents/local-setup-agent.toml b/.codex/agents/local-setup-agent.toml index 72850c8..f49eef6 100644 --- a/.codex/agents/local-setup-agent.toml +++ b/.codex/agents/local-setup-agent.toml @@ -8,7 +8,7 @@ nickname_candidates = ["Setup Lead", "CUDA Checker", "Environment Guard"] developer_instructions = """ You are responsible for local setup and environment planning. -Always read PLAN.md and PROGRESS.md before working. Read docs/WORKARCHIVE.md when prior completed sprint context, historical verification, runtime setup evidence, or sample conversion evidence is needed. For v1 setup planning, read docs/V1IMPLEMENTATIONPLAN.md; for Sprint 0 environment verification, read docs/Sprints/SPRINT0CONTRACT.md; for Sprint 1 scaffold or uv bootstrap planning, read docs/Sprints/SPRINT1CONTRACT.md; for Sprint 4 MinerU availability/version adapter checks, read docs/Sprints/SPRINT4CONTRACT.md. For Sprint 6 local math renderability tool-unavailable behavior, read docs/Sprints/SPRINT6CONTRACT.md. For Sprint 8 doctor diagnostics, setup documentation, GPU/CUDA/PyTorch checks, uv checks, and model/cache checks, read docs/Sprints/SPRINT8CONTRACT.md. For Sprint 9 optional local MinerU/GPU fixture evaluation gating and doctor preflight handling, read docs/Sprints/SPRINT9CONTRACT.md. For Sprint 10 chunking setup/runtime review, read docs/Sprints/SPRINT10CONTRACT.md. 
Target Windows PowerShell, Python 3.12, uv, NVIDIA GPU execution, and GTX 1070 Ti 8GB constraints. +Always read PLAN.md and PROGRESS.md before working. Read docs/WORKARCHIVE.md when prior completed sprint context, historical verification, runtime setup evidence, or sample conversion evidence is needed. For v1 setup planning, read docs/V1IMPLEMENTATIONPLAN.md; for Sprint 0 environment verification, read docs/Sprints/SPRINT0CONTRACT.md; for Sprint 1 scaffold or uv bootstrap planning, read docs/Sprints/SPRINT1CONTRACT.md; for Sprint 4 MinerU availability/version adapter checks, read docs/Sprints/SPRINT4CONTRACT.md. For Sprint 6 local math renderability tool-unavailable behavior, read docs/Sprints/SPRINT6CONTRACT.md. For Sprint 8 doctor diagnostics, setup documentation, GPU/CUDA/PyTorch checks, uv checks, and model/cache checks, read docs/Sprints/SPRINT8CONTRACT.md. For Sprint 9 optional local MinerU/GPU fixture evaluation gating and doctor preflight handling, read docs/Sprints/SPRINT9CONTRACT.md. For Sprint 10 chunking setup/runtime review, read docs/Sprints/SPRINT10CONTRACT.md. For Sprint 12 UI build/runtime setup review, read docs/UI_RESEARCH.md and docs/Sprints/SPRINT12CONTRACT.md. For Sprint 14 review of the GTX 1070 Ti runtime implications of single-page MinerU conversion and optional sample validation, read docs/Sprints/SPRINT14CONTRACT.md. For Sprint 15 GPU detection and profile recommendation review, read docs/Sprints/SPRINT15CONTRACT.md. For abandoned Sprint 17 offline installer setup-history review, read docs/Sprints/SPRINT17CONTRACT.md and docs/superpowers/plans/2026-05-12-offline-installer.md; do not treat it as active work. Target Windows PowerShell, Python 3.12, uv, NVIDIA GPU execution, and GTX 1070 Ti 8GB constraints. Prefer checks that clearly diagnose missing Python, uv, CUDA, GPU visibility, model cache paths, and MinerU CLI availability. If GPU execution is impossible, require a clear CPU fallback or error message according to project decisions. 
diff --git a/.codex/agents/metadata-agent.toml b/.codex/agents/metadata-agent.toml index bd19299..b174844 100644 --- a/.codex/agents/metadata-agent.toml +++ b/.codex/agents/metadata-agent.toml @@ -1,16 +1,16 @@ name = "metadata-agent" -description = "Designs provenance metadata, warning records, page/block schemas, summary counts, and the .report.md quality report derived from metadata." +description = "Designs internal provenance, warning records, page/block schemas, summary counts, and the _report.md quality report." model = "gpt-5.5" model_reasoning_effort = "high" web_search = "disabled" nickname_candidates = ["Metadata Lead", "Report Designer", "Provenance Guard"] developer_instructions = """ -You are responsible for metadata and reporting. +You are responsible for internal provenance and reporting. -Always read PLAN.md, PROGRESS.md, PRD.md, ARCHITECTURE.md, and docs/V1IMPLEMENTATIONPLAN.md before working. Read docs/WORKARCHIVE.md when prior completed sprint context, historical verification, runtime setup evidence, or sample conversion evidence is needed. When a metadata/reporting sprint contract exists, read the relevant contract under docs/Sprints/ as well. For Sprint 3 domain records, metadata, and warning model work, read docs/Sprints/SPRINT3CONTRACT.md. For Sprint 5 Markdown normalization work that changes warning codes, asset warnings, or table fallback warning semantics, read docs/Sprints/SPRINT5CONTRACT.md. For Sprint 6 quality checks, metadata summary extensions, and report rendering work, read docs/Sprints/SPRINT6CONTRACT.md before changing quality.py, report.py, metadata.py, or report tests. For Sprint 7 conversion orchestration work that writes metadata JSON, report Markdown, output paths, or asset provenance, read docs/Sprints/SPRINT7CONTRACT.md. For Sprint 9 fixture evaluation, metadata assertions, report quality gates, and release checklist work, read docs/Sprints/SPRINT9CONTRACT.md. 
For Sprint 10 chunk provenance and report context work, read docs/Sprints/SPRINT10CONTRACT.md. Maintain provenance for source PDF path, page index, bbox when available, block type, engine, confidence, warnings, asset paths, output locations, and chunk page ranges when chunking is active. +Always read PLAN.md, PROGRESS.md, PRD.md, ARCHITECTURE.md, and docs/V1IMPLEMENTATIONPLAN.md before working. Read docs/WORKARCHIVE.md when prior completed sprint context, historical verification, runtime setup evidence, or sample conversion evidence is needed. When a provenance/reporting sprint contract exists, read the relevant contract under docs/Sprints/ as well. For Sprint 3 domain records, metadata, and warning model work, read docs/Sprints/SPRINT3CONTRACT.md. For Sprint 5 Markdown normalization work that changes warning codes, asset warnings, or table fallback warning semantics, read docs/Sprints/SPRINT5CONTRACT.md. For Sprint 6 quality checks, metadata summary extensions, and report rendering work, read docs/Sprints/SPRINT6CONTRACT.md before changing quality.py, report.py, metadata.py, or report tests. For Sprint 7 conversion orchestration work that writes report Markdown, output paths, or asset provenance, read docs/Sprints/SPRINT7CONTRACT.md. For Sprint 9 fixture evaluation, report assertions, report quality gates, and release checklist work, read docs/Sprints/SPRINT9CONTRACT.md. For Sprint 10 chunk provenance and report context work, read docs/Sprints/SPRINT10CONTRACT.md. For Sprint 11 math repair provenance, warning summaries, or report consistency work, read docs/Sprints/SPRINT11CONTRACT.md. For Sprint 13 text fidelity diagnostics, pypdf comparison metrics, text warning codes, replacement candidate markers, and report sections, read docs/Sprints/SPRINT13CONTRACT.md. For Sprint 14 grouped metadata, page-conversion provenance, failed-page warnings, and report grouping behavior, read docs/Sprints/SPRINT14CONTRACT.md. 
For Sprint 15 GPU/profile provenance, read docs/Sprints/SPRINT15CONTRACT.md. For Sprint 16 simplified output layout, no public metadata JSON, shared images, and aggregate report behavior, read docs/Sprints/SPRINT16CONTRACT.md. Sprint 17 installer manifest and doctor report provenance work is abandoned. Read docs/Sprints/SPRINT17CONTRACT.md and docs/superpowers/plans/2026-05-12-offline-installer.md only for historical review unless the user explicitly reopens offline installer work. Maintain provenance for source PDF path, page index, bbox when available, block type, engine, confidence, warnings, asset paths, output locations, and chunk page ranges when chunking is active. -Every conversion design must include both machine-readable JSON metadata and a human-readable .report.md. Reports should be derived from metadata and local checks, not manually duplicated state. +Every new conversion design must include internal provenance and a human-readable _report.md. Do not require a public metadata JSON sidecar unless a future sprint explicitly restores one. Reports should be derived from internal provenance and local checks, not manually duplicated state. Do not implement converter code unless explicitly asked. When planning schemas, prefer simple versioned JSON objects and clear warning codes. """ diff --git a/.codex/agents/mineru-integration-agent.toml b/.codex/agents/mineru-integration-agent.toml index a815a30..a305c8b 100644 --- a/.codex/agents/mineru-integration-agent.toml +++ b/.codex/agents/mineru-integration-agent.toml @@ -8,7 +8,7 @@ nickname_candidates = ["MinerU Integrator", "Adapter Planner", "CLI Guard"] developer_instructions = """ You are responsible for the MinerU integration design. -Always read PLAN.md, PROGRESS.md, ARCHITECTURE.md, PRD.md, and docs/V1IMPLEMENTATIONPLAN.md before proposing integration work. 
Read docs/WORKARCHIVE.md when prior completed sprint context, historical verification, runtime setup evidence, or sample conversion evidence is needed. For Sprint 0 output layout or CLI verification, also read docs/Sprints/SPRINT0CONTRACT.md. For Sprint 4 mocked MinerU adapter contract work, read docs/Sprints/SPRINT4CONTRACT.md. For Sprint 7 conversion orchestration work that calls the adapter, handles raw output, or preserves no-fallback behavior, read docs/Sprints/SPRINT7CONTRACT.md. For Sprint 8 doctor work that checks MinerU availability, version, local execution, or setup documentation, read docs/Sprints/SPRINT8CONTRACT.md. For Sprint 9 optional local MinerU fixture evaluation, output evidence, and no-fallback release-gate checks, read docs/Sprints/SPRINT9CONTRACT.md. For Sprint 10 chunk PDF staging and pre-conversion orchestration, read docs/Sprints/SPRINT10CONTRACT.md. Treat MinerU 3.1.0 as the only engine and direct local CLI execution as the only v1 execution mode. +Always read PLAN.md, PROGRESS.md, ARCHITECTURE.md, PRD.md, and docs/V1IMPLEMENTATIONPLAN.md before proposing integration work. Read docs/WORKARCHIVE.md when prior completed sprint context, historical verification, runtime setup evidence, or sample conversion evidence is needed. For Sprint 0 output layout or CLI verification, also read docs/Sprints/SPRINT0CONTRACT.md. For Sprint 4 mocked MinerU adapter contract work, read docs/Sprints/SPRINT4CONTRACT.md. For Sprint 7 conversion orchestration work that calls the adapter, handles raw output, or preserves no-fallback behavior, read docs/Sprints/SPRINT7CONTRACT.md. For Sprint 8 doctor work that checks MinerU availability, version, local execution, or setup documentation, read docs/Sprints/SPRINT8CONTRACT.md. For Sprint 9 optional local MinerU fixture evaluation, output evidence, and no-fallback release-gate checks, read docs/Sprints/SPRINT9CONTRACT.md. 
For Sprint 10 chunk PDF staging and pre-conversion orchestration, read docs/Sprints/SPRINT10CONTRACT.md. For Sprint 14 single-page MinerU input orchestration and grouped output behavior, read docs/Sprints/SPRINT14CONTRACT.md. For Sprint 15 GPU/profile environment tuning, read docs/Sprints/SPRINT15CONTRACT.md. For Sprint 16 simplified output path interactions with raw MinerU output, read docs/Sprints/SPRINT16CONTRACT.md. Sprint 17 offline installer runtime packaging is abandoned. Read docs/Sprints/SPRINT17CONTRACT.md and docs/superpowers/plans/2026-05-12-offline-installer.md only for historical review unless the user explicitly reopens offline installer work. Treat MinerU 3.1.0 as the only engine and direct local CLI execution as the only v1 execution mode. MinerU 3.1.0 may start a temporary local mineru-api process internally when the mineru CLI runs without --api-url. This is allowed. Passing --api-url, using remote APIs, router mode, HTTP client backends, or remote OpenAI-compatible backends is prohibited. diff --git a/.codex/agents/obsidian-markdown-agent.toml b/.codex/agents/obsidian-markdown-agent.toml index 32edba5..424026a 100644 --- a/.codex/agents/obsidian-markdown-agent.toml +++ b/.codex/agents/obsidian-markdown-agent.toml @@ -8,7 +8,7 @@ nickname_candidates = ["Markdown Reviewer", "Math Normalizer", "Obsidian Lead"] developer_instructions = """ You are responsible for Obsidian-friendly Markdown output. -Always read PLAN.md and PROGRESS.md before working. Read docs/WORKARCHIVE.md when prior completed sprint context, historical verification, runtime setup evidence, or sample conversion evidence is needed. Read PRD.md, ARCHITECTURE.md, and docs/V1IMPLEMENTATIONPLAN.md when changing output behavior. When a Markdown/output sprint contract exists, read the relevant contract under docs/Sprints/ as well. 
For Sprint 5 Obsidian Markdown normalization and asset link work, read docs/Sprints/SPRINT5CONTRACT.md before changing markdown.py, quality.py asset-link helpers, or normalization tests. For Sprint 6 math renderability quality checks and render-warning policy, read docs/Sprints/SPRINT6CONTRACT.md before changing quality.py or report-facing math warning tests. For Sprint 7 conversion orchestration work that writes final Markdown, copies assets, or links assets from output Markdown, read docs/Sprints/SPRINT7CONTRACT.md. For Sprint 9 fixture evaluation of Obsidian Markdown, math delimiters, table fallback behavior, asset links, and renderability warnings, read docs/Sprints/SPRINT9CONTRACT.md. For Sprint 10 chunk output naming and no-merge behavior, read docs/Sprints/SPRINT10CONTRACT.md. Preserve the fixed delimiter policy: inline math uses $...$ and display math uses $$...$$. +Always read PLAN.md and PROGRESS.md before working. Read docs/WORKARCHIVE.md when prior completed sprint context, historical verification, runtime setup evidence, or sample conversion evidence is needed. Read PRD.md, ARCHITECTURE.md, and docs/V1IMPLEMENTATIONPLAN.md when changing output behavior. When a Markdown/output sprint contract exists, read the relevant contract under docs/Sprints/ as well. For Sprint 5 Obsidian Markdown normalization and asset link work, read docs/Sprints/SPRINT5CONTRACT.md before changing markdown.py, quality.py asset-link helpers, or normalization tests. For Sprint 6 math renderability quality checks and render-warning policy, read docs/Sprints/SPRINT6CONTRACT.md before changing quality.py or report-facing math warning tests. For Sprint 7 conversion orchestration work that writes final Markdown, copies assets, or links assets from output Markdown, read docs/Sprints/SPRINT7CONTRACT.md. For Sprint 9 fixture evaluation of Obsidian Markdown, math delimiters, table fallback behavior, asset links, and renderability warnings, read docs/Sprints/SPRINT9CONTRACT.md. 
For Sprint 10 chunk output naming and no-merge behavior, read docs/Sprints/SPRINT10CONTRACT.md. For Sprint 11 MathJax warning mitigation and repair provenance, read docs/Sprints/SPRINT11CONTRACT.md. For Sprint 14 grouped Markdown output assembly and grouped asset link behavior, read docs/Sprints/SPRINT14CONTRACT.md. For Sprint 16 simplified output layout, shared images, and numbered Markdown parts, read docs/Sprints/SPRINT16CONTRACT.md. Preserve the fixed delimiter policy: inline math uses $...$ and display math uses $$...$$. Focus on Markdown normalization, asset path stability, table fallback behavior, readable warnings, and renderability checks. Do not promise perfect LaTeX reconstruction; require metadata warnings for low-confidence or non-renderable math. diff --git a/.codex/agents/requirements-guard-agent.toml b/.codex/agents/requirements-guard-agent.toml index 44baedc..ac579dc 100644 --- a/.codex/agents/requirements-guard-agent.toml +++ b/.codex/agents/requirements-guard-agent.toml @@ -8,9 +8,9 @@ nickname_candidates = ["Requirements Guard", "Doc Auditor", "Consistency Lead"] developer_instructions = """ You are the requirements guard for this repository. -Always read PLAN.md and PROGRESS.md before working. Read docs/WORKARCHIVE.md when prior completed sprint context, historical verification, runtime setup evidence, or sample conversion evidence is needed. Then read only the project documents needed for the requested check, including docs/V1IMPLEMENTATIONPLAN.md and relevant contracts under docs/Sprints/ when implementation sequencing or sprint contracts are in scope. For Sprint 1 consistency checks, read docs/Sprints/SPRINT1CONTRACT.md. For Sprint 2 consistency checks, read docs/Sprints/SPRINT2CONTRACT.md. For Sprint 3 consistency checks, read docs/Sprints/SPRINT3CONTRACT.md. For Sprint 4 consistency checks, read docs/Sprints/SPRINT4CONTRACT.md. For Sprint 5 Markdown normalization and asset link consistency checks, read docs/Sprints/SPRINT5CONTRACT.md. 
For Sprint 6 quality, metadata summary, and report consistency checks, read docs/Sprints/SPRINT6CONTRACT.md. For Sprint 7 conversion orchestration, CLI, Python API, and output-writing consistency checks, read docs/Sprints/SPRINT7CONTRACT.md. For Sprint 8 doctor diagnostics, setup documentation, strict-local wording, and setup-helper consistency checks, read docs/Sprints/SPRINT8CONTRACT.md. For Sprint 9 local fixture evaluation, v1 release gate, optional-check gating, and no-sample-commit consistency checks, read docs/Sprints/SPRINT9CONTRACT.md. For Sprint 10 chunking, CLI/API chunk mode, and chunk provenance consistency checks, read docs/Sprints/SPRINT10CONTRACT.md. Prioritize contradictions, outdated decisions, missing acceptance criteria, and text that weakens local-only or MinerU-only constraints. +Always read PLAN.md and PROGRESS.md before working. Read docs/WORKARCHIVE.md when prior completed sprint context, historical verification, runtime setup evidence, or sample conversion evidence is needed. Then read only the project documents needed for the requested check, including docs/V1IMPLEMENTATIONPLAN.md and relevant contracts under docs/Sprints/ when implementation sequencing or sprint contracts are in scope. For Sprint 1 consistency checks, read docs/Sprints/SPRINT1CONTRACT.md. For Sprint 2 consistency checks, read docs/Sprints/SPRINT2CONTRACT.md. For Sprint 3 consistency checks, read docs/Sprints/SPRINT3CONTRACT.md. For Sprint 4 consistency checks, read docs/Sprints/SPRINT4CONTRACT.md. For Sprint 5 Markdown normalization and asset link consistency checks, read docs/Sprints/SPRINT5CONTRACT.md. For Sprint 6 quality, metadata summary, and report consistency checks, read docs/Sprints/SPRINT6CONTRACT.md. For Sprint 7 conversion orchestration, CLI, Python API, and output-writing consistency checks, read docs/Sprints/SPRINT7CONTRACT.md. 
For Sprint 8 doctor diagnostics, setup documentation, strict-local wording, and setup-helper consistency checks, read docs/Sprints/SPRINT8CONTRACT.md. For Sprint 9 local fixture evaluation, v1 release gate, optional-check gating, and no-sample-commit consistency checks, read docs/Sprints/SPRINT9CONTRACT.md. For Sprint 10 chunking, CLI/API chunk mode, and chunk provenance consistency checks, read docs/Sprints/SPRINT10CONTRACT.md. For Sprint 11 MathJax warning mitigation consistency checks, read docs/Sprints/SPRINT11CONTRACT.md. For Sprint 12 UI launcher consistency checks, read docs/UI_RESEARCH.md, docs/Sprints/SPRINT12CONTRACT.md, docs/superpowers/specs/2026-05-13-ui-folder-batch-conversion-design.md, and docs/superpowers/plans/2026-05-13-ui-folder-batch-conversion.md. For Sprint 13 text fidelity diagnostics consistency checks, read docs/Sprints/SPRINT13CONTRACT.md. For Sprint 14 single-page conversion with grouped outputs consistency checks, read docs/Sprints/SPRINT14CONTRACT.md. For Sprint 15 GPU auto/profile checks, read docs/Sprints/SPRINT15CONTRACT.md. For Sprint 16 simplified output layout consistency checks, read docs/Sprints/SPRINT16CONTRACT.md. For abandoned Sprint 17 offline installer historical consistency checks, read docs/Sprints/SPRINT17CONTRACT.md and docs/superpowers/plans/2026-05-12-offline-installer.md; do not treat it as active work. Prioritize contradictions, outdated decisions, missing acceptance criteria, and text that weakens local-only or MinerU-only constraints. -Fixed decisions: Python 3.12, uv, direct local MinerU 3.1.0 CLI execution, CLI-internal temporary local mineru-api allowed, no --api-url or remote API paths, no router mode, no HTTP client backend, no runtime engine selection, Obsidian Markdown output, inline math with $...$, display math with $$...$$, metadata JSON, and human-readable .report.md output. 
+Fixed decisions: Python 3.12, uv, direct local MinerU 3.1.0 CLI execution, CLI-internal temporary local mineru-api allowed, no --api-url or remote API paths, no router mode, no HTTP client backend, no runtime engine selection, Obsidian Markdown output, inline math with $...$, display math with $$...$$, no public metadata JSON for new conversions, one human-readable _report.md output per PDF, and any UI launcher must call the existing pdf2md CLI rather than MinerU directly. Do not implement converter code. When asked for a review, report findings first with file and line references. When asked to edit, keep wording changes surgical and update PLAN.md or PROGRESS.md if the coordination state changes. """ diff --git a/.codex/agents/research-agent.toml b/.codex/agents/research-agent.toml index 2b04940..6121cae 100644 --- a/.codex/agents/research-agent.toml +++ b/.codex/agents/research-agent.toml @@ -8,7 +8,7 @@ nickname_candidates = ["Research Lead", "Source Checker", "MinerU Scout"] developer_instructions = """ You are the project research agent for the local PDF-to-Markdown converter. -Always read PLAN.md and PROGRESS.md before working. Use PROGRESS.md as the factual current state. Read docs/WORKARCHIVE.md when prior completed sprint context, historical verification, runtime setup evidence, or sample conversion evidence is needed. For v1 implementation research, read docs/V1IMPLEMENTATIONPLAN.md; for Sprint 0 source verification, read docs/Sprints/SPRINT0CONTRACT.md. For Sprint 8 setup documentation or doctor facts that may have changed, read docs/Sprints/SPRINT8CONTRACT.md and verify volatile install/model/cache claims against official sources before docs are edited. For Sprint 10 pypdf or chunking facts that may have changed, read docs/Sprints/SPRINT10CONTRACT.md and verify volatile package facts against official sources before docs are edited. 
Prefer official MinerU documentation, MinerU GitHub, primary papers, and official Codex/OpenAI documentation when researching workflow structure. Cite URLs and access dates in any research notes. +Always read PLAN.md and PROGRESS.md before working. Use PROGRESS.md as the factual current state. Read docs/WORKARCHIVE.md when prior completed sprint context, historical verification, runtime setup evidence, or sample conversion evidence is needed. For v1 implementation research, read docs/V1IMPLEMENTATIONPLAN.md; for Sprint 0 source verification, read docs/Sprints/SPRINT0CONTRACT.md. For Sprint 8 setup documentation or doctor facts that may have changed, read docs/Sprints/SPRINT8CONTRACT.md and verify volatile install/model/cache claims against official sources before docs are edited. For Sprint 10 pypdf or chunking facts that may have changed, read docs/Sprints/SPRINT10CONTRACT.md and verify volatile package facts against official sources before docs are edited. For Sprint 12 UI packaging or launcher research, read docs/UI_RESEARCH.md and docs/Sprints/SPRINT12CONTRACT.md, then verify volatile packaging facts against official sources before editing docs. For Sprint 15 GPU/PyTorch facts, read docs/Sprints/SPRINT15CONTRACT.md and verify volatile CUDA/PyTorch claims against official sources. Sprint 17 offline installer research is abandoned. Read docs/Sprints/SPRINT17CONTRACT.md and docs/superpowers/plans/2026-05-12-offline-installer.md only for historical review unless the user explicitly reopens offline installer work. Prefer official MinerU documentation, MinerU GitHub, primary papers, and official Codex/OpenAI documentation when researching workflow structure. Cite URLs and access dates in any research notes. Keep MinerU 3.1.0 as the only conversion engine. Do not reintroduce candidate engine comparisons. Record uncertainty explicitly and ask the parent agent for a decision when official sources conflict. 
diff --git a/.codex/commands/review-project-docs.md b/.codex/commands/review-project-docs.md index 7f3b617..4742331 100644 --- a/.codex/commands/review-project-docs.md +++ b/.codex/commands/review-project-docs.md @@ -16,12 +16,12 @@ The user invoked this command with: $ARGUMENTS 1. Read `PLAN.md` and `PROGRESS.md`. 2. Read `docs/WORKARCHIVE.md` when reviewing completed-work history, prior verification, or sample conversion evidence. -3. Read the requested document scope, defaulting to `AGENTS.md`, `PRD.md`, `ARCHITECTURE.md`, and `docs/KNOWLEDGEBASE.md`. -4. Check for contradictions against fixed decisions: MinerU 3.1.0 only, local-only, direct CLI execution, CLI-internal temporary local `mineru-api` allowed, no `--api-url` or remote API path, Python 3.12, uv, Obsidian Markdown, metadata JSON, and `.report.md`. +3. Read the requested document scope, defaulting to `AGENTS.md`, `PRD.md`, `ARCHITECTURE.md`, `docs/V1IMPLEMENTATIONPLAN.md`, `docs/WORKARCHIVE.md`, and `docs/KNOWLEDGEBASE.md`. +4. Check for contradictions against fixed decisions: MinerU 3.1.0 only, local-only, direct CLI execution, CLI-internal temporary local `mineru-api` allowed, no `--api-url` or remote API path, Python 3.12, uv, Obsidian Markdown, no public metadata JSON for new conversions, one `_report.md`, and any UI launcher invoking the existing `pdf2md` CLI rather than MinerU directly. 5. Report findings first with file and line references. 6. If edits are requested, make only surgical documentation changes and update `PROGRESS.md`. ## Guardrails -- Do not add speculative features, alternate engines, web UI, cloud OCR, or manual review queues. +- Do not add speculative features, alternate engines, hosted web apps, cloud OCR, or manual review queues. A thin local UI launcher is allowed only when it follows `docs/UI_RESEARCH.md`, `docs/Sprints/SPRINT12CONTRACT.md`, and the relevant `docs/superpowers/` UI design or plan. - Do not rewrite unrelated prose while fixing one inconsistency. 
diff --git a/.codex/commands/start-agent-work.md b/.codex/commands/start-agent-work.md index c81f67a..f1a608f 100644 --- a/.codex/commands/start-agent-work.md +++ b/.codex/commands/start-agent-work.md @@ -23,7 +23,7 @@ The user invoked this command with: $ARGUMENTS 7. Do not implement converter code unless the user explicitly requests implementation. 8. After meaningful changes, update `PROGRESS.md`; update `PLAN.md` only when sequencing, decisions, ownership, or blockers change. 9. Archive completed work in `docs/WORKARCHIVE.md` when it no longer needs to stay in `PROGRESS.md`. -10. Run the smallest useful verification, check git status, and commit project changes while excluding `samples/`. +10. Run the smallest useful verification, check git status, and commit project changes while excluding `samples/`, `outputs/`, `build/`, `dist/`, generated installers, wheels, models, and other local payload artifacts. ## Guardrails @@ -31,4 +31,5 @@ The user invoked this command with: $ARGUMENTS - Allow MinerU 3.1.0's CLI-internal temporary local `mineru-api`, but prohibit `--api-url`, remote APIs, router mode, HTTP client backends, and remote OpenAI-compatible backends. - Keep runtime processing local-only. - Keep `samples/` out of commits unless the user explicitly requests otherwise. +- Keep generated packaging, UI build, conversion output, wheelhouse, and model artifacts out of commits. - Prefer official sources for changing facts about Codex, MinerU, Python, uv, CUDA, or licenses. 
diff --git a/.codex/config.toml b/.codex/config.toml index b035b5b..5780519 100644 --- a/.codex/config.toml +++ b/.codex/config.toml @@ -1,6 +1,6 @@ [features] multi_agent = true -codex_hooks = true +hooks = true [agents] max_threads = 8 diff --git a/.codex/skills/fixture-evaluation/SKILL.md b/.codex/skills/fixture-evaluation/SKILL.md index 38c8ca7..f897219 100644 --- a/.codex/skills/fixture-evaluation/SKILL.md +++ b/.codex/skills/fixture-evaluation/SKILL.md @@ -1,6 +1,6 @@ --- name: fixture-evaluation -description: Plan local fixture-based quality checks for this MinerU PDF-to-Markdown converter using samples/ without committing sample PDFs. Use when Codex needs to define sample coverage, quality metrics, regression checks, JSON metadata assertions, or human-readable .report.md expectations. +description: Plan local fixture-based quality checks for this MinerU PDF-to-Markdown converter using samples/ without committing sample PDFs. Use when Codex needs to define sample coverage, quality metrics, regression checks, internal provenance assertions, or human-readable _report.md expectations. --- # Fixture Evaluation @@ -14,9 +14,9 @@ Use this skill to turn local sample PDFs into a small, repeatable quality plan. 1. Read `PLAN.md` and `PROGRESS.md` first. 2. Read `docs/WORKARCHIVE.md` when prior fixture coverage, verification, or sample conversion evidence is needed. 3. Inspect `samples/` only enough to understand fixture categories and filenames. -4. Map each fixture to risks: math, tables, multi-column reading order, figures/assets, Korean filenames, and metadata coverage. +4. Map each fixture to risks: math, tables, multi-column reading order, figures/assets, Korean filenames, and report/provenance coverage. 5. Separate fast checks using mocked MinerU outputs from optional checks that require MinerU models, GPU, or long execution. -6. Define metrics for both JSON metadata and `.report.md`. +6. Define metrics for internal provenance and `_report.md`. 7. 
Update `PROGRESS.md` with fixture coverage and gaps. ## Guardrails @@ -24,7 +24,7 @@ Use this skill to turn local sample PDFs into a small, repeatable quality plan. - Do not commit sample PDFs. - Do not copy samples into tracked fixtures without explicit user permission. - Do not make GPU/model-dependent checks mandatory for the default fast loop. -- Do not grade only plain-text edit distance; include math, tables, reading order, assets, metadata, and renderability. +- Do not grade only plain-text edit distance; include math, tables, reading order, assets, report provenance, and renderability. ## Reference diff --git a/.codex/skills/fixture-evaluation/references/evaluation-metrics.md b/.codex/skills/fixture-evaluation/references/evaluation-metrics.md index d8e3e72..caea467 100644 --- a/.codex/skills/fixture-evaluation/references/evaluation-metrics.md +++ b/.codex/skills/fixture-evaluation/references/evaluation-metrics.md @@ -14,8 +14,8 @@ Use these metrics for local fixture plans and future tests. ## Fast Checks - Output files are planned at deterministic paths. -- Metadata JSON includes source PDF, page count, engine, warnings, and output paths. -- `.report.md` can be generated from metadata without re-running MinerU. +- Internal provenance includes source PDF, page count, engine, warnings, and output paths. +- `_report.md` can be generated from internal provenance without re-running MinerU. - Markdown math delimiter normalization is deterministic. - Asset links resolve relative to the Markdown file. diff --git a/.codex/skills/math-markdown-review/SKILL.md b/.codex/skills/math-markdown-review/SKILL.md index f484a97..756b5cc 100644 --- a/.codex/skills/math-markdown-review/SKILL.md +++ b/.codex/skills/math-markdown-review/SKILL.md @@ -13,11 +13,11 @@ Use this skill when Markdown output quality matters more than raw text extractio 1. Read `PLAN.md` and `PROGRESS.md` first. 2. 
Read `docs/WORKARCHIVE.md` when prior Markdown output, MathJax, or sample conversion evidence is needed. -3. Read `PRD.md` and `ARCHITECTURE.md` when output behavior, metadata, or reporting is affected. +3. Read `PRD.md` and `ARCHITECTURE.md` when output behavior, internal provenance, or reporting is affected. 4. Preserve project delimiter policy: inline math uses `$...$`; display math uses `$$...$$`. 5. Check asset links, table fallback behavior, heading/list interactions, and page boundary markers against Obsidian rendering assumptions. 6. Define warnings for low-confidence math, non-renderable LaTeX, broken asset links, table degradation, and reading-order uncertainty. -7. Ensure `.report.md` content is derived from metadata, not separate manual state. +7. Ensure `_report.md` content is derived from internal provenance, not separate manual state. ## Checks @@ -25,7 +25,7 @@ Use this skill when Markdown output quality matters more than raw text extractio - Display math should be separated from surrounding paragraphs by blank lines. - Asset paths should be stable, relative to the Markdown file, and safe for Obsidian vaults. - Tables with formulas should prefer readable Markdown when reliable and warn when downgraded. -- Every renderability failure should be countable in metadata and visible in `.report.md`. +- Every renderability failure should be countable in internal provenance and visible in `_report.md`. ## Reference diff --git a/.codex/skills/math-markdown-review/references/obsidian-output-checks.md b/.codex/skills/math-markdown-review/references/obsidian-output-checks.md index a6746e7..46c03d9 100644 --- a/.codex/skills/math-markdown-review/references/obsidian-output-checks.md +++ b/.codex/skills/math-markdown-review/references/obsidian-output-checks.md @@ -12,7 +12,7 @@ Use these checks when designing or reviewing Markdown output. ## Assets -- Store images under a deterministic asset directory next to the Markdown output. 
+- Store images under the deterministic shared `images/` directory next to the Markdown output parts. - Use relative Markdown links that remain valid when the output directory is moved as a unit. - Record asset source page, bbox if available, generated file path, and missing-link warnings. @@ -20,7 +20,7 @@ Use these checks when designing or reviewing Markdown output. - Prefer Markdown tables only when cell boundaries and reading order are reliable. - If formulas or merged cells make Markdown tables misleading, use a readable fallback and emit a table warning. -- Keep table warnings visible in both JSON metadata and `.report.md`. +- Keep table warnings visible in internal provenance and `_report.md`. ## Report Signals diff --git a/.gitignore b/.gitignore index 8323e54..23d4ae9 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ __pycache__/ *.py[cod] outputs/ node_modules/ +samples/ \ No newline at end of file diff --git a/AGENTS.md b/AGENTS.md index ce8ae57..ed23af4 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -72,6 +72,16 @@ Strong success criteria let you loop independently. Weak criteria ("make it work **These guidelines are working if:** fewer unnecessary changes in diffs, fewer rewrites due to overcomplication, and clarifying questions come before implementation rather than after mistakes. +## Commands + +| Command | Description | +| --- | --- | +| `uv run pytest` | Run the default fast test suite. | +| `uv run pdf2md doctor` | Check local Python, uv, MinerU, GPU/PyTorch, model/cache, MathJax, and strict-local setup. | +| `uv run pytest tests/test_ui_runner.py` | Run focused UI command-resolution and subprocess tests. | +| `uv run --group ui-build pyinstaller --clean --onefile --windowed --name pdf2md-ui src\pdf2md_ui\app.py` | Rebuild the thin Windows UI executable. | +| `uv run pdf2md convert paper.pdf --out outputs --chunk-pages --gpu auto --mineru-profile auto --strict-local` | Optional local conversion smoke; keep generated output ignored. 
| + ## Source Documents - `PLAN.md`: shared plan, planned work, open questions, and ownership for agents. @@ -80,8 +90,11 @@ Strong success criteria let you loop independently. Weak criteria ("make it work - `ARCHITECTURE.md`: system layers, MinerU adapter contract, intermediate representation, metadata schema, and local-only enforcement. - `docs/KNOWLEDGEBASE.md`: research basis and implementation background. - `docs/V1IMPLEMENTATIONPLAN.md`: v1 implementation sequence, sprint contracts, verification gates, and agent ownership. +- `docs/UI_RESEARCH.md`: research basis for the implemented minimal Windows UI launcher. - `docs/WORKARCHIVE.md`: archived completed work, historical sprint outcomes, setup results, verification history, and sample conversion evidence. - `docs/Sprints/*.md`: active and historical sprint contracts. +- `docs/superpowers/specs/*.md`: design specs created for focused project workflows. +- `docs/superpowers/plans/*.md`: executable task plans created from specs, including completed UI folder batch work and abandoned historical plans. - `.codex/agents/*.toml`: project-scoped custom subagent roles. - `.codex/commands/*.md`: reusable project prompt commands. - `.codex/skills/*/SKILL.md`: project-specific Codex skills. @@ -155,7 +168,8 @@ Periodically re-evaluate the harness itself. Remove roles, contracts, or checks - Input priority: digital PDFs with text layers. - Quality workflow: fully automatic. Log warnings and continue when possible. - MinerU execution: direct local `mineru` CLI only. MinerU 3.1.0 may launch a temporary local `mineru-api` internally when CLI runs without `--api-url`. -- Quality report: write both metadata JSON and `.report.md`. +- Output layout: write `<out>/<pdf_stem>/<pdf_stem>_001.md`, shared `<out>/<pdf_stem>/images/`, and `<out>/<pdf_stem>/<pdf_stem>_report.md`; new conversions do not persist public metadata JSON after Sprint 16. +- UI folder batch conversion: the UI may convert direct-child PDFs in a selected folder by sequentially invoking existing `pdf2md convert` commands. 
- v1 use case: personal/research. MinerU and transitive model/package licenses must be documented before redistribution. ## Architecture Guidance @@ -217,6 +231,8 @@ After changing files: - Check `git status --short`. - Commit the completed change unless the user explicitly asks not to. - Do not include unrelated user edits in the commit. +- Commit rollback requests - Verify the target commit and current status first, then use a direct non-interactive reset; leave untracked generated/local artifacts such as `build/`, `dist/`, `samples/`, and `*.spec` files untouched unless deletion is explicitly requested. +- Installed-runtime doctor debugging - Test both `uv run pdf2md doctor` and direct venv execution such as `.venv\Scripts\pdf2md.exe doctor`; direct execution may not inherit the same PATH behavior as `uv run`. ## Documentation Guidance diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index b05ba31..fab598f 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -1,12 +1,12 @@ # Architecture: Local PDF-to-Markdown Converter -Last updated: 2026-05-07 +Last updated: 2026-05-13 ## 1. Overview The system converts math-heavy digital PDFs into Obsidian-friendly Markdown using MinerU 3.1.0 as the fixed local conversion engine. Product requirements live in `PRD.md`; agent workflow rules live in `AGENTS.md`; research notes live in `docs/KNOWLEDGEBASE.md`. -The architecture separates MinerU execution from project-owned normalization and metadata. This boundary exists only to isolate MinerU I/O; it is not a pluggable engine system. +The architecture separates MinerU execution from project-owned normalization and internal provenance/reporting. This boundary exists only to isolate MinerU I/O; it is not a pluggable engine system. ## 2. System Layers @@ -17,6 +17,8 @@ The architecture separates MinerU execution from project-owned normalization and - Enforce overwrite behavior. - Print conversion summaries. 
+Optional local UI launcher sits above this layer and invokes the project-owned `pdf2md` CLI. It can run a selected folder by discovering direct-child PDFs and sequentially invoking existing `pdf2md convert` commands. It must not call MinerU directly, add a second conversion engine, run parallel GPU conversions by default, or expose remote/API runtime paths. + 2. MinerU adapter layer - Validate MinerU 3.1.0 installation and version. - Run MinerU through direct local CLI execution. @@ -32,10 +34,11 @@ The architecture separates MinerU execution from project-owned normalization and - Convert project-owned objects and MinerU Markdown into Obsidian-friendly Markdown. - Normalize math delimiters, display math spacing, headings, tables, and asset links. -5. Quality and metadata layer +5. Quality and reporting layer - Run link checks and math renderability checks with local tooling. - Aggregate structured warnings. - - Write metadata JSON, quality report Markdown, and optional raw MinerU diagnostics. + - Build internal metadata-like records for reports and result summaries. + - Write quality report Markdown and optional raw MinerU diagnostics. ## 3. Conversion Pipeline @@ -49,6 +52,7 @@ The architecture separates MinerU execution from project-owned normalization and - Create an isolated work directory per input PDF. - Run the MinerU 3.1.0 adapter through the direct `mineru` CLI. - Capture raw Markdown, raw JSON/structured output when available, extracted assets, warnings, and logs. + - When `--chunk-pages` is active, write one-page temporary PDFs, run MinerU once per source page, and group successful page Markdown into final outputs of the configured page count. 3. Intermediate representation - Build document/page/block records from MinerU output. @@ -63,14 +67,15 @@ The architecture separates MinerU execution from project-owned normalization and 5. Quality checks - Verify generated asset links. - Check math renderability when local tooling is available. 
+ - Compare local pypdf text-layer extraction with Markdown text where page mapping is credible. - Emit warnings without stopping conversion unless no usable output can be produced. 6. Output writing - - Write final Markdown. - - Write extracted assets. - - Write metadata JSON. - - Write `.report.md`. + - Write final Markdown parts under `<out>/<pdf_stem>/`. + - Write extracted assets under `<out>/<pdf_stem>/images/`. + - Write one report at `<out>/<pdf_stem>/<pdf_stem>_report.md`. - Keep raw MinerU output when requested. + - In grouped page conversion mode, write one public Markdown part per grouped page range and delete temporary one-page PDFs plus intermediate per-page outputs. ## 4. MinerU Adapter Contract @@ -99,6 +104,17 @@ The adapter must fail fast if it cannot run in strict-local mode. Runtime engine The default conversion device is `cuda:0`. Because MinerU 3.1.0 selects its local device through environment/config rather than a dedicated CLI GPU flag, the adapter must set the MinerU subprocess environment to request CUDA by default while keeping the command shape direct and local. +Runtime tuning is project-owned and strict-local: + +- `--gpu auto` selects the visible NVIDIA GPU with the largest VRAM from local `nvidia-smi` inventory. +- `--mineru-profile auto` is the default. +- Safe profile settings are used for GTX 1070 Ti 8GB, pre-Turing, low-VRAM GPUs, or unavailable inventory. +- Stronger settings are used only for 16GB+ Turing-or-newer GPUs. +- Tuning is applied only through allowlisted MinerU subprocess environment variables: `MINERU_PROCESSING_WINDOW_SIZE`, `MINERU_API_MAX_CONCURRENT_REQUESTS`, and `MINERU_PDF_RENDER_THREADS`. +- The adapter must not add MinerU backend flags, API URLs, router mode, HTTP client backend use, remote OpenAI-compatible endpoints, or `MINERU_HYBRID_BATCH_RATIO`. + +Resolved profile details must be recorded in `engine_options["mineru_profile"]`, including requested profile, applied profile, environment values, and selected GPU details when known. 
+ Allowed MinerU execution in v1: - Direct local `mineru` CLI execution. @@ -156,13 +172,13 @@ Final Markdown must prioritize Obsidian. - Do not escape underscores or carets inside math unnecessarily. - Prefer Markdown tables for simple tables. - Use HTML tables for complex tables when Markdown would lose structure. -- Store figures/images in a stable relative assets directory. -- Do not add visible page separators in v1. +- Store figures/images in the stable `images/` directory under the PDF output folder. +- Do not add visible page separators in v1; grouped page conversion may add invisible HTML comments such as `` for provenance. - Preserve captions and references when MinerU provides them. -## 7. Metadata Schema +## 7. Internal Provenance Schema -When metadata is enabled, write `.metadata.json`. +New conversions do not write a public metadata JSON sidecar. The same schema shape remains useful internally for report generation, warning aggregation, and tests. Required top-level fields: @@ -177,6 +193,10 @@ Required top-level fields: - `warnings` - `summary` +Optional top-level fields: + +- `text_fidelity`: page-level local pypdf-vs-Markdown text diagnostics when source text can be extracted or page mapping uncertainty needs to be recorded. + Required summary fields: - `pages_processed` @@ -186,6 +206,21 @@ Required summary fields: - `inline_formula_count` - `math_render_error_count` +Optional text fidelity summary fields: + +- `text_fidelity_checked_page_count` +- `text_fidelity_low_page_count` +- `text_fidelity_unexpected_cjk_count` +- `text_fidelity_replacement_candidate_page_count` +- `text_fidelity_page_mapping_uncertain_count` + +Grouped page conversion records these `engine_options` entries: + +- `chunk`: original source PDF path, grouped output index, total grouped outputs, and original source page range. +- `page_conversion`: `single_page` mode, MinerU input page count of 1, grouped output page count, and failed source page numbers. 
+- `parts`: aggregate report records for output Markdown part paths, source page ranges, status, warning counts, and failed source pages. +- `output_folder`: the PDF-stem output folder. + Warning records include: - `code` @@ -205,20 +240,28 @@ Stable warning code examples: - `READING_ORDER_UNCERTAIN` - `STRICT_LOCAL_VIOLATION` - `MINERU_CLI_FAILED` +- `MINERU_PROFILE_ADJUSTED` +- `TEXT_LAYER_AVAILABLE` +- `TEXT_FIDELITY_LOW` +- `UNEXPECTED_CJK_IN_KOREAN_TEXT` +- `HANGUL_SPACING_SUSPECT` +- `TEXT_PAGE_MAPPING_UNCERTAIN` ## 8. Quality Report -Every conversion writes `.report.md`. +Every conversion writes `/_report.md`. -The report is derived from metadata and local quality checks. It contains: +The report is derived from internal provenance and local quality checks. It contains: - Source and output paths. +- Markdown part paths and source page ranges. - MinerU version and execution mode. - Pages processed. - Warning count. - Asset count and missing asset link count. - Inline and display formula counts. - Math render error count. +- Text fidelity summary when pypdf diagnostics are available. - Pages with warnings. - Final status: `success`, `partial`, or `failed`. diff --git a/PLAN.md b/PLAN.md index db0868b..25d2066 100644 --- a/PLAN.md +++ b/PLAN.md @@ -4,7 +4,7 @@ This file is the shared work plan for agents. Read it before starting work, then ## Current Goal -Completed work history is archived in `docs/WORKARCHIVE.md`. Sprint 11 MathJax warning mitigation is implemented. On this PC, full local runtime setup is complete in `.venv`; Markdown quality recheck for existing outputs is implemented and now shares the same conservative MathJax repair path as fresh conversion. Next work is optional manual Obsidian quality review, additional sample validation, or broader repair rules if future samples expose new deterministic MathJax failure patterns. 
+Completed work through Sprint 16, the Sprint 16 SolidElement validation, and the UI direct-folder batch conversion is archived in `docs/WORKARCHIVE.md`. Sprint 17 offline installer planning has been abandoned and is retained only as historical context. ## Active Constraints @@ -14,142 +14,64 @@ Completed work history is archived in `docs/WORKARCHIVE.md`. Sprint 11 MathJax w - Target Python 3.12. - Target GPU: GTX 1070 Ti 8GB. - Default conversion device: `cuda:0`. +- Default MinerU profile: `auto`. - Run MinerU through direct local CLI execution only. +- UI code must invoke the existing project-owned `pdf2md` CLI; it must not call MinerU directly. +- The current UI executable is a thin launcher for the installed local runtime, not a self-contained bundle of MinerU, PyTorch, CUDA, local models, Node.js, or MathJax. +- UI subprocess calls must use fixed argument lists with `shell=False` and must not expose arbitrary command execution. +- UI folder batch conversion must run direct-child PDFs sequentially through existing `pdf2md convert` commands. - On MinerU failure, report a clear error/warning and do not silently fallback. -- Write both metadata JSON and a human-readable `.report.md` quality report for conversions. +- Current conversions write simplified Markdown/report outputs with no persisted metadata JSON; internal provenance still feeds warnings and reports. +- `pdf2md recheck` remains legacy-only for outputs that still have adjacent metadata JSON. +- Do not commit generated installer payloads, wheelhouses, Python installers, model files, Node binaries, generated installer executables, `build/`, `dist/`, `outputs/`, or `samples/`. - Use `samples/` only as local fixture context; do not commit sample files unless explicitly requested. +## Active References + +- Product requirements: `PRD.md`. +- System design: `ARCHITECTURE.md`. +- Agent workflow: `AGENTS.md`. +- Current implementation sequence: `docs/V1IMPLEMENTATIONPLAN.md`. 
+- Completed work archive: `docs/WORKARCHIVE.md`. +- Release gates: `docs/V1RELEASECHECKLIST.md`. +- Completed UI folder batch design and plan: `docs/superpowers/specs/2026-05-13-ui-folder-batch-conversion-design.md` and `docs/superpowers/plans/2026-05-13-ui-folder-batch-conversion.md`. +- Abandoned Sprint 17 historical plan: `docs/Sprints/SPRINT17CONTRACT.md` and `docs/superpowers/plans/2026-05-12-offline-installer.md`. + ## Planned Work -1. Use `research-agent` for MinerU 3.1.0 source tracking and official-doc verification. -2. Use `requirements-guard-agent` for cross-document consistency reviews. -3. Use `mineru-integration-agent` for direct local MinerU CLI adapter planning. -4. Use `obsidian-markdown-agent` for math-heavy Obsidian Markdown output planning. -5. Use `metadata-agent` for provenance, warning, JSON metadata, and `.report.md` planning. -6. Use `evaluation-agent` for local fixture coverage and regression criteria. -7. Use `local-setup-agent` for Python 3.12, uv, CUDA, GTX 1070 Ti 8GB, and doctor-check planning. -8. Use `license-privacy-agent` for license and strict-local privacy review. -9. Use `harness-planner-agent` to turn substantial implementation requests into scoped contracts before code work starts. -10. Use `feature-generator-agent` to implement one approved contract at a time after the user explicitly requests implementation. -11. Use `evaluation-agent` as the independent contract reviewer and QA evaluator before and after each implementation chunk. -12. Follow `docs/V1IMPLEMENTATIONPLAN.md` for the v1 implementation sprint sequence. -13. Use `docs/Sprints/SPRINT10CONTRACT.md` for the implemented long-PDF pre-conversion chunking sprint. -14. Use `docs/WORKARCHIVE.md` for completed sprint history, prior verification, runtime setup evidence, and sample conversion evidence. -15. Use `docs/Sprints/SPRINT11CONTRACT.md` for the implemented MathJax warning mitigation sprint. -16. 
Keep the mitigation path shared by `pdf2md convert` and `pdf2md recheck` so existing Markdown outputs can be cleaned without rerunning MinerU. +1. Keep completed sprint details out of `PROGRESS.md`; use `docs/WORKARCHIVE.md` and `docs/Sprints/*.md` for history. +2. Preserve strict-local runtime behavior: use local model paths, direct CLI execution, and no user-specified API or remote backend. +3. When practical, run hands-on UI smoke from `dist\pdf2md-ui.exe`: Doctor, then one small local conversion to ignored `outputs/`. +4. On a stronger NVIDIA GPU PC, run `uv run pdf2md doctor` and one optional local conversion with `--gpu auto --mineru-profile auto` to validate the auto profile. +5. Decide in a future sprint whether simplified outputs need metadata-free `pdf2md recheck`; current behavior intentionally remains legacy-only. -## Sprint 11: MathJax Warning Mitigation +## Completed Work References -Objective: - -- Implemented a conservative local post-validation cleanup pass that attempts to remove only the specific math-span artifacts responsible for MathJax warnings, then reruns MathJax validation before writing final Markdown, metadata JSON, and report Markdown. - -Assumptions: - -- MathJax warning mitigation is best-effort and nonfatal. -- The cleanup pass must stay deterministic and local-only. -- Warning reduction must not silently erase meaningful formula content. -- The same behavior should apply to fresh conversions and `pdf2md recheck`. - -Planned workflow: - -1. Run the existing MathJax renderability check against normalized Markdown and keep failed `MathExpression` records, including index, display mode, Markdown span, and MathJax message. -2. Generate cleanup candidates only for failed spans. Candidate rules should start with narrow, non-semantic fixes such as trimming invisible/control artifacts, removing obvious OCR/extractor debris, normalizing accidental delimiter leftovers, and fixing whitespace/newline forms known to break MathJax. -3. 
Validate each candidate with the same local MathJax checker. Replace a math span only when the candidate passes and preserves the original inline/display delimiter shape. -4. Rebuild Markdown from approved span replacements and rerun the full quality check on the repaired Markdown. -5. Write metadata/report data from the final Markdown and final quality result. Record unresolved failures as `MATH_RENDER_FAILED`; record applied mitigations in a traceable form so warning counts are not reduced by hiding changes. - -Touched surfaces to plan in the sprint contract: - -- `src/pdf2md/quality.py`: expose failed math expression details without losing the existing warning behavior. -- `src/pdf2md/math_render.py`: keep MathJax checking local and batch-oriented; do not expose raw MathJax objects as public API. -- New focused module, likely `src/pdf2md/math_repair.py`: own candidate generation, span replacement, and repair result records. -- `src/pdf2md/conversion.py`: run mitigation between normalization and final metadata/report construction for `convert` and `recheck`. -- `src/pdf2md/ir.py`, `src/pdf2md/metadata.py`, and `src/pdf2md/report.py`: update only if the contract decides a new repair warning/info code or summary field is needed. -- Tests in `tests/test_quality.py`, a new `tests/test_math_repair.py`, and targeted conversion/recheck CLI tests. - -Non-goals: - -- Do not add cloud OCR, remote LLMs, remote render APIs, or external document upload paths. -- Do not add a second conversion engine or runtime engine selection. -- Do not implement a full LaTeX parser, symbolic math simplifier, or Obsidian automation. -- Do not remove whole formulas or meaningful LaTeX tokens solely to silence warnings. -- Do not add new CLI flags unless a later contract explicitly justifies them. - -Verification: - -- Unit tests for failed-expression capture, candidate generation, safe span replacement, and no-op behavior when no candidate passes. 
-- Conversion tests proving repaired Markdown is written only after candidate revalidation. -- Recheck tests proving existing output Markdown can be repaired and metadata/report regenerated without rerunning MinerU. -- Report/metadata tests proving remaining warnings and applied mitigations are visible and derived from final state. -- Run `uv run pytest tests/test_quality.py tests/test_math_repair.py tests/test_conversion.py tests/test_cli.py tests/test_report.py`. -- Run `uv run pytest` before marking the sprint complete. -- Optionally run `uv run pdf2md recheck outputs\MITC공부\MITC공부.md` against ignored local sample output when the user requests real-output validation. - -Hard failure criteria: - -- The cleanup changes math spans that did not fail MathJax validation. -- The cleanup removes an entire formula or a semantically meaningful token without an explicit trace. -- The cleanup reduces warning counts by dropping warnings instead of producing MathJax-valid Markdown. -- The cleanup makes `pdf2md convert` or `pdf2md recheck` require Node.js/MathJax when they were previously optional. -- Default tests require real MinerU, GPU, Node.js, MathJax, network, Obsidian, or `samples/`. +- Completed sprint outcomes through Sprint 16 are summarized in `docs/WORKARCHIVE.md`. +- Detailed historical contracts remain under `docs/Sprints/SPRINT0CONTRACT.md` through `docs/Sprints/SPRINT16CONTRACT.md`. +- UI direct-folder batch conversion is archived in `docs/WORKARCHIVE.md`; its design and execution plan live under `docs/superpowers/`. +- Abandoned Sprint 17 offline installer planning is archived in `docs/WORKARCHIVE.md` and must not be treated as active planned work. +- Historical verification results and sample conversion evidence live in `docs/WORKARCHIVE.md`. ## Open Questions -- None. +- Whether metadata-free `pdf2md recheck` should be designed for simplified outputs. 
+- Whether a stronger NVIDIA GPU PC changes the default practical MinerU profile recommendation after real conversion validation. ## Decisions - Use `PLAN.md` for intended work and ownership. -- Use `PROGRESS.md` for completed work, current status, blockers, and next actions. -- Use `docs/WORKARCHIVE.md` for archived completed work and historical handoff details. +- Use `PROGRESS.md` for current status, blockers, and next actions. +- Use `docs/WORKARCHIVE.md` for archived completed work, historical verification, runtime setup evidence, and sample conversion evidence. - MinerU default local CLI execution is the only v1 execution mode. - MinerU 3.1.0 may launch a temporary local `mineru-api` internally when `mineru` CLI runs without `--api-url`. - Strict-local mode forbids `--api-url`, remote APIs, router mode, HTTP client backends, and remote OpenAI-compatible backends. - No silent fallback after MinerU failure. -- Conversion output includes both metadata JSON and `.report.md`. +- Current conversion output uses `<pdf_stem>/<pdf_stem>_001.md`, shared `<pdf_stem>/images/`, and one `<pdf_stem>/<pdf_stem>_report.md`; new conversions do not persist metadata JSON. - Local MathJax render checking is optional and nonfatal; missing Node.js or MathJax must produce a clear warning instead of blocking conversion. -- MathJax warning mitigation must run only after initial local MathJax validation identifies failed math spans. -- MathJax warning mitigation must be deterministic, local-only, and limited to failed math spans. -- Candidate math cleanup must be revalidated with the local MathJax checker before replacing Markdown. -- If no candidate passes validation, keep the original formula and retain the `MATH_RENDER_FAILED` warning. -- Successfully mitigated formulas must remain traceable in metadata/report output; warning reduction must not hide that a formula was changed. -- Sprint 11 uses `MATH_RENDER_REPAIRED` info warnings for applied repair provenance.
-- Sprint 11 initial repair rules cover repeated same-direction scripts and truncated array `\end{a}` endings only. -- Project-scoped custom agents live in `.codex/agents/*.toml`. -- Project prompt commands live in `.codex/commands/*.md`. -- Project-specific skills live in `.codex/skills/*/SKILL.md`. -- Project hooks live in `.codex/hooks.json` and `.codex/hooks/*.py`. -- Agent, command, skill, and hook assets are written in English for Codex compatibility. -- Long-running implementation should use a planner/generator/evaluator harness only when the task complexity justifies the overhead. -- Each substantial implementation chunk should have a sprint contract with objective, scope, verification, failure thresholds, and handoff fields. -- Generator agents may self-check, but independent evaluation is required before marking a chunk complete. -- V1 implementation sequencing and sprint contracts live in `docs/V1IMPLEMENTATIONPLAN.md`. -- Concrete sprint contract documents live under `docs/Sprints/`. -- Sprint 2 path planning contract lives at `docs/Sprints/SPRINT2CONTRACT.md`. -- Sprint 3 domain records and metadata contract lives at `docs/Sprints/SPRINT3CONTRACT.md`. -- Sprint 4 MinerU adapter contract lives at `docs/Sprints/SPRINT4CONTRACT.md`. -- Sprint 4 fixes the v1 adapter executable to the direct `mineru` CLI; user-specified alternate executables, including `mineru-api`, are prohibited. -- Sprint 5 Obsidian Markdown normalization and asset link contract lives at `docs/Sprints/SPRINT5CONTRACT.md`. -- Sprint 5 owns Markdown normalization only; it does not write final Markdown files, copy assets, run MinerU, or connect to conversion orchestration. -- Sprint 6 quality checks and report generation contract lives at `docs/Sprints/SPRINT6CONTRACT.md`. -- Sprint 6 owns quality/report boundaries only; it does not write final files, run MinerU, or connect to conversion orchestration. 
-- Sprint 7 conversion orchestration, CLI, and Python API contract lives at `docs/Sprints/SPRINT7CONTRACT.md`. -- Sprint 7 will be the first implementation sprint allowed to write final Markdown, metadata JSON, report Markdown, and local copied assets as product behavior. -- Sprint 7 implemented conversion orchestration, `convert_pdf`, batch conversion, `pdf2md convert`, output writing, metadata/report writing, and fake-adapter CLI/API tests. -- Sprint 8 should cover `pdf2md doctor` and setup documentation; Sprint 7 intentionally did not add doctor behavior. -- Sprint 8 doctor and setup documentation contract lives at `docs/Sprints/SPRINT8CONTRACT.md`. -- Sprint 8 owns doctor diagnostics and setup docs only; it must not run real MinerU, download models, run sample PDFs, or add runtime remote/API paths in default tests. -- Sprint 8 implements `pdf2md doctor`, local setup diagnostics, and setup documentation without running real MinerU, downloading models, or touching `samples/` in default tests. -- Sprint 9 local fixture evaluation and v1 release gate contract lives at `docs/Sprints/SPRINT9CONTRACT.md`. -- Sprint 9 must keep default tests independent of real MinerU, GPU, models, network, Obsidian, LaTeX tooling, and `samples/`; real MinerU fixture checks must be explicit opt-in only. -- Sprint 9 implements fast mocked integration tests, explicit opt-in local MinerU fixture evaluation, and `docs/V1RELEASECHECKLIST.md`. -- `pdf2md convert` defaults to `--gpu cuda:0`. -- The MinerU adapter maps CUDA device requests to local subprocess environment variables instead of adding speculative MinerU CLI flags. -- GTX 1070 Ti local runtime uses PyTorch `2.6.0+cu126` and `torchvision 0.21.0+cu126` installed after `uv sync`, followed by `mineru[core]==3.1.0`. -- MinerU models are downloaded with `mineru-models-download -s huggingface -m all`, and runtime model loading uses `MINERU_MODEL_SOURCE=local`. 
-- Sprint 10 uses `pypdf` for local PDF page chunk planning and temporary chunk PDF writing. -- Sprint 10 converts chunk PDFs independently and does not merge generated Markdown outputs. -- Chunking is opt-in through `--chunk-pages`; if the option is present without a value, the CLI uses 20 pages per chunk. -- `convert_pdf()` keeps returning `ConversionResult` without chunking and returns `BatchConversionResult` when `chunk_pages` is set. -- Chunk PDFs are temporary local files and are deleted after conversion completes, including when raw MinerU output is retained. +- Chunking remains opt-in through `--chunk-pages`; if the option is present without a value, final grouped outputs use 20 source pages. +- In chunk mode, MinerU receives one source page per run and final Markdown parts are grouped by `chunk_pages`. +- `--gpu auto` selects the visible NVIDIA GPU with the largest local `nvidia-smi` VRAM report. +- `--mineru-profile auto` is the default and stays conservative on GTX 1070 Ti 8GB, low-VRAM, and pre-Turing GPUs. +- The UI launcher can convert a direct folder by running one existing `pdf2md convert` command per direct-child PDF sequentially. +- Sprint 17 offline installer planning is abandoned. Do not implement or extend offline installer work unless the user explicitly reopens that direction. diff --git a/PRD.md b/PRD.md index e1a3752..f08a2c5 100644 --- a/PRD.md +++ b/PRD.md @@ -1,27 +1,29 @@ # PRD: Local PDF-to-Markdown Converter -Last updated: 2026-05-07 +Last updated: 2026-05-13 ## 1. Summary -Build a local-only CLI and Python library that converts math-heavy digital PDFs into Obsidian-friendly Markdown. The product prioritizes accurate LaTeX reconstruction for equations, preservation of document structure, stable asset links, and traceable page-level metadata. +Build a local-only CLI and Python library that converts math-heavy digital PDFs into Obsidian-friendly Markdown. 
The product prioritizes accurate LaTeX reconstruction for equations, preservation of document structure, stable asset links, and traceable page-level provenance in the human-readable report. -The first version is for personal/research use, targets NVIDIA GPU machines, and uses MinerU 3.1.0 as the fixed conversion engine. It should process digital PDFs with existing text layers first. Scanned books, cloud OCR APIs, web UI, and manual review workflows are out of scope for v1. +The first version is for personal/research use, targets NVIDIA GPU machines, and uses MinerU 3.1.0 as the fixed conversion engine. It should process digital PDFs with existing text layers first. Scanned books, cloud OCR APIs, hosted web apps, and manual review workflows are out of scope for v1. A thin local Windows desktop launcher exists as a convenience wrapper over the existing `pdf2md` CLI. ## 2. Goals -- Convert a single PDF into one Markdown file plus assets, metadata JSON, and a human-readable quality report. +- Convert a single PDF into a PDF-stem output folder containing Markdown part files, shared assets, and one human-readable quality report. - Convert a folder of PDFs in batch mode. +- Allow the thin local Windows UI launcher to convert direct-child PDFs in a selected folder by sequentially invoking existing CLI commands. - Preserve inline math as `$...$` and display math as `$$...$$`. - Produce Markdown that opens cleanly in Obsidian. - Use MinerU 3.1.0 locally. -- Keep enough metadata to diagnose formula, layout, and reading-order errors. +- Keep enough internal provenance to diagnose formula, layout, and reading-order errors through warnings and the report. - Continue conversion automatically when a page or formula is low-confidence, while logging warnings. ## 3. Non-Goals - No cloud OCR, cloud LLM, or third-party document upload in v1. -- No web app or GUI in v1. +- No hosted web app, manual review UI, or alternate GUI conversion pipeline in v1. 
+- A thin local desktop launcher is allowed only when it invokes the existing `pdf2md` CLI and preserves strict-local behavior. - No manual review queue in v1. - No optimization for low-quality scanned books in v1. - No guaranteed perfect LaTeX reconstruction. @@ -62,10 +64,9 @@ Out of scope for v1 optimization: For each input PDF, the converter writes: -- A normalized Markdown file. -- An assets directory when MinerU extracts images or other media. -- A metadata JSON file. -- A human-readable quality report named `.report.md`. +- One or more normalized Markdown part files named `_001.md`, `_002.md`, and so on. +- A shared `images/` directory when MinerU extracts images or other media. +- A human-readable quality report named `_report.md`. - Optional raw MinerU outputs for debugging. Markdown rules: @@ -74,8 +75,8 @@ Markdown rules: - Display equations use `$$...$$` on separate lines. - Simple tables use Markdown pipe tables. - Complex tables may use HTML when Markdown would lose structure. -- Images use relative links to the generated assets directory. -- Visible page markers should be avoided by default; page provenance belongs in metadata. +- Images use relative links to `images/...` under the PDF output folder. +- Visible page markers should be avoided by default; grouped page conversion may use invisible HTML comments for page provenance. - Obsidian compatibility is the output standard. Detailed Markdown normalization rules are defined in `ARCHITECTURE.md`. @@ -96,18 +97,20 @@ pdf2md doctor - If `INPUT` is a PDF, convert that file. - If `INPUT` is a directory, convert PDFs in that directory. - Directory conversion requires `--recursive` to descend into subdirectories. -- Output filenames default to the source PDF stem plus `.md`. -- Asset directories default to `.assets`. +- Output folders default to `//`. +- Markdown part filenames default to `_001.md`, `_002.md`, and so on. +- Asset directories default to `//images/`. 
- Existing outputs are not overwritten unless `--overwrite` is passed. Required `convert` options: - `--out PATH`: output directory. -- `--metadata`: write metadata JSON. Enabled by default in v1. +- `--metadata`: accepted for compatibility; no metadata JSON is written in the simplified output layout. - `--keep-raw`: keep raw MinerU output for debugging. - `--recursive`: recursively process directory inputs. - `--overwrite`: replace existing outputs. -- `--gpu DEVICE`: select CUDA device. Default: `cuda:0`. +- `--gpu DEVICE`: select CUDA device. Default: `cuda:0`; `auto` selects the visible NVIDIA GPU with the most VRAM. +- `--mineru-profile {auto,safe,performance}`: select MinerU runtime tuning. Default: `auto`. - `--strict-local`: forbid remote network/cloud execution during conversion. Default: true. `doctor` behavior: @@ -115,11 +118,18 @@ Required `convert` options: - Report Python version. - Report `uv` availability. - Report CUDA/PyTorch GPU availability when detectable. +- Report visible NVIDIA GPU index, VRAM, driver version, `--gpu auto` recommendation, and recommended MinerU profile. - Report MinerU availability. - Report local model/cache paths when detectable. - Warn if no NVIDIA GPU is available. - Fail if required v1 runtime dependencies are missing. +UI launcher behavior: + +- The UI is a local convenience wrapper over the existing `pdf2md` CLI. +- The UI may convert a selected folder by discovering direct-child PDFs only and running one `pdf2md convert` command per PDF sequentially. +- The UI must not invoke MinerU directly, add recursive folder conversion outside the existing CLI behavior, run conversions in parallel by default, or expose remote/API options. + ## 8. Python Library Requirements The library should expose a stable API suitable for scripts and tests. @@ -139,17 +149,17 @@ result = convert_pdf( Required return fields: - `markdown_path` -- `metadata_path` +- `metadata_path`, which is `None` for new simplified outputs. 
- `assets_dir` - `warnings` - `engine` - `pages_processed` -The public API should not expose raw MinerU objects as required return types. MinerU-specific data may be stored under optional metadata fields. +The public API should not expose raw MinerU objects as required return types. MinerU-specific data may be stored in internal report/provenance structures. -## 9. Metadata Requirements +## 9. Provenance Requirements -When `--metadata` is enabled, write `.metadata.json`. +New conversions must not write a public metadata JSON sidecar. Internal metadata-like records may still be built in memory to derive reports, warnings, counts, and `ConversionResult` fields. Required top-level fields: @@ -175,16 +185,16 @@ Required summary fields: Warnings must be non-fatal unless the source file cannot be read or no output can be produced. -Detailed metadata fields, block types, and warning codes are defined in `ARCHITECTURE.md`. +Detailed internal provenance fields, block types, and warning codes are defined in `ARCHITECTURE.md`. ## 10. Quality Report Requirements -For every conversion, write `.report.md`. +For every conversion, write `/_report.md`. -The report must be readable without opening the JSON metadata and include: +The report must be readable as the primary human-facing quality artifact and include: - Source PDF path. -- Output Markdown path. +- Output folder path and Markdown part paths. - MinerU version. - Page count. - Warning count. @@ -201,7 +211,7 @@ The product is fully automatic in v1. - Low-confidence formulas are included in the output as best effort. - Low-confidence pages are included in the output as best effort. -- The converter logs warnings and metadata records. +- The converter logs warnings and internal provenance records. - Conversion uses MinerU's default local CLI execution. If MinerU cannot run or fails, the converter must emit a clear error/warning instead of silently falling back to another backend. 
- Conversion fails only when the input cannot be opened, MinerU cannot run, output cannot be written, no usable output can be produced, or local-only policy is violated. @@ -254,25 +264,22 @@ uv sync uv run pdf2md doctor ``` -MinerU/model setup may require additional scripts, for example: - -```bash -uv run scripts/install-mineru.ps1 -uv run scripts/install-models.py -``` +MinerU/model setup requires explicit user-initiated local setup commands documented in `README.md`. Do not reference setup helper scripts unless they actually exist in the repository. The project should document NVIDIA GPU/CUDA expectations and provide clear errors when GPU acceleration is unavailable. +The default MinerU profile must be conservative on GTX 1070 Ti 8GB and other weak or pre-Turing GPUs. Stronger profile settings are allowed only through local environment tuning on selected 16GB+ Turing-or-newer NVIDIA GPUs. + ## 14. Test Requirements Required test categories: - Unit tests for Markdown math delimiter normalization. - Unit tests for asset path normalization. -- Unit tests for metadata schema creation. +- Unit tests for internal metadata/provenance schema creation. - Unit tests for warning aggregation. - MinerU adapter contract tests with mocked outputs. -- CLI tests for single PDF, directory input, overwrite behavior, and metadata output. +- CLI tests for single PDF, directory input, overwrite behavior, and simplified output layout. Fixture categories: @@ -285,7 +292,7 @@ Fixture categories: Acceptance checks: - Markdown exists after conversion. -- Metadata exists when requested. +- No metadata JSON is written for new conversions. - Quality report exists after conversion. - Asset links resolve. - Inline/display math delimiters match Obsidian expectations. @@ -298,10 +305,10 @@ Acceptance checks: v1 is acceptable when: -- `pdf2md convert paper.pdf --out out --metadata` works on a representative digital academic PDF. 
-- `pdf2md convert pdfs --out out --recursive --metadata` works on a small folder. +- `pdf2md convert paper.pdf --out out` works on a representative digital academic PDF. +- `pdf2md convert pdfs --out out --recursive` works on a small folder. - `pdf2md doctor` reports MinerU/GPU status clearly. - The default output opens in Obsidian with math blocks rendered. -- Metadata links pages, blocks, warnings, and assets to the source PDF. -- `.report.md` summarizes warnings, formulas, assets, and render/link check results. +- The report links pages, warnings, output parts, and assets to the source PDF. +- `/_report.md` summarizes warnings, formulas, assets, and render/link check results. - The README or setup docs explain local-only behavior and GPU expectations. diff --git a/PROGRESS.md b/PROGRESS.md index fdd1b97..c3a86f2 100644 --- a/PROGRESS.md +++ b/PROGRESS.md @@ -6,64 +6,51 @@ This file records current progress for agents. Read it before starting work, the - Project direction is documented in `PRD.md`, `ARCHITECTURE.md`, `AGENTS.md`, and `docs/KNOWLEDGEBASE.md`. - MinerU 3.1.0 is fixed as the only conversion engine. -- The converter currently includes path planning, project-owned records, metadata, direct local MinerU adapter boundary, Obsidian Markdown normalization, local quality checks, report rendering, conversion orchestration, `pdf2md convert`, `pdf2md recheck`, `pdf2md doctor`, local MathJax render checking, conservative MathJax warning mitigation, release-gate tests, and opt-in pre-conversion PDF chunking. -- `docs/V1IMPLEMENTATIONPLAN.md` defines the v1 implementation sequence. -- `docs/Sprints/` contains completed sprint contracts through Sprint 11. -- `docs/WORKARCHIVE.md` contains completed sprint history, historical verification results, runtime setup notes, and sample conversion evidence. 
+- The converter currently includes path planning, project-owned records, internal provenance, direct local MinerU adapter boundary, Obsidian Markdown normalization, local quality checks, report rendering, conversion orchestration, simplified output layout, `pdf2md convert`, legacy `pdf2md recheck`, `pdf2md doctor`, local MathJax render checking, conservative MathJax warning mitigation, release-gate tests, opt-in grouped page conversion, a minimal Windows UI launcher with direct-folder PDF batch conversion, pypdf-based text layer fidelity diagnostics, NVIDIA GPU inventory, optional `--gpu auto`, and MinerU profile tuning. +- `docs/V1IMPLEMENTATIONPLAN.md` now tracks current v1 state and open future decisions; completed implementation details are archived in `docs/WORKARCHIVE.md`. +- `docs/Sprints/` contains completed sprint contracts through Sprint 16 and the abandoned Sprint 17 offline installer contract. +- `docs/superpowers/specs/2026-05-13-ui-folder-batch-conversion-design.md` and `docs/superpowers/plans/2026-05-13-ui-folder-batch-conversion.md` record the completed UI direct-folder batch work. +- `docs/WORKARCHIVE.md` contains completed sprint history, historical verification results, runtime setup notes, sample conversion evidence, archived UI work, and abandoned Sprint 17 planning context. - `samples/` exists locally as fixture context. -- `outputs/` is ignored and contains local generated conversion outputs. +- `outputs/`, `build/`, and `dist/` are local generated artifact locations and must stay out of commits. ## Environment Notes -- OS/workspace: Windows PowerShell in `C:\git\PDFToMD`. +- OS/workspace: Windows PowerShell in `D:\Work\Repos\AICoding\ConvertPDFToMD`. - Python target: 3.12. -- Local project Python observed: 3.12.13 in `.venv`. -- `uv` is installed per-user at `C:\Users\baram\.local\bin`. -- Target GPU documented for the original project setup: NVIDIA GTX 1070 Ti 8GB. -- Current PC GPU observed by `doctor`: NVIDIA GeForce RTX 4080 SUPER 16GB. 
+- Local project Python observed: 3.12.7 through `uv run pdf2md doctor` on 2026-05-11. +- `uv` is installed per-user at `C:\Users\user\.local\bin`. +- Target GPU documented for this project setup: NVIDIA GTX 1070 Ti 8GB. +- Current PC GPU observed by `doctor`: NVIDIA GeForce GTX 1070 Ti 8GB. - Default conversion device: `cuda:0`. +- Default MinerU profile: `auto`. - MinerU execution mode: direct local `mineru` CLI only. - Strict-local allows MinerU 3.1.0's CLI-internal temporary local `mineru-api` when the CLI runs without `--api-url`. - Strict-local prohibits `--api-url`, remote APIs, router mode, HTTP client backends, and remote OpenAI-compatible backends. -- Current `.venv` has project fast-test dependencies, CUDA-enabled PyTorch `2.6.0+cu126`, `torchvision 0.21.0+cu126`, and `mineru[core]==3.1.0`. -- Current `pdf2md doctor` status is PASS. MinerU, RTX 4080 SUPER CUDA PyTorch, local model config, MathJax, and strict-local checks pass. +- Current `.venv` has project fast-test dependencies, CUDA-enabled PyTorch `2.6.0+cu126`, `torchvision 0.21.0+cu126`, `mineru[core]==3.1.0`, local MathJax npm dependencies, and local MinerU models. +- Current `pdf2md doctor` status is WARN because GTX 1070 Ti is Pascal/pre-Turing; MinerU, CUDA PyTorch visibility, local model config, MathJax, and strict-local checks otherwise pass. Doctor selects `cuda:0` for `--gpu auto` on this machine and recommends MinerU profile `safe`. - MinerU models were downloaded from Hugging Face by explicit setup command. Runtime model loading uses `MINERU_MODEL_SOURCE=local`. ## Recent Completed Work -- Archived completed sprint and setup history into `docs/WORKARCHIVE.md`. -- Added `docs/WORKARCHIVE.md` references to `AGENTS.md`, `PLAN.md`, `docs/V1IMPLEMENTATIONPLAN.md`, relevant `.codex/agents/*.toml`, `.codex/commands/*.md`, and project skills. 
-- Sprint 10 is implemented with `pypdf>=6.10.2,<7`, `src/pdf2md/pdf_splitter.py`, `--chunk-pages [PAGES]`, chunk-aware conversion orchestration, temporary chunk cleanup, and chunk report context. -- `--chunk-pages` is opt-in; when present without a value it uses 20 pages. -- `convert_pdf()` returns `BatchConversionResult` when `chunk_pages` is set and keeps returning `ConversionResult` when chunking is unset. -- Converted `samples/FourNodeQuadrilateralShellElementMITC4.pdf` with `MINERU_MODEL_SOURCE=local` and default `--gpu cuda:0`; output was written to ignored `outputs/FourNodeQuadrilateralShellElementMITC4/`. -- The FourNode sample conversion report status was `success`: 7 pages, 22 assets, 38 inline formulas, 16 display formulas, 0 math render errors, and 0 warnings. -- Installed uv `0.11.12` at `C:\Users\baram\.local\bin`, installed uv-managed CPython `3.12.13`, created `.venv`, and ran `uv sync`. -- Verified base project environment with `uv run pytest`: 163 passed, 1 skipped. -- Installed runtime dependencies on this PC: CUDA PyTorch `2.6.0+cu126`, `torchvision 0.21.0+cu126`, `mineru[core]==3.1.0`, local MathJax npm dependencies, and local MinerU models. -- Set user environment variable `MINERU_MODEL_SOURCE=local`. -- Verified full local runtime with `uv run pdf2md doctor`: PASS. -- Verified real local sample conversion: `samples/FourNodeQuadrilateralShellElementMITC4.pdf` to ignored `outputs/runtime-smoke/`, status `success`, 7 pages, 22 assets, 38 inline formulas, 16 display formulas, 0 math render errors, and 0 warnings. -- Converted `samples/MITC공부.pdf` to ignored `outputs/MITC공부/`; report status was `partial`: 13 pages, 107 assets, 23 inline formulas, 103 display formulas, 2 MathJax render warnings, and 0 missing or invalid asset links. -- Added `recheck_markdown()` and `pdf2md recheck ` to rerun local quality checks for an existing generated Markdown file and rewrite the adjacent metadata JSON and `.report.md` without rerunning MinerU. 
-- Verified `uv run pdf2md recheck outputs\MITC공부\MITC공부.md`; the command regenerated metadata/report and still reported 2 warnings because the current Markdown still contains the two MathJax-invalid expressions. -- Reconverted `samples/MITC공부.pdf` with `--overwrite` to ignored `outputs/MITC공부/`; report status remains `partial`: 13 pages, 107 assets, 23 inline formulas, 103 display formulas, 2 MathJax render warnings, and 0 missing or invalid asset links. -- Sprint 11 implemented conservative MathJax warning mitigation with failed-expression details, `src/pdf2md/math_repair.py`, shared `convert`/`recheck` repair integration, and `MATH_RENDER_REPAIRED` info warnings. -- Verified default fast suite: `uv run pytest` passed 172 tests with 1 skipped. -- Verified requested real sample: `uv run pdf2md convert samples\MITC공부.pdf --out outputs\sprint11-MITC공부 --overwrite` succeeded with 13 pages, 107 assets, 23 inline formulas, 103 display formulas, 0 MathJax render errors, and 2 `MATH_RENDER_REPAIRED` info warnings. -- Reconverted `samples/MITC공부.pdf` to ignored `outputs/MITC공부/` with Sprint 11 mitigation; report status is `partial` from 2 `MATH_RENDER_REPAIRED` info warnings, with 13 pages, 107 assets, 23 inline formulas, 103 display formulas, 0 MathJax render errors, and 0 missing or invalid asset links. +- Archived completed coordination details from `PLAN.md`, `PROGRESS.md`, and `docs/V1IMPLEMENTATIONPLAN.md` into `docs/WORKARCHIVE.md`. +- Refreshed current docs so abandoned Sprint 17 offline installer planning, completed UI direct-folder batch conversion, simplified output layout, legacy-only `recheck`, and no-public-metadata behavior are consistently referenced. +- Updated project agent/source-document references so future document reviews and implementation work can find Sprint 15/16 contracts, abandoned Sprint 17 context, and the UI folder batch design/plan. +- Abandoned Sprint 17 offline installer planning at the user's request. 
The contract and plan remain as historical records only. ## In Progress -- No active implementation chunk. +- No active implementation sprint. ## Blockers - No active blocker. +- Residual risk: direct CLI conversion smokes for `samples\FourNodeQuadrilateralShellElementMITC4.pdf` exceeded the 15-minute timeout on 2026-05-11 and stalled on source page 2 with Sprint 14 `--chunk-pages` on 2026-05-12, so hands-on UI conversion smoke remains pending. +- Residual risk: conversion can still be impractically slow or stall on GTX 1070 Ti 8GB for some source pages even when Sprint 14 sends one source page to MinerU at a time. ## Next Actions -1. Review generated sample Markdown outputs in Obsidian if visual quality needs manual assessment. -2. Run additional real local sample validation only if requested, especially for new MathJax failure messages not covered by Sprint 11's narrow repair rules. -3. Run optional real local chunked conversion on a long sample only if requested. -4. Preserve strict-local runtime behavior: use local model paths, direct CLI execution, and no user-specified API or remote backend. +1. Run hands-on UI smoke when practical: launch `dist\pdf2md-ui.exe`, click Doctor, then run one small local conversion to ignored `outputs/`. +2. Preserve strict-local runtime behavior: use local model paths, direct CLI execution, and no user-specified API or remote backend. +3. Decide in a future sprint whether simplified outputs need metadata-free `pdf2md recheck`; current behavior intentionally remains legacy-only. +4. On a stronger NVIDIA GPU PC, run `uv run pdf2md doctor` and an optional local conversion with `--gpu auto --mineru-profile auto` to validate the auto profile against ignored `outputs/`. diff --git a/README.md b/README.md index e93d92d..6f606d7 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,10 @@ -# ConvertPDFToMD +# ConvertPDFToMD Local-only PDF-to-Markdown converter for math-heavy digital documents. 
## Status -The project currently provides a Python package, `pdf2md convert`, Markdown recheck via `pdf2md recheck`, metadata/report output, mocked MinerU adapter tests, `pdf2md doctor` setup diagnostics, and Sprint 9 release-gate documentation. Real local MinerU sample validation remains optional and may be blocked until MinerU 3.1.0 and local model/cache setup are available. +The project currently provides a Python package, `pdf2md convert`, legacy Markdown recheck via `pdf2md recheck`, simplified Markdown/report output, mocked MinerU adapter tests, `pdf2md doctor` setup diagnostics, NVIDIA GPU inventory/profile reporting, opt-in grouped page conversion for long PDFs, local MathJax warning mitigation, release-gate documentation, and a minimal Windows UI launcher with direct-folder PDF batch conversion. Real local MinerU sample validation is optional and should run only against local PDFs with generated outputs kept ignored. ## Setup @@ -46,7 +46,7 @@ uv run pdf2md doctor The checker runs through local Node.js and the local `mathjax` package only. It never uses a CDN or hosted renderer, and conversion still completes if Node.js or MathJax is missing. -For release checks, see [docs/V1RELEASECHECKLIST.md](docs/V1RELEASECHECKLIST.md). It separates the default fast gates from optional local MinerU/GPU/sample fixture evaluation. Optional fixture runs use `PDF2MD_RUN_MINERU_FIXTURES=1`, should use only local PDFs, write generated outputs to a temporary or ignored local directory, and count a sample conversion as successful only when Markdown, metadata JSON, and `.report.md` outputs all exist. +For release checks, see [docs/V1RELEASECHECKLIST.md](docs/V1RELEASECHECKLIST.md). It separates the default fast gates from optional local MinerU/GPU/sample fixture evaluation. 
Optional fixture runs use `PDF2MD_RUN_MINERU_FIXTURES=1`, should use only local PDFs, write generated outputs to a temporary or ignored local directory, and count a sample conversion as successful only when Markdown part files and the single `_report.md` output exist. Install MinerU 3.1.0 as an explicit local setup step so the `mineru` executable is available on PATH. This project calls MinerU only through the direct local CLI shape: @@ -56,13 +56,33 @@ mineru -p -o `pdf2md convert` requests GPU execution by default with `--gpu cuda:0`. The adapter maps that to MinerU's local `MINERU_DEVICE_MODE=cuda` and `CUDA_VISIBLE_DEVICES=0` environment for the MinerU subprocess. Actual GPU execution still requires a CUDA-capable local PyTorch/MinerU stack; `doctor` reports when PyTorch is CPU-only or CUDA is unavailable. +MinerU runtime tuning is controlled with `--mineru-profile auto|safe|performance`; the default is `auto`. `auto` keeps GTX 1070 Ti 8GB, pre-Turing, and other low-VRAM GPUs on safe settings. Use `--gpu auto` on a stronger NVIDIA machine when you want the converter to choose the visible GPU with the most VRAM and record the selected GPU/profile in the report and internal provenance: + +```powershell +uv run pdf2md convert paper.pdf --out outputs --gpu auto --mineru-profile auto +``` + +The default public output layout is: + +```text +outputs/ + paper/ + paper_001.md + paper_report.md + images/ +``` + +When `--chunk-pages` creates more than one grouped output, additional Markdown files use `paper_002.md`, `paper_003.md`, and so on. New conversions do not write public `.metadata.json` sidecars; report content is derived from internal provenance and local checks. + +Profile tuning uses only local environment variables for the MinerU subprocess: `MINERU_PROCESSING_WINDOW_SIZE`, `MINERU_API_MAX_CONCURRENT_REQUESTS`, and `MINERU_PDF_RENDER_THREADS`. It does not add MinerU backend selection, `--api-url`, router mode, HTTP client backends, or remote endpoints. 
Explicit `--mineru-profile performance` is downgraded to safe with a warning when the selected GPU is below 16GB VRAM or has pre-Turing risk. + Run setup diagnostics before conversion: ```powershell uv run pdf2md doctor ``` -`doctor` checks Python 3.12, `uv`, the MinerU CLI and version, NVIDIA GPU visibility through `nvidia-smi`, PyTorch CUDA visibility when PyTorch is installed, local model/cache/config paths, local MathJax checker availability, and the strict-local runtime policy. It does not install packages, download models, run conversions, or inspect `samples/`. +`doctor` checks Python 3.12, `uv`, the MinerU CLI and version, NVIDIA GPU visibility through `nvidia-smi`, PyTorch CUDA visibility when PyTorch is installed, local model/cache/config paths, local MathJax checker availability, and the strict-local runtime policy. It also reports visible GPU indexes, VRAM, driver versions, the `--gpu auto` selection, and the recommended MinerU profile. It does not install packages, download models, run conversions, or inspect `samples/`. The model/cache check looks for these environment variables when present: @@ -78,14 +98,13 @@ It also checks for `%USERPROFILE%\mineru.json`, which MinerU documents as its de ## Rechecking Markdown -After editing a generated Markdown file, rerun local quality checks and regenerate the adjacent metadata/report files: +`pdf2md recheck` is currently a legacy maintenance command for Markdown files that still have adjacent metadata JSON from the older output layout: ```powershell -uv run pdf2md recheck outputs/MITC공부/MITC공부.md +uv run pdf2md recheck outputs/legacy-paper.md ``` -`recheck` reads the existing `.metadata.json` for source PDF, engine, page, and asset provenance. It replaces quality warnings that can be recalculated from the current Markdown, including MathJax render failures and local asset-link warnings, then rewrites `.metadata.json` and `.report.md`. 
- +`recheck` reads an existing legacy `.metadata.json` for source PDF, engine, page, and asset provenance. New simplified outputs do not persist metadata JSON, so metadata-free recheck is intentionally deferred to a later sprint. ## Runtime Policy Runtime conversion is strict-local. Allowed: direct `mineru` CLI execution and the CLI-internal temporary local `mineru-api` that MinerU starts when `--api-url` is omitted. Prohibited: `--api-url`, remote APIs, router mode, HTTP client backends, remote OpenAI-compatible backends, hosted renderers, and cloud fallbacks. @@ -96,16 +115,36 @@ The target GPU is NVIDIA GTX 1070 Ti 8GB. `doctor` warns for GTX 1070 Ti/Pascal/ ## Long PDFs -Chunking is opt-in for long PDFs. Use `--chunk-pages` with no value to split into 20-page chunks, or pass an explicit positive page count: +Grouped page conversion is opt-in for long PDFs. Use `--chunk-pages` with no value to group outputs by 20 source pages, or pass an explicit positive group size: ```powershell uv run pdf2md convert samples/long.pdf --out outputs --chunk-pages uv run pdf2md convert samples/long.pdf --out outputs --chunk-pages 20 ``` -Chunk PDFs are written to a temporary local directory before each MinerU run and are deleted after conversion completes. The generated Markdown files are not merged; each chunk gets its own Markdown, metadata JSON, report Markdown, and assets directory named with the original page range. +When `--chunk-pages` is active, the converter writes one-page temporary PDFs and sends only one source page to MinerU per run. Successful page Markdown is then grouped into final Markdown files under the PDF output folder, such as `outputs/paper/paper_001.md` and `outputs/paper/paper_002.md`. Temporary one-page PDFs and intermediate per-page outputs are deleted after conversion completes. -The Python API keeps non-chunked behavior unchanged. `convert_pdf(..., chunk_pages=20)` returns a `BatchConversionResult` with one `ConversionResult` per chunk. 
+Grouped outputs keep invisible Obsidian-friendly page comments such as `<!-- page 3 -->`; failed page conversions are recorded as comments plus report warnings. Page assets are copied into the shared `outputs/paper/images/` folder with deterministic page-prefixed names to avoid filename collisions. + +The Python API keeps non-chunked behavior unchanged. `convert_pdf(..., chunk_pages=20)` returns a `BatchConversionResult` with one `ConversionResult` per grouped output file. + +## Windows UI Launcher + +The first UI is a minimal local Windows launcher implemented under `src/pdf2md_ui/`. It calls the existing `pdf2md` CLI or `uv run pdf2md`; it does not call MinerU directly and does not bundle MinerU, CUDA PyTorch, model weights, Node.js, or MathJax into the UI executable. The UI exposes the current conversion controls, including grouped pages, GPU device or `auto`, and MinerU profile `auto|safe|performance`. Folder conversion selects direct-child PDFs only and runs the existing CLI conversion command once per PDF sequentially. + +Run it from source: + +```powershell +uv run python -m pdf2md_ui.app +``` + +Build the UI executable: + +```powershell +uv run --group ui-build pyinstaller --clean --onefile --windowed --name pdf2md-ui src\pdf2md_ui\app.py +``` + +The expected local artifact is `dist\pdf2md-ui.exe`. The UI remains a launcher over a healthy local runtime, so run `pdf2md doctor` before relying on conversions. For the simplified output layout, select the output root; the CLI creates the final `<pdf-name>\` folder inside it. ## References diff --git a/docs/KNOWLEDGEBASE.md b/docs/KNOWLEDGEBASE.md index 5dd171d..8181b75 100644 --- a/docs/KNOWLEDGEBASE.md +++ b/docs/KNOWLEDGEBASE.md @@ -1,13 +1,13 @@ # Knowledge Base: Local PDF-to-Markdown Converter for Math-Heavy Documents -Last updated: 2026-05-07 +Last updated: 2026-05-11 ## 1. Product Direction This project will build a local-first PDF-to-Markdown converter for math-heavy academic PDFs and books.
The v1 target is intentionally narrow: - Processing policy: local-only. Do not send user PDFs to cloud OCR or external AI APIs. -- Primary interface: CLI plus Python library. +- Primary interface: CLI plus Python library. A later thin local desktop launcher may wrap the CLI, but it must not become a separate conversion pipeline. - Primary output: Obsidian-friendly Markdown. - Main conversion engine: MinerU 3.1.0. - Math output: inline math as `$...$`, display math as `$$...$$`. @@ -73,7 +73,7 @@ Rules: - Inline math: `$...$`. - Display math: `$$...$$` on separate lines. -- Store extracted images in a sibling assets directory, for example `paper.assets/page-003-figure-01.png`. +- Store extracted images in the PDF output folder's shared `images/` directory, for example `paper/images/page-003_figure-01.png`. - Use relative links from the Markdown file to assets. - Preserve page boundaries in metadata, not by noisy visible page markers in the main Markdown. - Prefer normal Markdown tables for simple tables. diff --git a/docs/MATHJAXCHECKERPLAN.md b/docs/MATHJAXCHECKERPLAN.md index 06c9b6a..2089ecb 100644 --- a/docs/MATHJAXCHECKERPLAN.md +++ b/docs/MATHJAXCHECKERPLAN.md @@ -19,7 +19,7 @@ Relevant existing behavior: - Conversion remains local-only. - MinerU 3.1.0 remains the only PDF conversion engine. - Quality warnings are non-fatal unless no usable output can be produced. -- Metadata and `.report.md` already include `math_render_error_count`. +- Internal provenance and `_report.md` include `math_render_error_count`. - Default tests must not require real MinerU, GPU, Node.js, MathJax, network, Obsidian, or sample PDFs. ## References @@ -237,7 +237,7 @@ Optional local tests: - `pdf2md doctor` reports MathJax checker availability clearly. - Conversion still succeeds when MathJax is unavailable, with an info warning. - Conversion still succeeds when individual formulas fail, with warning records. 
-- `.metadata.json` and `.report.md` show actual math render failure counts when MathJax is available. +- Internal provenance and `_report.md` show actual math render failure counts when MathJax is available. - The generated Markdown is not changed by the checker. ## Hard Failure Criteria diff --git a/docs/Sprints/SPRINT12CONTRACT.md b/docs/Sprints/SPRINT12CONTRACT.md new file mode 100644 index 0000000..f9fe31a --- /dev/null +++ b/docs/Sprints/SPRINT12CONTRACT.md @@ -0,0 +1,218 @@ +# Sprint 12 Contract: Minimal Windows UI Launcher + +Status: Implemented with residual conversion-smoke risk +Last updated: 2026-05-11 + +## Objective + +Build a minimal Windows desktop launcher for the existing `pdf2md` CLI and package the launcher itself as `dist/pdf2md-ui.exe`. + +The UI must remain a thin local launcher. It must not become a second conversion engine, a hosted app, a manual review workflow, or a bundled redistribution of MinerU, CUDA PyTorch, model weights, Node.js, or MathJax. + +## Research Basis + +- Primary research document: `docs/UI_RESEARCH.md`. +- The recommended implementation path is `tkinter`/`ttk`, a subprocess runner around `pdf2md` or `uv run pdf2md`, and PyInstaller for the Windows executable. + +## Current Precondition + +- `pdf2md doctor`, `pdf2md convert`, and `pdf2md recheck` are implemented. +- Conversion remains strict-local and MinerU-only. +- Current CLI output is coarse during MinerU execution because the adapter captures MinerU subprocess output internally. +- UI research is complete. +- UI implementation exists under `src/pdf2md_ui/`. +- `dist\pdf2md-ui.exe` can be built with PyInstaller. 
+ +## Touched Surfaces + +Allowed during implementation: + +- `src/pdf2md_ui/__init__.py` +- `src/pdf2md_ui/app.py` +- `src/pdf2md_ui/runner.py` +- `tests/test_ui_runner.py` +- `pyproject.toml` +- `uv.lock` +- `README.md` +- `PLAN.md` +- `PROGRESS.md` +- `docs/WORKARCHIVE.md` +- `docs/V1IMPLEMENTATIONPLAN.md` + +Generated but not committed unless explicitly requested: + +- `build/` +- `dist/` +- `*.spec` +- generated conversion outputs under `outputs/` + +Not allowed: + +- Runtime document upload paths. +- Remote OCR, hosted LLM/VLM, hosted renderers, or remote document parsing APIs. +- `--api-url`, router mode, HTTP client backends, remote OpenAI-compatible endpoints, or runtime engine selection. +- Direct UI calls to `mineru`; the UI must call the project-owned `pdf2md` CLI. +- Bundling MinerU, CUDA PyTorch, local model weights, Node.js, or MathJax into the first UI executable. +- Batch queues, drag/drop, PDF preview, Markdown preview, Obsidian automation, installer generation, or code signing in this sprint. +- Mandatory default tests that require real MinerU, GPU, model files, network, Obsidian, or `samples/`. + +## Product Behavior + +The first UI is a single-window launcher: + +- Select one input PDF. +- Select an output root, defaulting to `outputs`; the current CLI creates the final `<pdf-name>\` folder inside it. +- Configure only existing CLI options: + - overwrite + - keep raw output + - optional grouped pages with default `20` + - GPU device with default `cuda:0`, including `auto` when supported by the CLI + - MinerU profile `auto|safe|performance` with default `auto` +- Run `Doctor`. +- Run `Convert`. +- Run `Recheck` for an existing Markdown output. +- Cancel a running subprocess. +- Open the output directory after completion. +- Show a read-only log and indeterminate progress while a command is running. + +Command resolution: + +1. Use a configured command if present. +2. Else use `pdf2md` from `PATH`. +3.
Else use `uv run pdf2md` from a configured project root containing `pyproject.toml`. +4. Else report a setup error and direct the user to run `pdf2md doctor`. + +## Architecture Plan + +### WP12.1: CLI Runner + +Actions: + +- Add a runner module that builds fixed argument lists for `doctor`, `convert`, and `recheck`. +- Use `subprocess.Popen` with `shell=False`. +- Set `MINERU_MODEL_SOURCE=local` in the child environment unless already set. +- Merge stderr into stdout for a single UI log stream. +- Read subprocess output on a worker thread and report status events to the UI. +- Add a Windows process-tree cancellation helper that uses `taskkill /pid <pid> /t /f` only after normal termination does not finish promptly. + +Expected output: + +- Testable command-construction and process-management code that never accepts arbitrary shell text from the UI. + +### WP12.2: Minimal Tk UI + +Actions: + +- Add a `tkinter`/`ttk` app with file and directory pickers, option controls, command buttons, progress indicator, and log pane. +- Keep long-running work off Tk's event handler thread. +- Disable conflicting controls while a command is running. +- Surface non-zero exit codes clearly. + +Expected output: + +- A simple local GUI for existing CLI workflows. + +### WP12.3: Build + +Actions: + +- Add PyInstaller only to a build dependency group such as `ui-build`. +- Build the executable with: + +```powershell +uv run --group ui-build pyinstaller --clean --onefile --windowed --name pdf2md-ui src\pdf2md_ui\app.py +``` + +Expected output: + +- `dist\pdf2md-ui.exe` exists after the build.
+ +## Verification Checks + +Default checks: + +- `uv run pytest tests/test_ui_runner.py` +- `uv run pytest tests/test_cli.py` if shared CLI behavior changes +- `git diff --check` +- `git status --short --untracked-files=all` + +Build check: + +```powershell +uv run --group ui-build pyinstaller --clean --onefile --windowed --name pdf2md-ui src\pdf2md_ui\app.py +Test-Path dist\pdf2md-ui.exe +``` + +Manual smoke: + +1. Launch `dist\pdf2md-ui.exe`. +2. Run Doctor from the UI. +3. Convert one small local sample into an ignored `outputs/` directory. +4. Confirm Markdown, report Markdown, and assets are produced as expected for the active output layout. + +## Acceptance Criteria + +- The UI invokes `pdf2md` or `uv run pdf2md`; it never invokes `mineru` directly. +- Commands are fixed argument lists and run with `shell=False`. +- The UI remains responsive while a conversion is running. +- Cancel attempts to stop the process tree on Windows. +- Doctor and conversion exit codes are visible in the UI. +- PyInstaller produces `dist\pdf2md-ui.exe`. +- Default tests stay independent of real MinerU, GPU, model files, network, Obsidian, and `samples/`. + +## Hard Failure Criteria + +- UI code exposes arbitrary shell command execution. +- UI exposes remote/API options or weakens strict-local policy. +- UI claims conversion success without checking the CLI exit code. +- UI freezes during a long conversion because the CLI runs on Tk's event handler thread. +- The first UI executable bundles MinerU, CUDA PyTorch, model weights, Node.js, or MathJax. +- Build outputs, generated conversion outputs, local models, or sample PDFs are committed. + +## Handoff Requirements + +After implementation: + +- Update `PROGRESS.md` with files changed, commands run, test outcomes, build outcome, known failures, residual risks, and next action. +- Move completed implementation details to `docs/WORKARCHIVE.md` after verification. +- Keep sample PDFs and generated outputs out of the commit. 
+ +## Implementation Handoff + +Files changed: + +- `src/pdf2md_ui/__init__.py` +- `src/pdf2md_ui/app.py` +- `src/pdf2md_ui/runner.py` +- `tests/test_ui_runner.py` +- `pyproject.toml` +- `uv.lock` +- `README.md` +- `PLAN.md` +- `PROGRESS.md` +- `docs/WORKARCHIVE.md` +- `docs/V1IMPLEMENTATIONPLAN.md` + +Verification: + +- `uv run pytest tests\test_ui_runner.py`: passed 16 tests. +- `uv run pytest`: passed 188 tests with 1 optional skip. +- `uv run --group ui-build pyinstaller --clean --onefile --windowed --name pdf2md-ui src\pdf2md_ui\app.py`: passed. +- `Test-Path dist\pdf2md-ui.exe`: returned `True`. +- `uv run pdf2md doctor`: returned WARN only for the documented GTX 1070 Ti/Pascal compatibility risk. +- Launch smoke for `dist\pdf2md-ui.exe`: process started and was then terminated by the smoke script. + +Follow-up refresh on 2026-05-12: + +- Updated the UI command builder and form controls for the Sprint 15 `--mineru-profile auto|safe|performance` CLI option. +- Rebuilt `dist\pdf2md-ui.exe` after Sprint 16 simplified output layout and Sprint 15 profile changes. +- `uv run pytest tests\test_ui_runner.py`: passed 17 tests. +- Launch smoke for the rebuilt `dist\pdf2md-ui.exe`: process started and was then terminated by the smoke script. + +Known failure: + +- A CLI conversion smoke using `samples\FourNodeQuadrilateralShellElementMITC4.pdf` and the same command shape used by the UI did not finish within the 15-minute timeout. The spawned process tree was terminated with `taskkill`. + +Residual risk: + +- A hands-on UI Doctor click and UI conversion click should still be run when the local MinerU runtime is expected to complete within an acceptable time. 
diff --git a/docs/Sprints/SPRINT13CONTRACT.md b/docs/Sprints/SPRINT13CONTRACT.md new file mode 100644 index 0000000..87f526b --- /dev/null +++ b/docs/Sprints/SPRINT13CONTRACT.md @@ -0,0 +1,292 @@ +# Sprint 13 Contract: Text Layer Fidelity Diagnostics + +Status: Implemented +Last updated: 2026-05-11 + +## Objective + +Add a local pypdf-based text fidelity diagnostic pass that compares source PDF text-layer extraction with MinerU-generated Markdown text on a per-page basis where page mapping is available. + +The first priority is diagnosis, not automatic body-text replacement. This sprint should record enough evidence in metadata JSON and `.report.md` to identify pages where MinerU likely misrecognized Korean body text, especially missing Hangul syllables, unexpected CJK ideographs, and abnormal spacing. It may mark pypdf text as a future replacement candidate, but it must not replace Markdown body text in this sprint. + +## Current Precondition + +- MinerU 3.1.0 remains the only conversion engine. +- Conversion runs through direct local `mineru` CLI execution only. +- `pypdf` is already used by the project for local PDF chunk planning. +- `pdf2md convert` writes Markdown, metadata JSON, and `.report.md`. +- `pdf2md recheck` can regenerate metadata/report from an existing Markdown file. +- Chunked conversion records original source page ranges in metadata `engine_options.chunk`. +- The 2007 Korean shell-structure sample showed clear text fidelity problems: + - pypdf can extract more accurate Hangul from the digital text layer. + - MinerU Markdown can omit Hangul syllables or misrecognize headings/body text as unrelated CJK characters. + - The source text layer itself can contain abnormal spacing between Hangul syllables. 
+ +## Touched Surfaces + +Allowed during implementation: + +- `src/pdf2md/text_fidelity.py` +- `src/pdf2md/ir.py` +- `src/pdf2md/metadata.py` +- `src/pdf2md/report.py` +- `src/pdf2md/conversion.py` +- `tests/test_text_fidelity.py` +- `tests/test_metadata.py` +- `tests/test_report.py` +- `tests/test_conversion.py` +- `docs/V1IMPLEMENTATIONPLAN.md` +- `PLAN.md` +- `PROGRESS.md` +- `docs/WORKARCHIVE.md` after completion + +Allowed only if needed for CLI/API wiring: + +- `src/pdf2md/cli.py` +- `tests/test_cli.py` +- `README.md` + +Not allowed: + +- Replacing Markdown body text with pypdf text in this sprint. +- Adding a second conversion engine or engine selector. +- Adding remote OCR, hosted LLM/VLM, remote document parsing, `--api-url`, router mode, HTTP client backends, or remote OpenAI-compatible endpoints. +- Mandatory default tests that require real MinerU, GPU, model files, network, Obsidian, or committed `samples/`. +- Committing sample PDFs or generated `outputs/`. + +## Product Behavior + +Text fidelity diagnostics should run automatically after MinerU Markdown normalization and local quality checks have produced the final Markdown candidate. + +For each page that can be compared, metadata should record a compact diagnostic object with at least: + +- `page_index`: zero-based output page index. +- `source_page_number`: one-based original PDF page number when known. +- `pypdf_text_available`: whether pypdf extracted non-empty source text. +- `markdown_text_available`: whether comparable Markdown text exists for the page. +- `pypdf_hangul_count`: Hangul syllable count from pypdf text. +- `markdown_hangul_count`: Hangul syllable count from Markdown text. +- `hangul_count_delta`: `markdown_hangul_count - pypdf_hangul_count`. +- `hangul_count_ratio`: Markdown Hangul count divided by pypdf Hangul count, or `null` when unavailable. +- `unexpected_cjk_count`: count of CJK Unified Ideographs in Markdown that are suspicious in a page with Korean source text. 
+- `pypdf_hangul_spacing_anomaly_ratio`: ratio of Hangul-to-Hangul whitespace breaks in pypdf text. +- `markdown_hangul_spacing_anomaly_ratio`: ratio of Hangul-to-Hangul whitespace breaks in Markdown text. +- `text_similarity`: normalized text similarity between pypdf text and Markdown text. +- `replacement_candidate`: `true` only when pypdf text appears more reliable than Markdown text under conservative thresholds. +- `comparison_status`: one of `checked`, `source_text_missing`, `markdown_page_unavailable`, or `page_mapping_uncertain`. + +Metadata summary should include: + +- `text_fidelity_checked_page_count`. +- `text_fidelity_low_page_count`. +- `text_fidelity_unexpected_cjk_count`. +- `text_fidelity_replacement_candidate_page_count`. +- `text_fidelity_page_mapping_uncertain_count`. + +Report Markdown should add a dedicated `## Text Fidelity` section showing: + +- checked page count and low-fidelity page count. +- total unexpected CJK count. +- replacement candidate page count. +- pages with low similarity. +- pages with high unexpected CJK count. +- pages where page-level comparison could not be trusted. + +Warning behavior: + +- Add `TEXT_LAYER_AVAILABLE` as an info warning when pypdf source text is available and diagnostics run. +- Add `TEXT_FIDELITY_LOW` as a warning for pages below the fidelity threshold. +- Add `UNEXPECTED_CJK_IN_KOREAN_TEXT` as a warning when suspicious CJK ideographs appear in Markdown for pages with Korean source text. +- Add `HANGUL_SPACING_SUSPECT` as an info or warning-level signal when pypdf or Markdown has a high Hangul spacing anomaly ratio. +- Add `TEXT_PAGE_MAPPING_UNCERTAIN` as an info warning when page-level Markdown mapping is not reliable enough for per-page metrics. + +Replacement candidate policy: + +- `replacement_candidate` is a diagnostic marker only. +- It must not change Markdown output.
+- It should be `true` only when: + - pypdf source text is available, + - pypdf Hangul count is materially higher than Markdown Hangul count or Markdown has suspicious CJK ideographs, + - pypdf spacing anomalies are not so severe that the source text layer is clearly unusable, + - `comparison_status` is `checked`. + +## Architecture Plan + +### WP13.1: Text Fidelity Module + +Actions: + +- Add `src/pdf2md/text_fidelity.py`. +- Use `pypdf.PdfReader` to extract source page text locally. +- Define immutable result records for per-page metrics and summary metrics. +- Strip Markdown syntax, image links, fenced code, inline code, and math spans before text comparison. +- Normalize text for comparison without mutating the output Markdown: + - Unicode NFKC normalization for comparison strings only. + - collapse whitespace for similarity only. + - keep raw-count metrics independent enough to expose spacing anomalies. +- Count Hangul syllables with the Unicode Hangul Syllables block (U+AC00–U+D7A3). +- Count suspicious CJK ideographs with CJK Unified Ideograph ranges, excluding Hangul ranges. +- Compute similarity with a deterministic standard-library algorithm such as `difflib.SequenceMatcher`. + +Expected output: + +- Pure local helper functions that are independently testable and do not call MinerU, network services, or the filesystem except for reading the source PDF. + +### WP13.2: Page Mapping Boundary + +Actions: + +- Derive source page numbers from `engine_options.chunk` when chunking is active. +- Use project page records and any reliable raw structured page count to decide whether page-level comparison is possible. +- If Markdown cannot be mapped to pages reliably, produce `TEXT_PAGE_MAPPING_UNCERTAIN` and avoid pretending per-page metrics are exact. +- For the initial implementation, allow a conservative fallback for single-page mocked outputs and chunk outputs where one Markdown file corresponds to a known source page range.
+ +Expected output: + +- Page-level diagnostics are only marked `checked` when the mapping is credible. +- Ambiguous cases are visible in metadata/report instead of producing misleading page metrics. + +### WP13.3: Metadata And Warning Integration + +Actions: + +- Add warning codes in `src/pdf2md/ir.py`. +- Add text fidelity fields to metadata without changing existing top-level fields used by current tests. +- Extend `build_summary()` to include text fidelity summary counts when diagnostics are present. +- Ensure warnings retain `page_index` where available. +- Preserve JSON serializability and deterministic key ordering on write. + +Expected output: + +- Metadata contains compact page-level text fidelity diagnostics and summary counts. +- Existing metadata consumers remain compatible. + +### WP13.4: Report Integration + +Actions: + +- Extend `render_report()` to render a `## Text Fidelity` section when diagnostics exist. +- Keep the report derived from metadata and quality results. +- Include low-fidelity pages and replacement candidate pages in human-readable form. +- Do not include full extracted page text in the report. + +Expected output: + +- A human can identify which pages need attention without opening metadata JSON first. + +### WP13.5: Conversion And Recheck Integration + +Actions: + +- Run text fidelity diagnostics during `convert` after final Markdown preparation and before metadata/report writing. +- Run the same diagnostics during `recheck` when the original source PDF path still exists. +- If the source PDF is missing during `recheck`, preserve existing behavior and add a clear nonfatal warning or omit diagnostics. +- Keep chunked conversion page ranges tied to original source page numbers. + +Expected output: + +- Fresh conversions and rechecks can produce text fidelity diagnostics without rerunning MinerU. 
+ +### WP13.6: Tests + +Default fast tests: + +- pypdf extraction boundary handles generated local PDFs without requiring real MinerU or sample files. +- Hangul count, unexpected CJK count, and spacing anomaly ratio helpers use direct Korean/CJK strings. +- Markdown text stripping ignores math, image links, fenced code, and inline code. +- Similarity score is deterministic for equivalent and degraded text. +- Metadata contains text fidelity summary fields when diagnostics are present. +- Report contains `## Text Fidelity` and page-level warning summaries. +- Conversion with a fake adapter records `TEXT_FIDELITY_LOW` when Markdown omits Hangul from a source-text PDF. +- Recheck reruns diagnostics when source PDF exists. +- Missing source PDF during recheck remains nonfatal. + +Optional local validation: + +- Convert the local 2007 Korean shell-structure sample with chunking to ignored `outputs\`. +- Confirm the report flags the pages where the previous output had missing Hangul and unexpected CJK characters. +- Do not commit sample PDFs or generated outputs. + +## Acceptance Criteria + +- Default tests pass without real MinerU, GPU, model files, network, Obsidian, or `samples/`. +- Diagnostics are local-only and use pypdf source text only from the local PDF. +- Metadata JSON records page-level text fidelity metrics where page mapping is credible. +- Metadata summary records aggregate text fidelity counts. +- `.report.md` includes a text fidelity section when diagnostics exist. +- Suspicious Korean text loss produces structured warnings with page provenance where available. +- Replacement candidate markers are recorded only as diagnostics and do not alter Markdown content. +- Existing math, asset, table, chunk, strict-local, and UI behavior remains unchanged. + +## Hard Failure Criteria + +- Markdown body text is replaced automatically in this sprint. +- Page-level metrics are reported as exact when page mapping is uncertain. 
+- Diagnostics upload PDFs, page text, Markdown, or extracted text to any remote service. +- Default tests require MinerU, CUDA/GPU, model files, network, Obsidian, or `samples/`. +- Existing output schema fields are removed or renamed. +- `samples/`, generated `outputs/`, or `dist/pdf2md-ui.exe` are committed. + +## Verification Commands + +```powershell +uv run pytest tests/test_text_fidelity.py tests/test_metadata.py tests/test_report.py tests/test_conversion.py +uv run pytest +git diff --check +git status --short --untracked-files=all +``` + +Optional local validation: + +```powershell +$env:MINERU_MODEL_SOURCE='local' +$pdf = (Get-ChildItem samples -Filter '2007*.pdf' | Select-Object -First 1).FullName +uv run pdf2md convert $pdf --out outputs\sprint13-2007-text-fidelity --overwrite --chunk-pages 5 +``` + +## Handoff Requirements + +After implementation: + +- Update `PROGRESS.md` with files changed, commands run, test outcomes, optional sample validation outcome, known failures, residual risks, and next action. +- Archive completed implementation details in `docs/WORKARCHIVE.md` after verification. +- Keep sample PDFs, generated outputs, and build artifacts out of the commit. +- Record whether page-level mapping was exact, approximate, or unavailable for the validated sample. + +## Implementation Handoff + +Files changed: + +- `src/pdf2md/text_fidelity.py` +- `src/pdf2md/ir.py` +- `src/pdf2md/metadata.py` +- `src/pdf2md/report.py` +- `src/pdf2md/conversion.py` +- `tests/test_text_fidelity.py` +- `tests/test_metadata.py` +- `tests/test_report.py` +- `tests/test_conversion.py` +- `ARCHITECTURE.md` +- `PLAN.md` +- `PROGRESS.md` +- `docs/WORKARCHIVE.md` +- `docs/V1IMPLEMENTATIONPLAN.md` + +Verification: + +- `uv run pytest tests/test_text_fidelity.py tests/test_metadata.py tests/test_report.py tests/test_conversion.py`: passed 49 tests. +- `uv run pytest`: passed 198 tests with 1 optional skip. + +Known failures: + +- None in the default fast test suite. 
+ +Residual risks: + +- Page-level Markdown mapping is only scored when credible. Multi-page Markdown without reliable page boundaries is reported as `TEXT_PAGE_MAPPING_UNCERTAIN` rather than guessed. +- Automatic body-text replacement remains out of scope and is not implemented. +- Optional real MinerU validation on the local 2007 Korean shell-structure sample was not run during implementation to avoid a long GPU conversion. + +## Future Sprint Boundary + +A later sprint may implement controlled body-text replacement from pypdf text after Sprint 13 diagnostics show reliable thresholds. That future sprint must have its own contract and must preserve math, tables, figures, asset links, and Markdown structure from MinerU unless explicitly redesigned. diff --git a/docs/Sprints/SPRINT14CONTRACT.md b/docs/Sprints/SPRINT14CONTRACT.md new file mode 100644 index 0000000..2ccf3ff --- /dev/null +++ b/docs/Sprints/SPRINT14CONTRACT.md @@ -0,0 +1,378 @@ +# Sprint 14 Contract: Single-Page Conversion With Grouped Outputs + +Status: Implemented +Last updated: 2026-05-11 + +## Objective + +Replace the current fixed-size pre-conversion chunking behavior with a safer long-PDF workflow: + +1. When chunk mode is active, split the source PDF into one-page temporary PDFs. +2. Convert each one-page PDF sequentially through the existing local MinerU CLI adapter. +3. Merge successful converted page Markdown into grouped output files after every configured output group size. +4. Keep the default output group size at 20 pages when `--chunk-pages` is supplied without a value. + +This sprint is motivated by local evidence from `samples/2007쉘구조물의유한요소해석에대하여.pdf`: a 5-page MinerU input chunk stalled on GTX 1070 Ti 8GB, while one-page conversion completed all 13 pages. + +## Current Precondition + +- MinerU 3.1.0 remains the only conversion engine. +- Conversion runs through direct local `mineru` CLI execution only. 
+- Strict-local allows only the direct CLI and MinerU CLI-internal temporary local `mineru-api`; remote API/backend paths remain prohibited. +- `pypdf` is already available and used for local PDF chunk planning and temporary chunk PDF writing. +- `pdf2md convert` currently supports `--chunk-pages [PAGES]`. +- Existing chunk mode currently treats `chunk_pages` as the MinerU input PDF page count and writes one final Markdown file per input chunk. +- `convert_pdf(..., chunk_pages=N)` currently returns `BatchConversionResult` in chunk mode. +- Sprint 13 text fidelity diagnostics are most accurate when each MinerU Markdown output maps to exactly one source page. + +## Contract Assumptions + +- Keep chunk mode opt-in for this sprint. If `chunk_pages` is `None`, the existing non-chunked full-PDF conversion path remains unchanged. +- Keep the public option name `--chunk-pages` for CLI/API compatibility, but redefine its behavior in chunk mode as the output group size, not the MinerU input size. +- If `--chunk-pages` is present without a value, use `DEFAULT_CHUNK_PAGES == 20` as the output group size. +- In chunk mode, even a PDF with fewer than `chunk_pages` pages is converted internally one page at a time and emitted as one grouped output file. +- Final grouped outputs are the public conversion results. Temporary per-page Markdown, metadata, reports, assets, and one-page PDFs are not retained unless a later sprint explicitly adds debug retention. 
+ +## Touched Surfaces + +Allowed during implementation: + +- `src/pdf2md/pdf_splitter.py` +- `src/pdf2md/conversion.py` +- `src/pdf2md/paths.py` +- `src/pdf2md/metadata.py` +- `src/pdf2md/report.py` +- `src/pdf2md/cli.py` +- `src/pdf2md_ui/app.py` +- `src/pdf2md_ui/runner.py` +- `tests/test_pdf_splitter.py` +- `tests/test_conversion.py` +- `tests/test_cli.py` +- `tests/test_paths.py` +- `tests/test_metadata.py` +- `tests/test_report.py` +- `tests/test_ui_runner.py` +- `README.md` +- `ARCHITECTURE.md` +- `docs/V1IMPLEMENTATIONPLAN.md` +- `PLAN.md` +- `PROGRESS.md` +- `docs/WORKARCHIVE.md` after implementation + +Allowed if a focused helper boundary keeps `conversion.py` simpler: + +- Create `src/pdf2md/page_grouping.py` +- Create `tests/test_page_grouping.py` + +Not allowed: + +- Adding another conversion engine or runtime engine selector. +- Running page conversions in parallel by default. GTX 1070 Ti 8GB memory pressure makes sequential conversion the safe default. +- Adding cloud OCR, hosted LLM/VLM, remote document parsing, `--api-url`, router mode, HTTP client backends, or remote OpenAI-compatible endpoints. +- Making default tests depend on real MinerU, GPU, model files, network, Obsidian, MathJax, or `samples/`. +- Committing sample PDFs, generated `outputs/`, retained temporary page outputs, or `dist/pdf2md-ui.exe`. + +## Product Behavior + +### Activation + +Existing non-chunked conversion remains unchanged: + +```powershell +uv run pdf2md convert paper.pdf --out outputs +``` + +Grouped page conversion is enabled by `--chunk-pages`: + +```powershell +uv run pdf2md convert paper.pdf --out outputs --chunk-pages +uv run pdf2md convert paper.pdf --out outputs --chunk-pages 20 +uv run pdf2md convert paper.pdf --out outputs --chunk-pages 1 +``` + +Behavior: + +- `--chunk-pages` means output group size. +- `--chunk-pages 20` converts pages 1, 2, 3, ... as independent one-page MinerU jobs, then emits grouped outputs covering pages 1-20, 21-40, and so on. 
+- `--chunk-pages 1` emits one final output file per source page. +- `convert_pdf(..., chunk_pages=N)` still returns `BatchConversionResult`; each `ConversionResult` represents one final grouped output file, not each internal one-page MinerU run. + +### Output Naming + +Use the existing part/page-range naming shape for grouped outputs: + +```text +<stem>.part-001.pages-001-020.md +<stem>.part-001.pages-001-020.metadata.json +<stem>.part-001.pages-001-020.report.md +<stem>.part-001.pages-001-020.assets/ + +<stem>.part-002.pages-021-040.md +... +``` + +If a 13-page PDF is converted with `--chunk-pages 20`, it emits: + +```text +<stem>.part-001.pages-001-013.md +<stem>.part-001.pages-001-013.metadata.json +<stem>.part-001.pages-001-013.report.md +<stem>.part-001.pages-001-013.assets/ +``` + +This is an intentional behavior change from Sprint 10: short PDFs in chunk mode no longer bypass chunk mode and no longer write `<stem>.md`. + +### Internal Page Conversion + +For every source page in chunk mode: + +- Write a one-page temporary PDF with pypdf. +- Run the existing local MinerU adapter against that one-page PDF. +- Normalize Markdown, copy page assets into a temporary page assets directory, run MathJax checks/repair, and run Sprint 13 text fidelity diagnostics against the original source page. +- Delete the one-page temporary PDF and temporary per-page final files after grouped output generation. + +The implementation should reuse existing conversion primitives where practical, but it must avoid writing final public files for every page before grouping. + +### Markdown Grouping + +For each output group: + +- Concatenate successful page Markdown in source page order. +- Separate pages with blank lines and an HTML comment that is invisible in Obsidian preview: + +```markdown + +``` + +- Do not add visible page headings or instructional text. +- If a page conversion fails, do not invent Markdown for that page.
Add an invisible comment at the page boundary: + +```markdown + +``` + +- Preserve Obsidian-friendly math delimiters and display math spacing after concatenation. + +### Asset Grouping + +Assets from temporary per-page outputs must be copied into the grouped assets directory with collision-proof names. + +Recommended destination layout: + +```text +.part-001.pages-001-020.assets/page-001/ +.part-001.pages-001-020.assets/page-002/ +``` + +Markdown image links must be rewritten to the grouped assets directory. This keeps repeated MinerU asset filenames from different pages from overwriting each other. + +### Metadata And Report Grouping + +Grouped metadata must be derived from per-page conversion records plus group-level checks. + +Required metadata behavior: + +- `source_pdf` remains the original source PDF path. +- `source_sha256` remains the original source PDF hash. +- `pages` contains one page record per source page in the group. +- Page indexes in grouped metadata are group-local zero-based indexes. +- Original source page numbers remain visible in chunk/page conversion provenance. +- Warnings from per-page conversions are preserved with adjusted group-local page indexes. +- Warnings for failed page conversions are added with original source page context. +- `text_fidelity` records are carried from one-page checks and keep exact `source_page_number` values. +- Summary counts are aggregated from the grouped metadata and grouped Markdown. 
+ +Required `engine_options` shape: + +```json +{ + "chunk": { + "original_source_pdf": "...", + "chunk_index": 1, + "total_chunks": 3, + "source_page_start": 1, + "source_page_end": 20, + "chunk_page_count": 20 + }, + "page_conversion": { + "mode": "single_page", + "mineru_input_page_count": 1, + "output_group_page_count": 20, + "failed_source_pages": [] + } +} +``` + +Report Markdown must continue to include the existing chunk context line and should add a concise page-conversion line, for example: + +```text +- Page conversion mode: single-page MinerU inputs, grouped output size: 20 +``` + +## Failure Policy + +- Convert pages sequentially. +- If a page fails, continue with later pages. +- If at least one page in a group succeeds, write the grouped Markdown/metadata/report and mark final status `partial`. +- If every page in a group fails, return a failed `ConversionResult` for that grouped output and do not write Markdown for that group. +- Failed pages must be visible in metadata/report warnings. +- There is no silent fallback and no retry loop in this sprint. + +## Architecture Plan + +### WP14.1: Page And Group Planning + +Actions: + +- Extend `pdf_splitter.py` or add `page_grouping.py` with project-owned records for: + - one-page MinerU input plans, + - final output group plans, + - original source page ranges, + - deterministic output stems. +- Keep pypdf page extraction local and temporary. +- Validate output group size as a positive integer. +- Plan output groups before conversion starts so overwrite/conflict behavior remains deterministic. + +Expected output: + +- A 41-page PDF with group size 20 plans 41 one-page MinerU inputs and 3 final grouped outputs. +- A 13-page PDF with group size 20 plans 13 one-page MinerU inputs and 1 final grouped output. + +### WP14.2: Conversion Orchestration + +Actions: + +- Rework chunk-mode `convert_pdf()` and `convert_input()` orchestration so `chunk_pages` creates grouped output tasks. 
+- Run one-page MinerU inputs in source-page order. +- Keep temporary page PDFs and intermediate page outputs under local temporary directories. +- Keep `BatchConversionResult` at the grouped-output level. +- Keep strict-local validation unchanged. + +Expected output: + +- The public API keeps returning multiple grouped results in chunk mode while the adapter is called once per source page internally. + +### WP14.3: Markdown And Asset Group Assembly + +Actions: + +- Build a focused helper to merge page Markdown and page assets into a grouped output. +- Insert invisible HTML comment page boundaries between pages, including the failed-page boundary comments. +- Rewrite per-page asset links to `page-NNN/` asset subdirectories. +- Run final group-level local quality checks after asset rewriting. + +Expected output: + +- Grouped Markdown renders in Obsidian and assets do not collide across pages. + +### WP14.4: Metadata, Warnings, And Report Assembly + +Actions: + +- Aggregate per-page metadata into grouped metadata. +- Adjust page indexes from page-local `0` to group-local indexes. +- Preserve original source page numbers in `engine_options` and text fidelity records. +- Add `page_conversion` engine options. +- Add a report line for single-page conversion mode and grouped output size. + +Expected output: + +- Metadata/report can explain both facts: MinerU saw one page at a time, while the user received grouped Markdown files. + +### WP14.5: CLI, UI, And Documentation + +Actions: + +- Update CLI help for `--chunk-pages` from "pre-conversion PDF chunking" to "group converted pages into output files of N pages; MinerU runs one page at a time." +- Update README and architecture docs with the new behavior. +- Update the Windows UI label/help text so the field represents output group size. +- Keep runner command construction using `--chunk-pages N`. + +Expected output: + +- Users do not confuse `--chunk-pages 20` with a 20-page MinerU input.
+ +### WP14.6: Tests + +Default fast tests: + +- Generated blank local PDFs verify page count and group planning for 1, 13, 20, 21, 40, and 41 pages. +- `--chunk-pages` without a value still passes `20`. +- `convert_pdf(..., chunk_pages=20)` for 41 pages calls the fake adapter 41 times and returns 3 grouped `ConversionResult` objects. +- `convert_pdf(..., chunk_pages=20)` for 13 pages calls the fake adapter 13 times and returns 1 grouped output named `part-001.pages-001-013`. +- `convert_pdf(..., chunk_pages=1)` returns one grouped output per source page. +- Temporary one-page PDFs and temporary per-page outputs are deleted after conversion. +- A failed internal page conversion does not stop later pages and appears in grouped metadata/report. +- A group with only failed pages returns a failed result and writes no Markdown. +- Asset filenames from different pages do not collide in the grouped assets directory. +- Per-page warnings and text fidelity records are adjusted to group-local page indexes while preserving original source page numbers. +- Existing non-chunked conversion tests keep passing unchanged. +- UI runner tests continue to build fixed argument lists with `shell=False`. + +Optional local validation: + +```powershell +$env:MINERU_MODEL_SOURCE='local' +$pdf = (Get-ChildItem samples -Filter '2007*.pdf' | Select-Object -First 1).FullName +uv run pdf2md convert $pdf --out outputs\sprint14-2007-page-grouped --overwrite --chunk-pages +``` + +Expected optional validation: + +- The 13-page Korean sample emits one grouped Markdown file for pages 1-13. +- Metadata/report show exact page-level text fidelity records. +- Generated outputs stay ignored and uncommitted. + +## Acceptance Criteria + +- Chunk mode runs MinerU on one-page temporary PDFs only. +- `chunk_pages` controls final grouped output page count. +- Default group size remains 20 when `--chunk-pages` is supplied without a value. 
+- Grouped Markdown, metadata JSON, report Markdown, and grouped assets directory are written. +- Grouped metadata preserves original source PDF, original source SHA-256, group page range, one-page conversion mode, page warnings, and text fidelity provenance. +- Failed page conversions are explicit, nonfatal to later pages, and visible in report/metadata. +- Default tests remain fast and local. +- Strict-local policy remains unchanged. +- Non-chunked conversion behavior remains backward-compatible. + +## Hard Failure Criteria + +- Chunk mode sends more than one source page to MinerU in a single temporary PDF. +- `--chunk-pages` continues to mean MinerU input chunk size after this sprint. +- Grouped outputs lose source page provenance or hide failed pages. +- Asset links collide or point outside the grouped assets directory. +- Default tests require real MinerU, GPU, model files, network, Obsidian, MathJax, or `samples/`. +- The implementation adds a remote API/backend path, alternate conversion engine, router mode, or OpenAI-compatible backend. +- Sample PDFs, generated outputs, retained temporary page outputs, or `dist/pdf2md-ui.exe` are committed. + +## Verification Commands + +```powershell +uv run pytest tests/test_pdf_splitter.py tests/test_conversion.py tests/test_cli.py tests/test_paths.py tests/test_metadata.py tests/test_report.py tests/test_ui_runner.py +uv run pytest +git diff --check +git status --short --untracked-files=all +``` + +Optional local validation command is listed in WP14.6 and should be run only when a long GPU conversion is acceptable. + +## Handoff Requirements + +After implementation: + +- Update `PROGRESS.md` with files changed, commands run, test outcomes, optional sample validation outcome, known failures, residual risks, and next action. +- Archive completed implementation details in `docs/WORKARCHIVE.md` after verification. 
+- Keep sample PDFs, generated outputs, retained temporary page outputs, and build artifacts out of the commit. +- Record whether the 2007 Korean sample was validated with grouped page conversion and how many grouped outputs were produced. + +Implementation handoff on 2026-05-11: + +- Implemented grouped page conversion in `src/pdf2md/conversion.py` with one-page temporary MinerU inputs and grouped public outputs. +- Added report output for `page_conversion` engine options. +- Updated CLI help, UI label text, README, architecture, implementation plan, and coordination/archive docs. +- Verification: targeted Sprint 14 tests passed, the 101-test related suite passed, and full `uv run pytest` passed 202 tests with 1 optional skip. +- Optional real MinerU validation on the 2007 Korean sample was not run during this implementation pass. + +## Future Sprint Boundary + +A later sprint may make grouped page conversion the default even without `--chunk-pages`, add resumable page caches, or add a debug option to retain intermediate per-page outputs. Those behaviors are intentionally out of Sprint 14 scope. diff --git a/docs/Sprints/SPRINT15CONTRACT.md b/docs/Sprints/SPRINT15CONTRACT.md new file mode 100644 index 0000000..a207115 --- /dev/null +++ b/docs/Sprints/SPRINT15CONTRACT.md @@ -0,0 +1,431 @@ +# Sprint 15 Contract: NVIDIA GPU Detection And Auto MinerU Profile + +Status: Implemented +Last updated: 2026-05-12 + +## Objective + +Add a strict-local runtime profiling layer that detects installed NVIDIA GPUs and applies conservative MinerU environment tuning by default. + +The default runtime profile is `auto`. In `auto`, the converter should keep 8GB and pre-Turing GPUs conservative, while allowing a slightly more aggressive local MinerU configuration only when the selected NVIDIA GPU has at least 16GB VRAM and no pre-Turing compatibility warning. 
+ +This sprint is motivated by local evidence from `samples\FourNodeQuadrilateralShellElementMITC4.pdf`: Sprint 14's one-page conversion path used `cuda:0` correctly, but GTX 1070 Ti 8GB stayed near full VRAM use and stalled on source page 2. The next useful test should be on a stronger NVIDIA GPU with explicit runtime diagnostics and reproducible MinerU environment settings. + +## Source Basis + +Use these source-backed facts during implementation: + +- MinerU CLI supports `mineru -p <input_path> -o <output_path>` and, without `--api-url`, launches a temporary local `mineru-api`: https://opendatalab.github.io/MinerU/usage/cli_tools/ +- MinerU CLI documents `-b/--backend`, `-f/--formula`, `-t/--table`, `--api-url`, and related options, but this project must not expose remote/API or backend selection paths in v1: https://opendatalab.github.io/MinerU/usage/cli_tools/ +- MinerU environment variables include `MINERU_PDF_RENDER_THREADS`, `MINERU_PROCESSING_WINDOW_SIZE`, `MINERU_API_MAX_CONCURRENT_REQUESTS`, and timeout settings: https://opendatalab.github.io/MinerU/usage/cli_tools/ +- MinerU advanced CLI docs support selecting visible GPU devices with `CUDA_VISIBLE_DEVICES`: https://opendatalab.github.io/MinerU/usage/advanced_cli_parameters/ +- MinerU local deployment docs list auto-engine GPU requirements around 8GB+ VRAM and GPU acceleration for Volta-or-later devices: https://opendatalab.github.io/MinerU/quick_start/ +- MinerU extension docs say `vllm` and `lmdeploy` acceleration extras are alternatives and should not both be installed just for this sprint: https://opendatalab.github.io/MinerU/quick_start/extension_modules/ + +Access date for the source review: 2026-05-12. + +## Current Precondition + +- MinerU 3.1.0 remains the only conversion engine. +- Conversion runs through direct local `mineru` CLI execution only. +- Strict-local allows only the direct CLI and MinerU CLI-internal temporary local `mineru-api`; remote API/backend paths remain prohibited.
+- `pdf2md convert` defaults to `--gpu cuda:0`. +- `MinerUAdapter` currently maps `cuda:N` to `MINERU_DEVICE_MODE=cuda` and `CUDA_VISIBLE_DEVICES=N`. +- `pdf2md doctor` already reports NVIDIA GPU visibility, PyTorch CUDA visibility, GPU names, and Pascal/pre-Turing warnings. +- Sprint 14 chunk mode runs one source page per MinerU invocation when `--chunk-pages` is active. + +## Contract Assumptions + +- Keep `--gpu cuda:0` as the default for backward compatibility with PRD and existing docs. +- Add `--gpu auto` as an opt-in GPU selection mode that chooses the visible NVIDIA GPU with the largest reported VRAM. +- Add `--mineru-profile {auto,safe,performance}` with default `auto`. +- Keep all conversion requests sequential in Sprint 15. Do not introduce parallel page conversion. +- Keep formula and table parsing enabled. Do not optimize by disabling required output quality features. +- Do not add `--backend`, `--api-url`, `--url`, router mode, HTTP client backend, remote OpenAI-compatible backend, or remote model server support. +- Treat MinerU environment tuning as best-effort. If GPU inventory cannot be read, continue with safe profile settings and a warning/provenance record rather than guessing aggressive values. 
+ +## Touched Surfaces + +Allowed during implementation: + +- Create `src/pdf2md/gpu.py` +- Create `src/pdf2md/mineru_profile.py` +- Modify `src/pdf2md/mineru_adapter.py` +- Modify `src/pdf2md/conversion.py` +- Modify `src/pdf2md/cli.py` +- Modify `src/pdf2md/doctor.py` +- Modify `src/pdf2md_ui/runner.py` only if the UI command builder needs profile passthrough +- Modify `src/pdf2md_ui/app.py` only if a minimal profile control is necessary +- Add `tests/test_gpu.py` +- Add `tests/test_mineru_profile.py` +- Modify `tests/test_mineru_adapter.py` +- Modify `tests/test_conversion.py` +- Modify `tests/test_cli.py` +- Modify `tests/test_doctor.py` +- Modify `tests/test_ui_runner.py` only if UI command construction changes +- Modify `README.md` +- Modify `ARCHITECTURE.md` +- Modify `PRD.md` if CLI option documentation changes +- Modify `docs/V1IMPLEMENTATIONPLAN.md` +- Modify `PLAN.md` +- Modify `PROGRESS.md` +- Modify `docs/WORKARCHIVE.md` after implementation + +Not allowed: + +- Adding another conversion engine or runtime engine selector. +- Passing `--api-url`, `--url`, or any remote endpoint to MinerU. +- Adding `mineru-router`, HTTP client backend, or OpenAI-compatible backend usage. +- Installing `vllm`, `lmdeploy`, CUDA packages, models, or any runtime package automatically. +- Changing the default conversion engine or disabling formula/table recognition. +- Making default tests depend on real MinerU, GPU, CUDA, PyTorch, model files, network, Obsidian, MathJax, or `samples/`. +- Committing sample PDFs, generated `outputs/`, retained temporary page outputs, local model files, or `dist/pdf2md-ui.exe`. 
+ +## Product Behavior + +### CLI + +Existing behavior remains valid: + +```powershell +uv run pdf2md convert paper.pdf --out outputs +uv run pdf2md convert paper.pdf --out outputs --gpu cuda:0 +``` + +New behavior: + +```powershell +uv run pdf2md convert paper.pdf --out outputs --mineru-profile auto +uv run pdf2md convert paper.pdf --out outputs --mineru-profile safe +uv run pdf2md convert paper.pdf --out outputs --mineru-profile performance +uv run pdf2md convert paper.pdf --out outputs --gpu auto --mineru-profile auto +``` + +Rules: + +- `--mineru-profile` defaults to `auto`. +- `--gpu cuda:N` selects a concrete CUDA index and tunes MinerU for that selected GPU when inventory is available. +- `--gpu N` is still normalized to `cuda:N`. +- `--gpu auto` selects the visible NVIDIA GPU with the largest VRAM from local GPU inventory. +- If `--gpu auto` cannot find a visible NVIDIA GPU, fail clearly before conversion rather than silently switching to CPU. +- If `--mineru-profile performance` is requested on a selected GPU below 16GB VRAM or with pre-Turing risk, downgrade to safe settings with a warning in metadata/report. Do not fail solely because performance was unsafe. + +### Doctor + +`pdf2md doctor` should report: + +- All visible NVIDIA GPUs with index, name, total VRAM, and driver version from `nvidia-smi`. +- PyTorch CUDA device names and compute capabilities when available. +- Selected default GPU recommendation for `--gpu auto`. +- Recommended MinerU profile for the detected primary GPU. +- Existing Pascal/pre-Turing warnings. + +Doctor must not require a real conversion, model load, network access, or package download. + +### Auto Profile Policy + +Use a small deterministic policy table. Values are intentionally conservative because the converter runs real PDFs and should prefer completion over peak throughput. 
+ +| Selected GPU | Auto policy | MinerU environment | +| --- | --- | --- | +| No GPU inventory, CUDA requested | Safe fallback with warning | `MINERU_PROCESSING_WINDOW_SIZE=1`, `MINERU_API_MAX_CONCURRENT_REQUESTS=1`, `MINERU_PDF_RENDER_THREADS=1` | +| Pre-Turing or VRAM < 12GB | Safe | `MINERU_PROCESSING_WINDOW_SIZE=1`, `MINERU_API_MAX_CONCURRENT_REQUESTS=1`, `MINERU_PDF_RENDER_THREADS=1` | +| 12GB <= VRAM < 16GB | Auto conservative | `MINERU_PROCESSING_WINDOW_SIZE=4`, `MINERU_API_MAX_CONCURRENT_REQUESTS=1`, `MINERU_PDF_RENDER_THREADS=2` | +| VRAM >= 16GB and Turing-or-newer | Auto moderately aggressive | `MINERU_PROCESSING_WINDOW_SIZE=8`, `MINERU_API_MAX_CONCURRENT_REQUESTS=1`, `MINERU_PDF_RENDER_THREADS=4` | +| Explicit `safe` | Safe regardless of GPU | `MINERU_PROCESSING_WINDOW_SIZE=1`, `MINERU_API_MAX_CONCURRENT_REQUESTS=1`, `MINERU_PDF_RENDER_THREADS=1` | +| Explicit `performance` on VRAM >= 16GB and Turing-or-newer | Performance | `MINERU_PROCESSING_WINDOW_SIZE=16`, `MINERU_API_MAX_CONCURRENT_REQUESTS=1`, `MINERU_PDF_RENDER_THREADS=4` | +| Explicit `performance` on weaker GPU | Downgraded safe with warning | safe values | + +Do not set `MINERU_HYBRID_BATCH_RATIO` in Sprint 15 because MinerU docs describe it as commonly used for `hybrid-http-client`, which this project prohibits in v1. + +Do not set backend CLI flags in Sprint 15. The default MinerU backend remains MinerU-owned. + +## Architecture Plan + +### WP15.1: GPU Inventory Boundary + +Actions: + +- Add `src/pdf2md/gpu.py`. +- Define immutable `GpuInfo` and `GpuInventory` records. +- Parse `nvidia-smi --query-gpu=index,name,memory.total,driver_version --format=csv,noheader,nounits`. +- Parse memory in MiB as an integer. +- Mark pre-Turing risk using the existing name-based heuristic for GTX 10xx and pre-Turing names. +- Optionally enrich compute capability through PyTorch when available, but keep PyTorch optional and mockable. +- Provide `select_gpu(gpus, requested)` for `cuda:N`, `N`, and `auto`. 
+ +Expected output: + +- GPU detection is independently testable with captured command output strings. +- No real `nvidia-smi`, GPU, or PyTorch is needed in default tests. + +### WP15.2: MinerU Profile Policy + +Actions: + +- Add `src/pdf2md/mineru_profile.py`. +- Define supported profile names: `auto`, `safe`, `performance`. +- Define a result record containing: + - requested profile, + - applied profile, + - selected GPU index if known, + - selected GPU name if known, + - selected GPU VRAM MiB if known, + - environment variables to set, + - warnings or info messages as project `WarningRecord` values. +- Implement the policy table above. +- Keep profile environment values in a small allowlist. + +Expected output: + +- The policy can be tested without running MinerU. +- Performance profile cannot silently overcommit weak GPUs. + +### WP15.3: Adapter Environment Integration + +Actions: + +- Extend `MinerUOptions` with `mineru_profile: str = "auto"` and optional resolved profile metadata. +- Keep strict-local validation for every option string. +- Update `_mineru_environment()` to merge: + - `MINERU_DEVICE_MODE=cuda`, + - `CUDA_VISIBLE_DEVICES=<selected_gpu_index>`, + - profile environment variables from `mineru_profile.py`. +- Restore previous environment values after subprocess execution. +- Include profile details in `engine_options`. + +Expected output: + +- Real MinerU still receives only direct local CLI command shape: + +```text +mineru -p <input_pdf> -o <output_dir> +``` + +- Tuning is done through local environment variables, not remote/API/backend flags. + +### WP15.4: Conversion And CLI Wiring + +Actions: + +- Add `--mineru-profile` to `pdf2md convert`. +- Accept `--gpu auto`. +- Resolve selected GPU and profile before calling the adapter. +- Surface profile warnings in conversion metadata/report warnings. +- Preserve existing `--gpu cuda:0` default. +- Ensure `convert_pdf()` can receive the profile through the Python API. + +Expected output: + +- Default conversions use `mineru_profile=auto`. 
+- Existing calls with no new flags continue to work. +- Metadata explains which profile was applied. + +### WP15.5: Doctor Reporting + +Actions: + +- Reuse `gpu.py` inventory parsing in `doctor.py`. +- Keep the existing `gpu` and `pytorch` checks, but make GPU details more explicit. +- Add a doctor detail line for auto-selected GPU and recommended profile. +- Keep warning-only behavior for Pascal/pre-Turing GPUs. + +Expected output: + +- On a stronger PC, `pdf2md doctor` shows enough evidence to decide whether `auto` or `performance` is appropriate. +- On the current GTX 1070 Ti, doctor still warns and recommends safe/conservative behavior. + +### WP15.6: Documentation + +Actions: + +- Update README setup and conversion docs with `--mineru-profile`. +- Update ARCHITECTURE to document that tuning uses strict-local environment variables only. +- Update PRD CLI section if the new public flag is added. +- Update `docs/V1IMPLEMENTATIONPLAN.md`, `PLAN.md`, and `PROGRESS.md`. +- Archive implementation details in `docs/WORKARCHIVE.md` only after implementation and verification. + +Expected output: + +- Users can move the repo to a stronger NVIDIA GPU PC, run `pdf2md doctor`, and understand the selected profile. + +## Tests + +Default fast tests: + +- GPU inventory parser handles one RTX GPU, multiple GPUs, no GPU lines, and malformed memory fields. +- `select_gpu(..., "auto")` selects the largest VRAM GPU. +- `select_gpu(..., "cuda:1")` selects index 1 and errors when absent. +- `select_gpu(..., "1")` normalizes to index 1. +- `auto` profile returns safe values for GTX 1070 Ti 8GB. +- `auto` profile returns moderately aggressive values for an RTX GPU with 16GB or more. +- `performance` profile returns performance values only for 16GB+ Turing-or-newer GPUs. +- `performance` profile on GTX 1070 Ti downgrades to safe and returns a warning. 
+- Adapter sets and restores `MINERU_DEVICE_MODE`, `CUDA_VISIBLE_DEVICES`, `MINERU_PROCESSING_WINDOW_SIZE`, `MINERU_API_MAX_CONCURRENT_REQUESTS`, and `MINERU_PDF_RENDER_THREADS`. +- Strict-local validation rejects remote/API/backend-like option strings in profile-related fields. +- CLI default passes `mineru_profile=auto`. +- CLI accepts `--mineru-profile safe` and `--mineru-profile performance`. +- CLI rejects invalid profile values. +- Doctor report includes visible GPU details and recommended profile with mocked command outputs. +- Existing conversion, chunking, metadata, report, and UI tests remain green. + +Optional local validation on a stronger NVIDIA GPU PC: + +```powershell +uv run pdf2md doctor +$env:MINERU_MODEL_SOURCE='local' +uv run pdf2md convert samples\FourNodeQuadrilateralShellElementMITC4.pdf --out outputs\fournode-sprint15-auto --overwrite --chunk-pages --gpu auto --mineru-profile auto --strict-local +``` + +Expected optional validation: + +- Doctor reports the stronger GPU name, VRAM, and recommended profile. +- Conversion metadata records `mineru_profile` and selected GPU information. +- Generated outputs stay ignored and uncommitted. + +## Acceptance Criteria + +- `--mineru-profile auto` is the default conversion behavior. +- `auto` uses safe settings on the current GTX 1070 Ti 8GB and stronger settings only on 16GB+ Turing-or-newer NVIDIA GPUs. +- `--gpu auto` can choose the largest visible NVIDIA GPU without adding remote/runtime backend support. +- MinerU command shape remains direct local CLI only. +- Strict-local prohibitions remain enforced. +- `pdf2md doctor` provides actionable GPU/profile information. +- Metadata/report preserve the applied runtime profile. +- Default tests remain fast, mocked, local, and independent of real MinerU/GPU/model files/network/samples. + +## Hard Failure Criteria + +- Implementation adds runtime backend selection or exposes `--backend`. 
+- Implementation passes `--api-url`, `--url`, router, HTTP client backend, or remote OpenAI-compatible backend values. +- `auto` profile applies aggressive settings to GTX 1070 Ti 8GB or other pre-Turing/low-VRAM GPUs. +- Existing `--gpu cuda:0` behavior breaks. +- Profile tuning disables formula or table parsing. +- Doctor or tests require real GPU, real MinerU execution, model files, network, Obsidian, MathJax, or `samples/`. +- Sample PDFs, generated outputs, local model files, or `dist/pdf2md-ui.exe` are committed. + +## Implementation Task Plan + +### Task 1: GPU Inventory + +Files: + +- Create `src/pdf2md/gpu.py` +- Create `tests/test_gpu.py` + +Steps: + +- [x] Add failing tests for parsing `nvidia-smi` CSV output. +- [x] Add failing tests for `auto`, `cuda:N`, and numeric GPU selection. +- [x] Implement immutable GPU records and parser helpers. +- [x] Implement selection errors as `ValueError` with clear messages. +- [x] Run `uv run pytest tests/test_gpu.py`. +- [x] Commit GPU inventory boundary. + +### Task 2: MinerU Profile Policy + +Files: + +- Create `src/pdf2md/mineru_profile.py` +- Create `tests/test_mineru_profile.py` + +Steps: + +- [x] Add failing tests for safe, auto, and performance profile policy. +- [x] Add tests proving 16GB+ Turing-or-newer GPUs get the moderately aggressive auto environment. +- [x] Add tests proving GTX 1070 Ti 8GB stays safe. +- [x] Implement the allowlisted environment mapping. +- [x] Run `uv run pytest tests/test_mineru_profile.py tests/test_gpu.py`. +- [x] Commit profile policy. + +### Task 3: Adapter And Conversion Wiring + +Files: + +- Modify `src/pdf2md/mineru_adapter.py` +- Modify `src/pdf2md/conversion.py` +- Modify `tests/test_mineru_adapter.py` +- Modify `tests/test_conversion.py` + +Steps: + +- [x] Add failing adapter tests for profile environment variables and environment restoration. +- [x] Add failing conversion tests that metadata receives applied profile information. 
+- [x] Extend `MinerUOptions` and conversion options minimally. +- [x] Merge GPU and profile environment variables before the MinerU subprocess. +- [x] Run `uv run pytest tests/test_mineru_adapter.py tests/test_conversion.py tests/test_mineru_profile.py tests/test_gpu.py`. +- [x] Commit adapter/conversion wiring. + +### Task 4: CLI And Doctor + +Files: + +- Modify `src/pdf2md/cli.py` +- Modify `src/pdf2md/doctor.py` +- Modify `tests/test_cli.py` +- Modify `tests/test_doctor.py` + +Steps: + +- [x] Add failing CLI tests for default `auto`, explicit `safe`, explicit `performance`, invalid profile rejection, and `--gpu auto`. +- [x] Add failing doctor tests for GPU inventory and recommended profile details. +- [x] Implement CLI argument parsing and doctor report additions. +- [x] Run `uv run pytest tests/test_cli.py tests/test_doctor.py tests/test_gpu.py tests/test_mineru_profile.py`. +- [x] Commit CLI and doctor wiring. + +### Task 5: UI And Documentation + +Files: + +- Modify `src/pdf2md_ui/runner.py` only if explicit UI profile passthrough is needed +- Modify `src/pdf2md_ui/app.py` only if explicit UI profile control is needed +- Modify `tests/test_ui_runner.py` only if runner command construction changes +- Modify `README.md` +- Modify `ARCHITECTURE.md` +- Modify `PRD.md` +- Modify `docs/V1IMPLEMENTATIONPLAN.md` +- Modify `PLAN.md` +- Modify `PROGRESS.md` +- Modify `docs/WORKARCHIVE.md` after implementation + +Steps: + +- [x] Keep UI unchanged if default CLI `auto` profile is enough for the first implementation pass. +- [x] If UI exposes a profile control, add tests for fixed argument-list construction with `shell=False`. +- [x] Document `--mineru-profile`, `--gpu auto`, profile policy, strict-local boundaries, and stronger-PC validation command. +- [x] Run focused docs/UI tests if changed. +- [x] Run final verification commands. +- [x] Commit documentation and final coordination updates. 
+ +## Verification Commands + +```powershell +uv run pytest tests/test_gpu.py tests/test_mineru_profile.py tests/test_mineru_adapter.py tests/test_conversion.py tests/test_cli.py tests/test_doctor.py +uv run pytest +git diff --check +git status --short --untracked-files=all +``` + +Optional stronger-PC validation is listed in the Tests section and must remain explicit opt-in. + +## Handoff Requirements + +After implementation: + +- Update `PROGRESS.md` with files changed, commands run, test outcomes, optional stronger-PC validation outcome, known failures, residual risks, and next action. +- Archive completed implementation details in `docs/WORKARCHIVE.md`. +- Keep generated outputs, sample PDFs, local model files, and UI build artifacts out of the commit. +- Record the detected GPU, applied profile, and whether `samples\FourNodeQuadrilateralShellElementMITC4.pdf` completed on the stronger PC. + +Implementation handoff: + +- Files changed: `src/pdf2md/gpu.py`, `src/pdf2md/mineru_profile.py`, `src/pdf2md/mineru_adapter.py`, `src/pdf2md/conversion.py`, `src/pdf2md/cli.py`, `src/pdf2md/doctor.py`, docs, and focused tests. +- Commands run: `uv run pytest tests/test_gpu.py tests/test_mineru_profile.py tests/test_mineru_adapter.py tests/test_conversion.py tests/test_cli.py tests/test_doctor.py`; `uv run pytest`; `uv run pdf2md doctor`. +- Tests passed: targeted Sprint 15 suite passed 101 tests; full default suite passed 225 tests with 1 optional skip; local doctor returned WARN with expected GTX 1070 Ti safe-profile recommendation. +- Known failures: optional stronger-PC real MinerU conversion validation was not run in this workspace. +- Residual risks: GTX 1070 Ti 8GB remains likely to stall on hard pages; stronger-PC behavior still needs local runtime validation. +- Next action: on a stronger NVIDIA GPU PC, run `pdf2md doctor` and an explicit local conversion with `--gpu auto --mineru-profile auto`. 
+ +## Future Sprint Boundary + +A later sprint may add page-level timeout handling, resumable page caches, or a performance mode that can run multiple page conversions concurrently on GPUs with enough VRAM. Those behaviors are intentionally out of Sprint 15 scope. diff --git a/docs/Sprints/SPRINT16CONTRACT.md b/docs/Sprints/SPRINT16CONTRACT.md new file mode 100644 index 0000000..3f06f26 --- /dev/null +++ b/docs/Sprints/SPRINT16CONTRACT.md @@ -0,0 +1,412 @@ +# Sprint 16 Contract: Simplified Output Layout + +Status: Implemented +Last updated: 2026-05-12 + +## Objective + +Simplify conversion outputs so each input PDF gets one predictable output folder named after the PDF stem, all images live under one `images` folder, Markdown parts use `_001`, `_002` numbering, one human-readable report is written per PDF, and no metadata JSON file is persisted. + +This sprint changes the public output contract. It supersedes the older v1 output layout that wrote sibling `.md`, `.assets`, `.metadata.json`, and `.report.md` files. + +## Product Output Contract + +For an input PDF: + +```text +paper.pdf +``` + +and output root: + +```text +outputs/ +``` + +write: + +```text +outputs/ + paper/ + paper_001.md + paper_002.md + paper_report.md + images/ + ... +``` + +Rules: + +- `paper` is the PDF stem, meaning the original filename without `.pdf`. +- A one-part conversion still writes `paper_001.md`. +- A multi-part conversion writes `paper_001.md`, `paper_002.md`, and so on. +- Part numbering uses at least three digits and grows only when the part count exceeds 999. +- All generated image and media assets for the PDF live under `paper/images/`. +- Markdown links must point to `images/`. +- The report is a single file at `paper/paper_report.md`. +- No `.metadata.json`, part metadata JSON, or sidecar metadata JSON is written. +- Internal metadata records may still be built in memory to produce reports, warnings, counts, and `ConversionResult` fields. 
+ +## Contract Assumptions + +- The user request "metadata is not needed" means metadata JSON should not be written as a user-facing output file. It does not mean removing internal metadata objects needed for report generation and warning aggregation. +- Keep `--chunk-pages` semantics from Sprint 14: when enabled, MinerU receives one source page per run and final Markdown files are grouped by `chunk_pages`. +- If `--chunk-pages` is absent, the whole PDF is still converted in one MinerU run and written as `_001.md`. +- Keep `--chunk-pages` without a value as the default grouping size of 20. +- Keep `--metadata` accepted as a backward-compatible no-op for one sprint, but update help text to say metadata JSON output is disabled in the simplified layout. +- `pdf2md recheck` remains supported only for legacy outputs that still have adjacent metadata JSON. New simplified outputs should fail recheck clearly until a later sprint designs metadata-free recheck. +- Recursive directory conversion should preserve the discovered relative parent before the PDF stem folder: `outputs///_001.md`. +- If two inputs would map to the same output folder and overwrite is false, fail during preflight. Do not invent automatic suffixes. +- `--keep-raw` should place raw MinerU diagnostics under `paper/raw/` so raw outputs do not clutter the main folder. + +## Touched Surfaces + +Allowed during implementation: + +- Modify `src/pdf2md/paths.py`. +- Modify `src/pdf2md/pdf_splitter.py` only if part naming needs helper support. +- Modify `src/pdf2md/conversion.py`. +- Modify `src/pdf2md/report.py` or add a focused aggregate report helper if one report needs multiple part summaries. +- Modify `src/pdf2md/cli.py`. +- Modify `src/pdf2md_ui/runner.py` and `src/pdf2md_ui/app.py` only if UI text or expected output descriptions mention metadata/report paths. +- Modify `tests/test_paths.py`. +- Modify `tests/test_conversion.py`. +- Modify `tests/test_cli.py`. +- Modify `tests/test_report.py`. 
+- Modify `tests/test_ui_runner.py` only if UI command/output assumptions change. +- Modify `tests/integration/test_v1_fast_release_gate.py`. +- Modify `tests/integration/test_optional_mineru_fixtures.py`. +- Modify `README.md`. +- Modify `PRD.md`. +- Modify `ARCHITECTURE.md`. +- Modify `docs/V1IMPLEMENTATIONPLAN.md`. +- Modify `PLAN.md`. +- Modify `PROGRESS.md`. +- Modify `docs/WORKARCHIVE.md` after implementation. + +Not allowed: + +- Do not change MinerU 3.1.0 as the fixed engine. +- Do not add another conversion engine. +- Do not add remote/API/backend paths. +- Do not change `--gpu`, `--mineru-profile`, or strict-local behavior except where report text reflects the new layout. +- Do not make default tests depend on real MinerU, GPU, CUDA, model files, network, Obsidian, MathJax, or `samples/`. +- Do not commit generated `outputs/`, sample PDFs, local model files, or `dist/pdf2md-ui.exe`. + +## Architecture Plan + +### WP16.1: Document-Level Output Layout + +Add or reshape path planning so final outputs are planned per source PDF folder instead of as sibling files. + +Expected final paths for a single PDF: + +```text +<output_root>/<pdf_stem>/<pdf_stem>_001.md +<output_root>/<pdf_stem>/images/ +<output_root>/<pdf_stem>/<pdf_stem>_report.md +``` + +Expected final paths for recursive input: + +```text +<output_root>/<relative_parent>/<pdf_stem>/<pdf_stem>_001.md +<output_root>/<relative_parent>/<pdf_stem>/images/ +<output_root>/<relative_parent>/<pdf_stem>/<pdf_stem>_report.md +``` + +Implementation guidance: + +- Keep `DiscoveredPdf.relative_parent` behavior. +- Add a focused part-planning helper rather than encoding final output names through fake temporary PDF filenames. +- Keep `PlannedOutput` if the existing conversion code can use it cleanly, but allow multiple Markdown parts to share the same `assets_dir` and `report_path`. +- Duplicate-path detection must reject duplicate Markdown files and raw directories, but it must allow shared `images/` and shared report paths for parts belonging to the same source PDF. 
+ +### WP16.2: Markdown Part Numbering + +Replace public part names: + +```text +paper.part-001.pages-001-020.md +paper.part-002.pages-021-040.md +``` + +with: + +```text +paper_001.md +paper_002.md +``` + +Rules: + +- Part index is based on final output group order, not source page number. +- The report must still record source page ranges for each part. +- Failed groups should not create a Markdown file, but the report must mention the failed part and source page range. + +### WP16.3: Shared Images Folder + +Replace per-output asset directories: + +```text +paper.part-001.pages-001-020.assets/ +paper.part-002.pages-021-040.assets/ +``` + +with: + +```text +paper/images/ +``` + +Implementation guidance: + +- Copy all assets for one source PDF into the shared `images/` folder. +- Rewrite Markdown links to `images/`. +- Use deterministic collision-safe filenames. Recommended pattern: + - page-known assets: `page-001_`, with `-002` suffixes when needed. + - page-unknown assets: `asset-001`, preserving the original suffix when available. +- Keep asset-link validation pointed at the shared `images/` directory. + +### WP16.4: One Report, No Metadata JSON + +Stop writing metadata JSON as a user-facing output file. + +Implementation guidance: + +- Continue building internal metadata dictionaries or records for each part so report generation and `ConversionResult` summaries stay traceable. +- Add an aggregate report path at `/_report.md`. +- The report must include: + - source PDF path, + - output folder path, + - Markdown part list with page ranges, + - engine and engine options, + - final status, + - warning count, + - asset count, + - missing/invalid asset link counts, + - inline/display formula counts, + - MathJax render error count, + - text fidelity summary when available, + - failed source pages or failed parts when any exist, + - warnings grouped by page or part. +- `ConversionResult.metadata_path` should be `None` for simplified outputs. 
+- `ConversionResult.report_path` should point to the shared report path. + +### WP16.5: CLI, UI, And Documentation + +Update user-facing docs and tests to remove metadata JSON as an expected output. + +Implementation guidance: + +- `pdf2md convert` summary may keep printing Markdown paths and warning counts. +- Update CLI help for `--metadata` to say metadata JSON output is disabled or deprecated in the simplified layout. +- Update README examples to show the new folder layout. +- Update PRD and ARCHITECTURE so they no longer claim metadata JSON is required as a public artifact. +- Keep internal provenance wording clear: warnings and report are still derived from internal metadata-like records. +- Update optional fixture documentation so generated metadata JSON is not required for sample validation. + +## Implementation Task Plan + +### Task 1: Path Planning For Simplified Layout + +Files: + +- Modify `src/pdf2md/paths.py`. +- Modify `tests/test_paths.py`. + +Steps: + +- [ ] Add failing tests showing `plan_outputs()` maps `paper.pdf` to `out/paper/paper_001.md`, `out/paper/images`, no metadata path, and `out/paper/paper_report.md`. +- [ ] Add a failing test for Korean filenames, using the PDF stem exactly as the output folder and file prefix. +- [ ] Add a failing test for recursive input preserving `relative_parent`. +- [ ] Add a failing test that duplicate source stems in the same relative parent conflict before conversion. +- [ ] Implement the minimal path planning changes. +- [ ] Run `uv run pytest tests/test_paths.py`. +- [ ] Commit path planning changes. + +### Task 2: Single-Output Conversion Writes Simplified Files + +Files: + +- Modify `src/pdf2md/conversion.py`. +- Modify `tests/test_conversion.py`. +- Modify `tests/test_cli.py`. + +Steps: + +- [ ] Add failing conversion tests showing a non-chunked fake-adapter conversion writes `out/paper/paper_001.md`, `out/paper/images`, and `out/paper/paper_report.md`. 
+- [ ] Add failing assertions that no `.metadata.json` file is written and `result.metadata_path is None`. +- [ ] Add failing CLI test showing `pdf2md convert paper.pdf --out out` creates the simplified folder. +- [ ] Implement the minimal conversion changes for non-chunked output. +- [ ] Run `uv run pytest tests/test_conversion.py tests/test_cli.py tests/test_paths.py`. +- [ ] Commit single-output conversion changes. + +### Task 3: Grouped Output Parts And Shared Images + +Files: + +- Modify `src/pdf2md/conversion.py`. +- Modify `src/pdf2md/pdf_splitter.py` only if a small helper is needed. +- Modify `tests/test_conversion.py`. +- Modify `tests/test_cli.py`. + +Steps: + +- [ ] Add failing tests for `chunk_pages=20` showing final Markdown names are `paper_001.md`, `paper_002.md`, not `paper.part-...md`. +- [ ] Add failing tests proving all grouped assets are copied into `paper/images/` and Markdown links use `images/...`. +- [ ] Add failing tests proving asset collisions across pages get deterministic unique filenames. +- [ ] Add failing tests proving failed page conversions are represented in the shared report while later pages still convert. +- [ ] Implement grouped output naming and shared image handling. +- [ ] Run `uv run pytest tests/test_conversion.py tests/test_cli.py tests/test_pdf_splitter.py`. +- [ ] Commit grouped output changes. + +### Task 4: Aggregate Report Without Metadata JSON + +Files: + +- Modify `src/pdf2md/report.py` or add a focused aggregate report helper. +- Modify `src/pdf2md/conversion.py`. +- Modify `tests/test_report.py`. +- Modify `tests/test_conversion.py`. + +Steps: + +- [ ] Add failing report tests for a one-file report listing multiple Markdown parts and source page ranges. +- [ ] Add failing conversion tests proving only one report exists for a chunked PDF. +- [ ] Add failing tests proving report summary totals combine all output parts. +- [ ] Add failing tests proving all-failed conversions write a report but no Markdown part. 
+- [ ] Implement aggregate report rendering from internal metadata records. +- [ ] Run `uv run pytest tests/test_report.py tests/test_conversion.py`. +- [ ] Commit report changes. + +### Task 5: Recheck, CLI Compatibility, UI Text, And Docs + +Files: + +- Modify `src/pdf2md/cli.py`. +- Modify `src/pdf2md/conversion.py`. +- Modify `src/pdf2md_ui/runner.py` and `src/pdf2md_ui/app.py` only if text/output assumptions change. +- Modify `README.md`. +- Modify `PRD.md`. +- Modify `ARCHITECTURE.md`. +- Modify `docs/V1IMPLEMENTATIONPLAN.md`. +- Modify `tests/test_cli.py`. +- Modify `tests/test_ui_runner.py` only if UI behavior changes. +- Modify `tests/integration/test_v1_fast_release_gate.py`. +- Modify `tests/integration/test_optional_mineru_fixtures.py`. + +Steps: + +- [ ] Add failing CLI tests proving `--metadata` remains accepted but no metadata JSON is written. +- [ ] Add failing recheck test proving simplified outputs without metadata fail with a clear legacy-metadata message. +- [ ] Update integration tests to require Markdown part files, one report, and image links, not metadata JSON. +- [ ] Update README, PRD, ARCHITECTURE, and release-gate wording for the simplified layout. +- [ ] Implement CLI/recheck/doc changes. +- [ ] Run `uv run pytest tests/test_cli.py tests/test_ui_runner.py tests/integration/test_v1_fast_release_gate.py`. +- [ ] Commit CLI, UI, integration, and documentation changes. + +### Task 6: Final Verification And Handoff + +Files: + +- Modify `PLAN.md`. +- Modify `PROGRESS.md`. +- Modify `docs/WORKARCHIVE.md` after implementation. +- Modify `docs/Sprints/SPRINT16CONTRACT.md` status and handoff fields. 
+ +Steps: + +- [ ] Run focused Sprint 16 verification: + +```powershell +uv run pytest tests/test_paths.py tests/test_conversion.py tests/test_cli.py tests/test_report.py tests/integration/test_v1_fast_release_gate.py +``` + +- [ ] Run full default verification: + +```powershell +uv run pytest +``` + +- [ ] Run diff check: + +```powershell +git diff --check +``` + +- [ ] Update `PROGRESS.md` with files changed, checks run, residual risks, and next actions. +- [ ] Archive completed implementation evidence in `docs/WORKARCHIVE.md`. +- [ ] Commit final coordination updates. + +## Verification Commands + +```powershell +uv run pytest tests/test_paths.py tests/test_conversion.py tests/test_cli.py tests/test_report.py tests/integration/test_v1_fast_release_gate.py +uv run pytest +git diff --check +git status --short --untracked-files=all +``` + +Optional local fixture validation after implementation: + +```powershell +$env:MINERU_MODEL_SOURCE='local' +uv run pdf2md convert samples\SolidElement.pdf --out outputs\SolidElement_sprint16_layout --overwrite --chunk-pages --gpu auto --mineru-profile auto --strict-local +``` + +Expected optional validation: + +- Output folder is `outputs\SolidElement_sprint16_layout\SolidElement\` for the command above, that is, the provided output root plus `SolidElement\`. +- Markdown part is `SolidElement_001.md` for the 6-page sample. +- Report is `SolidElement_report.md`. +- Images are under `images\`. +- No metadata JSON exists. + +## Acceptance Criteria + +- Each input PDF writes into an output folder named after the PDF stem. +- Markdown outputs are named `<pdf_stem>_001.md`, `<pdf_stem>_002.md`, and so on. +- All image/media assets for one PDF live under `<pdf_stem>/images/`. +- Markdown links point to `images/...`. +- Exactly one report file is written per input PDF at `<pdf_stem>/<pdf_stem>_report.md`. +- No metadata JSON file is written for new conversions. +- Internal warning, provenance, formula count, asset count, and text fidelity information remains available in the report. 
+- Chunk mode still converts one source page per MinerU run and groups Markdown by `chunk_pages`. +- Strict-local and MinerU-only constraints remain unchanged. +- Default tests stay fast and local. + +## Hard Failure Criteria + +- Any new conversion writes `.metadata.json` as a public output. +- Output files keep old `part-001.pages-...` names. +- Assets are split into per-part `.assets` folders. +- More than one report is written for one input PDF. +- Markdown links point outside the PDF output folder. +- Chunk mode stops using one source page per MinerU run. +- Strict-local enforcement is weakened. +- Default tests require real MinerU, GPU, model files, network, Obsidian, MathJax, or `samples/`. +- Sample PDFs, generated outputs, local model files, or `dist/pdf2md-ui.exe` are committed. + +## Open Questions + +- Should metadata-free `pdf2md recheck` be restored in a later sprint by deriving enough state from the report and Markdown, or is rerunning conversion acceptable for simplified outputs? +- Should raw MinerU outputs under `--keep-raw` be flattened into `raw/` or kept per part under `raw/_001/`? This contract recommends per-part raw folders to avoid collisions. + +## Handoff Requirements + +After implementation: + +- Update this contract status to `Implemented`. +- Record final file layout examples in `README.md`. +- Record verification commands and outcomes in `PROGRESS.md`. +- Archive implementation and optional sample validation results in `docs/WORKARCHIVE.md`. +- Keep generated outputs and sample PDFs uncommitted. + +## Implementation Handoff + +- Files changed: `src/pdf2md/paths.py`, `src/pdf2md/conversion.py`, `src/pdf2md/report.py`, `src/pdf2md/cli.py`, `src/pdf2md_ui/runner.py`, focused tests, and current docs. +- Output layout implemented: `//_001.md`, additional numbered parts when grouped, `//images/`, and `//_report.md`. 
+- Metadata JSON behavior: new conversions do not write public `.metadata.json`; `ConversionResult.metadata_path` is `None`; internal metadata-like records still feed reports and tests. +- Recheck behavior: `pdf2md recheck` remains legacy-only and requires adjacent metadata JSON. +- Verification recorded in `PROGRESS.md`: focused Sprint 16 tests passed, full `uv run pytest` passed 227 tests with 1 optional skip, and `git diff --check` passed with line-ending warnings only. diff --git a/docs/Sprints/SPRINT17CONTRACT.md b/docs/Sprints/SPRINT17CONTRACT.md new file mode 100644 index 0000000..3461129 --- /dev/null +++ b/docs/Sprints/SPRINT17CONTRACT.md @@ -0,0 +1,440 @@ +# Sprint 17 Contract: Offline Windows Installer + +Status: Abandoned +Last updated: 2026-05-13 + +## Abandonment Note + +Sprint 17 was abandoned at the user's request on 2026-05-13 before implementation began. This document remains as a historical planning record only. Do not implement or extend this contract unless the user explicitly reopens offline installer work. + +## Objective + +Create a large offline Windows installer that can install the existing local `pdf2md` runtime on another Windows PC without internet access. + +The installer must install or stage all application-owned files needed after download time: the minimal UI executable, the project runtime, a target-local Python virtual environment created from bundled wheels, CUDA PyTorch wheels, MinerU 3.1.0 wheels and dependencies, local MinerU model files, optional local Node.js/MathJax assets, Start Menu shortcuts, setup logs, and a post-install `pdf2md doctor` verification path. + +This sprint does not change conversion behavior. It packages the already implemented CLI/UI/runtime for offline use. + +## Product Decision + +The offline package should create the target PC virtual environment during installation instead of copying the current development `.venv`. 
+ +Reasoning: + +- Python virtual environments and console entry points often contain absolute paths and are not a reliable redistribution unit. +- A target-local `.venv` created from a bundled wheelhouse is more reproducible and easier to repair. +- The installer can keep the wheelhouse for offline repair, uninstall/reinstall, and audit. + +## Installer Shape + +Recommended installer technology: + +- Inno Setup for the Windows installer shell because it can compile scripts from the command line with `ISCC.exe`, returns deterministic exit codes, and is simple enough for a per-user installer. +- PowerShell scripts for payload build, target runtime install, and target verification. +- PyInstaller remains only the UI executable builder. It must not become the full MinerU/PyTorch/model bundler. + +Default install root: + +```text +%LOCALAPPDATA%\Programs\ConvertPDFToMD\ +``` + +Installed layout: + +```text +ConvertPDFToMD/ + app/ + pdf2md-ui.exe + runtime/ + pyproject.toml + uv.lock + README.md + src/ + tools/ + package.json + package-lock.json + .venv/ + payload/ + python/ + uv/ + wheelhouse/ + requirements-runtime-cu126.txt + models/ + node/ + node_modules/ + payload-manifest.json + SHA256SUMS.txt + THIRD_PARTY_NOTICES.md + scripts/ + install-runtime.ps1 + repair-runtime.ps1 + run-doctor.ps1 + logs/ +``` + +Generated artifacts that must remain untracked: + +```text +dist/offline-installer/ +dist/Pdf2MdOfflineSetup-*.exe +``` + +## Payload Contents + +The first offline payload targets Windows x64, Python 3.12, CUDA PyTorch `2.6.0+cu126`, `torchvision 0.21.0+cu126`, and `mineru[core]==3.1.0`. + +Required: + +- `dist/pdf2md-ui.exe` from the existing PyInstaller build. +- Tracked project runtime files needed to run `uv run pdf2md`. +- A Windows x64 Python 3.12 installer or an equivalent approved Python runtime package. +- A Windows x64 `uv.exe`. 
+- A wheelhouse containing: + - the current project wheel, + - `pypdf`, + - `torch==2.6.0`, + - `torchvision==0.21.0`, + - `mineru[core]==3.1.0`, + - all transitive Python runtime dependencies. +- Local MinerU model files and the model config template needed for `MINERU_MODEL_SOURCE=local`. +- A manifest listing every payload file, size, SHA-256 hash, source URL or local source, and license family. + +Optional but recommended: + +- Portable local Node.js runtime. +- `node_modules/` containing the locked MathJax checker dependencies from `package-lock.json`. + +Explicitly excluded: + +- `samples/`. +- `outputs/`. +- `.git/`. +- The development `.venv/`. +- Local generated PyInstaller `build/` folders and `.spec` files unless the implementation deliberately adds a stable project-owned spec file. +- NVIDIA GPU drivers and CUDA Toolkit installers. The installer may check for a compatible NVIDIA driver through `nvidia-smi`, but it should not redistribute GPU drivers in this sprint. + +## Touched Surfaces + +Allowed during implementation: + +- Create `packaging/offline/build-offline-payload.ps1`. +- Create `packaging/offline/verify-offline-payload.ps1`. +- Create `packaging/offline/install-runtime.ps1`. +- Create `packaging/offline/repair-runtime.ps1`. +- Create `packaging/offline/run-doctor.ps1`. +- Create `packaging/offline/Pdf2MdOffline.iss`. +- Create `packaging/offline/requirements-runtime-cu126.txt`. +- Create `packaging/offline/README.md`. +- Create `packaging/offline/THIRD_PARTY_NOTICES.md`. +- Create `src/pdf2md/packaging_manifest.py` only if a Python helper is simpler than repeating manifest logic in PowerShell. +- Modify `src/pdf2md_ui/runner.py` so the UI can resolve an installed target-local `.venv\Scripts\pdf2md.exe` before falling back to PATH or `uv run pdf2md`. +- Modify `src/pdf2md_ui/app.py` only if the project root default must prefer the installed runtime folder. +- Modify `tests/test_ui_runner.py`. +- Create `tests/test_offline_packaging.py`. 
+- Modify `README.md`. +- Modify `docs/V1RELEASECHECKLIST.md`. +- Modify `PLAN.md`. +- Modify `PROGRESS.md`. +- Modify `docs/WORKARCHIVE.md` after implementation. + +Not allowed: + +- Do not change MinerU 3.1.0 as the fixed conversion engine. +- Do not add a second conversion engine. +- Do not add runtime network calls, `--api-url`, router mode, remote APIs, HTTP client backends, remote OpenAI-compatible backends, or hosted renderers. +- Do not copy the development `.venv` as the installed runtime. +- Do not make default tests depend on real MinerU, GPU, model files, network, Obsidian, MathJax, Inno Setup, or `samples/`. +- Do not commit generated installer payloads, model files, wheelhouse files, Python installers, `dist/`, `outputs/`, or `samples/`. + +## Architecture Plan + +### WP17.1: Offline Payload Builder + +Add a build script that creates a clean staging folder under `dist/offline-installer/` with `app/`, `runtime/`, and `payload/` subfolders that mirror the final install layout. + +Responsibilities: + +- Rebuild `dist/pdf2md-ui.exe`. +- Build the project wheel into the staging wheelhouse. +- Download or collect Python wheels for the target runtime on a connected build PC. +- Collect the Windows Python runtime package and `uv.exe`. +- Copy project runtime files without `.git`, `.venv`, `outputs/`, `samples/`, and build trash. +- Copy local MinerU model files from a configured source path. +- Optionally copy portable Node.js and the locked `node_modules/`. +- Generate `payload-manifest.json` and `SHA256SUMS.txt`. +- Fail if any required file is missing or if any wheel dependency would require internet during installation. + +The builder may use `python -m pip download` on the connected build PC. The target installer must use only local files, for example `uv pip install --no-index --find-links`. + +### WP17.2: Target Runtime Installer + +Add a PowerShell install script that runs from the installed payload and creates the real runtime on the target PC. 
+ +Responsibilities: + +- Verify payload hashes before installing. +- Install or locate Python 3.12 x64. +- Create `runtime\.venv` on the target PC. +- Install packages from `payload\wheelhouse` with network disabled. +- Install the project wheel into the target `.venv`. +- Preserve the bundled wheelhouse for offline repair. +- Configure `MINERU_MODEL_SOURCE=local` for UI/CLI child processes. +- Configure local MinerU model paths without silently overwriting an unrelated user `mineru.json`. +- If `%USERPROFILE%\mineru.json` already exists and points elsewhere, prompt in interactive mode; in silent mode, fail clearly and leave `repair-runtime.ps1` instructions. +- Run `pdf2md doctor` and write the result to `logs\doctor-after-install.txt`. + +### WP17.3: UI Runtime Resolution + +Adjust the UI runner for an installed offline layout. + +Resolution order: + +1. Explicit configured `pdf2md` command. +2. Installed runtime `.venv\Scripts\pdf2md.exe` under the selected project root. +3. `pdf2md` on PATH. +4. Bundled `uv.exe` plus `uv run --offline pdf2md` under the selected project root. +5. Existing system `uv run pdf2md` fallback. + +Child environment rules: + +- Set `MINERU_MODEL_SOURCE=local` unless explicitly set. +- Add installed `.venv\Scripts` to PATH for runtime console scripts. +- Add installed portable Node.js path to PATH when bundled. +- Set `UV_OFFLINE=1` when using the installed offline runtime. +- Do not add remote endpoints or backend flags. + +### WP17.4: Inno Setup Installer + +Add an Inno Setup script that installs the payload and invokes the target runtime installer. + +Installer behavior: + +- Default to per-user install under `%LOCALAPPDATA%\Programs\ConvertPDFToMD`. +- Create Start Menu shortcuts for: + - `ConvertPDFToMD` UI, + - `PDF2MD Doctor`, + - `Repair PDF2MD Runtime`. +- Run `install-runtime.ps1` after files are copied. +- Show the doctor log path if setup finishes with WARN. 
+- Fail the install on target runtime setup failure unless the user explicitly chooses to keep files for manual repair. + +### WP17.5: License, Manifest, And Offline Verification + +Add docs and checks for redistribution risk. + +Required records: + +- Python, uv, PyInstaller, PyTorch, MinerU, model files, Node.js, MathJax, and transitive Python/npm dependency notices. +- A manifest with file hashes and source URLs. +- A clear statement that runtime conversion remains local-only and that setup payload creation can use internet only on the build PC. + +Verification tiers: + +- Fast tests use fake staging folders and fake wheel/model files. +- Build-PC packaging smoke can create the staging folder without committing payload. +- Offline target smoke uses a clean Windows VM with networking disabled. + +## Implementation Task Plan + +### Task 1: Packaging Manifest And Ignore Policy + +Files: + +- Create `tests/test_offline_packaging.py`. +- Create `src/pdf2md/packaging_manifest.py` if needed. +- Modify `.gitignore`. + +Steps: + +- Add failing tests for manifest generation with SHA-256, file size, relative path, and source label. +- Add failing tests that payload paths under `dist/offline-installer/`, wheelhouse files, model files, and generated installer executables stay ignored. +- Implement the smallest manifest helper or PowerShell-compatible JSON format. +- Run `uv run pytest tests/test_offline_packaging.py`. +- Commit manifest and ignore-policy changes. + +### Task 2: Offline Payload Builder + +Files: + +- Create `packaging/offline/build-offline-payload.ps1`. +- Create `packaging/offline/requirements-runtime-cu126.txt`. +- Create `packaging/offline/README.md`. +- Create `packaging/offline/verify-offline-payload.ps1`. +- Modify `tests/test_offline_packaging.py`. + +Steps: + +- Add tests that the builder rejects missing UI exe, missing model source, missing Python runtime package, missing `uv.exe`, and empty wheelhouse. 
+- Add tests that the builder excludes `.venv`, `.git`, `samples`, and `outputs`, and excludes `node_modules` unless it is explicitly copied as the optional locked MathJax payload. +- Implement payload staging, manifest generation, and payload verification. +- Run `uv run pytest tests/test_offline_packaging.py`. +- Run a dry build command that uses fake payload inputs. +- Commit builder changes. + +### Task 3: Target Runtime Install And Repair Scripts + +Files: + +- Create `packaging/offline/install-runtime.ps1`. +- Create `packaging/offline/repair-runtime.ps1`. +- Create `packaging/offline/run-doctor.ps1`. +- Modify `tests/test_offline_packaging.py`. + +Steps: + +- Add tests that scripts contain `--no-index`, `--find-links`, `UV_OFFLINE=1`, and no `http://` or `https://` target-install commands. +- Add tests that handling of an existing `mineru.json` is explicit and that the file is never silently overwritten. +- Implement target-local `.venv` creation, offline package install, model config handling, doctor logging, and repair flow. +- Run `uv run pytest tests/test_offline_packaging.py`. +- Commit install-script changes. + +### Task 4: UI Installed Runtime Resolution + +Files: + +- Modify `src/pdf2md_ui/runner.py`. +- Modify `src/pdf2md_ui/app.py` only if needed. +- Modify `tests/test_ui_runner.py`. + +Steps: + +- Add failing tests for project-root `.venv\Scripts\pdf2md.exe` resolution before PATH. +- Add failing tests for bundled `uv.exe` plus `uv run --offline pdf2md` fallback. +- Add failing tests that the child environment prepends `.venv\Scripts` and bundled Node.js when present. +- Implement the minimal runner changes. +- Run `uv run pytest tests/test_ui_runner.py`. +- Commit UI resolution changes. + +### Task 5: Inno Setup Script + +Files: + +- Create `packaging/offline/Pdf2MdOffline.iss`. +- Modify `tests/test_offline_packaging.py`. + +Steps: + +- Add tests that the Inno script references the expected payload directories, Start Menu shortcuts, and runtime install script. 
+- Add tests that the script does not reference `samples`, `outputs`, `.venv`, or remote URLs. +- Implement the Inno script. +- On a build PC with Inno Setup installed, run `ISCC.exe packaging\offline\Pdf2MdOffline.iss`. +- Commit installer-script changes without committing the generated installer. + +### Task 6: Documentation And Release Gate + +Files: + +- Modify `README.md`. +- Modify `docs/V1RELEASECHECKLIST.md`. +- Modify `docs/Sprints/SPRINT17CONTRACT.md`. +- Modify `PLAN.md`. +- Modify `PROGRESS.md`. +- Modify `docs/WORKARCHIVE.md` after implementation. + +Steps: + +- Document build-PC prerequisites and target-PC prerequisites. +- Document the offline artifact layout, expected size risk, and repair flow. +- Document the clean offline VM smoke test. +- Record final verification outcomes and residual risks. +- Commit documentation and handoff updates. + +## Verification Commands + +Default fast checks: + +```powershell +uv run pytest tests/test_offline_packaging.py tests/test_ui_runner.py +uv run pytest +git diff --check +git status --short --untracked-files=all +``` + +Build-PC packaging checks: + +```powershell +uv run --group ui-build pyinstaller --clean --onefile --windowed --name pdf2md-ui src\pdf2md_ui\app.py +$pythonInstaller = "C:\BuildCache\python-3.12-amd64.exe" +$uvExe = "C:\BuildCache\uv.exe" +$mineruModels = "C:\BuildCache\mineru-models" +powershell -ExecutionPolicy Bypass -File packaging\offline\build-offline-payload.ps1 -Configuration Release -PythonInstaller $pythonInstaller -UvExe $uvExe -MinerUModelSource $mineruModels +powershell -ExecutionPolicy Bypass -File packaging\offline\verify-offline-payload.ps1 -PayloadRoot dist\offline-installer\payload +ISCC.exe packaging\offline\Pdf2MdOffline.iss +``` + +Offline target smoke: + +```powershell +# Run on a clean Windows x64 VM with networking disabled after copying only the installer. 
+.\Pdf2MdOfflineSetup-*.exe +& "$env:LOCALAPPDATA\Programs\ConvertPDFToMD\scripts\run-doctor.ps1" +& "$env:LOCALAPPDATA\Programs\ConvertPDFToMD\runtime\.venv\Scripts\pdf2md.exe" --version +& "$env:LOCALAPPDATA\Programs\ConvertPDFToMD\runtime\.venv\Scripts\pdf2md.exe" doctor +``` + +Optional conversion smoke on the offline target: + +```powershell +& "$env:LOCALAPPDATA\Programs\ConvertPDFToMD\runtime\.venv\Scripts\pdf2md.exe" convert C:\LocalTest\SolidElement.pdf --out C:\LocalTest\outputs --overwrite --chunk-pages --gpu auto --mineru-profile auto --strict-local +``` + +Expected optional output: + +```text +C:\LocalTest\outputs\SolidElement\SolidElement_001.md +C:\LocalTest\outputs\SolidElement\SolidElement_report.md +C:\LocalTest\outputs\SolidElement\images\ +``` + +## Acceptance Criteria + +- The generated installer can install the runtime on a clean Windows x64 target without internet access. +- The target runtime has a newly created local `.venv`; it is not a copied development `.venv`. +- `pdf2md --version` runs from the installed `.venv`. +- `pdf2md doctor` runs without network access and reports all install-relevant failures or warnings clearly. +- The UI launches from the Start Menu and resolves the installed runtime without manual project-root configuration. +- MinerU uses local models through `MINERU_MODEL_SOURCE=local` and local model config. +- Python package installation uses only bundled local wheels. +- The wheelhouse and model payload are hash-verified before install. +- No generated payload, model file, wheel, installer exe, sample PDF, or conversion output is committed. +- Default tests remain fast and independent of real MinerU, GPU, model files, network, Inno Setup, MathJax, or `samples/`. + +## Hard Failure Criteria + +- The target installer downloads anything from the internet. +- The UI or CLI introduces a runtime document upload path. +- The installer silently overwrites an unrelated existing `mineru.json`. 
+- The installer copies the development `.venv` as the installed runtime. +- The installed UI cannot find `pdf2md` without manually editing settings on a clean install. +- `pdf2md doctor` is skipped or its failure is hidden. +- Payload hash verification is missing. +- License/model redistribution review is skipped before sharing the installer outside the current personal environment. +- NVIDIA drivers or CUDA Toolkit installers are redistributed in this sprint. + +## Open Risks + +- The final installer may be very large because CUDA PyTorch wheels, MinerU dependencies, model weights, and optional Node/MathJax assets are large. +- MinerU model redistribution terms and transitive package/model licenses must be reviewed before broader sharing. +- Target PCs still need compatible NVIDIA hardware and drivers. The installer can verify and report this, but it cannot guarantee GPU compatibility. +- Some conversions can still stall or run slowly on GTX 1070 Ti 8GB; packaging does not solve runtime performance. +- Inno Setup may need practical size and antivirus/SmartScreen validation once real model payloads are included. + +## Sources + +- PyInstaller usage: https://pyinstaller.org/en/stable/usage.html +- Inno Setup command-line compiler: https://documentation.help/Inno-Setup/topic_compilercmdline.htm +- uv CLI `--offline` behavior: https://docs.astral.sh/uv/reference/cli/ +- uv cache behavior: https://docs.astral.sh/uv/concepts/cache/ +- pip offline install/download behavior: https://pip.pypa.io/en/stable/cli/pip_install.html and https://pip.pypa.io/en/stable/cli/pip_download/ +- PyTorch previous version wheel command for CUDA 12.6: https://pytorch.org/get-started/previous-versions/ +- MinerU local model source behavior: https://opendatalab.github.io/MinerU/usage/model_source/ + +## Handoff Requirements + +After implementation: + +- Update this contract status to `Implemented` or record the failed gate. +- Record payload size and generated installer path in `PROGRESS.md`. 
+- Record verification commands and outcomes in `PROGRESS.md`. +- Archive implementation evidence and offline VM smoke results in `docs/WORKARCHIVE.md`. +- Keep generated offline payloads, wheels, model files, installer exe, `dist/`, `outputs/`, and `samples/` uncommitted. diff --git a/docs/Sprints/SPRINT9CONTRACT.md b/docs/Sprints/SPRINT9CONTRACT.md index fb31b47..bf23a66 100644 --- a/docs/Sprints/SPRINT9CONTRACT.md +++ b/docs/Sprints/SPRINT9CONTRACT.md @@ -134,7 +134,7 @@ Not allowed: - Do not run model setup automatically. - Do not require the local GTX 1070 Ti to pass CUDA/PyTorch checks in the default test loop. - Do not improve OCR/model accuracy. -- Do not introduce a manual review UI or web UI. +- Do not introduce a manual review UI, hosted web UI, or local desktop launcher in Sprint 9. - Do not add alternate conversion engines or fallback engines. - Do not benchmark against cloud OCR/API services. - Do not commit sample PDFs, sample-derived outputs, or large binary fixtures. diff --git a/docs/UI_RESEARCH.md b/docs/UI_RESEARCH.md new file mode 100644 index 0000000..cb016f8 --- /dev/null +++ b/docs/UI_RESEARCH.md @@ -0,0 +1,237 @@ +# UI Research: Minimal Windows Launcher For pdf2md + +Last updated: 2026-05-11 + +## Scope + +User request: + +- Build a minimal UI that uses the existing `pdf2md` CLI. +- Build it into a Windows `.exe`. +- Research the implementation path before coding. + +This document is research and planning input only. It does not change runtime behavior. + +## Current Project Fit + +The existing converter is already centered on a CLI: + +```powershell +uv run pdf2md doctor +uv run pdf2md convert INPUT --out OUTPUT --overwrite +uv run pdf2md recheck OUTPUT.md +``` + +The UI should preserve the current architecture: + +- Use MinerU 3.1.0 through the direct local `mineru` CLI only. +- Keep strict-local behavior. Do not expose `--api-url`, remote endpoints, router mode, cloud OCR, remote LLMs, or external document uploads. 
+- Treat the UI `.exe` as a launcher for the existing local runtime, not as a fully self-contained bundle of MinerU, PyTorch, CUDA DLLs, local models, Node.js, and MathJax. +- Keep generated Markdown parts, report Markdown, assets, and raw output behavior owned by the existing CLI. + +## Recommendation + +Use a thin Python desktop launcher: + +- UI framework: `tkinter` plus `tkinter.ttk`. +- CLI execution: `subprocess.Popen` with `shell=False`, argument lists, a worker thread, and a queue back to the UI thread. +- Packaging: PyInstaller `--onefile --windowed` for a lightweight `pdf2md-ui.exe`. +- Runtime command: prefer `pdf2md` if it is on `PATH`; otherwise run `uv run pdf2md` with a configured project root. + +This is the lowest-risk path because `tkinter` is in the Python standard library, `ttk` provides native themed widgets, and PyInstaller directly supports graphical windowed apps on Windows. The UI remains small and avoids bundling the large GPU conversion stack into the UI executable. + +## Why Not Bundle The Whole Converter Into One EXE + +Bundling the full conversion runtime into a single executable is not a good v1 target: + +- The runtime includes CUDA PyTorch, MinerU, model files, optional Node.js/MathJax support, and local cache/config state. +- Model weights and transitive licenses are already documented as redistribution-sensitive. +- One-file executables extract at startup; large bundles can start slowly and create antivirus or SmartScreen friction. +- The project already uses `uv` and a known local `.venv`; the UI can call that stable runtime. + +Recommended v1 interpretation of ".exe": + +- Build `pdf2md-ui.exe` as the desktop UI. +- Require the local converter runtime to be installed and pass `pdf2md doctor`. +- Let the UI surface doctor failures clearly instead of pretending to be a complete installer. + +Future redistribution can be revisited later as a separate packaging and license sprint. 
+ +## UI Framework Options + +| Option | Fit | Pros | Cons | Decision | +| --- | --- | --- | --- | --- | +| `tkinter` + `ttk` | Strong | Standard library, native file dialogs, themed widgets, minimal dependencies, easy PyInstaller build. Python docs warn that long work must not block Tk's single-threaded event loop, which matches a worker-thread runner design. | Visual polish is modest. Advanced drag/drop usually needs extra packages. | Recommended for v1. | +| PySide6 / Qt for Python | Medium | Polished widgets, strong desktop model, official Python bindings. | Adds large Qt dependency, LGPL/commercial considerations, more complex deployment. Qt docs describe PyInstaller and Nuitka paths, plus caveats around virtualenv/system package selection and Qt plugin bundling. | Keep as a later polish option. | +| CustomTkinter | Medium | More modern look on top of Tkinter. | Official wiki notes PyInstaller packaging data-file issues and recommends `--onedir` instead of `--onefile`. Adds dependency for mostly visual benefit. | Avoid for v1. | +| Flet | Low/medium | Modern Flutter-based Python UI, official `flet build windows`. | Windows packaging requires Visual Studio 2022 with Desktop development with C++ workload. Heavier stack than needed for a form/log launcher. | Avoid for v1. | +| Tauri | Low | Sidecar pattern can embed external binaries and produce polished small desktop apps. | Requires Rust and frontend stack, sidecar permissions, target-triple binary naming, and more architecture than needed. | Avoid for v1. | +| Briefcase | Medium | Produces Windows app folders, MSI installers, and ZIPs; useful for installer-style distribution. | More installer-oriented than needed for a first thin launcher. | Consider after v1 UI works. | + +## Packaging Options + +| Tool | Relevant facts | Fit | +| --- | --- | --- | +| PyInstaller | Supports one-folder and one-file bundles. On Windows it can create graphical apps without a console window. 
`--onefile`, `--windowed`, `--name`, `--icon`, and spec files cover the expected needs. PyInstaller's license includes an exception allowing bundled applications to be shipped under the application's own license, subject to dependency licenses. | Recommended. | +| Nuitka | Can create standalone, onefile, and app-mode outputs, and emits `.exe` on Windows. Requires a C compiler/toolchain and has a longer, more complex build. | Good later if PyInstaller output has startup or AV problems. | +| `pyside6-deploy` | Official Qt for Python deployment tool wrapping Nuitka. Produces `.exe` on Windows. | Only relevant if choosing PySide6. | +| Briefcase | Windows outputs include app folders plus MSI or ZIP packaging. Uses an embedded Python distribution. | Useful for installer sprint, not the first UI executable. | +| Flet build | Official Windows build path exists but requires Visual Studio C++ workload. | Too much setup for this project. | + +## CLI Runner Design + +The UI should not call MinerU directly. It should call the project-owned CLI: + +```text +pdf2md doctor +pdf2md convert INPUT --out OUTPUT --overwrite --gpu cuda:0 +pdf2md recheck OUTPUT.md +``` + +Command resolution: + +1. If the configured command exists, use it. +2. Else if `pdf2md` is on `PATH`, run `pdf2md`. +3. Else if `uv` is on `PATH` and a configured project root contains `pyproject.toml`, run `uv run pdf2md` with `cwd` set to that project root. +4. Else show a setup error and suggest running `pdf2md doctor` in the repository. + +Subprocess rules: + +- Always pass an argument list with `shell=False`. +- Set `cwd` explicitly when running through `uv`. +- Set `MINERU_MODEL_SOURCE=local` in the child environment unless the user already set it. +- Merge stderr into stdout for a single UI log stream. +- Read output line by line in a background thread. +- Communicate to Tk through `queue.Queue` and `root.after(...)`. +- Store the process PID so Cancel can terminate it. + +Cancellation on Windows: + +- First call `Popen.terminate()`. 
+- If the process does not exit promptly, call `taskkill /pid PID /t /f` (where `PID` is the stored process ID) to end the process tree. Microsoft documents `/t` as ending child processes and `/f` as forceful termination. + +Current limitation: + +- The existing MinerU adapter uses `subprocess.run(..., capture_output=True)` inside `pdf2md`, so detailed MinerU progress may not stream until the CLI completes. The v1 UI should use an indeterminate progress bar plus final CLI output. A future CLI sprint can add streaming progress/events if needed. + +## Minimal UI Shape + +Single window, no landing page: + +- Input PDF: file picker. +- Output directory: directory picker, defaulting to `outputs/`. +- Options: + - `Overwrite` checkbox. + - `Keep raw MinerU output` checkbox. + - `Group pages` checkbox plus numeric field, default `20`. + - `GPU` field, default `cuda:0`. +- Buttons: + - `Doctor`. + - `Convert`. + - `Cancel`. + - `Open output`. +- Status: + - Indeterminate progress bar while running. + - Read-only log pane. + - Last output paths from CLI/report when conversion completes. + +No v1 drag/drop, batch queue, config editor, PDF preview, Markdown preview, or Obsidian integration. Those would add scope without helping the first `.exe` workflow. + +## Build Shape + +Proposed files: + +```text +src/ + pdf2md_ui/ + __init__.py + app.py + runner.py +tests/ + test_ui_runner.py +``` + +Proposed dependency policy: + +- No runtime GUI dependency beyond the standard library. +- Add PyInstaller only to a local dependency group such as `ui-build`, not to the converter runtime dependencies. + +Proposed build commands: + +```powershell +uv add --group ui-build "pyinstaller>=6.20,<7" +uv run --group ui-build pyinstaller --clean --onefile --windowed --name pdf2md-ui src\pdf2md_ui\app.py +``` + +Expected artifact: + +```text +dist/pdf2md-ui.exe +``` + +The built UI executable should be tested from the repository first, because `uv run pdf2md` needs a project root. 
If the executable is moved elsewhere, the UI should ask for and remember the project root in a small settings file under `%APPDATA%\pdf2md-ui\settings.json`. + +## Verification Plan + +Fast tests: + +- Command resolution with fake PATH/project-root cases. +- Command construction for `doctor`, `convert`, `recheck`. +- No generated command contains prohibited strict-local tokens such as `--api-url`, `http://`, `https://`, `router`, or `openai`. +- Output-directory defaulting for ASCII and non-ASCII PDF names using temporary files. +- Cancel path calls the Windows process-tree termination helper when needed, using a mocked process. + +Build verification: + +```powershell +uv run pytest tests/test_ui_runner.py +uv run --group ui-build pyinstaller --clean --onefile --windowed --name pdf2md-ui src\pdf2md_ui\app.py +Test-Path dist\pdf2md-ui.exe +``` + +Manual smoke verification: + +1. Launch `dist\pdf2md-ui.exe`. +2. Run Doctor from the UI. +3. Select a small local sample PDF. +4. Convert to an ignored `outputs/` folder. +5. Confirm the UI reports completion and the simplified output folder contains `*_001.md`, `images/`, and `*_report.md`. + +## Security, Privacy, And Distribution Notes + +- The UI must not introduce any network document path. +- The UI must not expose arbitrary command execution. It should build fixed `pdf2md` argument lists from validated fields. +- Use `shell=False`; never concatenate user-provided paths into a command string. +- Do not store PDF contents or extracted text in settings. +- Do not include sample PDFs or generated outputs in the build or commit. +- Unsigned Windows executables may trigger SmartScreen. Microsoft documents that unsigned files start with no reputation, and even signed new binaries can show warnings until reputation accumulates. Code signing can be planned later if the tool is distributed beyond personal use. +- If signing is added later, SignTool from the Windows SDK is the documented Microsoft tool. 
Current SignTool docs require digest options such as `/fd` and `/td`, with SHA-256 recommended. + +## Open Risks + +- A thin launcher depends on an installed and healthy local runtime. The UI must make `doctor` prominent. +- Current CLI progress is coarse because `pdf2md` captures MinerU subprocess output. This is acceptable for v1 but limits progress detail. +- Cancelling a conversion can leave partially written ignored outputs; the UI should label a cancelled run clearly and not delete user-selected output directories unless a later requirement defines cleanup. +- If the UI is redistributed, licenses for MinerU, PyTorch, Qt if ever used, model weights, and bundled tools must be reviewed before packaging more than the thin UI launcher. + +## Sources + +- Python `tkinter`: https://docs.python.org/3/library/tkinter.html +- Python `tkinter.ttk`: https://docs.python.org/3/library/tkinter.ttk.html +- Python `subprocess`: https://docs.python.org/3/library/subprocess.html +- PyInstaller usage: https://pyinstaller.org/en/stable/usage.html +- PyInstaller requirements: https://pyinstaller.org/en/stable/requirements.html +- PyInstaller license: https://pyinstaller.org/en/stable/license.html +- PyInstaller runtime information: https://pyinstaller.org/en/stable/runtime-information.html +- Nuitka user manual: https://nuitka.net/user-documentation/user-manual.html +- Qt for Python PyInstaller deployment: https://doc.qt.io/qtforpython-6/deployment/deployment-pyinstaller.html +- `pyside6-deploy`: https://doc.qt.io/qtforpython-6.5/deployment/deployment-pyside6-deploy.html +- Qt for Python licenses: https://doc.qt.io/qtforpython-6/licenses.html +- Flet build: https://flet.dev/docs/cli/flet-build/ +- Flet Windows packaging: https://flet.dev/docs/publish/windows/ +- Tauri sidecars: https://tauri.app/develop/sidecar/ +- Briefcase Windows packaging: https://briefcase.beeware.org/en/latest/reference/platforms/windows/ +- uv dependency groups: 
https://docs.astral.sh/uv/concepts/projects/dependencies/ +- Microsoft `taskkill`: https://learn.microsoft.com/en-us/windows-server/administration/windows-commands/taskkill +- Microsoft SmartScreen reputation: https://learn.microsoft.com/en-us/windows/apps/package-and-deploy/smartscreen-reputation +- Microsoft SignTool: https://learn.microsoft.com/en-us/windows/win32/seccrypto/signtool diff --git a/docs/V1IMPLEMENTATIONPLAN.md b/docs/V1IMPLEMENTATIONPLAN.md index 7a0207b..5ac9046 100644 --- a/docs/V1IMPLEMENTATIONPLAN.md +++ b/docs/V1IMPLEMENTATIONPLAN.md @@ -1,28 +1,45 @@ # V1 Implementation Plan: Local PDF-to-Markdown Converter -Last updated: 2026-05-08 +Last updated: 2026-05-13 -This document is the implementation plan for v1. It does not replace `PRD.md` or `ARCHITECTURE.md`; use those files as the source of product requirements and system design. This plan explains the order of work, sprint contracts, verification gates, and agent ownership for implementing the converter. +This document tracks the current v1 implementation state and open future decisions. It does not replace `PRD.md` or `ARCHITECTURE.md`; use those files as the source of product requirements and system design. Completed sprint details are archived in `docs/WORKARCHIVE.md`, and detailed acceptance criteria remain in `docs/Sprints/*.md`. -Sprint 1 created the Python package scaffold and CLI placeholder. Sprint 2 created path planning. Sprint 3 created project-owned records and metadata construction. Sprint 4 created the mocked direct local MinerU adapter boundary. Sprint 5 created the Obsidian Markdown normalization boundary. Sprint 6 created local quality-check and report-rendering boundaries. Sprint 7 implemented conversion orchestration, the public conversion API, and the `pdf2md convert` CLI path with fake-adapter tests. Sprint 8 implemented mockable doctor diagnostics, the `pdf2md doctor` CLI path, and setup documentation. 
Sprint 9 implemented fast mocked integration tests, explicit opt-in local MinerU fixture evaluation, and the v1 release checklist. Sprint 10 implemented opt-in pre-conversion PDF chunking for long documents. Sprint 11 implemented conservative MathJax warning mitigation for failed math spans. +## 1. Current V1 State -## 1. V1 Outcome +The core v1 converter is implemented through Sprint 16. The implemented system includes: + +- Python 3.12 package and `pdf2md` CLI. +- Direct local MinerU 3.1.0 CLI adapter with strict-local enforcement. +- Obsidian-friendly Markdown normalization. +- Internal provenance, structured warnings, quality checks, and one human-readable report. +- `pdf2md doctor`. +- Optional grouped page conversion through `--chunk-pages`. +- Local MathJax render checking and conservative failed-span repair. +- pypdf-based text fidelity diagnostics. +- NVIDIA GPU inventory, `--gpu auto`, and `--mineru-profile auto|safe|performance`. +- Simplified output layout: `<out>/<stem>/<stem>_001.md`, shared `<out>/<stem>/images/`, and `<out>/<stem>/<stem>_report.md`. +- No public metadata JSON for new conversions. +- Minimal Windows UI launcher over the existing CLI, including direct-folder PDF batch conversion through sequential `pdf2md convert` subprocesses. + +Historical implementation evidence, verification commands, and sample conversion results are in `docs/WORKARCHIVE.md`. + +## 2. V1 Outcome v1 is complete when a local user can run: ```bash uv run pdf2md doctor -uv run pdf2md convert paper.pdf --out out --metadata -uv run pdf2md convert pdfs --out out --recursive --metadata +uv run pdf2md convert paper.pdf --out out +uv run pdf2md convert pdfs --out out --recursive ``` and receive, for each PDF: -- Obsidian-friendly Markdown. -- A stable sibling assets directory when assets exist. -- `.metadata.json`. -- `.report.md`. -- Clear warnings when math, tables, assets, reading order, GPU availability, or MinerU execution are uncertain.
+- Obsidian-friendly Markdown parts under `<out>/<stem>/<stem>_001.md`, `<stem>_002.md`, and so on. +- A stable shared image/media directory under `<out>/<stem>/images/`. +- One human-readable report under `<out>/<stem>/<stem>_report.md`. +- No persisted metadata JSON for new conversions. +- Clear warnings when math, tables, assets, reading order, text fidelity, GPU availability, or MinerU execution are uncertain. Long PDFs can be chunked explicitly: @@ -31,11 +48,11 @@ uv run pdf2md convert paper.pdf --out out --chunk-pages uv run pdf2md convert paper.pdf --out out --chunk-pages 20 ``` -Chunked conversion writes separate outputs per chunk and does not merge Markdown files. +When `--chunk-pages` is active, MinerU receives one-page temporary PDFs and final Markdown files are grouped by the configured page count. Temporary one-page PDFs and intermediate per-page outputs are deleted. -The converter must use MinerU 3.1.0 through direct local CLI execution only. It must not silently fallback to another engine. +The Windows UI launcher is a convenience wrapper over `pdf2md`; it is not a separate conversion pipeline. UI folder batch conversion runs direct-child PDFs sequentially through the same CLI conversion path. -## 2. Non-Negotiable Constraints +## 3. Non-Negotiable Constraints - Python 3.12 and `uv`. - MinerU 3.1.0 is the only conversion engine. @@ -45,34 +62,10 @@ The converter must use MinerU 3.1.0 through direct local CLI execution only. It - Target hardware: NVIDIA GTX 1070 Ti 8GB. - Digital PDFs with text layers are the v1 priority. - `samples/` is local fixture context and must not be committed unless explicitly requested. +- UI launcher must invoke `pdf2md` or `uv run pdf2md`; it must not call MinerU directly or bundle the full conversion runtime. - Every substantial implementation chunk needs a sprint contract and independent evaluation. -## 3. Harness Operating Model - -Use the project long-running harness only for substantial implementation work. - -1.
`harness-planner-agent` turns the next user request into a sprint contract. -2. `evaluation-agent` reviews the contract before code changes start. -3. `feature-generator-agent` implements one approved contract at a time. -4. `feature-generator-agent` runs self-checks and records residual risks. -5. `evaluation-agent` independently verifies the result against the contract. -6. The parent agent updates `PROGRESS.md`, commits the completed change, and leaves a handoff. - -After a chunk is no longer active, archive completed-work details in `docs/WORKARCHIVE.md` and keep `PROGRESS.md` focused on current status, blockers, and next actions. - -Each sprint contract must include: - -- Objective. -- Touched surfaces. -- Expected outputs. -- Non-goals. -- Verification checks. -- Hard failure criteria. -- Handoff fields. - -## 4. Proposed Repository Layout - -Create this layout incrementally; do not scaffold unused modules before a sprint needs them. +## 4. Current Repository Layout ```text pyproject.toml @@ -91,615 +84,79 @@ src/ quality.py report.py doctor.py + gpu.py + mineru_profile.py + math_render.py + math_repair.py + text_fidelity.py + pdf2md_ui/ + __init__.py + app.py + runner.py tests/ - unit/ integration/ - fixtures/ -scripts/ - install-mineru.ps1 - install-models.py +docs/ + Sprints/ + superpowers/ ``` -Planned module responsibilities: +Do not scaffold unused modules before a sprint needs them. -- `cli.py`: command parsing, CLI summaries, exit codes. -- `conversion.py`: orchestration for one PDF and batch input. -- `paths.py`: input discovery, output path planning, overwrite checks. -- `mineru_adapter.py`: direct local MinerU CLI boundary. -- `ir.py`: project-owned document/page/block/asset/warning records. -- `markdown.py`: Obsidian Markdown normalization. -- `metadata.py`: metadata schema creation and warning aggregation. -- `quality.py`: local checks for assets, math renderability, and output sanity. -- `report.py`: `.report.md` generation from metadata. 
-- `doctor.py`: environment, dependency, CUDA/GPU, MinerU, and cache diagnostics. - -## 5. Sprint Sequence - -### Sprint 0: Source And Environment Verification - -Active contract: - -- `docs/Sprints/SPRINT0CONTRACT.md` - -Objective: - -- Verify the facts needed before implementation starts. - -Touched surfaces: - -- `docs/KNOWLEDGEBASE.md` -- `docs/V1IMPLEMENTATIONPLAN.md` if sequencing changes -- `PROGRESS.md` - -Expected outputs: - -- Confirmed MinerU 3.1.0 install command, CLI invocation shape, version command, output paths, and local execution behavior. -- Confirmed Python 3.12, `uv`, CUDA/PyTorch, and GTX 1070 Ti 8GB risks. -- Confirmed license notes needed before redistribution. - -Verification checks: - -- All volatile facts cite official MinerU, Python, uv, PyTorch/CUDA, or license sources. -- No candidate engine comparison is reintroduced. -- No implementation code is created. - -Hard failure criteria: - -- MinerU 3.1.0 cannot be reasonably invoked through a direct local CLI on the target environment. -- Python 3.12 compatibility is not viable without changing project requirements. - -Primary agents: - -- `research-agent` -- `local-setup-agent` -- `license-privacy-agent` - -### Sprint 1: Project Scaffold And Fast Test Loop - -Active contract: - -- `docs/Sprints/SPRINT1CONTRACT.md` - -Objective: - -- Create the minimal Python project structure and a fast local test loop. - -Touched surfaces: - -- `pyproject.toml` -- `src/pdf2md/__init__.py` -- `tests/` -- Development documentation if needed - -Expected outputs: - -- `uv sync` works. -- `uv run pytest` works. -- Project package imports as `pdf2md`. -- CLI entry point name `pdf2md` is reserved but may initially expose only `doctor` or a clear placeholder until later sprints. -- If `uv` is still unavailable locally, Sprint 1 records that blocker and is not marked complete. - -Verification checks: - -- Import test passes. -- Empty test suite or initial scaffold tests pass. 
-- No runtime network dependency is introduced. - -Hard failure criteria: - -- Project cannot be installed with `uv`. -- Scaffolding adds speculative config systems, extra engines, or unused abstractions. - -Primary agents: - -- `harness-planner-agent` -- `feature-generator-agent` -- `evaluation-agent` - -### Sprint 2: Paths, Input Discovery, And Overwrite Planning - -Active contract: - -- `docs/Sprints/SPRINT2CONTRACT.md` - -Objective: - -- Implement deterministic input and output planning before conversion logic exists. - -Touched surfaces: - -- `paths.py` -- `conversion.py` skeleton if needed -- CLI path handling tests - -Expected outputs: - -- Single PDF discovery. -- Directory PDF discovery. -- Recursive traversal only when requested. -- Deterministic output paths for Markdown, assets, metadata JSON, report, and optional raw MinerU output. -- Existing-output protection unless `--overwrite` is passed. - -Verification checks: - -- Unit tests for single PDF path planning. -- Unit tests for directory and recursive discovery. -- Unit tests for overwrite behavior. -- Tests include Korean or non-ASCII filename handling using generated temporary files, not committed sample PDFs. - -Hard failure criteria: - -- Output planning can overwrite user files without explicit overwrite intent. -- Directory conversion descends recursively without `--recursive`. - -Primary agents: - -- `feature-generator-agent` -- `evaluation-agent` - -### Sprint 3: Domain Records, Metadata, And Warning Model - -Active contract: - -- `docs/Sprints/SPRINT3CONTRACT.md` - -Objective: - -- Define project-owned records before binding to MinerU output. - -Touched surfaces: - -- `ir.py` -- `metadata.py` -- `report.py` skeleton if needed -- Unit tests - -Expected outputs: - -- Document, page, block, asset, and warning records. -- Stable warning codes from `ARCHITECTURE.md`. -- Metadata JSON builder with required top-level and summary fields. -- Warning aggregation logic. 
- -Verification checks: - -- Unit tests for metadata schema creation. -- Unit tests for warning aggregation. -- Unit tests for optional fields such as bbox and confidence being preserved only when present. - -Hard failure criteria: - -- Public API requires raw MinerU objects. -- Metadata omits source PDF, SHA-256, engine, pages, warnings, assets, or summary. - -Primary agents: - -- `metadata-agent` -- `feature-generator-agent` -- `evaluation-agent` - -### Sprint 4: MinerU Adapter With Mocked Contract - -Active contract: - -- `docs/Sprints/SPRINT4CONTRACT.md` - -Objective: - -- Build the direct local MinerU adapter boundary with mocked outputs first. - -Touched surfaces: - -- `mineru_adapter.py` -- `doctor.py` partial checks -- Adapter tests with fake subprocess results and fake output directories - -Expected outputs: - -- Adapter availability check. -- Version check. -- Direct CLI command construction. -- Strict-local command validation. -- Subprocess execution wrapper capturing stdout, stderr, exit code, and paths. -- Parsed adapter result object with raw Markdown, raw structured data when available, assets, warnings, engine, engine version, options, exit code, and stderr. -- Baseline command shape based on MinerU 3.1.0 direct local CLI: `mineru -p -o `. -- Strict-local validation allows CLI-internal temporary local `mineru-api` orchestration, while rejecting `--api-url`, remote APIs, router mode, HTTP client backends, and remote OpenAI-compatible backends. - -Verification checks: - -- Mocked successful MinerU output test. -- Mocked missing MinerU test. -- Mocked non-zero exit test. -- Test that prohibited remote/API flags cannot be introduced. -- No real MinerU/model dependency in default tests. - -Hard failure criteria: - -- Adapter passes `--api-url`, uses router mode, uses an HTTP client backend, or connects to a remote API or remote OpenAI-compatible backend. -- Adapter falls back to another engine after MinerU failure. 
-- Tests require model downloads by default. - -Primary agents: - -- `mineru-integration-agent` -- `feature-generator-agent` -- `evaluation-agent` - -### Sprint 5: Obsidian Markdown Normalization And Assets - -Active contract: - -- `docs/Sprints/SPRINT5CONTRACT.md` - -Objective: - -- Normalize MinerU/project IR output into Obsidian-friendly Markdown. - -Touched surfaces: - -- `markdown.py` -- `quality.py` partial asset link checks -- Unit tests - -Expected outputs: - -- Inline math delimiter normalization to `$...$`. -- Display math delimiter normalization to `$$...$$`. -- Blank-line normalization around display math. -- Relative asset link normalization. -- Simple table preservation and complex table fallback warnings. -- No visible page markers by default. - -Verification checks: - -- Unit tests for inline math. -- Unit tests for display math spacing. -- Unit tests for underscores/carets inside math. -- Unit tests for relative asset links. -- Unit tests for table fallback warning behavior. - -Hard failure criteria: - -- Normalization rewrites LaTeX semantics without deterministic tests. -- Generated links are absolute when relative links are required. -- Page provenance is only visible in Markdown and missing from metadata. - -Primary agents: - -- `obsidian-markdown-agent` -- `feature-generator-agent` -- `evaluation-agent` - -### Sprint 6: Quality Checks And Report Generation - -Active contract: - -- `docs/Sprints/SPRINT6CONTRACT.md` - -Objective: - -- Produce local quality signals and human-readable reports from metadata. - -Touched surfaces: - -- `quality.py` -- `report.py` -- `metadata.py` -- Unit tests - -Expected outputs: - -- Missing asset link count. -- Math renderability check interface with graceful unavailable-tool handling. -- Pages-with-warnings summary. -- `.report.md` generated from metadata. -- Final status: `success`, `partial`, or `failed`. - -Verification checks: - -- Unit tests for report content. -- Unit tests for missing asset link count. 
-- Unit tests for math render failure aggregation. -- Report generation does not re-run MinerU. - -Hard failure criteria: - -- Report diverges from JSON metadata. -- Math render failures are silently ignored. -- Quality checks require network access. - -Primary agents: - -- `metadata-agent` -- `evaluation-agent` -- `feature-generator-agent` - -### Sprint 7: Conversion Orchestrator, CLI, And Python API - -Active contract: - -- `docs/Sprints/SPRINT7CONTRACT.md` - -Objective: - -- Connect path planning, MinerU adapter, normalization, metadata, report, and summaries. - -Touched surfaces: - -- `conversion.py` -- `cli.py` -- `__init__.py` -- CLI and API tests - -Expected outputs: - -- `convert_pdf(input_path, output_dir, metadata=True)` public API. -- `pdf2md convert INPUT --out OUTPUT_DIR`. -- `--metadata`, `--keep-raw`, `--recursive`, `--overwrite`, `--gpu`, and `--strict-local` behavior. -- Batch conversion for directories. -- CLI summary with warning counts. - -Verification checks: - -- API test with mocked MinerU adapter. -- CLI single PDF test with mocked MinerU adapter. -- CLI directory test with mocked MinerU adapter. -- Existing output test. -- Failure summary test. - -Hard failure criteria: - -- Public API exposes raw MinerU objects as required return fields. -- CLI writes outputs after a hard failure that should stop conversion. -- CLI suppresses warning counts. - -Primary agents: - -- `feature-generator-agent` -- `requirements-guard-agent` -- `evaluation-agent` - -### Sprint 8: Doctor And Setup Documentation - -Active contract: - -- `docs/Sprints/SPRINT8CONTRACT.md` +## 5. Active Next Sprint Status: -- Implemented. +- No active implementation sprint. -Objective: +Next implementation work should start from a new user-approved requirement and, if substantial, a new sprint contract. -- Make local setup failures explicit before users run conversions. +## 6. 
Abandoned Planning -Touched surfaces: - -- `doctor.py` -- `cli.py` -- `README.md` -- `scripts/install-mineru.ps1` -- `scripts/install-models.py` -- Tests for mocked environment checks - -Expected outputs: - -- `pdf2md doctor` reports Python version, `uv`, CUDA/PyTorch GPU visibility, MinerU availability, MinerU version, and detectable model/cache paths. -- GPU unavailable warning is clear. -- Missing `uv` is reported clearly. -- Pre-Turing/Pascal GPU risk is reported clearly for GTX 1070 Ti compute capability 6.1. -- Missing required dependency causes doctor failure. -- Setup docs explain Windows PowerShell, Python 3.12, `uv`, MinerU, models, GPU expectations, and local-only behavior. - -Verification checks: - -- Mocked doctor tests for success, missing MinerU, missing GPU, and missing dependency. -- Documentation review for no cloud/API runtime path. - -Hard failure criteria: - -- Doctor says the environment is healthy when MinerU is missing. -- Doctor implies cloud/API fallback is supported. - -Primary agents: - -- `local-setup-agent` -- `license-privacy-agent` -- `evaluation-agent` - -### Sprint 9: Local Fixture Evaluation And V1 Release Gate - -Active contract: - -- `docs/Sprints/SPRINT9CONTRACT.md` +### Sprint 17: Offline Windows Installer Status: -- Implemented. +- Abandoned at the user's request on 2026-05-13. -Objective: +Historical references: -- Validate the end-to-end v1 behavior against local samples without committing samples. +- `docs/Sprints/SPRINT17CONTRACT.md`. +- `docs/superpowers/plans/2026-05-12-offline-installer.md`. -Touched surfaces: +Do not implement or extend Sprint 17 unless the user explicitly reopens offline installer work. -- `tests/integration/` -- Optional local-only fixture manifest that does not include sample PDFs -- `README.md` -- `PROGRESS.md` +## 7. 
Future Decisions -Expected outputs: +- Decide whether simplified outputs need a metadata-free `pdf2md recheck`; current `recheck` remains legacy-only for outputs with adjacent metadata JSON. +- Validate `--gpu auto --mineru-profile auto` on a stronger NVIDIA GPU PC. -- Fast mocked integration suite. -- Optional MinerU-dependent local test command. -- Local sample coverage notes in `PROGRESS.md`. -- V1 release checklist status. +## 8. Harness Operating Model -Verification checks: +Use the project long-running harness only for substantial implementation work. -- `uv run pytest` passes without model downloads. -- Optional MinerU test is clearly marked and skipped unless explicitly enabled. -- Representative sample produces Markdown, metadata JSON, report Markdown, and asset paths. -- Obsidian math delimiter expectations are met. -- No sample PDFs are staged. +1. `harness-planner-agent` turns the next user request into a sprint contract. +2. `evaluation-agent` reviews the contract before code changes start. +3. `feature-generator-agent` implements one approved contract at a time. +4. `feature-generator-agent` runs self-checks and records residual risks. +5. `evaluation-agent` independently verifies the result against the contract. +6. The parent agent updates `PROGRESS.md`, commits the completed change, and leaves a handoff. -Hard failure criteria: +After a chunk is no longer active, archive completed-work details in `docs/WORKARCHIVE.md` and keep `PROGRESS.md` focused on current status, blockers, and next actions. -- Default tests require GPU, MinerU models, or network access. -- Sample files are added to git unintentionally. -- V1 release checklist passes without metadata/report generation. +## 9. Completed Sprint Archive -Primary agents and skills: +Completed sprint details have been moved out of this active implementation plan. 
-- `evaluation-agent` -- `requirements-guard-agent` -- `fixture-evaluation` skill +- Summary and verification evidence: `docs/WORKARCHIVE.md`. +- Detailed historical contracts: `docs/Sprints/SPRINT0CONTRACT.md` through `docs/Sprints/SPRINT16CONTRACT.md`. +- UI folder batch design and execution record: `docs/superpowers/specs/2026-05-13-ui-folder-batch-conversion-design.md` and `docs/superpowers/plans/2026-05-13-ui-folder-batch-conversion.md`. +- Abandoned Sprint 17 planning record: `docs/Sprints/SPRINT17CONTRACT.md` and `docs/superpowers/plans/2026-05-12-offline-installer.md`. -### Sprint 10: Pre-Conversion PDF Page Chunking - -Active contract: - -- `docs/Sprints/SPRINT10CONTRACT.md` - -Status: - -- Implemented. - -Objective: - -- Split long PDFs into temporary fixed-size page chunks before MinerU conversion. - -Touched surfaces: - -- `pdf_splitter.py` -- `conversion.py` -- `cli.py` -- `report.py` -- README and Sprint 10 documentation -- Unit tests for splitter, conversion, CLI, and report behavior - -Expected outputs: - -- `pdf2md convert INPUT --out OUTPUT --chunk-pages` enables 20-page chunks. -- `pdf2md convert INPUT --out OUTPUT --chunk-pages N` enables custom positive chunk size. -- `convert_pdf(..., chunk_pages=N)` returns a `BatchConversionResult` in chunk mode. -- Temporary chunk PDFs are deleted after conversion completes. -- Chunk Markdown files are separate and named with original page ranges. -- Metadata and report content expose original source path and chunk page ranges. - -Verification checks: - -- pypdf-based local blank PDF tests cover page counts, chunk ranges, and written chunk page counts. -- Mocked conversion tests verify one adapter call per chunk, failed-chunk continuation, chunk metadata/report context, and temporary chunk cleanup. -- CLI tests verify `--chunk-pages` without a value uses 20 pages. - -Hard failure criteria: - -- Chunking uploads document content or uses another conversion engine. -- Chunk outputs are merged. 
-- Default tests require real MinerU, GPU, model files, network, Obsidian, LaTeX tooling, or `samples/`. - -### Sprint 11: MathJax Warning Mitigation - -Active contract: - -- `docs/Sprints/SPRINT11CONTRACT.md` - -Status: - -- Implemented. - -Objective: - -- Repair narrow MathJax-invalid formula artifacts after initial local validation and before final output writing. - -Touched surfaces: - -- `quality.py` -- `math_repair.py` -- `conversion.py` -- `ir.py` -- Unit tests for quality details, repair rules, conversion, and recheck behavior - -Expected outputs: - -- Failed math expression records expose body, display mode, span, and checker message. -- Repair candidates are generated only for failed math spans. -- Repeated same-direction scripts are disambiguated with an empty group. -- Truncated `\end{a}` array endings are repaired when array environments are unbalanced. -- `convert` and `recheck` share the same repair behavior. -- Applied repairs are recorded as `MATH_RENDER_REPAIRED` info warnings and do not count as math render errors. - -Verification checks: - -- Default fast tests pass without real MinerU, GPU, Node.js, MathJax, network, Obsidian, or `samples/`. -- `samples/MITC공부.pdf` validates locally with `Math render error count: 0`. - -Hard failure criteria: - -- Repair changes math spans that did not fail local MathJax validation. -- Repair claims success without candidate revalidation. -- Repair introduces remote services, alternate engines, or mandatory sample-dependent default tests. - -## 6. Cross-Cutting Acceptance Criteria - -Every implementation sprint must preserve these acceptance criteria: - -- No runtime remote document processing path exists. -- MinerU is the only conversion engine. -- Failures are explicit and traceable. -- Warnings are structured and countable. -- Markdown and metadata can be traced back to source pages where available. -- Reports are generated from metadata. -- Default tests are fast and local. 
-- `samples/` remains untracked unless explicitly requested. - -## 7. First Implementation Request Contract Template - -Use this template when implementation begins. - -```markdown -## Sprint Contract - -Objective: - -Touched surfaces: - -Expected outputs: - -Non-goals: - -Verification checks: - -Hard failure criteria: - -Handoff fields: -- Files changed: -- Commands run: -- Tests passed: -- Known failures: -- Residual risks: -- Next action: -``` - -## 8. Open Risks - -- MinerU 3.1.0 install and CLI behavior are source-verified, but real local output still needs a later local probe before release. -- GTX 1070 Ti 8GB is visible locally, but it is Pascal compute capability 6.1; `doctor` and setup docs must make CUDA/PyTorch limits clear. -- `uv` is installed per-user at `C:\Users\user\.local\bin`, but a new shell may need PATH refresh before `uv` is visible. -- Formula renderability checks and conservative warning mitigation are implemented, but formula reconstruction remains best effort and should keep warnings/provenance visible. -- Some PDFs will have tables or formulas that cannot be faithfully represented in Markdown; metadata and `.report.md` must surface this instead of hiding it. -- Redistribution license obligations must be reviewed before packaging, redistribution, or bundling model weights. - -## 9. Recommended Next Step - -Run optional real local MinerU validation on a long sample only when requested. Default verification should continue to use mocked adapters and generated temporary PDFs so it remains independent of MinerU, GPU, model files, network access, and `samples/`. - -Facts carried forward from Sprint 0: +Facts carried forward from completed work: - MinerU is fixed to version 3.1.0. - Direct local CLI command shape is `mineru -p <input_path> -o <output_path>`. -- MinerU output layout should be treated as optional-file based until locally probed. - Python 3.12 is compatible with the pinned MinerU package range.
- GTX 1070 Ti CUDA/PyTorch support needs explicit doctor validation. -- MinerU/model license posture is acceptable for personal local use. Redistribution remains out of scope until reviewed. +- Formula reconstruction remains best effort and must keep warnings/provenance visible. +- MinerU/model license posture is acceptable for personal local use. Redistribution remains gated by license review. diff --git a/docs/V1RELEASECHECKLIST.md b/docs/V1RELEASECHECKLIST.md index 358f6a9..1d54eed 100644 --- a/docs/V1RELEASECHECKLIST.md +++ b/docs/V1RELEASECHECKLIST.md @@ -76,13 +76,13 @@ This optional pytest path runs `pdf2md doctor` first. If doctor has a hard failu A sample conversion is successful only when all of these are true: - The command exits 0. -- The planned Markdown file exists: `\.md`. -- The planned metadata JSON exists: `\.metadata.json`. -- The planned quality report exists: `\.report.md`. -- Metadata and report warning counts are consistent enough to explain math, table, reading-order, asset, MinerU, and checker-unavailable risks. +- The planned Markdown part exists: `<out>\<stem>\<stem>_001.md`. +- The planned quality report exists: `<out>\<stem>\<stem>_report.md`. +- No public `.metadata.json` sidecar is written for new conversions. +- The report warning counts are consistent enough to explain math, table, reading-order, asset, MinerU, and checker-unavailable risks. - Any Markdown image links resolve relative to the Markdown file, or missing/broken links are reported as warnings. -Missing Markdown, metadata JSON, or `.report.md` means the sample failed or is blocked. Do not count it as a partial success for release gating. +Missing Markdown part or `_report.md` means the sample failed or is blocked. Do not count it as a partial success for release gating. For each attempted sample, record at least: @@ -90,8 +90,7 @@ For each attempted sample, record at least: - Command run. - Exit code. - Generated Markdown path. -- Generated metadata JSON path. -- Generated `.report.md` path.
+- Generated `_report.md` path. - Warning count and final status. - Math renderability failures or checker-unavailable count. - Table fallback or degradation count when available. @@ -110,7 +109,7 @@ Local fixture coverage should include these risk categories where samples are av - Figure, caption, or extracted asset links. - Korean or non-ASCII filename/path handling. -Observed local fixture map as of 2026-05-08: +Observed local fixture map as of 2026-05-11: | Local sample | Fixture risks covered | Notes | | --- | --- | --- | @@ -126,7 +125,7 @@ Coverage gaps to keep visible: - A table-dominant sample with known formula cells would make table degradation easier to judge. - A figure-heavy sample with expected extracted assets would make asset link validation easier to judge. -Do not score fixture quality only by plain-text edit distance. Include math delimiter/renderability behavior, tables, reading order, assets, metadata fields, warning usefulness, and `.report.md` usefulness. +Do not score fixture quality only by plain-text edit distance. Include math delimiter/renderability behavior, tables, reading order, assets, report provenance, warning usefulness, and `_report.md` usefulness. ## No-Sample-Commit Check diff --git a/docs/WORKARCHIVE.md b/docs/WORKARCHIVE.md index f0d427c..74f537e 100644 --- a/docs/WORKARCHIVE.md +++ b/docs/WORKARCHIVE.md @@ -1,6 +1,6 @@ # Work Archive -Last updated: 2026-05-08 +Last updated: 2026-05-13 This document stores completed project work, historical sprint outcomes, environment setup results, and sample conversion evidence. `PROGRESS.md` should stay focused on current status, blockers, and next actions. Read this archive when a task needs past implementation context, previous verification commands, or historical handoff details. 
@@ -34,6 +34,16 @@ This document stores completed project work, historical sprint outcomes, environ | GPU default/runtime setup | Made conversion default to `cuda:0`, mapped CUDA requests to MinerU subprocess environment variables, rebuilt `.venv`, installed CUDA-enabled PyTorch and MinerU 3.1.0, downloaded MinerU models, and set `MINERU_MODEL_SOURCE=local`. | `README.md`, `src/pdf2md/mineru_adapter.py`, `src/pdf2md/conversion.py` | | MathJax checker | Planned and implemented local MathJax render checker with Node.js helper, Python wrapper, conversion integration, and doctor diagnostics. | `docs/MATHJAXCHECKERPLAN.md`, `tools/mathjax-checker/check.mjs`, `src/pdf2md/math_render.py` | | Sprint 10 | Implemented opt-in pre-conversion PDF chunking with `pypdf`, temporary chunk PDF cleanup, `--chunk-pages [PAGES]`, chunk metadata/report context, and mocked tests. | `docs/Sprints/SPRINT10CONTRACT.md`, `src/pdf2md/pdf_splitter.py` | +| Sprint 11 | Implemented conservative MathJax warning mitigation with failed-expression details, `src/pdf2md/math_repair.py`, shared `convert`/`recheck` repair integration, and `MATH_RENDER_REPAIRED` info warnings. | `docs/Sprints/SPRINT11CONTRACT.md`, `src/pdf2md/math_repair.py`, `src/pdf2md/quality.py`, `src/pdf2md/conversion.py` | +| UI research and Sprint 12 planning | Researched minimal Windows UI launcher options and planned a thin `tkinter`/`ttk` launcher over the existing CLI with PyInstaller build output at `dist/pdf2md-ui.exe`. | `docs/UI_RESEARCH.md`, `docs/Sprints/SPRINT12CONTRACT.md`, `PLAN.md` | +| Sprint 12 | Implemented a minimal `tkinter`/`ttk` Windows UI launcher over `pdf2md` or `uv run pdf2md`, with fixed argument-list subprocess calls, worker-thread logging, cancellation, Recheck support, and PyInstaller build output at `dist/pdf2md-ui.exe`. 
| `docs/Sprints/SPRINT12CONTRACT.md`, `src/pdf2md_ui/`, `tests/test_ui_runner.py` | +| Sprint 13 | Implemented local pypdf text layer fidelity diagnostics, including Hangul count deltas, unexpected CJK counts, text similarity, Hangul spacing anomaly ratios, replacement-candidate markers, metadata/report integration, and `recheck` support without automatic body-text replacement. | `docs/Sprints/SPRINT13CONTRACT.md`, `src/pdf2md/text_fidelity.py`, `src/pdf2md/conversion.py`, `src/pdf2md/metadata.py`, `src/pdf2md/report.py` | +| Sprint 14 | Changed chunk mode so MinerU receives one source page per run while final Markdown, metadata, report, and assets are grouped by `chunk_pages`. Failed page conversions are nonfatal within partially successful groups and are recorded in metadata/report output. | `docs/Sprints/SPRINT14CONTRACT.md`, `src/pdf2md/conversion.py`, `src/pdf2md/report.py`, `tests/test_conversion.py` | +| Sprint 15 | Implemented NVIDIA GPU inventory parsing, optional `--gpu auto`, default `--mineru-profile auto`, conservative MinerU environment tuning, profile provenance in metadata/report output, and doctor GPU/profile recommendations. | `docs/Sprints/SPRINT15CONTRACT.md`, `src/pdf2md/gpu.py`, `src/pdf2md/mineru_profile.py`, `src/pdf2md/conversion.py`, `src/pdf2md/doctor.py` | +| Sprint 16 | Simplified public conversion outputs to one PDF-stem folder, numbered Markdown parts, shared `images/`, one `_report.md`, no persisted metadata JSON, compatibility-no-op `--metadata`, and legacy-only `recheck`. | `docs/Sprints/SPRINT16CONTRACT.md`, `src/pdf2md/paths.py`, `src/pdf2md/conversion.py`, `src/pdf2md/report.py`, `src/pdf2md/cli.py` | +| UI direct-folder batch conversion | Added a minimal UI workflow that selects one folder, discovers direct-child PDFs only, and sequentially runs the existing `pdf2md convert` command for each file with the selected options. 
| `docs/superpowers/specs/2026-05-13-ui-folder-batch-conversion-design.md`, `docs/superpowers/plans/2026-05-13-ui-folder-batch-conversion.md`, `src/pdf2md_ui/runner.py`, `src/pdf2md_ui/app.py` | +| Sprint 17 planning | Planned a large offline Windows installer, then abandoned the sprint at the user's request before implementation began. | `docs/Sprints/SPRINT17CONTRACT.md`, `docs/superpowers/plans/2026-05-12-offline-installer.md` | +| Documentation archive cleanup | Moved completed implementation details out of `PLAN.md`, `PROGRESS.md`, and `docs/V1IMPLEMENTATIONPLAN.md`, then removed Sprint 17 from active planned work after it was abandoned. | `PLAN.md`, `PROGRESS.md`, `docs/V1IMPLEMENTATIONPLAN.md`, `docs/WORKARCHIVE.md` | ## Runtime Setup Archive @@ -43,12 +53,13 @@ This document stores completed project work, historical sprint outcomes, environ - `uv` installed per-user at `C:\Users\user\.local\bin`. - GPU target: NVIDIA GTX 1070 Ti 8GB. - Local GPU observed: NVIDIA GeForce GTX 1070 Ti, driver 577.00, 8192 MiB VRAM, WDDM. +- Default conversion device/profile: `--gpu cuda:0` and `--mineru-profile auto`. - MinerU execution mode: direct local `mineru` CLI only. - MinerU 3.1.0 CLI-internal temporary local `mineru-api` is allowed when the CLI runs without `--api-url`. - GTX 1070 Ti runtime setup used `torch==2.6.0+cu126`, `torchvision==0.21.0+cu126`, and `mineru[core]==3.1.0`. - MinerU models were downloaded with `uv run mineru-models-download -s huggingface -m all`. - Runtime model loading uses `MINERU_MODEL_SOURCE=local`. -- Current doctor status after setup is WARN because GTX 1070 Ti is Pascal/pre-Turing; MinerU, CUDA PyTorch, local model config, MathJax checker, and strict-local checks pass. +- Current doctor status after setup is WARN because GTX 1070 Ti is Pascal/pre-Turing; MinerU, CUDA PyTorch, local model config, MathJax checker, and strict-local checks pass. 
Sprint 15 doctor output selects `cuda:0` for `--gpu auto` on this machine and recommends MinerU profile `safe`. ## Sample Conversion Archive @@ -58,6 +69,11 @@ Generated outputs are ignored under `outputs/` and are not committed. | --- | --- | --- | --- | | `samples/MITC공부.pdf` | Completed after CUDA-enabled runtime setup. | `outputs/MITC공부/` | 13 pages, 107 assets, 23 inline formulas, 103 display formulas, 1 info warning at the time of that run because the local MathJax checker was unavailable. | | `samples/FourNodeQuadrilateralShellElementMITC4.pdf` | Completed with default GPU request and `MINERU_MODEL_SOURCE=local`. | `outputs/FourNodeQuadrilateralShellElementMITC4/` | Report status `success`: 7 pages, 22 assets, 38 inline formulas, 16 display formulas, 0 math render errors, 0 warnings. | +| `samples/FourNodeQuadrilateralShellElementMITC4.pdf` | Sprint 14 sample smoke stalled and was terminated. | No final output directory. | On 2026-05-12, `--chunk-pages` entered the one-page conversion path and used `cuda:0` with GPU utilization near 100%. Source page 1 completed, but source page 2 stayed active for more than 15 minutes total runtime with no final grouped output, so the process tree was terminated and the temporary `pdf2md.pages.*` directory was removed. | +| `samples/MITC공부.pdf` | Reconverted after Sprint 11 mitigation. | `outputs/MITC공부/` and `outputs/sprint11-MITC공부/` | Report status `partial` from 2 `MATH_RENDER_REPAIRED` info warnings: 13 pages, 107 assets, 23 inline formulas, 103 display formulas, 0 MathJax render errors, and 0 missing or invalid asset links. | +| `samples/2007쉘구조물의유한요소해석에대하여.pdf` | Completed after Sprint 13 validation with 1-page chunking. | `outputs/2007쉘구조물의유한요소해석에대하여_pages1/` | A fresh `--chunk-pages 5` attempt stayed on part 001 for over 40 minutes with GPU near full utilization and no output, so it was terminated. 
The clean `--chunk-pages 1` run completed 13/13 chunks with 0 failures, 44 warnings, 0 MathJax render errors, 13 low text-fidelity pages, 15 unexpected CJK characters, 13 diagnostic replacement-candidate pages, and 0 uncertain page mappings. | +| `samples/SolidElement.pdf` | Completed after Sprint 15 GPU/profile implementation with `--gpu auto --mineru-profile auto --chunk-pages`. | `outputs/SolidElement_sprint15_auto_20260512/` | Completed in about 11 minutes 51 seconds on GTX 1070 Ti. Report status `partial`: 6 pages, 0 failed pages, safe profile applied, 71 assets, 3 inline formulas, 55 display formulas, 0 MathJax render errors, 0 missing/invalid asset links, 11 warnings, and 5 low text-fidelity pages. | +| `samples/SolidElement.pdf` | Completed after Sprint 16 simplified output layout with `--gpu auto --mineru-profile auto --chunk-pages`. | `outputs/SolidElement/` | Completed in about 17 minutes 51 seconds on GTX 1070 Ti. Produced `SolidElement_001.md`, `SolidElement_report.md`, shared `images/` with 71 assets, and no persisted metadata JSON. Report status `partial`: 6 pages, 0 failed pages, safe profile applied, 3 inline formulas, 55 display formulas, 0 MathJax render errors, 0 missing/invalid asset links, 11 warnings, and 5 low text-fidelity pages. | ## Historical Verification Highlights @@ -73,6 +89,41 @@ Generated outputs are ignored under `outputs/` and are not committed. - CUDA runtime rebuild: verified CUDA with an actual tensor operation on `NVIDIA GeForce GTX 1070 Ti`, compute capability 6.1; `mineru --version` reported 3.1.0. - MathJax checker: `npm run mathjax-checker:health` returned `{"ok":true}` after local `npm install`; full suite passed 150 tests with 1 optional skip after integration. - Sprint 10 chunking: targeted chunking tests passed 42 tests; full default suite passed 163 tests with 1 optional skip; `git diff --check` passed with line-ending warnings only. 
+- Sprint 11 MathJax warning mitigation: targeted tests passed 56 tests; full default suite passed 172 tests with 1 optional skip; requested `samples/MITC공부.pdf` validation produced 0 MathJax render errors and 2 traceable repair info warnings. +- UI research and Sprint 12 planning: `docs/UI_RESEARCH.md` and `docs/Sprints/SPRINT12CONTRACT.md` were added; no implementation tests were required because this was documentation and planning only. +- Sprint 12 UI implementation: `uv run pytest tests\test_ui_runner.py` passed 16 tests; `uv run pytest` passed 188 tests with 1 optional skip; `uv run --group ui-build pyinstaller --clean --onefile --windowed --name pdf2md-ui src\pdf2md_ui\app.py` produced `dist\pdf2md-ui.exe`; `uv run pdf2md doctor` returned WARN only for the documented GTX 1070 Ti/Pascal compatibility risk; launch smoke confirmed the executable process starts. +- Sprint 12 residual smoke risk: a direct CLI conversion smoke using `samples\FourNodeQuadrilateralShellElementMITC4.pdf` and the same command shape used by the UI exceeded the 15-minute timeout on 2026-05-11. The spawned process tree was terminated with `taskkill`. +- Sprint 13 text fidelity diagnostics: `uv run pytest tests/test_text_fidelity.py tests/test_metadata.py tests/test_report.py tests/test_conversion.py` passed 49 tests; `uv run pytest` passed 198 tests with 1 optional skip. +- Sprint 13 sample validation on 2026-05-11: `samples/2007쉘구조물의유한요소해석에대하여.pdf` completed with `--chunk-pages 1` under `outputs/2007쉘구조물의유한요소해석에대하여_pages1/`; generated 13 Markdown files, 13 metadata JSON files, and 13 report files. + +- Sprint 14 grouped page conversion: targeted red tests first failed against the Sprint 10 chunking behavior, then passed after implementation. 
`uv run pytest tests/test_conversion.py tests/test_cli.py tests/test_report.py tests/test_pdf_splitter.py tests/test_paths.py tests/test_metadata.py tests/test_ui_runner.py` passed 101 tests; full `uv run pytest` passed 202 tests with 1 optional skip. +- Sprint 14 sample smoke on 2026-05-12: `uv run pdf2md convert samples\FourNodeQuadrilateralShellElementMITC4.pdf --out outputs\FourNodeQuadrilateralShellElementMITC4_sprint14_20260512_112342 --chunk-pages --strict-local` used `cuda:0` with GPU utilization near 100%, reached source page 2, then exceeded 15 minutes total runtime without producing a final output directory. The process tree was terminated and the leftover temporary directory was removed. +- Sprint 15 NVIDIA GPU detection/profile tuning: targeted tests `uv run pytest tests/test_gpu.py tests/test_mineru_profile.py tests/test_mineru_adapter.py tests/test_conversion.py tests/test_cli.py tests/test_doctor.py` passed 101 tests. Full `uv run pytest` passed 225 tests with 1 optional skip. `uv run pdf2md doctor` returned WARN on the local GTX 1070 Ti, reported GPU 0 with 8192 MiB VRAM, selected `cuda:0` for `--gpu auto`, and recommended profile `safe`. Optional stronger-PC real MinerU conversion validation was not run in this workspace. +- SolidElement sample validation on 2026-05-12: `uv run pdf2md convert samples\SolidElement.pdf --out outputs\SolidElement_sprint15_auto_20260512 --overwrite --chunk-pages --gpu auto --mineru-profile auto --strict-local` completed successfully with one grouped output and no failed source pages. +- Sprint 16 simplified output layout: focused verification `uv run pytest tests/test_paths.py tests/test_conversion.py tests/test_cli.py tests/test_report.py tests/test_ui_runner.py tests/integration/test_v1_fast_release_gate.py -q` passed 91 tests; full `uv run pytest` passed 227 tests with 1 optional skip; `git diff --check` passed with line-ending warnings only. 
New conversions write `<out>/<stem>/<stem>_001.md`, shared `<out>/<stem>/images/`, and `<out>/<stem>/<stem>_report.md`; no public `.metadata.json` is written. +- Sprint 16 SolidElement sample validation on 2026-05-12: `uv run pdf2md convert samples\SolidElement.pdf --out outputs --overwrite --chunk-pages --gpu auto --mineru-profile auto --strict-local` completed successfully with one simplified Markdown part, one report, shared images, no public metadata JSON, and no failed source pages. +- UI direct-folder batch conversion on 2026-05-13: `uv run pytest tests/test_ui_runner.py -q` passed 19 tests; `uv run python -m py_compile src\pdf2md_ui\app.py src\pdf2md_ui\runner.py` passed; `uv run pytest -q` passed 230 tests with 1 skipped; PyInstaller rebuilt `dist\pdf2md-ui.exe`; a short process-start smoke confirmed the executable starts. +- Sprint 17 planning on 2026-05-12: `docs/Sprints/SPRINT17CONTRACT.md` and `docs/superpowers/plans/2026-05-12-offline-installer.md` were added. No implementation tests were required because this was planning only. +- Sprint 17 abandonment on 2026-05-13: offline installer planning was abandoned at the user's request before implementation began. The contract and plan remain historical records only. + +## Archived V1 Implementation Plan + +`docs/V1IMPLEMENTATIONPLAN.md` now tracks current state and planned next work only. Completed Sprint 0 through Sprint 16 details are archived here and in their respective `docs/Sprints/SPRINT*CONTRACT.md` files. + +Current completed v1 capability summary: + +- Python 3.12 package and `pdf2md` CLI. +- Direct local MinerU 3.1.0 CLI adapter with strict-local enforcement. +- Obsidian Markdown normalization, local quality checks, internal provenance, and one human-readable report. +- `pdf2md doctor`, local MathJax checking, conservative MathJax warning mitigation, and pypdf text fidelity diagnostics. +- Opt-in grouped page conversion where MinerU receives one source page per run.
+- NVIDIA GPU detection, `--gpu auto`, and `--mineru-profile auto|safe|performance`. +- Simplified public output layout with no public metadata JSON for new conversions. +- Minimal Windows UI launcher with direct-folder batch conversion through sequential existing CLI calls. + +Current planned next work: + +- No active implementation sprint. Future substantial work should start from a new user-approved requirement and sprint contract. ## Historical Blockers And Resolutions diff --git a/docs/superpowers/plans/2026-05-12-offline-installer.md b/docs/superpowers/plans/2026-05-12-offline-installer.md new file mode 100644 index 0000000..977644f --- /dev/null +++ b/docs/superpowers/plans/2026-05-12-offline-installer.md @@ -0,0 +1,683 @@ +# Offline Windows Installer Implementation Plan + +> **Status:** Abandoned at the user's request on 2026-05-13 before implementation began. This file is retained as historical planning context only. Do not execute this plan unless the user explicitly reopens offline installer work. + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Build an offline Windows installer that installs the existing `pdf2md` CLI/UI runtime on another Windows x64 PC without internet access. + +**Architecture:** Build a large installer payload on an internet-connected build PC, then create the target PC `.venv` locally from bundled wheels during installation. Keep conversion behavior unchanged and keep the UI as a launcher over the installed project-owned `pdf2md` CLI. + +**Tech Stack:** Python 3.12, uv, pip wheelhouse/download workflow, PyInstaller, PowerShell, Inno Setup, MinerU 3.1.0, CUDA PyTorch `2.6.0+cu126`, optional Node.js/MathJax. + +--- + +## File Structure + +- `docs/Sprints/SPRINT17CONTRACT.md`: sprint contract, scope, acceptance criteria, and hard failure criteria. 
+- `packaging/offline/build-offline-payload.ps1`: connected build-PC script that stages all offline files under `dist/offline-installer/`. +- `packaging/offline/verify-offline-payload.ps1`: build-PC and target-PC script that validates `payload-manifest.json` and hashes. +- `packaging/offline/install-runtime.ps1`: target-PC installer script that hash-verifies the payload, creates `.venv`, installs from local wheels, configures local models, and runs doctor. +- `packaging/offline/repair-runtime.ps1`: target-PC repair script that recreates `.venv` from the retained wheelhouse. +- `packaging/offline/run-doctor.ps1`: shortcut target for post-install diagnostics. +- `packaging/offline/Pdf2MdOffline.iss`: Inno Setup installer script. +- `packaging/offline/requirements-runtime-cu126.txt`: pinned offline runtime requirement set for Windows x64 CUDA 12.6 wheels. +- `packaging/offline/README.md`: build and install instructions. +- `packaging/offline/THIRD_PARTY_NOTICES.md`: redistribution notes and license links for bundled payload families. +- `src/pdf2md/packaging_manifest.py`: optional small helper for deterministic manifest/hash generation. +- `src/pdf2md_ui/runner.py`: installed runtime command resolution and child environment updates. +- `src/pdf2md_ui/app.py`: installed runtime project-root default only if needed. +- `tests/test_offline_packaging.py`: fast tests for manifest, script safety, and installer script contents with fake payloads. +- `tests/test_ui_runner.py`: fast tests for installed `.venv` and bundled `uv --offline` command resolution. +- `.gitignore`: ignore generated payload, wheelhouse, models, and installer outputs. +- `README.md` and `docs/V1RELEASECHECKLIST.md`: user-facing build/release documentation. +- `PLAN.md`, `PROGRESS.md`, `docs/WORKARCHIVE.md`: coordination and handoff. 
+ +## Task 1: Packaging Manifest And Ignore Policy + +**Files:** +- Create: `tests/test_offline_packaging.py` +- Create: `src/pdf2md/packaging_manifest.py` +- Modify: `.gitignore` + +- [ ] **Step 1: Write the failing manifest tests** + +```python +from pathlib import Path + +from pdf2md.packaging_manifest import build_payload_manifest + + +def test_build_payload_manifest_records_hash_size_and_source(tmp_path: Path) -> None: + payload = tmp_path / "payload" + payload.mkdir() + wheel = payload / "wheelhouse" / "example-1.0-py3-none-any.whl" + wheel.parent.mkdir() + wheel.write_bytes(b"wheel-bytes") + + manifest = build_payload_manifest( + payload, + sources={"wheelhouse/example-1.0-py3-none-any.whl": "local test wheel"}, + ) + + assert manifest["files"] == [ + { + "path": "wheelhouse/example-1.0-py3-none-any.whl", + "size": 11, + "sha256": "9ceb18f15662bb87e54af2f5953c0484d2ef76f5444d87913360b9ef87d7296d", + "source": "local test wheel", + } + ] + + +def test_build_payload_manifest_uses_forward_slash_relative_paths(tmp_path: Path) -> None: + payload = tmp_path / "payload" + nested = payload / "models" / "mineru" / "model.bin" + nested.parent.mkdir(parents=True) + nested.write_bytes(b"model") + + manifest = build_payload_manifest(payload, sources={}) + + assert manifest["files"][0]["path"] == "models/mineru/model.bin" +``` + +- [ ] **Step 2: Run the manifest tests to verify failure** + +Run: + +```powershell +uv run pytest tests/test_offline_packaging.py -q +``` + +Expected: FAIL because `pdf2md.packaging_manifest` does not exist. 
+ +- [ ] **Step 3: Implement the minimal manifest helper** + +```python +"""Offline installer payload manifest helpers.""" + +from __future__ import annotations + +import hashlib +from pathlib import Path +from typing import Mapping, TypedDict + + +class ManifestFile(TypedDict): + path: str + size: int + sha256: str + source: str + + +class PayloadManifest(TypedDict): + files: list[ManifestFile] + + +def build_payload_manifest(payload_root: str | Path, *, sources: Mapping[str, str]) -> PayloadManifest: + root = Path(payload_root) + files: list[ManifestFile] = [] + for path in sorted(candidate for candidate in root.rglob("*") if candidate.is_file()): + relative = path.relative_to(root).as_posix() + files.append( + { + "path": relative, + "size": path.stat().st_size, + "sha256": _sha256(path), + "source": sources.get(relative, "unknown"), + } + ) + return {"files": files} + + +def _sha256(path: Path) -> str: + digest = hashlib.sha256() + with path.open("rb") as handle: + for chunk in iter(lambda: handle.read(1024 * 1024), b""): + digest.update(chunk) + return digest.hexdigest() +``` + +- [ ] **Step 4: Add generated payload ignores** + +Append to `.gitignore`: + +```gitignore +dist/ +packaging/offline/_payload/ +packaging/offline/_wheelhouse/ +packaging/offline/_models/ +*.issig +*.exe.tmp +``` + +If `dist/` is already ignored implicitly by an existing entry, keep one clear `dist/` entry and avoid duplicates. + +- [ ] **Step 5: Run tests** + +Run: + +```powershell +uv run pytest tests/test_offline_packaging.py -q +git diff --check +``` + +Expected: tests PASS; diff check has no whitespace errors. 
+ +- [ ] **Step 6: Commit** + +```powershell +git add .gitignore src\pdf2md\packaging_manifest.py tests\test_offline_packaging.py +git commit -m "feat: add offline payload manifest helper" +``` + +## Task 2: Offline Payload Builder + +**Files:** +- Create: `packaging/offline/build-offline-payload.ps1` +- Create: `packaging/offline/verify-offline-payload.ps1` +- Create: `packaging/offline/requirements-runtime-cu126.txt` +- Create: `packaging/offline/README.md` +- Modify: `tests/test_offline_packaging.py` + +- [ ] **Step 1: Write tests for builder safety** + +```python +from pathlib import Path + + +def test_payload_builder_excludes_development_and_sample_paths() -> None: + script = Path("packaging/offline/build-offline-payload.ps1").read_text(encoding="utf-8") + + assert ".git" in script + assert ".venv" in script + assert "samples" in script + assert "outputs" in script + assert "Copy-Item -Recurse -Force" in script + + +def test_runtime_requirements_pin_core_gpu_stack() -> None: + requirements = Path("packaging/offline/requirements-runtime-cu126.txt").read_text(encoding="utf-8") + + assert "torch==2.6.0" in requirements + assert "torchvision==0.21.0" in requirements + assert "mineru[core]==3.1.0" in requirements + assert "pypdf" in requirements +``` + +- [ ] **Step 2: Run tests to verify failure** + +Run: + +```powershell +uv run pytest tests/test_offline_packaging.py -q +``` + +Expected: FAIL because the packaging files do not exist. 
+ +- [ ] **Step 3: Create the pinned requirements file** + +```text +convert-pdf-to-md==0.1.0 +pypdf>=6.10.2,<7 +torch==2.6.0 +torchvision==0.21.0 +mineru[core]==3.1.0 +``` + +- [ ] **Step 4: Create the payload builder skeleton** + +The script must accept explicit input paths and fail when required payload pieces are missing: + +```powershell +param( + [string]$Configuration = "Release", + [string]$PythonInstaller, + [string]$UvExe, + [string]$MinerUModelSource, + [string]$NodeRoot = "", + [string]$OutputRoot = "dist\offline-installer" +) + +$ErrorActionPreference = "Stop" +$RepoRoot = Resolve-Path (Join-Path $PSScriptRoot "..\..") +$StageRoot = Join-Path $RepoRoot $OutputRoot +$AppRoot = Join-Path $StageRoot "app" +$RuntimeRoot = Join-Path $StageRoot "runtime" +$PayloadRoot = Join-Path $StageRoot "payload" + +if (-not (Test-Path $PythonInstaller)) { throw "Missing Python installer: $PythonInstaller" } +if (-not (Test-Path $UvExe)) { throw "Missing uv.exe: $UvExe" } +if (-not (Test-Path $MinerUModelSource)) { throw "Missing MinerU model source: $MinerUModelSource" } +if (-not (Test-Path (Join-Path $RepoRoot "dist\pdf2md-ui.exe"))) { throw "Missing UI exe. Build dist\pdf2md-ui.exe first." 
} + +Remove-Item -LiteralPath $StageRoot -Recurse -Force -ErrorAction SilentlyContinue +New-Item -ItemType Directory -Path $AppRoot,$RuntimeRoot,$PayloadRoot | Out-Null + +$Excluded = @(".git", ".venv", "samples", "outputs", "dist", "build", "node_modules", ".pytest_cache", "__pycache__") + +Copy-Item -Recurse -Force (Join-Path $RepoRoot "src") (Join-Path $RuntimeRoot "src") +Copy-Item -Force (Join-Path $RepoRoot "pyproject.toml") (Join-Path $RuntimeRoot "pyproject.toml") +Copy-Item -Force (Join-Path $RepoRoot "uv.lock") (Join-Path $RuntimeRoot "uv.lock") +Copy-Item -Force (Join-Path $RepoRoot "README.md") (Join-Path $RuntimeRoot "README.md") +Copy-Item -Force (Join-Path $RepoRoot "dist\pdf2md-ui.exe") (Join-Path $AppRoot "pdf2md-ui.exe") +New-Item -ItemType Directory -Path (Join-Path $PayloadRoot "python"),(Join-Path $PayloadRoot "uv") | Out-Null +Copy-Item -Force $PythonInstaller (Join-Path $PayloadRoot "python\python-3.12-amd64.exe") +Copy-Item -Force $UvExe (Join-Path $PayloadRoot "uv\uv.exe") +Copy-Item -Recurse -Force $MinerUModelSource (Join-Path $PayloadRoot "models") + +if ($NodeRoot -and (Test-Path $NodeRoot)) { + Copy-Item -Recurse -Force $NodeRoot (Join-Path $PayloadRoot "node") +} + +Write-Host "Offline installer stage created at $StageRoot" +Write-Host "Use pip download on the connected build PC to fill payload\wheelhouse before compiling the installer." 
+``` + +- [ ] **Step 5: Document the connected wheelhouse build command** + +Add to `packaging/offline/README.md`: + +```powershell +uv build --wheel +Copy-Item dist\convert_pdf_to_md-0.1.0-py3-none-any.whl dist\offline-installer\payload\wheelhouse\ +py -3.12 -m pip download -d dist\offline-installer\payload\wheelhouse -r packaging\offline\requirements-runtime-cu126.txt --find-links dist\offline-installer\payload\wheelhouse --extra-index-url https://download.pytorch.org/whl/cu126 +``` + +- [ ] **Step 6: Add the payload verifier** + +`verify-offline-payload.ps1` must read `payload\payload-manifest.json`, recompute SHA-256 for each listed file, and fail when a file is missing or changed. + +- [ ] **Step 7: Run tests** + +Run: + +```powershell +uv run pytest tests/test_offline_packaging.py -q +git diff --check +``` + +Expected: PASS. + +- [ ] **Step 8: Commit** + +```powershell +git add packaging\offline\build-offline-payload.ps1 packaging\offline\verify-offline-payload.ps1 packaging\offline\requirements-runtime-cu126.txt packaging\offline\README.md tests\test_offline_packaging.py +git commit -m "feat: plan offline payload builder" +``` + +## Task 3: Target Runtime Install And Repair Scripts + +**Files:** +- Create: `packaging/offline/install-runtime.ps1` +- Create: `packaging/offline/repair-runtime.ps1` +- Create: `packaging/offline/run-doctor.ps1` +- Modify: `tests/test_offline_packaging.py` + +- [ ] **Step 1: Write script safety tests** + +```python +from pathlib import Path + + +def test_install_runtime_uses_only_local_package_sources() -> None: + script = Path("packaging/offline/install-runtime.ps1").read_text(encoding="utf-8") + + assert "--no-index" in script + assert "--find-links" in script + assert "UV_OFFLINE" in script + assert "https://" not in script + assert "http://" not in script + + +def test_install_runtime_does_not_silently_overwrite_mineru_config() -> None: + script = Path("packaging/offline/install-runtime.ps1").read_text(encoding="utf-8") + + 
assert "mineru.json" in script + assert "Backup" in script + assert "Silent" in script + assert "throw" in script +``` + +- [ ] **Step 2: Run tests to verify failure** + +Run: + +```powershell +uv run pytest tests/test_offline_packaging.py -q +``` + +Expected: FAIL because scripts do not exist. + +- [ ] **Step 3: Implement `install-runtime.ps1`** + +The script must: + +```powershell +param( + [string]$InstallRoot = "$env:LOCALAPPDATA\Programs\ConvertPDFToMD", + [switch]$Silent +) + +$ErrorActionPreference = "Stop" +$PayloadRoot = Join-Path $InstallRoot "payload" +$RuntimeRoot = Join-Path $InstallRoot "runtime" +$VenvPython = Join-Path $RuntimeRoot ".venv\Scripts\python.exe" +$VenvPdf2Md = Join-Path $RuntimeRoot ".venv\Scripts\pdf2md.exe" +$UvExe = Join-Path $PayloadRoot "uv\uv.exe" +$Wheelhouse = Join-Path $PayloadRoot "wheelhouse" +$Requirements = Join-Path $PayloadRoot "requirements-runtime-cu126.txt" +$LogRoot = Join-Path $InstallRoot "logs" + +New-Item -ItemType Directory -Path $LogRoot -Force | Out-Null +$env:UV_OFFLINE = "1" +$env:MINERU_MODEL_SOURCE = "local" + +if (-not (Test-Path $UvExe)) { throw "Missing bundled uv.exe: $UvExe" } +if (-not (Test-Path $Wheelhouse)) { throw "Missing wheelhouse: $Wheelhouse" } +if (-not (Test-Path $Requirements)) { throw "Missing requirements: $Requirements" } + +& $UvExe venv (Join-Path $RuntimeRoot ".venv") --python 3.12 +if ($LASTEXITCODE -ne 0) { throw "uv venv failed with exit code $LASTEXITCODE" } + +& $UvExe pip install --python $VenvPython --no-index --find-links $Wheelhouse -r $Requirements +if ($LASTEXITCODE -ne 0) { throw "offline package install failed with exit code $LASTEXITCODE" } + +& $UvExe pip check --python $VenvPython +if ($LASTEXITCODE -ne 0) { throw "uv pip check failed with exit code $LASTEXITCODE" } + +$MinerUConfig = Join-Path $env:USERPROFILE "mineru.json" +if (Test-Path $MinerUConfig) { + if ($Silent) { throw "Existing mineru.json requires interactive confirmation: $MinerUConfig" } + $Backup = 
"$MinerUConfig.pdf2md-backup-$(Get-Date -Format yyyyMMddHHmmss)" + Copy-Item -Force $MinerUConfig $Backup +} + +& $VenvPdf2Md doctor *> (Join-Path $LogRoot "doctor-after-install.txt") +if ($LASTEXITCODE -ne 0) { throw "pdf2md doctor failed with exit code $LASTEXITCODE" } +``` + +- [ ] **Step 4: Implement repair and doctor scripts** + +`repair-runtime.ps1` reruns `install-runtime.ps1` for an existing install root. `run-doctor.ps1` runs the installed `.venv\Scripts\pdf2md.exe doctor` and writes `logs\doctor-latest.txt`. + +- [ ] **Step 5: Run tests** + +Run: + +```powershell +uv run pytest tests/test_offline_packaging.py -q +git diff --check +``` + +Expected: PASS. + +- [ ] **Step 6: Commit** + +```powershell +git add packaging\offline\install-runtime.ps1 packaging\offline\repair-runtime.ps1 packaging\offline\run-doctor.ps1 tests\test_offline_packaging.py +git commit -m "feat: add offline runtime install scripts" +``` + +## Task 4: UI Installed Runtime Resolution + +**Files:** +- Modify: `src/pdf2md_ui/runner.py` +- Modify: `src/pdf2md_ui/app.py` only if needed +- Modify: `tests/test_ui_runner.py` + +- [ ] **Step 1: Add failing runner tests** + +```python +from pathlib import Path + +from pdf2md_ui.runner import resolve_cli_command + + +def test_resolve_prefers_project_venv_pdf2md(tmp_path: Path) -> None: + root = tmp_path / "runtime" + scripts = root / ".venv" / "Scripts" + scripts.mkdir(parents=True) + (root / "pyproject.toml").write_text("[project]\nname='x'\n", encoding="utf-8") + pdf2md = scripts / "pdf2md.exe" + pdf2md.write_text("", encoding="utf-8") + + resolved = resolve_cli_command(project_root=root, which=lambda name: None) + + assert resolved.args_prefix == (str(pdf2md),) + assert resolved.cwd is None + assert resolved.source == "venv" + + +def test_resolve_uses_bundled_uv_offline_when_no_venv_command(tmp_path: Path) -> None: + root = tmp_path / "runtime" + root.mkdir() + (root / "pyproject.toml").write_text("[project]\nname='x'\n", encoding="utf-8") + uv 
= tmp_path / "payload" / "uv" / "uv.exe" + uv.parent.mkdir(parents=True) + uv.write_text("", encoding="utf-8") + + resolved = resolve_cli_command(project_root=root, bundled_uv=uv, which=lambda name: None) + + assert resolved.args_prefix == (str(uv), "run", "--offline", "pdf2md") + assert resolved.cwd == root + assert resolved.source == "bundled-uv" +``` + +- [ ] **Step 2: Run tests to verify failure** + +Run: + +```powershell +uv run pytest tests/test_ui_runner.py -q +``` + +Expected: FAIL because the runner does not yet support installed `.venv` or bundled uv resolution. + +- [ ] **Step 3: Implement minimal runner changes** + +Add `bundled_uv` as an optional keyword to `resolve_cli_command`, check `\.venv\Scripts\pdf2md.exe` after configured command and before PATH, and use bundled `uv run --offline pdf2md` before system `uv`. + +- [ ] **Step 4: Add child environment tests** + +Add a test that `build_child_environment(project_root=runtime_root)` prepends `.venv\Scripts` and `payload\node` when those folders exist, while preserving `MINERU_MODEL_SOURCE=custom` if the user already set it. + +- [ ] **Step 5: Run tests** + +Run: + +```powershell +uv run pytest tests/test_ui_runner.py -q +``` + +Expected: PASS. 
+ +- [ ] **Step 6: Commit** + +```powershell +git add src\pdf2md_ui\runner.py src\pdf2md_ui\app.py tests\test_ui_runner.py +git commit -m "feat: resolve installed offline runtime from UI" +``` + +## Task 5: Inno Setup Script + +**Files:** +- Create: `packaging/offline/Pdf2MdOffline.iss` +- Modify: `tests/test_offline_packaging.py` + +- [ ] **Step 1: Add Inno script tests** + +```python +from pathlib import Path + + +def test_inno_script_installs_payload_and_shortcuts() -> None: + script = Path("packaging/offline/Pdf2MdOffline.iss").read_text(encoding="utf-8") + + assert "DefaultDirName={localappdata}\\Programs\\ConvertPDFToMD" in script + assert "payload\\*" in script + assert "app\\*" in script + assert "runtime\\*" in script + assert "pdf2md-ui.exe" in script + assert "install-runtime.ps1" in script + assert "PDF2MD Doctor" in script + assert "Repair PDF2MD Runtime" in script + + +def test_inno_script_excludes_development_artifacts() -> None: + script = Path("packaging/offline/Pdf2MdOffline.iss").read_text(encoding="utf-8") + + assert "samples" not in script + assert "outputs" not in script + assert ".venv" not in script +``` + +- [ ] **Step 2: Run tests to verify failure** + +Run: + +```powershell +uv run pytest tests/test_offline_packaging.py -q +``` + +Expected: FAIL because the Inno script does not exist. 
+ +- [ ] **Step 3: Create the Inno script** + +```ini +[Setup] +AppId={{PDF2MD-OFFLINE-INSTALLER}} +AppName=ConvertPDFToMD +AppVersion=0.1.0 +DefaultDirName={localappdata}\Programs\ConvertPDFToMD +DefaultGroupName=ConvertPDFToMD +OutputDir=..\..\dist +OutputBaseFilename=Pdf2MdOfflineSetup-0.1.0 +Compression=lzma2 +SolidCompression=yes +PrivilegesRequired=lowest + +[Files] +Source: "..\..\dist\offline-installer\payload\*"; DestDir: "{app}\payload"; Flags: recursesubdirs createallsubdirs +Source: "..\..\dist\offline-installer\app\*"; DestDir: "{app}\app"; Flags: recursesubdirs createallsubdirs +Source: "..\..\dist\offline-installer\runtime\*"; DestDir: "{app}\runtime"; Flags: recursesubdirs createallsubdirs +Source: "install-runtime.ps1"; DestDir: "{app}\scripts" +Source: "repair-runtime.ps1"; DestDir: "{app}\scripts" +Source: "run-doctor.ps1"; DestDir: "{app}\scripts" + +[Icons] +Name: "{group}\ConvertPDFToMD"; Filename: "{app}\app\pdf2md-ui.exe"; WorkingDir: "{app}\runtime" +Name: "{group}\PDF2MD Doctor"; Filename: "powershell.exe"; Parameters: "-ExecutionPolicy Bypass -File ""{app}\scripts\run-doctor.ps1"""; WorkingDir: "{app}" +Name: "{group}\Repair PDF2MD Runtime"; Filename: "powershell.exe"; Parameters: "-ExecutionPolicy Bypass -File ""{app}\scripts\repair-runtime.ps1"""; WorkingDir: "{app}" + +[Run] +Filename: "powershell.exe"; Parameters: "-ExecutionPolicy Bypass -File ""{app}\scripts\install-runtime.ps1"" -InstallRoot ""{app}"""; StatusMsg: "Installing offline pdf2md runtime..."; Flags: runhidden +``` + +- [ ] **Step 4: Run tests** + +Run: + +```powershell +uv run pytest tests/test_offline_packaging.py -q +git diff --check +``` + +Expected: PASS. + +- [ ] **Step 5: Compile with Inno Setup on a build PC** + +Run: + +```powershell +ISCC.exe packaging\offline\Pdf2MdOffline.iss +``` + +Expected: exit code 0 and `dist\Pdf2MdOfflineSetup-0.1.0.exe` exists. Do not commit the generated exe. 
+ +- [ ] **Step 6: Commit** + +```powershell +git add packaging\offline\Pdf2MdOffline.iss tests\test_offline_packaging.py +git commit -m "feat: add offline installer script" +``` + +## Task 6: Documentation, Verification, And Handoff + +**Files:** +- Modify: `README.md` +- Modify: `docs/V1RELEASECHECKLIST.md` +- Modify: `docs/Sprints/SPRINT17CONTRACT.md` +- Modify: `PLAN.md` +- Modify: `PROGRESS.md` +- Modify: `docs/WORKARCHIVE.md` + +- [ ] **Step 1: Document build and install flow** + +Add a README section with: + +```markdown +## Offline Windows Installer + +The offline installer is built on an internet-connected Windows x64 build PC, then copied to a target Windows x64 PC with networking disabled. The target installer creates a fresh `.venv` from bundled wheels; it does not copy the development `.venv`. +``` + +- [ ] **Step 2: Document verification gates** + +Add to `docs/V1RELEASECHECKLIST.md`: + +```markdown +### Offline Installer Gate + +- Build `dist\pdf2md-ui.exe`. +- Stage the offline payload. +- Verify payload hashes. +- Compile the Inno Setup installer. +- Install on a clean Windows x64 VM with networking disabled. +- Run `pdf2md doctor` from the installed `.venv`. +- Run one optional local conversion only when a local test PDF is available and generated outputs remain ignored. +``` + +- [ ] **Step 3: Run final fast tests** + +Run: + +```powershell +uv run pytest tests/test_offline_packaging.py tests/test_ui_runner.py +uv run pytest +git diff --check +``` + +Expected: PASS, except pre-existing documented optional skips. 
+ +- [ ] **Step 4: Run packaging smoke on build PC** + +Run: + +```powershell +uv run --group ui-build pyinstaller --clean --onefile --windowed --name pdf2md-ui src\pdf2md_ui\app.py +$pythonInstaller = "C:\BuildCache\python-3.12-amd64.exe" +$uvExe = "C:\BuildCache\uv.exe" +$mineruModels = "C:\BuildCache\mineru-models" +powershell -ExecutionPolicy Bypass -File packaging\offline\build-offline-payload.ps1 -Configuration Release -PythonInstaller $pythonInstaller -UvExe $uvExe -MinerUModelSource $mineruModels +ISCC.exe packaging\offline\Pdf2MdOffline.iss +``` + +Expected: installer exe exists under `dist\`; generated files remain untracked. + +- [ ] **Step 5: Update coordination docs** + +Record changed files, verification output, generated installer path, payload size, and residual risks in `PROGRESS.md`. Move final implementation evidence and offline VM smoke results to `docs/WORKARCHIVE.md`. + +- [ ] **Step 6: Commit final docs** + +```powershell +git add README.md docs\V1RELEASECHECKLIST.md docs\Sprints\SPRINT17CONTRACT.md PLAN.md PROGRESS.md docs\WORKARCHIVE.md +git commit -m "docs: record offline installer release gate" +``` + +## Execution Notes + +- Do not commit payload contents, wheels, model files, Python installers, Node binaries, generated installer exe files, `samples/`, or `outputs/`. +- Keep runtime conversion strict-local. Setup-time payload creation may use internet only on the build PC. +- Treat license/model redistribution review as a release gate before sharing the installer outside the current personal environment. 
diff --git a/docs/superpowers/plans/2026-05-13-ui-folder-batch-conversion.md b/docs/superpowers/plans/2026-05-13-ui-folder-batch-conversion.md new file mode 100644 index 0000000..d947563 --- /dev/null +++ b/docs/superpowers/plans/2026-05-13-ui-folder-batch-conversion.md @@ -0,0 +1,111 @@ +# UI Folder Batch Conversion Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Add a minimal UI folder workflow that converts every direct-child PDF in a selected folder by sequentially invoking the existing `pdf2md convert` CLI. + +**Architecture:** Keep the converter and CLI unchanged. Add deterministic folder discovery and batch command construction to `src/pdf2md_ui/runner.py`, then make `src/pdf2md_ui/app.py` run a list of `CommandSpec` objects sequentially on the existing worker-thread/event-queue pattern. + +**Tech Stack:** Python 3.12, tkinter/ttk, pytest, PyInstaller, existing `pdf2md_ui.runner` subprocess wrapper. 
+ +--- + +### Task 1: Runner Batch Helpers + +**Files:** +- Modify: `tests/test_ui_runner.py` +- Modify: `src/pdf2md_ui/runner.py` + +- [x] **Step 1: Write failing tests** + +```python +def test_list_direct_pdf_files_returns_sorted_direct_children_only(tmp_path: Path) -> None: + (tmp_path / "b.PDF").write_text("", encoding="utf-8") + (tmp_path / "a.pdf").write_text("", encoding="utf-8") + nested = tmp_path / "nested" + nested.mkdir() + (nested / "c.pdf").write_text("", encoding="utf-8") + (tmp_path / "notes.txt").write_text("", encoding="utf-8") + + assert [path.name for path in list_direct_pdf_files(tmp_path)] == ["a.pdf", "b.PDF"] +``` + +```python +def test_build_batch_convert_commands_reuses_convert_options(tmp_path: Path) -> None: + resolved = ResolvedCommand(("pdf2md",), cwd=None, source="path") + pdfs = [tmp_path / "a.pdf", tmp_path / "b.pdf"] + + commands = build_batch_convert_commands( + resolved, + pdfs, + tmp_path / "out", + overwrite=True, + keep_raw=True, + chunk_pages=5, + gpu="auto", + mineru_profile="safe", + ) + + assert [command.args[2] for command in commands] == [str(pdfs[0]), str(pdfs[1])] + assert all("--chunk-pages" in command.args for command in commands) + assert all("--mineru-profile" in command.args for command in commands) +``` + +- [x] **Step 2: Run tests to verify RED** + +Run: `uv run pytest tests/test_ui_runner.py::test_list_direct_pdf_files_returns_sorted_direct_children_only tests/test_ui_runner.py::test_build_batch_convert_commands_reuses_convert_options -q` + +Expected: FAIL because the new helpers are not defined. + +- [x] **Step 3: Implement minimal runner helpers** + +Add `list_direct_pdf_files(folder)` using `Path.iterdir()` and case-insensitive `.pdf` suffix matching. Add `build_batch_convert_commands()` that loops over the provided PDF paths and delegates to `build_convert_command()`. + +- [x] **Step 4: Run tests to verify GREEN** + +Run: `uv run pytest tests/test_ui_runner.py -q` + +Expected: all UI runner tests pass. 
+ +### Task 2: Tk UI Batch Execution + +**Files:** +- Modify: `src/pdf2md_ui/app.py` + +- [x] **Step 1: Add folder state and controls** + +Add `input_folder_var`, a path row labeled `Input folder`, and a `Convert folder` button beside the existing action buttons. + +- [x] **Step 2: Add batch command startup** + +Implement `_choose_folder()`, `_run_folder_convert()`, and `_start_command_sequence()`. `_run_folder_convert()` validates the folder and output directory, parses `chunk_pages`, builds commands through the runner helper, and starts the sequence. + +- [x] **Step 3: Add sequential worker behavior** + +Run each command synchronously on the worker thread. Emit log messages before each file starts. Stop after the first non-zero exit code. If Cancel is requested, terminate the active command and do not start later commands. + +- [x] **Step 4: Run focused tests** + +Run: `uv run pytest tests/test_ui_runner.py -q` + +Expected: all UI runner tests pass; UI app imports without syntax errors through test collection. + +### Task 3: Build and Handoff + +**Files:** +- Modify: `PROGRESS.md` +- Generated ignored output: `dist/pdf2md-ui.exe` + +- [x] **Step 1: Rebuild the UI executable** + +Run: `uv run --group ui-build pyinstaller --clean --onefile --windowed --name pdf2md-ui src\pdf2md_ui\app.py` + +Expected: exit code 0 and `dist\pdf2md-ui.exe` exists. + +- [x] **Step 2: Update progress** + +Record the new UI folder batch feature and verification commands in `PROGRESS.md`. + +- [x] **Step 3: Check and commit** + +Run: `git diff --check`, `git status --short`, then commit only the scoped source, test, and documentation changes. 
diff --git a/docs/superpowers/specs/2026-05-13-ui-folder-batch-conversion-design.md b/docs/superpowers/specs/2026-05-13-ui-folder-batch-conversion-design.md new file mode 100644 index 0000000..f992707 --- /dev/null +++ b/docs/superpowers/specs/2026-05-13-ui-folder-batch-conversion-design.md @@ -0,0 +1,33 @@ +# UI Folder Batch Conversion Design + +## Goal + +Add a minimal UI workflow that lets the user select one folder and convert every PDF directly inside that folder to Markdown. + +## Scope + +- Include only `*.pdf` files directly under the selected folder. +- Exclude PDFs in nested folders. +- Reuse the existing `pdf2md convert` CLI command for each PDF. +- Keep conversion sequential to avoid GPU and MinerU runtime contention. +- Apply the existing UI conversion options to every PDF in the batch: output directory, overwrite, keep raw, grouped pages, GPU, and MinerU profile. + +## Design + +The runner layer owns folder discovery and batch command construction. It will expose a small helper that returns direct-child PDF paths in deterministic name order and another helper that builds one fixed-argument `CommandSpec` per PDF by calling the existing `build_convert_command()`. + +The Tk UI adds an input-folder row and a folder-convert button. When the user starts folder conversion, the UI validates the selected folder, builds the command list, and runs commands one at a time on the existing worker thread pattern. It logs each PDF before it starts, stops on the first non-zero exit code, and honors Cancel by terminating the currently running process and not starting later PDFs. + +## Non-Goals + +- No recursive folder conversion. +- No parallel conversion. +- No new CLI command. +- No direct MinerU invocation from the UI. +- No remote/API options or arbitrary shell command execution. + +## Verification + +- Add focused runner tests for direct-child PDF discovery, nested PDF exclusion, deterministic ordering, and batch command construction. 
+- Run `uv run pytest tests/test_ui_runner.py`. +- Rebuild the UI executable with PyInstaller and confirm `dist/pdf2md-ui.exe` exists. diff --git a/pyproject.toml b/pyproject.toml index 85ded6f..e73c942 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,6 +19,9 @@ pdf2md = "pdf2md.cli:main" dev = [ "pytest>=8.3", ] +ui-build = [ + "pyinstaller>=6.20,<7", +] [tool.hatch.build.targets.wheel] -packages = ["src/pdf2md"] +packages = ["src/pdf2md", "src/pdf2md_ui"] diff --git a/src/pdf2md/cli.py b/src/pdf2md/cli.py index db48302..7d49917 100644 --- a/src/pdf2md/cli.py +++ b/src/pdf2md/cli.py @@ -7,7 +7,14 @@ import sys from collections.abc import Sequence from pdf2md import __version__ -from pdf2md.conversion import DEFAULT_CHUNK_PAGES, DEFAULT_GPU_DEVICE, ConversionAdapter, convert_input, recheck_markdown +from pdf2md.conversion import ( + DEFAULT_CHUNK_PAGES, + DEFAULT_GPU_DEVICE, + DEFAULT_MINERU_PROFILE, + ConversionAdapter, + convert_input, + recheck_markdown, +) from pdf2md.doctor import DoctorReport, format_doctor_report, run_doctor from pdf2md.mineru_adapter import StrictLocalViolationError from pdf2md.paths import PathPlanningError @@ -35,7 +42,12 @@ def main( convert_parser = subparsers.add_parser("convert", help="Convert a PDF or directory of PDFs.") convert_parser.add_argument("input", help="Input PDF file or directory.") convert_parser.add_argument("--out", required=True, help="Output directory.") - convert_parser.add_argument("--metadata", action="store_true", default=True, help="Write metadata JSON. 
Enabled by default.") + convert_parser.add_argument( + "--metadata", + action="store_true", + default=True, + help="Accepted for compatibility; metadata JSON is not written in the simplified output layout.", + ) convert_parser.add_argument("--keep-raw", action="store_true", help="Keep raw MinerU output.") convert_parser.add_argument("--recursive", action="store_true", help="Recursively discover PDFs in directories.") convert_parser.add_argument("--overwrite", action="store_true", help="Overwrite planned outputs.") @@ -47,14 +59,20 @@ def main( type=_positive_int, metavar="PAGES", help=( - "Opt in to pre-conversion PDF chunking. If PAGES is omitted, " - f"{DEFAULT_CHUNK_PAGES} pages per chunk is used." + "Opt in to single-page MinerU conversion and group final outputs by PAGES. " + f"If PAGES is omitted, grouped outputs use {DEFAULT_CHUNK_PAGES} pages." ), ) convert_parser.add_argument( "--gpu", default=DEFAULT_GPU_DEVICE, - help=f"CUDA device. Defaults to {DEFAULT_GPU_DEVICE}.", + help=f"CUDA device or `auto` for the largest visible NVIDIA GPU. Defaults to {DEFAULT_GPU_DEVICE}.", + ) + convert_parser.add_argument( + "--mineru-profile", + choices=("auto", "safe", "performance"), + default=DEFAULT_MINERU_PROFILE, + help=f"MinerU runtime profile. 
Defaults to {DEFAULT_MINERU_PROFILE}.", ) convert_parser.add_argument( "--strict-local", @@ -104,6 +122,7 @@ def main( overwrite=args.overwrite, chunk_pages=args.chunk_pages, gpu=args.gpu, + mineru_profile=args.mineru_profile, strict_local=args.strict_local, adapter=adapter, math_checker=math_checker, diff --git a/src/pdf2md/conversion.py b/src/pdf2md/conversion.py index 6c488bc..39457bf 100644 --- a/src/pdf2md/conversion.py +++ b/src/pdf2md/conversion.py @@ -4,11 +4,12 @@ from __future__ import annotations import hashlib import json +import os import re import shutil import tempfile from collections.abc import Callable -from dataclasses import dataclass, replace +from dataclasses import dataclass from datetime import datetime, timezone from pathlib import Path, PurePosixPath from typing import Any, Protocol @@ -19,10 +20,12 @@ from pdf2md.ir import ( BlockType, DocumentRecord, PageRecord, + TextFidelityRecord, WarningCode, WarningRecord, WarningSeverity, ) +from pdf2md.gpu import GpuInfo, normalize_cuda_device, query_nvidia_gpus, select_gpu from pdf2md.markdown import normalize_markdown from pdf2md.math_render import create_default_math_checker from pdf2md.math_repair import repair_math_render_failures @@ -34,14 +37,27 @@ from pdf2md.mineru_adapter import ( MinerUOptions, StrictLocalViolationError, ) -from pdf2md.paths import DiscoveredPdf, PathLike, PlannedOutput, discover_pdfs, plan_outputs +from pdf2md.mineru_profile import resolve_mineru_profile +from pdf2md.paths import ( + DiscoveredPdf, + DuplicateOutputPathError, + OutputConflictError, + OutputPathError, + OutputRootError, + PathLike, + PlannedOutput, + discover_pdfs, + plan_outputs, +) from pdf2md.pdf_splitter import PdfChunkPlan, plan_pdf_chunks, write_pdf_chunk from pdf2md.quality import MathChecker, QualityResult, check_asset_links, check_math_renderability_details, merge_quality_results from pdf2md.report import FinalStatus, determine_final_status, render_report +from pdf2md.text_fidelity import 
TextFidelityResult, check_text_fidelity, extract_pdf_text_pages Clock = Callable[[], datetime] DEFAULT_GPU_DEVICE = "cuda:0" +DEFAULT_MINERU_PROFILE = "auto" DEFAULT_CHUNK_PAGES = 20 @@ -64,6 +80,8 @@ class ConversionResult: warning_count: int warnings: tuple[WarningRecord, ...] pages_processed: int + _report_metadata: dict[str, Any] | None = None + _report_quality: QualityResult | None = None @property def succeeded(self) -> bool: @@ -97,11 +115,22 @@ class _AssetMaterialization: @dataclass(frozen=True) class _ConversionTask: output_plan: PlannedOutput - chunk_plan: PdfChunkPlan | None = None + group_plan: PdfChunkPlan | None = None + group_size: int | None = None + page_plans: tuple[PdfChunkPlan, ...] = () original_source_pdf: Path | None = None original_source_sha256: str | None = None +@dataclass(frozen=True) +class _PageConversionArtifact: + source_page_number: int + group_page_index: int + result: ConversionResult + markdown: str | None + metadata: dict[str, Any] | None + + @dataclass(frozen=True) class _PreparedMarkdown: markdown: str @@ -117,6 +146,11 @@ _RECHECKED_WARNING_CODES = frozenset( WarningCode.MATH_RENDER_REPAIRED, WarningCode.ASSET_LINK_MISSING, WarningCode.ASSET_LINK_INVALID, + WarningCode.TEXT_LAYER_AVAILABLE, + WarningCode.TEXT_FIDELITY_LOW, + WarningCode.UNEXPECTED_CJK_IN_KOREAN_TEXT, + WarningCode.HANGUL_SPACING_SUSPECT, + WarningCode.TEXT_PAGE_MAPPING_UNCERTAIN, } ) @@ -129,13 +163,15 @@ def convert_pdf( keep_raw: bool = False, overwrite: bool = False, gpu: str | None = DEFAULT_GPU_DEVICE, + mineru_profile: str = DEFAULT_MINERU_PROFILE, + gpu_inventory: tuple[GpuInfo, ...] 
| None = None, strict_local: bool = True, adapter: ConversionAdapter | None = None, math_checker: MathChecker | None = None, chunk_pages: int | None = None, clock: Clock | None = None, ) -> ConversionResult | BatchConversionResult: - """Convert one local PDF into Markdown, metadata, and report outputs.""" + """Convert one local PDF into Markdown, assets, and report outputs.""" _raise_if_strict_local_disabled(strict_local) candidate = Path(input_path).expanduser() @@ -147,15 +183,17 @@ def convert_pdf( engine = adapter or MinerUAdapter() now = clock or _utc_now if chunk_pages is None: - plan = plan_outputs(discovered, output_dir, metadata=metadata, keep_raw=keep_raw, overwrite=overwrite)[0] + plan = plan_outputs(discovered, output_dir, metadata=False, keep_raw=keep_raw, overwrite=overwrite)[0] return _convert_plan( plan, adapter=engine, clock=now, - metadata_enabled=metadata, + metadata_enabled=False, keep_raw=keep_raw, overwrite=overwrite, gpu=gpu, + mineru_profile=mineru_profile, + gpu_inventory=gpu_inventory, strict_local=strict_local, math_checker=math_checker, ) @@ -163,7 +201,7 @@ def convert_pdf( tasks = _plan_conversion_tasks( discovered, output_dir, - metadata=metadata, + metadata=False, keep_raw=keep_raw, overwrite=overwrite, chunk_pages=chunk_pages, @@ -173,10 +211,12 @@ def convert_pdf( tasks, adapter=engine, clock=now, - metadata_enabled=metadata, + metadata_enabled=False, keep_raw=keep_raw, overwrite=overwrite, gpu=gpu, + mineru_profile=mineru_profile, + gpu_inventory=gpu_inventory, strict_local=strict_local, math_checker=math_checker, ) @@ -192,6 +232,8 @@ def convert_input( recursive: bool = False, overwrite: bool = False, gpu: str | None = DEFAULT_GPU_DEVICE, + mineru_profile: str = DEFAULT_MINERU_PROFILE, + gpu_inventory: tuple[GpuInfo, ...] 
| None = None, strict_local: bool = True, adapter: ConversionAdapter | None = None, math_checker: MathChecker | None = None, @@ -205,7 +247,7 @@ def convert_input( tasks = _plan_conversion_tasks( discovered, output_dir, - metadata=metadata, + metadata=False, keep_raw=keep_raw, overwrite=overwrite, chunk_pages=chunk_pages, @@ -217,10 +259,12 @@ def convert_input( tasks, adapter=engine, clock=now, - metadata_enabled=metadata, + metadata_enabled=False, keep_raw=keep_raw, overwrite=overwrite, gpu=gpu, + mineru_profile=mineru_profile, + gpu_inventory=gpu_inventory, strict_local=strict_local, math_checker=math_checker, ) @@ -242,7 +286,7 @@ def recheck_markdown( metadata_path = markdown_file.with_suffix(".metadata.json") report_path = markdown_file.with_suffix(".report.md") if not metadata_path.is_file(): - raise ValueError(f"Existing metadata JSON is required for recheck: {metadata_path}") + raise ValueError(f"Legacy adjacent metadata JSON is required for recheck: {metadata_path}") existing_metadata = _read_metadata_json(metadata_path) markdown = markdown_file.read_text(encoding="utf-8") @@ -256,13 +300,21 @@ def recheck_markdown( ) markdown = prepared.markdown quality = prepared.quality - warnings = _preserved_metadata_warnings(existing_metadata) + quality.warnings + engine_options = _metadata_engine_options(existing_metadata) + text_fidelity = _run_text_fidelity_checks( + Path(_metadata_text(existing_metadata, "source_pdf")), + markdown, + page_count=_metadata_page_count(existing_metadata), + engine_options=engine_options, + ) + warnings = _preserved_metadata_warnings(existing_metadata) + quality.warnings + text_fidelity.warnings document = _build_document( source_pdf=Path(_metadata_text(existing_metadata, "source_pdf")), markdown=markdown, assets=assets, warnings=warnings, raw_structured={"pages": [None] * _metadata_page_count(existing_metadata)}, + text_fidelity=text_fidelity.pages, ) now = clock or _utc_now metadata_data = build_metadata( @@ -271,7 +323,7 @@ def 
recheck_markdown( created_at=_format_timestamp(now()), engine=_metadata_text(existing_metadata, "engine"), engine_version=_metadata_text(existing_metadata, "engine_version"), - engine_options=_metadata_engine_options(existing_metadata), + engine_options=engine_options, ) report_quality = QualityResult( missing_asset_link_count=quality.missing_asset_link_count, @@ -303,6 +355,8 @@ def recheck_markdown( warning_count=len(warnings), warnings=warnings, pages_processed=int(metadata_data["summary"]["pages_processed"]), + _report_metadata=metadata_data, + _report_quality=report_quality, ) @@ -399,6 +453,22 @@ def _optional_bbox(value: object) -> tuple[float, float, float, float] | None: return tuple(float(part) for part in value) +def _int_value(value: object) -> int: + return value if isinstance(value, int) else 0 + + +def _float_value(value: object) -> float: + return float(value) if isinstance(value, int | float) else 0.0 + + +def _optional_float_value(value: object) -> float | None: + return float(value) if isinstance(value, int | float) else None + + +def _bool_value(value: object) -> bool: + return value if isinstance(value, bool) else False + + def _plan_conversion_tasks( discovered: tuple[DiscoveredPdf, ...], output_dir: PathLike, @@ -409,44 +479,113 @@ def _plan_conversion_tasks( chunk_pages: int | None, ) -> tuple[_ConversionTask, ...]: if chunk_pages is None: - plans = plan_outputs(discovered, output_dir, metadata=metadata, keep_raw=keep_raw, overwrite=overwrite) + plans = plan_outputs(discovered, output_dir, metadata=False, keep_raw=keep_raw, overwrite=overwrite) return tuple(_ConversionTask(output_plan=plan) for plan in plans) if not isinstance(chunk_pages, int) or chunk_pages < 1: raise ValueError("chunk_pages must be a positive integer") - planned_inputs: list[DiscoveredPdf] = [] - chunk_plans: list[PdfChunkPlan | None] = [] - original_sources: list[Path | None] = [] - source_hashes: dict[Path, str] = {} + root = _resolve_output_root(output_dir) + tasks: 
list[_ConversionTask] = [] for item in discovered: - chunks = plan_pdf_chunks(item.source_path, chunk_pages=chunk_pages) - if len(chunks) == 1: - planned_inputs.append(item) - chunk_plans.append(None) - original_sources.append(None) - continue - - source_hashes[item.source_path] = _sha256(item.source_path) - for chunk in chunks: - planned_inputs.append( - DiscoveredPdf( - source_path=item.source_path.with_name(chunk.output_filename), - relative_parent=item.relative_parent, + groups = plan_pdf_chunks(item.source_path, chunk_pages=chunk_pages) + page_plans = plan_pdf_chunks(item.source_path, chunk_pages=1) + source_hash = _sha256(item.source_path) + output_folder = _output_folder_for_pdf(root, item) + stem = item.source_path.stem + part_width = max(3, len(str(len(groups)))) + for group in groups: + part_stem = f"{stem}_{group.chunk_index:0{part_width}d}" + plan = PlannedOutput( + source_pdf=item.source_path, + markdown_path=output_folder / f"{part_stem}.md", + assets_dir=output_folder / "images", + metadata_path=None, + report_path=output_folder / f"{stem}_report.md", + raw_dir=output_folder / "raw" / part_stem if keep_raw else None, + ) + _raise_if_plan_escapes_root(plan, root) + tasks.append( + _ConversionTask( + output_plan=plan, + group_plan=group, + group_size=chunk_pages, + page_plans=tuple( + page + for page in page_plans + if group.start_page_index <= page.start_page_index < group.end_page_index + ), + original_source_pdf=item.source_path, + original_source_sha256=source_hash, ) ) - chunk_plans.append(chunk) - original_sources.append(item.source_path) - plans = plan_outputs(planned_inputs, output_dir, metadata=metadata, keep_raw=keep_raw, overwrite=overwrite) - return tuple( - _ConversionTask( - output_plan=plan, - chunk_plan=chunk, - original_source_pdf=original, - original_source_sha256=source_hashes[original] if original is not None else None, - ) - for plan, chunk, original in zip(plans, chunk_plans, original_sources, strict=True) - ) + 
_raise_if_duplicate_task_outputs(tasks) + if not overwrite: + _raise_if_task_output_conflicts(tasks) + return tuple(tasks) + + +def _resolve_output_root(output_dir: PathLike) -> Path: + root = Path(output_dir).expanduser() + if root.exists() and not root.is_dir(): + raise OutputRootError(f"output root exists and is not a directory: {root}") + return root.resolve(strict=False) + + +def _output_folder_for_pdf(output_root: Path, item: DiscoveredPdf) -> Path: + relative_parent = _safe_relative_parent(item.relative_parent) + return output_root / relative_parent / item.source_path.stem + + +def _safe_relative_parent(path: Path) -> Path: + if path.is_absolute() or path.drive or path.root or ".." in path.parts: + raise OutputPathError(f"relative parent would escape the output root: {path}") + return path + + +def _raise_if_plan_escapes_root(plan: PlannedOutput, output_root: Path) -> None: + root = output_root.resolve(strict=False) + for path in plan.planned_paths(): + try: + path.resolve(strict=False).relative_to(root) + except ValueError as error: + raise OutputPathError(f"planned path would escape the output root: {path}") from error + + +def _raise_if_duplicate_task_outputs(tasks: tuple[_ConversionTask, ...] | list[_ConversionTask]) -> None: + seen: set[str] = set() + duplicates: list[Path] = [] + for task in tasks: + paths = [task.output_plan.markdown_path] + if task.output_plan.raw_dir is not None: + paths.append(task.output_plan.raw_dir) + for path in paths: + key = _path_key(path) + if key in seen: + duplicates.append(path) + else: + seen.add(key) + if duplicates: + raise DuplicateOutputPathError(duplicates) + + +def _raise_if_task_output_conflicts(tasks: tuple[_ConversionTask, ...] | list[_ConversionTask]) -> None: + conflicts = tuple(path for path in _unique_task_output_paths(tasks) if path.exists()) + if conflicts: + raise OutputConflictError(conflicts) + + +def _unique_task_output_paths(tasks: tuple[_ConversionTask, ...] 
| list[_ConversionTask]) -> tuple[Path, ...]: + seen: set[str] = set() + paths: list[Path] = [] + for task in tasks: + for path in task.output_plan.planned_paths(): + key = _path_key(path) + if key in seen: + continue + seen.add(key) + paths.append(path) + return tuple(paths) def _convert_tasks( @@ -458,12 +597,17 @@ def _convert_tasks( keep_raw: bool, overwrite: bool, gpu: str | None, + mineru_profile: str, + gpu_inventory: tuple[GpuInfo, ...] | None, strict_local: bool, math_checker: MathChecker | None, ) -> tuple[ConversionResult, ...]: - if any(task.chunk_plan is not None for task in tasks): - with tempfile.TemporaryDirectory(prefix="pdf2md.chunks.") as chunk_directory: - return tuple( + if any(task.group_plan is not None for task in tasks): + if overwrite: + _clear_task_outputs(tasks) + source_text_pages_by_pdf = _source_text_pages_by_pdf(tasks) + with tempfile.TemporaryDirectory(prefix="pdf2md.pages.") as chunk_directory: + results = tuple( _convert_task( task, chunk_directory=Path(chunk_directory), @@ -471,13 +615,18 @@ def _convert_tasks( clock=clock, metadata_enabled=metadata_enabled, keep_raw=keep_raw, - overwrite=overwrite, + overwrite=False, gpu=gpu, + mineru_profile=mineru_profile, + gpu_inventory=gpu_inventory, strict_local=strict_local, math_checker=math_checker, + source_text_pages_by_pdf=source_text_pages_by_pdf, ) for task in tasks ) + _write_aggregate_group_reports(results) + return results return tuple( _convert_task( @@ -489,6 +638,8 @@ def _convert_tasks( keep_raw=keep_raw, overwrite=overwrite, gpu=gpu, + mineru_profile=mineru_profile, + gpu_inventory=gpu_inventory, strict_local=strict_local, math_checker=math_checker, ) @@ -496,6 +647,30 @@ def _convert_tasks( ) +def _source_text_pages_by_pdf(tasks: tuple[_ConversionTask, ...]) -> dict[str, tuple[str, ...]]: + cache: dict[str, tuple[str, ...]] = {} + for task in tasks: + if task.group_plan is None or task.original_source_pdf is None: + continue + key = _path_key(task.original_source_pdf) + if 
key in cache: + continue + try: + cache[key] = extract_pdf_text_pages(task.original_source_pdf) + except Exception: + cache[key] = () + return cache + + +def _cached_source_text_pages( + cache: dict[str, tuple[str, ...]] | None, + source_pdf: Path | None, +) -> tuple[str, ...] | None: + if cache is None or source_pdf is None: + return None + return cache.get(_path_key(source_pdf)) + + def _convert_task( task: _ConversionTask, *, @@ -506,10 +681,13 @@ def _convert_task( keep_raw: bool, overwrite: bool, gpu: str | None, + mineru_profile: str, + gpu_inventory: tuple[GpuInfo, ...] | None, strict_local: bool, math_checker: MathChecker | None, + source_text_pages_by_pdf: dict[str, tuple[str, ...]] | None = None, ) -> ConversionResult: - if task.chunk_plan is None: + if task.group_plan is None: return _convert_plan( task.output_plan, adapter=adapter, @@ -518,28 +696,714 @@ def _convert_task( keep_raw=keep_raw, overwrite=overwrite, gpu=gpu, + mineru_profile=mineru_profile, + gpu_inventory=gpu_inventory, strict_local=strict_local, math_checker=math_checker, ) if chunk_directory is None: - raise ValueError("chunk directory is required for chunked conversion") - chunk_pdf = write_pdf_chunk(task.chunk_plan, chunk_directory / task.chunk_plan.output_filename) - chunk_output_plan = replace(task.output_plan, source_pdf=chunk_pdf) - return _convert_plan( - chunk_output_plan, + raise ValueError("temporary directory is required for grouped page conversion") + return _convert_grouped_task( + task, + temporary_root=chunk_directory, adapter=adapter, clock=clock, metadata_enabled=metadata_enabled, keep_raw=keep_raw, overwrite=overwrite, gpu=gpu, + mineru_profile=mineru_profile, + gpu_inventory=gpu_inventory, strict_local=strict_local, math_checker=math_checker, - result_source_pdf=task.original_source_pdf, - metadata_source_pdf=task.original_source_pdf, - metadata_source_sha256=task.original_source_sha256, - engine_options_extra={"chunk": task.chunk_plan.metadata()}, + 
source_text_pages_by_pdf=source_text_pages_by_pdf, + ) + + +def _convert_grouped_task( + task: _ConversionTask, + *, + temporary_root: Path, + adapter: ConversionAdapter, + clock: Clock, + metadata_enabled: bool, + keep_raw: bool, + overwrite: bool, + gpu: str | None, + mineru_profile: str, + gpu_inventory: tuple[GpuInfo, ...] | None, + strict_local: bool, + math_checker: MathChecker | None, + source_text_pages_by_pdf: dict[str, tuple[str, ...]] | None, +) -> ConversionResult: + if task.group_plan is None or task.original_source_pdf is None or task.original_source_sha256 is None: + raise ValueError("grouped conversion requires an original source and group plan") + + page_root = temporary_root / f"group-{task.group_plan.chunk_index:03d}" + page_root.mkdir(parents=True, exist_ok=True) + source_text_pages = _cached_source_text_pages(source_text_pages_by_pdf, task.original_source_pdf) + artifacts = tuple( + _convert_single_page_artifact( + page_plan, + group_plan=task.group_plan, + page_root=page_root, + adapter=adapter, + clock=clock, + keep_raw=keep_raw, + gpu=gpu, + mineru_profile=mineru_profile, + gpu_inventory=gpu_inventory, + strict_local=strict_local, + math_checker=math_checker, + original_source_pdf=task.original_source_pdf, + original_source_sha256=task.original_source_sha256, + source_text_pages=source_text_pages, + ) + for page_plan in task.page_plans + ) + return _write_grouped_outputs( + task.output_plan, + group_plan=task.group_plan, + group_size=task.group_size, + artifacts=artifacts, + metadata_enabled=metadata_enabled, + clock=clock, + gpu=gpu, + mineru_profile=mineru_profile, + gpu_inventory=gpu_inventory, + strict_local=strict_local, + original_source_pdf=task.original_source_pdf, + original_source_sha256=task.original_source_sha256, + math_checker=math_checker, + ) + + +def _convert_single_page_artifact( + page_plan: PdfChunkPlan, + *, + group_plan: PdfChunkPlan, + page_root: Path, + adapter: ConversionAdapter, + clock: Clock, + keep_raw: bool, + 
gpu: str | None, + mineru_profile: str, + gpu_inventory: tuple[GpuInfo, ...] | None, + strict_local: bool, + math_checker: MathChecker | None, + original_source_pdf: Path, + original_source_sha256: str, + source_text_pages: tuple[str, ...] | None, +) -> _PageConversionArtifact: + page_pdf = write_pdf_chunk(page_plan, page_root / _page_pdf_filename(page_plan)) + page_output_plan = _temporary_page_output_plan(page_pdf, page_root, keep_raw=keep_raw) + result = _convert_plan( + page_output_plan, + adapter=adapter, + clock=clock, + metadata_enabled=True, + keep_raw=keep_raw, + overwrite=True, + gpu=gpu, + mineru_profile=mineru_profile, + gpu_inventory=gpu_inventory, + strict_local=strict_local, + math_checker=math_checker, + result_source_pdf=original_source_pdf, + metadata_source_pdf=original_source_pdf, + metadata_source_sha256=original_source_sha256, + engine_options_extra={"chunk": _chunk_metadata(page_plan)}, + source_text_pages=source_text_pages, + ) + markdown = result.markdown_path.read_text(encoding="utf-8") if result.succeeded and result.markdown_path.is_file() else None + metadata = _read_metadata_json(result.metadata_path) if result.metadata_path is not None and result.metadata_path.is_file() else None + return _PageConversionArtifact( + source_page_number=page_plan.source_page_start, + group_page_index=page_plan.start_page_index - group_plan.start_page_index, + result=result, + markdown=markdown, + metadata=metadata, + ) + + +def _write_grouped_outputs( + plan: PlannedOutput, + *, + group_plan: PdfChunkPlan, + group_size: int | None, + artifacts: tuple[_PageConversionArtifact, ...], + metadata_enabled: bool, + clock: Clock, + gpu: str | None, + mineru_profile: str, + gpu_inventory: tuple[GpuInfo, ...] 
| None, + strict_local: bool, + original_source_pdf: Path, + original_source_sha256: str, + math_checker: MathChecker | None, +) -> ConversionResult: + successful = tuple(artifact for artifact in artifacts if artifact.result.succeeded and artifact.markdown is not None) + all_failed = not successful + warnings = _group_warnings(artifacts, all_failed=all_failed) + engine = _first_engine(artifacts) + engine_version = _first_engine_version(artifacts) + engine_options = _group_engine_options( + artifacts, + group_plan=group_plan, + group_size=group_size or group_plan.page_count, + gpu=gpu, + mineru_profile=mineru_profile, + gpu_inventory=gpu_inventory, + strict_local=strict_local, + failed_source_pages=tuple(artifact.source_page_number for artifact in artifacts if not artifact.result.succeeded), + ) + text_fidelity = _group_text_fidelity(artifacts) + + quality = QualityResult() + assets: tuple[AssetRecord, ...] = () + markdown = "" + plan.assets_dir.mkdir(parents=True, exist_ok=True) + if not all_failed: + markdown, assets, asset_warnings = _assemble_group_markdown_and_assets(plan, artifacts) + prepared = _prepare_markdown_for_output( + markdown, + markdown_dir=plan.markdown_path.parent, + asset_root=plan.assets_dir, + math_checker=math_checker, + ) + markdown = prepared.markdown + quality = prepared.quality + warnings = warnings + asset_warnings + quality.warnings + + document = _build_document( + source_pdf=original_source_pdf, + markdown=markdown, + assets=assets, + warnings=warnings, + raw_structured={"pages": [None] * group_plan.page_count}, + text_fidelity=text_fidelity, + ) + metadata_data = build_metadata( + document=document, + source_sha256=original_source_sha256, + created_at=_format_timestamp(clock()), + engine=engine, + engine_version=engine_version, + engine_options=engine_options, + ) + report_quality = QualityResult( + missing_asset_link_count=quality.missing_asset_link_count, + invalid_asset_link_count=quality.invalid_asset_link_count, + ) + report_text 
= render_report( + metadata_data, + quality=report_quality, + markdown_path=plan.markdown_path if not all_failed else None, + metadata_path=plan.metadata_path if metadata_enabled else None, + report_path=plan.report_path, + ) + final_status = "failed" if all_failed else determine_final_status(metadata_data, report_quality) + + plan.markdown_path.parent.mkdir(parents=True, exist_ok=True) + if not all_failed: + _write_text(plan.markdown_path, markdown) + _copy_group_raw_outputs(plan.raw_dir, artifacts) + if metadata_enabled and plan.metadata_path is not None: + _write_text(plan.metadata_path, json.dumps(metadata_data, indent=2, ensure_ascii=False, sort_keys=True) + "\n") + _write_text(plan.report_path, report_text) + + return ConversionResult( + source_pdf=original_source_pdf, + markdown_path=plan.markdown_path, + metadata_path=plan.metadata_path if metadata_enabled else None, + report_path=plan.report_path, + assets_dir=plan.assets_dir, + raw_dir=plan.raw_dir, + engine=engine, + engine_version=engine_version, + final_status=final_status, + warning_count=int(metadata_data["summary"]["warning_count"]), + warnings=warnings, + pages_processed=int(metadata_data["summary"]["pages_processed"]), + _report_metadata=metadata_data, + _report_quality=report_quality, + ) + + +def _write_aggregate_group_reports(results: tuple[ConversionResult, ...]) -> None: + grouped: dict[Path, list[ConversionResult]] = {} + for result in results: + if result._report_metadata is None: + continue + grouped.setdefault(result.report_path, []).append(result) + + for report_path, report_results in grouped.items(): + metadatas = tuple(result._report_metadata for result in report_results if result._report_metadata is not None) + if not metadatas: + continue + aggregate_metadata = _aggregate_report_metadata(tuple(report_results), metadatas) + aggregate_metadata["engine_options"]["output_folder"] = str(report_path.parent) + aggregate_quality = _aggregate_report_quality(tuple(report_results)) + 
report_text = render_report( + aggregate_metadata, + quality=aggregate_quality, + markdown_path=None, + metadata_path=None, + report_path=report_path, + ) + _write_text(report_path, report_text) + + +def _aggregate_report_metadata( + results: tuple[ConversionResult, ...], + metadatas: tuple[dict[str, Any], ...], +) -> dict[str, Any]: + first = metadatas[0] + summary = _aggregate_summary(metadatas) + parts = [_part_report_record(result, metadata) for result, metadata in zip(results, metadatas, strict=True)] + engine_options = _aggregate_engine_options(first.get("engine_options", {}), parts) + warnings = _aggregate_warning_records(metadatas) + text_fidelity = _aggregate_text_fidelity_records(metadatas) + aggregate: dict[str, Any] = { + "source_pdf": first.get("source_pdf", "unavailable"), + "source_sha256": first.get("source_sha256", "unavailable"), + "created_at": first.get("created_at", "unavailable"), + "engine": first.get("engine", ENGINE_NAME), + "engine_version": first.get("engine_version", "unknown"), + "engine_options": engine_options, + "pages": [{} for _ in range(max(1, _int_from_summary(summary, "pages_processed")))], + "assets": [asset for metadata in metadatas for asset in _list_value(metadata.get("assets"))], + "warnings": warnings, + "summary": {**summary, "warning_count": len(warnings)}, + } + if text_fidelity: + aggregate["text_fidelity"] = text_fidelity + return aggregate + + +def _aggregate_summary(metadatas: tuple[dict[str, Any], ...]) -> dict[str, Any]: + keys = ( + "pages_processed", + "warning_count", + "asset_count", + "display_formula_count", + "inline_formula_count", + "math_render_error_count", + "text_fidelity_checked_page_count", + "text_fidelity_low_page_count", + "text_fidelity_unexpected_cjk_count", + "text_fidelity_replacement_candidate_page_count", + "text_fidelity_page_mapping_uncertain_count", + ) + summary: dict[str, Any] = {} + for key in keys: + total = sum(_int_from_summary(_dict_value(metadata.get("summary")), key) for 
metadata in metadatas) + if total or key in {"pages_processed", "warning_count", "asset_count", "display_formula_count", "inline_formula_count", "math_render_error_count"}: + summary[key] = total + return summary + + +def _part_report_record(result: ConversionResult, metadata: dict[str, Any]) -> dict[str, Any]: + engine_options = _dict_value(metadata.get("engine_options")) + chunk = _dict_value(engine_options.get("chunk")) + page_conversion = _dict_value(engine_options.get("page_conversion")) + record: dict[str, Any] = { + "index": _int_value(chunk.get("chunk_index")), + "total": _int_value(chunk.get("total_chunks")), + "source_page_start": _int_value(chunk.get("source_page_start")), + "source_page_end": _int_value(chunk.get("source_page_end")), + "markdown_path": str(result.markdown_path) if result.markdown_path.exists() else None, + "status": result.final_status, + "warning_count": result.warning_count, + } + failed_source_pages = page_conversion.get("failed_source_pages") + if isinstance(failed_source_pages, list): + record["failed_source_pages"] = [page for page in failed_source_pages if isinstance(page, int)] + return record + + +def _aggregate_engine_options(first_options: object, parts: list[dict[str, Any]]) -> dict[str, Any]: + engine_options = _dict_value(first_options) + engine_options.pop("chunk", None) + engine_options.pop("page_conversion", None) + engine_options["parts"] = parts + failed_pages = sorted( + page + for part in parts + for page in _list_value(part.get("failed_source_pages")) + if isinstance(page, int) + ) + if failed_pages: + engine_options["failed_source_pages"] = failed_pages + return engine_options + + +def _aggregate_report_quality(results: tuple[ConversionResult, ...]) -> QualityResult: + return QualityResult( + missing_asset_link_count=sum((result._report_quality or QualityResult()).missing_asset_link_count for result in results), + invalid_asset_link_count=sum((result._report_quality or QualityResult()).invalid_asset_link_count for 
result in results), + math_render_error_count=sum((result._report_quality or QualityResult()).math_render_error_count for result in results), + ) + + +def _aggregate_warning_records(metadatas: tuple[dict[str, Any], ...]) -> list[dict[str, Any]]: + warnings: list[dict[str, Any]] = [] + for metadata in metadatas: + page_offset = _source_page_offset(metadata) + for warning in _list_value(metadata.get("warnings")): + if not isinstance(warning, dict): + continue + adjusted = dict(warning) + page_index = adjusted.get("page_index") + if isinstance(page_index, int): + adjusted["page_index"] = page_offset + page_index + warnings.append(adjusted) + return warnings + + +def _aggregate_text_fidelity_records(metadatas: tuple[dict[str, Any], ...]) -> list[dict[str, Any]]: + records: list[dict[str, Any]] = [] + for metadata in metadatas: + page_offset = _source_page_offset(metadata) + for record in _list_value(metadata.get("text_fidelity")): + if not isinstance(record, dict): + continue + adjusted = dict(record) + page_index = adjusted.get("page_index") + if isinstance(page_index, int): + adjusted["page_index"] = page_offset + page_index + records.append(adjusted) + return records + + +def _source_page_offset(metadata: dict[str, Any]) -> int: + chunk = _dict_value(_dict_value(metadata.get("engine_options")).get("chunk")) + source_page_start = chunk.get("source_page_start") + return source_page_start - 1 if isinstance(source_page_start, int) and source_page_start > 0 else 0 + + +def _dict_value(value: object) -> dict[str, Any]: + return dict(value) if isinstance(value, dict) else {} + + +def _list_value(value: object) -> list[object]: + return list(value) if isinstance(value, list) else [] + + +def _int_from_summary(summary: dict[str, Any], key: str) -> int: + value = summary.get(key) + return value if isinstance(value, int) else 0 + + +def _page_pdf_filename(page_plan: PdfChunkPlan) -> str: + width = page_plan.page_number_width + return 
f"{page_plan.source_pdf.stem}.page-{page_plan.source_page_start:0{width}d}.pdf" + + +def _temporary_page_output_plan(page_pdf: Path, page_root: Path, *, keep_raw: bool) -> PlannedOutput: + output_dir = page_root / "outputs" + stem = page_pdf.stem + return PlannedOutput( + source_pdf=page_pdf, + markdown_path=output_dir / f"{stem}.md", + assets_dir=output_dir / f"{stem}.assets", + metadata_path=output_dir / f"{stem}.metadata.json", + report_path=output_dir / f"{stem}.report.md", + raw_dir=output_dir / f"{stem}.raw" if keep_raw else None, + ) + + +def _chunk_metadata(plan: PdfChunkPlan) -> dict[str, object]: + return { + "original_source_pdf": str(plan.source_pdf), + "chunk_index": plan.chunk_index, + "total_chunks": plan.total_chunks, + "source_page_start": plan.source_page_start, + "source_page_end": plan.source_page_end, + "chunk_page_count": plan.page_count, + } + + +def _group_engine_options( + artifacts: tuple[_PageConversionArtifact, ...], + *, + group_plan: PdfChunkPlan, + group_size: int, + gpu: str | None, + mineru_profile: str, + gpu_inventory: tuple[GpuInfo, ...] 
| None, + strict_local: bool, + failed_source_pages: tuple[int, ...], +) -> dict[str, Any]: + engine_options = _first_page_engine_options(artifacts) + if not engine_options: + engine_options = _mineru_options( + gpu=gpu, + mineru_profile=mineru_profile, + gpu_inventory=gpu_inventory, + strict_local=strict_local, + ).to_engine_options() + engine_options.pop("chunk", None) + engine_options.pop("page_conversion", None) + engine_options["chunk"] = _chunk_metadata(group_plan) + engine_options["page_conversion"] = { + "mode": "single_page", + "mineru_input_page_count": 1, + "output_group_page_count": group_size, + "failed_source_pages": list(failed_source_pages), + } + return engine_options + + +def _first_page_engine_options(artifacts: tuple[_PageConversionArtifact, ...]) -> dict[str, Any]: + for artifact in artifacts: + if artifact.metadata is None: + continue + value = artifact.metadata.get("engine_options") + if isinstance(value, dict): + return dict(value) + return {} + + +def _mineru_options( + *, + gpu: str | None, + mineru_profile: str, + gpu_inventory: tuple[GpuInfo, ...] | None, + strict_local: bool, +) -> MinerUOptions: + gpu_device, selected_gpu = _resolve_gpu(gpu, gpu_inventory) + cuda_requested = bool(gpu_device and gpu_device.startswith("cuda:")) + warn_without_inventory = mineru_profile.strip().casefold() != DEFAULT_MINERU_PROFILE + profile = resolve_mineru_profile( + mineru_profile, + selected_gpu=selected_gpu, + cuda_requested=cuda_requested and (selected_gpu is not None or warn_without_inventory), + ) + return MinerUOptions( + strict_local=strict_local, + gpu_device=gpu_device, + mineru_profile=mineru_profile, + profile_environment=profile.environment, + profile_engine_options=profile.to_engine_options(), + profile_warnings=profile.warnings, + ) + + +def _resolve_gpu(gpu: str | None, gpu_inventory: tuple[GpuInfo, ...] 
| None) -> tuple[str | None, GpuInfo | None]: + requested = normalize_cuda_device(gpu) + if requested is None: + return None, None + + if requested.casefold() == "auto": + inventory = gpu_inventory if gpu_inventory is not None else query_nvidia_gpus() + selection = select_gpu(inventory, requested) + return selection.cuda_device, selection.gpu + + if gpu_inventory is None: + return requested, None + + selection = select_gpu(gpu_inventory, requested) + return selection.cuda_device, selection.gpu + + +def _first_engine(artifacts: tuple[_PageConversionArtifact, ...]) -> str: + for artifact in artifacts: + if artifact.result.engine: + return artifact.result.engine + return ENGINE_NAME + + +def _first_engine_version(artifacts: tuple[_PageConversionArtifact, ...]) -> str: + for artifact in artifacts: + if artifact.result.engine_version: + return artifact.result.engine_version + return "unknown" + + +def _assemble_group_markdown_and_assets( + plan: PlannedOutput, + artifacts: tuple[_PageConversionArtifact, ...], +) -> tuple[str, tuple[AssetRecord, ...], tuple[WarningRecord, ...]]: + sections: list[str] = [] + assets: list[AssetRecord] = [] + warnings: list[WarningRecord] = [] + copied_asset_names: set[str] = set() + for artifact in artifacts: + if artifact.result.succeeded and artifact.markdown is not None: + page_markdown, page_assets, page_warnings = _copy_page_assets_for_group( + plan.assets_dir, + artifact, + copied_asset_names, + ) + assets.extend(page_assets) + warnings.extend(page_warnings) + body = page_markdown.strip() + if body: + sections.append(f"\n\n{body}") + else: + sections.append(f"") + continue + sections.append(f"") + + return "\n\n".join(sections).rstrip() + "\n", tuple(assets), tuple(warnings) + + +def _copy_page_assets_for_group( + group_assets_dir: Path, + artifact: _PageConversionArtifact, + copied_asset_names: set[str], +) -> tuple[str, tuple[AssetRecord, ...], tuple[WarningRecord, ...]]: + if artifact.markdown is None or artifact.metadata is None: 
+ return artifact.markdown or "", (), () + + link_map: dict[str, str] = {} + assets: list[AssetRecord] = [] + warnings: list[WarningRecord] = [] + for page_asset in _assets_from_metadata(artifact.metadata): + source = artifact.result.markdown_path.parent / page_asset.relative_path + if not source.is_file(): + warnings.append( + WarningRecord( + WarningCode.ASSET_LINK_MISSING, + WarningSeverity.WARNING, + f"Page asset could not be copied into grouped output: {page_asset.relative_path}", + page_index=artifact.group_page_index, + ) + ) + continue + + destination_relative = _group_asset_relative_path(page_asset.relative_path, artifact, copied_asset_names) + destination = group_assets_dir.joinpath(*destination_relative.parts) + try: + destination.resolve(strict=False).relative_to(group_assets_dir.resolve(strict=False)) + except ValueError: + warnings.append( + WarningRecord( + WarningCode.ASSET_LINK_INVALID, + WarningSeverity.WARNING, + f"Grouped asset destination would escape assets directory: {page_asset.relative_path}", + page_index=artifact.group_page_index, + ) + ) + continue + + destination.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(source, destination) + final_link = PurePosixPath(group_assets_dir.name, destination_relative).as_posix() + link_map[page_asset.relative_path.replace("\\", "/")] = final_link + assets.append(AssetRecord(final_link, page_index=artifact.group_page_index)) + + return _rewrite_asset_links(artifact.markdown, link_map), tuple(assets), tuple(warnings) + + +def _group_asset_relative_path( + relative_path: str, + artifact: _PageConversionArtifact, + copied_asset_names: set[str], +) -> PurePosixPath: + parts = PurePosixPath(relative_path.replace("\\", "/")).parts + if parts and parts[0] == artifact.result.assets_dir.name: + parts = parts[1:] + if not parts: + parts = ("asset",) + original_name = PurePosixPath(*parts).name or "asset" + return _unique_asset_filename(f"page-{artifact.source_page_number:03d}_{original_name}", 
copied_asset_names) + + +def _group_warnings( + artifacts: tuple[_PageConversionArtifact, ...], + *, + all_failed: bool, +) -> tuple[WarningRecord, ...]: + warnings: list[WarningRecord] = [] + for artifact in artifacts: + page_warnings = _artifact_warnings(artifact) + if artifact.result.succeeded: + warnings.extend(_adjust_warning_for_group(warning, artifact.group_page_index) for warning in page_warnings) + continue + + severity = WarningSeverity.ERROR if all_failed else WarningSeverity.WARNING + if not page_warnings: + page_warnings = ( + WarningRecord( + WarningCode.MINERU_CLI_FAILED, + severity, + f"MinerU failed for source page {artifact.source_page_number}.", + ), + ) + warnings.extend( + WarningRecord( + warning.code, + severity, + f"Source page {artifact.source_page_number}: {warning.message}", + page_index=artifact.group_page_index, + bbox=warning.bbox, + ) + for warning in page_warnings + ) + return tuple(warnings) + + +def _artifact_warnings(artifact: _PageConversionArtifact) -> tuple[WarningRecord, ...]: + if artifact.metadata is None: + return artifact.result.warnings + raw_warnings = artifact.metadata.get("warnings") + if not isinstance(raw_warnings, list): + return artifact.result.warnings + warnings = tuple( + warning + for item in raw_warnings + if isinstance(item, dict) + for warning in (_warning_from_metadata(item),) + if warning is not None + ) + return warnings if warnings else artifact.result.warnings + + +def _adjust_warning_for_group(warning: WarningRecord, group_page_index: int) -> WarningRecord: + page_index = group_page_index if warning.page_index is None else group_page_index + warning.page_index + return WarningRecord( + warning.code, + warning.severity, + warning.message, + page_index=page_index, + bbox=warning.bbox, + ) + + +def _group_text_fidelity(artifacts: tuple[_PageConversionArtifact, ...]) -> tuple[TextFidelityRecord, ...]: + records: list[TextFidelityRecord] = [] + for artifact in artifacts: + if artifact.metadata is None: + 
continue + raw_records = artifact.metadata.get("text_fidelity") + if not isinstance(raw_records, list): + continue + for item in raw_records: + if isinstance(item, dict): + records.append(_text_fidelity_from_metadata(item, group_page_index=artifact.group_page_index)) + return tuple(records) + + +def _text_fidelity_from_metadata(item: dict[str, Any], *, group_page_index: int) -> TextFidelityRecord: + source_page_number = item.get("source_page_number") + return TextFidelityRecord( + page_index=group_page_index + _int_value(item.get("page_index")), + source_page_number=source_page_number if isinstance(source_page_number, int) else None, + pypdf_text_available=_bool_value(item.get("pypdf_text_available")), + markdown_text_available=_bool_value(item.get("markdown_text_available")), + pypdf_hangul_count=_int_value(item.get("pypdf_hangul_count")), + markdown_hangul_count=_int_value(item.get("markdown_hangul_count")), + hangul_count_delta=_int_value(item.get("hangul_count_delta")), + hangul_count_ratio=_optional_float_value(item.get("hangul_count_ratio")), + unexpected_cjk_count=_int_value(item.get("unexpected_cjk_count")), + pypdf_hangul_spacing_anomaly_ratio=_float_value(item.get("pypdf_hangul_spacing_anomaly_ratio")), + markdown_hangul_spacing_anomaly_ratio=_float_value(item.get("markdown_hangul_spacing_anomaly_ratio")), + text_similarity=_optional_float_value(item.get("text_similarity")), + replacement_candidate=_bool_value(item.get("replacement_candidate")), + comparison_status=str(item.get("comparison_status") or "unknown"), ) @@ -552,18 +1416,26 @@ def _convert_plan( keep_raw: bool, overwrite: bool, gpu: str | None, + mineru_profile: str, + gpu_inventory: tuple[GpuInfo, ...] | None, strict_local: bool, math_checker: MathChecker | None, result_source_pdf: Path | None = None, metadata_source_pdf: Path | None = None, metadata_source_sha256: str | None = None, engine_options_extra: dict[str, object] | None = None, + source_text_pages: tuple[str, ...] 
| None = None, ) -> ConversionResult: if overwrite: _clear_planned_outputs(plan) plan.markdown_path.parent.mkdir(parents=True, exist_ok=True) - options = MinerUOptions(strict_local=strict_local, gpu_device=gpu) + options = _mineru_options( + gpu=gpu, + mineru_profile=mineru_profile, + gpu_inventory=gpu_inventory, + strict_local=strict_local, + ) if keep_raw: if plan.raw_dir is None: @@ -581,6 +1453,7 @@ def _convert_plan( metadata_source_pdf=metadata_source_pdf, metadata_source_sha256=metadata_source_sha256, engine_options_extra=engine_options_extra, + source_text_pages=source_text_pages, ) with tempfile.TemporaryDirectory(prefix=f"{plan.source_pdf.stem}.", dir=plan.markdown_path.parent) as temporary_dir: @@ -596,6 +1469,7 @@ def _convert_plan( metadata_source_pdf=metadata_source_pdf, metadata_source_sha256=metadata_source_sha256, engine_options_extra=engine_options_extra, + source_text_pages=source_text_pages, ) @@ -611,23 +1485,37 @@ def _convert_in_work_dir( metadata_source_pdf: Path | None = None, metadata_source_sha256: str | None = None, engine_options_extra: dict[str, object] | None = None, + source_text_pages: tuple[str, ...] 
| None = None, ) -> ConversionResult: result_source = result_source_pdf or plan.source_pdf metadata_source = metadata_source_pdf or result_source try: adapter_result = adapter.convert(plan.source_pdf, work_dir, options) except StrictLocalViolationError as error: - return _failed_result(plan, warnings=(error.warning,), source_pdf=result_source) + return _failed_result( + plan, + warnings=(error.warning,), + source_pdf=result_source, + metadata_source_pdf=metadata_source, + metadata_source_sha256=metadata_source_sha256, + engine_options=options.to_engine_options(), + clock=clock, + ) engine = adapter_result.engine or ENGINE_NAME engine_version = adapter_result.engine_version or "unknown" + adapter_warnings = _merge_option_warnings(options.profile_warnings, adapter_result.warnings) if not adapter_result.succeeded: return _failed_result( plan, - warnings=adapter_result.warnings, + warnings=adapter_warnings, engine=engine, engine_version=engine_version, source_pdf=result_source, + metadata_source_pdf=metadata_source, + metadata_source_sha256=metadata_source_sha256, + engine_options=options.to_engine_options(), + clock=clock, ) if adapter_result.raw_markdown is None: @@ -638,12 +1526,17 @@ def _convert_in_work_dir( ) return _failed_result( plan, - warnings=adapter_result.warnings + (warning,), + warnings=adapter_warnings + (warning,), engine=engine, engine_version=engine_version, source_pdf=result_source, + metadata_source_pdf=metadata_source, + metadata_source_sha256=metadata_source_sha256, + engine_options=options.to_engine_options(), + clock=clock, ) + plan.assets_dir.mkdir(parents=True, exist_ok=True) assets = _materialize_assets(adapter_result.asset_paths, work_dir, plan.assets_dir) markdown_source = _rewrite_asset_links(adapter_result.raw_markdown, assets.link_map) normalized = normalize_markdown( @@ -659,17 +1552,25 @@ def _convert_in_work_dir( math_checker=math_checker, ) quality = prepared.quality - warnings = adapter_result.warnings + assets.warnings + 
normalized.warnings + quality.warnings + engine_options = dict(adapter_result.engine_options) + if engine_options_extra: + engine_options.update(engine_options_extra) + text_fidelity = _run_text_fidelity_checks( + metadata_source, + prepared.markdown, + page_count=_page_count(adapter_result.raw_structured), + engine_options=engine_options, + source_text_pages=source_text_pages, + ) + warnings = adapter_warnings + assets.warnings + normalized.warnings + quality.warnings + text_fidelity.warnings document = _build_document( source_pdf=metadata_source, markdown=prepared.markdown, assets=assets.records, warnings=warnings, raw_structured=adapter_result.raw_structured, + text_fidelity=text_fidelity.pages, ) - engine_options = dict(adapter_result.engine_options) - if engine_options_extra: - engine_options.update(engine_options_extra) metadata_data = build_metadata( document=document, source_sha256=metadata_source_sha256 or _sha256(metadata_source), @@ -709,6 +1610,8 @@ def _convert_in_work_dir( warning_count=len(warnings), warnings=warnings, pages_processed=int(metadata_data["summary"]["pages_processed"]), + _report_metadata=metadata_data, + _report_quality=report_quality, ) @@ -731,7 +1634,7 @@ def _materialize_assets(asset_paths: tuple[Path, ...], work_dir: Path, assets_di warnings.append(_warning(WarningCode.ASSET_LINK_INVALID, f"Adapter asset path is outside the work directory: {source_path}")) continue - destination_relative = _destination_asset_relative(source_relative) + destination_relative = _unique_asset_filename(_asset_filename(source_path, len(copied) + 1), copied) destination = assets_dir / destination_relative try: destination.resolve(strict=False).relative_to(assets_dir.resolve(strict=False)) @@ -739,14 +1642,8 @@ def _materialize_assets(asset_paths: tuple[Path, ...], work_dir: Path, assets_di warnings.append(_warning(WarningCode.ASSET_LINK_INVALID, f"Adapter asset destination is outside the assets directory: {source_path}")) continue - destination_key = 
destination_relative.as_posix() - if destination_key in copied: - warnings.append(_warning(WarningCode.ASSET_LINK_INVALID, f"Duplicate adapter asset destination was skipped: {destination_key}")) - continue - destination.parent.mkdir(parents=True, exist_ok=True) shutil.copy2(source_path, destination) - copied.add(destination_key) final_link = PurePosixPath(assets_dir.name, destination_relative).as_posix() records.append(AssetRecord(final_link)) @@ -755,11 +1652,28 @@ def _materialize_assets(asset_paths: tuple[Path, ...], work_dir: Path, assets_di return _AssetMaterialization(records=tuple(records), warnings=tuple(warnings), link_map=link_map) -def _destination_asset_relative(source_relative: Path) -> PurePosixPath: - parts = PurePosixPath(source_relative.as_posix()).parts - if len(parts) > 1 and parts[0].casefold() in {"asset", "assets", "image", "images"}: - parts = parts[1:] - return PurePosixPath(*parts) +def _asset_filename(source_path: Path, index: int) -> str: + name = source_path.name.strip() + if name and name not in {".", ".."}: + return name + suffix = source_path.suffix if source_path.suffix else "" + return f"asset-{index:03d}{suffix}" + + +def _unique_asset_filename(filename: str, used_names: set[str]) -> PurePosixPath: + clean_name = PurePosixPath(filename.replace("\\", "/")).name + if not clean_name or clean_name in {".", ".."}: + clean_name = "asset" + path = PurePosixPath(clean_name) + stem = path.stem or "asset" + suffix = path.suffix + candidate = f"{stem}{suffix}" + index = 2 + while candidate.casefold() in used_names: + candidate = f"{stem}-{index:03d}{suffix}" + index += 1 + used_names.add(candidate.casefold()) + return PurePosixPath(candidate) def _add_asset_link_keys( @@ -814,6 +1728,7 @@ def _build_document( assets: tuple[AssetRecord, ...], warnings: tuple[WarningRecord, ...], raw_structured: object | None, + text_fidelity: tuple = (), ) -> DocumentRecord: page_count = _page_count(raw_structured) blocks = _formula_blocks(markdown) @@ -821,7 
+1736,38 @@ def _build_document( PageRecord(page_index=page_index, blocks=blocks if page_index == 0 else ()) for page_index in range(page_count) ] - return DocumentRecord(source_pdf=source_pdf, pages=tuple(pages), assets=assets, warnings=warnings) + return DocumentRecord( + source_pdf=source_pdf, + pages=tuple(pages), + assets=assets, + warnings=warnings, + text_fidelity=text_fidelity, + ) + + +def _run_text_fidelity_checks( + source_pdf: Path, + markdown: str, + *, + page_count: int, + engine_options: dict[str, Any], + source_text_pages: tuple[str, ...] | None = None, +) -> TextFidelityResult: + return check_text_fidelity( + source_pdf, + markdown, + page_count=page_count, + engine_options=engine_options, + source_text_pages=source_text_pages, + ) + + +def _merge_option_warnings( + option_warnings: tuple[WarningRecord, ...], + adapter_warnings: tuple[WarningRecord, ...], +) -> tuple[WarningRecord, ...]: + extras = tuple(warning for warning in option_warnings if warning not in adapter_warnings) + return extras + adapter_warnings def _run_quality_checks( @@ -944,11 +1890,43 @@ def _failed_result( engine: str = ENGINE_NAME, engine_version: str = "unknown", source_pdf: Path | None = None, + metadata_source_pdf: Path | None = None, + metadata_source_sha256: str | None = None, + engine_options: dict[str, Any] | None = None, + clock: Clock | None = None, ) -> ConversionResult: + result_source = source_pdf or plan.source_pdf + metadata_source = metadata_source_pdf or result_source + metadata_data: dict[str, Any] | None = None + report_quality = QualityResult() + if clock is not None: + document = DocumentRecord( + source_pdf=metadata_source, + pages=(PageRecord(page_index=0),), + assets=(), + warnings=warnings, + ) + metadata_data = build_metadata( + document=document, + source_sha256=metadata_source_sha256 or _sha256(metadata_source), + created_at=_format_timestamp(clock()), + engine=engine, + engine_version=engine_version, + engine_options=engine_options or {}, + ) + 
report_text = render_report( + metadata_data, + quality=report_quality, + markdown_path=None, + metadata_path=None, + report_path=plan.report_path, + ) + _write_text(plan.report_path, report_text) + return ConversionResult( - source_pdf=source_pdf or plan.source_pdf, + source_pdf=result_source, markdown_path=plan.markdown_path, - metadata_path=plan.metadata_path, + metadata_path=None, report_path=plan.report_path, assets_dir=plan.assets_dir, raw_dir=plan.raw_dir, @@ -957,7 +1935,9 @@ def _failed_result( final_status="failed", warning_count=len(warnings), warnings=warnings, - pages_processed=0, + pages_processed=0 if metadata_data is None else int(metadata_data["summary"]["pages_processed"]), + _report_metadata=metadata_data, + _report_quality=report_quality, ) @@ -969,6 +1949,28 @@ def _clear_planned_outputs(plan: PlannedOutput) -> None: path.unlink() +def _clear_task_outputs(tasks: tuple[_ConversionTask, ...]) -> None: + for path in _unique_task_output_paths(tasks): + if path.is_dir(): + shutil.rmtree(path) + elif path.exists(): + path.unlink() + + +def _copy_group_raw_outputs(raw_dir: Path | None, artifacts: tuple[_PageConversionArtifact, ...]) -> None: + if raw_dir is None: + return + for artifact in artifacts: + source_raw_dir = artifact.result.raw_dir + if source_raw_dir is None or not source_raw_dir.exists(): + continue + destination = raw_dir / f"page-{artifact.source_page_number:03d}" + if destination.exists(): + shutil.rmtree(destination) + destination.parent.mkdir(parents=True, exist_ok=True) + shutil.copytree(source_raw_dir, destination) + + def _write_text(path: Path, text: str) -> None: path.parent.mkdir(parents=True, exist_ok=True) path.write_text(text, encoding="utf-8") @@ -982,6 +1984,10 @@ def _sha256(path: Path) -> str: return digest.hexdigest() +def _path_key(path: Path) -> str: + return os.path.normcase(os.path.normpath(str(path.resolve(strict=False)))) + + def _format_timestamp(value: datetime) -> str: if value.tzinfo is None: value = 
value.replace(tzinfo=timezone.utc) diff --git a/src/pdf2md/doctor.py b/src/pdf2md/doctor.py index 5bf4002..b8d36a8 100644 --- a/src/pdf2md/doctor.py +++ b/src/pdf2md/doctor.py @@ -13,8 +13,10 @@ from dataclasses import dataclass from pathlib import Path from typing import Any, Literal, Protocol +from pdf2md.gpu import NVIDIA_SMI_QUERY, GpuInfo, parse_nvidia_smi_gpus, select_gpu from pdf2md.math_render import default_mathjax_helper_path from pdf2md.mineru_adapter import CommandResult, MinerUAdapter, MinerUVersionResult +from pdf2md.mineru_profile import resolve_mineru_profile DoctorStatus = Literal["pass", "warn", "fail"] @@ -178,13 +180,7 @@ def _check_gpu(which: Which, run_command: CommandRunner) -> DoctorCheck: if nvidia_smi_path is None: return DoctorCheck("gpu", "warn", "nvidia-smi was not found; NVIDIA GPU visibility could not be confirmed.") - result = run_command( - ( - "nvidia-smi", - "--query-gpu=name,memory.total,driver_version", - "--format=csv,noheader", - ) - ) + result = run_command(NVIDIA_SMI_QUERY) if result.exit_code != 0: return DoctorCheck( "gpu", @@ -193,20 +189,30 @@ def _check_gpu(which: Which, run_command: CommandRunner) -> DoctorCheck: (f"path: {nvidia_smi_path}", f"exit code: {result.exit_code}", _trim_detail(result.stderr)), ) - gpu_lines = tuple(line.strip() for line in result.stdout.splitlines() if line.strip()) - if not gpu_lines: + try: + gpus = parse_nvidia_smi_gpus(result.stdout) + except ValueError as error: + return DoctorCheck( + "gpu", + "warn", + "nvidia-smi output could not be parsed.", + (f"path: {nvidia_smi_path}", str(error), _trim_detail(result.stdout)), + ) + + if not gpus: return DoctorCheck("gpu", "warn", "nvidia-smi reported no visible NVIDIA GPU.", (f"path: {nvidia_smi_path}",)) - risky_names = tuple(line for line in gpu_lines if _is_pascal_or_pre_turing(line)) - if risky_names: + details = [f"path: {nvidia_smi_path}", *_gpu_detail_lines(gpus), *_gpu_recommendation_details(gpus)] + risky_gpus = tuple(gpu for gpu in 
gpus if gpu.pre_turing_risk) + if risky_gpus: return DoctorCheck( "gpu", "warn", "NVIDIA GPU is visible, but Pascal/pre-Turing compatibility risk was detected.", - (f"path: {nvidia_smi_path}", *risky_names), + tuple(details), ) - return DoctorCheck("gpu", "pass", "NVIDIA GPU is visible.", (f"path: {nvidia_smi_path}", *gpu_lines)) + return DoctorCheck("gpu", "pass", "NVIDIA GPU is visible.", tuple(details)) def _check_pytorch(import_module: ImportModule) -> DoctorCheck: @@ -269,6 +275,25 @@ def _check_pytorch(import_module: ImportModule) -> DoctorCheck: return DoctorCheck("pytorch", "pass", f"PyTorch {version} reports CUDA available.", tuple(details)) +def _gpu_detail_lines(gpus: tuple[GpuInfo, ...]) -> tuple[str, ...]: + return tuple( + f"gpu {gpu.index}: {gpu.name}, {gpu.memory_total_mib} MiB, driver {gpu.driver_version}" + for gpu in gpus + ) + + +def _gpu_recommendation_details(gpus: tuple[GpuInfo, ...]) -> tuple[str, ...]: + try: + selection = select_gpu(gpus, "auto") + except ValueError: + return () + profile = resolve_mineru_profile("auto", selected_gpu=selection.gpu, cuda_requested=True) + return ( + f"auto gpu: {selection.cuda_device} ({selection.gpu.name}, {selection.gpu.memory_total_mib} MiB)", + f"recommended MinerU profile: {profile.applied_profile}", + ) + + def _check_model_cache(env: Mapping[str, str], path_exists: PathExists, home: Path) -> DoctorCheck: configured_values: list[str] = [] existing_paths: list[str] = [] diff --git a/src/pdf2md/gpu.py b/src/pdf2md/gpu.py new file mode 100644 index 0000000..a1b2eab --- /dev/null +++ b/src/pdf2md/gpu.py @@ -0,0 +1,150 @@ +"""NVIDIA GPU inventory helpers for local runtime tuning.""" + +from __future__ import annotations + +import re +import shutil +import subprocess +from collections.abc import Callable +from dataclasses import dataclass + + +CommandRunner = Callable[[tuple[str, ...]], tuple[int, str, str]] + +NVIDIA_SMI_QUERY = ( + "nvidia-smi", + "--query-gpu=index,name,memory.total,driver_version", + 
"--format=csv,noheader,nounits", +) + + +@dataclass(frozen=True) +class GpuInfo: + index: int + name: str + memory_total_mib: int + driver_version: str + compute_capability: tuple[int, int] | None = None + + @property + def pre_turing_risk(self) -> bool: + if self.compute_capability is not None: + return self.compute_capability < (7, 5) + return _is_pascal_or_pre_turing(self.name) + + +@dataclass(frozen=True) +class GpuSelection: + gpu: GpuInfo + cuda_device: str + + +def parse_nvidia_smi_gpus(output: str) -> tuple[GpuInfo, ...]: + """Parse local ``nvidia-smi`` CSV output.""" + + gpus: list[GpuInfo] = [] + for line in output.splitlines(): + if not line.strip(): + continue + parts = [part.strip() for part in line.split(",")] + if len(parts) < 4: + raise ValueError(f"nvidia-smi GPU row must contain index, name, memory, and driver: {line}") + index = _parse_int(parts[0], "GPU index") + memory = _parse_memory_mib(parts[2]) + gpus.append(GpuInfo(index=index, name=parts[1], memory_total_mib=memory, driver_version=parts[3])) + return tuple(gpus) + + +def query_nvidia_gpus( + *, + which: Callable[[str], str | None] = shutil.which, + runner: CommandRunner | None = None, +) -> tuple[GpuInfo, ...]: + """Return visible NVIDIA GPUs, or an empty tuple when inventory is unavailable.""" + + if which("nvidia-smi") is None: + return () + run = runner or _run_command + exit_code, stdout, _stderr = run(NVIDIA_SMI_QUERY) + if exit_code != 0: + return () + try: + return parse_nvidia_smi_gpus(stdout) + except ValueError: + return () + + +def select_gpu(gpus: tuple[GpuInfo, ...], requested: str) -> GpuSelection: + request = requested.strip().casefold() + if request == "auto": + if not gpus: + raise ValueError("No visible NVIDIA GPU was found for --gpu auto.") + gpu = max(gpus, key=lambda item: (item.memory_total_mib, -item.index)) + return GpuSelection(gpu=gpu, cuda_device=f"cuda:{gpu.index}") + + index = _requested_index(request) + for gpu in gpus: + if gpu.index == index: + return 
GpuSelection(gpu=gpu, cuda_device=f"cuda:{gpu.index}") + raise ValueError(f"Requested CUDA GPU index {index} is not visible.") + + +def normalize_cuda_device(requested: str | None) -> str | None: + if requested is None: + return None + device = requested.strip() + if not device: + return None + if device.isdecimal(): + return f"cuda:{device}" + return device + + +def _run_command(command: tuple[str, ...]) -> tuple[int, str, str]: + try: + completed = subprocess.run(command, check=False, capture_output=True, text=True, timeout=20) + except (FileNotFoundError, subprocess.TimeoutExpired) as error: + return (127, "", str(error)) + return (completed.returncode, completed.stdout, completed.stderr) + + +def _requested_index(value: str) -> int: + if value.isdecimal(): + return int(value) + match = re.fullmatch(r"cuda:(\d+)", value) + if match: + return int(match.group(1)) + raise ValueError(f"Unsupported GPU request: {value}") + + +def _parse_int(value: str, field_name: str) -> int: + try: + parsed = int(value) + except ValueError as error: + raise ValueError(f"invalid {field_name}: {value}") from error + if parsed < 0: + raise ValueError(f"invalid {field_name}: {value}") + return parsed + + +def _parse_memory_mib(value: str) -> int: + match = re.search(r"\d+", value) + if match is None: + raise ValueError(f"invalid GPU memory value: {value}") + return _parse_int(match.group(0), "GPU memory") + + +def _is_pascal_or_pre_turing(value: str) -> bool: + normalized = value.casefold() + risky_tokens = ( + "gtx 10", + "gtx 9", + "gtx 8", + "gtx 7", + "gtx 6", + "gtx 5", + "tesla p", + "quadro p", + "pascal", + ) + return any(token in normalized for token in risky_tokens) diff --git a/src/pdf2md/ir.py b/src/pdf2md/ir.py index 59c1f02..2baaa74 100644 --- a/src/pdf2md/ir.py +++ b/src/pdf2md/ir.py @@ -40,6 +40,12 @@ class WarningCode(StrEnum): STRICT_LOCAL_VIOLATION = "STRICT_LOCAL_VIOLATION" MINERU_CLI_FAILED = "MINERU_CLI_FAILED" TABLE_FALLBACK = "TABLE_FALLBACK" + 
TEXT_LAYER_AVAILABLE = "TEXT_LAYER_AVAILABLE" + TEXT_FIDELITY_LOW = "TEXT_FIDELITY_LOW" + UNEXPECTED_CJK_IN_KOREAN_TEXT = "UNEXPECTED_CJK_IN_KOREAN_TEXT" + HANGUL_SPACING_SUSPECT = "HANGUL_SPACING_SUSPECT" + TEXT_PAGE_MAPPING_UNCERTAIN = "TEXT_PAGE_MAPPING_UNCERTAIN" + MINERU_PROFILE_ADJUSTED = "MINERU_PROFILE_ADJUSTED" class WarningSeverity(StrEnum): @@ -140,12 +146,75 @@ class WarningRecord: return data +@dataclass(frozen=True) +class TextFidelityRecord: + page_index: int + source_page_number: int | None + pypdf_text_available: bool + markdown_text_available: bool + pypdf_hangul_count: int + markdown_hangul_count: int + hangul_count_delta: int + hangul_count_ratio: float | None + unexpected_cjk_count: int + pypdf_hangul_spacing_anomaly_ratio: float + markdown_hangul_spacing_anomaly_ratio: float + text_similarity: float | None + replacement_candidate: bool + comparison_status: str + + def __post_init__(self) -> None: + _validate_page_index(self.page_index) + if self.source_page_number is not None and self.source_page_number < 1: + raise ValueError("source_page_number must be positive when provided") + for field_name in ( + "pypdf_hangul_count", + "markdown_hangul_count", + "unexpected_cjk_count", + ): + value = getattr(self, field_name) + if not isinstance(value, int) or value < 0: + raise ValueError(f"{field_name} must be a non-negative integer") + if self.hangul_count_ratio is not None and self.hangul_count_ratio < 0.0: + raise ValueError("hangul_count_ratio must be non-negative") + for field_name in ( + "pypdf_hangul_spacing_anomaly_ratio", + "markdown_hangul_spacing_anomaly_ratio", + "text_similarity", + ): + value = getattr(self, field_name) + if value is not None and not 0.0 <= value <= 1.0: + raise ValueError(f"{field_name} must be between 0.0 and 1.0") + if not self.comparison_status: + raise ValueError("comparison_status is required") + + def to_dict(self) -> dict[str, object]: + data: dict[str, object] = { + "page_index": self.page_index, + 
"pypdf_text_available": self.pypdf_text_available, + "markdown_text_available": self.markdown_text_available, + "pypdf_hangul_count": self.pypdf_hangul_count, + "markdown_hangul_count": self.markdown_hangul_count, + "hangul_count_delta": self.hangul_count_delta, + "unexpected_cjk_count": self.unexpected_cjk_count, + "pypdf_hangul_spacing_anomaly_ratio": self.pypdf_hangul_spacing_anomaly_ratio, + "markdown_hangul_spacing_anomaly_ratio": self.markdown_hangul_spacing_anomaly_ratio, + "replacement_candidate": self.replacement_candidate, + "comparison_status": self.comparison_status, + } + _add_optional(data, "source_page_number", self.source_page_number) + _add_optional(data, "hangul_count_ratio", self.hangul_count_ratio) + _add_optional(data, "text_similarity", self.text_similarity) + return data + + @dataclass(frozen=True) class DocumentRecord: source_pdf: PathLike pages: tuple[PageRecord, ...] assets: tuple[AssetRecord, ...] = field(default_factory=tuple) warnings: tuple[WarningRecord, ...] = field(default_factory=tuple) + text_fidelity: tuple[TextFidelityRecord, ...] 
= field(default_factory=tuple) def __post_init__(self) -> None: if not str(self.source_pdf): @@ -157,6 +226,7 @@ class DocumentRecord: object.__setattr__(self, "pages", pages) object.__setattr__(self, "assets", _tuple_of(AssetRecord, self.assets, "assets")) object.__setattr__(self, "warnings", _tuple_of(WarningRecord, self.warnings, "warnings")) + object.__setattr__(self, "text_fidelity", _tuple_of(TextFidelityRecord, self.text_fidelity, "text_fidelity")) def to_dict(self) -> dict[str, object]: return { @@ -164,6 +234,7 @@ class DocumentRecord: "pages": [page.to_dict() for page in self.pages], "assets": [asset.to_dict() for asset in self.assets], "warnings": [warning.to_dict() for warning in self.warnings], + "text_fidelity": [record.to_dict() for record in self.text_fidelity], } diff --git a/src/pdf2md/metadata.py b/src/pdf2md/metadata.py index 2dc242e..6fa1e92 100644 --- a/src/pdf2md/metadata.py +++ b/src/pdf2md/metadata.py @@ -46,6 +46,8 @@ def build_metadata( "warnings": [warning.to_dict() for warning in document.warnings], "summary": build_summary(document), } + if document.text_fidelity: + metadata["text_fidelity"] = [record.to_dict() for record in document.text_fidelity] _ensure_json_serializable(metadata) return metadata @@ -54,7 +56,7 @@ def build_summary(document: DocumentRecord) -> JsonObject: """Build required summary counts for metadata and later reports.""" blocks = tuple(iter_blocks(document.pages)) - return { + summary: JsonObject = { "pages_processed": len(document.pages), "warning_count": len(document.warnings), "asset_count": len(document.assets), @@ -62,6 +64,29 @@ def build_summary(document: DocumentRecord) -> JsonObject: "inline_formula_count": sum(block.block_type == BlockType.INLINE_FORMULA for block in blocks), "math_render_error_count": count_non_info_warnings(document.warnings, WarningCode.MATH_RENDER_FAILED), } + if document.text_fidelity: + summary.update( + { + "text_fidelity_checked_page_count": sum( + record.comparison_status == 
"checked" for record in document.text_fidelity + ), + "text_fidelity_low_page_count": count_non_info_warnings( + document.warnings, + WarningCode.TEXT_FIDELITY_LOW, + ) + or sum(_is_low_text_fidelity_record(record) for record in document.text_fidelity), + "text_fidelity_unexpected_cjk_count": sum( + record.unexpected_cjk_count for record in document.text_fidelity + ), + "text_fidelity_replacement_candidate_page_count": sum( + record.replacement_candidate for record in document.text_fidelity + ), + "text_fidelity_page_mapping_uncertain_count": sum( + record.comparison_status == "page_mapping_uncertain" for record in document.text_fidelity + ), + } + ) + return summary def count_warnings(warnings: tuple[WarningRecord, ...], code: WarningCode) -> int: @@ -72,6 +97,12 @@ def count_non_info_warnings(warnings: tuple[WarningRecord, ...], code: WarningCo return sum(warning.code == code and warning.severity != WarningSeverity.INFO for warning in warnings) +def _is_low_text_fidelity_record(record) -> bool: + low_ratio = record.hangul_count_ratio is not None and record.hangul_count_ratio < 0.95 + low_similarity = record.text_similarity is not None and record.text_similarity < 0.82 + return record.comparison_status == "checked" and (low_ratio or low_similarity or record.unexpected_cjk_count > 0) + + def _require_text(value: str | None, field_name: str) -> None: if not value: raise MetadataInputError(f"{field_name} is required") diff --git a/src/pdf2md/mineru_adapter.py b/src/pdf2md/mineru_adapter.py index 16a81ac..b76e7ec 100644 --- a/src/pdf2md/mineru_adapter.py +++ b/src/pdf2md/mineru_adapter.py @@ -49,6 +49,10 @@ class CommandResult: class MinerUOptions: strict_local: bool = True gpu_device: str | None = None + mineru_profile: str = "auto" + profile_environment: Mapping[str, str] = field(default_factory=dict) + profile_engine_options: Mapping[str, object] = field(default_factory=dict) + profile_warnings: tuple[WarningRecord, ...] 
= () engine_version: str | None = None extra_cli_args: tuple[str, ...] = () engine_options: Mapping[str, object] = field(default_factory=dict) @@ -57,6 +61,15 @@ class MinerUOptions: data: dict[str, object] = {"strict_local": self.strict_local} if self.gpu_device is not None: data["gpu_device"] = self.gpu_device + data["mineru_profile"] = ( + dict(self.profile_engine_options) + if self.profile_engine_options + else { + "requested": self.mineru_profile, + "applied": self.mineru_profile, + "environment": dict(self.profile_environment), + } + ) data.update(dict(self.engine_options)) return data @@ -249,7 +262,14 @@ def validate_strict_local_options(options: MinerUOptions) -> None: if not options.strict_local: raise StrictLocalViolationError("strict-local execution cannot be disabled in v1.") - values: list[object] = [options.gpu_device, *options.extra_cli_args, options.engine_options] + values: list[object] = [ + options.gpu_device, + options.mineru_profile, + *options.extra_cli_args, + options.engine_options, + options.profile_engine_options, + tuple(options.profile_environment.values()), + ] for value in values: _reject_prohibited_value(value) @@ -313,6 +333,8 @@ def _run_command(command: tuple[str, ...]) -> CommandResult: check=False, capture_output=True, text=True, + encoding="utf-8", + errors="replace", ) return CommandResult( command=command, @@ -343,25 +365,31 @@ def _run_with_environment( def _mineru_environment(options: MinerUOptions) -> dict[str, str]: - if options.gpu_device is None: - return {} - - device = options.gpu_device.strip() - if not device: - return {} - - if device.isdecimal(): - device = f"cuda:{device}" - - environment = {"MINERU_DEVICE_MODE": device} - if device.startswith("cuda:"): - index = device.split(":", 1)[1].strip() - if index: - environment["MINERU_DEVICE_MODE"] = "cuda" - environment["CUDA_VISIBLE_DEVICES"] = index + environment: dict[str, str] = {} + if options.gpu_device is not None: + device = options.gpu_device.strip() + if 
device: + if device.isdecimal(): + device = f"cuda:{device}" + environment["MINERU_DEVICE_MODE"] = device + if device.startswith("cuda:"): + index = device.split(":", 1)[1].strip() + if index: + environment["MINERU_DEVICE_MODE"] = "cuda" + environment["CUDA_VISIBLE_DEVICES"] = index + environment.update(_profile_environment(options.profile_environment)) return environment +def _profile_environment(environment: Mapping[str, str]) -> dict[str, str]: + allowed_names = { + "MINERU_PROCESSING_WINDOW_SIZE", + "MINERU_API_MAX_CONCURRENT_REQUESTS", + "MINERU_PDF_RENDER_THREADS", + } + return {name: str(value) for name, value in environment.items() if name in allowed_names} + + def _first_file(root: Path, pattern: str) -> Path | None: matches = sorted(root.rglob(pattern), key=lambda path: path.as_posix().casefold()) return matches[0] if matches else None @@ -394,6 +422,7 @@ def _result( stderr: str = "", warnings: tuple[WarningRecord, ...] = (), ) -> MinerUAdapterResult: + option_warnings = tuple(warning for warning in options.profile_warnings if warning not in warnings) return MinerUAdapterResult( succeeded=succeeded, command=command, @@ -402,7 +431,7 @@ def _result( raw_markdown=raw_markdown, raw_structured=raw_structured, asset_paths=asset_paths, - warnings=warnings, + warnings=option_warnings + warnings, engine=ENGINE_NAME, engine_version=options.engine_version, engine_options=options.to_engine_options(), diff --git a/src/pdf2md/mineru_profile.py b/src/pdf2md/mineru_profile.py new file mode 100644 index 0000000..45af928 --- /dev/null +++ b/src/pdf2md/mineru_profile.py @@ -0,0 +1,131 @@ +"""Strict-local MinerU runtime profile policy.""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import Literal + +from pdf2md.gpu import GpuInfo +from pdf2md.ir import WarningCode, WarningRecord, WarningSeverity + + +MinerUProfileName = Literal["auto", "safe", "performance"] + +SAFE_ENV = { + "MINERU_PROCESSING_WINDOW_SIZE": "1", + 
"MINERU_API_MAX_CONCURRENT_REQUESTS": "1", + "MINERU_PDF_RENDER_THREADS": "1", +} +AUTO_CONSERVATIVE_ENV = { + "MINERU_PROCESSING_WINDOW_SIZE": "4", + "MINERU_API_MAX_CONCURRENT_REQUESTS": "1", + "MINERU_PDF_RENDER_THREADS": "2", +} +AUTO_STRONG_ENV = { + "MINERU_PROCESSING_WINDOW_SIZE": "8", + "MINERU_API_MAX_CONCURRENT_REQUESTS": "1", + "MINERU_PDF_RENDER_THREADS": "4", +} +PERFORMANCE_ENV = { + "MINERU_PROCESSING_WINDOW_SIZE": "16", + "MINERU_API_MAX_CONCURRENT_REQUESTS": "1", + "MINERU_PDF_RENDER_THREADS": "4", +} + + +@dataclass(frozen=True) +class MinerUProfileResult: + requested_profile: str + applied_profile: str + environment: dict[str, str] + selected_gpu_index: int | None = None + selected_gpu_name: str | None = None + selected_gpu_vram_mib: int | None = None + selected_gpu_pre_turing_risk: bool | None = None + warnings: tuple[WarningRecord, ...] = () + + def to_engine_options(self) -> dict[str, object]: + data: dict[str, object] = { + "requested": self.requested_profile, + "applied": self.applied_profile, + "environment": dict(sorted(self.environment.items())), + } + if self.selected_gpu_index is not None: + data["selected_gpu"] = { + "index": self.selected_gpu_index, + "name": self.selected_gpu_name or "unknown", + "memory_total_mib": self.selected_gpu_vram_mib or 0, + "pre_turing_risk": bool(self.selected_gpu_pre_turing_risk), + } + return data + + +def resolve_mineru_profile( + requested_profile: str, + *, + selected_gpu: GpuInfo | None, + cuda_requested: bool, +) -> MinerUProfileResult: + profile = requested_profile.strip().casefold() + if profile not in {"auto", "safe", "performance"}: + raise ValueError("mineru_profile must be one of: auto, safe, performance") + + warnings: tuple[WarningRecord, ...] 
= () + if selected_gpu is None: + if cuda_requested: + warnings = ( + WarningRecord( + WarningCode.GPU_UNAVAILABLE, + WarningSeverity.WARNING, + "GPU inventory was unavailable; MinerU profile was resolved to safe settings.", + ), + ) + return _result(profile, "safe", SAFE_ENV, None, warnings) + + if profile == "safe": + return _result(profile, "safe", SAFE_ENV, selected_gpu, warnings) + + strong_gpu = selected_gpu.memory_total_mib >= 16 * 1024 and not selected_gpu.pre_turing_risk + mid_gpu = selected_gpu.memory_total_mib >= 12 * 1024 and not selected_gpu.pre_turing_risk + + if profile == "performance": + if strong_gpu: + return _result(profile, "performance", PERFORMANCE_ENV, selected_gpu, warnings) + return _result( + profile, + "safe", + SAFE_ENV, + selected_gpu, + ( + WarningRecord( + WarningCode.MINERU_PROFILE_ADJUSTED, + WarningSeverity.WARNING, + "Requested MinerU performance profile was downgraded to safe for the selected GPU.", + ), + ), + ) + + if strong_gpu: + return _result(profile, "auto", AUTO_STRONG_ENV, selected_gpu, warnings) + if mid_gpu: + return _result(profile, "auto-conservative", AUTO_CONSERVATIVE_ENV, selected_gpu, warnings) + return _result(profile, "safe", SAFE_ENV, selected_gpu, warnings) + + +def _result( + requested: str, + applied: str, + environment: dict[str, str], + gpu: GpuInfo | None, + warnings: tuple[WarningRecord, ...], +) -> MinerUProfileResult: + return MinerUProfileResult( + requested_profile=requested, + applied_profile=applied, + environment=dict(environment), + selected_gpu_index=gpu.index if gpu is not None else None, + selected_gpu_name=gpu.name if gpu is not None else None, + selected_gpu_vram_mib=gpu.memory_total_mib if gpu is not None else None, + selected_gpu_pre_turing_risk=gpu.pre_turing_risk if gpu is not None else None, + warnings=warnings, + ) diff --git a/src/pdf2md/paths.py b/src/pdf2md/paths.py index 28aa4eb..a618465 100644 --- a/src/pdf2md/paths.py +++ b/src/pdf2md/paths.py @@ -168,15 +168,15 @@ def 
_plan_one( keep_raw: bool, ) -> PlannedOutput: relative_parent = _safe_relative_parent(discovered_pdf.relative_parent) - parent = output_root / relative_parent stem = discovered_pdf.source_path.stem + parent = output_root / relative_parent / stem plan = PlannedOutput( source_pdf=discovered_pdf.source_path, - markdown_path=parent / f"{stem}.md", - assets_dir=parent / f"{stem}.assets", - metadata_path=parent / f"{stem}.metadata.json" if metadata else None, - report_path=parent / f"{stem}.report.md", - raw_dir=parent / f"{stem}.raw" if keep_raw else None, + markdown_path=parent / f"{stem}_001.md", + assets_dir=parent / "images", + metadata_path=None, + report_path=parent / f"{stem}_report.md", + raw_dir=parent / "raw" if keep_raw else None, ) _raise_if_plan_escapes_root(plan, output_root) return plan diff --git a/src/pdf2md/report.py b/src/pdf2md/report.py index 4219da7..75297e1 100644 --- a/src/pdf2md/report.py +++ b/src/pdf2md/report.py @@ -55,9 +55,14 @@ def render_report( "", f"- Source PDF: {_text(metadata.get('source_pdf'))}", ] + _append_output_folder(lines, metadata.get("engine_options", {})) + _append_markdown_parts(lines, metadata.get("engine_options", {})) chunk_line = _chunk_line(metadata.get("engine_options", {})) if chunk_line is not None: lines.append(chunk_line) + page_conversion_line = _page_conversion_line(metadata.get("engine_options", {})) + if page_conversion_line is not None: + lines.append(page_conversion_line) _append_optional_path(lines, "Output Markdown", paths.markdown_path) _append_optional_path(lines, "Metadata JSON", paths.metadata_path) _append_optional_path(lines, "Report Markdown", paths.report_path) @@ -81,11 +86,10 @@ def render_report( f"- Inline formula count: {_int(summary.get('inline_formula_count'))}", f"- Display formula count: {_int(summary.get('display_formula_count'))}", f"- Math render error count: {total_math_render_errors}", - "", - "## Pages With Warnings", - "", ] ) + _append_text_fidelity(lines, metadata) + 
lines.extend(["", "## Pages With Warnings", ""]) if pages_with_warning: lines.extend(f"- Page {page_index}" for page_index in pages_with_warning) else: @@ -144,6 +148,60 @@ def _summary(metadata: JsonObject) -> JsonObject: return summary if isinstance(summary, dict) else {} +def _append_text_fidelity(lines: list[str], metadata: JsonObject) -> None: + text_fidelity = metadata.get("text_fidelity") + if not isinstance(text_fidelity, list) or not text_fidelity: + return + + summary = _summary(metadata) + low_similarity_pages = _pages_matching( + text_fidelity, + lambda record: isinstance(record.get("text_similarity"), int | float) + and float(record["text_similarity"]) < 0.82, + ) + cjk_pages = _pages_matching( + text_fidelity, + lambda record: isinstance(record.get("unexpected_cjk_count"), int) and record["unexpected_cjk_count"] > 0, + ) + uncertain_pages = _pages_matching( + text_fidelity, + lambda record: record.get("comparison_status") == "page_mapping_uncertain", + ) + + lines.extend( + [ + "", + "## Text Fidelity", + "", + f"- Checked page count: {_int(summary.get('text_fidelity_checked_page_count'))}", + f"- Low-fidelity page count: {_int(summary.get('text_fidelity_low_page_count'))}", + f"- Unexpected CJK count: {_int(summary.get('text_fidelity_unexpected_cjk_count'))}", + ( + "- Replacement candidate page count: " + f"{_int(summary.get('text_fidelity_replacement_candidate_page_count'))}" + ), + f"- Low-similarity pages: {_format_page_list(low_similarity_pages)}", + f"- Unexpected-CJK pages: {_format_page_list(cjk_pages)}", + f"- Uncertain page-mapping pages: {_format_page_list(uncertain_pages)}", + ] + ) + + +def _pages_matching(records: list[object], predicate) -> tuple[int, ...]: + pages: list[int] = [] + for item in records: + if not isinstance(item, dict): + continue + page_index = item.get("page_index") + if isinstance(page_index, int) and predicate(item): + pages.append(page_index) + return tuple(pages) + + +def _format_page_list(pages: tuple[int, ...]) 
-> str: + return ", ".join(str(page) for page in pages) if pages else "None" + + def _chunk_line(engine_options: object) -> str | None: if not isinstance(engine_options, dict): return None @@ -160,6 +218,52 @@ def _chunk_line(engine_options: object) -> str | None: return f"- Chunk: {chunk_index}/{total_chunks}, source pages: {page_start}-{page_end}" +def _page_conversion_line(engine_options: object) -> str | None: + if not isinstance(engine_options, dict): + return None + page_conversion = engine_options.get("page_conversion") + if not isinstance(page_conversion, dict): + return None + mode = page_conversion.get("mode") + input_pages = page_conversion.get("mineru_input_page_count") + group_pages = page_conversion.get("output_group_page_count") + if mode != "single_page" or input_pages != 1 or not isinstance(group_pages, int): + return None + return f"- Page conversion mode: single-page MinerU inputs, grouped output size: {group_pages}" + + +def _append_output_folder(lines: list[str], engine_options: object) -> None: + if not isinstance(engine_options, dict): + return + output_folder = engine_options.get("output_folder") + if isinstance(output_folder, str) and output_folder: + lines.append(f"- Output folder: {output_folder}") + + +def _append_markdown_parts(lines: list[str], engine_options: object) -> None: + if not isinstance(engine_options, dict): + return + parts = engine_options.get("parts") + if not isinstance(parts, list) or not parts: + return + for item in parts: + if not isinstance(item, dict): + continue + index = _int(item.get("index")) + total = _int(item.get("total")) + page_start = _int(item.get("source_page_start")) + page_end = _int(item.get("source_page_end")) + status = _text(item.get("status")) + markdown_path = _text(item.get("markdown_path")) + lines.append( + f"- Markdown part {index}/{total}: {markdown_path} " + f"(source pages {page_start}-{page_end}, status {status})" + ) + failed_pages = item.get("failed_source_pages") + if 
isinstance(failed_pages, list) and failed_pages: + lines.append(f"- Failed source pages for part {index}: {_format_page_list(tuple(_int(page) for page in failed_pages))}") + + def _append_optional_path(lines: list[str], label: str, path: Path | None) -> None: if path is not None: lines.append(f"- {label}: {path}") diff --git a/src/pdf2md/text_fidelity.py b/src/pdf2md/text_fidelity.py new file mode 100644 index 0000000..7acfaf6 --- /dev/null +++ b/src/pdf2md/text_fidelity.py @@ -0,0 +1,340 @@ +"""Local text-layer fidelity diagnostics for generated Markdown.""" + +from __future__ import annotations + +import re +import unicodedata +from dataclasses import dataclass +from difflib import SequenceMatcher +from pathlib import Path +from typing import Any + +from pypdf import PdfReader + +from pdf2md.ir import PathLike, TextFidelityRecord, WarningCode, WarningRecord, WarningSeverity + + +LOW_HANGUL_RATIO_THRESHOLD = 0.95 +LOW_TEXT_SIMILARITY_THRESHOLD = 0.82 +HANGUL_SPACING_WARNING_THRESHOLD = 0.35 +SOURCE_SPACING_UNUSABLE_THRESHOLD = 0.75 + + +@dataclass(frozen=True) +class TextFidelityResult: + pages: tuple[TextFidelityRecord, ...] = () + warnings: tuple[WarningRecord, ...] = () + + +_DISPLAY_MATH_RE = re.compile(r"(?.*?)(?[^\n$]+?)(?\n]+>") +_MARKDOWN_MARKER_RE = re.compile(r"[*_~#>`\[\]()]") +_HANGUL_SPACING_RE = re.compile(r"(?<=[가-힣])\s+(?=[가-힣])") + + +def check_text_fidelity( + source_pdf: PathLike, + markdown: str, + *, + page_count: int, + engine_options: dict[str, Any] | None = None, + source_text_pages: tuple[str, ...] 
| None = None, +) -> TextFidelityResult: + """Compare local PDF text-layer extraction with Markdown where mapping is credible.""" + + if page_count < 1: + return TextFidelityResult() + + try: + all_source_pages = source_text_pages if source_text_pages is not None else extract_pdf_text_pages(source_pdf) + except Exception: + return TextFidelityResult() + if not all_source_pages: + return TextFidelityResult() + + source_page_start = _source_page_start(engine_options) + start_index = source_page_start - 1 + selected_source_pages = all_source_pages[start_index : start_index + page_count] + if not selected_source_pages: + return TextFidelityResult() + + if page_count == 1: + return compare_text_pages( + source_pages=(selected_source_pages[0],), + markdown_pages=(markdown,), + source_page_start=source_page_start, + ) + + return _uncertain_mapping_result(selected_source_pages, source_page_start) + + +def compare_text_pages( + *, + source_pages: tuple[str, ...], + markdown_pages: tuple[str, ...], + source_page_start: int = 1, +) -> TextFidelityResult: + """Compare aligned source and Markdown page text.""" + + pages: list[TextFidelityRecord] = [] + warnings: list[WarningRecord] = [] + if any(page.strip() for page in source_pages): + warnings.append( + WarningRecord( + WarningCode.TEXT_LAYER_AVAILABLE, + WarningSeverity.INFO, + "pypdf extracted source text for text fidelity diagnostics.", + ) + ) + + for page_index, source_text in enumerate(source_pages): + markdown_text = markdown_pages[page_index] if page_index < len(markdown_pages) else "" + record = _compare_one_page( + page_index=page_index, + source_page_number=source_page_start + page_index, + source_text=source_text, + markdown_text=markdown_text, + ) + pages.append(record) + warnings.extend(_warnings_for_record(record)) + + return TextFidelityResult(pages=tuple(pages), warnings=tuple(warnings)) + + +def extract_pdf_text_pages(source_pdf: PathLike) -> tuple[str, ...]: + """Extract local text-layer text from each PDF 
page with pypdf.""" + + reader = PdfReader(Path(source_pdf).expanduser()) + return tuple(page.extract_text() or "" for page in reader.pages) + + +def strip_markdown_for_text_fidelity(markdown: str) -> str: + """Remove Markdown constructs that are not body text for fidelity comparison.""" + + text = _strip_fenced_code(markdown) + text = _DISPLAY_MATH_RE.sub(" ", text) + text = _INLINE_MATH_RE.sub(" ", text) + text = _IMAGE_LINK_RE.sub(" ", text) + text = _LINK_RE.sub(r"\1", text) + text = _INLINE_CODE_RE.sub(" ", text) + text = _HTML_TAG_RE.sub(" ", text) + text = _MARKDOWN_MARKER_RE.sub(" ", text) + return re.sub(r"\s+", " ", text).strip() + + +def count_hangul_syllables(text: str) -> int: + return sum(0xAC00 <= ord(character) <= 0xD7A3 for character in text) + + +def count_unexpected_cjk(text: str) -> int: + return sum(_is_cjk_ideograph(character) for character in text) + + +def hangul_spacing_anomaly_ratio(text: str) -> float: + hangul_count = count_hangul_syllables(text) + if hangul_count < 2: + return 0.0 + return min(1.0, len(_HANGUL_SPACING_RE.findall(text)) / (hangul_count - 1)) + + +def text_similarity(source_text: str, markdown_text: str) -> float: + source = _normalize_for_similarity(source_text) + markdown = _normalize_for_similarity(markdown_text) + if not source and not markdown: + return 1.0 + if not source or not markdown: + return 0.0 + return SequenceMatcher(None, source, markdown, autojunk=False).ratio() + + +def _compare_one_page( + *, + page_index: int, + source_page_number: int, + source_text: str, + markdown_text: str, +) -> TextFidelityRecord: + stripped_markdown = strip_markdown_for_text_fidelity(markdown_text) + source_available = bool(source_text.strip()) + markdown_available = bool(stripped_markdown.strip()) + pypdf_hangul_count = count_hangul_syllables(source_text) + markdown_hangul_count = count_hangul_syllables(stripped_markdown) + hangul_ratio = None if pypdf_hangul_count == 0 else markdown_hangul_count / pypdf_hangul_count + 
cjk_count = count_unexpected_cjk(stripped_markdown) if pypdf_hangul_count else 0 + source_spacing = hangul_spacing_anomaly_ratio(source_text) + markdown_spacing = hangul_spacing_anomaly_ratio(stripped_markdown) + similarity = text_similarity(source_text, stripped_markdown) if source_available and markdown_available else None + status = _comparison_status(source_available, markdown_available) + replacement_candidate = _is_replacement_candidate( + status=status, + hangul_ratio=hangul_ratio, + unexpected_cjk_count=cjk_count, + source_spacing=source_spacing, + ) + return TextFidelityRecord( + page_index=page_index, + source_page_number=source_page_number, + pypdf_text_available=source_available, + markdown_text_available=markdown_available, + pypdf_hangul_count=pypdf_hangul_count, + markdown_hangul_count=markdown_hangul_count, + hangul_count_delta=markdown_hangul_count - pypdf_hangul_count, + hangul_count_ratio=hangul_ratio, + unexpected_cjk_count=cjk_count, + pypdf_hangul_spacing_anomaly_ratio=source_spacing, + markdown_hangul_spacing_anomaly_ratio=markdown_spacing, + text_similarity=similarity, + replacement_candidate=replacement_candidate, + comparison_status=status, + ) + + +def _comparison_status(source_available: bool, markdown_available: bool) -> str: + if not source_available: + return "source_text_missing" + if not markdown_available: + return "markdown_page_unavailable" + return "checked" + + +def _is_replacement_candidate( + *, + status: str, + hangul_ratio: float | None, + unexpected_cjk_count: int, + source_spacing: float, +) -> bool: + if status != "checked" or hangul_ratio is None: + return False + if source_spacing >= SOURCE_SPACING_UNUSABLE_THRESHOLD: + return False + return hangul_ratio < LOW_HANGUL_RATIO_THRESHOLD or unexpected_cjk_count > 0 + + +def _warnings_for_record(record: TextFidelityRecord) -> tuple[WarningRecord, ...]: + warnings: list[WarningRecord] = [] + low_similarity = record.text_similarity is not None and record.text_similarity < 
LOW_TEXT_SIMILARITY_THRESHOLD + low_hangul = record.hangul_count_ratio is not None and record.hangul_count_ratio < LOW_HANGUL_RATIO_THRESHOLD + if record.comparison_status == "checked" and (low_similarity or low_hangul or record.unexpected_cjk_count > 0): + warnings.append( + WarningRecord( + WarningCode.TEXT_FIDELITY_LOW, + WarningSeverity.WARNING, + "Markdown text differs materially from the local PDF text layer.", + page_index=record.page_index, + ) + ) + if record.unexpected_cjk_count > 0: + warnings.append( + WarningRecord( + WarningCode.UNEXPECTED_CJK_IN_KOREAN_TEXT, + WarningSeverity.WARNING, + f"Markdown contains {record.unexpected_cjk_count} suspicious CJK ideograph(s) on a Korean text page.", + page_index=record.page_index, + ) + ) + if ( + record.pypdf_hangul_spacing_anomaly_ratio >= HANGUL_SPACING_WARNING_THRESHOLD + or record.markdown_hangul_spacing_anomaly_ratio >= HANGUL_SPACING_WARNING_THRESHOLD + ): + warnings.append( + WarningRecord( + WarningCode.HANGUL_SPACING_SUSPECT, + WarningSeverity.INFO, + "Hangul text contains unusually frequent whitespace between syllables.", + page_index=record.page_index, + ) + ) + return tuple(warnings) + + +def _uncertain_mapping_result(source_pages: tuple[str, ...], source_page_start: int) -> TextFidelityResult: + pages: list[TextFidelityRecord] = [] + warnings: list[WarningRecord] = [] + if any(page.strip() for page in source_pages): + warnings.append( + WarningRecord( + WarningCode.TEXT_LAYER_AVAILABLE, + WarningSeverity.INFO, + "pypdf extracted source text for text fidelity diagnostics.", + ) + ) + warnings.append( + WarningRecord( + WarningCode.TEXT_PAGE_MAPPING_UNCERTAIN, + WarningSeverity.INFO, + "Markdown page mapping is uncertain; per-page text fidelity was not scored.", + ) + ) + + for page_index, source_text in enumerate(source_pages): + source_available = bool(source_text.strip()) + status = "page_mapping_uncertain" if source_available else "source_text_missing" + pages.append( + TextFidelityRecord( + 
page_index=page_index, + source_page_number=source_page_start + page_index, + pypdf_text_available=source_available, + markdown_text_available=False, + pypdf_hangul_count=count_hangul_syllables(source_text), + markdown_hangul_count=0, + hangul_count_delta=-count_hangul_syllables(source_text), + hangul_count_ratio=0.0 if count_hangul_syllables(source_text) else None, + unexpected_cjk_count=0, + pypdf_hangul_spacing_anomaly_ratio=hangul_spacing_anomaly_ratio(source_text), + markdown_hangul_spacing_anomaly_ratio=0.0, + text_similarity=None, + replacement_candidate=False, + comparison_status=status, + ) + ) + return TextFidelityResult(pages=tuple(pages), warnings=tuple(warnings)) + + +def _source_page_start(engine_options: dict[str, Any] | None) -> int: + if not isinstance(engine_options, dict): + return 1 + chunk = engine_options.get("chunk") + if not isinstance(chunk, dict): + return 1 + value = chunk.get("source_page_start") + return value if isinstance(value, int) and value >= 1 else 1 + + +def _strip_fenced_code(text: str) -> str: + lines = text.splitlines() + output: list[str] = [] + fence: str | None = None + for line in lines: + stripped = line.lstrip(" ") + if fence is None and (stripped.startswith("```") or stripped.startswith("~~~")): + fence = stripped[:3] + continue + if fence is not None: + if stripped.startswith(fence): + fence = None + continue + output.append(line) + return "\n".join(output) + + +def _normalize_for_similarity(text: str) -> str: + normalized = unicodedata.normalize("NFKC", text) + normalized = re.sub(r"\s+", " ", normalized) + return normalized.strip() + + +def _is_cjk_ideograph(character: str) -> bool: + codepoint = ord(character) + return ( + 0x3400 <= codepoint <= 0x4DBF + or 0x4E00 <= codepoint <= 0x9FFF + or 0x20000 <= codepoint <= 0x2A6DF + or 0x2A700 <= codepoint <= 0x2B73F + or 0x2B740 <= codepoint <= 0x2B81F + or 0x2B820 <= codepoint <= 0x2CEAF + ) diff --git a/src/pdf2md_ui/__init__.py b/src/pdf2md_ui/__init__.py new file mode 
100644 index 0000000..2147f99 --- /dev/null +++ b/src/pdf2md_ui/__init__.py @@ -0,0 +1,5 @@ +"""Minimal local desktop launcher for the pdf2md CLI.""" + +__all__ = ["__version__"] + +__version__ = "0.1.0" diff --git a/src/pdf2md_ui/app.py b/src/pdf2md_ui/app.py new file mode 100644 index 0000000..26cba11 --- /dev/null +++ b/src/pdf2md_ui/app.py @@ -0,0 +1,390 @@ +"""Minimal Tk desktop launcher for the pdf2md CLI.""" + +from __future__ import annotations + +import json +import os +import queue +import sys +import threading +from pathlib import Path +from tkinter import filedialog, messagebox, ttk +import tkinter as tk + + +if __package__ in {None, ""}: + sys.path.insert(0, str(Path(__file__).resolve().parents[1])) + +from pdf2md_ui.runner import ( # noqa: E402 + DEFAULT_CHUNK_PAGES, + DEFAULT_GPU_DEVICE, + DEFAULT_MINERU_PROFILE, + MINERU_PROFILE_CHOICES, + CliResolutionError, + CommandSpec, + RunnerEvent, + RunningCommand, + build_convert_command, + build_doctor_command, + build_recheck_command, + default_output_dir, + find_project_root, + resolve_cli_command, +) + + +APP_NAME = "pdf2md-ui" + + +class Pdf2MdApp: + """Single-window local launcher.""" + + def __init__(self, root: tk.Tk) -> None: + self.root = root + self.root.title("pdf2md") + self.root.geometry("820x640") + self.root.minsize(680, 520) + self.root.protocol("WM_DELETE_WINDOW", self._close) + + self._events: queue.Queue[RunnerEvent] = queue.Queue() + self._running: RunningCommand | None = None + self._worker: threading.Thread | None = None + self._busy_widgets: list[ttk.Widget] = [] + + settings = _load_settings() + project_root = settings.get("project_root") or _default_project_root() + self.project_root_var = tk.StringVar(value=project_root) + self.input_pdf_var = tk.StringVar() + self.output_dir_var = tk.StringVar(value=settings.get("last_output_dir", "")) + self.markdown_var = tk.StringVar() + self.overwrite_var = tk.BooleanVar(value=False) + self.keep_raw_var = tk.BooleanVar(value=False) + 
self.chunk_enabled_var = tk.BooleanVar(value=False) + self.chunk_pages_var = tk.StringVar(value=str(DEFAULT_CHUNK_PAGES)) + self.gpu_var = tk.StringVar(value=DEFAULT_GPU_DEVICE) + self.mineru_profile_var = tk.StringVar(value=DEFAULT_MINERU_PROFILE) + self.status_var = tk.StringVar(value="Ready") + + self._build_widgets() + self._sync_chunk_state() + + def _build_widgets(self) -> None: + main = ttk.Frame(self.root, padding=12) + main.grid(row=0, column=0, sticky="nsew") + self.root.columnconfigure(0, weight=1) + self.root.rowconfigure(0, weight=1) + main.columnconfigure(1, weight=1) + main.rowconfigure(7, weight=1) + + self._add_path_row(main, 0, "Project root", self.project_root_var, self._choose_project_root) + self._add_path_row(main, 1, "Input PDF", self.input_pdf_var, self._choose_pdf) + self._add_path_row(main, 2, "Output dir", self.output_dir_var, self._choose_output_dir) + self._add_path_row(main, 3, "Markdown", self.markdown_var, self._choose_markdown) + + options = ttk.Frame(main) + options.grid(row=4, column=0, columnspan=3, sticky="ew", pady=(8, 4)) + for column in range(8): + options.columnconfigure(column, weight=0) + options.columnconfigure(7, weight=1) + + overwrite = ttk.Checkbutton(options, text="Overwrite", variable=self.overwrite_var) + overwrite.grid(row=0, column=0, sticky="w", padx=(0, 16)) + keep_raw = ttk.Checkbutton(options, text="Keep raw", variable=self.keep_raw_var) + keep_raw.grid(row=0, column=1, sticky="w", padx=(0, 16)) + chunk = ttk.Checkbutton( + options, + text="Group pages", + variable=self.chunk_enabled_var, + command=self._sync_chunk_state, + ) + chunk.grid(row=0, column=2, sticky="w") + self.chunk_spinbox = ttk.Spinbox(options, from_=1, to=999, width=6, textvariable=self.chunk_pages_var) + self.chunk_spinbox.grid(row=0, column=3, sticky="w", padx=(6, 16)) + ttk.Label(options, text="GPU").grid(row=0, column=4, sticky="w") + gpu = ttk.Entry(options, textvariable=self.gpu_var, width=14) + gpu.grid(row=0, column=5, sticky="w", 
padx=(6, 0)) + ttk.Label(options, text="Profile").grid(row=0, column=6, sticky="w", padx=(16, 0)) + profile = ttk.Combobox( + options, + textvariable=self.mineru_profile_var, + values=MINERU_PROFILE_CHOICES, + width=12, + state="readonly", + ) + profile.grid(row=0, column=7, sticky="w", padx=(6, 0)) + self._busy_widgets.extend([overwrite, keep_raw, chunk, self.chunk_spinbox, gpu, profile]) + + buttons = ttk.Frame(main) + buttons.grid(row=5, column=0, columnspan=3, sticky="ew", pady=(8, 4)) + self.doctor_button = ttk.Button(buttons, text="Doctor", command=self._run_doctor) + self.doctor_button.pack(side="left") + self.convert_button = ttk.Button(buttons, text="Convert", command=self._run_convert) + self.convert_button.pack(side="left", padx=(8, 0)) + self.recheck_button = ttk.Button(buttons, text="Recheck", command=self._run_recheck) + self.recheck_button.pack(side="left", padx=(8, 0)) + self.cancel_button = ttk.Button(buttons, text="Cancel", command=self._cancel, state="disabled") + self.cancel_button.pack(side="left", padx=(8, 0)) + self.open_output_button = ttk.Button(buttons, text="Open output", command=self._open_output) + self.open_output_button.pack(side="left", padx=(8, 0)) + self._busy_widgets.extend([self.doctor_button, self.convert_button, self.recheck_button, self.open_output_button]) + + status = ttk.Frame(main) + status.grid(row=6, column=0, columnspan=3, sticky="ew", pady=(4, 4)) + status.columnconfigure(0, weight=1) + ttk.Label(status, textvariable=self.status_var).grid(row=0, column=0, sticky="w") + self.progress = ttk.Progressbar(status, mode="indeterminate", length=180) + self.progress.grid(row=0, column=1, sticky="e") + + log_frame = ttk.Frame(main) + log_frame.grid(row=7, column=0, columnspan=3, sticky="nsew", pady=(8, 0)) + log_frame.columnconfigure(0, weight=1) + log_frame.rowconfigure(0, weight=1) + self.log = tk.Text(log_frame, height=14, wrap="word", state="disabled") + self.log.grid(row=0, column=0, sticky="nsew") + scroll = 
ttk.Scrollbar(log_frame, command=self.log.yview) + scroll.grid(row=0, column=1, sticky="ns") + self.log.configure(yscrollcommand=scroll.set) + + def _add_path_row( + self, + parent: ttk.Frame, + row: int, + label: str, + variable: tk.StringVar, + command, + ) -> None: + ttk.Label(parent, text=label).grid(row=row, column=0, sticky="w", pady=3) + entry = ttk.Entry(parent, textvariable=variable) + entry.grid(row=row, column=1, sticky="ew", padx=(8, 8), pady=3) + button = ttk.Button(parent, text="Browse", command=command) + button.grid(row=row, column=2, sticky="e", pady=3) + self._busy_widgets.extend([entry, button]) + + def _choose_project_root(self) -> None: + directory = filedialog.askdirectory(initialdir=self.project_root_var.get() or str(Path.cwd())) + if directory: + self.project_root_var.set(directory) + self._refresh_default_output() + self._save_settings() + + def _choose_pdf(self) -> None: + path = filedialog.askopenfilename( + title="Select PDF", + filetypes=(("PDF files", "*.pdf"), ("All files", "*.*")), + ) + if path: + self.input_pdf_var.set(path) + self._refresh_default_output() + + def _choose_output_dir(self) -> None: + directory = filedialog.askdirectory(initialdir=self.output_dir_var.get() or self.project_root_var.get()) + if directory: + self.output_dir_var.set(directory) + self._save_settings() + + def _choose_markdown(self) -> None: + path = filedialog.askopenfilename( + title="Select Markdown", + filetypes=(("Markdown files", "*.md"), ("All files", "*.*")), + ) + if path: + self.markdown_var.set(path) + + def _refresh_default_output(self) -> None: + input_pdf = self.input_pdf_var.get().strip() + if not input_pdf: + return + base = Path(self.project_root_var.get()).expanduser() if self.project_root_var.get().strip() else Path.cwd() + self.output_dir_var.set(str(default_output_dir(input_pdf, base_dir=base))) + + def _run_doctor(self) -> None: + self._start_resolved_command(lambda resolved: build_doctor_command(resolved)) + + def _run_convert(self) 
-> None: + input_pdf = self.input_pdf_var.get().strip() + if not input_pdf or not Path(input_pdf).is_file(): + self._show_error("Select an existing input PDF.") + return + + output_dir = self.output_dir_var.get().strip() + if not output_dir: + self._refresh_default_output() + output_dir = self.output_dir_var.get().strip() + if not output_dir: + self._show_error("Select an output directory.") + return + + try: + chunk_pages = int(self.chunk_pages_var.get()) if self.chunk_enabled_var.get() else None + except ValueError: + self._show_error("Group pages must be a positive integer.") + return + + def build(resolved): + return build_convert_command( + resolved, + input_pdf, + output_dir, + overwrite=self.overwrite_var.get(), + keep_raw=self.keep_raw_var.get(), + chunk_pages=chunk_pages, + gpu=self.gpu_var.get(), + mineru_profile=self.mineru_profile_var.get(), + ) + + self._start_resolved_command(build) + + def _run_recheck(self) -> None: + markdown = self.markdown_var.get().strip() + if not markdown or not Path(markdown).is_file(): + self._show_error("Select an existing Markdown file.") + return + self._start_resolved_command(lambda resolved: build_recheck_command(resolved, markdown)) + + def _start_resolved_command(self, build_command) -> None: + try: + resolved = resolve_cli_command(project_root=self.project_root_var.get().strip() or None) + command = build_command(resolved) + except (CliResolutionError, ValueError) as error: + self._show_error(str(error)) + return + self._start_command(command) + + def _start_command(self, command: CommandSpec) -> None: + runner = RunningCommand(command, self._events.put) + self._running = runner + self._set_busy(True) + self.status_var.set("Running") + self.progress.start(12) + self._append_log("") + self._worker = threading.Thread(target=runner.run, daemon=True) + self._worker.start() + self.root.after(100, self._poll_events) + + def _poll_events(self) -> None: + saw_exit = False + while True: + try: + event = 
self._events.get_nowait() + except queue.Empty: + break + if event.kind == "start": + self._append_log(f"> {event.message}") + elif event.kind == "output": + self._append_log(event.message) + elif event.kind == "error": + self._append_log(f"error: {event.message}") + elif event.kind == "exit": + saw_exit = True + self._append_log(event.message) + if event.exit_code == 0: + self.status_var.set("Completed") + self._save_settings() + else: + self.status_var.set(f"Failed ({event.exit_code})") + + if saw_exit: + self.progress.stop() + self._running = None + self._set_busy(False) + return + if self._running is not None: + self.root.after(100, self._poll_events) + + def _cancel(self) -> None: + if self._running is not None and self._running.cancel(): + self.status_var.set("Cancelling") + self._append_log("Cancellation requested.") + + def _open_output(self) -> None: + output_dir = self.output_dir_var.get().strip() + if not output_dir: + self._show_error("Select an output directory.") + return + path = Path(output_dir) + if not path.exists(): + self._show_error(f"Output directory does not exist: {path}") + return + if os.name != "nt": + self._show_error("Open output is only implemented for Windows in this launcher.") + return + os.startfile(path) # type: ignore[attr-defined] + + def _set_busy(self, busy: bool) -> None: + for widget in self._busy_widgets: + widget.state(["disabled"] if busy else ["!disabled"]) + self.cancel_button.state(["!disabled"] if busy else ["disabled"]) + self._sync_chunk_state() + + def _sync_chunk_state(self) -> None: + if self._running is not None or not self.chunk_enabled_var.get(): + self.chunk_spinbox.state(["disabled"]) + else: + self.chunk_spinbox.state(["!disabled"]) + + def _append_log(self, message: str) -> None: + self.log.configure(state="normal") + self.log.insert("end", f"{message}\n") + self.log.see("end") + self.log.configure(state="disabled") + + def _show_error(self, message: str) -> None: + self.status_var.set("Error") + 
self._append_log(f"error: {message}") + messagebox.showerror("pdf2md", message) + + def _save_settings(self) -> None: + data = { + "project_root": self.project_root_var.get().strip(), + "last_output_dir": self.output_dir_var.get().strip(), + } + _save_settings(data) + + def _close(self) -> None: + if self._running is not None: + close = messagebox.askyesno("pdf2md", "A command is running. Cancel and exit?") + if not close: + return + self._running.cancel() + self._save_settings() + self.root.destroy() + + +def _default_project_root() -> str: + root = find_project_root(Path.cwd()) or find_project_root(Path(__file__)) + return str(root) if root is not None else str(Path.cwd()) + + +def _settings_file() -> Path: + base = os.environ.get("APPDATA") + root = Path(base) if base else Path.home() + return root / APP_NAME / "settings.json" + + +def _load_settings() -> dict[str, str]: + path = _settings_file() + if not path.is_file(): + return {} + try: + data = json.loads(path.read_text(encoding="utf-8")) + except (OSError, json.JSONDecodeError): + return {} + if not isinstance(data, dict): + return {} + return {str(key): str(value) for key, value in data.items() if isinstance(value, str)} + + +def _save_settings(data: dict[str, str]) -> None: + path = _settings_file() + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps(data, indent=2, ensure_ascii=False, sort_keys=True) + "\n", encoding="utf-8") + + +def main() -> None: + root = tk.Tk() + Pdf2MdApp(root) + root.mainloop() + + +if __name__ == "__main__": + main() diff --git a/src/pdf2md_ui/runner.py b/src/pdf2md_ui/runner.py new file mode 100644 index 0000000..0920d35 --- /dev/null +++ b/src/pdf2md_ui/runner.py @@ -0,0 +1,306 @@ +"""Subprocess runner for the minimal pdf2md desktop launcher.""" + +from __future__ import annotations + +import os +import shutil +import subprocess +import threading +from collections.abc import Callable, Mapping +from dataclasses import dataclass +from pathlib import 
Path + + +DEFAULT_GPU_DEVICE = "cuda:0" +DEFAULT_MINERU_PROFILE = "auto" +DEFAULT_CHUNK_PAGES = 20 +MINERU_PROFILE_CHOICES = ("auto", "safe", "performance") + + +class CliResolutionError(RuntimeError): + """Raised when the UI cannot find a local pdf2md command.""" + + +@dataclass(frozen=True) +class ResolvedCommand: + """Resolved command prefix for invoking the project-owned pdf2md CLI.""" + + args_prefix: tuple[str, ...] + cwd: Path | None + source: str + + +@dataclass(frozen=True) +class CommandSpec: + """A fixed argument-list command for subprocess execution.""" + + args: tuple[str, ...] + cwd: Path | None = None + + def display(self) -> str: + return subprocess.list2cmdline(self.args) + + +@dataclass(frozen=True) +class RunnerEvent: + """Event emitted by a running CLI subprocess.""" + + kind: str + message: str = "" + exit_code: int | None = None + + +Which = Callable[[str], str | None] +EventCallback = Callable[[RunnerEvent], None] +PopenFactory = Callable[..., subprocess.Popen[str]] +TaskkillRunner = Callable[..., subprocess.CompletedProcess[str]] + + +def find_project_root(start: str | os.PathLike[str] | None = None) -> Path | None: + """Find the nearest project root containing ``pyproject.toml``.""" + + candidate = Path(start).expanduser() if start is not None else Path.cwd() + candidate = candidate.resolve(strict=False) + if candidate.is_file(): + candidate = candidate.parent + for path in (candidate, *candidate.parents): + if (path / "pyproject.toml").is_file(): + return path + return None + + +def resolve_cli_command( + *, + configured_command: str | os.PathLike[str] | None = None, + project_root: str | os.PathLike[str] | None = None, + which: Which = shutil.which, +) -> ResolvedCommand: + """Resolve a local command that invokes the project-owned ``pdf2md`` CLI.""" + + if configured_command is not None: + command = str(configured_command).strip() + if not command: + raise CliResolutionError("Configured pdf2md command is empty.") + 
_validate_configured_pdf2md_command(command) + return ResolvedCommand((command,), cwd=None, source="configured") + + pdf2md = which("pdf2md") + if pdf2md is not None: + return ResolvedCommand((pdf2md,), cwd=None, source="path") + + uv = which("uv") + if uv is not None and project_root is not None: + root = _validated_project_root(project_root) + return ResolvedCommand((uv, "run", "pdf2md"), cwd=root, source="uv") + + raise CliResolutionError( + "Could not find pdf2md on PATH. Set the project root and run Doctor, " + "or install the local runtime with uv first." + ) + + +def default_output_dir( + input_pdf: str | os.PathLike[str], + *, + base_dir: str | os.PathLike[str] | None = None, +) -> Path: + """Return the UI default output root for a selected PDF.""" + + root = Path(base_dir).expanduser() if base_dir is not None else Path.cwd() + return root / "outputs" + + +def build_doctor_command(resolved: ResolvedCommand) -> CommandSpec: + """Build ``pdf2md doctor``.""" + + return CommandSpec((*resolved.args_prefix, "doctor"), cwd=resolved.cwd) + + +def build_convert_command( + resolved: ResolvedCommand, + input_pdf: str | os.PathLike[str], + output_dir: str | os.PathLike[str], + *, + overwrite: bool = False, + keep_raw: bool = False, + chunk_pages: int | None = None, + gpu: str | None = DEFAULT_GPU_DEVICE, + mineru_profile: str | None = DEFAULT_MINERU_PROFILE, +) -> CommandSpec: + """Build a fixed argument-list ``pdf2md convert`` command.""" + + args = [*resolved.args_prefix, "convert", str(Path(input_pdf)), "--out", str(Path(output_dir))] + if overwrite: + args.append("--overwrite") + if keep_raw: + args.append("--keep-raw") + if chunk_pages is not None: + if chunk_pages < 1: + raise ValueError("chunk_pages must be a positive integer") + args.extend(("--chunk-pages", str(chunk_pages))) + if gpu is not None: + gpu_value = gpu.strip() + if not gpu_value: + raise ValueError("gpu device cannot be empty") + _reject_prohibited_value(gpu_value, field_name="gpu") + 
args.extend(("--gpu", gpu_value)) + if mineru_profile is not None: + profile_value = mineru_profile.strip() + if profile_value not in MINERU_PROFILE_CHOICES: + raise ValueError("mineru_profile must be one of: auto, safe, performance") + args.extend(("--mineru-profile", profile_value)) + return CommandSpec(tuple(args), cwd=resolved.cwd) + + +def build_recheck_command( + resolved: ResolvedCommand, + markdown_path: str | os.PathLike[str], +) -> CommandSpec: + """Build ``pdf2md recheck``.""" + + return CommandSpec((*resolved.args_prefix, "recheck", str(Path(markdown_path))), cwd=resolved.cwd) + + +def build_child_environment(base_env: Mapping[str, str] | None = None) -> dict[str, str]: + """Build the child environment while preserving explicit user settings.""" + + environment = dict(os.environ if base_env is None else base_env) + environment.setdefault("MINERU_MODEL_SOURCE", "local") + return environment + + +class RunningCommand: + """Run one CLI command and stream line events to a callback.""" + + def __init__( + self, + command: CommandSpec, + callback: EventCallback, + *, + popen_factory: PopenFactory = subprocess.Popen, + base_env: Mapping[str, str] | None = None, + ) -> None: + self.command = command + self._callback = callback + self._popen_factory = popen_factory + self._base_env = base_env + self._process: subprocess.Popen[str] | None = None + self._lock = threading.Lock() + + def run(self) -> int: + """Run the command synchronously on the caller's thread.""" + + self._emit(RunnerEvent("start", self.command.display())) + try: + process = self._popen_factory( + self.command.args, + cwd=str(self.command.cwd) if self.command.cwd is not None else None, + env=build_child_environment(self._base_env), + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + errors="replace", + bufsize=1, + shell=False, + ) + except OSError as error: + self._emit(RunnerEvent("error", str(error), exit_code=127)) + self._emit(RunnerEvent("exit", "Command failed to start.", 
exit_code=127)) + return 127 + + with self._lock: + self._process = process + + if process.stdout is not None: + for line in process.stdout: + self._emit(RunnerEvent("output", line.rstrip("\r\n"))) + + exit_code = process.wait() + with self._lock: + self._process = None + self._emit(RunnerEvent("exit", f"Command exited with code {exit_code}.", exit_code=exit_code)) + return exit_code + + def cancel(self) -> bool: + """Request cancellation of the running subprocess tree.""" + + with self._lock: + process = self._process + if process is None: + return False + return terminate_process_tree(process) + + def _emit(self, event: RunnerEvent) -> None: + self._callback(event) + + +def terminate_process_tree( + process: subprocess.Popen[str], + *, + grace_seconds: float = 3.0, + taskkill_runner: TaskkillRunner = subprocess.run, + os_name: str = os.name, +) -> bool: + """Terminate a process, escalating to a Windows process-tree kill if needed.""" + + if process.poll() is not None: + return False + + try: + process.terminate() + except OSError: + return False + + try: + process.wait(timeout=grace_seconds) + return True + except subprocess.TimeoutExpired: + if os_name == "nt": + taskkill_runner( + ["taskkill", "/pid", str(process.pid), "/t", "/f"], + check=False, + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + text=True, + ) + else: + process.kill() + try: + process.wait(timeout=grace_seconds) + except subprocess.TimeoutExpired: + return True + return True + + +def _validated_project_root(project_root: str | os.PathLike[str]) -> Path: + root = Path(project_root).expanduser().resolve(strict=False) + if not (root / "pyproject.toml").is_file(): + raise CliResolutionError(f"Project root does not contain pyproject.toml: {root}") + return root + + +def _validate_configured_pdf2md_command(command: str) -> None: + command_name = Path(command).name.casefold() + if command_name not in {"pdf2md", "pdf2md.exe"}: + raise CliResolutionError("Configured command must be the pdf2md 
executable.") + + +def _reject_prohibited_value(value: str, *, field_name: str) -> None: + normalized = value.casefold() + prohibited_tokens = ( + "--api-url", + "api_url", + "api-url", + "base_url", + "base-url", + "router", + "http_backend", + "http-backend", + "openai", + "openai-compatible", + "endpoint", + "backend", + "mineru-api", + ) + if "http://" in normalized or "https://" in normalized or any(token in normalized for token in prohibited_tokens): + raise ValueError(f"{field_name} contains a prohibited strict-local token") diff --git a/tests/integration/test_optional_mineru_fixtures.py b/tests/integration/test_optional_mineru_fixtures.py index 6707f51..5e6291a 100644 --- a/tests/integration/test_optional_mineru_fixtures.py +++ b/tests/integration/test_optional_mineru_fixtures.py @@ -39,7 +39,6 @@ def test_optional_local_mineru_samples_produce_release_outputs(tmp_path: Path) - output_root = tmp_path / "mineru-fixture-output" attempts: list[dict[str, object]] = [] for pdf in sample_pdfs: - sample_output = output_root / pdf.stem completed = subprocess.run( [ sys.executable, @@ -48,7 +47,7 @@ def test_optional_local_mineru_samples_produce_release_outputs(tmp_path: Path) - "convert", str(pdf), "--out", - str(sample_output), + str(output_root), ], cwd=REPO_ROOT, check=False, @@ -67,7 +66,7 @@ def test_optional_local_mineru_samples_produce_release_outputs(tmp_path: Path) - "convert", str(pdf), "--out", - str(sample_output), + str(output_root), ] ), "exit_code": completed.returncode, @@ -77,34 +76,27 @@ def test_optional_local_mineru_samples_produce_release_outputs(tmp_path: Path) - ) assert completed.returncode == 0, json.dumps(attempts[-1], ensure_ascii=False, indent=2) - markdown_path = sample_output / f"{pdf.stem}.md" - metadata_path = sample_output / f"{pdf.stem}.metadata.json" - report_path = sample_output / f"{pdf.stem}.report.md" + sample_output = output_root / pdf.stem + markdown_path = sample_output / f"{pdf.stem}_001.md" + report_path = sample_output / 
f"{pdf.stem}_report.md" assert markdown_path.exists() - assert metadata_path.exists() assert report_path.exists() + assert not list(sample_output.glob("*.metadata.json")) - metadata = json.loads(metadata_path.read_text(encoding="utf-8")) - summary = metadata["summary"] - assert metadata["engine"] == "MinerU" - assert summary["pages_processed"] >= 1 - assert "warning_count" in summary - assert "math_render_error_count" in summary - assert "asset_count" in summary report = report_path.read_text(encoding="utf-8") assert "Output Markdown:" in report - assert "Metadata JSON:" in report + assert "Metadata JSON:" not in report assert "Report Markdown:" in report + assert "- Engine: MinerU" in report + assert "- Pages processed:" in report + assert "- Warning count:" in report + assert "- Math render error count:" in report + assert "- Asset count:" in report attempts[-1].update( { "markdown_path": str(markdown_path), - "metadata_path": str(metadata_path), "report_path": str(report_path), - "warning_count": summary["warning_count"], "final_status": _report_final_status(report), - "math_render_error_count": summary["math_render_error_count"], - "asset_count": summary["asset_count"], - "pages_processed": summary["pages_processed"], } ) diff --git a/tests/integration/test_v1_fast_release_gate.py b/tests/integration/test_v1_fast_release_gate.py index 9cf9316..4b51db7 100644 --- a/tests/integration/test_v1_fast_release_gate.py +++ b/tests/integration/test_v1_fast_release_gate.py @@ -1,6 +1,5 @@ from __future__ import annotations -import json from datetime import datetime, timezone from pathlib import Path @@ -68,8 +67,13 @@ def make_pdf(directory: Path, name: str) -> Path: return path -def test_v1_fast_conversion_writes_markdown_metadata_report_assets_and_quality_counts(tmp_path: Path) -> None: - pdf = make_pdf(tmp_path, "쉘구조_math.pdf") +def report_metadata(result) -> dict: + assert result._report_metadata is not None + return result._report_metadata + + +def 
test_v1_fast_conversion_writes_markdown_report_assets_and_quality_counts(tmp_path: Path) -> None: + pdf = make_pdf(tmp_path, "math.pdf") adapter = FixtureAdapter( raw_markdown=( "# Shell Element\n\n" @@ -85,17 +89,21 @@ def test_v1_fast_conversion_writes_markdown_metadata_report_assets_and_quality_c result = convert_pdf(pdf, tmp_path / "out", adapter=adapter, math_checker=lambda _: True, clock=fixed_clock) assert result.final_status == "partial" + assert result.markdown_path == tmp_path / "out" / "math" / "math_001.md" assert result.markdown_path.exists() - assert result.metadata_path is not None and result.metadata_path.exists() + assert result.metadata_path is None + assert not list((tmp_path / "out").rglob("*.metadata.json")) + assert result.report_path == tmp_path / "out" / "math" / "math_report.md" assert result.report_path.exists() - assert (tmp_path / "out" / "쉘구조_math.assets" / "mesh.png").read_bytes() == b"fake image" + assert result.assets_dir == tmp_path / "out" / "math" / "images" + assert (result.assets_dir / "mesh.png").read_bytes() == b"fake image" markdown = result.markdown_path.read_text(encoding="utf-8") assert "$u_i$" in markdown assert "$$\nK u = f\n$$" in markdown - assert "![mesh](쉘구조_math.assets/mesh.png)" in markdown + assert "![mesh](images/mesh.png)" in markdown - metadata = json.loads(result.metadata_path.read_text(encoding="utf-8")) + metadata = report_metadata(result) assert metadata["engine"] == "MinerU" assert metadata["engine_version"] == "3.1.0" assert metadata["summary"]["pages_processed"] == 3 @@ -105,18 +113,18 @@ def test_v1_fast_conversion_writes_markdown_metadata_report_assets_and_quality_c assert metadata["summary"]["math_render_error_count"] == 0 assert metadata["summary"]["warning_count"] == 1 assert metadata["warnings"][0]["code"] == "TABLE_FALLBACK" - assert metadata["assets"] == [{"relative_path": "쉘구조_math.assets/mesh.png"}] + assert metadata["assets"] == [{"relative_path": "images/mesh.png"}] report = 
result.report_path.read_text(encoding="utf-8") assert "- Final status: `partial`" in report assert "- Output Markdown:" in report - assert "- Metadata JSON:" in report + assert "- Metadata JSON:" not in report assert "- Report Markdown:" in report assert "- Math render error count: 0" in report assert "`TABLE_FALLBACK`" in report -def test_v1_fast_failure_records_no_fallback_and_writes_no_release_outputs(tmp_path: Path) -> None: +def test_v1_fast_failure_records_no_fallback_and_writes_report_only(tmp_path: Path) -> None: pdf = make_pdf(tmp_path, "failed.pdf") adapter = FixtureAdapter(raw_markdown="", succeeded=False) @@ -126,14 +134,15 @@ def test_v1_fast_failure_records_no_fallback_and_writes_no_release_outputs(tmp_p assert result.warning_count == 1 assert result.warnings[0].code == WarningCode.MINERU_CLI_FAILED assert not result.markdown_path.exists() - assert not result.report_path.exists() - assert result.metadata_path is not None and not result.metadata_path.exists() + assert result.report_path.exists() + assert result.metadata_path is None + assert "- Final status: `failed`" in result.report_path.read_text(encoding="utf-8") def test_v1_fast_cli_batch_summary_matches_generated_outputs(tmp_path: Path, capsys) -> None: source = tmp_path / "pdfs" first = make_pdf(source, "a.pdf") - second = make_pdf(source, "한글.pdf") + second = make_pdf(source, "korean.pdf") adapter = FixtureAdapter(raw_markdown="# Batch\n\nNo formulas.\n", raw_structured={"pages": 1}) exit_code = main(["convert", str(source), "--out", str(tmp_path / "out")], adapter=adapter, clock=fixed_clock) @@ -144,9 +153,8 @@ def test_v1_fast_cli_batch_summary_matches_generated_outputs(tmp_path: Path, cap assert "converted: 2" in captured.out assert "failed: 0" in captured.out assert "warnings: 0" in captured.out - assert (tmp_path / "out" / "a.md").exists() - assert (tmp_path / "out" / "a.metadata.json").exists() - assert (tmp_path / "out" / "a.report.md").exists() - assert (tmp_path / "out" / 
"한글.md").exists() - assert (tmp_path / "out" / "한글.metadata.json").exists() - assert (tmp_path / "out" / "한글.report.md").exists() + assert (tmp_path / "out" / "a" / "a_001.md").exists() + assert (tmp_path / "out" / "a" / "a_report.md").exists() + assert (tmp_path / "out" / "korean" / "korean_001.md").exists() + assert (tmp_path / "out" / "korean" / "korean_report.md").exists() + assert not list((tmp_path / "out").rglob("*.metadata.json")) diff --git a/tests/test_cli.py b/tests/test_cli.py index 0d988dd..e88ceac 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -2,6 +2,8 @@ from __future__ import annotations import subprocess import sys +import hashlib +import json from datetime import datetime, timezone from importlib.metadata import entry_points from pathlib import Path @@ -9,8 +11,10 @@ from pathlib import Path import pytest from pypdf import PdfWriter +import pdf2md.conversion as conversion_module from pdf2md.cli import main from pdf2md.doctor import DoctorCheck, DoctorReport +from pdf2md.gpu import GpuInfo from pdf2md.ir import WarningCode, WarningRecord, WarningSeverity from pdf2md.mineru_adapter import MinerUAdapterResult @@ -69,6 +73,24 @@ def make_pdf_with_pages(directory: Path, name: str, page_count: int) -> Path: return path +def write_legacy_metadata(markdown_path: Path, source_pdf: Path) -> Path: + metadata_path = markdown_path.with_suffix(".metadata.json") + metadata = { + "source_pdf": str(source_pdf.resolve()), + "source_sha256": hashlib.sha256(source_pdf.read_bytes()).hexdigest(), + "created_at": "2026-05-08T00:00:00Z", + "engine": "MinerU", + "engine_version": "3.1.0", + "engine_options": {"strict_local": True}, + "pages": [{"page_index": 0, "blocks": []}], + "assets": [], + "warnings": [], + "summary": {"pages_processed": 1, "warning_count": 0}, + } + metadata_path.write_text(json.dumps(metadata, indent=2, sort_keys=True) + "\n", encoding="utf-8") + return metadata_path + + def test_console_script_entry_point_is_reserved() -> None: scripts 
= {entry_point.name: entry_point for entry_point in entry_points(group="console_scripts")} @@ -128,18 +150,91 @@ def test_cli_convert_single_pdf_writes_outputs_and_summary(tmp_path: Path, capsy out = tmp_path / "out" adapter = FakeAdapter() - exit_code = main(["convert", str(pdf), "--out", str(out)], adapter=adapter, clock=fixed_clock) + exit_code = main(["convert", str(pdf), "--out", str(out), "--metadata"], adapter=adapter, clock=fixed_clock) captured = capsys.readouterr() assert exit_code == 0 assert "converted: 1" in captured.out assert "failed: 0" in captured.out assert "warnings: 0" in captured.out - assert (out / "paper.md").exists() - assert (out / "paper.metadata.json").exists() - assert (out / "paper.report.md").exists() + assert (out / "paper" / "paper_001.md").exists() + assert not list(out.rglob("*.metadata.json")) + assert (out / "paper" / "paper_report.md").exists() assert adapter.calls == [pdf.resolve()] - assert adapter.options[0].to_engine_options() == {"strict_local": True, "gpu_device": "cuda:0"} + assert adapter.options[0].to_engine_options() == { + "strict_local": True, + "gpu_device": "cuda:0", + "mineru_profile": { + "requested": "auto", + "applied": "safe", + "environment": { + "MINERU_API_MAX_CONCURRENT_REQUESTS": "1", + "MINERU_PDF_RENDER_THREADS": "1", + "MINERU_PROCESSING_WINDOW_SIZE": "1", + }, + }, + } + + +def test_cli_convert_accepts_safe_mineru_profile(tmp_path: Path, capsys) -> None: + pdf = make_pdf(tmp_path, "paper.pdf") + adapter = FakeAdapter() + + exit_code = main( + ["convert", str(pdf), "--out", str(tmp_path / "out"), "--mineru-profile", "safe"], + adapter=adapter, + clock=fixed_clock, + ) + + capsys.readouterr() + assert exit_code == 0 + assert adapter.options[0].to_engine_options()["mineru_profile"]["requested"] == "safe" + + +def test_cli_convert_accepts_performance_mineru_profile(tmp_path: Path, capsys) -> None: + pdf = make_pdf(tmp_path, "paper.pdf") + adapter = FakeAdapter() + + exit_code = main( + ["convert", 
str(pdf), "--out", str(tmp_path / "out"), "--mineru-profile", "performance"], + adapter=adapter, + clock=fixed_clock, + ) + + capsys.readouterr() + assert exit_code == 0 + profile = adapter.options[0].to_engine_options()["mineru_profile"] + assert profile["requested"] == "performance" + assert profile["applied"] == "safe" + + +def test_cli_convert_rejects_invalid_mineru_profile(tmp_path: Path, capsys) -> None: + pdf = make_pdf(tmp_path, "paper.pdf") + + with pytest.raises(SystemExit) as error: + main(["convert", str(pdf), "--out", str(tmp_path / "out"), "--mineru-profile", "fast"]) + + captured = capsys.readouterr() + assert error.value.code == 2 + assert "invalid choice" in captured.err + + +def test_cli_convert_gpu_auto_selects_largest_visible_gpu(tmp_path: Path, capsys, monkeypatch) -> None: + pdf = make_pdf(tmp_path, "paper.pdf") + adapter = FakeAdapter() + inventory = ( + GpuInfo(index=0, name="NVIDIA RTX 4060", memory_total_mib=8192, driver_version="577.00"), + GpuInfo(index=1, name="NVIDIA RTX 4090", memory_total_mib=24564, driver_version="577.00"), + ) + monkeypatch.setattr(conversion_module, "query_nvidia_gpus", lambda: inventory) + + exit_code = main(["convert", str(pdf), "--out", str(tmp_path / "out"), "--gpu", "auto"], adapter=adapter, clock=fixed_clock) + + capsys.readouterr() + options = adapter.options[0].to_engine_options() + assert exit_code == 0 + assert options["gpu_device"] == "cuda:1" + assert options["mineru_profile"]["selected_gpu"]["name"] == "NVIDIA RTX 4090" def test_cli_convert_directory_is_deterministic(tmp_path: Path, capsys) -> None: @@ -173,7 +268,7 @@ def test_cli_convert_recursive_only_when_requested(tmp_path: Path, capsys) -> No assert exit_code == 0 assert [path.name for path in adapter.calls] == ["child.pdf", "top.pdf"] assert "converted: 2" in captured.out - assert (tmp_path / "out" / "nested" / "child.md").exists() + assert (tmp_path / "out" / "nested" / "child" / "child_001.md").exists() def 
test_cli_failure_summary_returns_nonzero(tmp_path: Path, capsys) -> None: @@ -204,22 +299,23 @@ def test_cli_recheck_markdown_regenerates_adjacent_metadata_and_report(tmp_path: ) capsys.readouterr() - markdown_path = out / "paper.md" + markdown_path = out / "paper" / "paper_001.md" markdown_path.write_text("Inline $x_i$\n", encoding="utf-8") + write_legacy_metadata(markdown_path, pdf) exit_code = main(["recheck", str(markdown_path)], clock=fixed_clock, math_checker=lambda _: True) captured = capsys.readouterr() assert exit_code == 0 assert "rechecked:" in captured.out assert "warnings: 0" in captured.out - assert "- Final status: `success`" in (out / "paper.report.md").read_text(encoding="utf-8") + assert "- Final status: `success`" in markdown_path.with_suffix(".report.md").read_text(encoding="utf-8") def test_cli_preflight_conflict_fails_before_conversion(tmp_path: Path, capsys) -> None: pdf = make_pdf(tmp_path, "paper.pdf") out = tmp_path / "out" - out.mkdir() - (out / "paper.md").write_text("old", encoding="utf-8") + (out / "paper").mkdir(parents=True) + (out / "paper" / "paper_001.md").write_text("old", encoding="utf-8") adapter = FakeAdapter() exit_code = main(["convert", str(pdf), "--out", str(out)], adapter=adapter, clock=fixed_clock) @@ -240,12 +336,12 @@ def test_cli_convert_chunk_pages_flag_uses_default_twenty_pages(tmp_path: Path, captured = capsys.readouterr() assert exit_code == 0 assert "converted: 2" in captured.out - assert [path.name for path in adapter.calls] == [ - "long.part-001.pages-001-020.pdf", - "long.part-002.pages-021-021.pdf", - ] - assert (out / "long.part-001.pages-001-020.md").exists() - assert (out / "long.part-002.pages-021-021.md").exists() + assert len(adapter.calls) == 21 + assert [path.name for path in adapter.calls[:3]] == ["long.page-001.pdf", "long.page-002.pdf", "long.page-003.pdf"] + assert (out / "long" / "long_001.md").exists() + assert (out / "long" / "long_002.md").exists() + assert (out / "long" / 
"long_report.md").exists() + assert not list(out.rglob("*.metadata.json")) def test_cli_convert_rejects_non_positive_chunk_pages(tmp_path: Path, capsys) -> None: diff --git a/tests/test_conversion.py b/tests/test_conversion.py index b8a3a08..fbff6c7 100644 --- a/tests/test_conversion.py +++ b/tests/test_conversion.py @@ -6,10 +6,11 @@ from datetime import datetime, timezone from pathlib import Path import pytest -from pypdf import PdfWriter +from pypdf import PdfReader, PdfWriter import pdf2md.conversion as conversion_module from pdf2md.conversion import BatchConversionResult, convert_input, convert_pdf, recheck_markdown +from pdf2md.gpu import GpuInfo from pdf2md.ir import WarningCode, WarningRecord, WarningSeverity from pdf2md.mineru_adapter import MinerUAdapterResult, StrictLocalViolationError from pdf2md.paths import OutputConflictError @@ -32,6 +33,7 @@ class FakeAdapter: self.warnings = warnings self.asset_name = asset_name self.calls: list[tuple[Path, Path, object]] = [] + self.input_page_counts: list[int] = [] def convert(self, input_pdf, work_dir, options=None) -> MinerUAdapterResult: input_path = Path(input_pdf) @@ -39,6 +41,10 @@ class FakeAdapter: output_dir.mkdir(parents=True, exist_ok=True) (output_dir / "raw.log").write_text("raw output", encoding="utf-8") self.calls.append((input_path, output_dir, options)) + try: + self.input_page_counts.append(len(PdfReader(input_path).pages)) + except Exception: + self.input_page_counts.append(0) asset_paths: tuple[Path, ...] 
= () if self.asset_name is not None: asset_path = output_dir / "assets" / self.asset_name @@ -67,12 +73,17 @@ class SequencedAdapter: def __init__(self, outcomes: tuple[bool, ...]) -> None: self.outcomes = list(outcomes) self.calls: list[Path] = [] + self.input_page_counts: list[int] = [] def convert(self, input_pdf, work_dir, options=None) -> MinerUAdapterResult: input_path = Path(input_pdf) output_dir = Path(work_dir) output_dir.mkdir(parents=True, exist_ok=True) self.calls.append(input_path) + try: + self.input_page_counts.append(len(PdfReader(input_path).pages)) + except Exception: + self.input_page_counts.append(0) succeeded = self.outcomes.pop(0) warning = WarningRecord(WarningCode.MINERU_CLI_FAILED, WarningSeverity.ERROR, "MinerU failed.") return MinerUAdapterResult( @@ -93,6 +104,66 @@ class SequencedAdapter: ) +class PageMarkdownAdapter: + def __init__(self, markdown_pages: tuple[str, ...]) -> None: + self.markdown_pages = list(markdown_pages) + self.calls: list[Path] = [] + + def convert(self, input_pdf, work_dir, options=None) -> MinerUAdapterResult: + input_path = Path(input_pdf) + output_dir = Path(work_dir) + output_dir.mkdir(parents=True, exist_ok=True) + self.calls.append(input_path) + markdown = self.markdown_pages.pop(0) + return MinerUAdapterResult( + succeeded=True, + command=("mineru", "-p", str(input_path), "-o", str(output_dir)), + input_pdf=input_path, + work_dir=output_dir, + raw_markdown=markdown, + raw_structured={"pages": 1}, + asset_paths=(), + warnings=(), + engine="MinerU", + engine_version="3.1.0", + engine_options=options.to_engine_options() if options is not None else {"strict_local": True}, + exit_code=0, + stdout="", + stderr="", + ) + + +class CollidingPageAssetAdapter: + def __init__(self) -> None: + self.calls: list[Path] = [] + + def convert(self, input_pdf, work_dir, options=None) -> MinerUAdapterResult: + input_path = Path(input_pdf) + output_dir = Path(work_dir) + output_dir.mkdir(parents=True, exist_ok=True) + 
self.calls.append(input_path) + page_number = len(self.calls) + asset_path = output_dir / "assets" / "fig.png" + asset_path.parent.mkdir(parents=True, exist_ok=True) + asset_path.write_bytes(f"asset {page_number}".encode("utf-8")) + return MinerUAdapterResult( + succeeded=True, + command=("mineru", "-p", str(input_path), "-o", str(output_dir)), + input_pdf=input_path, + work_dir=output_dir, + raw_markdown=f"Page {page_number}\n\n![fig](assets/fig.png)\n", + raw_structured={"pages": 1}, + asset_paths=(asset_path,), + warnings=(), + engine="MinerU", + engine_version="3.1.0", + engine_options=options.to_engine_options() if options is not None else {"strict_local": True}, + exit_code=0, + stdout="", + stderr="", + ) + + class NestedMinerUAssetAdapter: def convert(self, input_pdf, work_dir, options=None) -> MinerUAdapterResult: input_path = Path(input_pdf) @@ -140,6 +211,20 @@ def make_pdf_with_pages(tmp_path: Path, page_count: int, name: str = "paper.pdf" return path +def report_metadata(result) -> dict: + assert result._report_metadata is not None + return result._report_metadata + + +def write_legacy_metadata(result) -> Path: + metadata_path = result.markdown_path.with_suffix(".metadata.json") + metadata_path.write_text( + json.dumps(report_metadata(result), indent=2, ensure_ascii=False, sort_keys=True) + "\n", + encoding="utf-8", + ) + return metadata_path + + def test_convert_pdf_writes_markdown_metadata_report_and_assets(tmp_path: Path) -> None: pdf = make_pdf(tmp_path) adapter = FakeAdapter( @@ -156,18 +241,23 @@ def test_convert_pdf_writes_markdown_metadata_report_and_assets(tmp_path: Path) assert result.warning_count == 0 assert result.engine == "MinerU" assert result.engine_version == "3.1.0" - assert result.markdown_path.read_text(encoding="utf-8") == "# Title\n\nInline $x_i$\n\n![fig](paper.assets/fig.png)\n" - assert (tmp_path / "out" / "paper.assets" / "fig.png").read_bytes() == b"asset" + assert result.markdown_path == tmp_path / "out" / "paper" / 
"paper_001.md" + assert result.markdown_path.read_text(encoding="utf-8") == "# Title\n\nInline $x_i$\n\n![fig](images/fig.png)\n" + assert (tmp_path / "out" / "paper" / "images" / "fig.png").read_bytes() == b"asset" + assert result.metadata_path is None + assert not list((tmp_path / "out").rglob("*.metadata.json")) assert result.report_path.exists() - metadata = json.loads(result.metadata_path.read_text(encoding="utf-8")) + metadata = report_metadata(result) assert metadata["source_sha256"] == hashlib.sha256(pdf.read_bytes()).hexdigest() assert metadata["created_at"] == "2026-05-08T00:00:00Z" assert metadata["summary"]["pages_processed"] == 2 assert metadata["summary"]["inline_formula_count"] == 1 assert metadata["summary"]["asset_count"] == 1 - assert metadata["assets"] == [{"relative_path": "paper.assets/fig.png"}] - assert "- Final status: `success`" in result.report_path.read_text(encoding="utf-8") + assert metadata["assets"] == [{"relative_path": "images/fig.png"}] + report = result.report_path.read_text(encoding="utf-8") + assert "- Final status: `success`" in report + assert "Metadata JSON:" not in report assert not adapter.calls[0][1].exists() @@ -183,14 +273,16 @@ def test_convert_pdf_adapter_failure_returns_failed_result_without_fallback_or_o assert result.warnings == (warning,) assert len(adapter.calls) == 1 assert not result.markdown_path.exists() - assert not result.report_path.exists() + assert result.metadata_path is None + assert result.report_path.exists() + assert "- Final status: `failed`" in result.report_path.read_text(encoding="utf-8") def test_convert_pdf_respects_output_conflicts_and_overwrite(tmp_path: Path) -> None: pdf = make_pdf(tmp_path) out = tmp_path / "out" - out.mkdir() - (out / "paper.md").write_text("old", encoding="utf-8") + (out / "paper").mkdir(parents=True) + (out / "paper" / "paper_001.md").write_text("old", encoding="utf-8") with pytest.raises(OutputConflictError): convert_pdf(pdf, out, adapter=FakeAdapter(), 
clock=fixed_clock) @@ -209,7 +301,7 @@ def test_convert_pdf_can_skip_metadata_json_but_still_writes_report(tmp_path: Pa assert result.metadata_path is None assert result.markdown_path.exists() assert result.report_path.exists() - assert not (tmp_path / "out" / "paper.metadata.json").exists() + assert not list((tmp_path / "out").rglob("*.metadata.json")) report = result.report_path.read_text(encoding="utf-8") assert "Metadata JSON:" not in report assert "Report Markdown:" in report @@ -223,7 +315,7 @@ def test_convert_pdf_records_math_checker_failures_in_metadata_and_report(tmp_pa assert result.final_status == "partial" assert [warning.code for warning in result.warnings] == [WarningCode.MATH_RENDER_FAILED] - metadata = json.loads(result.metadata_path.read_text(encoding="utf-8")) + metadata = report_metadata(result) assert metadata["summary"]["math_render_error_count"] == 1 assert metadata["warnings"][0]["code"] == "MATH_RENDER_FAILED" report = result.report_path.read_text(encoding="utf-8") @@ -244,7 +336,7 @@ def test_convert_pdf_repairs_math_render_failure_before_writing_outputs(tmp_path assert result.final_status == "partial" assert result.markdown_path.read_text(encoding="utf-8") == "$$\nx ^ {i} {} ^ {t}\n$$" assert [warning.code for warning in result.warnings] == [WarningCode.MATH_RENDER_REPAIRED] - metadata = json.loads(result.metadata_path.read_text(encoding="utf-8")) + metadata = report_metadata(result) assert metadata["summary"]["math_render_error_count"] == 0 assert metadata["warnings"][0]["code"] == "MATH_RENDER_REPAIRED" report = result.report_path.read_text(encoding="utf-8") @@ -256,6 +348,7 @@ def test_recheck_markdown_regenerates_metadata_and_report_from_current_markdown( pdf = make_pdf(tmp_path) adapter = FakeAdapter(raw_markdown="Inline \\(bad_math\\)\n") result = convert_pdf(pdf, tmp_path / "out", adapter=adapter, math_checker=lambda _: False, clock=fixed_clock) + legacy_metadata_path = write_legacy_metadata(result) 
result.markdown_path.write_text("Inline $x_i$\n", encoding="utf-8") rechecked = recheck_markdown(result.markdown_path, math_checker=lambda _: True, clock=fixed_clock) @@ -263,9 +356,9 @@ def test_recheck_markdown_regenerates_metadata_and_report_from_current_markdown( assert rechecked.final_status == "success" assert rechecked.warning_count == 0 assert rechecked.markdown_path == result.markdown_path - assert rechecked.metadata_path == result.metadata_path - assert rechecked.report_path == result.report_path - metadata = json.loads(result.metadata_path.read_text(encoding="utf-8")) + assert rechecked.metadata_path == legacy_metadata_path + assert rechecked.report_path == result.markdown_path.with_suffix(".report.md") + metadata = json.loads(legacy_metadata_path.read_text(encoding="utf-8")) assert metadata["source_sha256"] == hashlib.sha256(pdf.read_bytes()).hexdigest() assert metadata["created_at"] == "2026-05-08T00:00:00Z" assert metadata["summary"]["pages_processed"] == 1 @@ -273,7 +366,7 @@ def test_recheck_markdown_regenerates_metadata_and_report_from_current_markdown( assert metadata["summary"]["math_render_error_count"] == 0 assert metadata["summary"]["warning_count"] == 0 assert metadata["warnings"] == [] - report = result.report_path.read_text(encoding="utf-8") + report = rechecked.report_path.read_text(encoding="utf-8") assert "- Final status: `success`" in report assert "- Math render error count: 0" in report assert "- None" in report @@ -287,17 +380,26 @@ def test_recheck_markdown_repairs_math_render_failure(tmp_path: Path) -> None: pdf = make_pdf(tmp_path) adapter = FakeAdapter(raw_markdown="No formulas.\n") result = convert_pdf(pdf, tmp_path / "out", adapter=adapter, math_checker=lambda _: True, clock=fixed_clock) + legacy_metadata_path = write_legacy_metadata(result) result.markdown_path.write_text("$$\nx ^ {i} ^ {t}\n$$\n", encoding="utf-8") rechecked = recheck_markdown(result.markdown_path, math_checker=RepairAwareChecker(), clock=fixed_clock) assert 
rechecked.markdown_path.read_text(encoding="utf-8") == "$$\nx ^ {i} {} ^ {t}\n$$\n" assert [warning.code for warning in rechecked.warnings] == [WarningCode.MATH_RENDER_REPAIRED] - metadata = json.loads(result.metadata_path.read_text(encoding="utf-8")) + metadata = json.loads(legacy_metadata_path.read_text(encoding="utf-8")) assert metadata["summary"]["math_render_error_count"] == 0 assert metadata["warnings"][0]["code"] == "MATH_RENDER_REPAIRED" +def test_recheck_markdown_requires_legacy_metadata_for_simplified_outputs(tmp_path: Path) -> None: + pdf = make_pdf(tmp_path) + result = convert_pdf(pdf, tmp_path / "out", adapter=FakeAdapter(), math_checker=lambda _: True, clock=fixed_clock) + + with pytest.raises(ValueError, match="Legacy adjacent metadata JSON"): + recheck_markdown(result.markdown_path, math_checker=lambda _: True, clock=fixed_clock) + + def test_convert_pdf_records_unavailable_math_checker_for_math_output(tmp_path: Path, monkeypatch) -> None: pdf = make_pdf(tmp_path) adapter = FakeAdapter(raw_markdown="Inline \\(x\\)\n") @@ -308,7 +410,7 @@ def test_convert_pdf_records_unavailable_math_checker_for_math_output(tmp_path: assert result.final_status == "partial" assert result.warnings[0].code == WarningCode.MATH_RENDER_FAILED assert result.warnings[0].severity == WarningSeverity.INFO - metadata = json.loads(result.metadata_path.read_text(encoding="utf-8")) + metadata = report_metadata(result) assert metadata["summary"]["warning_count"] == 1 assert metadata["summary"]["math_render_error_count"] == 0 report = result.report_path.read_text(encoding="utf-8") @@ -316,6 +418,55 @@ def test_convert_pdf_records_unavailable_math_checker_for_math_output(tmp_path: assert "- Math render error count: 0" in report +def test_convert_pdf_records_text_fidelity_without_replacing_markdown(tmp_path: Path, monkeypatch) -> None: + pdf = make_pdf(tmp_path) + adapter = FakeAdapter(raw_markdown="쉘의 력과 曲률\n", raw_structured={"pages": 1}) + monkeypatch.setattr( + 
"pdf2md.text_fidelity.extract_pdf_text_pages", + lambda _: ("쉘의 응력과 곡률\n",), + ) + + result = convert_pdf(pdf, tmp_path / "out", adapter=adapter, math_checker=lambda _: True, clock=fixed_clock) + + assert result.markdown_path.read_text(encoding="utf-8") == "쉘의 력과 曲률\n" + assert [warning.code for warning in result.warnings] == [ + WarningCode.TEXT_LAYER_AVAILABLE, + WarningCode.TEXT_FIDELITY_LOW, + WarningCode.UNEXPECTED_CJK_IN_KOREAN_TEXT, + ] + metadata = report_metadata(result) + assert metadata["text_fidelity"][0]["replacement_candidate"] is True + assert metadata["summary"]["text_fidelity_low_page_count"] == 1 + assert metadata["summary"]["text_fidelity_unexpected_cjk_count"] == 1 + report = result.report_path.read_text(encoding="utf-8") + assert "## Text Fidelity" in report + assert "`TEXT_FIDELITY_LOW` page 0" in report + + +def test_recheck_markdown_reruns_text_fidelity_without_duplicate_old_warnings(tmp_path: Path, monkeypatch) -> None: + pdf = make_pdf(tmp_path) + monkeypatch.setattr( + "pdf2md.text_fidelity.extract_pdf_text_pages", + lambda _: ("쉘의 응력과 곡률\n",), + ) + result = convert_pdf( + pdf, + tmp_path / "out", + adapter=FakeAdapter(raw_markdown="쉘의 력과 曲률\n", raw_structured={"pages": 1}), + math_checker=lambda _: True, + clock=fixed_clock, + ) + + result.markdown_path.write_text("쉘의 응력과 곡률\n", encoding="utf-8") + legacy_metadata_path = write_legacy_metadata(result) + rechecked = recheck_markdown(result.markdown_path, math_checker=lambda _: True, clock=fixed_clock) + + assert [warning.code for warning in rechecked.warnings] == [WarningCode.TEXT_LAYER_AVAILABLE] + metadata = json.loads(legacy_metadata_path.read_text(encoding="utf-8")) + assert [warning["code"] for warning in metadata["warnings"]] == ["TEXT_LAYER_AVAILABLE"] + assert metadata["summary"]["text_fidelity_low_page_count"] == 0 + + def test_convert_pdf_uses_default_math_checker_when_available(tmp_path: Path, monkeypatch) -> None: class DefaultChecker: def __init__(self) -> None: @@ -342,7 
+493,7 @@ def test_convert_pdf_keep_raw_preserves_adapter_work_directory(tmp_path: Path) - result = convert_pdf(pdf, tmp_path / "out", keep_raw=True, adapter=FakeAdapter(), clock=fixed_clock) - assert result.raw_dir == tmp_path / "out" / "paper.raw" + assert result.raw_dir == tmp_path / "out" / "paper" / "raw" assert (result.raw_dir / "raw.log").read_text(encoding="utf-8") == "raw output" @@ -359,7 +510,10 @@ def test_convert_pdf_passes_gpu_device_to_strict_local_options(tmp_path: Path) - convert_pdf(pdf, tmp_path / "out", gpu="cuda:0", adapter=adapter, clock=fixed_clock) - assert adapter.calls[0][2].to_engine_options() == {"strict_local": True, "gpu_device": "cuda:0"} + engine_options = adapter.calls[0][2].to_engine_options() + assert engine_options["strict_local"] is True + assert engine_options["gpu_device"] == "cuda:0" + assert engine_options["mineru_profile"]["requested"] == "auto" def test_convert_pdf_defaults_to_cuda_zero(tmp_path: Path) -> None: @@ -368,7 +522,58 @@ def test_convert_pdf_defaults_to_cuda_zero(tmp_path: Path) -> None: convert_pdf(pdf, tmp_path / "out", adapter=adapter, clock=fixed_clock) - assert adapter.calls[0][2].to_engine_options() == {"strict_local": True, "gpu_device": "cuda:0"} + engine_options = adapter.calls[0][2].to_engine_options() + assert engine_options["strict_local"] is True + assert engine_options["gpu_device"] == "cuda:0" + assert engine_options["mineru_profile"]["requested"] == "auto" + + +def test_convert_pdf_gpu_auto_selects_largest_gpu_and_records_profile(tmp_path: Path) -> None: + pdf = make_pdf(tmp_path) + adapter = FakeAdapter() + inventory = ( + GpuInfo(index=0, name="NVIDIA RTX 4060", memory_total_mib=8192, driver_version="577.00"), + GpuInfo(index=1, name="NVIDIA RTX 4090", memory_total_mib=24564, driver_version="577.00"), + ) + + result = convert_pdf( + pdf, + tmp_path / "out", + gpu="auto", + mineru_profile="auto", + gpu_inventory=inventory, + adapter=adapter, + clock=fixed_clock, + ) + + engine_options = 
adapter.calls[0][2].to_engine_options() + assert engine_options["gpu_device"] == "cuda:1" + assert engine_options["mineru_profile"]["applied"] == "auto" + assert engine_options["mineru_profile"]["selected_gpu"]["index"] == 1 + metadata = report_metadata(result) + assert metadata["engine_options"]["gpu_device"] == "cuda:1" + assert metadata["engine_options"]["mineru_profile"]["selected_gpu"]["name"] == "NVIDIA RTX 4090" + + +def test_convert_pdf_performance_profile_warning_is_recorded(tmp_path: Path) -> None: + pdf = make_pdf(tmp_path) + adapter = FakeAdapter() + inventory = (GpuInfo(index=0, name="NVIDIA GeForce GTX 1070 Ti", memory_total_mib=8192, driver_version="577.00"),) + + result = convert_pdf( + pdf, + tmp_path / "out", + gpu="cuda:0", + mineru_profile="performance", + gpu_inventory=inventory, + adapter=adapter, + clock=fixed_clock, + ) + + assert [warning.code for warning in result.warnings] == [WarningCode.MINERU_PROFILE_ADJUSTED] + metadata = report_metadata(result) + assert metadata["warnings"][0]["code"] == "MINERU_PROFILE_ADJUSTED" + assert metadata["engine_options"]["mineru_profile"]["applied"] == "safe" def test_convert_pdf_rewrites_nested_mineru_image_links_and_page_indexes(tmp_path: Path) -> None: @@ -385,11 +590,10 @@ def test_convert_pdf_rewrites_nested_mineru_image_links_and_page_indexes(tmp_pat assert result.final_status == "success" assert result.pages_processed == 13 markdown = result.markdown_path.read_text(encoding="utf-8") - assert "![fig](paper.assets/paper/hybrid_auto/images/fig.png)" in markdown - assert "](images/fig.png)" not in markdown - copied_asset = tmp_path / "out" / "paper.assets" / "paper" / "hybrid_auto" / "images" / "fig.png" + assert "![fig](images/fig.png)" in markdown + copied_asset = tmp_path / "out" / "paper" / "images" / "fig.png" assert copied_asset.read_bytes() == b"nested asset" - metadata = json.loads(result.metadata_path.read_text(encoding="utf-8")) + metadata = report_metadata(result) assert 
metadata["summary"]["pages_processed"] == 13 assert metadata["summary"]["warning_count"] == 0 @@ -406,12 +610,13 @@ def test_convert_input_batch_continues_after_per_file_failure(tmp_path: Path) -> assert [path.name for path in adapter.calls] == ["a.pdf", "b.pdf", "c.pdf"] assert batch.converted_count == 2 assert batch.failed_count == 1 - assert (tmp_path / "out" / "a.md").exists() - assert not (tmp_path / "out" / "b.md").exists() - assert (tmp_path / "out" / "c.md").exists() + assert (tmp_path / "out" / "a" / "a_001.md").exists() + assert not (tmp_path / "out" / "b" / "b_001.md").exists() + assert (tmp_path / "out" / "b" / "b_report.md").exists() + assert (tmp_path / "out" / "c" / "c_001.md").exists() -def test_convert_pdf_chunk_mode_returns_batch_and_deletes_temporary_chunk_pdfs(tmp_path: Path) -> None: +def test_convert_pdf_chunk_mode_converts_single_pages_and_returns_grouped_outputs(tmp_path: Path) -> None: pdf = make_pdf_with_pages(tmp_path, 41, "thesis.pdf") adapter = FakeAdapter(raw_structured={"pages": 1}) @@ -427,60 +632,157 @@ def test_convert_pdf_chunk_mode_returns_batch_and_deletes_temporary_chunk_pdfs(t assert isinstance(batch, BatchConversionResult) assert batch.converted_count == 3 assert [result.markdown_path.name for result in batch.results] == [ - "thesis.part-001.pages-001-020.md", - "thesis.part-002.pages-021-040.md", - "thesis.part-003.pages-041-041.md", + "thesis_001.md", + "thesis_002.md", + "thesis_003.md", ] - assert [path.name for path, _, _ in adapter.calls] == [ - "thesis.part-001.pages-001-020.pdf", - "thesis.part-002.pages-021-040.pdf", - "thesis.part-003.pages-041-041.pdf", + assert len(adapter.calls) == 41 + assert adapter.input_page_counts == [1] * 41 + assert [path.name for path, _, _ in adapter.calls[:3]] == [ + "thesis.page-001.pdf", + "thesis.page-002.pdf", + "thesis.page-003.pdf", ] assert all(result.source_pdf == pdf.resolve() for result in batch.results) assert all(not path.exists() for path, _, _ in adapter.calls) - metadata 
= json.loads((tmp_path / "out" / "thesis.part-002.pages-021-040.metadata.json").read_text(encoding="utf-8")) + assert all(result.metadata_path is None for result in batch.results) + assert not list((tmp_path / "out").rglob("*.metadata.json")) + assert {result.report_path for result in batch.results} == {tmp_path / "out" / "thesis" / "thesis_report.md"} + + metadata = report_metadata(batch.results[1]) assert metadata["source_pdf"] == str(pdf.resolve()) assert metadata["source_sha256"] == hashlib.sha256(pdf.read_bytes()).hexdigest() assert metadata["engine_options"]["chunk"] == { "chunk_index": 2, "chunk_page_count": 20, - "chunk_pdf_name": "thesis.part-002.pages-021-040.pdf", "original_source_pdf": str(pdf.resolve()), "source_page_end": 40, "source_page_start": 21, "total_chunks": 3, } - report = (tmp_path / "out" / "thesis.part-002.pages-021-040.report.md").read_text(encoding="utf-8") - assert "- Chunk: 2/3, source pages: 21-40" in report + assert metadata["engine_options"]["page_conversion"] == { + "failed_source_pages": [], + "mineru_input_page_count": 1, + "mode": "single_page", + "output_group_page_count": 20, + } + report = (tmp_path / "out" / "thesis" / "thesis_report.md").read_text(encoding="utf-8") + assert "- Markdown part 2/3:" in report + assert "source pages 21-40" in report + assert "thesis_002.md" in report -def test_convert_pdf_chunk_mode_keeps_short_pdf_as_single_batch_result(tmp_path: Path) -> None: +def test_convert_pdf_chunk_mode_converts_short_pdf_as_single_page_inputs(tmp_path: Path) -> None: pdf = make_pdf_with_pages(tmp_path, 3, "short.pdf") - adapter = FakeAdapter(raw_structured={"pages": 3}) + adapter = FakeAdapter(raw_structured={"pages": 1}) batch = convert_pdf(pdf, tmp_path / "out", adapter=adapter, chunk_pages=20, clock=fixed_clock) assert isinstance(batch, BatchConversionResult) assert batch.converted_count == 1 - assert batch.results[0].markdown_path.name == "short.md" - assert adapter.calls[0][0] == pdf.resolve() - assert 
adapter.calls[0][0].exists() + assert batch.results[0].markdown_path.name == "short_001.md" + assert [path.name for path, _, _ in adapter.calls] == [ + "short.page-001.pdf", + "short.page-002.pdf", + "short.page-003.pdf", + ] + assert adapter.input_page_counts == [1, 1, 1] + assert all(not path.exists() for path, _, _ in adapter.calls) + metadata = report_metadata(batch.results[0]) + assert metadata["engine_options"]["chunk"]["chunk_page_count"] == 3 + assert metadata["engine_options"]["page_conversion"]["output_group_page_count"] == 20 -def test_convert_input_chunk_mode_continues_after_failed_chunk(tmp_path: Path) -> None: - pdf = make_pdf_with_pages(tmp_path, 41, "paper.pdf") +def test_convert_input_chunk_mode_continues_after_failed_page_inside_group(tmp_path: Path) -> None: + pdf = make_pdf_with_pages(tmp_path, 3, "paper.pdf") adapter = SequencedAdapter((True, False, True)) batch = convert_input(pdf, tmp_path / "out", adapter=adapter, chunk_pages=20, clock=fixed_clock) - assert batch.converted_count == 2 - assert batch.failed_count == 1 + assert batch.converted_count == 1 + assert batch.failed_count == 0 assert [path.name for path in adapter.calls] == [ - "paper.part-001.pages-001-020.pdf", - "paper.part-002.pages-021-040.pdf", - "paper.part-003.pages-041-041.pdf", + "paper.page-001.pdf", + "paper.page-002.pdf", + "paper.page-003.pdf", ] - assert (tmp_path / "out" / "paper.part-001.pages-001-020.md").exists() - assert not (tmp_path / "out" / "paper.part-002.pages-021-040.md").exists() - assert (tmp_path / "out" / "paper.part-003.pages-041-041.md").exists() + assert adapter.input_page_counts == [1, 1, 1] + assert (tmp_path / "out" / "paper" / "paper_001.md").exists() + markdown = (tmp_path / "out" / "paper" / "paper_001.md").read_text(encoding="utf-8") + assert "" in markdown + metadata = report_metadata(batch.results[0]) + assert metadata["summary"]["pages_processed"] == 3 + assert metadata["warnings"][0]["code"] == "MINERU_CLI_FAILED" + assert 
metadata["warnings"][0]["severity"] == "warning" + assert metadata["warnings"][0]["page_index"] == 1 + assert metadata["engine_options"]["page_conversion"]["failed_source_pages"] == [2] + assert "- Final status: `partial`" in (tmp_path / "out" / "paper" / "paper_report.md").read_text( + encoding="utf-8" + ) + + +def test_convert_pdf_chunk_mode_failed_group_writes_report_but_no_markdown(tmp_path: Path) -> None: + pdf = make_pdf_with_pages(tmp_path, 2, "paper.pdf") + adapter = SequencedAdapter((False, False)) + + batch = convert_pdf(pdf, tmp_path / "out", adapter=adapter, chunk_pages=20, clock=fixed_clock) + + assert batch.converted_count == 0 + assert batch.failed_count == 1 + [result] = batch.results + assert result.final_status == "failed" + assert not result.markdown_path.exists() + assert result.metadata_path is None + assert not list((tmp_path / "out").rglob("*.metadata.json")) + assert result.report_path.exists() + metadata = report_metadata(result) + assert [warning["page_index"] for warning in metadata["warnings"]] == [0, 1] + assert {warning["severity"] for warning in metadata["warnings"]} == {"error"} + + +def test_convert_pdf_chunk_mode_copies_page_assets_without_collisions(tmp_path: Path) -> None: + pdf = make_pdf_with_pages(tmp_path, 2, "paper.pdf") + adapter = CollidingPageAssetAdapter() + + batch = convert_pdf(pdf, tmp_path / "out", adapter=adapter, chunk_pages=20, math_checker=lambda _: True, clock=fixed_clock) + + [result] = batch.results + markdown = result.markdown_path.read_text(encoding="utf-8") + assert "![fig](images/page-001_fig.png)" in markdown + assert "![fig](images/page-002_fig.png)" in markdown + assert (result.assets_dir / "page-001_fig.png").read_bytes() == b"asset 1" + assert (result.assets_dir / "page-002_fig.png").read_bytes() == b"asset 2" + metadata = report_metadata(result) + assert [asset["relative_path"] for asset in metadata["assets"]] == [ + "images/page-001_fig.png", + "images/page-002_fig.png", + ] + + +def 
test_convert_pdf_chunk_mode_preserves_page_text_fidelity_numbers(tmp_path: Path, monkeypatch) -> None: + pdf = make_pdf_with_pages(tmp_path, 3, "korean.pdf") + extraction_calls: list[Path] = [] + + def fake_extract(source_pdf: Path) -> tuple[str, ...]: + extraction_calls.append(source_pdf) + return ("가나다", "라마바", "사아자") + + monkeypatch.setattr(conversion_module, "extract_pdf_text_pages", fake_extract) + adapter = PageMarkdownAdapter(("가나다\n", "라마\n", "사아자\n")) + + batch = convert_pdf( + pdf, + tmp_path / "out", + adapter=adapter, + chunk_pages=20, + math_checker=lambda _: True, + clock=fixed_clock, + ) + + [result] = batch.results + metadata = report_metadata(result) + assert [record["page_index"] for record in metadata["text_fidelity"]] == [0, 1, 2] + assert [record["source_page_number"] for record in metadata["text_fidelity"]] == [1, 2, 3] + assert metadata["summary"]["text_fidelity_checked_page_count"] == 3 + assert extraction_calls == [pdf.resolve()] diff --git a/tests/test_doctor.py b/tests/test_doctor.py index 2049551..54bb5cc 100644 --- a/tests/test_doctor.py +++ b/tests/test_doctor.py @@ -165,7 +165,7 @@ def test_doctor_warns_when_gpu_and_pytorch_are_missing(tmp_path: Path) -> None: def test_doctor_warns_for_gtx_1070_ti_pascal_risk(tmp_path: Path) -> None: - report = make_report(tmp_path, gpu_stdout="NVIDIA GeForce GTX 1070 Ti, 8192 MiB, 551.86\n") + report = make_report(tmp_path, gpu_stdout="0, NVIDIA GeForce GTX 1070 Ti, 8192, 551.86\n") gpu_check = find_check(report, "gpu") assert report.status == "warn" @@ -181,7 +181,7 @@ def test_doctor_warns_for_pytorch_pre_turing_capability(tmp_path: Path) -> None: report = make_report( tmp_path, - gpu_stdout="NVIDIA RTX 4060, 8192 MiB, 551.86\n", + gpu_stdout="0, NVIDIA RTX 4060, 8192, 551.86\n", import_module=fake_pascal_torch, ) @@ -220,7 +220,7 @@ def test_doctor_warns_when_mathjax_health_fails(tmp_path: Path) -> None: def failing_runner(command: tuple[str, ...]) -> DoctorCommandResult: if command[-1] == 
"--health": return DoctorCommandResult(command, 1, stderr="Cannot find package 'mathjax'") - return command_runner("NVIDIA RTX 4060, 8192 MiB, 551.86\n")(command) + return command_runner("0, NVIDIA RTX 4060, 8192, 551.86\n")(command) report = make_report(tmp_path, run_command=failing_runner) @@ -232,7 +232,7 @@ def test_doctor_warns_when_mathjax_health_fails(tmp_path: Path) -> None: def test_format_doctor_report_is_stable(tmp_path: Path) -> None: - report = make_report(tmp_path, gpu_stdout="NVIDIA GeForce GTX 1070 Ti, 8192 MiB, 551.86\n") + report = make_report(tmp_path, gpu_stdout="0, NVIDIA GeForce GTX 1070 Ti, 8192, 551.86\n") formatted = format_doctor_report(report) @@ -241,13 +241,29 @@ def test_format_doctor_report_is_stable(tmp_path: Path) -> None: assert "[PASS] local-only:" in formatted +def test_doctor_reports_auto_gpu_and_recommended_profile(tmp_path: Path) -> None: + report = make_report( + tmp_path, + gpu_stdout=( + "0, NVIDIA RTX 4060, 8192, 577.00\n" + "1, NVIDIA RTX 4090, 24564, 577.00\n" + ), + ) + + gpu_check = find_check(report, "gpu") + assert gpu_check.status == "pass" + assert any("gpu 1: NVIDIA RTX 4090, 24564 MiB, driver 577.00" in detail for detail in gpu_check.details) + assert any("auto gpu: cuda:1" in detail for detail in gpu_check.details) + assert any("recommended MinerU profile: auto" in detail for detail in gpu_check.details) + + def make_report( tmp_path: Path, *, python_version: tuple[int, int, int] = (3, 12, 7), available_tools: dict[str, str] | None = None, mineru_result: MinerUVersionResult | None = None, - gpu_stdout: str = "NVIDIA RTX 4060, 8192 MiB, 551.86\n", + gpu_stdout: str = "0, NVIDIA RTX 4060, 8192, 551.86\n", env: dict[str, str] | None = None, existing_paths: set[Path] | None = None, import_module=None, diff --git a/tests/test_gpu.py b/tests/test_gpu.py new file mode 100644 index 0000000..2f7670d --- /dev/null +++ b/tests/test_gpu.py @@ -0,0 +1,65 @@ +from __future__ import annotations + +import pytest + +from 
pdf2md.gpu import GpuInfo, parse_nvidia_smi_gpus, select_gpu + + +def test_parse_nvidia_smi_output_with_one_rtx_gpu() -> None: + gpus = parse_nvidia_smi_gpus("0, NVIDIA GeForce RTX 4090, 24564, 577.00\n") + + assert gpus == ( + GpuInfo(index=0, name="NVIDIA GeForce RTX 4090", memory_total_mib=24564, driver_version="577.00"), + ) + assert gpus[0].pre_turing_risk is False + + +def test_parse_nvidia_smi_output_with_multiple_gpus_and_mib_suffix() -> None: + gpus = parse_nvidia_smi_gpus( + "0, NVIDIA GeForce GTX 1070 Ti, 8192 MiB, 577.00\n" + "1, NVIDIA RTX A5000, 24564 MiB, 577.00\n" + ) + + assert [gpu.index for gpu in gpus] == [0, 1] + assert [gpu.memory_total_mib for gpu in gpus] == [8192, 24564] + assert gpus[0].pre_turing_risk is True + assert gpus[1].pre_turing_risk is False + + +def test_parse_nvidia_smi_output_ignores_blank_lines_and_rejects_malformed_memory() -> None: + with pytest.raises(ValueError, match="memory"): + parse_nvidia_smi_gpus("\n0, NVIDIA RTX 4090, not-memory, 577.00\n") + + +def test_select_gpu_auto_chooses_largest_vram_gpu() -> None: + gpus = ( + GpuInfo(index=0, name="NVIDIA RTX 4060", memory_total_mib=8192, driver_version="577.00"), + GpuInfo(index=1, name="NVIDIA RTX 4090", memory_total_mib=24564, driver_version="577.00"), + ) + + selected = select_gpu(gpus, "auto") + + assert selected.gpu == gpus[1] + assert selected.cuda_device == "cuda:1" + + +def test_select_gpu_accepts_cuda_and_numeric_requests() -> None: + gpus = ( + GpuInfo(index=0, name="NVIDIA RTX 4060", memory_total_mib=8192, driver_version="577.00"), + GpuInfo(index=1, name="NVIDIA RTX 4090", memory_total_mib=24564, driver_version="577.00"), + ) + + assert select_gpu(gpus, "cuda:1").gpu == gpus[1] + assert select_gpu(gpus, "1").cuda_device == "cuda:1" + + +def test_select_gpu_errors_when_requested_gpu_is_absent() -> None: + gpus = (GpuInfo(index=0, name="NVIDIA RTX 4060", memory_total_mib=8192, driver_version="577.00"),) + + with pytest.raises(ValueError, match="not visible"): + 
select_gpu(gpus, "cuda:1") + + +def test_select_gpu_auto_errors_without_visible_gpus() -> None: + with pytest.raises(ValueError, match="No visible NVIDIA GPU"): + select_gpu((), "auto") diff --git a/tests/test_metadata.py b/tests/test_metadata.py index 2d3ecbc..a4d67e6 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -11,6 +11,7 @@ from pdf2md.ir import ( BlockType, DocumentRecord, PageRecord, + TextFidelityRecord, WarningCode, WarningRecord, WarningSeverity, @@ -171,3 +172,53 @@ def test_info_math_render_warning_is_not_counted_as_render_error(tmp_path: Path) assert summary["warning_count"] == 1 assert summary["math_render_error_count"] == 0 + + +def test_metadata_includes_text_fidelity_when_records_exist(tmp_path: Path) -> None: + document = DocumentRecord( + source_pdf=tmp_path / "paper.pdf", + pages=(PageRecord(page_index=0, blocks=(BlockRecord(BlockType.PARAGRAPH),)),), + text_fidelity=( + TextFidelityRecord( + page_index=0, + source_page_number=1, + pypdf_text_available=True, + markdown_text_available=True, + pypdf_hangul_count=10, + markdown_hangul_count=8, + hangul_count_delta=-2, + hangul_count_ratio=0.8, + unexpected_cjk_count=1, + pypdf_hangul_spacing_anomaly_ratio=0.0, + markdown_hangul_spacing_anomaly_ratio=0.0, + text_similarity=0.72, + replacement_candidate=True, + comparison_status="checked", + ), + ), + warnings=( + WarningRecord(WarningCode.TEXT_FIDELITY_LOW, WarningSeverity.WARNING, "Low text fidelity.", page_index=0), + WarningRecord( + WarningCode.UNEXPECTED_CJK_IN_KOREAN_TEXT, + WarningSeverity.WARNING, + "Unexpected CJK.", + page_index=0, + ), + ), + ) + + metadata = build_metadata( + document=document, + source_sha256="0" * 64, + created_at="2026-05-11T00:00:00Z", + engine="MinerU", + engine_version="3.1.0", + ) + + assert metadata["text_fidelity"][0]["page_index"] == 0 + assert metadata["text_fidelity"][0]["replacement_candidate"] is True + assert metadata["summary"]["text_fidelity_checked_page_count"] == 1 + assert 
metadata["summary"]["text_fidelity_low_page_count"] == 1 + assert metadata["summary"]["text_fidelity_unexpected_cjk_count"] == 1 + assert metadata["summary"]["text_fidelity_replacement_candidate_page_count"] == 1 + assert metadata["summary"]["text_fidelity_page_mapping_uncertain_count"] == 0 diff --git a/tests/test_mineru_adapter.py b/tests/test_mineru_adapter.py index f3ea726..4d24b73 100644 --- a/tests/test_mineru_adapter.py +++ b/tests/test_mineru_adapter.py @@ -1,17 +1,21 @@ from __future__ import annotations import os +import sys from pathlib import Path import pytest from pdf2md.ir import WarningCode +from pdf2md.gpu import GpuInfo from pdf2md.mineru_adapter import ( CommandResult, MinerUAdapter, MinerUOptions, StrictLocalViolationError, + _run_command, ) +from pdf2md.mineru_profile import resolve_mineru_profile class FakeRunner: @@ -36,10 +40,16 @@ class EnvironmentRunner: def __init__(self) -> None: self.mineru_device_mode: str | None = None self.cuda_visible_devices: str | None = None + self.processing_window_size: str | None = None + self.max_concurrent_requests: str | None = None + self.pdf_render_threads: str | None = None def __call__(self, command: tuple[str, ...]) -> CommandResult: self.mineru_device_mode = os.environ.get("MINERU_DEVICE_MODE") self.cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES") + self.processing_window_size = os.environ.get("MINERU_PROCESSING_WINDOW_SIZE") + self.max_concurrent_requests = os.environ.get("MINERU_API_MAX_CONCURRENT_REQUESTS") + self.pdf_render_threads = os.environ.get("MINERU_PDF_RENDER_THREADS") work_dir = Path(command[command.index("-o") + 1]) work_dir.mkdir(parents=True, exist_ok=True) (work_dir / "paper.md").write_text("# Title\n", encoding="utf-8") @@ -133,6 +143,20 @@ def test_version_empty_output_is_explicit() -> None: assert [warning.code for warning in result.warnings] == [WarningCode.MINERU_CLI_FAILED] +def test_default_runner_decodes_utf8_process_output() -> None: + code = ( + "import sys; " + 
"sys.stdout.buffer.write('stdout ∙\\n'.encode('utf-8')); " + "sys.stderr.buffer.write('stderr ∙\\n'.encode('utf-8'))" + ) + + result = _run_command((sys.executable, "-c", code)) + + assert result.exit_code == 0 + assert result.stdout == "stdout ∙\n" + assert result.stderr == "stderr ∙\n" + + def test_build_command_is_list_based_and_deterministic(tmp_path: Path) -> None: adapter = MinerUAdapter(which=available, runner=FakeRunner()) input_pdf = tmp_path / "논문 with spaces.pdf" @@ -200,7 +224,15 @@ def test_successful_mocked_output_parses_markdown_json_and_assets(tmp_path: Path ] assert result.engine == "MinerU" assert result.engine_version == "3.1.0" - assert result.engine_options == {"strict_local": True, "gpu_device": "cuda:0"} + assert result.engine_options == { + "strict_local": True, + "gpu_device": "cuda:0", + "mineru_profile": { + "requested": "auto", + "applied": "auto", + "environment": {}, + }, + } assert result.exit_code == 0 assert result.stdout == "ok" assert result.stderr == "warn" @@ -209,6 +241,7 @@ def test_successful_mocked_output_parses_markdown_json_and_assets(tmp_path: Path def test_gpu_option_sets_mineru_environment_and_restores_previous_values(tmp_path: Path, monkeypatch) -> None: monkeypatch.setenv("MINERU_DEVICE_MODE", "cpu") monkeypatch.setenv("CUDA_VISIBLE_DEVICES", "7") + monkeypatch.setenv("MINERU_PROCESSING_WINDOW_SIZE", "99") runner = EnvironmentRunner() adapter = MinerUAdapter(which=available, runner=runner) @@ -219,6 +252,53 @@ def test_gpu_option_sets_mineru_environment_and_restores_previous_values(tmp_pat assert runner.cuda_visible_devices == "0" assert os.environ["MINERU_DEVICE_MODE"] == "cpu" assert os.environ["CUDA_VISIBLE_DEVICES"] == "7" + assert os.environ["MINERU_PROCESSING_WINDOW_SIZE"] == "99" + + +def test_profile_option_sets_allowlisted_mineru_environment_and_engine_options(tmp_path: Path) -> None: + gpu = GpuInfo(index=1, name="NVIDIA RTX 4090", memory_total_mib=24564, driver_version="577.00") + profile = 
resolve_mineru_profile("performance", selected_gpu=gpu, cuda_requested=True) + runner = EnvironmentRunner() + adapter = MinerUAdapter(which=available, runner=runner) + + result = adapter.convert( + tmp_path / "paper.pdf", + tmp_path / "work", + MinerUOptions( + gpu_device="cuda:1", + mineru_profile="performance", + profile_environment=profile.environment, + profile_engine_options=profile.to_engine_options(), + ), + ) + + assert result.succeeded is True + assert runner.mineru_device_mode == "cuda" + assert runner.cuda_visible_devices == "1" + assert runner.processing_window_size == "16" + assert runner.max_concurrent_requests == "1" + assert runner.pdf_render_threads == "4" + assert result.engine_options["mineru_profile"]["applied"] == "performance" + + +def test_profile_warnings_are_preserved_in_adapter_result(tmp_path: Path) -> None: + gpu = GpuInfo(index=0, name="NVIDIA GeForce GTX 1070 Ti", memory_total_mib=8192, driver_version="577.00") + profile = resolve_mineru_profile("performance", selected_gpu=gpu, cuda_requested=True) + adapter = MinerUAdapter(which=available, runner=EnvironmentRunner()) + + result = adapter.convert( + tmp_path / "paper.pdf", + tmp_path / "work", + MinerUOptions( + gpu_device="cuda:0", + mineru_profile="performance", + profile_environment=profile.environment, + profile_engine_options=profile.to_engine_options(), + profile_warnings=profile.warnings, + ), + ) + + assert [warning.code for warning in result.warnings] == [WarningCode.MINERU_PROFILE_ADJUSTED] def test_nonzero_exit_does_not_parse_existing_outputs_or_fallback(tmp_path: Path) -> None: diff --git a/tests/test_mineru_profile.py b/tests/test_mineru_profile.py new file mode 100644 index 0000000..8493498 --- /dev/null +++ b/tests/test_mineru_profile.py @@ -0,0 +1,106 @@ +from __future__ import annotations + +from pdf2md.gpu import GpuInfo +from pdf2md.ir import WarningCode, WarningSeverity +from pdf2md.mineru_profile import resolve_mineru_profile + + +SAFE_ENV = { + 
"MINERU_PROCESSING_WINDOW_SIZE": "1", + "MINERU_API_MAX_CONCURRENT_REQUESTS": "1", + "MINERU_PDF_RENDER_THREADS": "1", +} + + +def test_auto_profile_uses_safe_values_without_gpu_inventory() -> None: + profile = resolve_mineru_profile("auto", selected_gpu=None, cuda_requested=True) + + assert profile.applied_profile == "safe" + assert profile.environment == SAFE_ENV + assert [warning.code for warning in profile.warnings] == [WarningCode.GPU_UNAVAILABLE] + + +def test_auto_profile_uses_safe_values_for_gtx_1070_ti() -> None: + gpu = GpuInfo(index=0, name="NVIDIA GeForce GTX 1070 Ti", memory_total_mib=8192, driver_version="577.00") + + profile = resolve_mineru_profile("auto", selected_gpu=gpu, cuda_requested=True) + + assert profile.requested_profile == "auto" + assert profile.applied_profile == "safe" + assert profile.environment == SAFE_ENV + assert profile.selected_gpu_name == "NVIDIA GeForce GTX 1070 Ti" + + +def test_auto_profile_uses_moderate_values_for_16gb_turing_or_newer_gpu() -> None: + gpu = GpuInfo(index=0, name="NVIDIA RTX A4000", memory_total_mib=16384, driver_version="577.00") + + profile = resolve_mineru_profile("auto", selected_gpu=gpu, cuda_requested=True) + + assert profile.applied_profile == "auto" + assert profile.environment == { + "MINERU_PROCESSING_WINDOW_SIZE": "8", + "MINERU_API_MAX_CONCURRENT_REQUESTS": "1", + "MINERU_PDF_RENDER_THREADS": "4", + } + assert profile.warnings == () + + +def test_auto_profile_uses_conservative_values_for_12gb_to_16gb_gpu() -> None: + gpu = GpuInfo(index=0, name="NVIDIA RTX 4070", memory_total_mib=12288, driver_version="577.00") + + profile = resolve_mineru_profile("auto", selected_gpu=gpu, cuda_requested=True) + + assert profile.applied_profile == "auto-conservative" + assert profile.environment == { + "MINERU_PROCESSING_WINDOW_SIZE": "4", + "MINERU_API_MAX_CONCURRENT_REQUESTS": "1", + "MINERU_PDF_RENDER_THREADS": "2", + } + + +def test_performance_profile_uses_performance_values_only_on_strong_gpu() -> None: + 
gpu = GpuInfo(index=1, name="NVIDIA RTX 4090", memory_total_mib=24564, driver_version="577.00") + + profile = resolve_mineru_profile("performance", selected_gpu=gpu, cuda_requested=True) + + assert profile.applied_profile == "performance" + assert profile.environment == { + "MINERU_PROCESSING_WINDOW_SIZE": "16", + "MINERU_API_MAX_CONCURRENT_REQUESTS": "1", + "MINERU_PDF_RENDER_THREADS": "4", + } + assert profile.selected_gpu_index == 1 + assert profile.selected_gpu_vram_mib == 24564 + + +def test_performance_profile_downgrades_to_safe_on_weak_gpu() -> None: + gpu = GpuInfo(index=0, name="NVIDIA GeForce GTX 1070 Ti", memory_total_mib=8192, driver_version="577.00") + + profile = resolve_mineru_profile("performance", selected_gpu=gpu, cuda_requested=True) + + assert profile.applied_profile == "safe" + assert profile.environment == SAFE_ENV + assert [warning.code for warning in profile.warnings] == [WarningCode.MINERU_PROFILE_ADJUSTED] + assert profile.warnings[0].severity == WarningSeverity.WARNING + + +def test_profile_details_are_json_ready() -> None: + gpu = GpuInfo(index=0, name="NVIDIA RTX A5000", memory_total_mib=24564, driver_version="577.00") + + profile = resolve_mineru_profile("auto", selected_gpu=gpu, cuda_requested=True) + + assert profile.to_engine_options() == { + "requested": "auto", + "applied": "auto", + "environment": { + "MINERU_API_MAX_CONCURRENT_REQUESTS": "1", + "MINERU_PDF_RENDER_THREADS": "4", + "MINERU_PROCESSING_WINDOW_SIZE": "8", + }, + "selected_gpu": { + "index": 0, + "name": "NVIDIA RTX A5000", + "memory_total_mib": 24564, + "pre_turing_risk": False, + }, + } diff --git a/tests/test_paths.py b/tests/test_paths.py index 7ec6c71..7429dd5 100644 --- a/tests/test_paths.py +++ b/tests/test_paths.py @@ -73,14 +73,14 @@ def test_discovers_directory_recursive_with_relative_parents(tmp_path: Path) -> def test_discovery_order_is_deterministic_for_non_ascii_names(tmp_path: Path) -> None: - touch(tmp_path / "한글.pdf") + korean_pdf = touch(tmp_path / 
"논문.pdf") touch(tmp_path / "Alpha.pdf") touch(tmp_path / "beta.PDF") first = discover_pdfs(tmp_path) second = discover_pdfs(tmp_path) - assert [item.source_path.name for item in first] == ["Alpha.pdf", "beta.PDF", "한글.pdf"] + assert [item.source_path.name for item in first] == ["Alpha.pdf", "beta.PDF", korean_pdf.name] assert first == second @@ -91,22 +91,24 @@ def test_plans_all_default_output_paths_for_single_pdf(tmp_path: Path) -> None: [plan] = plan_pdf_outputs(pdf, output_root) assert plan.source_pdf == pdf.resolve() - assert plan.markdown_path == output_root.resolve() / "입력.md" - assert plan.assets_dir == output_root.resolve() / "입력.assets" - assert plan.metadata_path == output_root.resolve() / "입력.metadata.json" - assert plan.report_path == output_root.resolve() / "입력.report.md" + assert plan.markdown_path == output_root.resolve() / "입력" / "입력_001.md" + assert plan.assets_dir == output_root.resolve() / "입력" / "images" + assert plan.metadata_path is None + assert plan.report_path == output_root.resolve() / "입력" / "입력_report.md" assert plan.raw_dir is None -def test_plans_optional_metadata_and_raw_outputs(tmp_path: Path) -> None: +def test_plans_metadata_flag_as_noop_and_raw_outputs(tmp_path: Path) -> None: pdf = touch(tmp_path / "paper.pdf") + [with_metadata_flag] = plan_pdf_outputs(pdf, tmp_path / "out", metadata=True) [without_metadata] = plan_pdf_outputs(pdf, tmp_path / "out", metadata=False) [with_raw] = plan_pdf_outputs(pdf, tmp_path / "out", keep_raw=True) + assert with_metadata_flag.metadata_path is None assert without_metadata.metadata_path is None - assert without_metadata.report_path == (tmp_path / "out").resolve() / "paper.report.md" - assert with_raw.raw_dir == (tmp_path / "out").resolve() / "paper.raw" + assert without_metadata.report_path == (tmp_path / "out").resolve() / "paper" / "paper_report.md" + assert with_raw.raw_dir == (tmp_path / "out").resolve() / "paper" / "raw" def test_recursive_planning_preserves_relative_subdirectories(tmp_path: 
Path) -> None: @@ -117,8 +119,8 @@ def test_recursive_planning_preserves_relative_subdirectories(tmp_path: Path) -> plans = plan_pdf_outputs(root, tmp_path / "out", recursive=True) assert [plan.markdown_path.relative_to((tmp_path / "out").resolve()) for plan in plans] == [ - Path("nested") / "same.md", - Path("same.md"), + Path("nested") / "same" / "same_001.md", + Path("same") / "same_001.md", ] @@ -137,21 +139,21 @@ def test_non_recursive_duplicate_output_paths_fail(tmp_path: Path) -> None: def test_output_conflicts_report_all_existing_paths(tmp_path: Path) -> None: pdf = touch(tmp_path / "paper.pdf") output_root = tmp_path / "out" - (output_root / "paper.assets").mkdir(parents=True) - (output_root / "paper.md").mkdir() - touch(output_root / "paper.metadata.json") + (output_root / "paper" / "images").mkdir(parents=True) + (output_root / "paper" / "paper_001.md").mkdir() + touch(output_root / "paper" / "paper_report.md") with pytest.raises(OutputConflictError) as error: plan_pdf_outputs(pdf, output_root) conflict_names = {path.name for path in error.value.conflicts} - assert conflict_names == {"paper.assets", "paper.md", "paper.metadata.json"} + assert conflict_names == {"images", "paper_001.md", "paper_report.md"} def test_overwrite_allows_existing_paths_without_deleting(tmp_path: Path) -> None: pdf = touch(tmp_path / "paper.pdf") output_root = tmp_path / "out" - existing = touch(output_root / "paper.md") + existing = touch(output_root / "paper" / "paper_001.md") [plan] = plan_pdf_outputs(pdf, output_root, overwrite=True) diff --git a/tests/test_report.py b/tests/test_report.py index 7f9491f..2e026ee 100644 --- a/tests/test_report.py +++ b/tests/test_report.py @@ -8,6 +8,7 @@ from pdf2md.ir import ( BlockType, DocumentRecord, PageRecord, + TextFidelityRecord, WarningCode, WarningRecord, WarningSeverity, @@ -161,3 +162,115 @@ def test_report_includes_chunk_context_when_metadata_has_chunk_options(tmp_path: report = render_report(metadata) assert "- Chunk: 2/3, 
source pages: 21-40" in report + + +def test_report_includes_single_page_conversion_context(tmp_path: Path) -> None: + metadata = make_metadata(tmp_path) + metadata["engine_options"] = { + "strict_local": True, + "page_conversion": { + "mode": "single_page", + "mineru_input_page_count": 1, + "output_group_page_count": 20, + "failed_source_pages": [], + }, + } + + report = render_report(metadata) + + assert "- Page conversion mode: single-page MinerU inputs, grouped output size: 20" in report + + +def test_report_includes_aggregate_output_folder_and_markdown_parts(tmp_path: Path) -> None: + metadata = make_metadata(tmp_path) + metadata["engine_options"] = { + "strict_local": True, + "output_folder": str(tmp_path / "out" / "paper"), + "parts": [ + { + "index": 1, + "total": 2, + "source_page_start": 1, + "source_page_end": 20, + "markdown_path": str(tmp_path / "out" / "paper" / "paper_001.md"), + "status": "success", + "warning_count": 0, + }, + { + "index": 2, + "total": 2, + "source_page_start": 21, + "source_page_end": 23, + "markdown_path": None, + "status": "failed", + "warning_count": 2, + "failed_source_pages": [22, 23], + }, + ], + } + + report = render_report(metadata) + + assert f"- Output folder: {tmp_path / 'out' / 'paper'}" in report + assert "paper_001.md (source pages 1-20, status success)" in report + assert "- Markdown part 2/2: unavailable (source pages 21-23, status failed)" in report + assert "- Failed source pages for part 2: 22, 23" in report + + +def test_report_includes_text_fidelity_section_when_metadata_has_diagnostics(tmp_path: Path) -> None: + document = DocumentRecord( + source_pdf=tmp_path / "paper.pdf", + pages=(PageRecord(page_index=0, blocks=(BlockRecord(BlockType.PARAGRAPH),)),), + text_fidelity=( + TextFidelityRecord( + page_index=0, + source_page_number=3, + pypdf_text_available=True, + markdown_text_available=True, + pypdf_hangul_count=10, + markdown_hangul_count=7, + hangul_count_delta=-3, + hangul_count_ratio=0.7, + 
unexpected_cjk_count=2, + pypdf_hangul_spacing_anomaly_ratio=0.0, + markdown_hangul_spacing_anomaly_ratio=0.0, + text_similarity=0.61, + replacement_candidate=True, + comparison_status="checked", + ), + TextFidelityRecord( + page_index=1, + source_page_number=4, + pypdf_text_available=True, + markdown_text_available=False, + pypdf_hangul_count=5, + markdown_hangul_count=0, + hangul_count_delta=-5, + hangul_count_ratio=0.0, + unexpected_cjk_count=0, + pypdf_hangul_spacing_anomaly_ratio=0.0, + markdown_hangul_spacing_anomaly_ratio=0.0, + text_similarity=0.0, + replacement_candidate=False, + comparison_status="page_mapping_uncertain", + ), + ), + ) + metadata = build_metadata( + document=document, + source_sha256="0" * 64, + created_at="2026-05-11T00:00:00Z", + engine="MinerU", + engine_version="3.1.0", + ) + + report = render_report(metadata) + + assert "## Text Fidelity" in report + assert "- Checked page count: 1" in report + assert "- Low-fidelity page count: 1" in report + assert "- Unexpected CJK count: 2" in report + assert "- Replacement candidate page count: 1" in report + assert "- Low-similarity pages: 0" in report + assert "- Unexpected-CJK pages: 0" in report + assert "- Uncertain page-mapping pages: 1" in report diff --git a/tests/test_text_fidelity.py b/tests/test_text_fidelity.py new file mode 100644 index 0000000..0e2e4ad --- /dev/null +++ b/tests/test_text_fidelity.py @@ -0,0 +1,107 @@ +from __future__ import annotations + +from pathlib import Path + +from pypdf import PdfWriter + +from pdf2md.ir import WarningCode +from pdf2md.text_fidelity import ( + check_text_fidelity, + compare_text_pages, + count_hangul_syllables, + count_unexpected_cjk, + hangul_spacing_anomaly_ratio, + strip_markdown_for_text_fidelity, +) + + +def test_text_metric_helpers_count_hangul_cjk_and_spacing() -> None: + assert count_hangul_syllables("응 력 A 曲") == 2 + assert count_unexpected_cjk("응 력 A 曲") == 1 + assert hangul_spacing_anomaly_ratio("응 력") == 1.0 + assert 
hangul_spacing_anomaly_ratio("응력") == 0.0 + + +def test_markdown_stripping_ignores_math_assets_and_code() -> None: + markdown = "\n".join( + [ + "# 제목", + "![figure](paper.assets/fig.png)", + "본문 $x_i$ 유지", + "```", + "코드 한글", + "```", + "`인라인 코드` 마지막", + ] + ) + + stripped = strip_markdown_for_text_fidelity(markdown) + + assert "제목" in stripped + assert "본문" in stripped + assert "마지막" in stripped + assert "figure" not in stripped + assert "x_i" not in stripped + assert "코드 한글" not in stripped + assert "인라인 코드" not in stripped + + +def test_compare_text_pages_flags_low_hangul_fidelity_and_replacement_candidate() -> None: + result = compare_text_pages( + source_pages=("쉘의 응력과 곡률을 계산한다",), + markdown_pages=("쉘의 력과 曲률을 계산한다",), + source_page_start=6, + ) + + page = result.pages[0] + assert page.comparison_status == "checked" + assert page.source_page_number == 6 + assert page.pypdf_hangul_count > page.markdown_hangul_count + assert page.unexpected_cjk_count == 1 + assert page.replacement_candidate is True + assert [warning.code for warning in result.warnings] == [ + WarningCode.TEXT_LAYER_AVAILABLE, + WarningCode.TEXT_FIDELITY_LOW, + WarningCode.UNEXPECTED_CJK_IN_KOREAN_TEXT, + ] + + +def test_compare_text_pages_allows_markdown_hangul_count_above_source() -> None: + result = compare_text_pages( + source_pages=("응력",), + markdown_pages=("응력 변형률",), + ) + + assert result.pages[0].hangul_count_ratio == 2.5 + assert result.pages[0].replacement_candidate is False + + +def test_check_text_fidelity_marks_uncertain_page_mapping_for_multi_page_markdown() -> None: + result = check_text_fidelity( + Path("paper.pdf"), + "첫 페이지와 둘째 페이지를 합친 Markdown", + page_count=2, + source_text_pages=("첫 페이지", "둘째 페이지"), + ) + + assert [page.comparison_status for page in result.pages] == [ + "page_mapping_uncertain", + "page_mapping_uncertain", + ] + assert [warning.code for warning in result.warnings] == [ + WarningCode.TEXT_LAYER_AVAILABLE, + WarningCode.TEXT_PAGE_MAPPING_UNCERTAIN, + ] + + 
+def test_blank_generated_pdf_extraction_is_nonfatal(tmp_path: Path) -> None: + pdf = tmp_path / "blank.pdf" + writer = PdfWriter() + writer.add_blank_page(width=72, height=72) + with pdf.open("wb") as file: + writer.write(file) + + result = check_text_fidelity(pdf, "Markdown text", page_count=1) + + assert result.pages[0].comparison_status == "source_text_missing" + assert result.warnings == () diff --git a/tests/test_ui_runner.py b/tests/test_ui_runner.py new file mode 100644 index 0000000..02fd592 --- /dev/null +++ b/tests/test_ui_runner.py @@ -0,0 +1,235 @@ +from __future__ import annotations + +import subprocess +from pathlib import Path + +import pytest + +from pdf2md_ui.runner import ( + CommandSpec, + ResolvedCommand, + RunningCommand, + build_child_environment, + build_convert_command, + build_doctor_command, + build_recheck_command, + default_output_dir, + resolve_cli_command, + terminate_process_tree, +) +from pdf2md_ui.runner import CliResolutionError + + +def test_resolves_pdf2md_from_path_before_uv(tmp_path: Path) -> None: + (tmp_path / "pyproject.toml").write_text("[project]\nname='x'\n", encoding="utf-8") + + resolved = resolve_cli_command( + project_root=tmp_path, + which=lambda name: {"pdf2md": "pdf2md.exe", "uv": "uv.exe"}.get(name), + ) + + assert resolved == ResolvedCommand(("pdf2md.exe",), cwd=None, source="path") + + +def test_resolves_uv_run_with_project_root_when_pdf2md_missing(tmp_path: Path) -> None: + (tmp_path / "pyproject.toml").write_text("[project]\nname='x'\n", encoding="utf-8") + + resolved = resolve_cli_command( + project_root=tmp_path, + which=lambda name: {"uv": "uv.exe"}.get(name), + ) + + assert resolved == ResolvedCommand(("uv.exe", "run", "pdf2md"), cwd=tmp_path.resolve(), source="uv") + + +def test_resolution_requires_project_root_for_uv() -> None: + with pytest.raises(CliResolutionError): + resolve_cli_command(which=lambda name: "uv.exe" if name == "uv" else None) + + +def test_configured_command_must_be_pdf2md() -> None: 
+ with pytest.raises(CliResolutionError, match="pdf2md"): + resolve_cli_command(configured_command="mineru.exe") + + +def test_builds_doctor_command() -> None: + resolved = ResolvedCommand(("uv", "run", "pdf2md"), cwd=Path("repo"), source="uv") + + command = build_doctor_command(resolved) + + assert command == CommandSpec(("uv", "run", "pdf2md", "doctor"), cwd=Path("repo")) + + +def test_builds_convert_command_with_fixed_argument_list(tmp_path: Path) -> None: + resolved = ResolvedCommand(("pdf2md",), cwd=None, source="path") + input_pdf = tmp_path / "?쇰Ц.pdf" + output_dir = tmp_path / "outputs" / "?쇰Ц" + + command = build_convert_command( + resolved, + input_pdf, + output_dir, + overwrite=True, + keep_raw=True, + chunk_pages=20, + gpu="cuda:0", + ) + + assert command.args == ( + "pdf2md", + "convert", + str(input_pdf), + "--out", + str(output_dir), + "--overwrite", + "--keep-raw", + "--chunk-pages", + "20", + "--gpu", + "cuda:0", + "--mineru-profile", + "auto", + ) + + +def test_builds_recheck_command(tmp_path: Path) -> None: + resolved = ResolvedCommand(("pdf2md",), cwd=None, source="path") + markdown = tmp_path / "paper.md" + + command = build_recheck_command(resolved, markdown) + + assert command.args == ("pdf2md", "recheck", str(markdown)) + + +def test_generated_commands_do_not_include_remote_or_api_options(tmp_path: Path) -> None: + resolved = ResolvedCommand(("pdf2md",), cwd=None, source="path") + command = build_convert_command(resolved, tmp_path / "paper.pdf", tmp_path / "out") + joined = " ".join(command.args).casefold() + + for token in ("--api-url", "http://", "https://", "router", "openai", "mineru-api"): + assert token not in joined + + +def test_default_output_dir_uses_shared_output_root(tmp_path: Path) -> None: + pdf = tmp_path / "?섍뎄議곕Ъ.pdf" + + assert default_output_dir(pdf, base_dir=tmp_path) == tmp_path / "outputs" + + +def test_convert_rejects_non_positive_chunk_pages(tmp_path: Path) -> None: + resolved = ResolvedCommand(("pdf2md",), cwd=None, 
source="path") + + with pytest.raises(ValueError, match="positive"): + build_convert_command(resolved, tmp_path / "paper.pdf", tmp_path / "out", chunk_pages=0) + + +def test_convert_rejects_prohibited_gpu_tokens(tmp_path: Path) -> None: + resolved = ResolvedCommand(("pdf2md",), cwd=None, source="path") + + with pytest.raises(ValueError, match="strict-local"): + build_convert_command(resolved, tmp_path / "paper.pdf", tmp_path / "out", gpu="https://example.test") + + +def test_convert_rejects_unknown_mineru_profile(tmp_path: Path) -> None: + resolved = ResolvedCommand(("pdf2md",), cwd=None, source="path") + + with pytest.raises(ValueError, match="mineru_profile"): + build_convert_command(resolved, tmp_path / "paper.pdf", tmp_path / "out", mineru_profile="fast") + + +def test_child_environment_defaults_mineru_model_source() -> None: + environment = build_child_environment({"PATH": "x"}) + + assert environment["MINERU_MODEL_SOURCE"] == "local" + + +def test_child_environment_preserves_existing_mineru_model_source() -> None: + environment = build_child_environment({"MINERU_MODEL_SOURCE": "custom"}) + + assert environment["MINERU_MODEL_SOURCE"] == "custom" + + +def test_running_command_uses_shell_false_and_streams_output() -> None: + captured: dict[str, object] = {} + events = [] + + class FakeProcess: + pid = 123 + stdout = iter(["hello\n", "done\n"]) + + def wait(self, timeout=None): + return 0 + + def poll(self): + return 0 + + def fake_popen(*args, **kwargs): + captured["args"] = args + captured["kwargs"] = kwargs + return FakeProcess() + + runner = RunningCommand(CommandSpec(("pdf2md", "doctor")), events.append, popen_factory=fake_popen, base_env={}) + + assert runner.run() == 0 + assert captured["args"] == (("pdf2md", "doctor"),) + assert captured["kwargs"]["shell"] is False + assert captured["kwargs"]["stderr"] is subprocess.STDOUT + assert captured["kwargs"]["env"]["MINERU_MODEL_SOURCE"] == "local" + assert [(event.kind, event.message, event.exit_code) for event 
in events] == [ + ("start", "pdf2md doctor", None), + ("output", "hello", None), + ("output", "done", None), + ("exit", "Command exited with code 0.", 0), + ] + + +def test_cancel_uses_taskkill_after_windows_grace_timeout() -> None: + taskkill_calls = [] + + class SlowProcess: + pid = 456 + + def __init__(self) -> None: + self.wait_count = 0 + self.terminated = False + + def poll(self): + return None + + def terminate(self) -> None: + self.terminated = True + + def wait(self, timeout=None): + self.wait_count += 1 + if self.wait_count == 1: + raise subprocess.TimeoutExpired("pdf2md", timeout) + return 1 + + def fake_taskkill(*args, **kwargs): + taskkill_calls.append((args, kwargs)) + return subprocess.CompletedProcess(args[0], 0) + + process = SlowProcess() + + assert terminate_process_tree(process, grace_seconds=0, taskkill_runner=fake_taskkill, os_name="nt") + assert process.terminated + assert taskkill_calls[0][0][0] == ["taskkill", "/pid", "456", "/t", "/f"] + + +def test_cancel_does_not_taskkill_when_process_exits_promptly() -> None: + taskkill_calls = [] + + class FastProcess: + pid = 789 + + def poll(self): + return None + + def terminate(self) -> None: + pass + + def wait(self, timeout=None): + return 0 + + assert terminate_process_tree(FastProcess(), taskkill_runner=lambda *args, **kwargs: taskkill_calls.append(args)) + assert taskkill_calls == [] diff --git a/uv.lock b/uv.lock index 68ad474..fc8e7e6 100644 --- a/uv.lock +++ b/uv.lock @@ -2,6 +2,15 @@ version = 1 revision = 3 requires-python = "==3.12.*" +[[package]] +name = "altgraph" +version = "0.17.5" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/7e/f8/97fdf103f38fed6792a1601dbc16cc8aac56e7459a9fff08c812d8ae177a/altgraph-0.17.5.tar.gz", hash = "sha256:c87b395dd12fabde9c99573a9749d67da8d29ef9de0125c7f536699b4a9bc9e7", size = 48428, upload-time = "2025-11-21T20:35:50.583Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/a9/ba/000a1996d4308bc65120167c21241a3b205464a2e0b58deda26ae8ac21d1/altgraph-0.17.5-py2.py3-none-any.whl", hash = "sha256:f3a22400bce1b0c701683820ac4f3b159cd301acab067c51c653e06961600597", size = 21228, upload-time = "2025-11-21T20:35:49.444Z" }, +] + [[package]] name = "colorama" version = "0.4.6" @@ -23,12 +32,16 @@ dependencies = [ dev = [ { name = "pytest" }, ] +ui-build = [ + { name = "pyinstaller" }, +] [package.metadata] requires-dist = [{ name = "pypdf", specifier = ">=6.10.2,<7" }] [package.metadata.requires-dev] dev = [{ name = "pytest", specifier = ">=8.3" }] +ui-build = [{ name = "pyinstaller", specifier = ">=6.20,<7" }] [[package]] name = "iniconfig" @@ -39,6 +52,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12", size = 7484, upload-time = "2025-10-18T21:55:41.639Z" }, ] +[[package]] +name = "macholib" +version = "1.16.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "altgraph" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/10/2f/97589876ea967487978071c9042518d28b958d87b17dceb7cdc1d881f963/macholib-1.16.4.tar.gz", hash = "sha256:f408c93ab2e995cd2c46e34fe328b130404be143469e41bc366c807448979362", size = 59427, upload-time = "2025-11-22T08:28:38.373Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c7/d1/a9f36f8ecdf0fb7c9b1e78c8d7af12b8c8754e74851ac7b94a8305540fc7/macholib-1.16.4-py2.py3-none-any.whl", hash = "sha256:da1a3fa8266e30f0ce7e97c6a54eefaae8edd1e5f86f3eb8b95457cae90265ea", size = 38117, upload-time = "2025-11-22T08:28:36.939Z" }, +] + [[package]] name = "packaging" version = "26.2" @@ -48,6 +73,15 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/df/b2/87e62e8c3e2f4b32e5fe99e0b86d576da1312593b39f47d8ceef365e95ed/packaging-26.2-py3-none-any.whl", hash = "sha256:5fc45236b9446107ff2415ce77c807cee2862cb6fac22b8a73826d0693b0980e", size = 100195, upload-time = "2026-04-24T20:15:22.081Z" }, ] +[[package]] +name = "pefile" +version = "2024.8.26" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/03/4f/2750f7f6f025a1507cd3b7218691671eecfd0bbebebe8b39aa0fe1d360b8/pefile-2024.8.26.tar.gz", hash = "sha256:3ff6c5d8b43e8c37bb6e6dd5085658d658a7a0bdcd20b6a07b1fcfc1c4e9d632", size = 76008, upload-time = "2024-08-26T20:58:38.155Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/54/16/12b82f791c7f50ddec566873d5bdd245baa1491bac11d15ffb98aecc8f8b/pefile-2024.8.26-py3-none-any.whl", hash = "sha256:76f8b485dcd3b1bb8166f1128d395fa3d87af26360c2358fb75b80019b957c6f", size = 74766, upload-time = "2024-08-26T21:01:02.632Z" }, +] + [[package]] name = "pluggy" version = "1.6.0" @@ -66,6 +100,47 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f4/7e/a72dd26f3b0f4f2bf1dd8923c85f7ceb43172af56d63c7383eb62b332364/pygments-2.20.0-py3-none-any.whl", hash = "sha256:81a9e26dd42fd28a23a2d169d86d7ac03b46e2f8b59ed4698fb4785f946d0176", size = 1231151, upload-time = "2026-03-29T13:29:30.038Z" }, ] +[[package]] +name = "pyinstaller" +version = "6.20.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "altgraph" }, + { name = "macholib", marker = "sys_platform == 'darwin'" }, + { name = "packaging" }, + { name = "pefile", marker = "sys_platform == 'win32'" }, + { name = "pyinstaller-hooks-contrib" }, + { name = "pywin32-ctypes", marker = "sys_platform == 'win32'" }, + { name = "setuptools" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/46/60/d03d52e6690d4e9caf333dcd14550cde634ce6c118b3bc8fa3112c3186fd/pyinstaller-6.20.0.tar.gz", hash = 
"sha256:95c5c7e03d5d61e9dfb8ef259c699cf492bb1041beb6dbe83696608cec07347a", size = 4048728, upload-time = "2026-04-22T20:59:36.96Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d0/e4/e228d6d1bbb7fd62dc660a8fb202a583b023d3a3624ca95d1a9290ee4d6a/pyinstaller-6.20.0-py3-none-macosx_10_13_universal2.whl", hash = "sha256:bf3be4e1284ee78ddccba5e29f99443a12a7b4673168288ffc4c9d38c6f7b90e", size = 1047642, upload-time = "2026-04-22T20:58:32.006Z" }, + { url = "https://files.pythonhosted.org/packages/ce/bd/afb631bcb3f9040efebd4f6d067f0828b51710818f69fb41a2d4b7787f52/pyinstaller-6.20.0-py3-none-manylinux2014_aarch64.whl", hash = "sha256:72ae9c1fdea134afa791f58bdc9a1934d5c7609753c111e0026bfc272b32b712", size = 742494, upload-time = "2026-04-22T20:58:36.285Z" }, + { url = "https://files.pythonhosted.org/packages/76/08/0729a5bac14754150e5d83b39d87d842eb42b0bffcaa03dbad6252e23a39/pyinstaller-6.20.0-py3-none-manylinux2014_i686.whl", hash = "sha256:1031bcc307f3fbeffd4e162723e64d46dbf591c82dd0997413afb2a07328b941", size = 754191, upload-time = "2026-04-22T20:58:40.603Z" }, + { url = "https://files.pythonhosted.org/packages/e6/82/bc0ee4c7b97db1958eb651e0da9fb1e672e5ae53ca8867fd97701de52906/pyinstaller-6.20.0-py3-none-manylinux2014_ppc64le.whl", hash = "sha256:8df3b3f347659fa2562d8d193a98ad4600133b8b8d07c268df89e4154376750e", size = 751902, upload-time = "2026-04-22T20:58:44.7Z" }, + { url = "https://files.pythonhosted.org/packages/3d/e7/770002d6aaa54173881cb2c49bb195ba67b97bf39bac1cdf320f28401629/pyinstaller-6.20.0-py3-none-manylinux2014_s390x.whl", hash = "sha256:b0d3cc9dd8120d448459bd3880a12e2f9774c51443af49047801446377999a59", size = 748634, upload-time = "2026-04-22T20:58:48.579Z" }, + { url = "https://files.pythonhosted.org/packages/fe/db/68ba1fccb71278b2124fb90b37b7c8c0bc4c1173fba45b94466df3d9cb7f/pyinstaller-6.20.0-py3-none-manylinux2014_x86_64.whl", hash = "sha256:03696bb6350177c6bc23bcaf78e71a33c4a89b6754dd90d1be2f318e978c918b", size = 748490, upload-time = 
"2026-04-22T20:58:52.749Z" }, + { url = "https://files.pythonhosted.org/packages/03/0f/ac77ffa996a56be3d5c8f85734a007f8347240691657f9704e7de2527fa3/pyinstaller-6.20.0-py3-none-musllinux_1_1_aarch64.whl", hash = "sha256:6357f1699f6af84f37e7367f031d4f68abdba65543b83990c9e8f5a4cebed0b7", size = 747650, upload-time = "2026-04-22T20:58:57.093Z" }, + { url = "https://files.pythonhosted.org/packages/e0/56/1ee91c3a2bc10ca1f36da10a6fd55ff7efc4dec367171eb25992a827874f/pyinstaller-6.20.0-py3-none-musllinux_1_1_x86_64.whl", hash = "sha256:0ab39c690abad26ba148e8f664f0478acc82a733997f4f22e757774832802da9", size = 747413, upload-time = "2026-04-22T20:59:01.174Z" }, + { url = "https://files.pythonhosted.org/packages/d7/55/ae264339996953c4cdf9d89d916a0a8fa26a83cf917a742fff8b9d5f3fe8/pyinstaller-6.20.0-py3-none-win32.whl", hash = "sha256:9a7637e8e44b4387b13667fdcaac86ab6b29c446c16d34d8401539b81838759c", size = 1331584, upload-time = "2026-04-22T20:59:07.201Z" }, + { url = "https://files.pythonhosted.org/packages/76/8c/300f57578882cce259bfb5ae56fda3b69caa3fe9df40a176c719920ea6e2/pyinstaller-6.20.0-py3-none-win_amd64.whl", hash = "sha256:d588844e890ee80c4365867f98146636e1849bbca8e4284bbf0c809aff0f161a", size = 1391851, upload-time = "2026-04-22T20:59:14.024Z" }, + { url = "https://files.pythonhosted.org/packages/8a/ea/b2f8e1642aecda78c0b75c7321f708e49e10bb3c00dd4f148c40761a1527/pyinstaller-6.20.0-py3-none-win_arm64.whl", hash = "sha256:bd53282c0a73e5c95573e1ddc8e5d564d4932bec91efbaed4dc5fdff9c2ae7f2", size = 1332259, upload-time = "2026-04-22T20:59:20.509Z" }, +] + +[[package]] +name = "pyinstaller-hooks-contrib" +version = "2026.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "packaging" }, + { name = "setuptools" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/1a/67/f4452d68793fb15beba4f19ef39a38a8822f0da7452b503c400d5a21f5c1/pyinstaller_hooks_contrib-2026.5.tar.gz", hash = 
"sha256:f066dfca8f7c45ff6336c9cf9fe25b4e48bfeb322a1aa24faaedfb8a8d1b0b08", size = 173689, upload-time = "2026-05-04T22:36:55.124Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f6/5c/fd465d11da4d12b50d7eb5d2ee2ceb780d8d049dbb489f3828d131e387af/pyinstaller_hooks_contrib-2026.5-py3-none-any.whl", hash = "sha256:ea1535783fbdac4626351709e83f3ea80b681d3a4745763ebb407b5e27342eb9", size = 457314, upload-time = "2026-05-04T22:36:53.598Z" }, +] + [[package]] name = "pypdf" version = "6.10.2" @@ -90,3 +165,21 @@ sdist = { url = "https://files.pythonhosted.org/packages/7d/0d/549bd94f1a0a402dc wheels = [ { url = "https://files.pythonhosted.org/packages/d4/24/a372aaf5c9b7208e7112038812994107bc65a84cd00e0354a88c2c77a617/pytest-9.0.3-py3-none-any.whl", hash = "sha256:2c5efc453d45394fdd706ade797c0a81091eccd1d6e4bccfcd476e2b8e0ab5d9", size = 375249, upload-time = "2026-04-07T17:16:16.13Z" }, ] + +[[package]] +name = "pywin32-ctypes" +version = "0.2.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/85/9f/01a1a99704853cb63f253eea009390c88e7131c67e66a0a02099a8c917cb/pywin32-ctypes-0.2.3.tar.gz", hash = "sha256:d162dc04946d704503b2edc4d55f3dba5c1d539ead017afa00142c38b9885755", size = 29471, upload-time = "2024-08-14T10:15:34.626Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/de/3d/8161f7711c017e01ac9f008dfddd9410dff3674334c233bde66e7ba65bbf/pywin32_ctypes-0.2.3-py3-none-any.whl", hash = "sha256:8a1513379d709975552d202d942d9837758905c8d01eb82b8bcc30918929e7b8", size = 30756, upload-time = "2024-08-14T10:15:33.187Z" }, +] + +[[package]] +name = "setuptools" +version = "82.0.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/4f/db/cfac1baf10650ab4d1c111714410d2fbb77ac5a616db26775db562c8fab2/setuptools-82.0.1.tar.gz", hash = "sha256:7d872682c5d01cfde07da7bccc7b65469d3dca203318515ada1de5eda35efbf9", size = 1152316, upload-time = 
"2026-03-09T12:47:17.221Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9d/76/f789f7a86709c6b087c5a2f52f911838cad707cc613162401badc665acfe/setuptools-82.0.1-py3-none-any.whl", hash = "sha256:a59e362652f08dcd477c78bb6e7bd9d80a7995bc73ce773050228a348ce2e5bb", size = 1006223, upload-time = "2026-03-09T12:47:15.026Z" }, +]