From 03927a26a1148a89d31e4a5d0ae30d684fda466b Mon Sep 17 00:00:00 2001 From: NINI Date: Mon, 11 May 2026 01:00:26 +0900 Subject: [PATCH] Add Markdown recheck command --- PLAN.md | 2 +- PROGRESS.md | 11 ++- README.md | 12 ++- src/pdf2md/__init__.py | 2 + src/pdf2md/cli.py | 21 ++++- src/pdf2md/conversion.py | 178 ++++++++++++++++++++++++++++++++++++++- tests/test_cli.py | 31 ++++++- tests/test_conversion.py | 29 ++++++- tests/test_package.py | 1 + 9 files changed, 276 insertions(+), 11 deletions(-) diff --git a/PLAN.md b/PLAN.md index 1e83062..5e9a64e 100644 --- a/PLAN.md +++ b/PLAN.md @@ -4,7 +4,7 @@ This file is the shared work plan for agents. Read it before starting work, then ## Current Goal -Completed work history is archived in `docs/WORKARCHIVE.md`. Sprint 10 pre-conversion PDF chunking is implemented. On this PC, full local runtime setup is complete in `.venv`; next work is optional manual Obsidian quality review or additional sample validation if requested. +Completed work history is archived in `docs/WORKARCHIVE.md`. Sprint 10 pre-conversion PDF chunking is implemented. On this PC, full local runtime setup is complete in `.venv`; Markdown quality recheck for existing outputs is implemented. Next work is optional manual Obsidian quality review, Markdown cleanup for sample warnings, or additional sample validation if requested. ## Active Constraints diff --git a/PROGRESS.md b/PROGRESS.md index f181860..03cb660 100644 --- a/PROGRESS.md +++ b/PROGRESS.md @@ -6,7 +6,7 @@ This file records current progress for agents. Read it before starting work, the - Project direction is documented in `PRD.md`, `ARCHITECTURE.md`, `AGENTS.md`, and `docs/KNOWLEDGEBASE.md`. - MinerU 3.1.0 is fixed as the only conversion engine. 
-- The converter currently includes path planning, project-owned records, metadata, direct local MinerU adapter boundary, Obsidian Markdown normalization, local quality checks, report rendering, conversion orchestration, `pdf2md convert`, `pdf2md doctor`, local MathJax render checking, release-gate tests, and opt-in pre-conversion PDF chunking. +- The converter currently includes path planning, project-owned records, metadata, direct local MinerU adapter boundary, Obsidian Markdown normalization, local quality checks, report rendering, conversion orchestration, `pdf2md convert`, `pdf2md recheck`, `pdf2md doctor`, local MathJax render checking, release-gate tests, and opt-in pre-conversion PDF chunking. - `docs/V1IMPLEMENTATIONPLAN.md` defines the v1 implementation sequence. - `docs/Sprints/` contains completed sprint contracts through Sprint 10. - `docs/WORKARCHIVE.md` contains completed sprint history, historical verification results, runtime setup notes, and sample conversion evidence. @@ -45,6 +45,8 @@ This file records current progress for agents. Read it before starting work, the - Verified full local runtime with `uv run pdf2md doctor`: PASS. - Verified real local sample conversion: `samples/FourNodeQuadrilateralShellElementMITC4.pdf` to ignored `outputs/runtime-smoke/`, status `success`, 7 pages, 22 assets, 38 inline formulas, 16 display formulas, 0 math render errors, and 0 warnings. - Converted `samples/MITC공부.pdf` to ignored `outputs/MITC공부/`; report status was `partial`: 13 pages, 107 assets, 23 inline formulas, 103 display formulas, 2 MathJax render warnings, and 0 missing or invalid asset links. +- Added `recheck_markdown()` and `pdf2md recheck <markdown>` to rerun local quality checks for an existing generated Markdown file and rewrite the adjacent metadata JSON and `.report.md` without rerunning MinerU. 
+- Verified `uv run pdf2md recheck outputs\MITC공부\MITC공부.md`; the command regenerated metadata/report and still reported 2 warnings because the current Markdown still contains the two MathJax-invalid expressions. ## In Progress @@ -56,6 +58,7 @@ This file records current progress for agents. Read it before starting work, the ## Next Actions -1. Review generated sample Markdown outputs in Obsidian if visual quality needs manual assessment. -2. Run optional real local chunked conversion on a long sample only if requested. -3. Preserve strict-local runtime behavior: use local model paths, direct CLI execution, and no user-specified API or remote backend. +1. Manually fix the two MathJax-invalid expressions in `outputs/MITC공부/MITC공부.md` if a warning-free local report is desired, then run `uv run pdf2md recheck outputs\MITC공부\MITC공부.md`. +2. Review generated sample Markdown outputs in Obsidian if visual quality needs manual assessment. +3. Run optional real local chunked conversion on a long sample only if requested. +4. Preserve strict-local runtime behavior: use local model paths, direct CLI execution, and no user-specified API or remote backend. diff --git a/README.md b/README.md index 0a4de77..e93d92d 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ Local-only PDF-to-Markdown converter for math-heavy digital documents. ## Status -The project currently provides a Python package, `pdf2md convert`, metadata/report output, mocked MinerU adapter tests, `pdf2md doctor` setup diagnostics, and Sprint 9 release-gate documentation. Real local MinerU sample validation remains optional and may be blocked until MinerU 3.1.0 and local model/cache setup are available. +The project currently provides a Python package, `pdf2md convert`, Markdown recheck via `pdf2md recheck`, metadata/report output, mocked MinerU adapter tests, `pdf2md doctor` setup diagnostics, and Sprint 9 release-gate documentation. 
Real local MinerU sample validation remains optional and may be blocked until MinerU 3.1.0 and local model/cache setup are available. ## Setup @@ -76,6 +76,16 @@ The model/cache check looks for these environment variables when present: It also checks for `%USERPROFILE%\mineru.json`, which MinerU documents as its default user config location. Missing model/cache paths are warnings because model download and cache population must be explicit setup actions. +## Rechecking Markdown + +After editing a generated Markdown file, rerun local quality checks and regenerate the adjacent metadata/report files: + +```powershell +uv run pdf2md recheck outputs/MITC공부/MITC공부.md +``` + +`recheck` reads the existing `.metadata.json` for source PDF, engine, page, and asset provenance. It replaces quality warnings that can be recalculated from the current Markdown, including MathJax render failures and local asset-link warnings, then rewrites `.metadata.json` and `.report.md`. + ## Runtime Policy Runtime conversion is strict-local. Allowed: direct `mineru` CLI execution and the CLI-internal temporary local `mineru-api` that MinerU starts when `--api-url` is omitted. Prohibited: `--api-url`, remote APIs, router mode, HTTP client backends, remote OpenAI-compatible backends, hosted renderers, and cloud fallbacks. 
diff --git a/src/pdf2md/__init__.py b/src/pdf2md/__init__.py index 8f0b895..28ba620 100644 --- a/src/pdf2md/__init__.py +++ b/src/pdf2md/__init__.py @@ -7,6 +7,7 @@ from pdf2md.conversion import ( ConversionResult, convert_input, convert_pdf, + recheck_markdown, ) __version__ = "0.1.0" @@ -19,4 +20,5 @@ __all__ = [ "__version__", "convert_input", "convert_pdf", + "recheck_markdown", ] diff --git a/src/pdf2md/cli.py b/src/pdf2md/cli.py index dc25f82..db48302 100644 --- a/src/pdf2md/cli.py +++ b/src/pdf2md/cli.py @@ -7,7 +7,7 @@ import sys from collections.abc import Sequence from pdf2md import __version__ -from pdf2md.conversion import DEFAULT_CHUNK_PAGES, DEFAULT_GPU_DEVICE, ConversionAdapter, convert_input +from pdf2md.conversion import DEFAULT_CHUNK_PAGES, DEFAULT_GPU_DEVICE, ConversionAdapter, convert_input, recheck_markdown from pdf2md.doctor import DoctorReport, format_doctor_report, run_doctor from pdf2md.mineru_adapter import StrictLocalViolationError from pdf2md.paths import PathPlanningError @@ -17,6 +17,7 @@ def main( argv: Sequence[str] | None = None, *, adapter: ConversionAdapter | None = None, + math_checker=None, clock=None, doctor_runner=None, ) -> int: @@ -61,6 +62,8 @@ def main( default=True, help="Keep strict-local conversion policy enabled. 
Enabled by default.", ) + recheck_parser = subparsers.add_parser("recheck", help="Re-run quality checks for an existing Markdown output.") + recheck_parser.add_argument("markdown", help="Existing Markdown output from pdf2md convert.") args = parser.parse_args(argv) if args.version: @@ -72,6 +75,21 @@ def main( print(format_doctor_report(report)) return report.exit_code + if args.command == "recheck": + try: + result = recheck_markdown(args.markdown, math_checker=math_checker, clock=clock) + except ValueError as error: + print(f"error: {error}", file=sys.stderr) + return 2 + print( + "rechecked: " + f"{result.markdown_path} -> {result.metadata_path}, {result.report_path} " + f"({result.warning_count} warnings)" + ) + print(f"status: {result.final_status}") + print(f"warnings: {result.warning_count}") + return 1 if not result.succeeded else 0 + if args.command != "convert": parser.print_help() return 0 @@ -88,6 +106,7 @@ def main( gpu=args.gpu, strict_local=args.strict_local, adapter=adapter, + math_checker=math_checker, clock=clock, ) except (PathPlanningError, StrictLocalViolationError, ValueError) as error: diff --git a/src/pdf2md/conversion.py b/src/pdf2md/conversion.py index ef706cd..70954b6 100644 --- a/src/pdf2md/conversion.py +++ b/src/pdf2md/conversion.py @@ -11,7 +11,7 @@ from collections.abc import Callable from dataclasses import dataclass, replace from datetime import datetime, timezone from pathlib import Path, PurePosixPath -from typing import Protocol +from typing import Any, Protocol from pdf2md.ir import ( AssetRecord, @@ -104,6 +104,13 @@ class _ConversionTask: _IMAGE_LINK_RE = re.compile(r"!\[(?P[^\]\n]*)\]\((?P[^)\n]+)\)") _DISPLAY_MATH_RE = re.compile(r"(?.*?)(?[^\n$]+?)(? 
ConversionResult: + """Re-run local quality checks for an existing Markdown output and rewrite metadata/report.""" + + markdown_file = Path(markdown_path).expanduser().resolve() + if not markdown_file.is_file(): + raise ValueError(f"Markdown output does not exist: {markdown_file}") + + metadata_path = markdown_file.with_suffix(".metadata.json") + report_path = markdown_file.with_suffix(".report.md") + if not metadata_path.is_file(): + raise ValueError(f"Existing metadata JSON is required for recheck: {metadata_path}") + + existing_metadata = _read_metadata_json(metadata_path) + markdown = markdown_file.read_text(encoding="utf-8") + assets_dir = markdown_file.with_suffix(".assets") + assets = _assets_from_metadata(existing_metadata) + quality = _run_quality_checks( + markdown, + markdown_dir=markdown_file.parent, + asset_root=assets_dir, + math_checker=math_checker, + ) + warnings = _preserved_metadata_warnings(existing_metadata) + quality.warnings + document = _build_document( + source_pdf=Path(_metadata_text(existing_metadata, "source_pdf")), + markdown=markdown, + assets=assets, + warnings=warnings, + raw_structured={"pages": [None] * _metadata_page_count(existing_metadata)}, + ) + now = clock or _utc_now + metadata_data = build_metadata( + document=document, + source_sha256=_metadata_text(existing_metadata, "source_sha256"), + created_at=_format_timestamp(now()), + engine=_metadata_text(existing_metadata, "engine"), + engine_version=_metadata_text(existing_metadata, "engine_version"), + engine_options=_metadata_engine_options(existing_metadata), + ) + report_quality = QualityResult( + missing_asset_link_count=quality.missing_asset_link_count, + invalid_asset_link_count=quality.invalid_asset_link_count, + ) + report_text = render_report( + metadata_data, + quality=report_quality, + markdown_path=markdown_file, + metadata_path=metadata_path, + report_path=report_path, + ) + final_status = determine_final_status(metadata_data, report_quality) + + 
_write_text(metadata_path, json.dumps(metadata_data, indent=2, ensure_ascii=False, sort_keys=True) + "\n") + _write_text(report_path, report_text) + + return ConversionResult( + source_pdf=Path(_metadata_text(metadata_data, "source_pdf")), + markdown_path=markdown_file, + metadata_path=metadata_path, + report_path=report_path, + assets_dir=assets_dir, + raw_dir=None, + engine=_metadata_text(metadata_data, "engine"), + engine_version=_metadata_text(metadata_data, "engine_version"), + final_status=final_status, + warning_count=len(warnings), + warnings=warnings, + pages_processed=int(metadata_data["summary"]["pages_processed"]), + ) + + +def _read_metadata_json(path: Path) -> dict[str, Any]: + data = json.loads(path.read_text(encoding="utf-8")) + if not isinstance(data, dict): + raise ValueError(f"metadata JSON must contain an object: {path}") + return data + + +def _assets_from_metadata(metadata: dict[str, Any]) -> tuple[AssetRecord, ...]: + raw_assets = metadata.get("assets", ()) + if not isinstance(raw_assets, list): + return () + assets: list[AssetRecord] = [] + for item in raw_assets: + if not isinstance(item, dict): + continue + relative_path = item.get("relative_path") + if not isinstance(relative_path, str) or not relative_path: + continue + assets.append( + AssetRecord( + relative_path, + page_index=_optional_page_index(item.get("page_index")), + bbox=_optional_bbox(item.get("bbox")), + ) + ) + return tuple(assets) + + +def _preserved_metadata_warnings(metadata: dict[str, Any]) -> tuple[WarningRecord, ...]: + raw_warnings = metadata.get("warnings", ()) + if not isinstance(raw_warnings, list): + return () + warnings: list[WarningRecord] = [] + for item in raw_warnings: + if not isinstance(item, dict): + continue + warning = _warning_from_metadata(item) + if warning is not None and warning.code not in _RECHECKED_WARNING_CODES: + warnings.append(warning) + return tuple(warnings) + + +def _warning_from_metadata(item: dict[str, Any]) -> WarningRecord | None: + 
code = item.get("code") + severity = item.get("severity") + message = item.get("message") + if not isinstance(code, str) or not isinstance(severity, str) or not isinstance(message, str) or not message: + return None + return WarningRecord( + WarningCode(code), + WarningSeverity(severity), + message, + page_index=_optional_page_index(item.get("page_index")), + bbox=_optional_bbox(item.get("bbox")), + ) + + +def _metadata_text(metadata: dict[str, Any], field_name: str) -> str: + value = metadata.get(field_name) + if not isinstance(value, str) or not value: + raise ValueError(f"metadata field is required: {field_name}") + return value + + +def _metadata_engine_options(metadata: dict[str, Any]) -> dict[str, Any]: + value = metadata.get("engine_options", {}) + return dict(value) if isinstance(value, dict) else {} + + +def _metadata_page_count(metadata: dict[str, Any]) -> int: + pages = metadata.get("pages") + if isinstance(pages, list) and pages: + return len(pages) + summary = metadata.get("summary") + if isinstance(summary, dict): + pages_processed = summary.get("pages_processed") + if isinstance(pages_processed, int) and pages_processed > 0: + return pages_processed + return 1 + + +def _optional_page_index(value: object) -> int | None: + return value if isinstance(value, int) and value >= 0 else None + + +def _optional_bbox(value: object) -> tuple[float, float, float, float] | None: + if not isinstance(value, list | tuple) or len(value) != 4: + return None + if not all(isinstance(part, int | float) for part in value): + return None + return tuple(float(part) for part in value) + + def _plan_conversion_tasks( discovered: tuple[DiscoveredPdf, ...], output_dir: PathLike, diff --git a/tests/test_cli.py b/tests/test_cli.py index 9d6403c..0d988dd 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -16,8 +16,9 @@ from pdf2md.mineru_adapter import MinerUAdapterResult class FakeAdapter: - def __init__(self, *, succeeded: bool = True) -> None: + def __init__(self, *, 
succeeded: bool = True, raw_markdown: str | None = None) -> None: self.succeeded = succeeded + self.raw_markdown = raw_markdown self.calls: list[Path] = [] self.options: list[object] = [] @@ -33,7 +34,7 @@ class FakeAdapter: command=("mineru", "-p", str(input_path), "-o", str(output_dir)), input_pdf=input_path, work_dir=output_dir, - raw_markdown=f"# {input_path.stem}\n" if self.succeeded else None, + raw_markdown=(self.raw_markdown or f"# {input_path.stem}\n") if self.succeeded else None, raw_structured={"pages": 1}, asset_paths=(), warnings=() if self.succeeded else (warning,), @@ -188,6 +189,32 @@ def test_cli_failure_summary_returns_nonzero(tmp_path: Path, capsys) -> None: assert not (tmp_path / "out" / "paper.md").exists() +def test_cli_recheck_markdown_regenerates_adjacent_metadata_and_report(tmp_path: Path, capsys) -> None: + pdf = make_pdf(tmp_path, "paper.pdf") + out = tmp_path / "out" + adapter = FakeAdapter(raw_markdown="Inline \\(bad_math\\)\n") + assert ( + main( + ["convert", str(pdf), "--out", str(out)], + adapter=adapter, + clock=fixed_clock, + math_checker=lambda _: False, + ) + == 0 + ) + capsys.readouterr() + + markdown_path = out / "paper.md" + markdown_path.write_text("Inline $x_i$\n", encoding="utf-8") + exit_code = main(["recheck", str(markdown_path)], clock=fixed_clock, math_checker=lambda _: True) + + captured = capsys.readouterr() + assert exit_code == 0 + assert "rechecked:" in captured.out + assert "warnings: 0" in captured.out + assert "- Final status: `success`" in (out / "paper.report.md").read_text(encoding="utf-8") + + def test_cli_preflight_conflict_fails_before_conversion(tmp_path: Path, capsys) -> None: pdf = make_pdf(tmp_path, "paper.pdf") out = tmp_path / "out" diff --git a/tests/test_conversion.py b/tests/test_conversion.py index 16544a3..5a3b026 100644 --- a/tests/test_conversion.py +++ b/tests/test_conversion.py @@ -9,7 +9,7 @@ import pytest from pypdf import PdfWriter import pdf2md.conversion as conversion_module -from 
pdf2md.conversion import BatchConversionResult, convert_input, convert_pdf +from pdf2md.conversion import BatchConversionResult, convert_input, convert_pdf, recheck_markdown from pdf2md.ir import WarningCode, WarningRecord, WarningSeverity from pdf2md.mineru_adapter import MinerUAdapterResult, StrictLocalViolationError from pdf2md.paths import OutputConflictError @@ -230,6 +230,33 @@ def test_convert_pdf_records_math_checker_failures_in_metadata_and_report(tmp_pa assert "`MATH_RENDER_FAILED`" in report +def test_recheck_markdown_regenerates_metadata_and_report_from_current_markdown(tmp_path: Path) -> None: + pdf = make_pdf(tmp_path) + adapter = FakeAdapter(raw_markdown="Inline \\(bad_math\\)\n") + result = convert_pdf(pdf, tmp_path / "out", adapter=adapter, math_checker=lambda _: False, clock=fixed_clock) + + result.markdown_path.write_text("Inline $x_i$\n", encoding="utf-8") + rechecked = recheck_markdown(result.markdown_path, math_checker=lambda _: True, clock=fixed_clock) + + assert rechecked.final_status == "success" + assert rechecked.warning_count == 0 + assert rechecked.markdown_path == result.markdown_path + assert rechecked.metadata_path == result.metadata_path + assert rechecked.report_path == result.report_path + metadata = json.loads(result.metadata_path.read_text(encoding="utf-8")) + assert metadata["source_sha256"] == hashlib.sha256(pdf.read_bytes()).hexdigest() + assert metadata["created_at"] == "2026-05-08T00:00:00Z" + assert metadata["summary"]["pages_processed"] == 1 + assert metadata["summary"]["inline_formula_count"] == 1 + assert metadata["summary"]["math_render_error_count"] == 0 + assert metadata["summary"]["warning_count"] == 0 + assert metadata["warnings"] == [] + report = result.report_path.read_text(encoding="utf-8") + assert "- Final status: `success`" in report + assert "- Math render error count: 0" in report + assert "- None" in report + + def test_convert_pdf_records_unavailable_math_checker_for_math_output(tmp_path: Path, 
monkeypatch) -> None: pdf = make_pdf(tmp_path) adapter = FakeAdapter(raw_markdown="Inline \\(x\\)\n") diff --git a/tests/test_package.py b/tests/test_package.py index 3726f01..9ad0af3 100644 --- a/tests/test_package.py +++ b/tests/test_package.py @@ -6,3 +6,4 @@ import pdf2md def test_package_imports() -> None: assert pdf2md.__version__ == "0.1.0" assert callable(pdf2md.convert_pdf) + assert callable(pdf2md.recheck_markdown)