"""Optional end-to-end evaluation of the local MinerU sample PDFs.

The whole module is skipped unless ``PDF2MD_RUN_MINERU_FIXTURES=1`` is set in
the environment.  When enabled, every ``*.pdf`` under ``SAMPLES_DIR`` is
converted through ``python -m pdf2md.cli convert`` and the produced Markdown
and report files are checked.
"""

from __future__ import annotations

import json
import os
import re
import subprocess
import sys
from pathlib import Path

import pytest

if os.environ.get("PDF2MD_RUN_MINERU_FIXTURES") != "1":
    pytest.skip(
        "optional local MinerU fixture evaluation is disabled; set PDF2MD_RUN_MINERU_FIXTURES=1 to run",
        allow_module_level=True,
    )

REPO_ROOT = Path(__file__).resolve().parents[2]
SAMPLES_DIR = REPO_ROOT / "samples"

# Substrings every generated report is asserted to contain.
_REQUIRED_REPORT_MARKERS = (
    "Output Markdown:",
    "Report Markdown:",
    "- Engine: MinerU",
    "- Pages processed:",
    "- Warning count:",
    "- Math render error count:",
    "- Asset count:",
)

# Matches a report line such as "- Final status: `ok`" and captures the value
# between the backticks in the named group "status".
_FINAL_STATUS_RE = re.compile(r"^- Final status: `(?P<status>[^`]+)`$", re.MULTILINE)


def test_optional_local_mineru_samples_produce_release_outputs(tmp_path: Path) -> None:
    """Convert every sample PDF and check the files written for each one.

    The test skips (rather than fails) when ``pdf2md.cli doctor`` exits
    non-zero or when no sample PDFs exist.  For each PDF it asserts that the
    ``convert`` command exits with 0, that ``<stem>_001.md`` and
    ``<stem>_report.md`` exist in ``<out>/<stem>/``, that no
    ``*.metadata.json`` file was written, and that the report contains the
    expected markers.  After all samples pass, a ``fixture-evaluation.json``
    record of the attempts is written under the output root.
    """
    doctor = subprocess.run(
        [sys.executable, "-m", "pdf2md.cli", "doctor"],
        cwd=REPO_ROOT,
        check=False,
        capture_output=True,
        text=True,
    )
    if doctor.returncode != 0:
        pytest.skip(f"local MinerU fixture evaluation blocked by doctor:\n{doctor.stdout}\n{doctor.stderr}")

    # Case-insensitive ordering by file name keeps the run order stable.
    sample_pdfs = tuple(sorted(SAMPLES_DIR.glob("*.pdf"), key=lambda path: path.name.casefold()))
    if not sample_pdfs:
        pytest.skip(f"no local sample PDFs found under {SAMPLES_DIR}")

    output_root = tmp_path / "mineru-fixture-output"
    attempts: list[dict[str, object]] = []

    for pdf in sample_pdfs:
        # Build the argv once so the recorded command is exactly what was run.
        command = [
            sys.executable,
            "-m",
            "pdf2md.cli",
            "convert",
            str(pdf),
            "--out",
            str(output_root),
        ]
        completed = subprocess.run(
            command,
            cwd=REPO_ROOT,
            check=False,
            capture_output=True,
            text=True,
            timeout=1800,
        )
        attempt: dict[str, object] = {
            "source": str(pdf.relative_to(REPO_ROOT)),
            "command": " ".join(command),
            "exit_code": completed.returncode,
            "stdout": completed.stdout,
            "stderr": completed.stderr,
        }
        attempts.append(attempt)
        # On failure, dump the whole attempt so stdout/stderr show up in the
        # assertion message.
        assert completed.returncode == 0, json.dumps(attempt, ensure_ascii=False, indent=2)

        sample_output = output_root / pdf.stem
        markdown_path = sample_output / f"{pdf.stem}_001.md"
        report_path = sample_output / f"{pdf.stem}_report.md"
        assert markdown_path.exists()
        assert report_path.exists()
        assert not list(sample_output.glob("*.metadata.json"))

        report = report_path.read_text(encoding="utf-8")
        for marker in _REQUIRED_REPORT_MARKERS:
            assert marker in report, f"missing {marker!r} in {report_path}"
        assert "Metadata JSON:" not in report

        attempt.update(
            {
                "markdown_path": str(markdown_path),
                "report_path": str(report_path),
                "final_status": _report_final_status(report),
            }
        )

    record_path = output_root / "fixture-evaluation.json"
    record_path.write_text(
        json.dumps({"attempts": attempts}, ensure_ascii=False, indent=2) + "\n",
        encoding="utf-8",
    )
    assert record_path.exists()


def _report_final_status(report: str) -> str:
    """Return the backtick-quoted value of the report's "- Final status:" line.

    Args:
        report: Full text of a generated report.

    Returns:
        The captured status text, or ``"unavailable"`` when no line of the
        form ``- Final status: `...` `` is present.
    """
    match = _FINAL_STATUS_RE.search(report)
    return match.group("status") if match else "unavailable"