"""Optional end-to-end evaluation of ``pdf2md`` against local sample PDFs.

The whole module is skipped unless ``PDF2MD_RUN_MINERU_FIXTURES=1`` is set in
the environment. When enabled, every ``*.pdf`` under ``samples/`` is converted
through the ``pdf2md.cli`` command-line entry point and the produced Markdown,
metadata and report files are checked.
"""

from __future__ import annotations

import json
import os
import re
import subprocess
import sys
from pathlib import Path

import pytest

if os.environ.get("PDF2MD_RUN_MINERU_FIXTURES") != "1":
    pytest.skip(
        "optional local MinerU fixture evaluation is disabled; set PDF2MD_RUN_MINERU_FIXTURES=1 to run",
        allow_module_level=True,
    )

# Assumes this file lives two directories below the repository root.
REPO_ROOT = Path(__file__).resolve().parents[2]
SAMPLES_DIR = REPO_ROOT / "samples"

# Matches a report line such as "- Final status: `ok`" and captures the value
# between the backticks in the named group "status".
_FINAL_STATUS_PATTERN = re.compile(r"^- Final status: `(?P<status>[^`]+)`$", re.MULTILINE)


def test_optional_local_mineru_samples_produce_release_outputs(tmp_path: Path) -> None:
    """Convert every local sample PDF and verify the release output files.

    The test is skipped when ``pdf2md.cli doctor`` exits non-zero or when no
    sample PDFs are present. For each sample it asserts that the conversion
    succeeds, that the Markdown, metadata and report files exist, and that the
    metadata and report carry the expected fields. A JSON record of all
    attempts is written under ``tmp_path`` once every sample has passed.
    """
    doctor = subprocess.run(
        [sys.executable, "-m", "pdf2md.cli", "doctor"],
        cwd=REPO_ROOT,
        check=False,
        capture_output=True,
        text=True,
    )
    if doctor.returncode != 0:
        pytest.skip(f"local MinerU fixture evaluation blocked by doctor:\n{doctor.stdout}\n{doctor.stderr}")

    # Case-insensitive ordering by file name keeps the run order stable.
    sample_pdfs = tuple(sorted(SAMPLES_DIR.glob("*.pdf"), key=lambda path: path.name.casefold()))
    if not sample_pdfs:
        pytest.skip(f"no local sample PDFs found under {SAMPLES_DIR}")

    output_root = tmp_path / "mineru-fixture-output"
    attempts: list[dict[str, object]] = []

    for pdf in sample_pdfs:
        sample_output = output_root / pdf.stem
        # Build the argv once so the recorded command is exactly what was run.
        command = [
            sys.executable,
            "-m",
            "pdf2md.cli",
            "convert",
            str(pdf),
            "--out",
            str(sample_output),
        ]
        completed = subprocess.run(
            command,
            cwd=REPO_ROOT,
            check=False,
            capture_output=True,
            text=True,
            timeout=1800,
        )
        attempt: dict[str, object] = {
            "source": str(pdf.relative_to(REPO_ROOT)),
            "command": " ".join(command),
            "exit_code": completed.returncode,
            "stdout": completed.stdout,
            "stderr": completed.stderr,
        }
        attempts.append(attempt)
        # On failure, surface the full attempt record (including captured
        # output) as the assertion message.
        assert completed.returncode == 0, json.dumps(attempt, ensure_ascii=False, indent=2)

        markdown_path = sample_output / f"{pdf.stem}.md"
        metadata_path = sample_output / f"{pdf.stem}.metadata.json"
        report_path = sample_output / f"{pdf.stem}.report.md"
        assert markdown_path.exists()
        assert metadata_path.exists()
        assert report_path.exists()

        metadata = json.loads(metadata_path.read_text(encoding="utf-8"))
        summary = metadata["summary"]
        assert metadata["engine"] == "MinerU"
        assert summary["pages_processed"] >= 1
        assert "warning_count" in summary
        assert "math_render_error_count" in summary
        assert "asset_count" in summary

        report = report_path.read_text(encoding="utf-8")
        assert "Output Markdown:" in report
        assert "Metadata JSON:" in report
        assert "Report Markdown:" in report

        attempt.update(
            {
                "markdown_path": str(markdown_path),
                "metadata_path": str(metadata_path),
                "report_path": str(report_path),
                "warning_count": summary["warning_count"],
                "final_status": _report_final_status(report),
                "math_render_error_count": summary["math_render_error_count"],
                "asset_count": summary["asset_count"],
                "pages_processed": summary["pages_processed"],
            }
        )

    record_path = output_root / "fixture-evaluation.json"
    record_path.write_text(
        json.dumps({"attempts": attempts}, ensure_ascii=False, indent=2) + "\n",
        encoding="utf-8",
    )
    assert record_path.exists()


def _report_final_status(report: str) -> str:
    """Return the value of the ``- Final status: `...``` line in *report*.

    Returns ``"unavailable"`` when the report contains no such line.
    """
    match = _FINAL_STATUS_PATTERN.search(report)
    return match.group("status") if match else "unavailable"