add pdftomd

This commit is contained in:
김경종
2026-05-08 16:42:19 +09:00
parent 551ab50735
commit 88d6b92283
99 changed files with 47332 additions and 0 deletions
@@ -0,0 +1,118 @@
from __future__ import annotations
import json
import os
import re
import subprocess
import sys
from pathlib import Path
import pytest
# Opt-in gate: this module drives the real pdf2md CLI against local sample
# PDFs, so it is skipped as a whole unless the environment variable is
# set to exactly "1".
if os.getenv("PDF2MD_RUN_MINERU_FIXTURES") != "1":
    pytest.skip(
        "optional local MinerU fixture evaluation is disabled; set PDF2MD_RUN_MINERU_FIXTURES=1 to run",
        allow_module_level=True,
    )

# Directory three levels above this file; used as the working directory for
# the CLI subprocesses and as the base for relative source paths.
REPO_ROOT = Path(__file__).resolve().parents[2]

# Directory that is globbed for the sample "*.pdf" inputs.
SAMPLES_DIR = REPO_ROOT.joinpath("samples")
def test_optional_local_mineru_samples_produce_release_outputs(tmp_path: Path) -> None:
    """Convert every local sample PDF via the CLI and check the release outputs.

    The test is skipped when ``pdf2md.cli doctor`` exits non-zero or when no
    ``*.pdf`` file exists under ``SAMPLES_DIR``. For each sample it runs
    ``pdf2md.cli convert`` and asserts that the Markdown, metadata JSON and
    report files exist and contain the expected keys and headings.

    A ``fixture-evaluation.json`` record of all attempts is written under
    ``tmp_path`` even when a conversion or assertion fails, so the evidence
    of a failing run is preserved.

    Args:
        tmp_path: pytest-provided temporary directory that receives all output.
    """
    doctor = subprocess.run(
        [sys.executable, "-m", "pdf2md.cli", "doctor"],
        cwd=REPO_ROOT,
        check=False,
        capture_output=True,
        text=True,
    )
    if doctor.returncode != 0:
        pytest.skip(f"local MinerU fixture evaluation blocked by doctor:\n{doctor.stdout}\n{doctor.stderr}")

    # Case-insensitive name ordering keeps the run order stable across platforms.
    sample_pdfs = tuple(sorted(SAMPLES_DIR.glob("*.pdf"), key=lambda path: path.name.casefold()))
    if not sample_pdfs:
        pytest.skip(f"no local sample PDFs found under {SAMPLES_DIR}")

    output_root = tmp_path / "mineru-fixture-output"
    record_path = output_root / "fixture-evaluation.json"
    attempts: list[dict[str, object]] = []

    try:
        for pdf in sample_pdfs:
            sample_output = output_root / pdf.stem
            # Build the command once so the recorded command is exactly the
            # one that was executed.
            command = [
                sys.executable,
                "-m",
                "pdf2md.cli",
                "convert",
                str(pdf),
                "--out",
                str(sample_output),
            ]
            # Register the attempt before running so a timeout or crash still
            # leaves an entry in the evaluation record.
            attempt: dict[str, object] = {
                "source": str(pdf.relative_to(REPO_ROOT)),
                "command": " ".join(command),
            }
            attempts.append(attempt)

            completed = subprocess.run(
                command,
                cwd=REPO_ROOT,
                check=False,
                capture_output=True,
                text=True,
                timeout=1800,
            )
            attempt.update(
                {
                    "exit_code": completed.returncode,
                    "stdout": completed.stdout,
                    "stderr": completed.stderr,
                }
            )
            assert completed.returncode == 0, json.dumps(attempt, ensure_ascii=False, indent=2)

            markdown_path = sample_output / f"{pdf.stem}.md"
            metadata_path = sample_output / f"{pdf.stem}.metadata.json"
            report_path = sample_output / f"{pdf.stem}.report.md"
            assert markdown_path.exists(), f"missing Markdown output: {markdown_path}"
            assert metadata_path.exists(), f"missing metadata output: {metadata_path}"
            assert report_path.exists(), f"missing report output: {report_path}"

            metadata = json.loads(metadata_path.read_text(encoding="utf-8"))
            summary = metadata["summary"]
            assert metadata["engine"] == "MinerU"
            assert summary["pages_processed"] >= 1
            assert "warning_count" in summary
            assert "math_render_error_count" in summary
            assert "asset_count" in summary

            report = report_path.read_text(encoding="utf-8")
            assert "Output Markdown:" in report
            assert "Metadata JSON:" in report
            assert "Report Markdown:" in report

            attempt.update(
                {
                    "markdown_path": str(markdown_path),
                    "metadata_path": str(metadata_path),
                    "report_path": str(report_path),
                    "warning_count": summary["warning_count"],
                    "final_status": _report_final_status(report),
                    "math_render_error_count": summary["math_render_error_count"],
                    "asset_count": summary["asset_count"],
                    "pages_processed": summary["pages_processed"],
                }
            )
    finally:
        # Persist the record on failure too; the directory may not exist yet
        # if the first conversion failed before producing any output.
        output_root.mkdir(parents=True, exist_ok=True)
        record_path.write_text(
            json.dumps({"attempts": attempts}, ensure_ascii=False, indent=2) + "\n",
            encoding="utf-8",
        )

    assert record_path.exists()
def _report_final_status(report: str) -> str:
match = re.search(r"^- Final status: `(?P<status>[^`]+)`$", report, re.MULTILINE)
return match.group("status") if match else "unavailable"