add pdftomd
This commit is contained in:
@@ -0,0 +1,118 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
# Opt-in gate: the whole module is skipped at import time unless the
# environment variable is set to exactly "1".
if os.getenv("PDF2MD_RUN_MINERU_FIXTURES") != "1":
    pytest.skip(
        "optional local MinerU fixture evaluation is disabled; set PDF2MD_RUN_MINERU_FIXTURES=1 to run",
        allow_module_level=True,
    )


# parents[2] is the directory two levels above the one holding this file.
REPO_ROOT = Path(__file__).resolve().parents[2]
# Directory that is globbed for the local sample PDFs.
SAMPLES_DIR = REPO_ROOT.joinpath("samples")
|
||||
|
||||
|
||||
def test_optional_local_mineru_samples_produce_release_outputs(tmp_path: Path) -> None:
    """Convert every local sample PDF through the CLI and check its release outputs.

    The test skips when ``pdf2md.cli doctor`` exits non-zero or when no
    ``*.pdf`` file exists under ``SAMPLES_DIR``. Each sample is converted in a
    subprocess into its own directory below ``tmp_path``; the conversion must
    exit with 0 and leave a Markdown file, a metadata JSON file and a report
    file named after the PDF stem. One record per sample is collected and
    written to ``fixture-evaluation.json`` once every sample has passed.
    """
    doctor = subprocess.run(
        [sys.executable, "-m", "pdf2md.cli", "doctor"],
        cwd=REPO_ROOT,
        check=False,
        capture_output=True,
        text=True,
        # Same cap as the conversions below: a wedged doctor check should fail
        # the test rather than hang the session.
        timeout=1800,
    )
    if doctor.returncode != 0:
        pytest.skip(f"local MinerU fixture evaluation blocked by doctor:\n{doctor.stdout}\n{doctor.stderr}")

    # Case-insensitive name order keeps the run order stable.
    sample_pdfs = tuple(sorted(SAMPLES_DIR.glob("*.pdf"), key=lambda path: path.name.casefold()))
    if not sample_pdfs:
        pytest.skip(f"no local sample PDFs found under {SAMPLES_DIR}")

    output_root = tmp_path / "mineru-fixture-output"
    attempts: list[dict[str, object]] = []
    for pdf in sample_pdfs:
        sample_output = output_root / pdf.stem
        # Build the argv once so the recorded command is exactly what ran.
        command = [
            sys.executable,
            "-m",
            "pdf2md.cli",
            "convert",
            str(pdf),
            "--out",
            str(sample_output),
        ]
        completed = subprocess.run(
            command,
            cwd=REPO_ROOT,
            check=False,
            capture_output=True,
            text=True,
            timeout=1800,
        )
        attempt: dict[str, object] = {
            "source": str(pdf.relative_to(REPO_ROOT)),
            "command": " ".join(command),
            "exit_code": completed.returncode,
            "stdout": completed.stdout,
            "stderr": completed.stderr,
        }
        attempts.append(attempt)
        # On failure the assertion message carries the full attempt record.
        assert completed.returncode == 0, json.dumps(attempt, ensure_ascii=False, indent=2)

        markdown_path = sample_output / f"{pdf.stem}.md"
        metadata_path = sample_output / f"{pdf.stem}.metadata.json"
        report_path = sample_output / f"{pdf.stem}.report.md"
        assert markdown_path.exists()
        assert metadata_path.exists()
        assert report_path.exists()

        metadata = json.loads(metadata_path.read_text(encoding="utf-8"))
        summary = metadata["summary"]
        assert metadata["engine"] == "MinerU"
        assert summary["pages_processed"] >= 1
        assert "warning_count" in summary
        assert "math_render_error_count" in summary
        assert "asset_count" in summary

        report = report_path.read_text(encoding="utf-8")
        assert "Output Markdown:" in report
        assert "Metadata JSON:" in report
        assert "Report Markdown:" in report

        attempt.update(
            {
                "markdown_path": str(markdown_path),
                "metadata_path": str(metadata_path),
                "report_path": str(report_path),
                "warning_count": summary["warning_count"],
                "final_status": _report_final_status(report),
                "math_render_error_count": summary["math_render_error_count"],
                "asset_count": summary["asset_count"],
                "pages_processed": summary["pages_processed"],
            }
        )

    record_path = output_root / "fixture-evaluation.json"
    record_path.write_text(json.dumps({"attempts": attempts}, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
    assert record_path.exists()
|
||||
|
||||
|
||||
def _report_final_status(report: str) -> str:
|
||||
match = re.search(r"^- Final status: `(?P<status>[^`]+)`$", report, re.MULTILINE)
|
||||
return match.group("status") if match else "unavailable"
|
||||
@@ -0,0 +1,152 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
from pdf2md.cli import main
|
||||
from pdf2md.conversion import convert_pdf
|
||||
from pdf2md.ir import WarningCode, WarningRecord, WarningSeverity
|
||||
from pdf2md.mineru_adapter import MinerUAdapterResult
|
||||
|
||||
|
||||
class FixtureAdapter:
    """Test double that returns canned ``MinerUAdapterResult`` values.

    Each ``convert`` call is appended to ``calls`` as an
    ``(input_pdf, work_dir)`` pair of ``Path`` objects so tests can inspect
    what was requested.
    """

    def __init__(
        self,
        *,
        raw_markdown: str,
        raw_structured: object | None = None,
        succeeded: bool = True,
        asset_name: str | None = None,
        warnings: tuple[WarningRecord, ...] = (),
    ) -> None:
        """Store the canned values that later ``convert`` calls will return."""
        self.calls: list[tuple[Path, Path]] = []
        self.raw_markdown = raw_markdown
        self.raw_structured = raw_structured
        self.succeeded = succeeded
        self.asset_name = asset_name
        self.warnings = warnings

    def convert(self, input_pdf, work_dir, options=None) -> MinerUAdapterResult:
        """Record the call and build a result from the canned values.

        ``work_dir`` is created if missing. When ``asset_name`` is set, a
        placeholder file is written to ``<work_dir>/assets/<asset_name>`` and
        reported in ``asset_paths`` (this happens on failure too). A failing
        adapter reports no Markdown, exit code 2 and one ``MINERU_CLI_FAILED``
        warning instead of the configured warnings.
        """
        source = Path(input_pdf)
        target = Path(work_dir)
        target.mkdir(parents=True, exist_ok=True)
        self.calls.append((source, target))

        if self.asset_name is None:
            written_assets: tuple[Path, ...] = ()
        else:
            asset_file = target / "assets" / self.asset_name
            asset_file.parent.mkdir(parents=True, exist_ok=True)
            asset_file.write_bytes(b"fake image")
            written_assets = (asset_file,)

        failure = WarningRecord(WarningCode.MINERU_CLI_FAILED, WarningSeverity.ERROR, "MinerU failed.")
        if self.succeeded:
            markdown, reported_warnings, status = self.raw_markdown, self.warnings, 0
        else:
            markdown, reported_warnings, status = None, (failure,), 2

        if options is None:
            engine_options = {"strict_local": True}
        else:
            engine_options = options.to_engine_options()

        return MinerUAdapterResult(
            succeeded=self.succeeded,
            command=("mineru", "-p", str(source), "-o", str(target)),
            input_pdf=source,
            work_dir=target,
            raw_markdown=markdown,
            raw_structured=self.raw_structured,
            asset_paths=written_assets,
            warnings=reported_warnings,
            engine="MinerU",
            engine_version="3.1.0",
            engine_options=engine_options,
            exit_code=status,
            stdout="",
            stderr="",
        )
|
||||
|
||||
|
||||
def fixed_clock() -> datetime:
    """Return the same timezone-aware UTC instant (2026-05-08 00:00) on every call."""
    return datetime(year=2026, month=5, day=8, tzinfo=timezone.utc)
|
||||
|
||||
|
||||
def make_pdf(directory: Path, name: str) -> Path:
    """Write a placeholder PDF called *name* under *directory* and return its path.

    Missing parent directories are created. The file holds only a
    ``%PDF-1.7`` header line and a marker line, not a complete PDF document.
    """
    target = directory.joinpath(name)
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_bytes(b"%PDF-1.7\nfast integration fixture\n")
    return target
|
||||
|
||||
|
||||
def test_v1_fast_conversion_writes_markdown_metadata_report_assets_and_quality_counts(tmp_path: Path) -> None:
    """Check the files and counts produced by a successful ``convert_pdf`` run.

    A ``FixtureAdapter`` supplies Markdown with one inline formula, one
    display formula and an HTML table, plus one asset file. The test expects a
    ``partial`` final status, all three output files, the asset copied into
    ``<stem>.assets``, and metadata and report contents that match.
    """
    out_dir = tmp_path / "out"
    source_pdf = make_pdf(tmp_path, "쉘구조_math.pdf")
    # NOTE(review): the last chunk is a bare newline; an image reference may
    # have been lost here. Confirm against the original test.
    raw_markdown = (
        "# Shell Element\n\n"
        "Inline \\(u_i\\) and display:\n\n"
        "\\[\nK u = f\n\\]\n\n"
        '<table><tr><td rowspan="2">\\(N_i\\)</td><td>stress</td></tr></table>\n\n'
        "\n"
    )
    fixture = FixtureAdapter(
        raw_markdown=raw_markdown,
        raw_structured={"pages": [{}, {}, {}]},
        asset_name="mesh.png",
    )

    result = convert_pdf(source_pdf, out_dir, adapter=fixture, math_checker=lambda _: True, clock=fixed_clock)

    assert result.final_status == "partial"
    assert result.markdown_path.exists()
    metadata_path = result.metadata_path
    assert metadata_path is not None
    assert metadata_path.exists()
    assert result.report_path.exists()
    copied_asset = out_dir / "쉘구조_math.assets" / "mesh.png"
    assert copied_asset.read_bytes() == b"fake image"

    markdown = result.markdown_path.read_text(encoding="utf-8")
    assert "$u_i$" in markdown
    assert "$$\nK u = f\n$$" in markdown
    # NOTE(review): vacuous check, since "" is a substring of every string.
    # Presumably an image-link expectation was lost; confirm the intended text.
    assert "" in markdown

    metadata = json.loads(metadata_path.read_text(encoding="utf-8"))
    assert metadata["engine"] == "MinerU"
    assert metadata["engine_version"] == "3.1.0"
    summary = metadata["summary"]
    expected_summary = {
        "pages_processed": 3,
        "asset_count": 1,
        "inline_formula_count": 1,
        "display_formula_count": 1,
        "math_render_error_count": 0,
        "warning_count": 1,
    }
    for key, expected in expected_summary.items():
        assert summary[key] == expected, key
    assert metadata["warnings"][0]["code"] == "TABLE_FALLBACK"
    assert metadata["assets"] == [{"relative_path": "쉘구조_math.assets/mesh.png"}]

    report = result.report_path.read_text(encoding="utf-8")
    expected_report_fragments = (
        "- Final status: `partial`",
        "- Output Markdown:",
        "- Metadata JSON:",
        "- Report Markdown:",
        "- Math render error count: 0",
        "`TABLE_FALLBACK`",
    )
    for fragment in expected_report_fragments:
        assert fragment in report, fragment
|
||||
|
||||
|
||||
def test_v1_fast_failure_records_no_fallback_and_writes_no_release_outputs(tmp_path: Path) -> None:
    """Check that a failed adapter run leaves no output files behind.

    The result must report ``failed`` with exactly one warning, whose code is
    ``MINERU_CLI_FAILED``. The Markdown, report and metadata paths must not
    exist on disk.
    """
    failing_adapter = FixtureAdapter(raw_markdown="", succeeded=False)
    source_pdf = make_pdf(tmp_path, "failed.pdf")

    outcome = convert_pdf(source_pdf, tmp_path / "out", adapter=failing_adapter, clock=fixed_clock)

    assert outcome.final_status == "failed"
    assert outcome.warning_count == 1
    first_warning = outcome.warnings[0]
    assert first_warning.code == WarningCode.MINERU_CLI_FAILED

    assert not outcome.markdown_path.exists()
    assert not outcome.report_path.exists()
    # A metadata path is still reported, but nothing is written there.
    metadata_path = outcome.metadata_path
    assert metadata_path is not None
    assert not metadata_path.exists()
|
||||
|
||||
|
||||
def test_v1_fast_cli_batch_summary_matches_generated_outputs(tmp_path: Path, capsys) -> None:
    """Check that a directory conversion via ``main`` matches its printed summary.

    Two PDFs are placed in one directory and converted with a
    ``FixtureAdapter``. The CLI must exit with 0, call the adapter once per
    PDF in order, print the expected counts, and write three output files per
    PDF directly into the output directory.
    """
    source = tmp_path / "pdfs"
    out_dir = tmp_path / "out"
    pdfs = [make_pdf(source, file_name) for file_name in ("a.pdf", "한글.pdf")]
    fixture = FixtureAdapter(raw_markdown="# Batch\n\nNo formulas.\n", raw_structured={"pages": 1})

    exit_code = main(["convert", str(source), "--out", str(out_dir)], adapter=fixture, clock=fixed_clock)

    captured = capsys.readouterr()
    assert exit_code == 0
    converted_inputs = [input_path for input_path, _work_dir in fixture.calls]
    assert converted_inputs == [pdf.resolve() for pdf in pdfs]

    for summary_line in ("converted: 2", "failed: 0", "warnings: 0"):
        assert summary_line in captured.out, summary_line

    for stem in ("a", "한글"):
        for suffix in (".md", ".metadata.json", ".report.md"):
            produced = out_dir / f"{stem}{suffix}"
            assert produced.exists(), produced
|
||||
Reference in New Issue
Block a user