modify pdftomd

This commit is contained in:
김경종
2026-05-14 10:16:59 +09:00
parent 2232b51fc9
commit dc11880140
69 changed files with 7784 additions and 1150 deletions
@@ -39,7 +39,6 @@ def test_optional_local_mineru_samples_produce_release_outputs(tmp_path: Path) -
output_root = tmp_path / "mineru-fixture-output"
attempts: list[dict[str, object]] = []
for pdf in sample_pdfs:
sample_output = output_root / pdf.stem
completed = subprocess.run(
[
sys.executable,
@@ -48,7 +47,7 @@ def test_optional_local_mineru_samples_produce_release_outputs(tmp_path: Path) -
"convert",
str(pdf),
"--out",
str(sample_output),
str(output_root),
],
cwd=REPO_ROOT,
check=False,
@@ -67,7 +66,7 @@ def test_optional_local_mineru_samples_produce_release_outputs(tmp_path: Path) -
"convert",
str(pdf),
"--out",
str(sample_output),
str(output_root),
]
),
"exit_code": completed.returncode,
@@ -77,34 +76,27 @@ def test_optional_local_mineru_samples_produce_release_outputs(tmp_path: Path) -
)
assert completed.returncode == 0, json.dumps(attempts[-1], ensure_ascii=False, indent=2)
markdown_path = sample_output / f"{pdf.stem}.md"
metadata_path = sample_output / f"{pdf.stem}.metadata.json"
report_path = sample_output / f"{pdf.stem}.report.md"
sample_output = output_root / pdf.stem
markdown_path = sample_output / f"{pdf.stem}_001.md"
report_path = sample_output / f"{pdf.stem}_report.md"
assert markdown_path.exists()
assert metadata_path.exists()
assert report_path.exists()
assert not list(sample_output.glob("*.metadata.json"))
metadata = json.loads(metadata_path.read_text(encoding="utf-8"))
summary = metadata["summary"]
assert metadata["engine"] == "MinerU"
assert summary["pages_processed"] >= 1
assert "warning_count" in summary
assert "math_render_error_count" in summary
assert "asset_count" in summary
report = report_path.read_text(encoding="utf-8")
assert "Output Markdown:" in report
assert "Metadata JSON:" in report
assert "Metadata JSON:" not in report
assert "Report Markdown:" in report
assert "- Engine: MinerU" in report
assert "- Pages processed:" in report
assert "- Warning count:" in report
assert "- Math render error count:" in report
assert "- Asset count:" in report
attempts[-1].update(
{
"markdown_path": str(markdown_path),
"metadata_path": str(metadata_path),
"report_path": str(report_path),
"warning_count": summary["warning_count"],
"final_status": _report_final_status(report),
"math_render_error_count": summary["math_render_error_count"],
"asset_count": summary["asset_count"],
"pages_processed": summary["pages_processed"],
}
)