modify pdftomd
This commit is contained in:
@@ -39,7 +39,6 @@ def test_optional_local_mineru_samples_produce_release_outputs(tmp_path: Path) -
|
||||
output_root = tmp_path / "mineru-fixture-output"
|
||||
attempts: list[dict[str, object]] = []
|
||||
for pdf in sample_pdfs:
|
||||
sample_output = output_root / pdf.stem
|
||||
completed = subprocess.run(
|
||||
[
|
||||
sys.executable,
|
||||
@@ -48,7 +47,7 @@ def test_optional_local_mineru_samples_produce_release_outputs(tmp_path: Path) -
|
||||
"convert",
|
||||
str(pdf),
|
||||
"--out",
|
||||
str(sample_output),
|
||||
str(output_root),
|
||||
],
|
||||
cwd=REPO_ROOT,
|
||||
check=False,
|
||||
@@ -67,7 +66,7 @@ def test_optional_local_mineru_samples_produce_release_outputs(tmp_path: Path) -
|
||||
"convert",
|
||||
str(pdf),
|
||||
"--out",
|
||||
str(sample_output),
|
||||
str(output_root),
|
||||
]
|
||||
),
|
||||
"exit_code": completed.returncode,
|
||||
@@ -77,34 +76,27 @@ def test_optional_local_mineru_samples_produce_release_outputs(tmp_path: Path) -
|
||||
)
|
||||
assert completed.returncode == 0, json.dumps(attempts[-1], ensure_ascii=False, indent=2)
|
||||
|
||||
markdown_path = sample_output / f"{pdf.stem}.md"
|
||||
metadata_path = sample_output / f"{pdf.stem}.metadata.json"
|
||||
report_path = sample_output / f"{pdf.stem}.report.md"
|
||||
sample_output = output_root / pdf.stem
|
||||
markdown_path = sample_output / f"{pdf.stem}_001.md"
|
||||
report_path = sample_output / f"{pdf.stem}_report.md"
|
||||
assert markdown_path.exists()
|
||||
assert metadata_path.exists()
|
||||
assert report_path.exists()
|
||||
assert not list(sample_output.glob("*.metadata.json"))
|
||||
|
||||
metadata = json.loads(metadata_path.read_text(encoding="utf-8"))
|
||||
summary = metadata["summary"]
|
||||
assert metadata["engine"] == "MinerU"
|
||||
assert summary["pages_processed"] >= 1
|
||||
assert "warning_count" in summary
|
||||
assert "math_render_error_count" in summary
|
||||
assert "asset_count" in summary
|
||||
report = report_path.read_text(encoding="utf-8")
|
||||
assert "Output Markdown:" in report
|
||||
assert "Metadata JSON:" in report
|
||||
assert "Metadata JSON:" not in report
|
||||
assert "Report Markdown:" in report
|
||||
assert "- Engine: MinerU" in report
|
||||
assert "- Pages processed:" in report
|
||||
assert "- Warning count:" in report
|
||||
assert "- Math render error count:" in report
|
||||
assert "- Asset count:" in report
|
||||
attempts[-1].update(
|
||||
{
|
||||
"markdown_path": str(markdown_path),
|
||||
"metadata_path": str(metadata_path),
|
||||
"report_path": str(report_path),
|
||||
"warning_count": summary["warning_count"],
|
||||
"final_status": _report_final_status(report),
|
||||
"math_render_error_count": summary["math_render_error_count"],
|
||||
"asset_count": summary["asset_count"],
|
||||
"pages_processed": summary["pages_processed"],
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
@@ -68,8 +67,13 @@ def make_pdf(directory: Path, name: str) -> Path:
|
||||
return path
|
||||
|
||||
|
||||
def test_v1_fast_conversion_writes_markdown_metadata_report_assets_and_quality_counts(tmp_path: Path) -> None:
|
||||
pdf = make_pdf(tmp_path, "쉘구조_math.pdf")
|
||||
def report_metadata(result) -> dict:
|
||||
assert result._report_metadata is not None
|
||||
return result._report_metadata
|
||||
|
||||
|
||||
def test_v1_fast_conversion_writes_markdown_report_assets_and_quality_counts(tmp_path: Path) -> None:
|
||||
pdf = make_pdf(tmp_path, "math.pdf")
|
||||
adapter = FixtureAdapter(
|
||||
raw_markdown=(
|
||||
"# Shell Element\n\n"
|
||||
@@ -85,17 +89,21 @@ def test_v1_fast_conversion_writes_markdown_metadata_report_assets_and_quality_c
|
||||
result = convert_pdf(pdf, tmp_path / "out", adapter=adapter, math_checker=lambda _: True, clock=fixed_clock)
|
||||
|
||||
assert result.final_status == "partial"
|
||||
assert result.markdown_path == tmp_path / "out" / "math" / "math_001.md"
|
||||
assert result.markdown_path.exists()
|
||||
assert result.metadata_path is not None and result.metadata_path.exists()
|
||||
assert result.metadata_path is None
|
||||
assert not list((tmp_path / "out").rglob("*.metadata.json"))
|
||||
assert result.report_path == tmp_path / "out" / "math" / "math_report.md"
|
||||
assert result.report_path.exists()
|
||||
assert (tmp_path / "out" / "쉘구조_math.assets" / "mesh.png").read_bytes() == b"fake image"
|
||||
assert result.assets_dir == tmp_path / "out" / "math" / "images"
|
||||
assert (result.assets_dir / "mesh.png").read_bytes() == b"fake image"
|
||||
|
||||
markdown = result.markdown_path.read_text(encoding="utf-8")
|
||||
assert "$u_i$" in markdown
|
||||
assert "$$\nK u = f\n$$" in markdown
|
||||
assert "" in markdown
|
||||
assert "" in markdown
|
||||
|
||||
metadata = json.loads(result.metadata_path.read_text(encoding="utf-8"))
|
||||
metadata = report_metadata(result)
|
||||
assert metadata["engine"] == "MinerU"
|
||||
assert metadata["engine_version"] == "3.1.0"
|
||||
assert metadata["summary"]["pages_processed"] == 3
|
||||
@@ -105,18 +113,18 @@ def test_v1_fast_conversion_writes_markdown_metadata_report_assets_and_quality_c
|
||||
assert metadata["summary"]["math_render_error_count"] == 0
|
||||
assert metadata["summary"]["warning_count"] == 1
|
||||
assert metadata["warnings"][0]["code"] == "TABLE_FALLBACK"
|
||||
assert metadata["assets"] == [{"relative_path": "쉘구조_math.assets/mesh.png"}]
|
||||
assert metadata["assets"] == [{"relative_path": "images/mesh.png"}]
|
||||
|
||||
report = result.report_path.read_text(encoding="utf-8")
|
||||
assert "- Final status: `partial`" in report
|
||||
assert "- Output Markdown:" in report
|
||||
assert "- Metadata JSON:" in report
|
||||
assert "- Metadata JSON:" not in report
|
||||
assert "- Report Markdown:" in report
|
||||
assert "- Math render error count: 0" in report
|
||||
assert "`TABLE_FALLBACK`" in report
|
||||
|
||||
|
||||
def test_v1_fast_failure_records_no_fallback_and_writes_no_release_outputs(tmp_path: Path) -> None:
|
||||
def test_v1_fast_failure_records_no_fallback_and_writes_report_only(tmp_path: Path) -> None:
|
||||
pdf = make_pdf(tmp_path, "failed.pdf")
|
||||
adapter = FixtureAdapter(raw_markdown="", succeeded=False)
|
||||
|
||||
@@ -126,14 +134,15 @@ def test_v1_fast_failure_records_no_fallback_and_writes_no_release_outputs(tmp_p
|
||||
assert result.warning_count == 1
|
||||
assert result.warnings[0].code == WarningCode.MINERU_CLI_FAILED
|
||||
assert not result.markdown_path.exists()
|
||||
assert not result.report_path.exists()
|
||||
assert result.metadata_path is not None and not result.metadata_path.exists()
|
||||
assert result.report_path.exists()
|
||||
assert result.metadata_path is None
|
||||
assert "- Final status: `failed`" in result.report_path.read_text(encoding="utf-8")
|
||||
|
||||
|
||||
def test_v1_fast_cli_batch_summary_matches_generated_outputs(tmp_path: Path, capsys) -> None:
|
||||
source = tmp_path / "pdfs"
|
||||
first = make_pdf(source, "a.pdf")
|
||||
second = make_pdf(source, "한글.pdf")
|
||||
second = make_pdf(source, "korean.pdf")
|
||||
adapter = FixtureAdapter(raw_markdown="# Batch\n\nNo formulas.\n", raw_structured={"pages": 1})
|
||||
|
||||
exit_code = main(["convert", str(source), "--out", str(tmp_path / "out")], adapter=adapter, clock=fixed_clock)
|
||||
@@ -144,9 +153,8 @@ def test_v1_fast_cli_batch_summary_matches_generated_outputs(tmp_path: Path, cap
|
||||
assert "converted: 2" in captured.out
|
||||
assert "failed: 0" in captured.out
|
||||
assert "warnings: 0" in captured.out
|
||||
assert (tmp_path / "out" / "a.md").exists()
|
||||
assert (tmp_path / "out" / "a.metadata.json").exists()
|
||||
assert (tmp_path / "out" / "a.report.md").exists()
|
||||
assert (tmp_path / "out" / "한글.md").exists()
|
||||
assert (tmp_path / "out" / "한글.metadata.json").exists()
|
||||
assert (tmp_path / "out" / "한글.report.md").exists()
|
||||
assert (tmp_path / "out" / "a" / "a_001.md").exists()
|
||||
assert (tmp_path / "out" / "a" / "a_report.md").exists()
|
||||
assert (tmp_path / "out" / "korean" / "korean_001.md").exists()
|
||||
assert (tmp_path / "out" / "korean" / "korean_report.md").exists()
|
||||
assert not list((tmp_path / "out").rglob("*.metadata.json"))
|
||||
|
||||
Reference in New Issue
Block a user