Files
PDFToMD/tests/test_pdf_splitter.py
2026-05-08 16:42:19 +09:00

63 lines
1.9 KiB
Python

from __future__ import annotations
from pathlib import Path
import pytest
from pypdf import PdfReader, PdfWriter
from pdf2md.pdf_splitter import PdfChunkError, count_pdf_pages, plan_pdf_chunks, write_pdf_chunk
def make_blank_pdf(path: Path, page_count: int) -> Path:
    """Write a PDF made of ``page_count`` blank pages to *path*.

    Any missing parent directories of *path* are created first. Every page
    is added with ``width=72`` and ``height=72``.

    Args:
        path: Destination file for the generated PDF.
        page_count: Number of blank pages to add.

    Returns:
        The same *path* that was passed in, for convenient chaining.
    """
    path.parent.mkdir(parents=True, exist_ok=True)

    blank_doc = PdfWriter()
    for _page in range(page_count):
        blank_doc.add_blank_page(width=72, height=72)

    with path.open("wb") as stream:
        blank_doc.write(stream)

    return path
@pytest.mark.parametrize(
    ("page_count", "expected_ranges"),
    [
        (1, [(1, 1)]),
        (20, [(1, 20)]),
        (21, [(1, 20), (21, 21)]),
        (40, [(1, 20), (21, 40)]),
        (41, [(1, 20), (21, 40), (41, 41)]),
    ],
)
def test_plan_pdf_chunks_uses_one_based_ranges_and_names(
    tmp_path: Path,
    page_count: int,
    expected_ranges: list[tuple[int, int]],
) -> None:
    """Check planned chunk page ranges and output filenames for a blank PDF.

    For each parametrized page count, a blank ``paper.pdf`` is planned with
    ``chunk_pages=20``. The test asserts that the reported page count
    matches, that each chunk's (start, end) pair equals the expected
    inclusive range, and that each chunk's filename follows the
    ``paper.part-NNN.pages-SSS-EEE.pdf`` pattern with parts numbered from 1.
    """
    source_pdf = make_blank_pdf(tmp_path / "paper.pdf", page_count)
    planned = plan_pdf_chunks(source_pdf, chunk_pages=20)

    assert count_pdf_pages(source_pdf) == page_count

    actual_ranges = [
        (part.source_page_start, part.source_page_end) for part in planned
    ]
    assert actual_ranges == expected_ranges

    # Filenames are derived from the expected ranges, with parts numbered from 1.
    expected_names = [
        f"paper.part-{index:03d}.pages-{start:03d}-{end:03d}.pdf"
        for index, (start, end) in enumerate(expected_ranges, start=1)
    ]
    actual_names = [part.output_filename for part in planned]
    assert actual_names == expected_names
def test_write_pdf_chunk_writes_expected_page_count(tmp_path: Path) -> None:
    """Check that writing the second planned chunk yields a 20-page PDF.

    A 41-page blank PDF is planned with ``chunk_pages=20``; the chunk at
    index 1 is written under ``tmp_path / "chunks"`` and read back with
    ``PdfReader`` to count its pages.
    """
    source_pdf = make_blank_pdf(tmp_path / "paper.pdf", 41)
    planned = plan_pdf_chunks(source_pdf, chunk_pages=20)

    # Index 1 is the second planned chunk.
    second_chunk = planned[1]
    destination = tmp_path / "chunks" / second_chunk.output_filename

    written = write_pdf_chunk(second_chunk, destination)

    assert written.exists()
    page_total = len(PdfReader(written).pages)
    assert page_total == 20
def test_plan_pdf_chunks_rejects_non_positive_chunk_size(tmp_path: Path) -> None:
    """Check that ``chunk_pages=0`` raises ``PdfChunkError``.

    The error message must match the pattern ``"positive integer"``.
    """
    source_pdf = make_blank_pdf(tmp_path / "paper.pdf", 1)

    with pytest.raises(PdfChunkError) as excinfo:
        plan_pdf_chunks(source_pdf, chunk_pages=0)

    # Same regex search against the exception text that `match=` performs.
    excinfo.match("positive integer")