"""Tests for chunk planning and chunk writing in ``pdf2md.pdf_splitter``."""

from __future__ import annotations

from pathlib import Path

import pytest
from pypdf import PdfReader, PdfWriter

from pdf2md.pdf_splitter import (
    PdfChunkError,
    count_pdf_pages,
    plan_pdf_chunks,
    write_pdf_chunk,
)


def make_blank_pdf(path: Path, page_count: int) -> Path:
    """Write a PDF made of ``page_count`` blank 72x72 pages to ``path``.

    Missing parent directories are created first. The same ``path`` is
    returned so the call can be used inline as a fixture expression.
    """
    path.parent.mkdir(parents=True, exist_ok=True)

    blank_document = PdfWriter()
    for _page in range(page_count):
        blank_document.add_blank_page(width=72, height=72)

    with path.open("wb") as sink:
        blank_document.write(sink)

    return path


@pytest.mark.parametrize(
    "page_count, expected_ranges",
    [
        (1, [(1, 1)]),
        (20, [(1, 20)]),
        (21, [(1, 20), (21, 21)]),
        (40, [(1, 20), (21, 40)]),
        (41, [(1, 20), (21, 40), (41, 41)]),
    ],
)
def test_plan_pdf_chunks_uses_one_based_ranges_and_names(
    tmp_path: Path,
    page_count: int,
    expected_ranges: list[tuple[int, int]],
) -> None:
    """Planned chunks expose 1-based inclusive page ranges and matching names."""
    source_pdf = make_blank_pdf(tmp_path / "paper.pdf", page_count)

    planned = plan_pdf_chunks(source_pdf, chunk_pages=20)

    # Sanity-check the fixture itself before inspecting the plan.
    assert count_pdf_pages(source_pdf) == page_count

    actual_ranges = [
        (item.source_page_start, item.source_page_end) for item in planned
    ]
    assert actual_ranges == expected_ranges

    # Part numbers start at 1, and every numeric field is zero-padded to
    # three digits in the expected file name.
    expected_names = [
        f"paper.part-{index:03d}.pages-{start:03d}-{end:03d}.pdf"
        for index, (start, end) in enumerate(expected_ranges, start=1)
    ]
    actual_names = [item.output_filename for item in planned]
    assert actual_names == expected_names


def test_write_pdf_chunk_writes_expected_page_count(tmp_path: Path) -> None:
    """Writing the second chunk of a 41-page PDF yields a 20-page file."""
    source_pdf = make_blank_pdf(tmp_path / "paper.pdf", 41)
    planned = plan_pdf_chunks(source_pdf, chunk_pages=20)
    second_chunk = planned[1]
    destination = tmp_path / "chunks" / second_chunk.output_filename

    written = write_pdf_chunk(second_chunk, destination)

    assert written.exists()
    written_pages = PdfReader(written).pages
    assert len(written_pages) == 20


def test_plan_pdf_chunks_rejects_non_positive_chunk_size(tmp_path: Path) -> None:
    """A chunk size of zero is rejected with a ``PdfChunkError``."""
    source_pdf = make_blank_pdf(tmp_path / "paper.pdf", 1)

    with pytest.raises(PdfChunkError, match="positive integer"):
        plan_pdf_chunks(source_pdf, chunk_pages=0)