add pdftomd
This commit is contained in:
@@ -0,0 +1,62 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from pypdf import PdfReader, PdfWriter
|
||||
|
||||
from pdf2md.pdf_splitter import PdfChunkError, count_pdf_pages, plan_pdf_chunks, write_pdf_chunk
|
||||
|
||||
|
||||
def make_blank_pdf(path: Path, page_count: int) -> Path:
    """Write a PDF of ``page_count`` blank 72x72 pages to *path*.

    Any missing parent directories of *path* are created first. The same
    *path* is returned so the helper can be used inline in a test.
    """
    target_dir = path.parent
    target_dir.mkdir(parents=True, exist_ok=True)

    pdf_writer = PdfWriter()
    pages_added = 0
    while pages_added < page_count:
        pdf_writer.add_blank_page(width=72, height=72)
        pages_added += 1

    with path.open("wb") as stream:
        pdf_writer.write(stream)

    return path
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ("page_count", "expected_ranges"),
    [
        (1, [(1, 1)]),
        (20, [(1, 20)]),
        (21, [(1, 20), (21, 21)]),
        (40, [(1, 20), (21, 40)]),
        (41, [(1, 20), (21, 40), (41, 41)]),
    ],
)
def test_plan_pdf_chunks_uses_one_based_ranges_and_names(
    tmp_path: Path,
    page_count: int,
    expected_ranges: list[tuple[int, int]],
) -> None:
    """Check planned page ranges and output filenames for 20-page chunks.

    Each case builds a blank PDF with ``page_count`` pages, plans chunks of
    20 pages, and compares the planned (start, end) pairs and filenames with
    the expected values.
    """
    source_pdf = make_blank_pdf(tmp_path / "paper.pdf", page_count)

    planned = plan_pdf_chunks(source_pdf, chunk_pages=20)

    # Sanity check that the fixture itself has the requested number of pages.
    assert count_pdf_pages(source_pdf) == page_count

    actual_ranges = []
    for planned_chunk in planned:
        actual_ranges.append(
            (planned_chunk.source_page_start, planned_chunk.source_page_end)
        )
    assert actual_ranges == expected_ranges

    actual_names = [planned_chunk.output_filename for planned_chunk in planned]
    # Part numbers in the expected names start at 1, and every number is
    # zero-padded to three digits.
    expected_names = []
    for part_number, (first_page, last_page) in enumerate(expected_ranges, start=1):
        expected_names.append(
            f"paper.part-{part_number:03d}.pages-{first_page:03d}-{last_page:03d}.pdf"
        )
    assert actual_names == expected_names
|
||||
|
||||
|
||||
def test_write_pdf_chunk_writes_expected_page_count(tmp_path: Path) -> None:
    """Write the second planned chunk of a 41-page PDF and check it has 20 pages."""
    source_pdf = make_blank_pdf(tmp_path / "paper.pdf", 41)
    planned = plan_pdf_chunks(source_pdf, chunk_pages=20)
    # Index 1 is the second planned chunk; the test expects it to be a full
    # 20-page chunk.
    second_chunk = planned[1]

    destination = tmp_path / "chunks" / second_chunk.output_filename
    written_path = write_pdf_chunk(second_chunk, destination)

    assert written_path.exists()
    written_pages = PdfReader(written_path).pages
    assert len(written_pages) == 20
|
||||
|
||||
|
||||
def test_plan_pdf_chunks_rejects_non_positive_chunk_size(tmp_path: Path) -> None:
    """Planning with ``chunk_pages=0`` must raise ``PdfChunkError``.

    The error message is expected to contain "positive integer".
    """
    single_page_pdf = make_blank_pdf(tmp_path / "paper.pdf", 1)
    invalid_chunk_size = 0

    with pytest.raises(PdfChunkError, match="positive integer"):
        plan_pdf_chunks(single_page_pdf, chunk_pages=invalid_chunk_size)
|
||||
Reference in New Issue
Block a user