Files
PDFToMD/tests/test_models.py
T
김경종 7e985ae94a add files
2026-04-30 17:05:19 +09:00

141 lines
3.8 KiB
Python

from __future__ import annotations
import pytest
from pdftomd import (
Asset,
AssetKind,
BlockRole,
BoundingBox,
ChunkMetadata,
DocumentBlock,
DocumentIdentity,
Figure,
Formula,
PageAnalysis,
PageRange,
Table,
)
def test_document_identity_slug_and_output_fields_are_deterministic() -> None:
    """Check the fields DocumentIdentity derives from a Windows-style PDF path."""
    source = r"C:\papers\MITC Korean Report 2026.pdf"
    identity = DocumentIdentity.from_path(source)

    # The original path is kept verbatim; stem and slug are derived from it.
    assert identity.source_path == source
    assert identity.stem == "MITC Korean Report 2026"
    assert identity.slug == "mitc-korean-report-2026"

    # Output naming: chunk 1 gets a zero-padded "_001" suffix; assets go to "images".
    assert identity.chunk_filename(1) == "mitc-korean-report-2026_001.md"
    assert identity.asset_dir == "images"
def test_non_ascii_document_identity_uses_stable_fallback_slug() -> None:
    """Check the slug produced for Korean filenames.

    The slug must start with "document-", be identical when the same filename
    is processed again, and differ between two different filenames.
    """
    korean_name = "한글 보고서.pdf"
    first = DocumentIdentity.from_path(korean_name)
    other = DocumentIdentity.from_path("다른 보고서.pdf")

    assert first.filename == korean_name
    assert first.slug.startswith("document-")

    # Rebuilding from the same filename must reproduce the same slug.
    rebuilt = DocumentIdentity.from_path(korean_name)
    assert first.slug == rebuilt.slug

    assert first.slug != other.slug
def test_page_range_invariants_and_helpers() -> None:
    """Check count, label and contains() on the page range 3 through 7."""
    pages = PageRange(start=3, end=7)

    assert pages.count == 5
    assert pages.label == "3-7"

    # Both endpoints are reported as contained; the page after the end is not.
    for endpoint in (3, 7):
        assert pages.contains(endpoint)
    assert not pages.contains(8)
@pytest.mark.parametrize(("start", "end"), [(0, 1), (4, 3)])
def test_page_range_rejects_invalid_bounds(start: int, end: int) -> None:
    """Check that PageRange raises ValueError for each parametrized pair.

    The cases are a start of 0 and an end that is smaller than the start.
    """
    bounds = {"start": start, "end": end}
    with pytest.raises(ValueError):
        PageRange(**bounds)
def test_block_formula_table_figure_and_asset_construction() -> None:
    """Build one of each model object for page 2 and check their derived fields."""
    # Shared fixtures: a box on page 2, a single-page range and the document identity.
    box = BoundingBox(page=2, x0=10.0, y0=20.0, x1=110.0, y1=80.0)
    single_page = PageRange(start=2, end=2)
    doc = DocumentIdentity.from_path("Example Paper.pdf")

    equation = Formula(
        id="eq-001",
        latex=r"E = mc^2",
        display=True,
        source_text="E = mc2",
        number="1",
    )
    grid = Table(
        id="tbl-001",
        rows=(("A", "B"), ("1", "2")),
        caption="Table 1. Values",
        number="1",
    )
    diagram = Figure(
        id="fig-001",
        caption="Figure 1. Diagram",
        number="1",
        asset_id="asset-001",
    )

    # The asset path is assembled from the identity's own directory/filename helpers.
    asset_path = f"{doc.asset_dir}/{doc.figure_asset_filename('1')}"
    image_asset = Asset(
        id="asset-001",
        kind=AssetKind.FIGURE,
        relative_path=asset_path,
        page=2,
        bbox=box,
        content_hash="abc123",
    )
    formula_block = DocumentBlock(
        id="block-001",
        role=BlockRole.FORMULA,
        page_range=single_page,
        bbox=box,
        text="E = mc^2",
        formula=equation,
    )
    chunk_meta = ChunkMetadata(
        index=1,
        document=doc,
        page_range=single_page,
        block_ids=("block-001",),
        asset_ids=("asset-001",),
    )
    page_report = PageAnalysis(
        page=2,
        text_length=200,
        image_count=1,
        has_text_layer=True,
        needs_ocr=False,
    )

    # Geometry derived from the box corners.
    assert box.width == 100.0
    assert box.height == 60.0

    # The block keeps the role and formula it was given.
    assert formula_block.role is BlockRole.FORMULA
    assert formula_block.formula == equation

    # Table rows are readable by index; the figure points at the asset by id.
    assert grid.rows[0] == ("A", "B")
    assert diagram.asset_id == image_asset.id
    assert image_asset.relative_path == "images/example-paper_fig-1.png"

    # Chunk naming follows the document slug.
    assert chunk_meta.filename == "example-paper_001.md"
    assert chunk_meta.slug == "example-paper"

    assert page_report.text_layer_quality == "text"
def test_bounding_box_rejects_invalid_coordinates() -> None:
    """Check that BoundingBox raises ValueError when x0 equals x1."""
    degenerate = {"page": 1, "x0": 10.0, "y0": 0.0, "x1": 10.0, "y1": 20.0}
    with pytest.raises(ValueError):
        BoundingBox(**degenerate)
def test_page_analysis_rejects_negative_counts() -> None:
    """Check that PageAnalysis raises ValueError when text_length is negative."""
    fields = {
        "page": 1,
        "text_length": -1,  # the only negative value in this input
        "image_count": 0,
        "has_text_layer": False,
        "needs_ocr": True,
    }
    with pytest.raises(ValueError):
        PageAnalysis(**fields)