141 lines
3.8 KiB
Python
141 lines
3.8 KiB
Python
from __future__ import annotations
|
|
|
|
import pytest
|
|
|
|
from pdftomd import (
|
|
Asset,
|
|
AssetKind,
|
|
BlockRole,
|
|
BoundingBox,
|
|
ChunkMetadata,
|
|
DocumentBlock,
|
|
DocumentIdentity,
|
|
Figure,
|
|
Formula,
|
|
PageAnalysis,
|
|
PageRange,
|
|
Table,
|
|
)
|
|
|
|
|
|
def test_document_identity_slug_and_output_fields_are_deterministic() -> None:
    """Check the fields derived from a drive-letter path with spaces in the filename.

    The identity built by ``DocumentIdentity.from_path`` must echo the
    input path and expose the expected stem, slug, first chunk filename
    and asset directory.
    """
    source = r"C:\papers\MITC Korean Report 2026.pdf"
    identity = DocumentIdentity.from_path(source)

    # The path is stored exactly as it was passed in.
    assert identity.source_path == source

    # Stem keeps the original casing and spaces; the slug is lower-case and hyphenated.
    assert identity.stem == "MITC Korean Report 2026"
    assert identity.slug == "mitc-korean-report-2026"

    # Output naming: chunk 1 is rendered as a zero-padded "_001" suffix.
    assert identity.chunk_filename(1) == "mitc-korean-report-2026_001.md"
    assert identity.asset_dir == "images"
|
|
|
|
|
|
def test_non_ascii_document_identity_uses_stable_fallback_slug() -> None:
    """Check the slug produced for filenames written in Hangul.

    The slug must start with ``document-``, be identical when the same
    name is processed twice, and differ between two different names.
    """
    hangul_name = "한글 보고서.pdf"
    identity = DocumentIdentity.from_path(hangul_name)
    other_identity = DocumentIdentity.from_path("다른 보고서.pdf")

    # The filename itself is preserved untouched.
    assert identity.filename == hangul_name
    assert identity.slug.startswith("document-")

    # Same input -> same slug; different input -> different slug.
    repeated = DocumentIdentity.from_path(hangul_name)
    assert identity.slug == repeated.slug
    assert identity.slug != other_identity.slug
|
|
|
|
|
|
def test_page_range_invariants_and_helpers() -> None:
    """Check ``count``, ``label`` and ``contains`` on the range 3..7."""
    span = PageRange(start=3, end=7)

    assert span.count == 5
    assert span.label == "3-7"

    # Both endpoints are reported as contained ...
    for endpoint in (3, 7):
        assert span.contains(endpoint)
    # ... while the page just past the end is not.
    assert not span.contains(8)
|
|
|
|
|
|
@pytest.mark.parametrize(("start", "end"), [(0, 1), (4, 3)])
def test_page_range_rejects_invalid_bounds(start: int, end: int) -> None:
    """Constructing a ``PageRange`` from either parametrized pair must raise ``ValueError``."""
    # Cases exercised: a start of 0, and an end smaller than the start.
    bounds = {"start": start, "end": end}
    with pytest.raises(ValueError):
        PageRange(**bounds)
|
|
|
|
|
|
def test_block_formula_table_figure_and_asset_construction() -> None:
    """Build one instance of each model type and check a few derived values.

    Covers ``BoundingBox``, ``PageRange``, ``DocumentIdentity``, ``Formula``,
    ``Table``, ``Figure``, ``Asset``, ``DocumentBlock``, ``ChunkMetadata`` and
    ``PageAnalysis``; the assertions at the end read back stored fields and
    computed properties.
    """
    # --- shared fixtures -------------------------------------------------
    box = BoundingBox(page=2, x0=10.0, y0=20.0, x1=110.0, y1=80.0)
    single_page = PageRange(start=2, end=2)
    identity = DocumentIdentity.from_path("Example Paper.pdf")

    # --- content objects -------------------------------------------------
    equation = Formula(
        id="eq-001", latex=r"E = mc^2", display=True, source_text="E = mc2", number="1"
    )
    grid = Table(
        id="tbl-001", rows=(("A", "B"), ("1", "2")), caption="Table 1. Values", number="1"
    )
    diagram = Figure(
        id="fig-001", caption="Figure 1. Diagram", number="1", asset_id="asset-001"
    )
    image_asset = Asset(
        id="asset-001",
        kind=AssetKind.FIGURE,
        # Path is composed from the identity's asset dir and its figure filename helper.
        relative_path=f"{identity.asset_dir}/{identity.figure_asset_filename('1')}",
        page=2,
        bbox=box,
        content_hash="abc123",
    )
    formula_block = DocumentBlock(
        id="block-001",
        role=BlockRole.FORMULA,
        page_range=single_page,
        bbox=box,
        text="E = mc^2",
        formula=equation,
    )

    # --- chunk / page level metadata -------------------------------------
    chunk_meta = ChunkMetadata(
        index=1,
        document=identity,
        page_range=single_page,
        block_ids=("block-001",),
        asset_ids=("asset-001",),
    )
    page_info = PageAnalysis(
        page=2, text_length=200, image_count=1, has_text_layer=True, needs_ocr=False
    )

    # Geometry: 110 - 10 wide, 80 - 20 tall.
    assert box.width == 100.0
    assert box.height == 60.0

    # The block keeps the role and formula it was given.
    assert formula_block.role is BlockRole.FORMULA
    assert formula_block.formula == equation

    # Table rows are indexable; the figure points at the asset by id.
    assert grid.rows[0] == ("A", "B")
    assert diagram.asset_id == image_asset.id
    assert image_asset.relative_path == "images/example-paper_fig-1.png"

    # Chunk naming is derived from the document identity.
    assert chunk_meta.filename == "example-paper_001.md"
    assert chunk_meta.slug == "example-paper"

    assert page_info.text_layer_quality == "text"
|
|
|
|
|
|
def test_bounding_box_rejects_invalid_coordinates() -> None:
    """Constructing a ``BoundingBox`` with ``x0 == x1`` must raise ``ValueError``."""
    # x0 and x1 are both 10.0, so the box has no horizontal extent.
    degenerate = {"page": 1, "x0": 10.0, "y0": 0.0, "x1": 10.0, "y1": 20.0}
    with pytest.raises(ValueError):
        BoundingBox(**degenerate)
|
|
|
|
|
|
def test_page_analysis_rejects_negative_counts() -> None:
    """Constructing a ``PageAnalysis`` with ``text_length=-1`` must raise ``ValueError``."""
    # Only text_length is negative; the remaining fields are ordinary values.
    fields = {
        "page": 1,
        "text_length": -1,
        "image_count": 0,
        "has_text_layer": False,
        "needs_ocr": True,
    }
    with pytest.raises(ValueError):
        PageAnalysis(**fields)
|