add files
This commit is contained in:
@@ -0,0 +1,140 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from pdftomd import (
|
||||
Asset,
|
||||
AssetKind,
|
||||
BlockRole,
|
||||
BoundingBox,
|
||||
ChunkMetadata,
|
||||
DocumentBlock,
|
||||
DocumentIdentity,
|
||||
Figure,
|
||||
Formula,
|
||||
PageAnalysis,
|
||||
PageRange,
|
||||
Table,
|
||||
)
|
||||
|
||||
|
||||
def test_document_identity_slug_and_output_fields_are_deterministic() -> None:
    """Check the fields derived from a Windows-style PDF path.

    The identity must echo the source path unchanged and expose the
    expected stem, slug, first chunk filename and asset directory.
    """
    source = r"C:\papers\MITC Korean Report 2026.pdf"
    identity = DocumentIdentity.from_path(source)

    # The original path is preserved verbatim.
    assert identity.source_path == source
    # Stem keeps the original casing and spaces; slug is lower-case, hyphenated.
    assert identity.stem == "MITC Korean Report 2026"
    assert identity.slug == "mitc-korean-report-2026"
    # Chunk filenames are "<slug>_<zero-padded index>.md".
    assert identity.chunk_filename(1) == "mitc-korean-report-2026_001.md"
    assert identity.asset_dir == "images"
|
||||
|
||||
|
||||
def test_non_ascii_document_identity_uses_stable_fallback_slug() -> None:
    """Check the fallback slug used for a non-ASCII (Korean) filename.

    The slug must start with ``document-``, be identical when the same
    filename is processed again, and differ for a different filename.
    """
    korean_name = "한글 보고서.pdf"
    other_name = "다른 보고서.pdf"

    first = DocumentIdentity.from_path(korean_name)
    second = DocumentIdentity.from_path(other_name)

    assert first.filename == korean_name
    assert first.slug.startswith("document-")

    # Same input must yield the same slug on a fresh construction.
    rebuilt = DocumentIdentity.from_path(korean_name)
    assert first.slug == rebuilt.slug

    # Distinct inputs must not collide.
    assert first.slug != second.slug
|
||||
|
||||
|
||||
def test_page_range_invariants_and_helpers() -> None:
    """Check ``count``, ``label`` and ``contains`` on the page range 3-7."""
    span = PageRange(start=3, end=7)

    # Both endpoints are counted: pages 3, 4, 5, 6, 7.
    assert span.count == 5
    assert span.label == "3-7"

    # Membership includes both bounds and excludes the page after the end.
    assert span.contains(3)
    assert span.contains(7)
    assert not span.contains(8)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(("start", "end"), [(0, 1), (4, 3)])
def test_page_range_rejects_invalid_bounds(start: int, end: int) -> None:
    """Check that ``PageRange`` raises ``ValueError`` for bad bounds.

    The two cases are a start of 0 and an end smaller than the start.
    """
    with pytest.raises(ValueError):
        PageRange(start=start, end=end)
|
||||
|
||||
|
||||
def test_block_formula_table_figure_and_asset_construction() -> None:
    """Build one of each model object and check their derived fields.

    Constructs a bounding box, page range, document identity, formula,
    table, figure, asset, block, chunk metadata and page analysis, then
    verifies the values each one exposes.
    """
    # Shared building blocks reused by several objects below.
    box = BoundingBox(page=2, x0=10.0, y0=20.0, x1=110.0, y1=80.0)
    pages = PageRange(start=2, end=2)
    doc = DocumentIdentity.from_path("Example Paper.pdf")

    equation = Formula(
        id="eq-001",
        latex=r"E = mc^2",
        display=True,
        source_text="E = mc2",
        number="1",
    )
    grid = Table(
        id="tbl-001",
        rows=(("A", "B"), ("1", "2")),
        caption="Table 1. Values",
        number="1",
    )
    diagram = Figure(
        id="fig-001",
        caption="Figure 1. Diagram",
        number="1",
        asset_id="asset-001",
    )

    # Assemble the asset path from the identity's directory and figure filename.
    asset_dir = doc.asset_dir
    figure_file = doc.figure_asset_filename("1")
    image = Asset(
        id="asset-001",
        kind=AssetKind.FIGURE,
        relative_path=f"{asset_dir}/{figure_file}",
        page=2,
        bbox=box,
        content_hash="abc123",
    )

    formula_block = DocumentBlock(
        id="block-001",
        role=BlockRole.FORMULA,
        page_range=pages,
        bbox=box,
        text="E = mc^2",
        formula=equation,
    )
    chunk_meta = ChunkMetadata(
        index=1,
        document=doc,
        page_range=pages,
        block_ids=("block-001",),
        asset_ids=("asset-001",),
    )
    page_stats = PageAnalysis(
        page=2,
        text_length=200,
        image_count=1,
        has_text_layer=True,
        needs_ocr=False,
    )

    # Geometry: width = x1 - x0, height = y1 - y0.
    assert box.width == 100.0
    assert box.height == 60.0

    # The block keeps the role and formula it was given.
    assert formula_block.role is BlockRole.FORMULA
    assert formula_block.formula == equation

    assert grid.rows[0] == ("A", "B")

    # The figure refers to the asset by id.
    assert diagram.asset_id == image.id
    assert image.relative_path == "images/example-paper_fig-1.png"

    # Chunk metadata exposes the filename and slug of its document.
    assert chunk_meta.filename == "example-paper_001.md"
    assert chunk_meta.slug == "example-paper"

    assert page_stats.text_layer_quality == "text"
|
||||
|
||||
|
||||
def test_bounding_box_rejects_invalid_coordinates() -> None:
    """Check that ``BoundingBox`` raises ``ValueError`` when x0 equals x1."""
    # Both horizontal edges share one value, so the box has no width.
    shared_x = 10.0

    with pytest.raises(ValueError):
        BoundingBox(page=1, x0=shared_x, y0=0.0, x1=shared_x, y1=20.0)
|
||||
|
||||
|
||||
def test_page_analysis_rejects_negative_counts() -> None:
    """Check that ``PageAnalysis`` raises ``ValueError`` for a negative text length."""
    # Only text_length is negative; the other fields are ordinary values.
    invalid_fields = {
        "page": 1,
        "text_length": -1,
        "image_count": 0,
        "has_text_layer": False,
        "needs_ocr": True,
    }

    with pytest.raises(ValueError):
        PageAnalysis(**invalid_fields)
|
||||
Reference in New Issue
Block a user