"""Unit tests for the model types imported from ``pdftomd``."""

from __future__ import annotations

import pytest

from pdftomd import (
    Asset,
    AssetKind,
    BlockRole,
    BoundingBox,
    ChunkMetadata,
    DocumentBlock,
    DocumentIdentity,
    Figure,
    Formula,
    PageAnalysis,
    PageRange,
    Table,
)


def test_document_identity_slug_and_output_fields_are_deterministic() -> None:
    """A Windows-style path yields the expected stem, slug, and output names."""
    document = DocumentIdentity.from_path(r"C:\papers\MITC Korean Report 2026.pdf")

    assert document.source_path == r"C:\papers\MITC Korean Report 2026.pdf"
    assert document.stem == "MITC Korean Report 2026"
    assert document.slug == "mitc-korean-report-2026"
    assert document.chunk_filename(1) == "mitc-korean-report-2026_001.md"
    assert document.asset_dir == "images"


def test_non_ascii_document_identity_uses_stable_fallback_slug() -> None:
    """Non-ASCII names get a ``document-`` slug that is repeatable and distinct."""
    first = DocumentIdentity.from_path("한글 보고서.pdf")
    second = DocumentIdentity.from_path("다른 보고서.pdf")

    assert first.filename == "한글 보고서.pdf"
    assert first.slug.startswith("document-")
    # Same input must map to the same slug; different inputs must not collide.
    assert first.slug == DocumentIdentity.from_path("한글 보고서.pdf").slug
    assert first.slug != second.slug


def test_page_range_invariants_and_helpers() -> None:
    """``count``, ``label`` and ``contains`` treat both bounds as inclusive."""
    page_range = PageRange(start=3, end=7)

    assert page_range.count == 5
    assert page_range.label == "3-7"
    assert page_range.contains(3)
    assert page_range.contains(7)
    assert not page_range.contains(8)


@pytest.mark.parametrize(
    ("start", "end"),
    [(0, 1), (4, 3)],
)
def test_page_range_rejects_invalid_bounds(start: int, end: int) -> None:
    """A start of 0 and an end before the start both raise ``ValueError``."""
    with pytest.raises(ValueError):
        PageRange(start=start, end=end)


def test_block_formula_table_figure_and_asset_construction() -> None:
    """The model types can be built together and expose the expected fields."""
    bbox = BoundingBox(page=2, x0=10.0, y0=20.0, x1=110.0, y1=80.0)
    page_range = PageRange(start=2, end=2)
    identity = DocumentIdentity.from_path("Example Paper.pdf")

    formula = Formula(
        id="eq-001",
        latex=r"E = mc^2",
        display=True,
        source_text="E = mc2",
        number="1",
    )
    table = Table(
        id="tbl-001",
        rows=(("A", "B"), ("1", "2")),
        caption="Table 1. Values",
        number="1",
    )
    figure = Figure(
        id="fig-001",
        caption="Figure 1. Diagram",
        number="1",
        asset_id="asset-001",
    )
    asset = Asset(
        id="asset-001",
        kind=AssetKind.FIGURE,
        relative_path=f"{identity.asset_dir}/{identity.figure_asset_filename('1')}",
        page=2,
        bbox=bbox,
        content_hash="abc123",
    )
    block = DocumentBlock(
        id="block-001",
        role=BlockRole.FORMULA,
        page_range=page_range,
        bbox=bbox,
        text="E = mc^2",
        formula=formula,
    )
    chunk = ChunkMetadata(
        index=1,
        document=identity,
        page_range=page_range,
        block_ids=("block-001",),
        asset_ids=("asset-001",),
    )
    analysis = PageAnalysis(
        page=2,
        text_length=200,
        image_count=1,
        has_text_layer=True,
        needs_ocr=False,
    )

    assert bbox.width == 100.0
    assert bbox.height == 60.0
    assert block.role is BlockRole.FORMULA
    assert block.formula == formula
    assert table.rows[0] == ("A", "B")
    assert figure.asset_id == asset.id
    assert asset.relative_path == "images/example-paper_fig-1.png"
    assert chunk.filename == "example-paper_001.md"
    assert chunk.slug == "example-paper"
    assert analysis.text_layer_quality == "text"


def test_bounding_box_rejects_invalid_coordinates() -> None:
    """A box whose ``x0`` equals its ``x1`` raises ``ValueError``."""
    with pytest.raises(ValueError):
        BoundingBox(page=1, x0=10.0, y0=0.0, x1=10.0, y1=20.0)


def test_page_analysis_rejects_negative_counts() -> None:
    """A negative ``text_length`` raises ``ValueError``."""
    with pytest.raises(ValueError):
        PageAnalysis(
            page=1,
            text_length=-1,
            image_count=0,
            has_text_layer=False,
            needs_ocr=True,
        )