remove files
This commit is contained in:
@@ -1,11 +0,0 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
# ROOT is the directory one level above the folder that holds this file;
# SRC is its "src" child.
ROOT = Path(__file__).resolve().parents[1]
SRC = ROOT.joinpath("src")

# Put SRC at the front of the import path, but only when it is not already
# listed, so importing this module twice does not add a duplicate entry.
if sys.path.count(str(SRC)) == 0:
    sys.path.insert(0, str(SRC))
|
||||
@@ -1,140 +0,0 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from pdftomd import (
|
||||
Asset,
|
||||
AssetKind,
|
||||
BlockRole,
|
||||
BoundingBox,
|
||||
ChunkMetadata,
|
||||
DocumentBlock,
|
||||
DocumentIdentity,
|
||||
Figure,
|
||||
Formula,
|
||||
PageAnalysis,
|
||||
PageRange,
|
||||
Table,
|
||||
)
|
||||
|
||||
|
||||
def test_document_identity_slug_and_output_fields_are_deterministic() -> None:
    """Check the fields derived from a Windows-style source path."""
    source = r"C:\papers\MITC Korean Report 2026.pdf"
    identity = DocumentIdentity.from_path(source)

    # The source path is expected back unchanged; stem and slug derive from it.
    assert identity.source_path == source
    assert identity.stem == "MITC Korean Report 2026"
    assert identity.slug == "mitc-korean-report-2026"
    # Output naming: chunk file name for index 1 and the asset directory.
    assert identity.chunk_filename(1) == "mitc-korean-report-2026_001.md"
    assert identity.asset_dir == "images"
|
||||
|
||||
|
||||
def test_non_ascii_document_identity_uses_stable_fallback_slug() -> None:
    """Expect Korean file names to get a repeatable ``document-`` prefixed slug."""
    korean_name = "한글 보고서.pdf"
    identity = DocumentIdentity.from_path(korean_name)
    other_identity = DocumentIdentity.from_path("다른 보고서.pdf")

    assert identity.filename == korean_name
    assert identity.slug.startswith("document-")
    # The same input must yield the same slug; a different name must not.
    assert identity.slug == DocumentIdentity.from_path(korean_name).slug
    assert identity.slug != other_identity.slug
|
||||
|
||||
|
||||
def test_page_range_invariants_and_helpers() -> None:
    """Exercise ``count``, ``label`` and ``contains`` on the range 3..7."""
    pages = PageRange(start=3, end=7)

    assert pages.count == 5
    assert pages.label == "3-7"
    # Both end points count as contained; the page after ``end`` does not.
    for inside in (3, 7):
        assert pages.contains(inside)
    assert not pages.contains(8)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ("start", "end"),
    [
        (0, 1),
        (4, 3),
    ],
)
def test_page_range_rejects_invalid_bounds(start: int, end: int) -> None:
    """Expect ``ValueError`` when ``PageRange`` is built from each bad pair."""
    bounds = {"start": start, "end": end}
    with pytest.raises(ValueError):
        PageRange(**bounds)
|
||||
|
||||
|
||||
def test_block_formula_table_figure_and_asset_construction() -> None:
    """Construct one object of each model type and check selected fields."""
    box = BoundingBox(page=2, x0=10.0, y0=20.0, x1=110.0, y1=80.0)
    pages = PageRange(start=2, end=2)
    doc = DocumentIdentity.from_path("Example Paper.pdf")

    equation = Formula(
        id="eq-001",
        latex=r"E = mc^2",
        display=True,
        source_text="E = mc2",
        number="1",
    )
    grid = Table(
        id="tbl-001",
        rows=(("A", "B"), ("1", "2")),
        caption="Table 1. Values",
        number="1",
    )
    diagram = Figure(
        id="fig-001",
        caption="Figure 1. Diagram",
        number="1",
        asset_id="asset-001",
    )

    # The asset path is assembled from the identity's own naming helpers.
    asset_dir = doc.asset_dir
    figure_file = doc.figure_asset_filename("1")
    image = Asset(
        id="asset-001",
        kind=AssetKind.FIGURE,
        relative_path=f"{asset_dir}/{figure_file}",
        page=2,
        bbox=box,
        content_hash="abc123",
    )
    formula_block = DocumentBlock(
        id="block-001",
        role=BlockRole.FORMULA,
        page_range=pages,
        bbox=box,
        text="E = mc^2",
        formula=equation,
    )
    chunk_meta = ChunkMetadata(
        index=1,
        document=doc,
        page_range=pages,
        block_ids=("block-001",),
        asset_ids=("asset-001",),
    )
    page_facts = PageAnalysis(
        page=2,
        text_length=200,
        image_count=1,
        has_text_layer=True,
        needs_ocr=False,
    )

    # Geometry derived from the box corners.
    assert box.width == 100.0
    assert box.height == 60.0
    # The block keeps the role and formula it was given.
    assert formula_block.role is BlockRole.FORMULA
    assert formula_block.formula == equation
    assert grid.rows[0] == ("A", "B")
    # Figure and asset are linked through the shared asset id.
    assert diagram.asset_id == image.id
    assert image.relative_path == "images/example-paper_fig-1.png"
    # Chunk naming comes from the document identity.
    assert chunk_meta.filename == "example-paper_001.md"
    assert chunk_meta.slug == "example-paper"
    assert page_facts.text_layer_quality == "text"
|
||||
|
||||
|
||||
def test_bounding_box_rejects_invalid_coordinates() -> None:
    """Expect ``ValueError`` for a box whose ``x0`` equals its ``x1``."""
    zero_width = {"page": 1, "x0": 10.0, "y0": 0.0, "x1": 10.0, "y1": 20.0}
    with pytest.raises(ValueError):
        BoundingBox(**zero_width)
|
||||
|
||||
|
||||
def test_page_analysis_rejects_negative_counts() -> None:
    """Expect ``ValueError`` when ``text_length`` is negative."""
    bad_facts = {
        "page": 1,
        "text_length": -1,
        "image_count": 0,
        "has_text_layer": False,
        "needs_ocr": True,
    }
    with pytest.raises(ValueError):
        PageAnalysis(**bad_facts)
|
||||
@@ -1,49 +0,0 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from pdftomd.options import ConversionOptions, FormulaParser, RuntimeMode
|
||||
|
||||
|
||||
def test_conversion_options_defaults_match_project_policy() -> None:
    """Pin the values of a ``ConversionOptions`` built with no arguments."""
    defaults = ConversionOptions()

    # Enum-valued defaults are compared by identity.
    assert defaults.runtime is RuntimeMode.CUDA
    assert defaults.formula_parser is FormulaParser.NOUGAT
    assert defaults.chunk_target_pages == 20
    assert defaults.output_dir == Path("output")
    # The flags must be the real bool singletons, not merely truthy/falsy.
    assert defaults.resume is False
    assert defaults.write_logs is True
|
||||
|
||||
|
||||
def test_runtime_modes_express_cuda_fail_fast_and_auto_fallback() -> None:
    """Check the CUDA / CPU-fallback flags exposed by each runtime mode."""
    cuda, auto, cpu = RuntimeMode.CUDA, RuntimeMode.AUTO, RuntimeMode.CPU

    # CUDA mode: needs CUDA and offers no CPU fallback.
    assert cuda.requires_cuda
    assert not cuda.allows_cpu_fallback
    # AUTO may fall back to CPU; CPU mode never requires CUDA.
    assert auto.allows_cpu_fallback
    assert not cpu.requires_cuda
|
||||
|
||||
|
||||
def test_conversion_options_validate_chunk_size_and_formula_boundary() -> None:
    """Check chunk-size validation and that unrelated attributes are absent."""
    # A zero chunk size must be rejected with a message naming the field.
    with pytest.raises(ValueError, match="chunk_target_pages"):
        ConversionOptions(chunk_target_pages=0)

    marker_options = ConversionOptions(formula_parser=FormulaParser.MARKER)
    assert marker_options.formula_parser is FormulaParser.MARKER
    # The options object must not carry these attributes.
    for absent in ("pyqt", "api_url"):
        assert not hasattr(marker_options, absent)
|
||||
|
||||
|
||||
def test_conversion_options_normalize_optional_paths(tmp_path: Path) -> None:
    """Check that path-valued options are returned equal to what was passed."""
    out_dir = tmp_path / "out"
    cache_dir = tmp_path / ".models"
    logs_dir = tmp_path / "logs"

    configured = ConversionOptions(
        output_dir=out_dir,
        nougat_command=tmp_path / "venv" / "Scripts" / "nougat.exe",
        model_cache_dir=cache_dir,
        log_dir=logs_dir,
    )

    assert configured.output_dir == out_dir
    # Only presence is checked for the command, not its exact value.
    assert configured.nougat_command is not None
    assert configured.model_cache_dir == cache_dir
    assert configured.log_dir == logs_dir
|
||||
@@ -1,67 +0,0 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from pdftomd.models import DocumentIdentity
|
||||
from pdftomd.paths import (
|
||||
OutputBundlePaths,
|
||||
document_identity_from_pdf,
|
||||
make_anchor,
|
||||
normalize_pdf_path,
|
||||
)
|
||||
|
||||
|
||||
def test_normalize_pdf_path_accepts_korean_and_spaced_paths(tmp_path: Path) -> None:
    """Normalize a PDF that sits in a Korean-named folder with spaces in its name."""
    folder = tmp_path / "한글 경로"
    pdf_file = folder / "My Report 2026.pdf"
    folder.mkdir()
    pdf_file.write_bytes(b"%PDF-1.7\n")

    result = normalize_pdf_path(pdf_file)

    assert result.is_absolute()
    assert result.name == "My Report 2026.pdf"
|
||||
|
||||
|
||||
def test_normalize_pdf_path_rejects_non_pdf_files(tmp_path: Path) -> None:
    """Expect a ``ValueError`` mentioning "PDF" for an existing ``.txt`` file."""
    not_a_pdf = tmp_path / "document.txt"
    not_a_pdf.write_text("not a pdf", encoding="utf-8")

    with pytest.raises(ValueError, match="PDF"):
        normalize_pdf_path(not_a_pdf)
|
||||
|
||||
|
||||
def test_document_identity_from_pdf_uses_stable_slug(tmp_path: Path) -> None:
    """Build an identity twice from the same Korean-named PDF and compare."""
    pdf_file = tmp_path / "한글 보고서.pdf"
    pdf_file.write_bytes(b"%PDF-1.7\n")

    identity = document_identity_from_pdf(pdf_file)
    repeat = document_identity_from_pdf(pdf_file)

    assert identity.filename == "한글 보고서.pdf"
    assert identity.slug == repeat.slug
    assert identity.slug.startswith("document-")
    # The recorded source path is the string form of the normalized path.
    assert identity.source_path == str(normalize_pdf_path(pdf_file))
|
||||
|
||||
|
||||
def test_output_bundle_paths_keep_document_and_runtime_artifacts_separate(tmp_path: Path) -> None:
    """Check every path exposed by the bundle for "Example Paper.pdf"."""
    identity = DocumentIdentity.from_path("Example Paper.pdf")
    paths = OutputBundlePaths.from_document(tmp_path, identity)

    doc_dir = tmp_path / "example-paper"
    img_dir = doc_dir / "images"

    # Document-facing output lives under <root>/<slug>.
    assert paths.document_dir == doc_dir
    assert paths.images_dir == img_dir
    assert paths.chunk_path(1) == doc_dir / "example-paper_001.md"
    assert paths.figure_asset_path("1") == img_dir / "example-paper_fig-1.png"
    # Runtime artifacts live under a separate <root>/.pdftomd-runtime/<slug>.
    assert paths.runtime_dir == tmp_path / ".pdftomd-runtime" / "example-paper"
    assert paths.log_path.name == "conversion.log"
    assert paths.resume_state_path.name == "resume-state.json"
    # The runtime directory must not be an ancestor of the document directory.
    assert paths.runtime_dir not in paths.document_dir.parents
|
||||
|
||||
|
||||
def test_make_anchor_is_deterministic_and_validates_kind() -> None:
    """Check two anchor outputs and the rejection of an empty first argument."""
    expected_anchors = {
        ("Figure", "2 A"): "figure-2-a",
        ("Equation", "식 3"): "equation-3",
    }
    for (first_arg, second_arg), anchor in expected_anchors.items():
        assert make_anchor(first_arg, second_arg) == anchor

    # An empty first argument must raise with a message mentioning "kind".
    with pytest.raises(ValueError, match="kind"):
        make_anchor("", "1")
|
||||
@@ -1,93 +0,0 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
|
||||
from pdftomd.models import PageAnalysis, PageRange
|
||||
from pdftomd.preanalysis import analyze_pdf, is_ocr_candidate, plan_page_chunks
|
||||
|
||||
|
||||
# Directory one level above the folder that holds this file.
ROOT = Path(__file__).resolve().parents[1]
# Location of the sample metadata JSON under ROOT.
METADATA_PATH = ROOT.joinpath("samples", "metadata.json")
|
||||
|
||||
|
||||
def _metadata_samples() -> list[dict]:
    """Return the ``"samples"`` entry of the UTF-8 JSON file at ``METADATA_PATH``."""
    raw_text = METADATA_PATH.read_text(encoding="utf-8")
    parsed = json.loads(raw_text)
    return parsed["samples"]
|
||||
|
||||
|
||||
def _sample_with(**traits: object) -> dict:
    """Return the first metadata sample whose traits equal every keyword given.

    Trait lookups use ``dict.get``, so a trait missing from a sample compares
    as ``None``.

    Raises:
        AssertionError: if no sample matches all requested trait values.
    """
    for candidate in _metadata_samples():
        recorded = candidate["traits"]
        for name, wanted in traits.items():
            if recorded.get(name) == wanted:
                continue
            break  # first mismatch rules this sample out
        else:
            # No mismatch was found (also the case when no traits were given).
            return candidate
    raise AssertionError(f"no sample matched traits: {traits}")
|
||||
|
||||
|
||||
def test_analyze_text_heavy_sample_returns_page_facts_from_metadata() -> None:
    """Analyze a good-text-layer, non-mixed sample and check its page facts."""
    sample = _sample_with(text_layer_quality="good", mixed_scanned_text_pages=False)

    analysis = analyze_pdf(ROOT / sample["path"])

    # Page totals must agree with the metadata entry.
    expected_pages = sample["page_count"]
    assert analysis.page_count == expected_pages
    assert len(analysis.pages) == expected_pages
    assert all(isinstance(entry, PageAnalysis) for entry in analysis.pages)

    # The first page is numbered 1, has substantial text and needs no OCR.
    first_page = analysis.pages[0]
    assert first_page.page == 1
    assert first_page.text_length > 1000
    assert first_page.has_text_layer
    assert not first_page.needs_ocr
|
||||
|
||||
|
||||
def test_analyze_mixed_scanned_risk_sample_marks_metadata_scanned_pages() -> None:
    """Expect every metadata-listed scanned page to be flagged as needing OCR."""
    sample = _sample_with(mixed_scanned_text_pages=True)

    analysis = analyze_pdf(ROOT / sample["path"])
    flagged_pages = set()
    for entry in analysis.pages:
        if entry.needs_ocr:
            flagged_pages.add(entry.page)

    assert analysis.page_count == sample["page_count"]
    # Extra flagged pages are allowed; missing ones are not.
    assert set(sample["traits"]["scanned_pages"]).issubset(flagged_pages)
    assert any(entry.image_count > 0 for entry in analysis.pages)
|
||||
|
||||
|
||||
def test_analyze_pdf_accepts_korean_pathlib_path(tmp_path: Path) -> None:
    """Analyze a sample PDF after copying it into a Korean-named directory."""
    sample = _sample_with(has_korean_path=True, text_layer_quality="good")
    original_pdf = ROOT / sample["path"]

    korean_dir = tmp_path / "한글 경로"
    korean_dir.mkdir()
    copied_pdf = korean_dir / original_pdf.name
    shutil.copyfile(original_pdf, copied_pdf)

    analysis = analyze_pdf(copied_pdf)

    assert analysis.page_count == sample["page_count"]
    assert analysis.pages[0].has_text_layer
|
||||
|
||||
|
||||
def test_ocr_candidate_logic_is_deterministic() -> None:
    """Check ``is_ocr_candidate`` against a fixed table, evaluated twice."""
    # Each row: (text_length, image_count, expected result).
    table = [
        (0, 0, True),
        (0, 2, True),
        (40, 0, False),
        (199, 1, True),
        (200, 1, False),
        (1000, 8, False),
    ]
    inputs = [(text_length, image_count) for text_length, image_count, _ in table]
    expected = [outcome for _, _, outcome in table]

    first_pass = [is_ocr_candidate(*pair) for pair in inputs]
    second_pass = [is_ocr_candidate(*pair) for pair in inputs]

    assert first_pass == expected
    # A second evaluation over the same inputs must give the same answers.
    assert first_pass == second_pass
|
||||
|
||||
|
||||
def test_chunk_candidates_are_twenty_page_ranges_within_bounds() -> None:
    """Expect 76 pages to split into three 20-page ranges plus a 16-page tail."""
    total_pages = 76
    ranges = plan_page_chunks(total_pages)

    expected_ranges = (
        PageRange(1, 20),
        PageRange(21, 40),
        PageRange(41, 60),
        PageRange(61, 76),
    )
    assert ranges == expected_ranges
    assert all(page_range.end <= total_pages for page_range in ranges)
    # Zero pages yields an empty tuple.
    assert plan_page_chunks(0) == ()
|
||||
@@ -1,166 +0,0 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from pdftomd.models import Asset, AssetKind
|
||||
from pdftomd.quality import (
|
||||
validate_caption_reference_anchors,
|
||||
validate_chunk_frontmatter,
|
||||
validate_image_links,
|
||||
validate_latex_environments,
|
||||
validate_markdown_quality,
|
||||
validate_math_delimiters,
|
||||
validate_tables,
|
||||
)
|
||||
|
||||
|
||||
def messages(issues: list[object]) -> list[str]:
    """Return the ``message`` attribute of each issue, in the order given."""
    collected: list[str] = []
    for issue in issues:
        collected.append(getattr(issue, "message"))
    return collected
|
||||
|
||||
|
||||
def test_math_delimiters_accept_inline_and_block_math() -> None:
    """Expect no issues for balanced inline ``$…$`` and block ``$$…$$`` math."""
    document_lines = [
        "Inline energy $E = mc^2$ is preserved.",
        "",
        "$$",
        r"\int_0^1 x^2 dx",
        "$$",
    ]
    document = "\n".join(document_lines)

    assert validate_math_delimiters(document) == []
|
||||
|
||||
|
||||
def test_math_delimiters_report_actionable_unclosed_inline_math() -> None:
    """Expect one issue on line 1 for an inline ``$`` that is never closed."""
    text = "The expression $E = mc^2 is missing a close."

    found = validate_math_delimiters(text)

    assert len(found) == 1
    issue = found[0]
    assert "Unclosed inline math delimiter" in issue.message
    assert issue.line == 1
    # The message should show the offending delimiter itself.
    assert "$" in issue.message
|
||||
|
||||
|
||||
def test_math_delimiters_report_actionable_unclosed_block_math() -> None:
    """Expect one issue pointing at the line of the unclosed ``$$`` opener."""
    text = "\n".join(["Before", "$$", "a^2 + b^2 = c^2", "After"])

    found = validate_math_delimiters(text)

    assert len(found) == 1
    issue = found[0]
    assert "Unclosed block math delimiter" in issue.message
    # The opener sits on the second line of the input.
    assert issue.line == 2
|
||||
|
||||
|
||||
def test_latex_environment_pairs_accept_nested_matching_pairs() -> None:
    """Expect no issues when a ``matrix`` is nested inside ``aligned``."""
    # Leading and trailing empty entries give the text its surrounding newlines.
    document = "\n".join(
        [
            "",
            "$$",
            r"\begin{aligned}",
            r"a &= b \\",
            r"\begin{matrix}1 & 2\end{matrix}",
            r"\end{aligned}",
            "$$",
            "",
        ]
    )

    assert validate_latex_environments(document) == []
|
||||
|
||||
|
||||
def test_latex_environment_pairs_report_mismatch() -> None:
    """Expect one issue naming both environments of a mismatched pair."""
    found = validate_latex_environments(r"\begin{aligned} x \end{matrix}")

    assert len(found) == 1
    text = found[0].message
    assert "LaTeX environment mismatch" in text
    # Both the opened and the closing environment names must be reported.
    assert "aligned" in text
    assert "matrix" in text
|
||||
|
||||
|
||||
def test_image_links_validate_filesystem_and_modeled_assets(tmp_path: Path) -> None:
    """Expect exactly one issue, for the link that is neither on disk nor modeled.

    One image file is created under ``tmp_path / "images"`` and described by an
    ``Asset``; the markdown links to that file and to a second, absent one.
    """
    image_dir = tmp_path / "images"
    image_dir.mkdir()
    (image_dir / "paper_fig-1.png").write_bytes(b"png")
    asset = Asset(
        id="asset-001",
        kind=AssetKind.FIGURE,
        relative_path="images/paper_fig-1.png",
        page=1,
    )
    # The document must contain image links for the validator to inspect: one
    # that resolves and one that does not. With no links at all, the expected
    # "images/missing.png" issue below could never be produced.
    markdown = "\n".join(
        [
            "![Figure 1](images/paper_fig-1.png)",
            "![Missing](images/missing.png)",
        ]
    )

    issues = validate_image_links(markdown, base_dir=tmp_path, assets=[asset])

    assert messages(issues) == [
        "Image link target does not exist on disk and is not present in modeled assets: images/missing.png"
    ]
|
||||
|
||||
|
||||
def test_simple_markdown_table_parseability() -> None:
    """Expect no issues for a 2-column table with a header and two rows."""
    table_text = (
        "| A | B |\n"
        "| --- | --- |\n"
        "| 1 | 2 |\n"
        "| 3 | 4 |"
    )

    assert validate_tables(table_text) == []
|
||||
|
||||
|
||||
def test_markdown_table_reports_row_width_mismatch() -> None:
    """Expect one issue when a body row has more cells than the header."""
    table_text = "\n".join(["| A | B |", "| --- | --- |", "| 1 | 2 | 3 |"])

    found = validate_tables(table_text)

    assert len(found) == 1
    assert "Markdown table row has 3 cells; expected 2" in found[0].message
|
||||
|
||||
|
||||
def test_complex_table_can_be_represented_as_allowed_html_with_fallback() -> None:
    """Expect no issues for an HTML table when the HTML fallback is allowed."""
    html_lines = [
        '<table id="tbl-1">',
        '<tr><th rowspan="2">Load</th><th>Value</th></tr>',
        "<tr><td>42</td></tr>",
        "</table>",
        "",  # keeps the trailing newline after </table>
    ]
    document = "\n".join(html_lines)

    assert validate_tables(document, allow_html_table_fallback=True) == []
|
||||
|
||||
|
||||
def test_frontmatter_requires_chunk_context_fields() -> None:
    """Expect one issue per missing field, with ``title`` before ``page_range``."""
    document = "\n".join(
        [
            "---",
            "document_slug: paper",
            "chunk_index: 1",
            "---",
            "# Paper",
        ]
    )

    found = validate_chunk_frontmatter(document)

    expected_messages = [
        "Chunk frontmatter is missing required field: title",
        "Chunk frontmatter is missing required field: page_range",
    ]
    assert messages(found) == expected_messages
|
||||
|
||||
|
||||
def test_frontmatter_accepts_required_chunk_context_fields() -> None:
    """Expect no issues when title, slug, index and page range are present."""
    document = "\n".join(
        [
            "---",
            "title: Paper",
            "document_slug: paper",
            "chunk_index: 1",
            "page_range: 1-3",
            "---",
            "# Paper",
        ]
    )

    assert validate_chunk_frontmatter(document) == []
|
||||
|
||||
|
||||
def test_caption_reference_anchor_shape_checks_known_reference_targets() -> None:
    """Expect an issue only for the link whose anchor is not defined."""
    # ``fig-1`` has an anchor in the text; ``table-2`` is linked but never defined.
    document = (
        '<a id="fig-1"></a>\n'
        "\n"
        "Figure 1. Diagram.\n"
        "As shown in [Fig. 1](#fig-1) and [Table 2](#table-2)."
    )

    found = validate_caption_reference_anchors(document)

    assert messages(found) == ["Reference link points to a missing anchor: #table-2"]
|
||||
|
||||
|
||||
def test_combined_quality_gate_does_not_mutate_markdown() -> None:
    """Expect a clean result that carries the input markdown back unchanged."""
    document = "\n".join(
        [
            "---",
            "title: Paper",
            "document_slug: paper",
            "chunk_index: 1",
            "page_range: 1",
            "---",
            "$E=mc^2$",
        ]
    )

    outcome = validate_markdown_quality(document)

    assert outcome.markdown == document
    assert outcome.ok
    # A passing result reports its issues as an empty tuple.
    assert outcome.issues == ()
|
||||
@@ -1,50 +0,0 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from pdftomd.models import DocumentIdentity
|
||||
from pdftomd.runtime_contracts import ModelCachePolicy, RuntimeArtifactPaths
|
||||
|
||||
|
||||
def test_model_cache_policy_prefers_explicit_path(tmp_path: Path) -> None:
    """Expect the explicit cache path to win over ``PDFTOMD_MODEL_CACHE``."""
    explicit_root = tmp_path / "explicit-models"
    environment = {"PDFTOMD_MODEL_CACHE": str(tmp_path / "ignored")}

    policy = ModelCachePolicy.from_environment(
        project_root=tmp_path,
        explicit_model_cache=explicit_root,
        env=environment,
    )

    assert policy.root == explicit_root
    # Each per-tool directory is a child of the chosen root.
    assert policy.marker_dir == explicit_root / "marker"
    assert policy.nougat_dir == explicit_root / "nougat"
    assert policy.huggingface_home == explicit_root / "huggingface"
|
||||
|
||||
|
||||
def test_model_cache_policy_uses_env_then_project_default(tmp_path: Path) -> None:
    """Check the env-variable root and the ``.models`` fallback root."""
    env_root = tmp_path / "env-models"

    from_env = ModelCachePolicy.from_environment(
        project_root=tmp_path,
        env={"PDFTOMD_MODEL_CACHE": str(env_root)},
    )
    # With an empty environment the root falls back to <project_root>/.models.
    from_default = ModelCachePolicy.from_environment(project_root=tmp_path, env={})

    assert from_env.root == env_root
    assert from_default.root == tmp_path / ".models"
|
||||
|
||||
|
||||
def test_model_cache_policy_exports_offline_environment(tmp_path: Path) -> None:
    """Check the variables produced by ``to_environment(offline=True)``."""
    policy = ModelCachePolicy.from_environment(project_root=tmp_path, env={})
    hf_home = tmp_path / ".models" / "huggingface"

    exported = policy.to_environment(offline=True)

    # Values are exported as strings, not ``Path`` objects.
    assert exported["HF_HOME"] == str(hf_home)
    assert exported["HUGGINGFACE_HUB_CACHE"] == str(hf_home / "hub")
    assert exported["HF_HUB_OFFLINE"] == "1"
|
||||
|
||||
|
||||
def test_runtime_artifact_paths_are_outside_document_bundle(tmp_path: Path) -> None:
    """Check the runtime root and the log / resume-state files beneath it."""
    identity = DocumentIdentity.from_path("Example Paper.pdf")
    paths = RuntimeArtifactPaths.from_output_root(tmp_path, identity)

    expected_root = tmp_path / ".pdftomd-runtime" / "example-paper"
    assert paths.root == expected_root
    # Log and resume-state files are addressed relative to the runtime root.
    assert paths.log_file == paths.root / "logs" / "conversion.log"
    assert paths.resume_state_file == paths.root / "state" / "resume-state.json"
    assert "example-paper" in str(paths.root)
|
||||
@@ -1,87 +0,0 @@
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import fitz
|
||||
|
||||
|
||||
# Directory one level above the folder that holds this file.
ROOT = Path(__file__).resolve().parents[1]
# Location of the sample metadata JSON under ROOT.
METADATA_PATH = ROOT.joinpath("samples", "metadata.json")

# Trait keys that each sample's "traits" mapping is checked against.
REQUIRED_TRAITS = {
    "figure_density",
    "formula_density",
    "has_korean_path",
    "layout_risk",
    "mixed_scanned_text_pages",
    "scanned_pages",
    "table_density",
    "target_regression_focus",
    "text_layer_quality",
}
|
||||
|
||||
|
||||
def _load_metadata():
    """Parse and return the JSON document stored at ``METADATA_PATH`` (UTF-8)."""
    with open(METADATA_PATH, "r", encoding="utf-8") as stream:
        parsed = json.load(stream)
    return parsed
|
||||
|
||||
|
||||
def _metadata_samples():
    """Return the ``"samples"`` list from the metadata file.

    Asserts that the top-level document is a dict and that its ``"samples"``
    entry is a list, so a malformed file fails here.
    """
    document = _load_metadata()
    assert isinstance(document, dict)
    entries = document.get("samples")
    assert isinstance(entries, list)
    return entries
|
||||
|
||||
|
||||
def test_metadata_paths_match_current_sample_pdfs_exactly():
    """Expect the metadata paths to be exactly the PDFs found in ``samples/``.

    Both sides are compared as sorted POSIX-style paths relative to ``ROOT``
    (for example ``samples/name.pdf``).
    """
    # Anchor the glob on ROOT, like the other checks in this file, so the
    # result does not depend on the directory pytest is launched from.
    expected_paths = sorted(
        path.relative_to(ROOT).as_posix()
        for path in (ROOT / "samples").glob("*.pdf")
    )
    samples = _metadata_samples()
    metadata_paths = [sample.get("path") for sample in samples]

    assert sorted(metadata_paths) == expected_paths
|
||||
|
||||
|
||||
def test_metadata_paths_are_unique():
    """Expect no two metadata samples to share the same ``path`` value."""
    seen_paths = [entry.get("path") for entry in _metadata_samples()]
    distinct_paths = set(seen_paths)

    assert len(seen_paths) == len(distinct_paths)
|
||||
|
||||
|
||||
def test_metadata_paths_are_exact_relative_samples_pdf_paths():
    """Expect each path to be a relative ``samples/*.pdf`` string naming a real file."""
    for entry in _metadata_samples():
        raw_path = entry.get("path")

        assert isinstance(raw_path, str)
        assert raw_path.startswith("samples/")
        relative = Path(raw_path)
        assert relative.suffix == ".pdf"
        assert not relative.is_absolute()
        # The file must exist when resolved against ROOT.
        assert (ROOT / raw_path).is_file()
|
||||
|
||||
|
||||
def test_required_trait_fields_are_present():
    """Expect every sample's ``traits`` dict to hold all ``REQUIRED_TRAITS`` keys."""
    for entry in _metadata_samples():
        trait_map = entry.get("traits")

        # The sample path is the assertion message, to identify the failing entry.
        assert isinstance(trait_map, dict), entry.get("path")
        assert REQUIRED_TRAITS.issubset(trait_map.keys()), entry.get("path")
|
||||
|
||||
|
||||
def test_page_counts_match_current_sample_pdfs():
    """Expect each declared ``page_count`` to equal the count ``fitz`` reports."""
    for entry in _metadata_samples():
        declared = entry.get("page_count")
        pdf_path = ROOT / entry["path"]

        # The sample path is the assertion message, to identify the failing entry.
        assert isinstance(declared, int), entry["path"]
        assert declared > 0, entry["path"]
        with fitz.open(pdf_path) as pdf:
            assert declared == pdf.page_count, entry["path"]
|
||||
|
||||
|
||||
def test_metadata_json_is_deterministic_utf8():
    """Expect the file to equal its own canonical re-serialization.

    Canonical form: non-ASCII kept as-is, two-space indent, sorted keys and a
    single trailing newline.
    """
    on_disk = METADATA_PATH.read_text(encoding="utf-8")
    canonical = json.dumps(
        json.loads(on_disk),
        ensure_ascii=False,
        indent=2,
        sort_keys=True,
    )

    assert on_disk == canonical + "\n"
|
||||
Reference in New Issue
Block a user