remove files

2026-05-08 16:31:17 +09:00
parent 7e985ae94a
commit 551ab50735
135 changed files with 0 additions and 41205 deletions
@@ -1,11 +0,0 @@
-from __future__ import annotations
-
-import sys
-from pathlib import Path
-
-
-ROOT = Path(__file__).resolve().parents[1]
-SRC = ROOT / "src"
-
-if str(SRC) not in sys.path:
-    sys.path.insert(0, str(SRC))
@@ -1,140 +0,0 @@
-from __future__ import annotations
-
-import pytest
-
-from pdftomd import (
-    Asset,
-    AssetKind,
-    BlockRole,
-    BoundingBox,
-    ChunkMetadata,
-    DocumentBlock,
-    DocumentIdentity,
-    Figure,
-    Formula,
-    PageAnalysis,
-    PageRange,
-    Table,
-)
-
-
-def test_document_identity_slug_and_output_fields_are_deterministic() -> None:
-    document = DocumentIdentity.from_path(r"C:\papers\MITC Korean Report 2026.pdf")
-
-    assert document.source_path == r"C:\papers\MITC Korean Report 2026.pdf"
-    assert document.stem == "MITC Korean Report 2026"
-    assert document.slug == "mitc-korean-report-2026"
-    assert document.chunk_filename(1) == "mitc-korean-report-2026_001.md"
-    assert document.asset_dir == "images"
-
-
-def test_non_ascii_document_identity_uses_stable_fallback_slug() -> None:
-    first = DocumentIdentity.from_path("한글 보고서.pdf")
-    second = DocumentIdentity.from_path("다른 보고서.pdf")
-
-    assert first.filename == "한글 보고서.pdf"
-    assert first.slug.startswith("document-")
-    assert first.slug == DocumentIdentity.from_path("한글 보고서.pdf").slug
-    assert first.slug != second.slug
-
-
-def test_page_range_invariants_and_helpers() -> None:
-    page_range = PageRange(start=3, end=7)
-
-    assert page_range.count == 5
-    assert page_range.label == "3-7"
-    assert page_range.contains(3)
-    assert page_range.contains(7)
-    assert not page_range.contains(8)
-
-
-@pytest.mark.parametrize(
-    ("start", "end"),
-    [(0, 1), (4, 3)],
-)
-def test_page_range_rejects_invalid_bounds(start: int, end: int) -> None:
-    with pytest.raises(ValueError):
-        PageRange(start=start, end=end)
-
-
-def test_block_formula_table_figure_and_asset_construction() -> None:
-    bbox = BoundingBox(page=2, x0=10.0, y0=20.0, x1=110.0, y1=80.0)
-    page_range = PageRange(start=2, end=2)
-    identity = DocumentIdentity.from_path("Example Paper.pdf")
-
-    formula = Formula(
-        id="eq-001",
-        latex=r"E = mc^2",
-        display=True,
-        source_text="E = mc2",
-        number="1",
-    )
-    table = Table(
-        id="tbl-001",
-        rows=(("A", "B"), ("1", "2")),
-        caption="Table 1. Values",
-        number="1",
-    )
-    figure = Figure(
-        id="fig-001",
-        caption="Figure 1. Diagram",
-        number="1",
-        asset_id="asset-001",
-    )
-    asset = Asset(
-        id="asset-001",
-        kind=AssetKind.FIGURE,
-        relative_path=f"{identity.asset_dir}/{identity.figure_asset_filename('1')}",
-        page=2,
-        bbox=bbox,
-        content_hash="abc123",
-    )
-    block = DocumentBlock(
-        id="block-001",
-        role=BlockRole.FORMULA,
-        page_range=page_range,
-        bbox=bbox,
-        text="E = mc^2",
-        formula=formula,
-    )
-    chunk = ChunkMetadata(
-        index=1,
-        document=identity,
-        page_range=page_range,
-        block_ids=("block-001",),
-        asset_ids=("asset-001",),
-    )
-    analysis = PageAnalysis(
-        page=2,
-        text_length=200,
-        image_count=1,
-        has_text_layer=True,
-        needs_ocr=False,
-    )
-
-    assert bbox.width == 100.0
-    assert bbox.height == 60.0
-    assert block.role is BlockRole.FORMULA
-    assert block.formula == formula
-    assert table.rows[0] == ("A", "B")
-    assert figure.asset_id == asset.id
-    assert asset.relative_path == "images/example-paper_fig-1.png"
-    assert chunk.filename == "example-paper_001.md"
-    assert chunk.slug == "example-paper"
-    assert analysis.text_layer_quality == "text"
-
-
-def test_bounding_box_rejects_invalid_coordinates() -> None:
-    with pytest.raises(ValueError):
-        BoundingBox(page=1, x0=10.0, y0=0.0, x1=10.0, y1=20.0)
-
-
-def test_page_analysis_rejects_negative_counts() -> None:
-    with pytest.raises(ValueError):
-        PageAnalysis(
-            page=1,
-            text_length=-1,
-            image_count=0,
-            has_text_layer=False,
-            needs_ocr=True,
-        )
@@ -1,49 +0,0 @@
-from __future__ import annotations
-
-from pathlib import Path
-
-import pytest
-
-from pdftomd.options import ConversionOptions, FormulaParser, RuntimeMode
-
-
-def test_conversion_options_defaults_match_project_policy() -> None:
-    options = ConversionOptions()
-
-    assert options.runtime is RuntimeMode.CUDA
-    assert options.formula_parser is FormulaParser.NOUGAT
-    assert options.chunk_target_pages == 20
-    assert options.output_dir == Path("output")
-    assert options.resume is False
-    assert options.write_logs is True
-
-
-def test_runtime_modes_express_cuda_fail_fast_and_auto_fallback() -> None:
-    assert RuntimeMode.CUDA.requires_cuda
-    assert not RuntimeMode.CUDA.allows_cpu_fallback
-    assert RuntimeMode.AUTO.allows_cpu_fallback
-    assert not RuntimeMode.CPU.requires_cuda
-
-
-def test_conversion_options_validate_chunk_size_and_formula_boundary() -> None:
-    with pytest.raises(ValueError, match="chunk_target_pages"):
-        ConversionOptions(chunk_target_pages=0)
-
-    options = ConversionOptions(formula_parser=FormulaParser.MARKER)
-    assert options.formula_parser is FormulaParser.MARKER
-    assert not hasattr(options, "pyqt")
-    assert not hasattr(options, "api_url")
-
-
-def test_conversion_options_normalize_optional_paths(tmp_path: Path) -> None:
-    options = ConversionOptions(
-        output_dir=tmp_path / "out",
-        nougat_command=tmp_path / "venv" / "Scripts" / "nougat.exe",
-        model_cache_dir=tmp_path / ".models",
-        log_dir=tmp_path / "logs",
-    )
-
-    assert options.output_dir == tmp_path / "out"
-    assert options.nougat_command is not None
-    assert options.model_cache_dir == tmp_path / ".models"
-    assert options.log_dir == tmp_path / "logs"
@@ -1,67 +0,0 @@
-from __future__ import annotations
-
-from pathlib import Path
-
-import pytest
-
-from pdftomd.models import DocumentIdentity
-from pdftomd.paths import (
-    OutputBundlePaths,
-    document_identity_from_pdf,
-    make_anchor,
-    normalize_pdf_path,
-)
-
-
-def test_normalize_pdf_path_accepts_korean_and_spaced_paths(tmp_path: Path) -> None:
-    pdf = tmp_path / "한글 경로" / "My Report 2026.pdf"
-    pdf.parent.mkdir()
-    pdf.write_bytes(b"%PDF-1.7\n")
-
-    normalized = normalize_pdf_path(pdf)
-
-    assert normalized.is_absolute()
-    assert normalized.name == "My Report 2026.pdf"
-
-
-def test_normalize_pdf_path_rejects_non_pdf_files(tmp_path: Path) -> None:
-    text_file = tmp_path / "document.txt"
-    text_file.write_text("not a pdf", encoding="utf-8")
-
-    with pytest.raises(ValueError, match="PDF"):
-        normalize_pdf_path(text_file)
-
-
-def test_document_identity_from_pdf_uses_stable_slug(tmp_path: Path) -> None:
-    pdf = tmp_path / "한글 보고서.pdf"
-    pdf.write_bytes(b"%PDF-1.7\n")
-
-    first = document_identity_from_pdf(pdf)
-    second = document_identity_from_pdf(pdf)
-
-    assert first.filename == "한글 보고서.pdf"
-    assert first.slug == second.slug
-    assert first.slug.startswith("document-")
-    assert first.source_path == str(normalize_pdf_path(pdf))
-
-
-def test_output_bundle_paths_keep_document_and_runtime_artifacts_separate(tmp_path: Path) -> None:
-    document = DocumentIdentity.from_path("Example Paper.pdf")
-    bundle = OutputBundlePaths.from_document(tmp_path, document)
-
-    assert bundle.document_dir == tmp_path / "example-paper"
-    assert bundle.images_dir == tmp_path / "example-paper" / "images"
-    assert bundle.chunk_path(1) == tmp_path / "example-paper" / "example-paper_001.md"
-    assert bundle.figure_asset_path("1") == tmp_path / "example-paper" / "images" / "example-paper_fig-1.png"
-    assert bundle.runtime_dir == tmp_path / ".pdftomd-runtime" / "example-paper"
-    assert bundle.log_path.name == "conversion.log"
-    assert bundle.resume_state_path.name == "resume-state.json"
-    assert bundle.runtime_dir not in bundle.document_dir.parents
-
-
-def test_make_anchor_is_deterministic_and_validates_kind() -> None:
-    assert make_anchor("Figure", "2 A") == "figure-2-a"
-    assert make_anchor("Equation", "식 3") == "equation-3"
-
-    with pytest.raises(ValueError, match="kind"):
-        make_anchor("", "1")
@@ -1,93 +0,0 @@
-from __future__ import annotations
-
-import json
-import shutil
-from pathlib import Path
-
-from pdftomd.models import PageAnalysis, PageRange
-from pdftomd.preanalysis import analyze_pdf, is_ocr_candidate, plan_page_chunks
-
-
-ROOT = Path(__file__).resolve().parents[1]
-METADATA_PATH = ROOT / "samples" / "metadata.json"
-
-
-def _metadata_samples() -> list[dict]:
-    return json.loads(METADATA_PATH.read_text(encoding="utf-8"))["samples"]
-
-
-def _sample_with(**traits: object) -> dict:
-    for sample in _metadata_samples():
-        sample_traits = sample["traits"]
-        if all(sample_traits.get(key) == value for key, value in traits.items()):
-            return sample
-    raise AssertionError(f"no sample matched traits: {traits}")
-
-
-def test_analyze_text_heavy_sample_returns_page_facts_from_metadata() -> None:
-    sample = _sample_with(text_layer_quality="good", mixed_scanned_text_pages=False)
-
-    result = analyze_pdf(ROOT / sample["path"])
-
-    assert result.page_count == sample["page_count"]
-    assert len(result.pages) == sample["page_count"]
-    assert all(isinstance(page, PageAnalysis) for page in result.pages)
-    assert result.pages[0].page == 1
-    assert result.pages[0].text_length > 1000
-    assert result.pages[0].has_text_layer
-    assert not result.pages[0].needs_ocr
-
-
-def test_analyze_mixed_scanned_risk_sample_marks_metadata_scanned_pages() -> None:
-    sample = _sample_with(mixed_scanned_text_pages=True)
-
-    result = analyze_pdf(ROOT / sample["path"])
-    ocr_pages = {page.page for page in result.pages if page.needs_ocr}
-
-    assert result.page_count == sample["page_count"]
-    assert set(sample["traits"]["scanned_pages"]) <= ocr_pages
-    assert any(page.image_count > 0 for page in result.pages)
-
-
-def test_analyze_pdf_accepts_korean_pathlib_path(tmp_path: Path) -> None:
-    sample = _sample_with(has_korean_path=True, text_layer_quality="good")
-    source = ROOT / sample["path"]
-    target_dir = tmp_path / "한글 경로"
-    target_dir.mkdir()
-    target = target_dir / source.name
-    shutil.copyfile(source, target)
-
-    result = analyze_pdf(target)
-
-    assert result.page_count == sample["page_count"]
-    assert result.pages[0].has_text_layer
-
-
-def test_ocr_candidate_logic_is_deterministic() -> None:
-    cases = [
-        (0, 0, True),
-        (0, 2, True),
-        (40, 0, False),
-        (199, 1, True),
-        (200, 1, False),
-        (1000, 8, False),
-    ]
-
-    first = [is_ocr_candidate(text_length, image_count) for text_length, image_count, _ in cases]
-    second = [is_ocr_candidate(text_length, image_count) for text_length, image_count, _ in cases]
-
-    assert first == [expected for _, _, expected in cases]
-    assert first == second
-
-
-def test_chunk_candidates_are_twenty_page_ranges_within_bounds() -> None:
-    chunks = plan_page_chunks(76)
-
-    assert chunks == (
-        PageRange(1, 20),
-        PageRange(21, 40),
-        PageRange(41, 60),
-        PageRange(61, 76),
-    )
-    assert all(chunk.end <= 76 for chunk in chunks)
-    assert plan_page_chunks(0) == ()
@@ -1,166 +0,0 @@
-from __future__ import annotations
-
-from pathlib import Path
-
-from pdftomd.models import Asset, AssetKind
-from pdftomd.quality import (
-    validate_caption_reference_anchors,
-    validate_chunk_frontmatter,
-    validate_image_links,
-    validate_latex_environments,
-    validate_markdown_quality,
-    validate_math_delimiters,
-    validate_tables,
-)
-
-
-def messages(issues: list[object]) -> list[str]:
-    return [getattr(issue, "message") for issue in issues]
-
-
-def test_math_delimiters_accept_inline_and_block_math() -> None:
-    markdown = "\n".join(
-        [
-            "Inline energy $E = mc^2$ is preserved.",
-            "",
-            "$$",
-            r"\int_0^1 x^2 dx",
-            "$$",
-        ]
-    )
-
-    assert validate_math_delimiters(markdown) == []
-
-
-def test_math_delimiters_report_actionable_unclosed_inline_math() -> None:
-    issues = validate_math_delimiters("The expression $E = mc^2 is missing a close.")
-
-    assert len(issues) == 1
-    assert "Unclosed inline math delimiter" in issues[0].message
-    assert issues[0].line == 1
-    assert "$" in issues[0].message
-
-
-def test_math_delimiters_report_actionable_unclosed_block_math() -> None:
-    issues = validate_math_delimiters("Before\n$$\na^2 + b^2 = c^2\nAfter")
-
-    assert len(issues) == 1
-    assert "Unclosed block math delimiter" in issues[0].message
-    assert issues[0].line == 2
-
-
-def test_latex_environment_pairs_accept_nested_matching_pairs() -> None:
-    markdown = r"""
-$$
-\begin{aligned}
-a &= b \\
-\begin{matrix}1 & 2\end{matrix}
-\end{aligned}
-$$
-"""
-
-    assert validate_latex_environments(markdown) == []
-
-
-def test_latex_environment_pairs_report_mismatch() -> None:
-    issues = validate_latex_environments(r"\begin{aligned} x \end{matrix}")
-
-    assert len(issues) == 1
-    assert "LaTeX environment mismatch" in issues[0].message
-    assert "aligned" in issues[0].message
-    assert "matrix" in issues[0].message
-
-
-def test_image_links_validate_filesystem_and_modeled_assets(tmp_path: Path) -> None:
-    image_dir = tmp_path / "images"
-    image_dir.mkdir()
-    (image_dir / "paper_fig-1.png").write_bytes(b"png")
-    asset = Asset(
-        id="asset-001",
-        kind=AssetKind.FIGURE,
-        relative_path="images/paper_fig-1.png",
-        page=1,
-    )
-    markdown = "![Figure 1](images/paper_fig-1.png)\n![Figure 2](images/missing.png)"
-
-    issues = validate_image_links(markdown, base_dir=tmp_path, assets=[asset])
-
-    assert messages(issues) == [
-        "Image link target does not exist on disk and is not present in modeled assets: images/missing.png"
-    ]
-
-
-def test_simple_markdown_table_parseability() -> None:
-    markdown = "\n".join(
-        [
-            "| A | B |",
-            "| --- | --- |",
-            "| 1 | 2 |",
-            "| 3 | 4 |",
-        ]
-    )
-
-    assert validate_tables(markdown) == []
-
-
-def test_markdown_table_reports_row_width_mismatch() -> None:
-    issues = validate_tables("| A | B |\n| --- | --- |\n| 1 | 2 | 3 |")
-
-    assert len(issues) == 1
-    assert "Markdown table row has 3 cells; expected 2" in issues[0].message
-
-
-def test_complex_table_can_be_represented_as_allowed_html_with_fallback() -> None:
-    markdown = "\n".join(
-        [
-            '<table id="tbl-1">',
-            "<tr><th rowspan=\"2\">Load</th><th>Value</th></tr>",
-            "<tr><td>42</td></tr>",
-            "</table>",
-            "![Table 1 fallback](images/table-1.png)",
-        ]
-    )
-
-    assert validate_tables(markdown, allow_html_table_fallback=True) == []
-
-
-def test_frontmatter_requires_chunk_context_fields() -> None:
-    markdown = "---\ndocument_slug: paper\nchunk_index: 1\n---\n# Paper"
-
-    issues = validate_chunk_frontmatter(markdown)
-
-    assert messages(issues) == [
-        "Chunk frontmatter is missing required field: title",
-        "Chunk frontmatter is missing required field: page_range",
-    ]
-
-
-def test_frontmatter_accepts_required_chunk_context_fields() -> None:
-    markdown = "---\ntitle: Paper\ndocument_slug: paper\nchunk_index: 1\npage_range: 1-3\n---\n# Paper"
-
-    assert validate_chunk_frontmatter(markdown) == []
-
-
-def test_caption_reference_anchor_shape_checks_known_reference_targets() -> None:
-    markdown = "\n".join(
-        [
-            '<a id="fig-1"></a>',
-            "![Figure 1](images/fig-1.png)",
-            "Figure 1. Diagram.",
-            "As shown in [Fig. 1](#fig-1) and [Table 2](#table-2).",
-        ]
-    )
-
-    issues = validate_caption_reference_anchors(markdown)
-
-    assert messages(issues) == ["Reference link points to a missing anchor: #table-2"]
-
-
-def test_combined_quality_gate_does_not_mutate_markdown() -> None:
-    markdown = "---\ntitle: Paper\ndocument_slug: paper\nchunk_index: 1\npage_range: 1\n---\n$E=mc^2$"
-
-    result = validate_markdown_quality(markdown)
-
-    assert result.markdown == markdown
-    assert result.ok
-    assert result.issues == ()
@@ -1,50 +0,0 @@
-from __future__ import annotations
-
-from pathlib import Path
-
-from pdftomd.models import DocumentIdentity
-from pdftomd.runtime_contracts import ModelCachePolicy, RuntimeArtifactPaths
-
-
-def test_model_cache_policy_prefers_explicit_path(tmp_path: Path) -> None:
-    policy = ModelCachePolicy.from_environment(
-        project_root=tmp_path,
-        explicit_model_cache=tmp_path / "explicit-models",
-        env={"PDFTOMD_MODEL_CACHE": str(tmp_path / "ignored")},
-    )
-
-    assert policy.root == tmp_path / "explicit-models"
-    assert policy.marker_dir == tmp_path / "explicit-models" / "marker"
-    assert policy.nougat_dir == tmp_path / "explicit-models" / "nougat"
-    assert policy.huggingface_home == tmp_path / "explicit-models" / "huggingface"
-
-
-def test_model_cache_policy_uses_env_then_project_default(tmp_path: Path) -> None:
-    env_policy = ModelCachePolicy.from_environment(
-        project_root=tmp_path,
-        env={"PDFTOMD_MODEL_CACHE": str(tmp_path / "env-models")},
-    )
-    default_policy = ModelCachePolicy.from_environment(project_root=tmp_path, env={})
-
-    assert env_policy.root == tmp_path / "env-models"
-    assert default_policy.root == tmp_path / ".models"
-
-
-def test_model_cache_policy_exports_offline_environment(tmp_path: Path) -> None:
-    policy = ModelCachePolicy.from_environment(project_root=tmp_path, env={})
-
-    environment = policy.to_environment(offline=True)
-
-    assert environment["HF_HOME"] == str(tmp_path / ".models" / "huggingface")
-    assert environment["HUGGINGFACE_HUB_CACHE"] == str(tmp_path / ".models" / "huggingface" / "hub")
-    assert environment["HF_HUB_OFFLINE"] == "1"
-
-
-def test_runtime_artifact_paths_are_outside_document_bundle(tmp_path: Path) -> None:
-    document = DocumentIdentity.from_path("Example Paper.pdf")
-    artifacts = RuntimeArtifactPaths.from_output_root(tmp_path, document)
-
-    assert artifacts.root == tmp_path / ".pdftomd-runtime" / "example-paper"
-    assert artifacts.log_file == artifacts.root / "logs" / "conversion.log"
-    assert artifacts.resume_state_file == artifacts.root / "state" / "resume-state.json"
-    assert "example-paper" in str(artifacts.root)
@@ -1,87 +0,0 @@
-import json
-from pathlib import Path
-
-import fitz
-
-
-ROOT = Path(__file__).resolve().parents[1]
-METADATA_PATH = ROOT / "samples" / "metadata.json"
-
-REQUIRED_TRAITS = {
-    "figure_density",
-    "formula_density",
-    "has_korean_path",
-    "layout_risk",
-    "mixed_scanned_text_pages",
-    "scanned_pages",
-    "table_density",
-    "target_regression_focus",
-    "text_layer_quality",
-}
-
-
-def _load_metadata():
-    with METADATA_PATH.open("r", encoding="utf-8") as handle:
-        return json.load(handle)
-
-
-def _metadata_samples():
-    metadata = _load_metadata()
-    assert isinstance(metadata, dict)
-    samples = metadata.get("samples")
-    assert isinstance(samples, list)
-    return samples
-
-
-def test_metadata_paths_match_current_sample_pdfs_exactly():
-    expected_paths = sorted(
-        path.as_posix()
-        for path in Path("samples").glob("*.pdf")
-    )
-    samples = _metadata_samples()
-    metadata_paths = [sample.get("path") for sample in samples]
-
-    assert sorted(metadata_paths) == expected_paths
-
-
-def test_metadata_paths_are_unique():
-    metadata_paths = [sample.get("path") for sample in _metadata_samples()]
-
-    assert len(metadata_paths) == len(set(metadata_paths))
-
-
-def test_metadata_paths_are_exact_relative_samples_pdf_paths():
-    for sample in _metadata_samples():
-        path = sample.get("path")
-
-        assert isinstance(path, str)
-        assert path.startswith("samples/")
-        assert Path(path).suffix == ".pdf"
-        assert not Path(path).is_absolute()
-        assert (ROOT / path).is_file()
-
-
-def test_required_trait_fields_are_present():
-    for sample in _metadata_samples():
-        traits = sample.get("traits")
-
-        assert isinstance(traits, dict), sample.get("path")
-        assert REQUIRED_TRAITS <= traits.keys(), sample.get("path")
-
-
-def test_page_counts_match_current_sample_pdfs():
-    for sample in _metadata_samples():
-        page_count = sample.get("page_count")
-        path = ROOT / sample["path"]
-
-        assert isinstance(page_count, int), sample["path"]
-        assert page_count > 0, sample["path"]
-        with fitz.open(path) as document:
-            assert page_count == document.page_count, sample["path"]
-
-
-def test_metadata_json_is_deterministic_utf8():
-    raw = METADATA_PATH.read_text(encoding="utf-8")
-    metadata = json.loads(raw)
-
-    assert raw == json.dumps(metadata, ensure_ascii=False, indent=2, sort_keys=True) + "\n"