"""Tests for PDF discovery and output-path planning in ``pdf2md.paths``."""

from __future__ import annotations

import os
from pathlib import Path

import pytest

from pdf2md.paths import (
    DiscoveredPdf,
    DuplicateOutputPathError,
    InputDiscoveryError,
    OutputConflictError,
    OutputPathError,
    OutputRootError,
    discover_pdfs,
    plan_outputs,
    plan_pdf_outputs,
)


def touch(path: Path) -> Path:
    """Create an empty file at *path*, creating parent directories as needed.

    Returns *path* unchanged so the call can be used inline in an assignment.
    """
    parent_dir = path.parent
    parent_dir.mkdir(parents=True, exist_ok=True)
    path.write_bytes(b"")
    return path


def test_discovers_single_pdf_case_insensitive(tmp_path: Path) -> None:
    """A single file with an upper-case ``.PDF`` suffix is discovered."""
    source = touch(tmp_path / "Paper.PDF")

    result = discover_pdfs(source)

    expected = (DiscoveredPdf(source_path=source.resolve()),)
    assert result == expected


def test_rejects_nonexistent_and_non_pdf_inputs(tmp_path: Path) -> None:
    """Missing paths and non-PDF files both raise ``InputDiscoveryError``."""
    missing = tmp_path / "missing.pdf"
    with pytest.raises(InputDiscoveryError, match="does not exist"):
        discover_pdfs(missing)

    not_a_pdf = touch(tmp_path / "notes.txt")
    with pytest.raises(InputDiscoveryError, match="not a PDF"):
        discover_pdfs(not_a_pdf)


def test_discovers_directory_non_recursive_only(tmp_path: Path) -> None:
    """With ``recursive=False`` only the top-level PDF is returned."""
    top_level = touch(tmp_path / "root.pdf")
    in_subdir = touch(tmp_path / "nested" / "child.pdf")

    result = discover_pdfs(tmp_path, recursive=False)

    found_paths = [entry.source_path for entry in result]
    assert found_paths == [top_level.resolve()]
    assert in_subdir.resolve() not in set(found_paths)


def test_non_recursive_directory_with_only_nested_pdfs_fails(tmp_path: Path) -> None:
    """A directory whose only PDFs are nested fails non-recursive discovery."""
    touch(tmp_path / "nested" / "child.pdf")

    with pytest.raises(InputDiscoveryError, match="no PDF files"):
        discover_pdfs(tmp_path, recursive=False)


def test_discovers_directory_recursive_with_relative_parents(tmp_path: Path) -> None:
    """Recursive discovery reports each PDF with its parent relative to the input."""
    top_level = touch(tmp_path / "root.pdf")
    in_subdir = touch(tmp_path / "nested" / "child.pdf")
    in_deeper_subdir = touch(tmp_path / "nested" / "deeper" / "leaf.PdF")

    result = discover_pdfs(tmp_path, recursive=True)

    # The expected order is exactly what the assertion pins: nested entries
    # come before the top-level file.
    expected = [
        (in_subdir.resolve(), Path("nested")),
        (in_deeper_subdir.resolve(), Path("nested") / "deeper"),
        (top_level.resolve(), Path()),
    ]
    actual = [(entry.source_path, entry.relative_parent) for entry in result]
    assert actual == expected


def test_discovery_order_is_deterministic_for_non_ascii_names(tmp_path: Path) -> None:
    """Repeated discovery yields the same order, including a non-ASCII name."""
    hangul_pdf = touch(tmp_path / "논문.pdf")
    touch(tmp_path / "Alpha.pdf")
    touch(tmp_path / "beta.PDF")

    first = discover_pdfs(tmp_path)
    second = discover_pdfs(tmp_path)

    first_names = [entry.source_path.name for entry in first]
    assert first_names == ["Alpha.pdf", "beta.PDF", hangul_pdf.name]
    assert first == second


def test_plans_all_default_output_paths_for_single_pdf(tmp_path: Path) -> None:
    """Default planning sets markdown, assets and report paths; others are None."""
    source = touch(tmp_path / "입력.pdf")
    output_root = tmp_path / "out"

    (plan,) = plan_pdf_outputs(source, output_root)

    document_dir = output_root.resolve() / "입력"
    assert plan.source_pdf == source.resolve()
    assert plan.markdown_path == document_dir / "입력_001.md"
    assert plan.assets_dir == document_dir / "images"
    assert plan.metadata_path is None
    assert plan.report_path == document_dir / "입력_report.md"
    assert plan.raw_dir is None


def test_plans_metadata_flag_as_noop_and_raw_outputs(tmp_path: Path) -> None:
    """``metadata`` never sets ``metadata_path``; ``keep_raw`` sets ``raw_dir``."""
    source = touch(tmp_path / "paper.pdf")

    (metadata_on,) = plan_pdf_outputs(source, tmp_path / "out", metadata=True)
    (metadata_off,) = plan_pdf_outputs(source, tmp_path / "out", metadata=False)
    (raw_kept,) = plan_pdf_outputs(source, tmp_path / "out", keep_raw=True)

    document_dir = (tmp_path / "out").resolve() / "paper"
    assert metadata_on.metadata_path is None
    assert metadata_off.metadata_path is None
    assert metadata_off.report_path == document_dir / "paper_report.md"
    assert raw_kept.raw_dir == document_dir / "raw"


def test_recursive_planning_preserves_relative_subdirectories(tmp_path: Path) -> None:
    """Same-named PDFs in different subdirectories get distinct output paths."""
    input_root = tmp_path / "pdfs"
    touch(input_root / "same.pdf")
    touch(input_root / "nested" / "same.pdf")

    plans = plan_pdf_outputs(input_root, tmp_path / "out", recursive=True)

    resolved_out = (tmp_path / "out").resolve()
    relative_markdown = [plan.markdown_path.relative_to(resolved_out) for plan in plans]
    assert relative_markdown == [
        Path("nested") / "same" / "same_001.md",
        Path("same") / "same_001.md",
    ]


def test_non_recursive_duplicate_output_paths_fail(tmp_path: Path) -> None:
    """Two same-named PDFs with no relative parent collide and are rejected."""
    first = touch(tmp_path / "first" / "same.pdf")
    second = touch(tmp_path / "second" / "same.pdf")
    colliding = (
        DiscoveredPdf(source_path=first.resolve()),
        DiscoveredPdf(source_path=second.resolve()),
    )

    with pytest.raises(DuplicateOutputPathError, match="duplicated"):
        plan_outputs(colliding, tmp_path / "out")


def test_output_conflicts_report_all_existing_paths(tmp_path: Path) -> None:
    """Every pre-existing planned path is listed in the error's ``conflicts``."""
    source = touch(tmp_path / "paper.pdf")
    output_root = tmp_path / "out"
    document_dir = output_root / "paper"
    (document_dir / "images").mkdir(parents=True)
    # Note: the markdown path is created as a directory here, not a file.
    (document_dir / "paper_001.md").mkdir()
    touch(document_dir / "paper_report.md")

    with pytest.raises(OutputConflictError) as error:
        plan_pdf_outputs(source, output_root)

    reported = {conflict.name for conflict in error.value.conflicts}
    assert reported == {"images", "paper_001.md", "paper_report.md"}


def test_overwrite_allows_existing_paths_without_deleting(tmp_path: Path) -> None:
    """With ``overwrite=True`` planning succeeds and the existing file remains."""
    source = touch(tmp_path / "paper.pdf")
    output_root = tmp_path / "out"
    already_there = touch(output_root / "paper" / "paper_001.md")

    (plan,) = plan_pdf_outputs(source, output_root, overwrite=True)

    assert plan.markdown_path == already_there.resolve()
    assert already_there.exists()


def test_output_root_cannot_be_existing_file(tmp_path: Path) -> None:
    """An output root that is an existing regular file is rejected."""
    source = touch(tmp_path / "paper.pdf")
    file_as_root = touch(tmp_path / "out")

    with pytest.raises(OutputRootError, match="not a directory"):
        plan_pdf_outputs(source, file_as_root)


def test_planned_paths_cannot_escape_output_root(tmp_path: Path) -> None:
    """A ``..`` relative parent is rejected with an "escape" error."""
    source = touch(tmp_path / "paper.pdf")
    escaping = (
        DiscoveredPdf(source_path=source.resolve(), relative_parent=Path("..")),
    )

    with pytest.raises(OutputPathError, match="escape"):
        plan_outputs(escaping, tmp_path / "out")


@pytest.mark.skipif(os.name != "nt", reason="Windows rooted path behavior")
@pytest.mark.parametrize(
    "relative_parent",
    [Path("\\outside"), Path("/outside"), Path("C:outside")],
)
def test_windows_rooted_relative_parents_cannot_escape_output_root(
    tmp_path: Path,
    relative_parent: Path,
) -> None:
    """On Windows, the parametrized rooted/drive-prefixed parents are rejected."""
    source = touch(tmp_path / "paper.pdf")
    escaping = (
        DiscoveredPdf(source_path=source.resolve(), relative_parent=relative_parent),
    )

    with pytest.raises(OutputPathError, match="escape"):
        plan_outputs(escaping, tmp_path / "out")