add pdftomd

This commit is contained in:
김경종
2026-05-08 16:42:19 +09:00
parent 551ab50735
commit 88d6b92283
99 changed files with 47332 additions and 0 deletions
+188
View File
@@ -0,0 +1,188 @@
from __future__ import annotations
import os
from pathlib import Path
import pytest
from pdf2md.paths import (
DiscoveredPdf,
DuplicateOutputPathError,
InputDiscoveryError,
OutputConflictError,
OutputPathError,
OutputRootError,
discover_pdfs,
plan_outputs,
plan_pdf_outputs,
)
def touch(path: Path) -> Path:
    """Create an empty file at ``path`` and return ``path``.

    Missing parent directories are created first. The file is written with
    zero bytes, so an already existing file at ``path`` ends up empty.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_bytes(b"")
    return path
def test_discovers_single_pdf_case_insensitive(tmp_path: Path) -> None:
    """A single file with an upper-case ``.PDF`` suffix is discovered."""
    pdf = touch(tmp_path / "Paper.PDF")

    discovered = discover_pdfs(pdf)

    # The result is a one-element tuple holding the resolved source path.
    assert discovered == (DiscoveredPdf(source_path=pdf.resolve()),)
def test_rejects_nonexistent_and_non_pdf_inputs(tmp_path: Path) -> None:
    """Missing paths and non-PDF files both raise ``InputDiscoveryError``."""
    with pytest.raises(InputDiscoveryError, match="does not exist"):
        discover_pdfs(tmp_path / "missing.pdf")

    # An existing file with a non-PDF suffix is rejected with its own message.
    text_file = touch(tmp_path / "notes.txt")
    with pytest.raises(InputDiscoveryError, match="not a PDF"):
        discover_pdfs(text_file)
def test_discovers_directory_non_recursive_only(tmp_path: Path) -> None:
    """With ``recursive=False`` only PDFs directly in the directory are found."""
    root_pdf = touch(tmp_path / "root.pdf")
    nested_pdf = touch(tmp_path / "nested" / "child.pdf")

    discovered = discover_pdfs(tmp_path, recursive=False)

    assert [item.source_path for item in discovered] == [root_pdf.resolve()]
    # Explicit negative check: the nested PDF must not be picked up.
    assert nested_pdf.resolve() not in {item.source_path for item in discovered}
def test_non_recursive_directory_with_only_nested_pdfs_fails(tmp_path: Path) -> None:
    """A non-recursive scan fails when the only PDF lives in a subdirectory."""
    touch(tmp_path / "nested" / "child.pdf")

    with pytest.raises(InputDiscoveryError, match="no PDF files"):
        discover_pdfs(tmp_path, recursive=False)
def test_discovers_directory_recursive_with_relative_parents(tmp_path: Path) -> None:
    """Recursive discovery reports each PDF with its parent relative to the root.

    The expected sequence lists the two nested files before the file that
    sits directly in the scanned directory, whose relative parent is the
    empty path.
    """
    root_pdf = touch(tmp_path / "root.pdf")
    nested_pdf = touch(tmp_path / "nested" / "child.pdf")
    # Mixed-case suffix: discovery is expected to match it as a PDF too.
    deeper_pdf = touch(tmp_path / "nested" / "deeper" / "leaf.PdF")

    discovered = discover_pdfs(tmp_path, recursive=True)

    assert [(item.source_path, item.relative_parent) for item in discovered] == [
        (nested_pdf.resolve(), Path("nested")),
        (deeper_pdf.resolve(), Path("nested") / "deeper"),
        (root_pdf.resolve(), Path()),
    ]
def test_discovery_order_is_deterministic_for_non_ascii_names(tmp_path: Path) -> None:
    """Two scans of the same directory return the same, fixed ordering.

    The files are created in a different order than the expected result so
    the assertion cannot pass merely by reflecting creation order.
    """
    touch(tmp_path / "한글.pdf")
    touch(tmp_path / "Alpha.pdf")
    touch(tmp_path / "beta.PDF")

    first = discover_pdfs(tmp_path)
    second = discover_pdfs(tmp_path)

    assert [item.source_path.name for item in first] == ["Alpha.pdf", "beta.PDF", "한글.pdf"]
    assert first == second
def test_plans_all_default_output_paths_for_single_pdf(tmp_path: Path) -> None:
    """Default planning derives every output path from the PDF's stem.

    Checks the markdown file, assets directory, metadata JSON and report
    paths under the resolved output root, and that no raw directory is
    planned by default.
    """
    pdf = touch(tmp_path / "입력.pdf")
    output_root = tmp_path / "out"

    # Unpacking into a one-element list also asserts exactly one plan exists.
    [plan] = plan_pdf_outputs(pdf, output_root)

    assert plan.source_pdf == pdf.resolve()
    assert plan.markdown_path == output_root.resolve() / "입력.md"
    assert plan.assets_dir == output_root.resolve() / "입력.assets"
    assert plan.metadata_path == output_root.resolve() / "입력.metadata.json"
    assert plan.report_path == output_root.resolve() / "입력.report.md"
    assert plan.raw_dir is None
def test_plans_optional_metadata_and_raw_outputs(tmp_path: Path) -> None:
    """``metadata=False`` drops the metadata path; ``keep_raw=True`` adds a raw dir."""
    pdf = touch(tmp_path / "paper.pdf")

    [without_metadata] = plan_pdf_outputs(pdf, tmp_path / "out", metadata=False)
    [with_raw] = plan_pdf_outputs(pdf, tmp_path / "out", keep_raw=True)

    assert without_metadata.metadata_path is None
    # The report is still planned when metadata is disabled.
    assert without_metadata.report_path == (tmp_path / "out").resolve() / "paper.report.md"
    assert with_raw.raw_dir == (tmp_path / "out").resolve() / "paper.raw"
def test_recursive_planning_preserves_relative_subdirectories(tmp_path: Path) -> None:
    """Recursive planning mirrors the input subdirectories under the output root.

    Two PDFs share the file name ``same.pdf`` at different depths; their
    planned markdown paths must stay distinct.
    """
    root = tmp_path / "pdfs"
    touch(root / "same.pdf")
    touch(root / "nested" / "same.pdf")

    plans = plan_pdf_outputs(root, tmp_path / "out", recursive=True)

    assert [plan.markdown_path.relative_to((tmp_path / "out").resolve()) for plan in plans] == [
        Path("nested") / "same.md",
        Path("same.md"),
    ]
def test_non_recursive_duplicate_output_paths_fail(tmp_path: Path) -> None:
    """Two sources that map to the same output path raise ``DuplicateOutputPathError``.

    The ``DiscoveredPdf`` entries are built by hand without a
    ``relative_parent``, so both ``same.pdf`` files target the same output.
    """
    first = touch(tmp_path / "first" / "same.pdf")
    second = touch(tmp_path / "second" / "same.pdf")
    discovered = (
        DiscoveredPdf(source_path=first.resolve()),
        DiscoveredPdf(source_path=second.resolve()),
    )

    with pytest.raises(DuplicateOutputPathError, match="duplicated"):
        plan_outputs(discovered, tmp_path / "out")
def test_output_conflicts_report_all_existing_paths(tmp_path: Path) -> None:
    """``OutputConflictError.conflicts`` lists every pre-existing planned path."""
    pdf = touch(tmp_path / "paper.pdf")
    output_root = tmp_path / "out"
    (output_root / "paper.assets").mkdir(parents=True)
    # The markdown target exists as a directory here, not as a file.
    (output_root / "paper.md").mkdir()
    touch(output_root / "paper.metadata.json")

    with pytest.raises(OutputConflictError) as error:
        plan_pdf_outputs(pdf, output_root)

    # Inspect the captured exception only after the ``with`` block has exited.
    conflict_names = {path.name for path in error.value.conflicts}
    assert conflict_names == {"paper.assets", "paper.md", "paper.metadata.json"}
def test_overwrite_allows_existing_paths_without_deleting(tmp_path: Path) -> None:
    """With ``overwrite=True`` planning succeeds and leaves the existing file in place."""
    pdf = touch(tmp_path / "paper.pdf")
    output_root = tmp_path / "out"
    existing = touch(output_root / "paper.md")

    [plan] = plan_pdf_outputs(pdf, output_root, overwrite=True)

    assert plan.markdown_path == existing.resolve()
    # Planning alone must not remove the file it is allowed to overwrite.
    assert existing.exists()
def test_output_root_cannot_be_existing_file(tmp_path: Path) -> None:
    """An output root that already exists as a regular file raises ``OutputRootError``."""
    pdf = touch(tmp_path / "paper.pdf")
    # ``touch`` creates a file named "out" where a directory is required.
    output_root = touch(tmp_path / "out")

    with pytest.raises(OutputRootError, match="not a directory"):
        plan_pdf_outputs(pdf, output_root)
def test_planned_paths_cannot_escape_output_root(tmp_path: Path) -> None:
    """A ``relative_parent`` of ``..`` is rejected with ``OutputPathError``."""
    pdf = touch(tmp_path / "paper.pdf")
    # Hand-built entry whose relative parent points above the output root.
    discovered = (DiscoveredPdf(source_path=pdf.resolve(), relative_parent=Path("..")),)

    with pytest.raises(OutputPathError, match="escape"):
        plan_outputs(discovered, tmp_path / "out")
@pytest.mark.skipif(os.name != "nt", reason="Windows rooted path behavior")
@pytest.mark.parametrize("relative_parent", [Path("\\outside"), Path("/outside"), Path("C:outside")])
def test_windows_rooted_relative_parents_cannot_escape_output_root(
    tmp_path: Path,
    relative_parent: Path,
) -> None:
    """Rooted or drive-qualified relative parents are rejected on Windows.

    Runs only when ``os.name == "nt"``. Each parametrized ``relative_parent``
    (backslash-rooted, slash-rooted, drive-relative) must raise
    ``OutputPathError``.
    """
    pdf = touch(tmp_path / "paper.pdf")
    discovered = (DiscoveredPdf(source_path=pdf.resolve(), relative_parent=relative_parent),)

    with pytest.raises(OutputPathError, match="escape"):
        plan_outputs(discovered, tmp_path / "out")