remove files

This commit is contained in:
김경종
2026-05-08 16:31:17 +09:00
parent 7e985ae94a
commit 551ab50735
135 changed files with 0 additions and 41205 deletions
-11
View File
@@ -1,11 +0,0 @@
from __future__ import annotations
import sys
from pathlib import Path
# ROOT is two directory levels above this file; its "src" child is pushed to
# the front of sys.path unless an identical entry is already present.
ROOT = Path(__file__).resolve().parents[1]
SRC = ROOT.joinpath("src")
if sys.path.count(str(SRC)) == 0:
    sys.path.insert(0, str(SRC))
-140
View File
@@ -1,140 +0,0 @@
from __future__ import annotations
import pytest
from pdftomd import (
Asset,
AssetKind,
BlockRole,
BoundingBox,
ChunkMetadata,
DocumentBlock,
DocumentIdentity,
Figure,
Formula,
PageAnalysis,
PageRange,
Table,
)
def test_document_identity_slug_and_output_fields_are_deterministic() -> None:
    """Check the fields derived from a Windows-style path containing spaces."""
    source = r"C:\papers\MITC Korean Report 2026.pdf"
    identity = DocumentIdentity.from_path(source)

    # The original path string is kept verbatim.
    assert identity.source_path == source
    assert identity.stem == "MITC Korean Report 2026"
    # Slug and chunk file name are derived from the stem.
    assert identity.slug == "mitc-korean-report-2026"
    assert identity.chunk_filename(1) == "mitc-korean-report-2026_001.md"
    assert identity.asset_dir == "images"
def test_non_ascii_document_identity_uses_stable_fallback_slug() -> None:
    """Check that Korean file names get a repeatable, distinct ``document-`` slug."""
    korean_name = "한글 보고서.pdf"
    first = DocumentIdentity.from_path(korean_name)
    second = DocumentIdentity.from_path("다른 보고서.pdf")

    assert first.filename == korean_name
    assert first.slug.startswith("document-")
    # Building the identity again from the same name must give the same slug.
    repeated = DocumentIdentity.from_path(korean_name)
    assert first.slug == repeated.slug
    # A different name must not collide with it.
    assert first.slug != second.slug
def test_page_range_invariants_and_helpers() -> None:
    """Check ``count``, ``label`` and ``contains`` on the range 3..7."""
    pages = PageRange(start=3, end=7)

    assert pages.count == 5
    assert pages.label == "3-7"
    # Both end points are members; the page just past the end is not.
    assert pages.contains(3)
    assert pages.contains(7)
    assert not pages.contains(8)
@pytest.mark.parametrize(("start", "end"), [(0, 1), (4, 3)])
def test_page_range_rejects_invalid_bounds(start: int, end: int) -> None:
    """Check that each parametrized (start, end) pair raises ``ValueError``."""
    with pytest.raises(ValueError):
        PageRange(start=start, end=end)
def test_block_formula_table_figure_and_asset_construction() -> None:
    """Build one instance of each model type and check a few derived attributes."""
    # Geometry and identity shared by several of the models below.
    box = BoundingBox(page=2, x0=10.0, y0=20.0, x1=110.0, y1=80.0)
    pages = PageRange(start=2, end=2)
    identity = DocumentIdentity.from_path("Example Paper.pdf")

    formula = Formula(
        id="eq-001",
        latex=r"E = mc^2",
        display=True,
        source_text="E = mc2",
        number="1",
    )
    table = Table(
        id="tbl-001",
        rows=(("A", "B"), ("1", "2")),
        caption="Table 1. Values",
        number="1",
    )
    figure = Figure(
        id="fig-001",
        caption="Figure 1. Diagram",
        number="1",
        asset_id="asset-001",
    )
    # The asset path is composed from the identity's own directory and file name helpers.
    asset = Asset(
        id="asset-001",
        kind=AssetKind.FIGURE,
        relative_path=f"{identity.asset_dir}/{identity.figure_asset_filename('1')}",
        page=2,
        bbox=box,
        content_hash="abc123",
    )
    formula_block = DocumentBlock(
        id="block-001",
        role=BlockRole.FORMULA,
        page_range=pages,
        bbox=box,
        text="E = mc^2",
        formula=formula,
    )
    chunk = ChunkMetadata(
        index=1,
        document=identity,
        page_range=pages,
        block_ids=("block-001",),
        asset_ids=("asset-001",),
    )
    analysis = PageAnalysis(
        page=2,
        text_length=200,
        image_count=1,
        has_text_layer=True,
        needs_ocr=False,
    )

    # Bounding-box extents.
    assert box.width == 100.0
    assert box.height == 60.0
    # Block payload.
    assert formula_block.role is BlockRole.FORMULA
    assert formula_block.formula == formula
    # Table, figure and asset linkage.
    assert table.rows[0] == ("A", "B")
    assert figure.asset_id == asset.id
    assert asset.relative_path == "images/example-paper_fig-1.png"
    # Chunk naming and page analysis summary.
    assert chunk.filename == "example-paper_001.md"
    assert chunk.slug == "example-paper"
    assert analysis.text_layer_quality == "text"
def test_bounding_box_rejects_invalid_coordinates() -> None:
    """Check that a box whose ``x0`` equals its ``x1`` raises ``ValueError``."""
    degenerate = {"page": 1, "x0": 10.0, "y0": 0.0, "x1": 10.0, "y1": 20.0}
    with pytest.raises(ValueError):
        BoundingBox(**degenerate)
def test_page_analysis_rejects_negative_counts() -> None:
    """Check that a negative ``text_length`` raises ``ValueError``."""
    invalid_fields = {
        "page": 1,
        "text_length": -1,
        "image_count": 0,
        "has_text_layer": False,
        "needs_ocr": True,
    }
    with pytest.raises(ValueError):
        PageAnalysis(**invalid_fields)
-49
View File
@@ -1,49 +0,0 @@
from __future__ import annotations
from pathlib import Path
import pytest
from pdftomd.options import ConversionOptions, FormulaParser, RuntimeMode
def test_conversion_options_defaults_match_project_policy() -> None:
    """Check every default of a ``ConversionOptions`` built with no arguments."""
    defaults = ConversionOptions()

    # Engine selection.
    assert defaults.runtime is RuntimeMode.CUDA
    assert defaults.formula_parser is FormulaParser.NOUGAT
    # Chunking and output location.
    assert defaults.chunk_target_pages == 20
    assert defaults.output_dir == Path("output")
    # Behaviour flags.
    assert defaults.resume is False
    assert defaults.write_logs is True
def test_runtime_modes_express_cuda_fail_fast_and_auto_fallback() -> None:
    """Check ``requires_cuda`` and ``allows_cpu_fallback`` across the runtime modes."""
    cuda, auto, cpu = RuntimeMode.CUDA, RuntimeMode.AUTO, RuntimeMode.CPU

    # CUDA mode demands CUDA and does not fall back to CPU.
    assert cuda.requires_cuda
    assert not cuda.allows_cpu_fallback
    # AUTO may fall back; CPU never requires CUDA.
    assert auto.allows_cpu_fallback
    assert not cpu.requires_cuda
def test_conversion_options_validate_chunk_size_and_formula_boundary() -> None:
    """Check chunk-size validation, parser selection and two absent attributes."""
    # A zero page target must be rejected with a message naming the field.
    with pytest.raises(ValueError, match="chunk_target_pages"):
        ConversionOptions(chunk_target_pages=0)

    marker_options = ConversionOptions(formula_parser=FormulaParser.MARKER)
    assert marker_options.formula_parser is FormulaParser.MARKER
    # The options object must not expose these attributes.
    assert not hasattr(marker_options, "pyqt")
    assert not hasattr(marker_options, "api_url")
def test_conversion_options_normalize_optional_paths(tmp_path: Path) -> None:
    """Check that explicitly supplied path options are readable back from the object."""
    out_dir = tmp_path / "out"
    cache_dir = tmp_path / ".models"
    logs_dir = tmp_path / "logs"

    configured = ConversionOptions(
        output_dir=out_dir,
        nougat_command=tmp_path / "venv" / "Scripts" / "nougat.exe",
        model_cache_dir=cache_dir,
        log_dir=logs_dir,
    )

    assert configured.output_dir == out_dir
    # Only presence is checked for the command path, not its exact value.
    assert configured.nougat_command is not None
    assert configured.model_cache_dir == cache_dir
    assert configured.log_dir == logs_dir
-67
View File
@@ -1,67 +0,0 @@
from __future__ import annotations
from pathlib import Path
import pytest
from pdftomd.models import DocumentIdentity
from pdftomd.paths import (
OutputBundlePaths,
document_identity_from_pdf,
make_anchor,
normalize_pdf_path,
)
def test_normalize_pdf_path_accepts_korean_and_spaced_paths(tmp_path: Path) -> None:
    """Check normalization of a PDF inside a Korean-named directory with a spaced name."""
    korean_dir = tmp_path / "한글 경로"
    korean_dir.mkdir()
    pdf_file = korean_dir / "My Report 2026.pdf"
    # A minimal header is written as the file's content.
    pdf_file.write_bytes(b"%PDF-1.7\n")

    result = normalize_pdf_path(pdf_file)

    assert result.is_absolute()
    assert result.name == "My Report 2026.pdf"
def test_normalize_pdf_path_rejects_non_pdf_files(tmp_path: Path) -> None:
    """Check that a ``.txt`` file raises ``ValueError`` whose message matches ``PDF``."""
    not_a_pdf = tmp_path / "document.txt"
    not_a_pdf.write_text("not a pdf", encoding="utf-8")

    with pytest.raises(ValueError, match="PDF"):
        normalize_pdf_path(not_a_pdf)
def test_document_identity_from_pdf_uses_stable_slug(tmp_path: Path) -> None:
    """Check that two identities built from the same Korean-named PDF agree."""
    pdf_file = tmp_path / "한글 보고서.pdf"
    pdf_file.write_bytes(b"%PDF-1.7\n")

    first = document_identity_from_pdf(pdf_file)
    second = document_identity_from_pdf(pdf_file)

    assert first.filename == "한글 보고서.pdf"
    # Same input, same slug; the slug carries the ``document-`` prefix.
    assert first.slug == second.slug
    assert first.slug.startswith("document-")
    # The stored source path equals the normalized path rendered as a string.
    assert first.source_path == str(normalize_pdf_path(pdf_file))
def test_output_bundle_paths_keep_document_and_runtime_artifacts_separate(tmp_path: Path) -> None:
    """Check the document and runtime locations derived for ``Example Paper.pdf``."""
    identity = DocumentIdentity.from_path("Example Paper.pdf")
    paths = OutputBundlePaths.from_document(tmp_path, identity)

    expected_doc_dir = tmp_path / "example-paper"
    expected_images_dir = expected_doc_dir / "images"

    # Document bundle: chunks and images live under the slug directory.
    assert paths.document_dir == expected_doc_dir
    assert paths.images_dir == expected_images_dir
    assert paths.chunk_path(1) == expected_doc_dir / "example-paper_001.md"
    assert paths.figure_asset_path("1") == expected_images_dir / "example-paper_fig-1.png"
    # Runtime artifacts: log and resume state under a separate hidden directory.
    assert paths.runtime_dir == tmp_path / ".pdftomd-runtime" / "example-paper"
    assert paths.log_path.name == "conversion.log"
    assert paths.resume_state_path.name == "resume-state.json"
    # The runtime directory is not an ancestor of the document directory.
    assert paths.runtime_dir not in paths.document_dir.parents
def test_make_anchor_is_deterministic_and_validates_kind() -> None:
    """Check two anchor renderings and the rejection of an empty kind."""
    figure_anchor = make_anchor("Figure", "2 A")
    equation_anchor = make_anchor("Equation", "식 3")

    assert figure_anchor == "figure-2-a"
    assert equation_anchor == "equation-3"

    # An empty kind must raise with a message that mentions "kind".
    with pytest.raises(ValueError, match="kind"):
        make_anchor("", "1")
-93
View File
@@ -1,93 +0,0 @@
from __future__ import annotations
import json
import shutil
from pathlib import Path
from pdftomd.models import PageAnalysis, PageRange
from pdftomd.preanalysis import analyze_pdf, is_ocr_candidate, plan_page_chunks
# ROOT is two directory levels above this file; the sample metadata JSON is
# addressed relative to it.
ROOT = Path(__file__).resolve().parents[1]
METADATA_PATH = ROOT.joinpath("samples", "metadata.json")
def _metadata_samples() -> list[dict]:
    """Return the ``"samples"`` entry of the JSON document at ``METADATA_PATH``."""
    raw_text = METADATA_PATH.read_text(encoding="utf-8")
    document = json.loads(raw_text)
    return document["samples"]
def _sample_with(**traits: object) -> dict:
    """Return the first metadata sample whose traits equal every given keyword.

    Raises:
        AssertionError: if no sample matches all requested trait values.
    """

    def has_all_traits(sample: dict) -> bool:
        # A trait missing from the sample compares as None via ``.get``.
        sample_traits = sample["traits"]
        return all(sample_traits.get(key) == value for key, value in traits.items())

    found = next(
        (sample for sample in _metadata_samples() if has_all_traits(sample)),
        None,
    )
    if found is None:
        raise AssertionError(f"no sample matched traits: {traits}")
    return found
def test_analyze_text_heavy_sample_returns_page_facts_from_metadata() -> None:
    """Analyze a good-text-layer, non-mixed sample and check it against the metadata."""
    sample = _sample_with(text_layer_quality="good", mixed_scanned_text_pages=False)
    expected_pages = sample["page_count"]

    analysis = analyze_pdf(ROOT / sample["path"])

    # Page totals agree with the metadata and every entry is a PageAnalysis.
    assert analysis.page_count == expected_pages
    assert len(analysis.pages) == expected_pages
    assert all(isinstance(entry, PageAnalysis) for entry in analysis.pages)

    # The first page is numbered 1, has substantial text, and needs no OCR.
    first_page = analysis.pages[0]
    assert first_page.page == 1
    assert first_page.text_length > 1000
    assert first_page.has_text_layer
    assert not first_page.needs_ocr
def test_analyze_mixed_scanned_risk_sample_marks_metadata_scanned_pages() -> None:
    """Check that every page listed as scanned in the metadata is flagged for OCR."""
    sample = _sample_with(mixed_scanned_text_pages=True)

    analysis = analyze_pdf(ROOT / sample["path"])
    flagged_for_ocr = {entry.page for entry in analysis.pages if entry.needs_ocr}
    scanned_in_metadata = set(sample["traits"]["scanned_pages"])

    assert analysis.page_count == sample["page_count"]
    # The metadata pages must be a subset of the flagged pages; extras are allowed.
    assert scanned_in_metadata <= flagged_for_ocr
    assert any(entry.image_count > 0 for entry in analysis.pages)
def test_analyze_pdf_accepts_korean_pathlib_path(tmp_path: Path) -> None:
    """Copy a sample into a Korean-named directory and analyze it from there."""
    sample = _sample_with(has_korean_path=True, text_layer_quality="good")
    original = ROOT / sample["path"]

    # Stage the PDF under a directory whose name contains Hangul and a space.
    korean_dir = tmp_path / "한글 경로"
    korean_dir.mkdir()
    staged = korean_dir / original.name
    shutil.copyfile(original, staged)

    analysis = analyze_pdf(staged)

    assert analysis.page_count == sample["page_count"]
    assert analysis.pages[0].has_text_layer
def test_ocr_candidate_logic_is_deterministic() -> None:
    """Run ``is_ocr_candidate`` twice over a fixed table and compare both passes."""
    # Each row is (text_length, image_count, expected_result).
    cases = [
        (0, 0, True),
        (0, 2, True),
        (40, 0, False),
        (199, 1, True),
        (200, 1, False),
        (1000, 8, False),
    ]
    inputs = [(text_length, image_count) for text_length, image_count, _ in cases]
    expected = [outcome for _, _, outcome in cases]

    first_pass = [is_ocr_candidate(*pair) for pair in inputs]
    second_pass = [is_ocr_candidate(*pair) for pair in inputs]

    assert first_pass == expected
    # A second evaluation of the same inputs must give the same answers.
    assert first_pass == second_pass
def test_chunk_candidates_are_twenty_page_ranges_within_bounds() -> None:
    """Check the chunk plan for a 76-page document and for an empty one."""
    total_pages = 76
    expected_plan = (
        PageRange(1, 20),
        PageRange(21, 40),
        PageRange(41, 60),
        PageRange(61, 76),
    )

    planned = plan_page_chunks(total_pages)

    assert planned == expected_plan
    # No range may extend past the last page.
    assert all(page_range.end <= total_pages for page_range in planned)
    # Zero pages yields an empty tuple.
    assert plan_page_chunks(0) == ()
-166
View File
@@ -1,166 +0,0 @@
from __future__ import annotations
from pathlib import Path
from pdftomd.models import Asset, AssetKind
from pdftomd.quality import (
validate_caption_reference_anchors,
validate_chunk_frontmatter,
validate_image_links,
validate_latex_environments,
validate_markdown_quality,
validate_math_delimiters,
validate_tables,
)
def messages(issues: list[object]) -> list[str]:
    """Collect the ``message`` attribute of every issue, preserving order."""
    collected: list[str] = []
    for issue in issues:
        # ``getattr`` is used because the elements are typed as plain ``object``.
        collected.append(getattr(issue, "message"))
    return collected
def test_math_delimiters_accept_inline_and_block_math() -> None:
    """Check that balanced inline and block math produce no issues."""
    lines = (
        "Inline energy $E = mc^2$ is preserved.",
        "",
        "$$",
        r"\int_0^1 x^2 dx",
        "$$",
    )
    document = "\n".join(lines)

    assert validate_math_delimiters(document) == []
def test_math_delimiters_report_actionable_unclosed_inline_math() -> None:
    """Check the single issue reported for an inline ``$`` that is never closed."""
    found = validate_math_delimiters("The expression $E = mc^2 is missing a close.")

    assert len(found) == 1
    issue = found[0]
    assert "Unclosed inline math delimiter" in issue.message
    assert issue.line == 1
    # The message must show the delimiter itself.
    assert "$" in issue.message
def test_math_delimiters_report_actionable_unclosed_block_math() -> None:
    """Check the single issue reported for a ``$$`` block that is never closed."""
    found = validate_math_delimiters("Before\n$$\na^2 + b^2 = c^2\nAfter")

    assert len(found) == 1
    issue = found[0]
    assert "Unclosed block math delimiter" in issue.message
    # The opening ``$$`` sits on the second line of the input.
    assert issue.line == 2
def test_latex_environment_pairs_accept_nested_matching_pairs() -> None:
    """Check that a matrix environment nested inside aligned produces no issues."""
    # The literal starts at column 0 so its content is exactly the text shown.
    nested_block = r"""
$$
\begin{aligned}
a &= b \\
\begin{matrix}1 & 2\end{matrix}
\end{aligned}
$$
"""
    found = validate_latex_environments(nested_block)

    assert found == []
def test_latex_environment_pairs_report_mismatch() -> None:
    """Check the issue reported when ``aligned`` is closed by ``matrix``."""
    found = validate_latex_environments(r"\begin{aligned} x \end{matrix}")

    assert len(found) == 1
    text = found[0].message
    assert "LaTeX environment mismatch" in text
    # Both environment names must be named in the message.
    assert "aligned" in text
    assert "matrix" in text
def test_image_links_validate_filesystem_and_modeled_assets(tmp_path: Path) -> None:
    """Check that only the link with no file and no modeled asset is reported."""
    images = tmp_path / "images"
    images.mkdir()
    (images / "paper_fig-1.png").write_bytes(b"png")

    modeled_asset = Asset(
        id="asset-001",
        kind=AssetKind.FIGURE,
        relative_path="images/paper_fig-1.png",
        page=1,
    )
    # First link resolves; the second points at a file that was never created.
    document = "![Figure 1](images/paper_fig-1.png)\n![Figure 2](images/missing.png)"

    found = validate_image_links(document, base_dir=tmp_path, assets=[modeled_asset])

    expected = [
        "Image link target does not exist on disk and is not present in modeled assets: images/missing.png"
    ]
    assert messages(found) == expected
def test_simple_markdown_table_parseability() -> None:
    """Check that a well-formed two-column pipe table produces no issues."""
    rows = (
        "| A | B |",
        "| --- | --- |",
        "| 1 | 2 |",
        "| 3 | 4 |",
    )
    document = "\n".join(rows)

    assert validate_tables(document) == []
def test_markdown_table_reports_row_width_mismatch() -> None:
    """Check the issue reported for a three-cell row under a two-column header."""
    found = validate_tables("| A | B |\n| --- | --- |\n| 1 | 2 | 3 |")

    assert len(found) == 1
    assert "Markdown table row has 3 cells; expected 2" in found[0].message
def test_complex_table_can_be_represented_as_allowed_html_with_fallback() -> None:
    """Check that an HTML table followed by an image passes when fallback is allowed."""
    lines = (
        '<table id="tbl-1">',
        "<tr><th rowspan=\"2\">Load</th><th>Value</th></tr>",
        "<tr><td>42</td></tr>",
        "</table>",
        "![Table 1 fallback](images/table-1.png)",
    )
    document = "\n".join(lines)

    found = validate_tables(document, allow_html_table_fallback=True)

    assert found == []
def test_frontmatter_requires_chunk_context_fields() -> None:
    """Check the messages reported when ``title`` and ``page_range`` are absent."""
    document = "---\ndocument_slug: paper\nchunk_index: 1\n---\n# Paper"

    found = validate_chunk_frontmatter(document)

    # One message per missing field, in this order.
    expected = [
        "Chunk frontmatter is missing required field: title",
        "Chunk frontmatter is missing required field: page_range",
    ]
    assert messages(found) == expected
def test_frontmatter_accepts_required_chunk_context_fields() -> None:
    """Check that frontmatter carrying all four fields produces no issues."""
    document = "---\ntitle: Paper\ndocument_slug: paper\nchunk_index: 1\npage_range: 1-3\n---\n# Paper"

    found = validate_chunk_frontmatter(document)

    assert found == []
def test_caption_reference_anchor_shape_checks_known_reference_targets() -> None:
    """Check that only the reference to an undefined anchor is reported."""
    # ``#fig-1`` is defined by the first line; ``#table-2`` is never defined.
    lines = (
        '<a id="fig-1"></a>',
        "![Figure 1](images/fig-1.png)",
        "Figure 1. Diagram.",
        "As shown in [Fig. 1](#fig-1) and [Table 2](#table-2).",
    )
    document = "\n".join(lines)

    found = validate_caption_reference_anchors(document)

    assert messages(found) == ["Reference link points to a missing anchor: #table-2"]
def test_combined_quality_gate_does_not_mutate_markdown() -> None:
    """Check that the combined gate returns the input text unchanged and clean."""
    document = "---\ntitle: Paper\ndocument_slug: paper\nchunk_index: 1\npage_range: 1\n---\n$E=mc^2$"

    outcome = validate_markdown_quality(document)

    # The text on the result equals what was passed in.
    assert outcome.markdown == document
    assert outcome.ok
    assert outcome.issues == ()
-50
View File
@@ -1,50 +0,0 @@
from __future__ import annotations
from pathlib import Path
from pdftomd.models import DocumentIdentity
from pdftomd.runtime_contracts import ModelCachePolicy, RuntimeArtifactPaths
def test_model_cache_policy_prefers_explicit_path(tmp_path: Path) -> None:
    """Check that an explicit cache path wins over the environment variable."""
    explicit_root = tmp_path / "explicit-models"

    policy = ModelCachePolicy.from_environment(
        project_root=tmp_path,
        explicit_model_cache=explicit_root,
        env={"PDFTOMD_MODEL_CACHE": str(tmp_path / "ignored")},
    )

    assert policy.root == explicit_root
    # Each per-tool directory sits directly under the chosen root.
    assert policy.marker_dir == explicit_root / "marker"
    assert policy.nougat_dir == explicit_root / "nougat"
    assert policy.huggingface_home == explicit_root / "huggingface"
def test_model_cache_policy_uses_env_then_project_default(tmp_path: Path) -> None:
    """Check the cache root with the environment variable set and with it absent."""
    env_root = tmp_path / "env-models"

    from_env = ModelCachePolicy.from_environment(
        project_root=tmp_path,
        env={"PDFTOMD_MODEL_CACHE": str(env_root)},
    )
    from_default = ModelCachePolicy.from_environment(project_root=tmp_path, env={})

    assert from_env.root == env_root
    # With an empty environment the root is ``.models`` under the project root.
    assert from_default.root == tmp_path / ".models"
def test_model_cache_policy_exports_offline_environment(tmp_path: Path) -> None:
    """Check the Hugging Face variables exported when ``offline=True``."""
    policy = ModelCachePolicy.from_environment(project_root=tmp_path, env={})
    hf_home = tmp_path / ".models" / "huggingface"

    exported = policy.to_environment(offline=True)

    assert exported["HF_HOME"] == str(hf_home)
    assert exported["HUGGINGFACE_HUB_CACHE"] == str(hf_home / "hub")
    assert exported["HF_HUB_OFFLINE"] == "1"
def test_runtime_artifact_paths_are_outside_document_bundle(tmp_path: Path) -> None:
    """Check the runtime root, log file and resume-state file for ``Example Paper.pdf``."""
    identity = DocumentIdentity.from_path("Example Paper.pdf")

    paths = RuntimeArtifactPaths.from_output_root(tmp_path, identity)
    runtime_root = paths.root

    assert runtime_root == tmp_path / ".pdftomd-runtime" / "example-paper"
    assert paths.log_file == runtime_root / "logs" / "conversion.log"
    assert paths.resume_state_file == runtime_root / "state" / "resume-state.json"
    # The document slug appears in the runtime root path.
    assert "example-paper" in str(runtime_root)
-87
View File
@@ -1,87 +0,0 @@
import json
from pathlib import Path
import fitz
# ROOT is two directory levels above this file; the sample metadata JSON is
# addressed relative to it.
ROOT = Path(__file__).resolve().parents[1]
METADATA_PATH = ROOT.joinpath("samples", "metadata.json")

# Trait keys that every sample's "traits" mapping is checked for.
REQUIRED_TRAITS = {
    "figure_density",
    "formula_density",
    "has_korean_path",
    "layout_risk",
    "mixed_scanned_text_pages",
    "scanned_pages",
    "table_density",
    "target_regression_focus",
    "text_layer_quality",
}
def _load_metadata():
    """Parse and return the JSON document stored at ``METADATA_PATH``."""
    with METADATA_PATH.open("r", encoding="utf-8") as stream:
        parsed = json.load(stream)
    return parsed
def _metadata_samples():
    """Return the ``"samples"`` list from the metadata, asserting its shape first."""
    document = _load_metadata()
    # The top level must be a JSON object ...
    assert isinstance(document, dict)
    entries = document.get("samples")
    # ... whose "samples" value is a JSON array.
    assert isinstance(entries, list)
    return entries
def test_metadata_paths_match_current_sample_pdfs_exactly():
    """Check that the metadata lists exactly the PDFs present under ``ROOT/samples``.

    The directory is globbed relative to ``ROOT`` rather than the current
    working directory, so the result does not depend on where pytest is
    invoked from. Paths are compared as ROOT-relative POSIX strings, which is
    the form the metadata entries use (``samples/<name>.pdf``).
    """
    samples_dir = ROOT / "samples"
    expected_paths = sorted(
        path.relative_to(ROOT).as_posix()
        for path in samples_dir.glob("*.pdf")
    )
    samples = _metadata_samples()
    metadata_paths = [sample.get("path") for sample in samples]
    assert sorted(metadata_paths) == expected_paths
def test_metadata_paths_are_unique():
    """Check that no ``path`` value occurs more than once across the samples."""
    listed = [entry.get("path") for entry in _metadata_samples()]
    distinct = set(listed)

    assert len(listed) == len(distinct)
def test_metadata_paths_are_exact_relative_samples_pdf_paths():
    """Check each ``path`` is a relative ``samples/*.pdf`` string naming a real file."""
    for entry in _metadata_samples():
        raw_path = entry.get("path")
        assert isinstance(raw_path, str)
        assert raw_path.startswith("samples/")

        as_path = Path(raw_path)
        assert as_path.suffix == ".pdf"
        assert not as_path.is_absolute()
        # The file must exist when resolved against ROOT.
        assert (ROOT / raw_path).is_file()
def test_required_trait_fields_are_present():
    """Check every sample has a ``traits`` dict containing all required keys."""
    for entry in _metadata_samples():
        trait_map = entry.get("traits")
        # The sample path is the assertion message, so a failure names the entry.
        assert isinstance(trait_map, dict), entry.get("path")
        assert REQUIRED_TRAITS <= trait_map.keys(), entry.get("path")
def test_page_counts_match_current_sample_pdfs():
    """Check each recorded ``page_count`` against the page count of the opened PDF."""
    for entry in _metadata_samples():
        recorded = entry.get("page_count")
        pdf_path = ROOT / entry["path"]

        # The sample path is the assertion message, so a failure names the entry.
        assert isinstance(recorded, int), entry["path"]
        assert recorded > 0, entry["path"]
        with fitz.open(pdf_path) as pdf:
            assert recorded == pdf.page_count, entry["path"]
def test_metadata_json_is_deterministic_utf8():
    """Check the metadata file is byte-for-byte its own canonical serialization."""
    on_disk = METADATA_PATH.read_text(encoding="utf-8")
    parsed = json.loads(on_disk)

    # Canonical form: non-ASCII kept, 2-space indent, sorted keys, one trailing newline.
    canonical = json.dumps(parsed, ensure_ascii=False, indent=2, sort_keys=True) + "\n"
    assert on_disk == canonical