2014 lines
71 KiB
Python
2014 lines
71 KiB
Python
"""Conversion orchestration for local PDF-to-Markdown output."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import hashlib
|
|
import json
|
|
import os
|
|
import re
|
|
import shutil
|
|
import tempfile
|
|
from collections.abc import Callable
|
|
from dataclasses import dataclass
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path, PurePosixPath
|
|
from typing import Any, Protocol
|
|
|
|
from pdf2md.ir import (
|
|
AssetRecord,
|
|
BlockRecord,
|
|
BlockType,
|
|
DocumentRecord,
|
|
PageRecord,
|
|
TextFidelityRecord,
|
|
WarningCode,
|
|
WarningRecord,
|
|
WarningSeverity,
|
|
)
|
|
from pdf2md.gpu import GpuInfo, normalize_cuda_device, query_nvidia_gpus, select_gpu
|
|
from pdf2md.markdown import normalize_markdown
|
|
from pdf2md.math_render import create_default_math_checker
|
|
from pdf2md.math_repair import repair_math_render_failures
|
|
from pdf2md.metadata import build_metadata
|
|
from pdf2md.mineru_adapter import (
|
|
ENGINE_NAME,
|
|
MinerUAdapter,
|
|
MinerUAdapterResult,
|
|
MinerUOptions,
|
|
StrictLocalViolationError,
|
|
)
|
|
from pdf2md.mineru_profile import resolve_mineru_profile
|
|
from pdf2md.paths import (
|
|
DiscoveredPdf,
|
|
DuplicateOutputPathError,
|
|
OutputConflictError,
|
|
OutputPathError,
|
|
OutputRootError,
|
|
PathLike,
|
|
PlannedOutput,
|
|
discover_pdfs,
|
|
plan_outputs,
|
|
)
|
|
from pdf2md.pdf_splitter import PdfChunkPlan, plan_pdf_chunks, write_pdf_chunk
|
|
from pdf2md.quality import MathChecker, QualityResult, check_asset_links, check_math_renderability_details, merge_quality_results
|
|
from pdf2md.report import FinalStatus, determine_final_status, render_report
|
|
from pdf2md.text_fidelity import TextFidelityResult, check_text_fidelity, extract_pdf_text_pages
|
|
|
|
|
|
# Zero-argument callable returning the current time; its result is formatted
# into the ``created_at`` timestamp of generated metadata.
Clock = Callable[[], datetime]

# Defaults for the ``gpu`` and ``mineru_profile`` keyword arguments of the
# public entry points.
DEFAULT_GPU_DEVICE = "cuda:0"
DEFAULT_MINERU_PROFILE = "auto"

# Default number of pages per output group.
# NOTE(review): not referenced in the visible part of this module — presumably
# consumed by callers such as a CLI; confirm.
DEFAULT_CHUNK_PAGES = 20
|
|
|
|
|
|
class ConversionAdapter(Protocol):
    """Structural type of the conversion engine accepted by the entry points.

    ``MinerUAdapter`` is instantiated by ``convert_pdf`` / ``convert_input``
    when the caller does not supply an adapter.
    """

    def convert(
        self,
        input_pdf: PathLike,
        work_dir: PathLike,
        options: MinerUOptions | None = None,
    ) -> MinerUAdapterResult:
        """Run the conversion engine into a local work directory."""
        ...
|
|
|
|
|
|
@dataclass(frozen=True)
class ConversionResult:
    """Immutable record of one conversion (or recheck) and its output locations."""

    source_pdf: Path
    markdown_path: Path
    # None when no metadata file is associated with this result.
    metadata_path: Path | None
    report_path: Path
    assets_dir: Path
    # None when raw engine output was not requested (or for rechecks).
    raw_dir: Path | None
    engine: str
    engine_version: str
    final_status: FinalStatus
    warning_count: int
    warnings: tuple[WarningRecord, ...]
    pages_processed: int
    # Private carry-overs read when per-folder aggregate reports are rendered.
    _report_metadata: dict[str, Any] | None = None
    _report_quality: QualityResult | None = None

    @property
    def succeeded(self) -> bool:
        """Return True unless the final status is ``"failed"``."""
        return not (self.final_status == "failed")
|
|
|
|
|
|
@dataclass(frozen=True)
class BatchConversionResult:
    """Ordered collection of conversion results with convenience tallies."""

    results: tuple[ConversionResult, ...]

    @property
    def converted_count(self) -> int:
        """Number of results whose ``succeeded`` flag is true."""
        return len([entry for entry in self.results if entry.succeeded])

    @property
    def failed_count(self) -> int:
        """Number of results whose ``succeeded`` flag is false."""
        return len([entry for entry in self.results if not entry.succeeded])

    @property
    def warning_count(self) -> int:
        """Sum of the per-result warning counts."""
        total = 0
        for entry in self.results:
            total += entry.warning_count
        return total
|
|
|
|
|
|
@dataclass(frozen=True)
|
|
class _AssetMaterialization:
|
|
records: tuple[AssetRecord, ...]
|
|
warnings: tuple[WarningRecord, ...]
|
|
link_map: dict[str, str]
|
|
|
|
|
|
@dataclass(frozen=True)
|
|
class _ConversionTask:
|
|
output_plan: PlannedOutput
|
|
group_plan: PdfChunkPlan | None = None
|
|
group_size: int | None = None
|
|
page_plans: tuple[PdfChunkPlan, ...] = ()
|
|
original_source_pdf: Path | None = None
|
|
original_source_sha256: str | None = None
|
|
|
|
|
|
@dataclass(frozen=True)
|
|
class _PageConversionArtifact:
|
|
source_page_number: int
|
|
group_page_index: int
|
|
result: ConversionResult
|
|
markdown: str | None
|
|
metadata: dict[str, Any] | None
|
|
|
|
|
|
@dataclass(frozen=True)
|
|
class _PreparedMarkdown:
|
|
markdown: str
|
|
quality: QualityResult
|
|
|
|
|
|
_IMAGE_LINK_RE = re.compile(r"!\[(?P<alt>[^\]\n]*)\]\((?P<target>[^)\n]+)\)")
|
|
_DISPLAY_MATH_RE = re.compile(r"(?<!\\)\$\$(?P<body>.*?)(?<!\\)\$\$", re.DOTALL)
|
|
_INLINE_MATH_RE = re.compile(r"(?<!\\)\$(?P<body>[^\n$]+?)(?<!\\)\$")
|
|
# Warning codes that are dropped from previously stored metadata when warnings
# are carried over during a recheck (see ``_preserved_metadata_warnings``).
_RECHECKED_WARNING_CODES = frozenset(
    (
        WarningCode.MATH_RENDER_FAILED,
        WarningCode.MATH_RENDER_REPAIRED,
        WarningCode.ASSET_LINK_MISSING,
        WarningCode.ASSET_LINK_INVALID,
        WarningCode.TEXT_LAYER_AVAILABLE,
        WarningCode.TEXT_FIDELITY_LOW,
        WarningCode.UNEXPECTED_CJK_IN_KOREAN_TEXT,
        WarningCode.HANGUL_SPACING_SUSPECT,
        WarningCode.TEXT_PAGE_MAPPING_UNCERTAIN,
    )
)
|
|
|
|
|
|
def convert_pdf(
    input_path: PathLike,
    output_dir: PathLike,
    *,
    metadata: bool = True,
    keep_raw: bool = False,
    overwrite: bool = False,
    gpu: str | None = DEFAULT_GPU_DEVICE,
    mineru_profile: str = DEFAULT_MINERU_PROFILE,
    gpu_inventory: tuple[GpuInfo, ...] | None = None,
    strict_local: bool = True,
    adapter: ConversionAdapter | None = None,
    math_checker: MathChecker | None = None,
    chunk_pages: int | None = None,
    clock: Clock | None = None,
) -> ConversionResult | BatchConversionResult:
    """Convert one local PDF into Markdown, assets, and report outputs.

    Args:
        input_path: Path of the PDF; an existing non-file path is rejected.
        output_dir: Root directory for the planned outputs.
        metadata: Accepted for interface compatibility; this function always
            plans and converts with metadata output disabled.
        keep_raw: Forwarded to output planning and conversion.
        overwrite: Forwarded to output planning and conversion.
        gpu, mineru_profile, gpu_inventory, strict_local, math_checker:
            Forwarded unchanged to the conversion helpers.
        adapter: Conversion engine; a ``MinerUAdapter`` is created when falsy.
        chunk_pages: When None the PDF is converted as a whole; otherwise the
            PDF is planned in page groups of this size.
        clock: Time source; ``_utc_now`` is used when falsy.

    Returns:
        A ``ConversionResult`` when ``chunk_pages`` is None, otherwise a
        ``BatchConversionResult`` holding the grouped results.

    Raises:
        ValueError: If the input exists but is not a file, or discovery does
            not yield exactly one PDF.
    """

    _raise_if_strict_local_disabled(strict_local)

    candidate = Path(input_path).expanduser()
    if candidate.exists() and not candidate.is_file():
        raise ValueError("convert_pdf requires a PDF file input")

    discovered = discover_pdfs(input_path, recursive=False)
    if len(discovered) != 1:
        raise ValueError("convert_pdf requires a single PDF input")

    # Keyword arguments shared by the whole-file and the grouped code paths.
    conversion_kwargs: dict[str, Any] = {
        "adapter": adapter or MinerUAdapter(),
        "clock": clock or _utc_now,
        "metadata_enabled": False,
        "keep_raw": keep_raw,
        "overwrite": overwrite,
        "gpu": gpu,
        "mineru_profile": mineru_profile,
        "gpu_inventory": gpu_inventory,
        "strict_local": strict_local,
        "math_checker": math_checker,
    }

    if chunk_pages is None:
        whole_file_plan = plan_outputs(
            discovered,
            output_dir,
            metadata=False,
            keep_raw=keep_raw,
            overwrite=overwrite,
        )[0]
        return _convert_plan(whole_file_plan, **conversion_kwargs)

    grouped_tasks = _plan_conversion_tasks(
        discovered,
        output_dir,
        metadata=False,
        keep_raw=keep_raw,
        overwrite=overwrite,
        chunk_pages=chunk_pages,
    )
    return BatchConversionResult(_convert_tasks(grouped_tasks, **conversion_kwargs))
|
|
|
|
|
|
def convert_input(
    input_path: PathLike,
    output_dir: PathLike,
    *,
    metadata: bool = True,
    keep_raw: bool = False,
    recursive: bool = False,
    overwrite: bool = False,
    gpu: str | None = DEFAULT_GPU_DEVICE,
    mineru_profile: str = DEFAULT_MINERU_PROFILE,
    gpu_inventory: tuple[GpuInfo, ...] | None = None,
    strict_local: bool = True,
    adapter: ConversionAdapter | None = None,
    math_checker: MathChecker | None = None,
    chunk_pages: int | None = None,
    clock: Clock | None = None,
) -> BatchConversionResult:
    """Convert a local PDF or directory of PDFs.

    Args:
        input_path: File or directory handed to PDF discovery.
        output_dir: Root directory for the planned outputs.
        metadata: Accepted for interface compatibility; planning and
            conversion always run with metadata output disabled.
        keep_raw: Forwarded to planning and conversion.
        recursive: Forwarded to PDF discovery.
        overwrite: Forwarded to planning and conversion.
        gpu, mineru_profile, gpu_inventory, strict_local, math_checker:
            Forwarded unchanged to the conversion helpers.
        adapter: Conversion engine; a ``MinerUAdapter`` is created when falsy.
        chunk_pages: Page-group size, or None for whole-file conversion.
        clock: Time source; ``_utc_now`` is used when falsy.

    Returns:
        A ``BatchConversionResult`` with one result per planned task.
    """

    _raise_if_strict_local_disabled(strict_local)

    found = discover_pdfs(input_path, recursive=recursive)
    planned_tasks = _plan_conversion_tasks(
        found,
        output_dir,
        metadata=False,
        keep_raw=keep_raw,
        overwrite=overwrite,
        chunk_pages=chunk_pages,
    )

    # The engine is only created once planning has succeeded.
    engine = adapter or MinerUAdapter()
    now = clock or _utc_now
    converted = _convert_tasks(
        planned_tasks,
        adapter=engine,
        clock=now,
        metadata_enabled=False,
        keep_raw=keep_raw,
        overwrite=overwrite,
        gpu=gpu,
        mineru_profile=mineru_profile,
        gpu_inventory=gpu_inventory,
        strict_local=strict_local,
        math_checker=math_checker,
    )
    return BatchConversionResult(converted)
|
|
|
|
|
|
def recheck_markdown(
    markdown_path: PathLike,
    *,
    math_checker: MathChecker | None = None,
    clock: Clock | None = None,
) -> ConversionResult:
    """Re-run local quality checks for an existing Markdown output and rewrite metadata/report.

    The metadata JSON, report, and assets directory are located next to the
    Markdown file by swapping its suffix for ``.metadata.json``,
    ``.report.md`` and ``.assets``. Warnings stored in the existing metadata
    are kept unless their code is in ``_RECHECKED_WARNING_CODES``; freshly
    computed quality and text-fidelity warnings are appended.

    Args:
        markdown_path: Path of the Markdown file to recheck.
        math_checker: Forwarded to Markdown preparation.
        clock: Time source for the new ``created_at``; ``_utc_now`` when falsy.

    Returns:
        A ``ConversionResult`` describing the rewritten outputs; ``raw_dir``
        is always None.

    Raises:
        ValueError: If the Markdown file or adjacent metadata JSON is missing,
            or a required metadata field is absent.
    """

    markdown_file = Path(markdown_path).expanduser().resolve()
    if not markdown_file.is_file():
        raise ValueError(f"Markdown output does not exist: {markdown_file}")

    metadata_path = markdown_file.with_suffix(".metadata.json")
    report_path = markdown_file.with_suffix(".report.md")
    if not metadata_path.is_file():
        raise ValueError(f"Legacy adjacent metadata JSON is required for recheck: {metadata_path}")

    stored = _read_metadata_json(metadata_path)
    original_text = markdown_file.read_text(encoding="utf-8")
    assets_dir = markdown_file.with_suffix(".assets")
    assets = _assets_from_metadata(stored)

    prepared = _prepare_markdown_for_output(
        original_text,
        markdown_dir=markdown_file.parent,
        asset_root=assets_dir,
        math_checker=math_checker,
    )
    markdown = prepared.markdown
    quality = prepared.quality

    engine_options = _metadata_engine_options(stored)
    # Resolved once here and reused for both the fidelity check and the
    # rebuilt document.
    recorded_source = Path(_metadata_text(stored, "source_pdf"))
    page_count = _metadata_page_count(stored)
    text_fidelity = _run_text_fidelity_checks(
        recorded_source,
        markdown,
        page_count=page_count,
        engine_options=engine_options,
    )

    warnings = _preserved_metadata_warnings(stored) + quality.warnings + text_fidelity.warnings
    document = _build_document(
        source_pdf=recorded_source,
        markdown=markdown,
        assets=assets,
        warnings=warnings,
        # One placeholder entry per page.
        raw_structured={"pages": [None] * page_count},
        text_fidelity=text_fidelity.pages,
    )

    now = clock or _utc_now
    metadata_data = build_metadata(
        document=document,
        source_sha256=_metadata_text(stored, "source_sha256"),
        created_at=_format_timestamp(now()),
        engine=_metadata_text(stored, "engine"),
        engine_version=_metadata_text(stored, "engine_version"),
        engine_options=engine_options,
    )

    # Only the asset-link counters are carried into the report quality.
    report_quality = QualityResult(
        missing_asset_link_count=quality.missing_asset_link_count,
        invalid_asset_link_count=quality.invalid_asset_link_count,
    )
    report_text = render_report(
        metadata_data,
        quality=report_quality,
        markdown_path=markdown_file,
        metadata_path=metadata_path,
        report_path=report_path,
    )
    final_status = determine_final_status(metadata_data, report_quality)

    _write_text(markdown_file, markdown)
    _write_text(
        metadata_path,
        json.dumps(metadata_data, indent=2, ensure_ascii=False, sort_keys=True) + "\n",
    )
    _write_text(report_path, report_text)

    return ConversionResult(
        source_pdf=Path(_metadata_text(metadata_data, "source_pdf")),
        markdown_path=markdown_file,
        metadata_path=metadata_path,
        report_path=report_path,
        assets_dir=assets_dir,
        raw_dir=None,
        engine=_metadata_text(metadata_data, "engine"),
        engine_version=_metadata_text(metadata_data, "engine_version"),
        final_status=final_status,
        warning_count=len(warnings),
        warnings=warnings,
        pages_processed=int(metadata_data["summary"]["pages_processed"]),
        _report_metadata=metadata_data,
        _report_quality=report_quality,
    )
|
|
|
|
|
|
def _read_metadata_json(path: Path) -> dict[str, Any]:
|
|
data = json.loads(path.read_text(encoding="utf-8"))
|
|
if not isinstance(data, dict):
|
|
raise ValueError(f"metadata JSON must contain an object: {path}")
|
|
return data
|
|
|
|
|
|
def _assets_from_metadata(metadata: dict[str, Any]) -> tuple[AssetRecord, ...]:
|
|
raw_assets = metadata.get("assets", ())
|
|
if not isinstance(raw_assets, list):
|
|
return ()
|
|
assets: list[AssetRecord] = []
|
|
for item in raw_assets:
|
|
if not isinstance(item, dict):
|
|
continue
|
|
relative_path = item.get("relative_path")
|
|
if not isinstance(relative_path, str) or not relative_path:
|
|
continue
|
|
assets.append(
|
|
AssetRecord(
|
|
relative_path,
|
|
page_index=_optional_page_index(item.get("page_index")),
|
|
bbox=_optional_bbox(item.get("bbox")),
|
|
)
|
|
)
|
|
return tuple(assets)
|
|
|
|
|
|
def _preserved_metadata_warnings(metadata: dict[str, Any]) -> tuple[WarningRecord, ...]:
|
|
raw_warnings = metadata.get("warnings", ())
|
|
if not isinstance(raw_warnings, list):
|
|
return ()
|
|
warnings: list[WarningRecord] = []
|
|
for item in raw_warnings:
|
|
if not isinstance(item, dict):
|
|
continue
|
|
warning = _warning_from_metadata(item)
|
|
if warning is not None and warning.code not in _RECHECKED_WARNING_CODES:
|
|
warnings.append(warning)
|
|
return tuple(warnings)
|
|
|
|
|
|
def _warning_from_metadata(item: dict[str, Any]) -> WarningRecord | None:
|
|
code = item.get("code")
|
|
severity = item.get("severity")
|
|
message = item.get("message")
|
|
if not isinstance(code, str) or not isinstance(severity, str) or not isinstance(message, str) or not message:
|
|
return None
|
|
return WarningRecord(
|
|
WarningCode(code),
|
|
WarningSeverity(severity),
|
|
message,
|
|
page_index=_optional_page_index(item.get("page_index")),
|
|
bbox=_optional_bbox(item.get("bbox")),
|
|
)
|
|
|
|
|
|
def _metadata_text(metadata: dict[str, Any], field_name: str) -> str:
|
|
value = metadata.get(field_name)
|
|
if not isinstance(value, str) or not value:
|
|
raise ValueError(f"metadata field is required: {field_name}")
|
|
return value
|
|
|
|
|
|
def _metadata_engine_options(metadata: dict[str, Any]) -> dict[str, Any]:
|
|
value = metadata.get("engine_options", {})
|
|
return dict(value) if isinstance(value, dict) else {}
|
|
|
|
|
|
def _metadata_page_count(metadata: dict[str, Any]) -> int:
|
|
pages = metadata.get("pages")
|
|
if isinstance(pages, list) and pages:
|
|
return len(pages)
|
|
summary = metadata.get("summary")
|
|
if isinstance(summary, dict):
|
|
pages_processed = summary.get("pages_processed")
|
|
if isinstance(pages_processed, int) and pages_processed > 0:
|
|
return pages_processed
|
|
return 1
|
|
|
|
|
|
def _optional_page_index(value: object) -> int | None:
|
|
return value if isinstance(value, int) and value >= 0 else None
|
|
|
|
|
|
def _optional_bbox(value: object) -> tuple[float, float, float, float] | None:
|
|
if not isinstance(value, list | tuple) or len(value) != 4:
|
|
return None
|
|
if not all(isinstance(part, int | float) for part in value):
|
|
return None
|
|
return tuple(float(part) for part in value)
|
|
|
|
|
|
def _int_value(value: object) -> int:
|
|
return value if isinstance(value, int) else 0
|
|
|
|
|
|
def _float_value(value: object) -> float:
|
|
return float(value) if isinstance(value, int | float) else 0.0
|
|
|
|
|
|
def _optional_float_value(value: object) -> float | None:
|
|
return float(value) if isinstance(value, int | float) else None
|
|
|
|
|
|
def _bool_value(value: object) -> bool:
|
|
return value if isinstance(value, bool) else False
|
|
|
|
|
|
def _plan_conversion_tasks(
    discovered: tuple[DiscoveredPdf, ...],
    output_dir: PathLike,
    *,
    metadata: bool,
    keep_raw: bool,
    overwrite: bool,
    chunk_pages: int | None,
) -> tuple[_ConversionTask, ...]:
    """Turn discovered PDFs into conversion tasks.

    Without ``chunk_pages`` each PDF becomes one whole-file task planned by
    ``plan_outputs``. With ``chunk_pages`` every PDF is split into page
    groups; each group gets its own Markdown file (and raw directory when
    ``keep_raw`` is set) while all groups of a PDF share one ``images``
    directory and one ``<stem>_report.md``.

    Args:
        discovered: PDFs to plan for.
        output_dir: Output root directory.
        metadata: Not consulted; planning always disables metadata output.
        keep_raw: Whether raw output directories are planned.
        overwrite: When false, existing planned paths are reported as conflicts.
        chunk_pages: Pages per group, or None for whole-file conversion.

    Raises:
        ValueError: If ``chunk_pages`` is given but is not a positive integer.
    """
    if chunk_pages is None:
        whole_file_plans = plan_outputs(
            discovered,
            output_dir,
            metadata=False,
            keep_raw=keep_raw,
            overwrite=overwrite,
        )
        return tuple(_ConversionTask(output_plan=planned) for planned in whole_file_plans)
    if not isinstance(chunk_pages, int) or chunk_pages < 1:
        raise ValueError("chunk_pages must be a positive integer")

    root = _resolve_output_root(output_dir)
    tasks: list[_ConversionTask] = []
    for pdf in discovered:
        groups = plan_pdf_chunks(pdf.source_path, chunk_pages=chunk_pages)
        single_pages = plan_pdf_chunks(pdf.source_path, chunk_pages=1)
        digest = _sha256(pdf.source_path)
        folder = _output_folder_for_pdf(root, pdf)
        stem = pdf.source_path.stem
        # Zero-pad the part number to at least three digits.
        width = max(3, len(str(len(groups))))

        for group in groups:
            part_stem = f"{stem}_{group.chunk_index:0{width}d}"
            raw_dir = folder / "raw" / part_stem if keep_raw else None
            planned = PlannedOutput(
                source_pdf=pdf.source_path,
                markdown_path=folder / f"{part_stem}.md",
                assets_dir=folder / "images",
                metadata_path=None,
                report_path=folder / f"{stem}_report.md",
                raw_dir=raw_dir,
            )
            _raise_if_plan_escapes_root(planned, root)

            # Single-page plans whose start index lies inside this group.
            pages_in_group = tuple(
                page
                for page in single_pages
                if group.start_page_index <= page.start_page_index < group.end_page_index
            )
            tasks.append(
                _ConversionTask(
                    output_plan=planned,
                    group_plan=group,
                    group_size=chunk_pages,
                    page_plans=pages_in_group,
                    original_source_pdf=pdf.source_path,
                    original_source_sha256=digest,
                )
            )

    _raise_if_duplicate_task_outputs(tasks)
    if not overwrite:
        _raise_if_task_output_conflicts(tasks)
    return tuple(tasks)
|
|
|
|
|
|
def _resolve_output_root(output_dir: PathLike) -> Path:
|
|
root = Path(output_dir).expanduser()
|
|
if root.exists() and not root.is_dir():
|
|
raise OutputRootError(f"output root exists and is not a directory: {root}")
|
|
return root.resolve(strict=False)
|
|
|
|
|
|
def _output_folder_for_pdf(output_root: Path, item: DiscoveredPdf) -> Path:
    """Return ``<output_root>/<relative parent>/<pdf stem>`` for *item*.

    The relative parent is validated by ``_safe_relative_parent`` first.
    """
    parent = _safe_relative_parent(item.relative_parent)
    stem = item.source_path.stem
    return output_root / parent / stem
|
|
|
|
|
|
def _safe_relative_parent(path: Path) -> Path:
|
|
if path.is_absolute() or path.drive or path.root or ".." in path.parts:
|
|
raise OutputPathError(f"relative parent would escape the output root: {path}")
|
|
return path
|
|
|
|
|
|
def _raise_if_plan_escapes_root(plan: PlannedOutput, output_root: Path) -> None:
    """Check that every planned path resolves to a location under *output_root*.

    Raises:
        OutputPathError: For the first planned path that resolves outside the
            resolved root.
    """
    resolved_root = output_root.resolve(strict=False)
    for planned in plan.planned_paths():
        resolved = planned.resolve(strict=False)
        try:
            # relative_to raises ValueError when `resolved` is not below the root.
            resolved.relative_to(resolved_root)
        except ValueError as error:
            raise OutputPathError(f"planned path would escape the output root: {planned}") from error
|
|
|
|
|
|
def _raise_if_duplicate_task_outputs(tasks: tuple[_ConversionTask, ...] | list[_ConversionTask]) -> None:
|
|
seen: set[str] = set()
|
|
duplicates: list[Path] = []
|
|
for task in tasks:
|
|
paths = [task.output_plan.markdown_path]
|
|
if task.output_plan.raw_dir is not None:
|
|
paths.append(task.output_plan.raw_dir)
|
|
for path in paths:
|
|
key = _path_key(path)
|
|
if key in seen:
|
|
duplicates.append(path)
|
|
else:
|
|
seen.add(key)
|
|
if duplicates:
|
|
raise DuplicateOutputPathError(duplicates)
|
|
|
|
|
|
def _raise_if_task_output_conflicts(tasks: tuple[_ConversionTask, ...] | list[_ConversionTask]) -> None:
    """Reject task lists whose planned output paths already exist on disk.

    Raises:
        OutputConflictError: Carrying every planned path that exists.
    """
    existing = tuple(
        planned for planned in _unique_task_output_paths(tasks) if planned.exists()
    )
    if not existing:
        return
    raise OutputConflictError(existing)
|
|
|
|
|
|
def _unique_task_output_paths(tasks: tuple[_ConversionTask, ...] | list[_ConversionTask]) -> tuple[Path, ...]:
|
|
seen: set[str] = set()
|
|
paths: list[Path] = []
|
|
for task in tasks:
|
|
for path in task.output_plan.planned_paths():
|
|
key = _path_key(path)
|
|
if key in seen:
|
|
continue
|
|
seen.add(key)
|
|
paths.append(path)
|
|
return tuple(paths)
|
|
|
|
|
|
def _convert_tasks(
|
|
tasks: tuple[_ConversionTask, ...],
|
|
*,
|
|
adapter: ConversionAdapter,
|
|
clock: Clock,
|
|
metadata_enabled: bool,
|
|
keep_raw: bool,
|
|
overwrite: bool,
|
|
gpu: str | None,
|
|
mineru_profile: str,
|
|
gpu_inventory: tuple[GpuInfo, ...] | None,
|
|
strict_local: bool,
|
|
math_checker: MathChecker | None,
|
|
) -> tuple[ConversionResult, ...]:
|
|
if any(task.group_plan is not None for task in tasks):
|
|
if overwrite:
|
|
_clear_task_outputs(tasks)
|
|
source_text_pages_by_pdf = _source_text_pages_by_pdf(tasks)
|
|
with tempfile.TemporaryDirectory(prefix="pdf2md.pages.") as chunk_directory:
|
|
results = tuple(
|
|
_convert_task(
|
|
task,
|
|
chunk_directory=Path(chunk_directory),
|
|
adapter=adapter,
|
|
clock=clock,
|
|
metadata_enabled=metadata_enabled,
|
|
keep_raw=keep_raw,
|
|
overwrite=False,
|
|
gpu=gpu,
|
|
mineru_profile=mineru_profile,
|
|
gpu_inventory=gpu_inventory,
|
|
strict_local=strict_local,
|
|
math_checker=math_checker,
|
|
source_text_pages_by_pdf=source_text_pages_by_pdf,
|
|
)
|
|
for task in tasks
|
|
)
|
|
_write_aggregate_group_reports(results)
|
|
return results
|
|
|
|
return tuple(
|
|
_convert_task(
|
|
task,
|
|
chunk_directory=None,
|
|
adapter=adapter,
|
|
clock=clock,
|
|
metadata_enabled=metadata_enabled,
|
|
keep_raw=keep_raw,
|
|
overwrite=overwrite,
|
|
gpu=gpu,
|
|
mineru_profile=mineru_profile,
|
|
gpu_inventory=gpu_inventory,
|
|
strict_local=strict_local,
|
|
math_checker=math_checker,
|
|
)
|
|
for task in tasks
|
|
)
|
|
|
|
|
|
def _source_text_pages_by_pdf(tasks: tuple[_ConversionTask, ...]) -> dict[str, tuple[str, ...]]:
|
|
cache: dict[str, tuple[str, ...]] = {}
|
|
for task in tasks:
|
|
if task.group_plan is None or task.original_source_pdf is None:
|
|
continue
|
|
key = _path_key(task.original_source_pdf)
|
|
if key in cache:
|
|
continue
|
|
try:
|
|
cache[key] = extract_pdf_text_pages(task.original_source_pdf)
|
|
except Exception:
|
|
cache[key] = ()
|
|
return cache
|
|
|
|
|
|
def _cached_source_text_pages(
|
|
cache: dict[str, tuple[str, ...]] | None,
|
|
source_pdf: Path | None,
|
|
) -> tuple[str, ...] | None:
|
|
if cache is None or source_pdf is None:
|
|
return None
|
|
return cache.get(_path_key(source_pdf))
|
|
|
|
|
|
def _convert_task(
|
|
task: _ConversionTask,
|
|
*,
|
|
chunk_directory: Path | None,
|
|
adapter: ConversionAdapter,
|
|
clock: Clock,
|
|
metadata_enabled: bool,
|
|
keep_raw: bool,
|
|
overwrite: bool,
|
|
gpu: str | None,
|
|
mineru_profile: str,
|
|
gpu_inventory: tuple[GpuInfo, ...] | None,
|
|
strict_local: bool,
|
|
math_checker: MathChecker | None,
|
|
source_text_pages_by_pdf: dict[str, tuple[str, ...]] | None = None,
|
|
) -> ConversionResult:
|
|
if task.group_plan is None:
|
|
return _convert_plan(
|
|
task.output_plan,
|
|
adapter=adapter,
|
|
clock=clock,
|
|
metadata_enabled=metadata_enabled,
|
|
keep_raw=keep_raw,
|
|
overwrite=overwrite,
|
|
gpu=gpu,
|
|
mineru_profile=mineru_profile,
|
|
gpu_inventory=gpu_inventory,
|
|
strict_local=strict_local,
|
|
math_checker=math_checker,
|
|
)
|
|
|
|
if chunk_directory is None:
|
|
raise ValueError("temporary directory is required for grouped page conversion")
|
|
return _convert_grouped_task(
|
|
task,
|
|
temporary_root=chunk_directory,
|
|
adapter=adapter,
|
|
clock=clock,
|
|
metadata_enabled=metadata_enabled,
|
|
keep_raw=keep_raw,
|
|
overwrite=overwrite,
|
|
gpu=gpu,
|
|
mineru_profile=mineru_profile,
|
|
gpu_inventory=gpu_inventory,
|
|
strict_local=strict_local,
|
|
math_checker=math_checker,
|
|
source_text_pages_by_pdf=source_text_pages_by_pdf,
|
|
)
|
|
|
|
|
|
def _convert_grouped_task(
|
|
task: _ConversionTask,
|
|
*,
|
|
temporary_root: Path,
|
|
adapter: ConversionAdapter,
|
|
clock: Clock,
|
|
metadata_enabled: bool,
|
|
keep_raw: bool,
|
|
overwrite: bool,
|
|
gpu: str | None,
|
|
mineru_profile: str,
|
|
gpu_inventory: tuple[GpuInfo, ...] | None,
|
|
strict_local: bool,
|
|
math_checker: MathChecker | None,
|
|
source_text_pages_by_pdf: dict[str, tuple[str, ...]] | None,
|
|
) -> ConversionResult:
|
|
if task.group_plan is None or task.original_source_pdf is None or task.original_source_sha256 is None:
|
|
raise ValueError("grouped conversion requires an original source and group plan")
|
|
|
|
page_root = temporary_root / f"group-{task.group_plan.chunk_index:03d}"
|
|
page_root.mkdir(parents=True, exist_ok=True)
|
|
source_text_pages = _cached_source_text_pages(source_text_pages_by_pdf, task.original_source_pdf)
|
|
artifacts = tuple(
|
|
_convert_single_page_artifact(
|
|
page_plan,
|
|
group_plan=task.group_plan,
|
|
page_root=page_root,
|
|
adapter=adapter,
|
|
clock=clock,
|
|
keep_raw=keep_raw,
|
|
gpu=gpu,
|
|
mineru_profile=mineru_profile,
|
|
gpu_inventory=gpu_inventory,
|
|
strict_local=strict_local,
|
|
math_checker=math_checker,
|
|
original_source_pdf=task.original_source_pdf,
|
|
original_source_sha256=task.original_source_sha256,
|
|
source_text_pages=source_text_pages,
|
|
)
|
|
for page_plan in task.page_plans
|
|
)
|
|
return _write_grouped_outputs(
|
|
task.output_plan,
|
|
group_plan=task.group_plan,
|
|
group_size=task.group_size,
|
|
artifacts=artifacts,
|
|
metadata_enabled=metadata_enabled,
|
|
clock=clock,
|
|
gpu=gpu,
|
|
mineru_profile=mineru_profile,
|
|
gpu_inventory=gpu_inventory,
|
|
strict_local=strict_local,
|
|
original_source_pdf=task.original_source_pdf,
|
|
original_source_sha256=task.original_source_sha256,
|
|
math_checker=math_checker,
|
|
)
|
|
|
|
|
|
def _convert_single_page_artifact(
    page_plan: PdfChunkPlan,
    *,
    group_plan: PdfChunkPlan,
    page_root: Path,
    adapter: ConversionAdapter,
    clock: Clock,
    keep_raw: bool,
    gpu: str | None,
    mineru_profile: str,
    gpu_inventory: tuple[GpuInfo, ...] | None,
    strict_local: bool,
    math_checker: MathChecker | None,
    original_source_pdf: Path,
    original_source_sha256: str,
    source_text_pages: tuple[str, ...] | None,
) -> _PageConversionArtifact:
    """Cut one page out of the source PDF, convert it, and capture the outcome.

    The page PDF and its outputs are written under *page_root*. The page is
    always converted with metadata enabled and ``overwrite=True``; result and
    metadata are attributed to the original source PDF and digest.
    """
    page_pdf = write_pdf_chunk(page_plan, page_root / _page_pdf_filename(page_plan))
    temp_plan = _temporary_page_output_plan(page_pdf, page_root, keep_raw=keep_raw)
    page_result = _convert_plan(
        temp_plan,
        adapter=adapter,
        clock=clock,
        metadata_enabled=True,
        keep_raw=keep_raw,
        overwrite=True,
        gpu=gpu,
        mineru_profile=mineru_profile,
        gpu_inventory=gpu_inventory,
        strict_local=strict_local,
        math_checker=math_checker,
        result_source_pdf=original_source_pdf,
        metadata_source_pdf=original_source_pdf,
        metadata_source_sha256=original_source_sha256,
        engine_options_extra={"chunk": _chunk_metadata(page_plan)},
        source_text_pages=source_text_pages,
    )

    # Markdown is only read back for pages that converted successfully.
    page_markdown: str | None = None
    if page_result.succeeded and page_result.markdown_path.is_file():
        page_markdown = page_result.markdown_path.read_text(encoding="utf-8")

    page_metadata: dict[str, Any] | None = None
    if page_result.metadata_path is not None and page_result.metadata_path.is_file():
        page_metadata = _read_metadata_json(page_result.metadata_path)

    return _PageConversionArtifact(
        source_page_number=page_plan.source_page_start,
        group_page_index=page_plan.start_page_index - group_plan.start_page_index,
        result=page_result,
        markdown=page_markdown,
        metadata=page_metadata,
    )
|
|
|
|
|
|
def _write_grouped_outputs(
    plan: PlannedOutput,
    *,
    group_plan: PdfChunkPlan,
    group_size: int | None,
    artifacts: tuple[_PageConversionArtifact, ...],
    metadata_enabled: bool,
    clock: Clock,
    gpu: str | None,
    mineru_profile: str,
    gpu_inventory: tuple[GpuInfo, ...] | None,
    strict_local: bool,
    original_source_pdf: Path,
    original_source_sha256: str,
    math_checker: MathChecker | None,
) -> ConversionResult:
    """Merge the page artifacts of one group and write the group's outputs.

    The group counts as failed only when no page produced Markdown. In that
    case no Markdown file is written, but the report (and metadata, when
    enabled and planned) still is. The assets directory is always created.
    """
    usable_pages = tuple(
        artifact for artifact in artifacts if artifact.result.succeeded and artifact.markdown is not None
    )
    all_failed = not usable_pages
    warnings = _group_warnings(artifacts, all_failed=all_failed)
    engine = _first_engine(artifacts)
    engine_version = _first_engine_version(artifacts)
    failed_pages = tuple(
        artifact.source_page_number for artifact in artifacts if not artifact.result.succeeded
    )
    engine_options = _group_engine_options(
        artifacts,
        group_plan=group_plan,
        # Fall back to the group's own page count when no size was recorded.
        group_size=group_size or group_plan.page_count,
        gpu=gpu,
        mineru_profile=mineru_profile,
        gpu_inventory=gpu_inventory,
        strict_local=strict_local,
        failed_source_pages=failed_pages,
    )
    text_fidelity = _group_text_fidelity(artifacts)

    quality = QualityResult()
    assets: tuple[AssetRecord, ...] = ()
    markdown = ""
    plan.assets_dir.mkdir(parents=True, exist_ok=True)
    if not all_failed:
        markdown, assets, asset_warnings = _assemble_group_markdown_and_assets(plan, artifacts)
        prepared = _prepare_markdown_for_output(
            markdown,
            markdown_dir=plan.markdown_path.parent,
            asset_root=plan.assets_dir,
            math_checker=math_checker,
        )
        markdown = prepared.markdown
        quality = prepared.quality
        warnings = warnings + asset_warnings + quality.warnings

    document = _build_document(
        source_pdf=original_source_pdf,
        markdown=markdown,
        assets=assets,
        warnings=warnings,
        # One placeholder entry per page of the group.
        raw_structured={"pages": [None] * group_plan.page_count},
        text_fidelity=text_fidelity,
    )
    metadata_data = build_metadata(
        document=document,
        source_sha256=original_source_sha256,
        created_at=_format_timestamp(clock()),
        engine=engine,
        engine_version=engine_version,
        engine_options=engine_options,
    )
    # Only the asset-link counters are carried into the report quality.
    report_quality = QualityResult(
        missing_asset_link_count=quality.missing_asset_link_count,
        invalid_asset_link_count=quality.invalid_asset_link_count,
    )
    reported_metadata_path = plan.metadata_path if metadata_enabled else None
    report_text = render_report(
        metadata_data,
        quality=report_quality,
        markdown_path=None if all_failed else plan.markdown_path,
        metadata_path=reported_metadata_path,
        report_path=plan.report_path,
    )
    if all_failed:
        final_status = "failed"
    else:
        final_status = determine_final_status(metadata_data, report_quality)

    plan.markdown_path.parent.mkdir(parents=True, exist_ok=True)
    if not all_failed:
        _write_text(plan.markdown_path, markdown)
    # NOTE(review): the flattened source does not show whether this call sat
    # inside the success branch above; it is placed unconditionally here so
    # raw output is also kept for failed groups — confirm.
    _copy_group_raw_outputs(plan.raw_dir, artifacts)
    if metadata_enabled and plan.metadata_path is not None:
        _write_text(
            plan.metadata_path,
            json.dumps(metadata_data, indent=2, ensure_ascii=False, sort_keys=True) + "\n",
        )
    _write_text(plan.report_path, report_text)

    summary = metadata_data["summary"]
    return ConversionResult(
        source_pdf=original_source_pdf,
        markdown_path=plan.markdown_path,
        metadata_path=reported_metadata_path,
        report_path=plan.report_path,
        assets_dir=plan.assets_dir,
        raw_dir=plan.raw_dir,
        engine=engine,
        engine_version=engine_version,
        final_status=final_status,
        warning_count=int(summary["warning_count"]),
        warnings=warnings,
        pages_processed=int(summary["pages_processed"]),
        _report_metadata=metadata_data,
        _report_quality=report_quality,
    )
|
|
|
|
|
|
def _write_aggregate_group_reports(results: tuple[ConversionResult, ...]) -> None:
|
|
grouped: dict[Path, list[ConversionResult]] = {}
|
|
for result in results:
|
|
if result._report_metadata is None:
|
|
continue
|
|
grouped.setdefault(result.report_path, []).append(result)
|
|
|
|
for report_path, report_results in grouped.items():
|
|
metadatas = tuple(result._report_metadata for result in report_results if result._report_metadata is not None)
|
|
if not metadatas:
|
|
continue
|
|
aggregate_metadata = _aggregate_report_metadata(tuple(report_results), metadatas)
|
|
aggregate_metadata["engine_options"]["output_folder"] = str(report_path.parent)
|
|
aggregate_quality = _aggregate_report_quality(tuple(report_results))
|
|
report_text = render_report(
|
|
aggregate_metadata,
|
|
quality=aggregate_quality,
|
|
markdown_path=None,
|
|
metadata_path=None,
|
|
report_path=report_path,
|
|
)
|
|
_write_text(report_path, report_text)
|
|
|
|
|
|
def _aggregate_report_metadata(
    results: tuple[ConversionResult, ...],
    metadatas: tuple[dict[str, Any], ...],
) -> dict[str, Any]:
    """Combine per-group metadata dicts into one report-level metadata dict.

    Identity fields (source, digest, timestamp, engine) come from the first
    metadata dict. ``results`` and ``metadatas`` must have equal length and
    ``metadatas`` must not be empty.
    """
    head = metadatas[0]
    summary = _aggregate_summary(metadatas)
    parts = [
        _part_report_record(result, metadata)
        for result, metadata in zip(results, metadatas, strict=True)
    ]
    engine_options = _aggregate_engine_options(head.get("engine_options", {}), parts)
    warnings = _aggregate_warning_records(metadatas)
    text_fidelity = _aggregate_text_fidelity_records(metadatas)

    # At least one (empty) page entry is always emitted.
    page_entries = [{} for _ in range(max(1, _int_from_summary(summary, "pages_processed")))]
    all_assets = [asset for metadata in metadatas for asset in _list_value(metadata.get("assets"))]

    combined: dict[str, Any] = {
        "source_pdf": head.get("source_pdf", "unavailable"),
        "source_sha256": head.get("source_sha256", "unavailable"),
        "created_at": head.get("created_at", "unavailable"),
        "engine": head.get("engine", ENGINE_NAME),
        "engine_version": head.get("engine_version", "unknown"),
        "engine_options": engine_options,
        "pages": page_entries,
        "assets": all_assets,
        "warnings": warnings,
        # The warning count reflects the aggregated list, not the summed total.
        "summary": {**summary, "warning_count": len(warnings)},
    }
    if text_fidelity:
        combined["text_fidelity"] = text_fidelity
    return combined
|
|
|
|
|
|
def _aggregate_summary(metadatas: tuple[dict[str, Any], ...]) -> dict[str, Any]:
    """Sum the integer ``summary`` counters of all part metadatas.

    The six core counters are always present in the result; text-fidelity
    counters appear only when their total is non-zero.
    """
    always_reported = (
        "pages_processed",
        "warning_count",
        "asset_count",
        "display_formula_count",
        "inline_formula_count",
        "math_render_error_count",
    )
    reported_when_nonzero = (
        "text_fidelity_checked_page_count",
        "text_fidelity_low_page_count",
        "text_fidelity_unexpected_cjk_count",
        "text_fidelity_replacement_candidate_page_count",
        "text_fidelity_page_mapping_uncertain_count",
    )
    part_summaries = [_dict_value(metadata.get("summary")) for metadata in metadatas]

    totals: dict[str, Any] = {}
    for name in always_reported + reported_when_nonzero:
        count = sum(_int_from_summary(part, name) for part in part_summaries)
        if count or name in always_reported:
            totals[name] = count
    return totals
|
|
|
|
|
|
def _part_report_record(result: ConversionResult, metadata: dict[str, Any]) -> dict[str, Any]:
    """Describe one converted part (chunk) for the aggregated report.

    Chunk position and page range come from ``engine_options["chunk"]``;
    ``failed_source_pages`` is added only when
    ``engine_options["page_conversion"]`` carries a list for it.
    """
    options = _dict_value(metadata.get("engine_options"))
    chunk_info = _dict_value(options.get("chunk"))
    conversion_info = _dict_value(options.get("page_conversion"))

    chunk_fields = (
        ("index", "chunk_index"),
        ("total", "total_chunks"),
        ("source_page_start", "source_page_start"),
        ("source_page_end", "source_page_end"),
    )
    part: dict[str, Any] = {name: _int_value(chunk_info.get(source_key)) for name, source_key in chunk_fields}
    # Only advertise the Markdown file when it actually exists on disk.
    part["markdown_path"] = str(result.markdown_path) if result.markdown_path.exists() else None
    part["status"] = result.final_status
    part["warning_count"] = result.warning_count

    failed_pages = conversion_info.get("failed_source_pages")
    if isinstance(failed_pages, list):
        part["failed_source_pages"] = [page for page in failed_pages if isinstance(page, int)]
    return part
|
|
|
|
|
|
def _aggregate_engine_options(first_options: object, parts: list[dict[str, Any]]) -> dict[str, Any]:
    """Build report-level engine options from the first part's options.

    Works on a copy of ``first_options``: the per-part ``chunk`` and
    ``page_conversion`` entries are removed, ``parts`` is attached, and the
    sorted failed source pages of all parts are added when there are any.
    """
    merged = _dict_value(first_options)
    for per_part_key in ("chunk", "page_conversion"):
        merged.pop(per_part_key, None)
    merged["parts"] = parts

    failed = [
        page
        for part in parts
        for page in _list_value(part.get("failed_source_pages"))
        if isinstance(page, int)
    ]
    failed.sort()
    if failed:
        merged["failed_source_pages"] = failed
    return merged
|
|
|
|
|
|
def _aggregate_report_quality(results: tuple[ConversionResult, ...]) -> QualityResult:
    """Sum the report quality counters of all results.

    Results without a report quality contribute a default ``QualityResult()``.
    """
    qualities = [result._report_quality or QualityResult() for result in results]
    missing_links = sum(quality.missing_asset_link_count for quality in qualities)
    invalid_links = sum(quality.invalid_asset_link_count for quality in qualities)
    math_errors = sum(quality.math_render_error_count for quality in qualities)
    return QualityResult(
        missing_asset_link_count=missing_links,
        invalid_asset_link_count=invalid_links,
        math_render_error_count=math_errors,
    )
|
|
|
|
|
|
def _aggregate_warning_records(metadatas: tuple[dict[str, Any], ...]) -> list[dict[str, Any]]:
    """Collect warning dicts from all parts, shifting integer page indexes.

    Each warning is copied; an integer ``page_index`` is increased by the
    part's source page offset.  Non-dict entries are dropped.
    """
    collected: list[dict[str, Any]] = []
    for metadata in metadatas:
        offset = _source_page_offset(metadata)
        for entry in _list_value(metadata.get("warnings")):
            if isinstance(entry, dict):
                shifted = {**entry}
                local_index = shifted.get("page_index")
                if isinstance(local_index, int):
                    shifted["page_index"] = offset + local_index
                collected.append(shifted)
    return collected
|
|
|
|
|
|
def _aggregate_text_fidelity_records(metadatas: tuple[dict[str, Any], ...]) -> list[dict[str, Any]]:
    """Collect text-fidelity dicts from all parts, shifting integer page indexes.

    Each record is copied; an integer ``page_index`` is increased by the
    part's source page offset.  Non-dict entries are dropped.
    """
    collected: list[dict[str, Any]] = []
    for metadata in metadatas:
        offset = _source_page_offset(metadata)
        for entry in _list_value(metadata.get("text_fidelity")):
            if isinstance(entry, dict):
                shifted = {**entry}
                local_index = shifted.get("page_index")
                if isinstance(local_index, int):
                    shifted["page_index"] = offset + local_index
                collected.append(shifted)
    return collected
|
|
|
|
|
|
def _source_page_offset(metadata: dict[str, Any]) -> int:
    """Return ``source_page_start - 1`` of the part's chunk, or 0.

    Zero is returned when the chunk start is missing, not an integer, or not
    positive.
    """
    options = _dict_value(metadata.get("engine_options"))
    first_page = _dict_value(options.get("chunk")).get("source_page_start")
    if isinstance(first_page, int) and first_page > 0:
        return first_page - 1
    return 0
|
|
|
|
|
|
def _dict_value(value: object) -> dict[str, Any]:
|
|
return dict(value) if isinstance(value, dict) else {}
|
|
|
|
|
|
def _list_value(value: object) -> list[object]:
|
|
return list(value) if isinstance(value, list) else []
|
|
|
|
|
|
def _int_from_summary(summary: dict[str, Any], key: str) -> int:
|
|
value = summary.get(key)
|
|
return value if isinstance(value, int) else 0
|
|
|
|
|
|
def _page_pdf_filename(page_plan: PdfChunkPlan) -> str:
|
|
width = page_plan.page_number_width
|
|
return f"{page_plan.source_pdf.stem}.page-{page_plan.source_page_start:0{width}d}.pdf"
|
|
|
|
|
|
def _temporary_page_output_plan(page_pdf: Path, page_root: Path, *, keep_raw: bool) -> PlannedOutput:
    """Plan the output files of one page PDF under ``page_root / "outputs"``.

    All paths are named after the page PDF's stem.  A raw directory is
    planned only when ``keep_raw`` is true.
    """
    outputs = page_root / "outputs"
    base = page_pdf.stem
    raw_dir = outputs / f"{base}.raw" if keep_raw else None
    return PlannedOutput(
        source_pdf=page_pdf,
        markdown_path=outputs / f"{base}.md",
        assets_dir=outputs / f"{base}.assets",
        metadata_path=outputs / f"{base}.metadata.json",
        report_path=outputs / f"{base}.report.md",
        raw_dir=raw_dir,
    )
|
|
|
|
|
|
def _chunk_metadata(plan: PdfChunkPlan) -> dict[str, object]:
|
|
return {
|
|
"original_source_pdf": str(plan.source_pdf),
|
|
"chunk_index": plan.chunk_index,
|
|
"total_chunks": plan.total_chunks,
|
|
"source_page_start": plan.source_page_start,
|
|
"source_page_end": plan.source_page_end,
|
|
"chunk_page_count": plan.page_count,
|
|
}
|
|
|
|
|
|
def _group_engine_options(
    artifacts: tuple[_PageConversionArtifact, ...],
    *,
    group_plan: PdfChunkPlan,
    group_size: int,
    gpu: str | None,
    mineru_profile: str,
    gpu_inventory: tuple[GpuInfo, ...] | None,
    strict_local: bool,
    failed_source_pages: tuple[int, ...],
) -> dict[str, Any]:
    """Build the engine options recorded for a group of single-page conversions.

    Starts from the first page artifact that has engine options; when none
    does, options are derived from the requested GPU/profile settings.  The
    ``chunk`` and ``page_conversion`` entries are then replaced with values
    describing the whole group.
    """
    options = _first_page_engine_options(artifacts)
    if not options:
        fallback = _mineru_options(
            gpu=gpu,
            mineru_profile=mineru_profile,
            gpu_inventory=gpu_inventory,
            strict_local=strict_local,
        )
        options = fallback.to_engine_options()

    # Remove before re-adding so both keys end up last in insertion order.
    for group_level_key in ("chunk", "page_conversion"):
        options.pop(group_level_key, None)
    options["chunk"] = _chunk_metadata(group_plan)
    options["page_conversion"] = {
        "mode": "single_page",
        "mineru_input_page_count": 1,
        "output_group_page_count": group_size,
        "failed_source_pages": list(failed_source_pages),
    }
    return options
|
|
|
|
|
|
def _first_page_engine_options(artifacts: tuple[_PageConversionArtifact, ...]) -> dict[str, Any]:
|
|
for artifact in artifacts:
|
|
if artifact.metadata is None:
|
|
continue
|
|
value = artifact.metadata.get("engine_options")
|
|
if isinstance(value, dict):
|
|
return dict(value)
|
|
return {}
|
|
|
|
|
|
def _mineru_options(
    *,
    gpu: str | None,
    mineru_profile: str,
    gpu_inventory: tuple[GpuInfo, ...] | None,
    strict_local: bool,
) -> MinerUOptions:
    """Resolve GPU and profile settings into ``MinerUOptions``.

    The profile resolver is told CUDA was requested only when a ``cuda:``
    device was resolved and either a concrete GPU was selected or the
    profile is not the default one.
    """
    device, gpu_info = _resolve_gpu(gpu, gpu_inventory)
    wants_cuda = bool(device and device.startswith("cuda:"))
    explicit_profile = mineru_profile.strip().casefold() != DEFAULT_MINERU_PROFILE
    resolved = resolve_mineru_profile(
        mineru_profile,
        selected_gpu=gpu_info,
        cuda_requested=wants_cuda and (gpu_info is not None or explicit_profile),
    )
    return MinerUOptions(
        strict_local=strict_local,
        gpu_device=device,
        mineru_profile=mineru_profile,
        profile_environment=resolved.environment,
        profile_engine_options=resolved.to_engine_options(),
        profile_warnings=resolved.warnings,
    )
|
|
|
|
|
|
def _resolve_gpu(gpu: str | None, gpu_inventory: tuple[GpuInfo, ...] | None) -> tuple[str | None, GpuInfo | None]:
    """Resolve the requested GPU into ``(cuda_device, gpu_info)``.

    * No normalized request: ``(None, None)``.
    * ``"auto"``: select from the supplied inventory, querying the NVIDIA
      GPUs when none was supplied.
    * Explicit device without an inventory: the device is returned as-is
      with no GPU info.
    * Explicit device with an inventory: select from that inventory.
    """
    device = normalize_cuda_device(gpu)
    if device is None:
        return None, None

    if device.casefold() == "auto":
        available = query_nvidia_gpus() if gpu_inventory is None else gpu_inventory
    elif gpu_inventory is None:
        return device, None
    else:
        available = gpu_inventory

    chosen = select_gpu(available, device)
    return chosen.cuda_device, chosen.gpu
|
|
|
|
|
|
def _first_engine(artifacts: tuple[_PageConversionArtifact, ...]) -> str:
|
|
for artifact in artifacts:
|
|
if artifact.result.engine:
|
|
return artifact.result.engine
|
|
return ENGINE_NAME
|
|
|
|
|
|
def _first_engine_version(artifacts: tuple[_PageConversionArtifact, ...]) -> str:
|
|
for artifact in artifacts:
|
|
if artifact.result.engine_version:
|
|
return artifact.result.engine_version
|
|
return "unknown"
|
|
|
|
|
|
def _assemble_group_markdown_and_assets(
    plan: PlannedOutput,
    artifacts: tuple[_PageConversionArtifact, ...],
) -> tuple[str, tuple[AssetRecord, ...], tuple[WarningRecord, ...]]:
    """Join per-page Markdown into one document and gather the copied assets.

    Every page contributes a section that starts with an HTML comment naming
    its source page.  Pages that failed, or have no Markdown, contribute only
    a "conversion failed" comment.  Sections are separated by blank lines and
    the result ends with a single newline.

    Returns:
        ``(markdown, assets, warnings)`` for the whole group.
    """
    sections: list[str] = []
    group_assets: list[AssetRecord] = []
    group_warnings: list[WarningRecord] = []
    used_asset_names: set[str] = set()

    for artifact in artifacts:
        if not (artifact.result.succeeded and artifact.markdown is not None):
            sections.append(f"<!-- source-page: {artifact.source_page_number} conversion failed; see report -->")
            continue

        page_markdown, page_assets, page_warnings = _copy_page_assets_for_group(
            plan.assets_dir,
            artifact,
            used_asset_names,
        )
        group_assets.extend(page_assets)
        group_warnings.extend(page_warnings)

        text = page_markdown.strip()
        if text:
            sections.append(f"<!-- source-page: {artifact.source_page_number} -->\n\n{text}")
        else:
            sections.append(f"<!-- source-page: {artifact.source_page_number} -->")

    document = "\n\n".join(sections).rstrip() + "\n"
    return document, tuple(group_assets), tuple(group_warnings)
|
|
|
|
|
|
def _copy_page_assets_for_group(
    group_assets_dir: Path,
    artifact: _PageConversionArtifact,
    copied_asset_names: set[str],
) -> tuple[str, tuple[AssetRecord, ...], tuple[WarningRecord, ...]]:
    """Copy one page's assets into the group assets directory and relink them.

    Assets listed in the page metadata are resolved relative to the page's
    Markdown file.  Missing files, and destinations that would fall outside
    ``group_assets_dir``, are reported as warnings and skipped.

    Returns:
        ``(markdown, assets, warnings)`` where ``markdown`` is the page
        Markdown with links rewritten to the copied files.  When the page has
        no Markdown or no metadata, its Markdown (or ``""``) is returned
        unchanged with no assets or warnings.
    """
    if artifact.markdown is None or artifact.metadata is None:
        return artifact.markdown or "", (), ()

    def page_warning(code: WarningCode, message: str) -> WarningRecord:
        return WarningRecord(code, WarningSeverity.WARNING, message, page_index=artifact.group_page_index)

    rewritten_links: dict[str, str] = {}
    copied: list[AssetRecord] = []
    problems: list[WarningRecord] = []
    page_dir = artifact.result.markdown_path.parent

    for page_asset in _assets_from_metadata(artifact.metadata):
        source_file = page_dir / page_asset.relative_path
        if not source_file.is_file():
            problems.append(
                page_warning(
                    WarningCode.ASSET_LINK_MISSING,
                    f"Page asset could not be copied into grouped output: {page_asset.relative_path}",
                )
            )
            continue

        target_relative = _group_asset_relative_path(page_asset.relative_path, artifact, copied_asset_names)
        target_file = group_assets_dir.joinpath(*target_relative.parts)
        try:
            # Refuse any destination that resolves outside the group assets directory.
            target_file.resolve(strict=False).relative_to(group_assets_dir.resolve(strict=False))
        except ValueError:
            problems.append(
                page_warning(
                    WarningCode.ASSET_LINK_INVALID,
                    f"Grouped asset destination would escape assets directory: {page_asset.relative_path}",
                )
            )
            continue

        target_file.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(source_file, target_file)
        group_link = PurePosixPath(group_assets_dir.name, target_relative).as_posix()
        rewritten_links[page_asset.relative_path.replace("\\", "/")] = group_link
        copied.append(AssetRecord(group_link, page_index=artifact.group_page_index))

    return _rewrite_asset_links(artifact.markdown, rewritten_links), tuple(copied), tuple(problems)
|
|
|
|
|
|
def _group_asset_relative_path(
    relative_path: str,
    artifact: _PageConversionArtifact,
    copied_asset_names: set[str],
) -> PurePosixPath:
    """Return a unique ``page-NNN_<name>`` file name for a page asset.

    A leading component equal to the page's assets directory name is
    dropped first; ``"asset"`` is used when no file name remains.  The
    chosen name is reserved in ``copied_asset_names``.
    """
    segments = PurePosixPath(relative_path.replace("\\", "/")).parts
    if segments and segments[0] == artifact.result.assets_dir.name:
        segments = segments[1:]
    if not segments:
        segments = ("asset",)
    base_name = PurePosixPath(*segments).name or "asset"
    prefixed = f"page-{artifact.source_page_number:03d}_{base_name}"
    return _unique_asset_filename(prefixed, copied_asset_names)
|
|
|
|
|
|
def _group_warnings(
    artifacts: tuple[_PageConversionArtifact, ...],
    *,
    all_failed: bool,
) -> tuple[WarningRecord, ...]:
    """Collect the warnings of all page artifacts for the grouped output.

    Warnings of successful pages are re-indexed to the group.  Warnings of
    failed pages are prefixed with the source page number and get severity
    ERROR when every page failed, WARNING otherwise.  A failed page with no
    warnings gets a generic "MinerU failed" warning.
    """
    failure_severity = WarningSeverity.ERROR if all_failed else WarningSeverity.WARNING
    collected: list[WarningRecord] = []

    for artifact in artifacts:
        recorded = _artifact_warnings(artifact)
        if artifact.result.succeeded:
            for warning in recorded:
                collected.append(_adjust_warning_for_group(warning, artifact.group_page_index))
            continue

        if not recorded:
            recorded = (
                WarningRecord(
                    WarningCode.MINERU_CLI_FAILED,
                    failure_severity,
                    f"MinerU failed for source page {artifact.source_page_number}.",
                ),
            )
        for warning in recorded:
            collected.append(
                WarningRecord(
                    warning.code,
                    failure_severity,
                    f"Source page {artifact.source_page_number}: {warning.message}",
                    page_index=artifact.group_page_index,
                    bbox=warning.bbox,
                )
            )
    return tuple(collected)
|
|
|
|
|
|
def _artifact_warnings(artifact: _PageConversionArtifact) -> tuple[WarningRecord, ...]:
    """Return the warnings of a page artifact, preferring its metadata.

    Falls back to ``artifact.result.warnings`` when there is no metadata,
    the metadata has no warning list, or no entry could be converted.
    """
    if artifact.metadata is None:
        return artifact.result.warnings
    raw_entries = artifact.metadata.get("warnings")
    if not isinstance(raw_entries, list):
        return artifact.result.warnings

    parsed: list[WarningRecord] = []
    for entry in raw_entries:
        if not isinstance(entry, dict):
            continue
        record = _warning_from_metadata(entry)
        if record is not None:
            parsed.append(record)
    return tuple(parsed) or artifact.result.warnings
|
|
|
|
|
|
def _adjust_warning_for_group(warning: WarningRecord, group_page_index: int) -> WarningRecord:
    """Return a copy of ``warning`` whose page index is relative to the group.

    A warning without a page index gets ``group_page_index``; otherwise its
    own index is added to ``group_page_index``.
    """
    shifted_index = group_page_index
    if warning.page_index is not None:
        shifted_index = group_page_index + warning.page_index
    return WarningRecord(
        warning.code,
        warning.severity,
        warning.message,
        page_index=shifted_index,
        bbox=warning.bbox,
    )
|
|
|
|
|
|
def _group_text_fidelity(artifacts: tuple[_PageConversionArtifact, ...]) -> tuple[TextFidelityRecord, ...]:
    """Rebuild the text-fidelity records of all page artifacts for the group.

    Artifacts without metadata or without a ``text_fidelity`` list are
    skipped, as are list entries that are not dicts.
    """
    collected: list[TextFidelityRecord] = []
    for artifact in artifacts:
        raw_entries = None if artifact.metadata is None else artifact.metadata.get("text_fidelity")
        if not isinstance(raw_entries, list):
            continue
        collected.extend(
            _text_fidelity_from_metadata(entry, group_page_index=artifact.group_page_index)
            for entry in raw_entries
            if isinstance(entry, dict)
        )
    return tuple(collected)
|
|
|
|
|
|
def _text_fidelity_from_metadata(item: dict[str, Any], *, group_page_index: int) -> TextFidelityRecord:
    """Rebuild a ``TextFidelityRecord`` from its metadata dict.

    The stored ``page_index`` is added to ``group_page_index``.  A
    non-integer ``source_page_number`` becomes ``None`` and a missing or
    empty ``comparison_status`` becomes ``"unknown"``.
    """
    page_number = item.get("source_page_number")
    fields: dict[str, Any] = {
        "page_index": group_page_index + _int_value(item.get("page_index")),
        "source_page_number": page_number if isinstance(page_number, int) else None,
        "comparison_status": str(item.get("comparison_status") or "unknown"),
    }
    # The remaining fields keep their metadata key and differ only in the coercion applied.
    for name in ("pypdf_text_available", "markdown_text_available", "replacement_candidate"):
        fields[name] = _bool_value(item.get(name))
    for name in ("pypdf_hangul_count", "markdown_hangul_count", "hangul_count_delta", "unexpected_cjk_count"):
        fields[name] = _int_value(item.get(name))
    for name in ("pypdf_hangul_spacing_anomaly_ratio", "markdown_hangul_spacing_anomaly_ratio"):
        fields[name] = _float_value(item.get(name))
    for name in ("hangul_count_ratio", "text_similarity"):
        fields[name] = _optional_float_value(item.get(name))
    return TextFidelityRecord(**fields)
|
|
|
|
|
|
def _convert_plan(
    plan: PlannedOutput,
    *,
    adapter: ConversionAdapter,
    clock: Clock,
    metadata_enabled: bool,
    keep_raw: bool,
    overwrite: bool,
    gpu: str | None,
    mineru_profile: str,
    gpu_inventory: tuple[GpuInfo, ...] | None,
    strict_local: bool,
    math_checker: MathChecker | None,
    result_source_pdf: Path | None = None,
    metadata_source_pdf: Path | None = None,
    metadata_source_sha256: str | None = None,
    engine_options_extra: dict[str, object] | None = None,
    source_text_pages: tuple[str, ...] | None = None,
) -> ConversionResult:
    """Convert one planned PDF, choosing where the adapter works.

    With ``keep_raw`` the adapter works in ``plan.raw_dir``, which is kept.
    Otherwise it works in a temporary directory created next to the Markdown
    output and removed afterwards.

    Raises:
        ValueError: If ``keep_raw`` is set but the plan has no raw directory.
    """
    if overwrite:
        _clear_planned_outputs(plan)

    output_dir = plan.markdown_path.parent
    output_dir.mkdir(parents=True, exist_ok=True)
    options = _mineru_options(
        gpu=gpu,
        mineru_profile=mineru_profile,
        gpu_inventory=gpu_inventory,
        strict_local=strict_local,
    )

    def run_in(work_dir: Path) -> ConversionResult:
        return _convert_in_work_dir(
            plan,
            work_dir,
            adapter,
            options,
            clock,
            metadata_enabled,
            math_checker,
            result_source_pdf=result_source_pdf,
            metadata_source_pdf=metadata_source_pdf,
            metadata_source_sha256=metadata_source_sha256,
            engine_options_extra=engine_options_extra,
            source_text_pages=source_text_pages,
        )

    if not keep_raw:
        with tempfile.TemporaryDirectory(prefix=f"{plan.source_pdf.stem}.", dir=output_dir) as scratch_dir:
            return run_in(Path(scratch_dir))

    if plan.raw_dir is None:
        raise ValueError("raw output directory is required when keep_raw is enabled")
    plan.raw_dir.mkdir(parents=True, exist_ok=True)
    return run_in(plan.raw_dir)
|
|
|
|
|
|
def _convert_in_work_dir(
    plan: PlannedOutput,
    work_dir: Path,
    adapter: ConversionAdapter,
    options: MinerUOptions,
    clock: Clock,
    metadata_enabled: bool,
    math_checker: MathChecker | None,
    result_source_pdf: Path | None = None,
    metadata_source_pdf: Path | None = None,
    metadata_source_sha256: str | None = None,
    engine_options_extra: dict[str, object] | None = None,
    source_text_pages: tuple[str, ...] | None = None,
) -> ConversionResult:
    """Run the adapter in ``work_dir`` and write Markdown, metadata and report.

    A failed result (built by ``_failed_result``) is returned when the
    adapter raises ``StrictLocalViolationError``, reports failure, or yields
    no Markdown.  Otherwise assets are copied into ``plan.assets_dir``, the
    Markdown is relinked, normalized and prepared, text-fidelity checks run,
    and the output files are written.

    ``result_source_pdf`` overrides the source reported on the result;
    ``metadata_source_pdf`` overrides the source used for metadata and
    hashing.  ``engine_options_extra`` is merged over the adapter's options.
    """
    result_source = result_source_pdf or plan.source_pdf
    metadata_source = metadata_source_pdf or result_source

    def fail(failure_warnings: tuple[WarningRecord, ...], **engine_identity: str) -> ConversionResult:
        # Shared shape of every early failure exit; engine identity is only
        # passed once the adapter has reported it.
        return _failed_result(
            plan,
            warnings=failure_warnings,
            source_pdf=result_source,
            metadata_source_pdf=metadata_source,
            metadata_source_sha256=metadata_source_sha256,
            engine_options=options.to_engine_options(),
            clock=clock,
            **engine_identity,
        )

    try:
        adapter_result = adapter.convert(plan.source_pdf, work_dir, options)
    except StrictLocalViolationError as error:
        return fail((error.warning,))

    engine = adapter_result.engine or ENGINE_NAME
    engine_version = adapter_result.engine_version or "unknown"
    adapter_warnings = _merge_option_warnings(options.profile_warnings, adapter_result.warnings)

    if not adapter_result.succeeded:
        return fail(adapter_warnings, engine=engine, engine_version=engine_version)

    if adapter_result.raw_markdown is None:
        missing_markdown = WarningRecord(
            WarningCode.MINERU_CLI_FAILED,
            WarningSeverity.ERROR,
            "MinerU produced structured output but no Markdown; no fallback engine was used.",
        )
        return fail(adapter_warnings + (missing_markdown,), engine=engine, engine_version=engine_version)

    output_dir = plan.markdown_path.parent
    plan.assets_dir.mkdir(parents=True, exist_ok=True)
    materialized = _materialize_assets(adapter_result.asset_paths, work_dir, plan.assets_dir)
    linked_markdown = _rewrite_asset_links(adapter_result.raw_markdown, materialized.link_map)
    normalized = normalize_markdown(
        linked_markdown,
        markdown_dir=output_dir,
        asset_root=plan.assets_dir,
        check_assets=False,
    )
    prepared = _prepare_markdown_for_output(
        normalized.markdown,
        markdown_dir=output_dir,
        asset_root=plan.assets_dir,
        math_checker=math_checker,
    )
    quality = prepared.quality

    engine_options = dict(adapter_result.engine_options)
    if engine_options_extra:
        engine_options.update(engine_options_extra)

    text_fidelity = _run_text_fidelity_checks(
        metadata_source,
        prepared.markdown,
        page_count=_page_count(adapter_result.raw_structured),
        engine_options=engine_options,
        source_text_pages=source_text_pages,
    )
    warnings = (
        adapter_warnings
        + materialized.warnings
        + normalized.warnings
        + quality.warnings
        + text_fidelity.warnings
    )
    document = _build_document(
        source_pdf=metadata_source,
        markdown=prepared.markdown,
        assets=materialized.records,
        warnings=warnings,
        raw_structured=adapter_result.raw_structured,
        text_fidelity=text_fidelity.pages,
    )
    metadata_data = build_metadata(
        document=document,
        source_sha256=metadata_source_sha256 or _sha256(metadata_source),
        created_at=_format_timestamp(clock()),
        engine=engine,
        engine_version=engine_version,
        engine_options=engine_options,
    )
    # Only the asset-link counters are forwarded to the report quality.
    report_quality = QualityResult(
        missing_asset_link_count=quality.missing_asset_link_count,
        invalid_asset_link_count=quality.invalid_asset_link_count,
    )
    metadata_output = plan.metadata_path if metadata_enabled else None
    report_text = render_report(
        metadata_data,
        quality=report_quality,
        markdown_path=plan.markdown_path,
        metadata_path=metadata_output,
        report_path=plan.report_path,
    )
    final_status = determine_final_status(metadata_data, report_quality)

    _write_text(plan.markdown_path, prepared.markdown)
    if metadata_output is not None:
        _write_text(metadata_output, json.dumps(metadata_data, indent=2, ensure_ascii=False, sort_keys=True) + "\n")
    _write_text(plan.report_path, report_text)

    return ConversionResult(
        source_pdf=result_source,
        markdown_path=plan.markdown_path,
        metadata_path=metadata_output,
        report_path=plan.report_path,
        assets_dir=plan.assets_dir,
        raw_dir=plan.raw_dir,
        engine=engine,
        engine_version=engine_version,
        final_status=final_status,
        warning_count=len(warnings),
        warnings=warnings,
        pages_processed=int(metadata_data["summary"]["pages_processed"]),
        _report_metadata=metadata_data,
        _report_quality=report_quality,
    )
|
|
|
|
|
|
def _materialize_assets(asset_paths: tuple[Path, ...], work_dir: Path, assets_dir: Path) -> _AssetMaterialization:
    """Copy adapter-produced asset files into ``assets_dir``.

    Files that do not exist, lie outside ``work_dir``, or would land outside
    ``assets_dir`` are skipped with a warning.  Each copied file gets a
    unique destination name, and every known spelling of its original path
    is mapped to the final link.
    """
    asset_records: list[AssetRecord] = []
    problems: list[WarningRecord] = []
    link_map: dict[str, str] = {}
    used_names: set[str] = set()
    work_root = work_dir.resolve()

    for raw_source in asset_paths:
        source_path = Path(raw_source)
        if not (source_path.exists() and source_path.is_file()):
            problems.append(_warning(WarningCode.ASSET_LINK_MISSING, f"Adapter asset file does not exist: {source_path}"))
            continue

        try:
            relative_in_work = source_path.resolve().relative_to(work_root)
        except ValueError:
            problems.append(_warning(WarningCode.ASSET_LINK_INVALID, f"Adapter asset path is outside the work directory: {source_path}"))
            continue

        target_name = _unique_asset_filename(_asset_filename(source_path, len(used_names) + 1), used_names)
        target_file = assets_dir / target_name
        try:
            target_file.resolve(strict=False).relative_to(assets_dir.resolve(strict=False))
        except ValueError:
            problems.append(_warning(WarningCode.ASSET_LINK_INVALID, f"Adapter asset destination is outside the assets directory: {source_path}"))
            continue

        target_file.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(source_path, target_file)

        final_link = PurePosixPath(assets_dir.name, target_name).as_posix()
        asset_records.append(AssetRecord(final_link))
        _add_asset_link_keys(link_map, source_path, relative_in_work, target_name, final_link)

    return _AssetMaterialization(records=tuple(asset_records), warnings=tuple(problems), link_map=link_map)
|
|
|
|
|
|
def _asset_filename(source_path: Path, index: int) -> str:
|
|
name = source_path.name.strip()
|
|
if name and name not in {".", ".."}:
|
|
return name
|
|
suffix = source_path.suffix if source_path.suffix else ""
|
|
return f"asset-{index:03d}{suffix}"
|
|
|
|
|
|
def _unique_asset_filename(filename: str, used_names: set[str]) -> PurePosixPath:
|
|
clean_name = PurePosixPath(filename.replace("\\", "/")).name
|
|
if not clean_name or clean_name in {".", ".."}:
|
|
clean_name = "asset"
|
|
path = PurePosixPath(clean_name)
|
|
stem = path.stem or "asset"
|
|
suffix = path.suffix
|
|
candidate = f"{stem}{suffix}"
|
|
index = 2
|
|
while candidate.casefold() in used_names:
|
|
candidate = f"{stem}-{index:03d}{suffix}"
|
|
index += 1
|
|
used_names.add(candidate.casefold())
|
|
return PurePosixPath(candidate)
|
|
|
|
|
|
def _add_asset_link_keys(
    link_map: dict[str, str],
    source_path: Path,
    source_relative: Path,
    destination_relative: PurePosixPath,
    final_link: str,
) -> None:
    """Map every known spelling of an asset's path to ``final_link``.

    Keys are the relative source and destination paths, the bare file name,
    the full source path (both ``str`` and POSIX form), and the
    asset/image-directory suffixes of both relative paths.  Backslashes in
    keys are normalized to forward slashes.  ``link_map`` is updated in place.
    """
    aliases = [
        source_relative.as_posix(),
        destination_relative.as_posix(),
        source_path.name,
        str(source_path),
        source_path.as_posix(),
    ]
    aliases.extend(_asset_link_suffixes(source_relative))
    aliases.extend(_asset_link_suffixes(destination_relative))
    for alias in aliases:
        link_map[alias.replace("\\", "/")] = final_link
|
|
|
|
|
|
def _asset_link_suffixes(path: Path | PurePosixPath) -> set[str]:
|
|
parts = PurePosixPath(path.as_posix()).parts
|
|
suffixes: set[str] = set()
|
|
for index, part in enumerate(parts):
|
|
if part.casefold() in {"asset", "assets", "image", "images"} and index + 1 < len(parts):
|
|
suffixes.add(PurePosixPath(*parts[index:]).as_posix())
|
|
return suffixes
|
|
|
|
|
|
def _rewrite_asset_links(markdown: str, link_map: dict[str, str]) -> str:
|
|
if not link_map:
|
|
return markdown
|
|
|
|
def replace(match: re.Match[str]) -> str:
|
|
alt = match.group("alt")
|
|
target = match.group("target").strip()
|
|
unwrapped = _unwrap_angle_target(target).replace("\\", "/")
|
|
replacement = link_map.get(unwrapped)
|
|
if replacement is None:
|
|
return match.group(0)
|
|
return f""
|
|
|
|
return _IMAGE_LINK_RE.sub(replace, markdown)
|
|
|
|
|
|
def _build_document(
    *,
    source_pdf: Path,
    markdown: str,
    assets: tuple[AssetRecord, ...],
    warnings: tuple[WarningRecord, ...],
    raw_structured: object | None,
    text_fidelity: tuple = (),
) -> DocumentRecord:
    """Assemble the ``DocumentRecord`` used to build metadata.

    One ``PageRecord`` is created per page counted in ``raw_structured``.
    All formula blocks found in the Markdown are attached to page 0; the
    other pages carry no blocks.
    """
    total_pages = _page_count(raw_structured)
    formula_blocks = _formula_blocks(markdown)
    page_records = tuple(
        PageRecord(page_index=index, blocks=formula_blocks if index == 0 else ())
        for index in range(total_pages)
    )
    return DocumentRecord(
        source_pdf=source_pdf,
        pages=page_records,
        assets=assets,
        warnings=warnings,
        text_fidelity=text_fidelity,
    )
|
|
|
|
|
|
def _run_text_fidelity_checks(
    source_pdf: Path,
    markdown: str,
    *,
    page_count: int,
    engine_options: dict[str, Any],
    source_text_pages: tuple[str, ...] | None = None,
) -> TextFidelityResult:
    """Delegate to ``check_text_fidelity`` with the arguments passed through unchanged."""
    fidelity = check_text_fidelity(
        source_pdf,
        markdown,
        page_count=page_count,
        engine_options=engine_options,
        source_text_pages=source_text_pages,
    )
    return fidelity
|
|
|
|
|
|
def _merge_option_warnings(
|
|
option_warnings: tuple[WarningRecord, ...],
|
|
adapter_warnings: tuple[WarningRecord, ...],
|
|
) -> tuple[WarningRecord, ...]:
|
|
extras = tuple(warning for warning in option_warnings if warning not in adapter_warnings)
|
|
return extras + adapter_warnings
|
|
|
|
|
|
def _run_quality_checks(
    markdown: str,
    *,
    markdown_dir: Path,
    asset_root: Path,
    math_checker: MathChecker | None,
) -> QualityResult:
    """Check asset links and, when the Markdown contains math, its renderability.

    Without math only the asset-link result is returned.  The default math
    checker is created when none is supplied.
    """
    asset_quality = check_asset_links(markdown, markdown_dir=markdown_dir, asset_root=asset_root)
    if not _has_math(markdown):
        return asset_quality

    checker = create_default_math_checker() if math_checker is None else math_checker
    math_details = check_math_renderability_details(markdown, checker)
    return merge_quality_results(asset_quality, math_details.quality)
|
|
|
|
|
|
def _prepare_markdown_for_output(
    markdown: str,
    *,
    markdown_dir: Path,
    asset_root: Path,
    math_checker: MathChecker | None,
) -> _PreparedMarkdown:
    """Quality-check the Markdown and try to repair math that fails to render.

    The input Markdown is returned with its quality result unless repairs
    were actually applied.  In that case the repaired Markdown is returned
    with a freshly computed quality result plus the repair warnings.
    """
    asset_quality = check_asset_links(markdown, markdown_dir=markdown_dir, asset_root=asset_root)
    if not _has_math(markdown):
        return _PreparedMarkdown(markdown=markdown, quality=asset_quality)

    checker = create_default_math_checker() if math_checker is None else math_checker
    math_details = check_math_renderability_details(markdown, checker)
    unrepaired = _PreparedMarkdown(
        markdown=markdown,
        quality=merge_quality_results(asset_quality, math_details.quality),
    )
    if checker is None or not math_details.failures:
        return unrepaired

    repair_result = repair_math_render_failures(markdown, math_details.failures, checker)
    if not repair_result.repairs:
        return unrepaired

    # Re-run every check on the repaired text so counts reflect the final output.
    rechecked_quality = _run_quality_checks(
        repair_result.markdown,
        markdown_dir=markdown_dir,
        asset_root=asset_root,
        math_checker=checker,
    )
    repair_notes = QualityResult(warnings=repair_result.warnings)
    return _PreparedMarkdown(
        markdown=repair_result.markdown,
        quality=merge_quality_results(rechecked_quality, repair_notes),
    )
|
|
|
|
|
|
def _has_math(markdown: str) -> bool:
    """Return whether the display-math or inline-math pattern matches anywhere."""
    if _DISPLAY_MATH_RE.search(markdown) is not None:
        return True
    return _INLINE_MATH_RE.search(markdown) is not None
|
|
|
|
|
|
def _formula_blocks(markdown: str) -> tuple[BlockRecord, ...]:
    """Return block records for the formulas found in ``markdown``.

    Display formulas come first, followed by inline formulas found in the
    text between display formulas.  Inline matches whose body is empty or
    starts with a digit are ignored.  All blocks are attributed to page 0,
    and a single paragraph block is returned when no formula is found.
    """
    display_spans = [match.span() for match in _DISPLAY_MATH_RE.finditer(markdown)]
    blocks = [
        BlockRecord(BlockType.DISPLAY_FORMULA, page_index=0, markdown_span=span)
        for span in display_spans
    ]

    # Inline math is searched only in the gaps around the display formulas.
    gap_starts = [0] + [end for _, end in display_spans]
    gap_ends = [start for start, _ in display_spans] + [len(markdown)]
    for gap_start, gap_end in zip(gap_starts, gap_ends):
        for match in _INLINE_MATH_RE.finditer(markdown[gap_start:gap_end]):
            body = match.group("body").strip()
            if not body or body[0].isdigit():
                continue
            span = (gap_start + match.start(), gap_start + match.end())
            blocks.append(BlockRecord(BlockType.INLINE_FORMULA, page_index=0, markdown_span=span))

    return tuple(blocks) or (BlockRecord(BlockType.PARAGRAPH, page_index=0),)
|
|
|
|
|
|
def _page_count(raw_structured: object | None) -> int:
    """Estimate the number of pages described by the adapter's structured output.

    For a dict, the first usable source wins: ``pages`` (list, int or dict),
    then a ``pdf_info`` list, then a ``page_info`` list.  Otherwise the
    highest page index found anywhere in the structure plus one is used.
    The result is never less than 1.
    """
    if isinstance(raw_structured, dict):
        pages = raw_structured.get("pages")
        if isinstance(pages, (list, dict)):
            return max(1, len(pages))
        if isinstance(pages, int):
            return max(1, pages)
        for list_key in ("pdf_info", "page_info"):
            entries = raw_structured.get(list_key)
            if isinstance(entries, list):
                return max(1, len(entries))

    found_indexes = tuple(_page_indexes(raw_structured))
    if not found_indexes:
        return 1
    return max(1, max(found_indexes) + 1)
|
|
|
|
|
|
def _page_indexes(value: object) -> tuple[int, ...]:
|
|
indexes: list[int] = []
|
|
if isinstance(value, dict):
|
|
for key in ("page_idx", "page_index"):
|
|
page_value = value.get(key)
|
|
if isinstance(page_value, int) and page_value >= 0:
|
|
indexes.append(page_value)
|
|
for item in value.values():
|
|
indexes.extend(_page_indexes(item))
|
|
elif isinstance(value, list):
|
|
for item in value:
|
|
indexes.extend(_page_indexes(item))
|
|
return tuple(indexes)
|
|
|
|
|
|
def _failed_result(
    plan: PlannedOutput,
    *,
    warnings: tuple[WarningRecord, ...],
    engine: str = ENGINE_NAME,
    engine_version: str = "unknown",
    source_pdf: Path | None = None,
    metadata_source_pdf: Path | None = None,
    metadata_source_sha256: str | None = None,
    engine_options: dict[str, Any] | None = None,
    clock: Clock | None = None,
) -> ConversionResult:
    """Build the ``"failed"`` ConversionResult for *plan*.

    When *clock* is given, metadata is built for a placeholder one-page
    document carrying *warnings*, and a report rendered from it is written
    to ``plan.report_path``.  Without a clock, no metadata is built and
    nothing is written.

    Args:
        plan: Planned output locations for the conversion.
        warnings: Warnings to attach to the result (and to the metadata).
        engine: Engine name recorded in the result and metadata.
        engine_version: Engine version recorded in the result and metadata.
        source_pdf: Source reported in the result; falls back to
            ``plan.source_pdf``.
        metadata_source_pdf: Source recorded in metadata; falls back to the
            result source.
        metadata_source_sha256: Digest for the metadata source; the file is
            hashed when this is missing or empty.
        engine_options: Engine options recorded in metadata.
        clock: Supplies the metadata creation time; also switches on the
            metadata/report generation.
    """
    result_source = source_pdf or plan.source_pdf
    metadata_source = metadata_source_pdf or result_source
    report_quality = QualityResult()
    metadata_data: dict[str, Any] | None = None
    pages_processed = 0

    if clock is not None:
        placeholder_document = DocumentRecord(
            source_pdf=metadata_source,
            pages=(PageRecord(page_index=0),),
            assets=(),
            warnings=warnings,
        )
        source_digest = metadata_source_sha256 or _sha256(metadata_source)
        created_at = _format_timestamp(clock())
        metadata_data = build_metadata(
            document=placeholder_document,
            source_sha256=source_digest,
            created_at=created_at,
            engine=engine,
            engine_version=engine_version,
            engine_options=engine_options or {},
        )
        _write_text(
            plan.report_path,
            render_report(
                metadata_data,
                quality=report_quality,
                markdown_path=None,
                metadata_path=None,
                report_path=plan.report_path,
            ),
        )
        pages_processed = int(metadata_data["summary"]["pages_processed"])

    return ConversionResult(
        source_pdf=result_source,
        markdown_path=plan.markdown_path,
        metadata_path=None,
        report_path=plan.report_path,
        assets_dir=plan.assets_dir,
        raw_dir=plan.raw_dir,
        engine=engine,
        engine_version=engine_version,
        final_status="failed",
        warning_count=len(warnings),
        warnings=warnings,
        pages_processed=pages_processed,
        _report_metadata=metadata_data,
        _report_quality=report_quality,
    )
|
|
|
|
|
|
def _clear_planned_outputs(plan: PlannedOutput) -> None:
|
|
for path in plan.planned_paths():
|
|
if path.is_dir():
|
|
shutil.rmtree(path)
|
|
elif path.exists():
|
|
path.unlink()
|
|
|
|
|
|
def _clear_task_outputs(tasks: tuple[_ConversionTask, ...]) -> None:
|
|
for path in _unique_task_output_paths(tasks):
|
|
if path.is_dir():
|
|
shutil.rmtree(path)
|
|
elif path.exists():
|
|
path.unlink()
|
|
|
|
|
|
def _copy_group_raw_outputs(raw_dir: Path | None, artifacts: tuple[_PageConversionArtifact, ...]) -> None:
|
|
if raw_dir is None:
|
|
return
|
|
for artifact in artifacts:
|
|
source_raw_dir = artifact.result.raw_dir
|
|
if source_raw_dir is None or not source_raw_dir.exists():
|
|
continue
|
|
destination = raw_dir / f"page-{artifact.source_page_number:03d}"
|
|
if destination.exists():
|
|
shutil.rmtree(destination)
|
|
destination.parent.mkdir(parents=True, exist_ok=True)
|
|
shutil.copytree(source_raw_dir, destination)
|
|
|
|
|
|
def _write_text(path: Path, text: str) -> None:
|
|
path.parent.mkdir(parents=True, exist_ok=True)
|
|
path.write_text(text, encoding="utf-8")
|
|
|
|
|
|
def _sha256(path: Path) -> str:
|
|
digest = hashlib.sha256()
|
|
with path.open("rb") as file:
|
|
for chunk in iter(lambda: file.read(1024 * 1024), b""):
|
|
digest.update(chunk)
|
|
return digest.hexdigest()
|
|
|
|
|
|
def _path_key(path: Path) -> str:
|
|
return os.path.normcase(os.path.normpath(str(path.resolve(strict=False))))
|
|
|
|
|
|
def _format_timestamp(value: datetime) -> str:
|
|
if value.tzinfo is None:
|
|
value = value.replace(tzinfo=timezone.utc)
|
|
return value.astimezone(timezone.utc).isoformat().replace("+00:00", "Z")
|
|
|
|
|
|
def _utc_now() -> datetime:
|
|
return datetime.now(timezone.utc)
|
|
|
|
|
|
def _unwrap_angle_target(target: str) -> str:
|
|
if target.startswith("<") and target.endswith(">"):
|
|
return target[1:-1].strip()
|
|
return target
|
|
|
|
|
|
def _warning(code: WarningCode, message: str) -> WarningRecord:
    """Build a WarningRecord for *code* and *message* at WARNING severity."""
    severity = WarningSeverity.WARNING
    return WarningRecord(code, severity, message)
|
|
|
|
|
|
def _raise_if_strict_local_disabled(strict_local: bool) -> None:
|
|
if not strict_local:
|
|
raise StrictLocalViolationError("strict-local execution cannot be disabled in v1.")
|