# Files
# PDFToMD/src/pdf2md/conversion.py
# T
# 2026-05-14 10:16:59 +09:00
#
# 2014 lines
# 71 KiB
# Python

"""Conversion orchestration for local PDF-to-Markdown output."""
from __future__ import annotations
import hashlib
import json
import os
import re
import shutil
import tempfile
from collections.abc import Callable
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path, PurePosixPath
from typing import Any, Protocol
from pdf2md.ir import (
AssetRecord,
BlockRecord,
BlockType,
DocumentRecord,
PageRecord,
TextFidelityRecord,
WarningCode,
WarningRecord,
WarningSeverity,
)
from pdf2md.gpu import GpuInfo, normalize_cuda_device, query_nvidia_gpus, select_gpu
from pdf2md.markdown import normalize_markdown
from pdf2md.math_render import create_default_math_checker
from pdf2md.math_repair import repair_math_render_failures
from pdf2md.metadata import build_metadata
from pdf2md.mineru_adapter import (
ENGINE_NAME,
MinerUAdapter,
MinerUAdapterResult,
MinerUOptions,
StrictLocalViolationError,
)
from pdf2md.mineru_profile import resolve_mineru_profile
from pdf2md.paths import (
DiscoveredPdf,
DuplicateOutputPathError,
OutputConflictError,
OutputPathError,
OutputRootError,
PathLike,
PlannedOutput,
discover_pdfs,
plan_outputs,
)
from pdf2md.pdf_splitter import PdfChunkPlan, plan_pdf_chunks, write_pdf_chunk
from pdf2md.quality import MathChecker, QualityResult, check_asset_links, check_math_renderability_details, merge_quality_results
from pdf2md.report import FinalStatus, determine_final_status, render_report
from pdf2md.text_fidelity import TextFidelityResult, check_text_fidelity, extract_pdf_text_pages
# Zero-argument callable returning a datetime; the public entry points accept
# one as ``clock`` and fall back to ``_utc_now`` when it is omitted.
Clock = Callable[[], datetime]
# Default for the ``gpu`` keyword of convert_pdf / convert_input.
DEFAULT_GPU_DEVICE = "cuda:0"
# Default for the ``mineru_profile`` keyword of convert_pdf / convert_input.
DEFAULT_MINERU_PROFILE = "auto"
# NOTE(review): not referenced in the visible part of this file; presumably the
# group size applied by a caller such as the CLI -- confirm.
DEFAULT_CHUNK_PAGES = 20
class ConversionAdapter(Protocol):
    """Structural interface of the engine object accepted as ``adapter``.

    Any object exposing a compatible ``convert`` method satisfies it.
    """

    def convert(
        self,
        input_pdf: PathLike,
        work_dir: PathLike,
        options: MinerUOptions | None = None,
    ) -> MinerUAdapterResult:
        """Run the conversion engine into a local work directory."""
@dataclass(frozen=True)
class ConversionResult:
    """Immutable outcome of one conversion (a whole PDF or one page group)."""

    # Source document and the output locations reported for it.
    source_pdf: Path
    markdown_path: Path
    metadata_path: Path | None
    report_path: Path
    assets_dir: Path
    raw_dir: Path | None
    # Engine identification.
    engine: str
    engine_version: str
    # Status and diagnostics.
    final_status: FinalStatus
    warning_count: int
    warnings: tuple[WarningRecord, ...]
    pages_processed: int
    # Private carriers consumed when aggregate group reports are written.
    _report_metadata: dict[str, Any] | None = None
    _report_quality: QualityResult | None = None

    @property
    def succeeded(self) -> bool:
        """Return True for every final status except ``"failed"``."""
        return not (self.final_status == "failed")
@dataclass(frozen=True)
class BatchConversionResult:
    """Ordered collection of per-task results with aggregate counters."""

    results: tuple[ConversionResult, ...]

    @property
    def converted_count(self) -> int:
        """Number of results whose ``succeeded`` flag is set."""
        total = 0
        for item in self.results:
            total += item.succeeded
        return total

    @property
    def failed_count(self) -> int:
        """Number of results whose ``succeeded`` flag is not set."""
        total = 0
        for item in self.results:
            total += not item.succeeded
        return total

    @property
    def warning_count(self) -> int:
        """Sum of ``warning_count`` over all results."""
        total = 0
        for item in self.results:
            total += item.warning_count
        return total
@dataclass(frozen=True)
class _AssetMaterialization:
    """Bundle of asset records, warnings, and a string-to-string link map.

    NOTE(review): the producer is outside the visible part of this file;
    ``link_map`` presumably maps original link targets to rewritten ones.
    """

    records: tuple[AssetRecord, ...]
    warnings: tuple[WarningRecord, ...]
    link_map: dict[str, str]
@dataclass(frozen=True)
class _ConversionTask:
    """One unit of conversion work.

    A task whose ``group_plan`` is None converts the whole PDF described by
    ``output_plan``; otherwise it converts one page group page by page.
    """

    output_plan: PlannedOutput
    # The fields below are only populated for grouped (chunked) conversion.
    group_plan: PdfChunkPlan | None = None
    group_size: int | None = None
    page_plans: tuple[PdfChunkPlan, ...] = ()
    original_source_pdf: Path | None = None
    original_source_sha256: str | None = None
@dataclass(frozen=True)
class _PageConversionArtifact:
    """Result of converting a single page inside a page group."""

    # Page number in the original PDF (taken from the page plan).
    source_page_number: int
    # Zero-based position of the page relative to the start of its group.
    group_page_index: int
    result: ConversionResult
    # Markdown text / metadata read back from the page outputs, when available.
    markdown: str | None
    metadata: dict[str, Any] | None
@dataclass(frozen=True)
class _PreparedMarkdown:
    """Markdown text paired with the quality result computed for it."""

    markdown: str
    quality: QualityResult
# Markdown image link ``![alt](target)``; neither part may contain a newline.
_IMAGE_LINK_RE = re.compile(r"!\[(?P<alt>[^\]\n]*)\]\((?P<target>[^)\n]+)\)")
# Display math delimited by unescaped ``$$``; DOTALL lets the body span lines.
_DISPLAY_MATH_RE = re.compile(r"(?<!\\)\$\$(?P<body>.*?)(?<!\\)\$\$", re.DOTALL)
# Inline math delimited by unescaped single ``$`` on one line.
_INLINE_MATH_RE = re.compile(r"(?<!\\)\$(?P<body>[^\n$]+?)(?<!\\)\$")
# Warning codes that are NOT carried over from existing metadata when warnings
# are preserved for a recheck; recheck_markdown appends freshly computed quality
# and text-fidelity warnings instead.
_RECHECKED_WARNING_CODES = frozenset(
{
WarningCode.MATH_RENDER_FAILED,
WarningCode.MATH_RENDER_REPAIRED,
WarningCode.ASSET_LINK_MISSING,
WarningCode.ASSET_LINK_INVALID,
WarningCode.TEXT_LAYER_AVAILABLE,
WarningCode.TEXT_FIDELITY_LOW,
WarningCode.UNEXPECTED_CJK_IN_KOREAN_TEXT,
WarningCode.HANGUL_SPACING_SUSPECT,
WarningCode.TEXT_PAGE_MAPPING_UNCERTAIN,
}
)
def convert_pdf(
    input_path: PathLike,
    output_dir: PathLike,
    *,
    metadata: bool = True,
    keep_raw: bool = False,
    overwrite: bool = False,
    gpu: str | None = DEFAULT_GPU_DEVICE,
    mineru_profile: str = DEFAULT_MINERU_PROFILE,
    gpu_inventory: tuple[GpuInfo, ...] | None = None,
    strict_local: bool = True,
    adapter: ConversionAdapter | None = None,
    math_checker: MathChecker | None = None,
    chunk_pages: int | None = None,
    clock: Clock | None = None,
) -> ConversionResult | BatchConversionResult:
    """Convert one local PDF into Markdown, assets, and report outputs.

    Without ``chunk_pages`` the PDF is converted through a single output plan
    and a ``ConversionResult`` is returned.  With ``chunk_pages`` the work is
    planned as page-group tasks and a ``BatchConversionResult`` is returned.

    NOTE(review): ``metadata`` is accepted but never forwarded; planning and
    conversion are always invoked with metadata disabled.  Presumably retained
    for backward compatibility -- confirm.

    Raises:
        ValueError: when ``input_path`` exists but is not a file, or when
            discovery does not yield exactly one PDF.
    """
    _raise_if_strict_local_disabled(strict_local)
    expanded = Path(input_path).expanduser()
    if expanded.exists() and not expanded.is_file():
        raise ValueError("convert_pdf requires a PDF file input")
    found = discover_pdfs(input_path, recursive=False)
    if len(found) != 1:
        raise ValueError("convert_pdf requires a single PDF input")

    # Keyword arguments shared by both conversion paths.
    run_kwargs: dict[str, Any] = {
        "adapter": adapter or MinerUAdapter(),
        "clock": clock or _utc_now,
        "metadata_enabled": False,
        "keep_raw": keep_raw,
        "overwrite": overwrite,
        "gpu": gpu,
        "mineru_profile": mineru_profile,
        "gpu_inventory": gpu_inventory,
        "strict_local": strict_local,
        "math_checker": math_checker,
    }

    if chunk_pages is not None:
        grouped_tasks = _plan_conversion_tasks(
            found,
            output_dir,
            metadata=False,
            keep_raw=keep_raw,
            overwrite=overwrite,
            chunk_pages=chunk_pages,
        )
        return BatchConversionResult(_convert_tasks(grouped_tasks, **run_kwargs))

    whole_plan = plan_outputs(found, output_dir, metadata=False, keep_raw=keep_raw, overwrite=overwrite)[0]
    return _convert_plan(whole_plan, **run_kwargs)
def convert_input(
    input_path: PathLike,
    output_dir: PathLike,
    *,
    metadata: bool = True,
    keep_raw: bool = False,
    recursive: bool = False,
    overwrite: bool = False,
    gpu: str | None = DEFAULT_GPU_DEVICE,
    mineru_profile: str = DEFAULT_MINERU_PROFILE,
    gpu_inventory: tuple[GpuInfo, ...] | None = None,
    strict_local: bool = True,
    adapter: ConversionAdapter | None = None,
    math_checker: MathChecker | None = None,
    chunk_pages: int | None = None,
    clock: Clock | None = None,
) -> BatchConversionResult:
    """Convert a local PDF or directory of PDFs.

    Every discovered PDF becomes one task (or one task per page group when
    ``chunk_pages`` is given); the per-task results are returned in a
    ``BatchConversionResult``.

    NOTE(review): ``metadata`` is accepted but never forwarded; planning and
    conversion are always invoked with metadata disabled -- confirm intent.
    """
    _raise_if_strict_local_disabled(strict_local)
    found = discover_pdfs(input_path, recursive=recursive)
    planned_tasks = _plan_conversion_tasks(
        found,
        output_dir,
        metadata=False,
        keep_raw=keep_raw,
        overwrite=overwrite,
        chunk_pages=chunk_pages,
    )
    # The default adapter is only constructed once planning has succeeded.
    run_kwargs: dict[str, Any] = {
        "adapter": adapter or MinerUAdapter(),
        "clock": clock or _utc_now,
        "metadata_enabled": False,
        "keep_raw": keep_raw,
        "overwrite": overwrite,
        "gpu": gpu,
        "mineru_profile": mineru_profile,
        "gpu_inventory": gpu_inventory,
        "strict_local": strict_local,
        "math_checker": math_checker,
    }
    return BatchConversionResult(_convert_tasks(planned_tasks, **run_kwargs))
def recheck_markdown(
    markdown_path: PathLike,
    *,
    math_checker: MathChecker | None = None,
    clock: Clock | None = None,
) -> ConversionResult:
    """Re-run local quality checks for an existing Markdown output and rewrite metadata/report.

    The Markdown file, its adjacent ``.metadata.json`` and its ``.report.md``
    are all rewritten.  Warnings stored in the existing metadata are kept
    unless their code belongs to ``_RECHECKED_WARNING_CODES``; freshly computed
    quality and text-fidelity warnings are appended.

    Raises:
        ValueError: when the Markdown file or the adjacent metadata JSON is
            missing, or when a required metadata field is absent.
    """
    markdown_file = Path(markdown_path).expanduser().resolve()
    if not markdown_file.is_file():
        raise ValueError(f"Markdown output does not exist: {markdown_file}")
    metadata_path = markdown_file.with_suffix(".metadata.json")
    report_path = markdown_file.with_suffix(".report.md")
    if not metadata_path.is_file():
        raise ValueError(f"Legacy adjacent metadata JSON is required for recheck: {metadata_path}")

    previous = _read_metadata_json(metadata_path)
    original_text = markdown_file.read_text(encoding="utf-8")
    assets_dir = markdown_file.with_suffix(".assets")
    assets = _assets_from_metadata(previous)

    prepared = _prepare_markdown_for_output(
        original_text,
        markdown_dir=markdown_file.parent,
        asset_root=assets_dir,
        math_checker=math_checker,
    )
    markdown, quality = prepared.markdown, prepared.quality

    engine_options = _metadata_engine_options(previous)
    source_pdf = Path(_metadata_text(previous, "source_pdf"))
    text_fidelity = _run_text_fidelity_checks(
        source_pdf,
        markdown,
        page_count=_metadata_page_count(previous),
        engine_options=engine_options,
    )
    warnings = _preserved_metadata_warnings(previous) + quality.warnings + text_fidelity.warnings

    document = _build_document(
        source_pdf=source_pdf,
        markdown=markdown,
        assets=assets,
        warnings=warnings,
        raw_structured={"pages": [None] * _metadata_page_count(previous)},
        text_fidelity=text_fidelity.pages,
    )

    now = clock or _utc_now
    # Evaluated in the same order as the keyword arguments they feed.
    source_sha256 = _metadata_text(previous, "source_sha256")
    created_at = _format_timestamp(now())
    engine_name = _metadata_text(previous, "engine")
    engine_release = _metadata_text(previous, "engine_version")
    metadata_data = build_metadata(
        document=document,
        source_sha256=source_sha256,
        created_at=created_at,
        engine=engine_name,
        engine_version=engine_release,
        engine_options=engine_options,
    )

    # Only the asset-link counters are handed to the report.
    report_quality = QualityResult(
        missing_asset_link_count=quality.missing_asset_link_count,
        invalid_asset_link_count=quality.invalid_asset_link_count,
    )
    report_text = render_report(
        metadata_data,
        quality=report_quality,
        markdown_path=markdown_file,
        metadata_path=metadata_path,
        report_path=report_path,
    )
    final_status = determine_final_status(metadata_data, report_quality)

    serialized = json.dumps(metadata_data, indent=2, ensure_ascii=False, sort_keys=True) + "\n"
    _write_text(markdown_file, markdown)
    _write_text(metadata_path, serialized)
    _write_text(report_path, report_text)

    return ConversionResult(
        source_pdf=Path(_metadata_text(metadata_data, "source_pdf")),
        markdown_path=markdown_file,
        metadata_path=metadata_path,
        report_path=report_path,
        assets_dir=assets_dir,
        raw_dir=None,
        engine=_metadata_text(metadata_data, "engine"),
        engine_version=_metadata_text(metadata_data, "engine_version"),
        final_status=final_status,
        warning_count=len(warnings),
        warnings=warnings,
        pages_processed=int(metadata_data["summary"]["pages_processed"]),
        _report_metadata=metadata_data,
        _report_quality=report_quality,
    )
def _read_metadata_json(path: Path) -> dict[str, Any]:
    """Load a UTF-8 JSON file and return it, requiring a top-level object.

    Raises:
        ValueError: when the decoded JSON value is not a dict.
    """
    parsed = json.loads(path.read_text(encoding="utf-8"))
    if isinstance(parsed, dict):
        return parsed
    raise ValueError(f"metadata JSON must contain an object: {path}")
def _assets_from_metadata(metadata: dict[str, Any]) -> tuple[AssetRecord, ...]:
    """Rebuild asset records from the ``assets`` list of a metadata dict.

    Entries that are not dicts, or whose ``relative_path`` is not a non-empty
    string, are skipped.  A missing or non-list ``assets`` value yields ``()``.
    """
    entries = metadata.get("assets", ())
    if not isinstance(entries, list):
        return ()
    rebuilt: list[AssetRecord] = []
    for entry in entries:
        if isinstance(entry, dict):
            relative_path = entry.get("relative_path")
            if isinstance(relative_path, str) and relative_path:
                record = AssetRecord(
                    relative_path,
                    page_index=_optional_page_index(entry.get("page_index")),
                    bbox=_optional_bbox(entry.get("bbox")),
                )
                rebuilt.append(record)
    return tuple(rebuilt)
def _preserved_metadata_warnings(metadata: dict[str, Any]) -> tuple[WarningRecord, ...]:
    """Return stored warnings that a recheck does not recompute.

    Non-dict entries, entries that cannot be turned into a record, and entries
    whose code is in ``_RECHECKED_WARNING_CODES`` are left out.
    """
    entries = metadata.get("warnings", ())
    if not isinstance(entries, list):
        return ()
    kept: list[WarningRecord] = []
    for entry in entries:
        if isinstance(entry, dict):
            record = _warning_from_metadata(entry)
            if record is None or record.code in _RECHECKED_WARNING_CODES:
                continue
            kept.append(record)
    return tuple(kept)
def _warning_from_metadata(item: dict[str, Any]) -> WarningRecord | None:
    """Rebuild a warning record from one metadata entry.

    Returns None when the entry cannot be used: ``code``, ``severity`` or
    ``message`` is not a string, ``message`` is empty, or the code/severity
    string is not a known value.
    """
    code = item.get("code")
    severity = item.get("severity")
    message = item.get("message")
    if not (isinstance(code, str) and isinstance(severity, str) and isinstance(message, str) and message):
        return None
    try:
        # Lookup by value raises ValueError for an unknown member (assuming
        # these are Enum types); skip such an entry like any other malformed
        # one instead of aborting the whole recheck.
        warning_code = WarningCode(code)
        warning_severity = WarningSeverity(severity)
    except ValueError:
        return None
    return WarningRecord(
        warning_code,
        warning_severity,
        message,
        page_index=_optional_page_index(item.get("page_index")),
        bbox=_optional_bbox(item.get("bbox")),
    )
def _metadata_text(metadata: dict[str, Any], field_name: str) -> str:
    """Return a required, non-empty string field from a metadata dict.

    Raises:
        ValueError: when the field is missing, not a string, or empty.
    """
    text = metadata.get(field_name)
    if isinstance(text, str) and text:
        return text
    raise ValueError(f"metadata field is required: {field_name}")
def _metadata_engine_options(metadata: dict[str, Any]) -> dict[str, Any]:
    """Return a shallow copy of ``engine_options``, or ``{}`` if it is not a dict."""
    options = metadata.get("engine_options", {})
    if not isinstance(options, dict):
        return {}
    return dict(options)
def _metadata_page_count(metadata: dict[str, Any]) -> int:
    """Determine the page count recorded in a metadata dict.

    Preference order: length of a non-empty ``pages`` list, then a positive
    integer ``summary.pages_processed``, then the fallback value 1.
    """
    page_entries = metadata.get("pages")
    if isinstance(page_entries, list) and page_entries:
        return len(page_entries)
    summary = metadata.get("summary")
    processed = summary.get("pages_processed") if isinstance(summary, dict) else None
    if isinstance(processed, int) and processed > 0:
        return processed
    return 1
def _optional_page_index(value: object) -> int | None:
    """Return ``value`` when it is a non-negative integer, otherwise None.

    Booleans are rejected explicitly: ``bool`` is a subclass of ``int``, so a
    JSON ``true`` would otherwise be accepted as page index 1.
    """
    if isinstance(value, bool) or not isinstance(value, int):
        return None
    return value if value >= 0 else None
def _optional_bbox(value: object) -> tuple[float, float, float, float] | None:
if not isinstance(value, list | tuple) or len(value) != 4:
return None
if not all(isinstance(part, int | float) for part in value):
return None
return tuple(float(part) for part in value)
def _int_value(value: object) -> int:
    """Return ``value`` if it is an int, otherwise 0."""
    if isinstance(value, int):
        return value
    return 0
def _float_value(value: object) -> float:
return float(value) if isinstance(value, int | float) else 0.0
def _optional_float_value(value: object) -> float | None:
return float(value) if isinstance(value, int | float) else None
def _bool_value(value: object) -> bool:
    """Return ``value`` if it is a bool, otherwise False."""
    if isinstance(value, bool):
        return value
    return False
def _plan_conversion_tasks(
    discovered: tuple[DiscoveredPdf, ...],
    output_dir: PathLike,
    *,
    metadata: bool,
    keep_raw: bool,
    overwrite: bool,
    chunk_pages: int | None,
) -> tuple[_ConversionTask, ...]:
    """Turn discovered PDFs into conversion tasks.

    With ``chunk_pages=None`` each PDF yields one ungrouped task from
    ``plan_outputs``.  Otherwise each PDF is split into page groups; every
    group gets its own Markdown file (and raw directory when ``keep_raw``),
    while all groups of a PDF share one ``images`` directory and one report.

    NOTE(review): ``metadata`` is accepted but not used; plans are always
    built without a metadata path -- confirm intent.

    Raises:
        ValueError: when ``chunk_pages`` is not a positive integer.
    """
    if chunk_pages is None:
        return tuple(
            _ConversionTask(output_plan=plan)
            for plan in plan_outputs(discovered, output_dir, metadata=False, keep_raw=keep_raw, overwrite=overwrite)
        )
    if not isinstance(chunk_pages, int) or chunk_pages < 1:
        raise ValueError("chunk_pages must be a positive integer")

    root = _resolve_output_root(output_dir)
    tasks: list[_ConversionTask] = []
    for item in discovered:
        source = item.source_path
        groups = plan_pdf_chunks(source, chunk_pages=chunk_pages)
        # One single-page plan per page, distributed over the groups below.
        page_plans = plan_pdf_chunks(source, chunk_pages=1)
        source_hash = _sha256(source)
        output_folder = _output_folder_for_pdf(root, item)
        stem = source.stem
        # Zero-pad part numbers to at least three digits.
        part_width = max(3, len(str(len(groups))))
        shared_assets = output_folder / "images"
        shared_report = output_folder / f"{stem}_report.md"
        for group in groups:
            part_stem = f"{stem}_{group.chunk_index:0{part_width}d}"
            raw_dir = (output_folder / "raw" / part_stem) if keep_raw else None
            plan = PlannedOutput(
                source_pdf=source,
                markdown_path=output_folder / f"{part_stem}.md",
                assets_dir=shared_assets,
                metadata_path=None,
                report_path=shared_report,
                raw_dir=raw_dir,
            )
            _raise_if_plan_escapes_root(plan, root)
            pages_in_group = tuple(
                page
                for page in page_plans
                if group.start_page_index <= page.start_page_index < group.end_page_index
            )
            tasks.append(
                _ConversionTask(
                    output_plan=plan,
                    group_plan=group,
                    group_size=chunk_pages,
                    page_plans=pages_in_group,
                    original_source_pdf=source,
                    original_source_sha256=source_hash,
                )
            )

    _raise_if_duplicate_task_outputs(tasks)
    if not overwrite:
        _raise_if_task_output_conflicts(tasks)
    return tuple(tasks)
def _resolve_output_root(output_dir: PathLike) -> Path:
    """Expand and resolve the output root without requiring it to exist.

    Raises:
        OutputRootError: when the path exists but is not a directory.
    """
    root = Path(output_dir).expanduser()
    blocked_by_non_directory = root.exists() and not root.is_dir()
    if blocked_by_non_directory:
        raise OutputRootError(f"output root exists and is not a directory: {root}")
    return root.resolve(strict=False)
def _output_folder_for_pdf(output_root: Path, item: DiscoveredPdf) -> Path:
    """Return ``<output_root>/<relative parent>/<pdf stem>`` for a discovered PDF."""
    parent = _safe_relative_parent(item.relative_parent)
    return output_root.joinpath(parent, item.source_path.stem)
def _safe_relative_parent(path: Path) -> Path:
    """Return ``path`` unchanged after checking that it stays below its anchor.

    Raises:
        OutputPathError: when the path is absolute, carries a drive or root,
            or contains a ``..`` component.
    """
    is_anchored = path.is_absolute() or bool(path.drive) or bool(path.root)
    if is_anchored or ".." in path.parts:
        raise OutputPathError(f"relative parent would escape the output root: {path}")
    return path
def _raise_if_plan_escapes_root(plan: PlannedOutput, output_root: Path) -> None:
    """Verify that every planned path resolves to a location under the root.

    Raises:
        OutputPathError: for the first planned path outside ``output_root``.
    """
    root = output_root.resolve(strict=False)
    for path in plan.planned_paths():
        try:
            resolved = path.resolve(strict=False)
            # relative_to raises ValueError when ``resolved`` is not below ``root``.
            resolved.relative_to(root)
        except ValueError as error:
            raise OutputPathError(f"planned path would escape the output root: {path}") from error
def _raise_if_duplicate_task_outputs(tasks: tuple[_ConversionTask, ...] | list[_ConversionTask]) -> None:
    """Reject task sets in which two tasks claim the same Markdown or raw path.

    Only the Markdown path and the optional raw directory of each task are
    compared; paths are compared through ``_path_key``.

    Raises:
        DuplicateOutputPathError: carrying every path that was claimed again.
    """
    claimed: set[str] = set()
    repeated: list[Path] = []
    for task in tasks:
        candidates = [task.output_plan.markdown_path]
        if task.output_plan.raw_dir is not None:
            candidates.append(task.output_plan.raw_dir)
        for candidate in candidates:
            key = _path_key(candidate)
            if key not in claimed:
                claimed.add(key)
                continue
            repeated.append(candidate)
    if repeated:
        raise DuplicateOutputPathError(repeated)
def _raise_if_task_output_conflicts(tasks: tuple[_ConversionTask, ...] | list[_ConversionTask]) -> None:
    """Fail when any planned output path of the tasks already exists.

    Raises:
        OutputConflictError: carrying the existing paths.
    """
    existing = tuple(filter(lambda candidate: candidate.exists(), _unique_task_output_paths(tasks)))
    if existing:
        raise OutputConflictError(existing)
def _unique_task_output_paths(tasks: tuple[_ConversionTask, ...] | list[_ConversionTask]) -> tuple[Path, ...]:
    """Collect the planned paths of all tasks, de-duplicated by ``_path_key``.

    The first path seen for each key is kept, in first-seen order.
    """
    first_by_key: dict[str, Path] = {}
    for task in tasks:
        for candidate in task.output_plan.planned_paths():
            # setdefault keeps the earliest path for a key; dicts preserve order.
            first_by_key.setdefault(_path_key(candidate), candidate)
    return tuple(first_by_key.values())
def _convert_tasks(
    tasks: tuple[_ConversionTask, ...],
    *,
    adapter: ConversionAdapter,
    clock: Clock,
    metadata_enabled: bool,
    keep_raw: bool,
    overwrite: bool,
    gpu: str | None,
    mineru_profile: str,
    gpu_inventory: tuple[GpuInfo, ...] | None,
    strict_local: bool,
    math_checker: MathChecker | None,
) -> tuple[ConversionResult, ...]:
    """Run every task in order and return the results in the same order.

    When at least one task is grouped, existing outputs are cleared up front
    if ``overwrite`` is set, source text pages are extracted once per PDF, all
    per-page work happens inside one temporary directory, and the aggregate
    per-PDF reports are written after the last task.
    """
    common: dict[str, Any] = {
        "adapter": adapter,
        "clock": clock,
        "metadata_enabled": metadata_enabled,
        "keep_raw": keep_raw,
        "gpu": gpu,
        "mineru_profile": mineru_profile,
        "gpu_inventory": gpu_inventory,
        "strict_local": strict_local,
        "math_checker": math_checker,
    }

    has_grouped_task = any(task.group_plan is not None for task in tasks)
    if not has_grouped_task:
        return tuple(
            _convert_task(task, chunk_directory=None, overwrite=overwrite, **common)
            for task in tasks
        )

    if overwrite:
        _clear_task_outputs(tasks)
    text_pages_cache = _source_text_pages_by_pdf(tasks)
    with tempfile.TemporaryDirectory(prefix="pdf2md.pages.") as chunk_directory:
        scratch = Path(chunk_directory)
        collected: list[ConversionResult] = []
        for task in tasks:
            # Outputs were already cleared above, so tasks never overwrite.
            outcome = _convert_task(
                task,
                chunk_directory=scratch,
                overwrite=False,
                source_text_pages_by_pdf=text_pages_cache,
                **common,
            )
            collected.append(outcome)
        results = tuple(collected)
        _write_aggregate_group_reports(results)
        return results
def _source_text_pages_by_pdf(tasks: tuple[_ConversionTask, ...]) -> dict[str, tuple[str, ...]]:
    """Extract the text pages of each grouped task's source PDF once.

    The result is keyed by ``_path_key`` of the source PDF.  Extraction is
    best-effort: any failure is recorded as an empty tuple for that PDF.
    """
    pages_by_key: dict[str, tuple[str, ...]] = {}
    for task in tasks:
        if task.group_plan is None or task.original_source_pdf is None:
            continue
        key = _path_key(task.original_source_pdf)
        if key not in pages_by_key:
            try:
                extracted = extract_pdf_text_pages(task.original_source_pdf)
            except Exception:
                # Deliberately broad: a failed extraction must not stop conversion.
                extracted = ()
            pages_by_key[key] = extracted
    return pages_by_key
def _cached_source_text_pages(
    cache: dict[str, tuple[str, ...]] | None,
    source_pdf: Path | None,
) -> tuple[str, ...] | None:
    """Look up the cached text pages of ``source_pdf``.

    Returns None when there is no cache, no source, or no entry for the source.
    """
    if cache is not None and source_pdf is not None:
        return cache.get(_path_key(source_pdf))
    return None
def _convert_task(
    task: _ConversionTask,
    *,
    chunk_directory: Path | None,
    adapter: ConversionAdapter,
    clock: Clock,
    metadata_enabled: bool,
    keep_raw: bool,
    overwrite: bool,
    gpu: str | None,
    mineru_profile: str,
    gpu_inventory: tuple[GpuInfo, ...] | None,
    strict_local: bool,
    math_checker: MathChecker | None,
    source_text_pages_by_pdf: dict[str, tuple[str, ...]] | None = None,
) -> ConversionResult:
    """Dispatch one task to whole-PDF or grouped page-by-page conversion.

    Raises:
        ValueError: when the task is grouped but no ``chunk_directory`` is given.
    """
    common: dict[str, Any] = {
        "adapter": adapter,
        "clock": clock,
        "metadata_enabled": metadata_enabled,
        "keep_raw": keep_raw,
        "overwrite": overwrite,
        "gpu": gpu,
        "mineru_profile": mineru_profile,
        "gpu_inventory": gpu_inventory,
        "strict_local": strict_local,
        "math_checker": math_checker,
    }
    if task.group_plan is None:
        return _convert_plan(task.output_plan, **common)
    if chunk_directory is None:
        raise ValueError("temporary directory is required for grouped page conversion")
    return _convert_grouped_task(
        task,
        temporary_root=chunk_directory,
        source_text_pages_by_pdf=source_text_pages_by_pdf,
        **common,
    )
def _convert_grouped_task(
    task: _ConversionTask,
    *,
    temporary_root: Path,
    adapter: ConversionAdapter,
    clock: Clock,
    metadata_enabled: bool,
    keep_raw: bool,
    overwrite: bool,
    gpu: str | None,
    mineru_profile: str,
    gpu_inventory: tuple[GpuInfo, ...] | None,
    strict_local: bool,
    math_checker: MathChecker | None,
    source_text_pages_by_pdf: dict[str, tuple[str, ...]] | None,
) -> ConversionResult:
    """Convert one page group page by page and write the group's outputs.

    Each page of ``task.page_plans`` is converted separately inside a scratch
    directory below ``temporary_root``; the page artifacts are then merged by
    ``_write_grouped_outputs``.

    NOTE(review): ``overwrite`` is accepted for signature parity but is not
    used here; page conversions always overwrite their scratch outputs.

    Raises:
        ValueError: when the task lacks a group plan, source path or hash.
    """
    if task.group_plan is None or task.original_source_pdf is None or task.original_source_sha256 is None:
        raise ValueError("grouped conversion requires an original source and group plan")

    # The scratch directory is keyed by source path as well as group index.
    # ``temporary_root`` is shared by every task of a batch, and page files are
    # named from the PDF stem only, so two PDFs with the same stem would
    # otherwise reuse each other's page PDFs and leftover page outputs.
    source_key = hashlib.sha256(os.fsencode(task.original_source_pdf)).hexdigest()[:12]
    page_root = temporary_root / f"{source_key}-group-{task.group_plan.chunk_index:03d}"
    page_root.mkdir(parents=True, exist_ok=True)

    source_text_pages = _cached_source_text_pages(source_text_pages_by_pdf, task.original_source_pdf)
    artifacts = tuple(
        _convert_single_page_artifact(
            page_plan,
            group_plan=task.group_plan,
            page_root=page_root,
            adapter=adapter,
            clock=clock,
            keep_raw=keep_raw,
            gpu=gpu,
            mineru_profile=mineru_profile,
            gpu_inventory=gpu_inventory,
            strict_local=strict_local,
            math_checker=math_checker,
            original_source_pdf=task.original_source_pdf,
            original_source_sha256=task.original_source_sha256,
            source_text_pages=source_text_pages,
        )
        for page_plan in task.page_plans
    )
    return _write_grouped_outputs(
        task.output_plan,
        group_plan=task.group_plan,
        group_size=task.group_size,
        artifacts=artifacts,
        metadata_enabled=metadata_enabled,
        clock=clock,
        gpu=gpu,
        mineru_profile=mineru_profile,
        gpu_inventory=gpu_inventory,
        strict_local=strict_local,
        original_source_pdf=task.original_source_pdf,
        original_source_sha256=task.original_source_sha256,
        math_checker=math_checker,
    )
def _convert_single_page_artifact(
    page_plan: PdfChunkPlan,
    *,
    group_plan: PdfChunkPlan,
    page_root: Path,
    adapter: ConversionAdapter,
    clock: Clock,
    keep_raw: bool,
    gpu: str | None,
    mineru_profile: str,
    gpu_inventory: tuple[GpuInfo, ...] | None,
    strict_local: bool,
    math_checker: MathChecker | None,
    original_source_pdf: Path,
    original_source_sha256: str,
    source_text_pages: tuple[str, ...] | None,
) -> _PageConversionArtifact:
    """Extract one page into its own PDF, convert it, and read back the outputs.

    The page is converted with metadata enabled and overwrite on, and is
    attributed to the original source PDF and hash.  Markdown is only read for
    a successful result; metadata is read whenever the metadata file exists.
    """
    page_pdf = write_pdf_chunk(page_plan, page_root / _page_pdf_filename(page_plan))
    scratch_plan = _temporary_page_output_plan(page_pdf, page_root, keep_raw=keep_raw)
    result = _convert_plan(
        scratch_plan,
        adapter=adapter,
        clock=clock,
        metadata_enabled=True,
        keep_raw=keep_raw,
        overwrite=True,
        gpu=gpu,
        mineru_profile=mineru_profile,
        gpu_inventory=gpu_inventory,
        strict_local=strict_local,
        math_checker=math_checker,
        result_source_pdf=original_source_pdf,
        metadata_source_pdf=original_source_pdf,
        metadata_source_sha256=original_source_sha256,
        engine_options_extra={"chunk": _chunk_metadata(page_plan)},
        source_text_pages=source_text_pages,
    )

    page_markdown: str | None = None
    if result.succeeded and result.markdown_path.is_file():
        page_markdown = result.markdown_path.read_text(encoding="utf-8")

    page_metadata: dict[str, Any] | None = None
    if result.metadata_path is not None and result.metadata_path.is_file():
        page_metadata = _read_metadata_json(result.metadata_path)

    return _PageConversionArtifact(
        source_page_number=page_plan.source_page_start,
        # Offset of this page from the first page of its group.
        group_page_index=page_plan.start_page_index - group_plan.start_page_index,
        result=result,
        markdown=page_markdown,
        metadata=page_metadata,
    )
def _write_grouped_outputs(
    plan: PlannedOutput,
    *,
    group_plan: PdfChunkPlan,
    group_size: int | None,
    artifacts: tuple[_PageConversionArtifact, ...],
    metadata_enabled: bool,
    clock: Clock,
    gpu: str | None,
    mineru_profile: str,
    gpu_inventory: tuple[GpuInfo, ...] | None,
    strict_local: bool,
    original_source_pdf: Path,
    original_source_sha256: str,
    math_checker: MathChecker | None,
) -> ConversionResult:
    """Merge the page artifacts of one group and write the group's outputs.

    The group counts as failed only when no page produced usable Markdown; in
    that case no Markdown or raw output is written, but the report still is.
    Metadata JSON is written only when enabled and the plan carries a path.
    """
    all_failed = not any(
        artifact.result.succeeded and artifact.markdown is not None for artifact in artifacts
    )
    warnings = _group_warnings(artifacts, all_failed=all_failed)
    engine = _first_engine(artifacts)
    engine_version = _first_engine_version(artifacts)
    failed_pages = tuple(
        artifact.source_page_number for artifact in artifacts if not artifact.result.succeeded
    )
    engine_options = _group_engine_options(
        artifacts,
        group_plan=group_plan,
        group_size=group_size or group_plan.page_count,
        gpu=gpu,
        mineru_profile=mineru_profile,
        gpu_inventory=gpu_inventory,
        strict_local=strict_local,
        failed_source_pages=failed_pages,
    )
    text_fidelity = _group_text_fidelity(artifacts)

    # Defaults used when every page failed.
    quality = QualityResult()
    assets: tuple[AssetRecord, ...] = ()
    markdown = ""
    plan.assets_dir.mkdir(parents=True, exist_ok=True)
    if not all_failed:
        assembled, assets, asset_warnings = _assemble_group_markdown_and_assets(plan, artifacts)
        prepared = _prepare_markdown_for_output(
            assembled,
            markdown_dir=plan.markdown_path.parent,
            asset_root=plan.assets_dir,
            math_checker=math_checker,
        )
        markdown, quality = prepared.markdown, prepared.quality
        warnings = warnings + asset_warnings + quality.warnings

    document = _build_document(
        source_pdf=original_source_pdf,
        markdown=markdown,
        assets=assets,
        warnings=warnings,
        raw_structured={"pages": [None] * group_plan.page_count},
        text_fidelity=text_fidelity,
    )
    metadata_data = build_metadata(
        document=document,
        source_sha256=original_source_sha256,
        created_at=_format_timestamp(clock()),
        engine=engine,
        engine_version=engine_version,
        engine_options=engine_options,
    )
    # Only the asset-link counters are handed to the report.
    report_quality = QualityResult(
        missing_asset_link_count=quality.missing_asset_link_count,
        invalid_asset_link_count=quality.invalid_asset_link_count,
    )
    reported_metadata_path = plan.metadata_path if metadata_enabled else None
    report_text = render_report(
        metadata_data,
        quality=report_quality,
        markdown_path=None if all_failed else plan.markdown_path,
        metadata_path=reported_metadata_path,
        report_path=plan.report_path,
    )
    if all_failed:
        final_status = "failed"
    else:
        final_status = determine_final_status(metadata_data, report_quality)

    plan.markdown_path.parent.mkdir(parents=True, exist_ok=True)
    if not all_failed:
        _write_text(plan.markdown_path, markdown)
        _copy_group_raw_outputs(plan.raw_dir, artifacts)
    if metadata_enabled and plan.metadata_path is not None:
        serialized = json.dumps(metadata_data, indent=2, ensure_ascii=False, sort_keys=True) + "\n"
        _write_text(plan.metadata_path, serialized)
    _write_text(plan.report_path, report_text)

    summary = metadata_data["summary"]
    return ConversionResult(
        source_pdf=original_source_pdf,
        markdown_path=plan.markdown_path,
        metadata_path=reported_metadata_path,
        report_path=plan.report_path,
        assets_dir=plan.assets_dir,
        raw_dir=plan.raw_dir,
        engine=engine,
        engine_version=engine_version,
        final_status=final_status,
        warning_count=int(summary["warning_count"]),
        warnings=warnings,
        pages_processed=int(summary["pages_processed"]),
        _report_metadata=metadata_data,
        _report_quality=report_quality,
    )
def _write_aggregate_group_reports(results: tuple[ConversionResult, ...]) -> None:
    """Write one combined report per report path shared by grouped results.

    Results without report metadata are ignored.  The combined report replaces
    whatever was written at that path for individual groups.
    """
    by_report: dict[Path, list[ConversionResult]] = {}
    for result in results:
        if result._report_metadata is not None:
            by_report.setdefault(result.report_path, []).append(result)

    for report_path, members in by_report.items():
        metadatas = tuple(
            member._report_metadata for member in members if member._report_metadata is not None
        )
        if not metadatas:
            continue
        member_results = tuple(members)
        combined = _aggregate_report_metadata(member_results, metadatas)
        combined["engine_options"]["output_folder"] = str(report_path.parent)
        rendered = render_report(
            combined,
            quality=_aggregate_report_quality(member_results),
            markdown_path=None,
            metadata_path=None,
            report_path=report_path,
        )
        _write_text(report_path, rendered)
def _aggregate_report_metadata(
    results: tuple[ConversionResult, ...],
    metadatas: tuple[dict[str, Any], ...],
) -> dict[str, Any]:
    """Combine the metadata of all groups of one PDF into a report-level dict.

    Identification fields come from the first group; counters, assets,
    warnings and text-fidelity records are combined over all groups.
    ``results`` and ``metadatas`` must have the same length.
    """
    head = metadatas[0]
    summary = _aggregate_summary(metadatas)
    parts = [
        _part_report_record(result, metadata)
        for result, metadata in zip(results, metadatas, strict=True)
    ]
    engine_options = _aggregate_engine_options(head.get("engine_options", {}), parts)
    warnings = _aggregate_warning_records(metadatas)
    text_fidelity = _aggregate_text_fidelity_records(metadatas)

    # One empty placeholder per processed page, and at least one.
    page_slots = max(1, _int_from_summary(summary, "pages_processed"))
    all_assets: list[object] = []
    for metadata in metadatas:
        all_assets.extend(_list_value(metadata.get("assets")))

    combined: dict[str, Any] = {
        "source_pdf": head.get("source_pdf", "unavailable"),
        "source_sha256": head.get("source_sha256", "unavailable"),
        "created_at": head.get("created_at", "unavailable"),
        "engine": head.get("engine", ENGINE_NAME),
        "engine_version": head.get("engine_version", "unknown"),
        "engine_options": engine_options,
        "pages": [{} for _ in range(page_slots)],
        "assets": all_assets,
        "warnings": warnings,
        "summary": {**summary, "warning_count": len(warnings)},
    }
    if text_fidelity:
        combined["text_fidelity"] = text_fidelity
    return combined
def _aggregate_summary(metadatas: tuple[dict[str, Any], ...]) -> dict[str, Any]:
    """Sum the known integer summary counters over all metadata dicts.

    The six core counters are always present in the result; the text-fidelity
    counters appear only when their total is non-zero.
    """
    always_reported = (
        "pages_processed",
        "warning_count",
        "asset_count",
        "display_formula_count",
        "inline_formula_count",
        "math_render_error_count",
    )
    only_when_nonzero = (
        "text_fidelity_checked_page_count",
        "text_fidelity_low_page_count",
        "text_fidelity_unexpected_cjk_count",
        "text_fidelity_replacement_candidate_page_count",
        "text_fidelity_page_mapping_uncertain_count",
    )
    summaries = [_dict_value(metadata.get("summary")) for metadata in metadatas]
    combined: dict[str, Any] = {}
    for key in always_reported + only_when_nonzero:
        total = sum(_int_from_summary(summary, key) for summary in summaries)
        if total or key in always_reported:
            combined[key] = total
    return combined
def _part_report_record(result: ConversionResult, metadata: dict[str, Any]) -> dict[str, Any]:
    """Describe one group (part) for the aggregate report.

    ``markdown_path`` is None when the Markdown file does not exist on disk.
    ``failed_source_pages`` is included only when the metadata lists it.
    """
    options = _dict_value(metadata.get("engine_options"))
    chunk = _dict_value(options.get("chunk"))
    page_conversion = _dict_value(options.get("page_conversion"))

    markdown_location = str(result.markdown_path) if result.markdown_path.exists() else None
    record: dict[str, Any] = {
        "index": _int_value(chunk.get("chunk_index")),
        "total": _int_value(chunk.get("total_chunks")),
        "source_page_start": _int_value(chunk.get("source_page_start")),
        "source_page_end": _int_value(chunk.get("source_page_end")),
        "markdown_path": markdown_location,
        "status": result.final_status,
        "warning_count": result.warning_count,
    }
    failed = page_conversion.get("failed_source_pages")
    if isinstance(failed, list):
        record["failed_source_pages"] = list(filter(lambda page: isinstance(page, int), failed))
    return record
def _aggregate_engine_options(first_options: object, parts: list[dict[str, Any]]) -> dict[str, Any]:
    """Build report-level engine options from the first group's options.

    The per-group ``chunk`` and ``page_conversion`` entries are dropped,
    ``parts`` is attached, and the sorted union of failed source pages is
    added when there is at least one.
    """
    # _dict_value copies, so the caller's dict is not modified.
    combined = _dict_value(first_options)
    for per_group_key in ("chunk", "page_conversion"):
        combined.pop(per_group_key, None)
    combined["parts"] = parts

    failed_pages: list[int] = []
    for part in parts:
        for page in _list_value(part.get("failed_source_pages")):
            if isinstance(page, int):
                failed_pages.append(page)
    failed_pages.sort()
    if failed_pages:
        combined["failed_source_pages"] = failed_pages
    return combined
def _aggregate_report_quality(results: tuple[ConversionResult, ...]) -> QualityResult:
    """Sum the report quality counters of all results.

    A result without a stored quality contributes a default ``QualityResult``.
    """
    qualities = [result._report_quality or QualityResult() for result in results]
    missing_links = sum(quality.missing_asset_link_count for quality in qualities)
    invalid_links = sum(quality.invalid_asset_link_count for quality in qualities)
    math_errors = sum(quality.math_render_error_count for quality in qualities)
    return QualityResult(
        missing_asset_link_count=missing_links,
        invalid_asset_link_count=invalid_links,
        math_render_error_count=math_errors,
    )
def _aggregate_warning_records(metadatas: tuple[dict[str, Any], ...]) -> list[dict[str, Any]]:
    """Concatenate the warning dicts of all groups, shifting page indices.

    Each integer ``page_index`` is increased by the group's source page offset.
    Entries are copied before modification; non-dict entries are dropped.
    """
    shifted: list[dict[str, Any]] = []
    for metadata in metadatas:
        offset = _source_page_offset(metadata)
        for entry in _list_value(metadata.get("warnings")):
            if isinstance(entry, dict):
                relocated = dict(entry)
                if isinstance(relocated.get("page_index"), int):
                    relocated["page_index"] = offset + relocated["page_index"]
                shifted.append(relocated)
    return shifted
def _aggregate_text_fidelity_records(metadatas: tuple[dict[str, Any], ...]) -> list[dict[str, Any]]:
    """Concatenate the text-fidelity dicts of all groups, shifting page indices.

    Each integer ``page_index`` is increased by the group's source page offset.
    Entries are copied before modification; non-dict entries are dropped.
    """
    shifted: list[dict[str, Any]] = []
    for metadata in metadatas:
        offset = _source_page_offset(metadata)
        for entry in _list_value(metadata.get("text_fidelity")):
            if isinstance(entry, dict):
                relocated = dict(entry)
                if isinstance(relocated.get("page_index"), int):
                    relocated["page_index"] = offset + relocated["page_index"]
                shifted.append(relocated)
    return shifted
def _source_page_offset(metadata: dict[str, Any]) -> int:
    """Return the zero-based page offset recorded in ``engine_options.chunk``.

    The offset is ``source_page_start - 1`` when that value is a positive
    integer; in every other case (missing keys, wrong types, non-positive
    value) it is ``0``.
    """
    engine_options = _dict_value(metadata.get("engine_options"))
    chunk = _dict_value(engine_options.get("chunk"))
    first_page = chunk.get("source_page_start")
    if not isinstance(first_page, int) or first_page <= 0:
        return 0
    return first_page - 1
def _dict_value(value: object) -> dict[str, Any]:
    """Return a shallow copy of ``value`` when it is a dict, otherwise an empty dict."""
    if isinstance(value, dict):
        return dict(value)
    return {}
def _list_value(value: object) -> list[object]:
    """Return a shallow copy of ``value`` when it is a list, otherwise an empty list."""
    if isinstance(value, list):
        return list(value)
    return []
def _int_from_summary(summary: dict[str, Any], key: str) -> int:
    """Return ``summary[key]`` when it is an ``int``; otherwise return ``0``."""
    candidate = summary.get(key)
    if isinstance(candidate, int):
        return candidate
    return 0
def _page_pdf_filename(page_plan: PdfChunkPlan) -> str:
    """Return the single-page PDF file name for ``page_plan``.

    The name is ``<source stem>.page-<start page>.pdf`` with the start page
    zero-padded to ``page_plan.page_number_width`` digits.
    """
    pad_width = page_plan.page_number_width
    source_stem = page_plan.source_pdf.stem
    first_page = page_plan.source_page_start
    return f"{source_stem}.page-{first_page:0{pad_width}d}.pdf"
def _temporary_page_output_plan(page_pdf: Path, page_root: Path, *, keep_raw: bool) -> PlannedOutput:
    """Plan the output paths for one page PDF under ``page_root / "outputs"``.

    All paths share the page PDF's stem. A raw directory is planned only when
    ``keep_raw`` is true; otherwise ``raw_dir`` is ``None``.
    """
    outputs = page_root / "outputs"
    base = page_pdf.stem
    raw_location = outputs / f"{base}.raw" if keep_raw else None
    return PlannedOutput(
        source_pdf=page_pdf,
        markdown_path=outputs / f"{base}.md",
        assets_dir=outputs / f"{base}.assets",
        metadata_path=outputs / f"{base}.metadata.json",
        report_path=outputs / f"{base}.report.md",
        raw_dir=raw_location,
    )
def _chunk_metadata(plan: PdfChunkPlan) -> dict[str, object]:
    """Describe a chunk plan as a plain dict suitable for engine options.

    The source PDF path is stored as a string; the remaining values are
    copied from the plan's attributes unchanged.
    """
    description: dict[str, object] = {
        "original_source_pdf": str(plan.source_pdf),
        "chunk_index": plan.chunk_index,
        "total_chunks": plan.total_chunks,
        "source_page_start": plan.source_page_start,
        "source_page_end": plan.source_page_end,
        "chunk_page_count": plan.page_count,
    }
    return description
def _group_engine_options(
    artifacts: tuple[_PageConversionArtifact, ...],
    *,
    group_plan: PdfChunkPlan,
    group_size: int,
    gpu: str | None,
    mineru_profile: str,
    gpu_inventory: tuple[GpuInfo, ...] | None,
    strict_local: bool,
    failed_source_pages: tuple[int, ...],
) -> dict[str, Any]:
    """Build the engine options recorded for a group of single-page conversions.

    The base options come from the first page artifact that carries
    ``engine_options`` in its metadata. When none does (or the options are
    empty) they are derived from the requested GPU/profile settings instead.
    Any per-page ``"chunk"`` and ``"page_conversion"`` entries are replaced
    with group-level values.
    """
    options = _first_page_engine_options(artifacts) or _mineru_options(
        gpu=gpu,
        mineru_profile=mineru_profile,
        gpu_inventory=gpu_inventory,
        strict_local=strict_local,
    ).to_engine_options()

    # Drop the per-page entries first so the group-level ones are appended last.
    for per_page_key in ("chunk", "page_conversion"):
        options.pop(per_page_key, None)

    options["chunk"] = _chunk_metadata(group_plan)
    options["page_conversion"] = {
        "mode": "single_page",
        "mineru_input_page_count": 1,
        "output_group_page_count": group_size,
        "failed_source_pages": list(failed_source_pages),
    }
    return options
def _first_page_engine_options(artifacts: tuple[_PageConversionArtifact, ...]) -> dict[str, Any]:
    """Return a copy of the first dict-valued ``engine_options`` found in artifact metadata.

    Artifacts without metadata, or whose ``engine_options`` is not a dict,
    are skipped. An empty dict is returned when nothing qualifies.
    """
    for artifact in artifacts:
        metadata = artifact.metadata
        if metadata is not None:
            candidate = metadata.get("engine_options")
            if isinstance(candidate, dict):
                return dict(candidate)
    return {}
def _mineru_options(
    *,
    gpu: str | None,
    mineru_profile: str,
    gpu_inventory: tuple[GpuInfo, ...] | None,
    strict_local: bool,
) -> MinerUOptions:
    """Resolve the GPU and MinerU profile settings into a ``MinerUOptions`` value.

    CUDA is reported to the profile resolver as requested only when the
    resolved device starts with ``"cuda:"`` and, in addition, either a
    concrete GPU was selected or the profile name differs from the default
    profile name.
    """
    gpu_device, selected_gpu = _resolve_gpu(gpu, gpu_inventory)
    wants_cuda = bool(gpu_device and gpu_device.startswith("cuda:"))
    non_default_profile = mineru_profile.strip().casefold() != DEFAULT_MINERU_PROFILE
    gpu_known_or_explicit = selected_gpu is not None or non_default_profile

    resolved = resolve_mineru_profile(
        mineru_profile,
        selected_gpu=selected_gpu,
        cuda_requested=wants_cuda and gpu_known_or_explicit,
    )
    return MinerUOptions(
        strict_local=strict_local,
        gpu_device=gpu_device,
        mineru_profile=mineru_profile,
        profile_environment=resolved.environment,
        profile_engine_options=resolved.to_engine_options(),
        profile_warnings=resolved.warnings,
    )
def _resolve_gpu(gpu: str | None, gpu_inventory: tuple[GpuInfo, ...] | None) -> tuple[str | None, GpuInfo | None]:
    """Resolve a GPU request into ``(cuda_device, selected_gpu)``.

    * No request (after normalisation) yields ``(None, None)``.
    * ``"auto"`` selects from ``gpu_inventory``, querying the NVIDIA
      inventory when none was supplied.
    * An explicit device without an inventory is returned as-is with no
      selected GPU; with an inventory it goes through ``select_gpu``.
    """
    requested = normalize_cuda_device(gpu)
    if requested is None:
        return None, None

    if gpu_inventory is not None:
        inventory = gpu_inventory
    elif requested.casefold() == "auto":
        inventory = query_nvidia_gpus()
    else:
        # Explicit device and nothing to validate it against.
        return requested, None

    chosen = select_gpu(inventory, requested)
    return chosen.cuda_device, chosen.gpu
def _first_engine(artifacts: tuple[_PageConversionArtifact, ...]) -> str:
    """Return the first non-empty engine name among the artifacts, else ``ENGINE_NAME``."""
    return next(
        (artifact.result.engine for artifact in artifacts if artifact.result.engine),
        ENGINE_NAME,
    )
def _first_engine_version(artifacts: tuple[_PageConversionArtifact, ...]) -> str:
    """Return the first non-empty engine version among the artifacts, else ``"unknown"``."""
    return next(
        (artifact.result.engine_version for artifact in artifacts if artifact.result.engine_version),
        "unknown",
    )
def _assemble_group_markdown_and_assets(
    plan: PlannedOutput,
    artifacts: tuple[_PageConversionArtifact, ...],
) -> tuple[str, tuple[AssetRecord, ...], tuple[WarningRecord, ...]]:
    """Join per-page Markdown into one document and copy page assets alongside it.

    Each page contributes a section that starts with an HTML comment naming
    its source page number. Pages that failed, or that have no Markdown, get
    a "conversion failed" comment instead. Sections are separated by a blank
    line and the result ends with exactly one newline.

    Returns the combined Markdown, the copied asset records, and any warnings
    raised while copying assets.
    """
    sections: list[str] = []
    group_assets: list[AssetRecord] = []
    group_warnings: list[WarningRecord] = []
    used_asset_names: set[str] = set()

    for artifact in artifacts:
        if not (artifact.result.succeeded and artifact.markdown is not None):
            sections.append(f"<!-- source-page: {artifact.source_page_number} conversion failed; see report -->")
            continue

        page_markdown, page_assets, page_warnings = _copy_page_assets_for_group(
            plan.assets_dir,
            artifact,
            used_asset_names,
        )
        group_assets.extend(page_assets)
        group_warnings.extend(page_warnings)

        body = page_markdown.strip()
        sections.append(
            f"<!-- source-page: {artifact.source_page_number} -->\n\n{body}"
            if body
            else f"<!-- source-page: {artifact.source_page_number} -->"
        )

    combined = "\n\n".join(sections).rstrip() + "\n"
    return combined, tuple(group_assets), tuple(group_warnings)
def _copy_page_assets_for_group(
    group_assets_dir: Path,
    artifact: _PageConversionArtifact,
    copied_asset_names: set[str],
) -> tuple[str, tuple[AssetRecord, ...], tuple[WarningRecord, ...]]:
    """Copy one page's assets into the group assets directory and relink its Markdown.

    Asset paths come from the page metadata and are resolved relative to the
    directory holding the page's Markdown file. A missing source file or a
    destination that would fall outside ``group_assets_dir`` produces a
    warning and the asset is skipped.

    Returns the page Markdown with image links rewritten to the copied files,
    the records of the copied assets, and the warnings collected. When the
    artifact has no Markdown or no metadata, the Markdown (or ``""``) is
    returned unchanged with no assets and no warnings.
    """
    page_markdown = artifact.markdown
    page_metadata = artifact.metadata
    if page_markdown is None or page_metadata is None:
        return page_markdown or "", (), ()

    rewritten_links: dict[str, str] = {}
    copied: list[AssetRecord] = []
    problems: list[WarningRecord] = []
    page_output_dir = artifact.result.markdown_path.parent

    for page_asset in _assets_from_metadata(page_metadata):
        source = page_output_dir / page_asset.relative_path
        if not source.is_file():
            problems.append(
                WarningRecord(
                    WarningCode.ASSET_LINK_MISSING,
                    WarningSeverity.WARNING,
                    f"Page asset could not be copied into grouped output: {page_asset.relative_path}",
                    page_index=artifact.group_page_index,
                )
            )
            continue

        target_relative = _group_asset_relative_path(page_asset.relative_path, artifact, copied_asset_names)
        target = group_assets_dir.joinpath(*target_relative.parts)
        try:
            # relative_to raises ValueError when the target escapes the assets directory.
            target.resolve(strict=False).relative_to(group_assets_dir.resolve(strict=False))
        except ValueError:
            problems.append(
                WarningRecord(
                    WarningCode.ASSET_LINK_INVALID,
                    WarningSeverity.WARNING,
                    f"Grouped asset destination would escape assets directory: {page_asset.relative_path}",
                    page_index=artifact.group_page_index,
                )
            )
            continue

        target.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(source, target)

        group_link = PurePosixPath(group_assets_dir.name, target_relative).as_posix()
        rewritten_links[page_asset.relative_path.replace("\\", "/")] = group_link
        copied.append(AssetRecord(group_link, page_index=artifact.group_page_index))

    return _rewrite_asset_links(page_markdown, rewritten_links), tuple(copied), tuple(problems)
def _group_asset_relative_path(
    relative_path: str,
    artifact: _PageConversionArtifact,
    copied_asset_names: set[str],
) -> PurePosixPath:
    """Choose a unique, flat file name for a page asset inside the group assets directory.

    The page's own assets-directory prefix is stripped, only the final path
    component is kept (``"asset"`` when nothing is left), and the name is
    prefixed with ``page-NNN_`` using the artifact's source page number.
    Uniqueness is enforced against, and recorded in, ``copied_asset_names``.
    """
    normalized = relative_path.replace("\\", "/")
    segments = PurePosixPath(normalized).parts
    if segments and segments[0] == artifact.result.assets_dir.name:
        segments = segments[1:]
    if not segments:
        segments = ("asset",)
    leaf = PurePosixPath(*segments).name or "asset"
    return _unique_asset_filename(f"page-{artifact.source_page_number:03d}_{leaf}", copied_asset_names)
def _group_warnings(
    artifacts: tuple[_PageConversionArtifact, ...],
    *,
    all_failed: bool,
) -> tuple[WarningRecord, ...]:
    """Collect the warnings of every page artifact, re-indexed for the group.

    Warnings of successful pages keep their code, severity and message and
    only have their page index adjusted. Warnings of failed pages are
    re-issued with the source page number prefixed to the message and with
    severity ``ERROR`` when every page failed, ``WARNING`` otherwise. A failed
    page without any warning gets a generic "MinerU failed" warning.
    """
    failure_severity = WarningSeverity.ERROR if all_failed else WarningSeverity.WARNING
    collected: list[WarningRecord] = []

    for artifact in artifacts:
        page_warnings = _artifact_warnings(artifact)

        if artifact.result.succeeded:
            collected.extend(
                _adjust_warning_for_group(page_warning, artifact.group_page_index)
                for page_warning in page_warnings
            )
            continue

        if not page_warnings:
            page_warnings = (
                WarningRecord(
                    WarningCode.MINERU_CLI_FAILED,
                    failure_severity,
                    f"MinerU failed for source page {artifact.source_page_number}.",
                ),
            )
        for page_warning in page_warnings:
            collected.append(
                WarningRecord(
                    page_warning.code,
                    failure_severity,
                    f"Source page {artifact.source_page_number}: {page_warning.message}",
                    page_index=artifact.group_page_index,
                    bbox=page_warning.bbox,
                )
            )

    return tuple(collected)
def _artifact_warnings(artifact: _PageConversionArtifact) -> tuple[WarningRecord, ...]:
    """Return the warnings recorded for a page artifact.

    Warnings parsed from the metadata's ``"warnings"`` list are preferred.
    The conversion result's own warnings are used when there is no metadata,
    the entry is not a list, or no entry could be parsed.
    """
    fallback = artifact.result.warnings
    metadata = artifact.metadata
    if metadata is None:
        return fallback

    raw_entries = metadata.get("warnings")
    if not isinstance(raw_entries, list):
        return fallback

    parsed: list[WarningRecord] = []
    for entry in raw_entries:
        if not isinstance(entry, dict):
            continue
        record = _warning_from_metadata(entry)
        if record is not None:
            parsed.append(record)

    return tuple(parsed) if parsed else fallback
def _adjust_warning_for_group(warning: WarningRecord, group_page_index: int) -> WarningRecord:
    """Return a copy of ``warning`` whose page index is relative to the group.

    A warning without a page index is assigned ``group_page_index``;
    otherwise its index is added to ``group_page_index``.
    """
    if warning.page_index is None:
        shifted_index = group_page_index
    else:
        shifted_index = group_page_index + warning.page_index
    return WarningRecord(
        warning.code,
        warning.severity,
        warning.message,
        page_index=shifted_index,
        bbox=warning.bbox,
    )
def _group_text_fidelity(artifacts: tuple[_PageConversionArtifact, ...]) -> tuple[TextFidelityRecord, ...]:
    """Rebuild text-fidelity records from every artifact's metadata.

    Artifacts without metadata, or whose ``"text_fidelity"`` entry is not a
    list, contribute nothing; non-dict list items are skipped.
    """
    collected: list[TextFidelityRecord] = []
    for artifact in artifacts:
        metadata = artifact.metadata
        if metadata is None:
            continue
        raw_entries = metadata.get("text_fidelity")
        if not isinstance(raw_entries, list):
            continue
        collected.extend(
            _text_fidelity_from_metadata(entry, group_page_index=artifact.group_page_index)
            for entry in raw_entries
            if isinstance(entry, dict)
        )
    return tuple(collected)
def _text_fidelity_from_metadata(item: dict[str, Any], *, group_page_index: int) -> TextFidelityRecord:
    """Rebuild a ``TextFidelityRecord`` from its metadata dict form.

    The record's page index is the item's ``"page_index"`` shifted by
    ``group_page_index``. ``source_page_number`` is kept only when it is an
    integer. The remaining fields are passed through the module's value
    coercion helpers, which are defined outside this section. A missing or
    falsy ``"comparison_status"`` becomes ``"unknown"``.
    """
    read = item.get
    raw_source_page = read("source_page_number")
    source_page = raw_source_page if isinstance(raw_source_page, int) else None
    return TextFidelityRecord(
        page_index=group_page_index + _int_value(read("page_index")),
        source_page_number=source_page,
        pypdf_text_available=_bool_value(read("pypdf_text_available")),
        markdown_text_available=_bool_value(read("markdown_text_available")),
        pypdf_hangul_count=_int_value(read("pypdf_hangul_count")),
        markdown_hangul_count=_int_value(read("markdown_hangul_count")),
        hangul_count_delta=_int_value(read("hangul_count_delta")),
        hangul_count_ratio=_optional_float_value(read("hangul_count_ratio")),
        unexpected_cjk_count=_int_value(read("unexpected_cjk_count")),
        pypdf_hangul_spacing_anomaly_ratio=_float_value(read("pypdf_hangul_spacing_anomaly_ratio")),
        markdown_hangul_spacing_anomaly_ratio=_float_value(read("markdown_hangul_spacing_anomaly_ratio")),
        text_similarity=_optional_float_value(read("text_similarity")),
        replacement_candidate=_bool_value(read("replacement_candidate")),
        comparison_status=str(read("comparison_status") or "unknown"),
    )
def _convert_plan(
    plan: PlannedOutput,
    *,
    adapter: ConversionAdapter,
    clock: Clock,
    metadata_enabled: bool,
    keep_raw: bool,
    overwrite: bool,
    gpu: str | None,
    mineru_profile: str,
    gpu_inventory: tuple[GpuInfo, ...] | None,
    strict_local: bool,
    math_checker: MathChecker | None,
    result_source_pdf: Path | None = None,
    metadata_source_pdf: Path | None = None,
    metadata_source_sha256: str | None = None,
    engine_options_extra: dict[str, object] | None = None,
    source_text_pages: tuple[str, ...] | None = None,
) -> ConversionResult:
    """Convert one planned PDF, choosing where the engine writes its raw output.

    With ``overwrite`` the planned outputs are removed first. The Markdown
    parent directory is always created. With ``keep_raw`` the engine works
    directly in ``plan.raw_dir`` so its output is kept; otherwise it works in
    a temporary directory next to the Markdown file that is deleted afterwards.

    Raises:
        ValueError: if ``keep_raw`` is set but the plan has no raw directory.
    """
    if overwrite:
        _clear_planned_outputs(plan)
    output_parent = plan.markdown_path.parent
    output_parent.mkdir(parents=True, exist_ok=True)

    options = _mineru_options(
        gpu=gpu,
        mineru_profile=mineru_profile,
        gpu_inventory=gpu_inventory,
        strict_local=strict_local,
    )

    def run_in(work_dir: Path) -> ConversionResult:
        # Both branches below differ only in the work directory.
        return _convert_in_work_dir(
            plan,
            work_dir,
            adapter,
            options,
            clock,
            metadata_enabled,
            math_checker,
            result_source_pdf=result_source_pdf,
            metadata_source_pdf=metadata_source_pdf,
            metadata_source_sha256=metadata_source_sha256,
            engine_options_extra=engine_options_extra,
            source_text_pages=source_text_pages,
        )

    if not keep_raw:
        with tempfile.TemporaryDirectory(prefix=f"{plan.source_pdf.stem}.", dir=plan.markdown_path.parent) as temporary_dir:
            return run_in(Path(temporary_dir))

    if plan.raw_dir is None:
        raise ValueError("raw output directory is required when keep_raw is enabled")
    plan.raw_dir.mkdir(parents=True, exist_ok=True)
    return run_in(plan.raw_dir)
def _convert_in_work_dir(
    plan: PlannedOutput,
    work_dir: Path,
    adapter: ConversionAdapter,
    options: MinerUOptions,
    clock: Clock,
    metadata_enabled: bool,
    math_checker: MathChecker | None,
    result_source_pdf: Path | None = None,
    metadata_source_pdf: Path | None = None,
    metadata_source_sha256: str | None = None,
    engine_options_extra: dict[str, object] | None = None,
    source_text_pages: tuple[str, ...] | None = None,
) -> ConversionResult:
    """Run the adapter in ``work_dir`` and write Markdown, metadata and report outputs.

    The adapter converts ``plan.source_pdf``. A failed result (report only, no
    Markdown) is returned when the adapter raises
    ``StrictLocalViolationError``, reports failure, or produces no Markdown.
    Otherwise the assets are copied into ``plan.assets_dir``, asset links are
    rewritten, the Markdown is normalised and quality-checked, text fidelity is
    checked, and the Markdown, optional metadata JSON and report are written.

    Args:
        plan: Output locations for this conversion.
        work_dir: Directory handed to the adapter for its raw output.
        adapter: Conversion engine wrapper.
        options: Engine options passed to the adapter.
        clock: Supplies the ``created_at`` timestamp.
        metadata_enabled: Whether the metadata JSON file is written.
        math_checker: Optional math checker forwarded to Markdown preparation.
        result_source_pdf: Source path reported on the result; defaults to
            ``plan.source_pdf``.
        metadata_source_pdf: Source path used for metadata and fidelity
            checks; defaults to the result source.
        metadata_source_sha256: Precomputed hash; when falsy the metadata
            source file is hashed here.
        engine_options_extra: Entries merged over the adapter's engine options.
        source_text_pages: Forwarded unchanged to the text fidelity check.
    """
    result_source = result_source_pdf or plan.source_pdf
    metadata_source = metadata_source_pdf or result_source
    try:
        adapter_result = adapter.convert(plan.source_pdf, work_dir, options)
    except StrictLocalViolationError as error:
        # The violation carries its own warning record; report it as a failure.
        return _failed_result(
            plan,
            warnings=(error.warning,),
            source_pdf=result_source,
            metadata_source_pdf=metadata_source,
            metadata_source_sha256=metadata_source_sha256,
            engine_options=options.to_engine_options(),
            clock=clock,
        )
    engine = adapter_result.engine or ENGINE_NAME
    engine_version = adapter_result.engine_version or "unknown"
    # Profile warnings not already reported by the adapter are placed first.
    adapter_warnings = _merge_option_warnings(options.profile_warnings, adapter_result.warnings)
    if not adapter_result.succeeded:
        return _failed_result(
            plan,
            warnings=adapter_warnings,
            engine=engine,
            engine_version=engine_version,
            source_pdf=result_source,
            metadata_source_pdf=metadata_source,
            metadata_source_sha256=metadata_source_sha256,
            engine_options=options.to_engine_options(),
            clock=clock,
        )
    if adapter_result.raw_markdown is None:
        # A run that succeeded but produced no Markdown is still treated as a failure.
        warning = WarningRecord(
            WarningCode.MINERU_CLI_FAILED,
            WarningSeverity.ERROR,
            "MinerU produced structured output but no Markdown; no fallback engine was used.",
        )
        return _failed_result(
            plan,
            warnings=adapter_warnings + (warning,),
            engine=engine,
            engine_version=engine_version,
            source_pdf=result_source,
            metadata_source_pdf=metadata_source,
            metadata_source_sha256=metadata_source_sha256,
            engine_options=options.to_engine_options(),
            clock=clock,
        )
    plan.assets_dir.mkdir(parents=True, exist_ok=True)
    # Copy adapter assets into the output assets directory, then point the
    # Markdown image links at the copied files.
    assets = _materialize_assets(adapter_result.asset_paths, work_dir, plan.assets_dir)
    markdown_source = _rewrite_asset_links(adapter_result.raw_markdown, assets.link_map)
    normalized = normalize_markdown(
        markdown_source,
        markdown_dir=plan.markdown_path.parent,
        asset_root=plan.assets_dir,
        check_assets=False,
    )
    # Asset-link checks and math checks/repairs happen in this step.
    prepared = _prepare_markdown_for_output(
        normalized.markdown,
        markdown_dir=plan.markdown_path.parent,
        asset_root=plan.assets_dir,
        math_checker=math_checker,
    )
    quality = prepared.quality
    # Copy before updating so the adapter result's own options are not mutated.
    engine_options = dict(adapter_result.engine_options)
    if engine_options_extra:
        engine_options.update(engine_options_extra)
    text_fidelity = _run_text_fidelity_checks(
        metadata_source,
        prepared.markdown,
        page_count=_page_count(adapter_result.raw_structured),
        engine_options=engine_options,
        source_text_pages=source_text_pages,
    )
    warnings = adapter_warnings + assets.warnings + normalized.warnings + quality.warnings + text_fidelity.warnings
    document = _build_document(
        source_pdf=metadata_source,
        markdown=prepared.markdown,
        assets=assets.records,
        warnings=warnings,
        raw_structured=adapter_result.raw_structured,
        text_fidelity=text_fidelity.pages,
    )
    metadata_data = build_metadata(
        document=document,
        source_sha256=metadata_source_sha256 or _sha256(metadata_source),
        created_at=_format_timestamp(clock()),
        engine=engine,
        engine_version=engine_version,
        engine_options=engine_options,
    )
    # Only the asset-link counters are forwarded to the report quality here;
    # the quality warnings themselves are already part of ``warnings`` above.
    report_quality = QualityResult(
        missing_asset_link_count=quality.missing_asset_link_count,
        invalid_asset_link_count=quality.invalid_asset_link_count,
    )
    report_text = render_report(
        metadata_data,
        quality=report_quality,
        markdown_path=plan.markdown_path,
        metadata_path=plan.metadata_path if metadata_enabled else None,
        report_path=plan.report_path,
    )
    final_status = determine_final_status(metadata_data, report_quality)
    # All output files are written only after every in-memory step has succeeded.
    _write_text(plan.markdown_path, prepared.markdown)
    if metadata_enabled and plan.metadata_path is not None:
        _write_text(plan.metadata_path, json.dumps(metadata_data, indent=2, ensure_ascii=False, sort_keys=True) + "\n")
    _write_text(plan.report_path, report_text)
    return ConversionResult(
        source_pdf=result_source,
        markdown_path=plan.markdown_path,
        metadata_path=plan.metadata_path if metadata_enabled else None,
        report_path=plan.report_path,
        assets_dir=plan.assets_dir,
        raw_dir=plan.raw_dir,
        engine=engine,
        engine_version=engine_version,
        final_status=final_status,
        warning_count=len(warnings),
        warnings=warnings,
        pages_processed=int(metadata_data["summary"]["pages_processed"]),
        _report_metadata=metadata_data,
        _report_quality=report_quality,
    )
def _materialize_assets(asset_paths: tuple[Path, ...], work_dir: Path, assets_dir: Path) -> _AssetMaterialization:
    """Copy adapter-produced asset files into ``assets_dir`` under unique names.

    An asset is skipped with a warning when the file does not exist, lies
    outside ``work_dir``, or would be written outside ``assets_dir``. For each
    copied file an ``AssetRecord`` is created and the link map gains every
    spelling under which the Markdown may reference the original file.
    """
    copied_records: list[AssetRecord] = []
    problems: list[WarningRecord] = []
    links: dict[str, str] = {}
    taken_names: set[str] = set()
    resolved_work_dir = work_dir.resolve()

    for raw_source in asset_paths:
        asset_file = Path(raw_source)
        if not (asset_file.exists() and asset_file.is_file()):
            problems.append(_warning(WarningCode.ASSET_LINK_MISSING, f"Adapter asset file does not exist: {asset_file}"))
            continue

        try:
            relative_in_work = asset_file.resolve().relative_to(resolved_work_dir)
        except ValueError:
            problems.append(_warning(WarningCode.ASSET_LINK_INVALID, f"Adapter asset path is outside the work directory: {asset_file}"))
            continue

        # The 1-based index is only used when the file has no usable name.
        target_name = _unique_asset_filename(_asset_filename(asset_file, len(taken_names) + 1), taken_names)
        target = assets_dir / target_name
        try:
            target.resolve(strict=False).relative_to(assets_dir.resolve(strict=False))
        except ValueError:
            problems.append(_warning(WarningCode.ASSET_LINK_INVALID, f"Adapter asset destination is outside the assets directory: {asset_file}"))
            continue

        target.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(asset_file, target)

        link = PurePosixPath(assets_dir.name, target_name).as_posix()
        copied_records.append(AssetRecord(link))
        _add_asset_link_keys(links, asset_file, relative_in_work, target_name, link)

    return _AssetMaterialization(records=tuple(copied_records), warnings=tuple(problems), link_map=links)
def _asset_filename(source_path: Path, index: int) -> str:
    """Return the file name to use for a copied asset.

    The source's own name (stripped of surrounding whitespace) is used when
    it is non-empty and not ``"."`` or ``".."``. Otherwise a generated
    ``asset-NNN`` name is returned, keeping the source suffix if it has one.
    """
    stripped = source_path.name.strip()
    if stripped in {"", ".", ".."}:
        extension = source_path.suffix or ""
        return f"asset-{index:03d}{extension}"
    return stripped
def _unique_asset_filename(filename: str, used_names: set[str]) -> PurePosixPath:
    """Return a flat file name that does not collide, case-insensitively, with ``used_names``.

    Only the final path component of ``filename`` is kept (backslashes count
    as separators); an empty, ``"."`` or ``".."`` component becomes
    ``"asset"``. On a collision a ``-NNN`` counter starting at 002 is inserted
    before the suffix. The case-folded result is added to ``used_names``.
    """
    base_name = PurePosixPath(filename.replace("\\", "/")).name
    if base_name in {"", ".", ".."}:
        base_name = "asset"

    parsed = PurePosixPath(base_name)
    stem, suffix = parsed.stem or "asset", parsed.suffix

    candidate = f"{stem}{suffix}"
    counter = 2
    while candidate.casefold() in used_names:
        candidate = f"{stem}-{counter:03d}{suffix}"
        counter += 1

    used_names.add(candidate.casefold())
    return PurePosixPath(candidate)
def _add_asset_link_keys(
    link_map: dict[str, str],
    source_path: Path,
    source_relative: Path,
    destination_relative: PurePosixPath,
    final_link: str,
) -> None:
    """Register every spelling of an asset path that should resolve to ``final_link``.

    The spellings are the work-dir-relative source path, the destination
    name, the bare source file name, the source path as given (native and
    POSIX form), and the asset/image-rooted suffixes of both relative paths.
    Keys are stored with forward slashes only.
    """
    spellings = {
        source_relative.as_posix(),
        destination_relative.as_posix(),
        source_path.name,
        str(source_path),
        source_path.as_posix(),
    }
    spellings |= _asset_link_suffixes(source_relative)
    spellings |= _asset_link_suffixes(destination_relative)
    link_map.update((spelling.replace("\\", "/"), final_link) for spelling in spellings)
def _asset_link_suffixes(path: Path | PurePosixPath) -> set[str]:
parts = PurePosixPath(path.as_posix()).parts
suffixes: set[str] = set()
for index, part in enumerate(parts):
if part.casefold() in {"asset", "assets", "image", "images"} and index + 1 < len(parts):
suffixes.add(PurePosixPath(*parts[index:]).as_posix())
return suffixes
def _rewrite_asset_links(markdown: str, link_map: dict[str, str]) -> str:
    """Replace Markdown image targets according to ``link_map``.

    Each image link's target is stripped, unwrapped from ``<...>`` if
    present, and normalised to forward slashes before the lookup. Links whose
    target is not in ``link_map`` are left exactly as written. An empty map
    returns ``markdown`` unchanged.
    """
    if not link_map:
        return markdown

    def relink(match: re.Match[str]) -> str:
        alt = match.group("alt")
        raw_target = match.group("target").strip()
        lookup_key = _unwrap_angle_target(raw_target).replace("\\", "/")
        new_target = link_map.get(lookup_key)
        if new_target is None:
            return match.group(0)
        return f"![{alt}]({new_target})"

    return _IMAGE_LINK_RE.sub(relink, markdown)
def _build_document(
    *,
    source_pdf: Path,
    markdown: str,
    assets: tuple[AssetRecord, ...],
    warnings: tuple[WarningRecord, ...],
    raw_structured: object | None,
    text_fidelity: tuple = (),
) -> DocumentRecord:
    """Assemble the ``DocumentRecord`` describing a converted document.

    One ``PageRecord`` is created per page counted in ``raw_structured``.
    Every block detected in the Markdown is attached to page 0; all other
    pages get an empty block tuple.
    """
    total_pages = _page_count(raw_structured)
    first_page_blocks = _formula_blocks(markdown)
    page_records = tuple(
        PageRecord(page_index=index, blocks=first_page_blocks if index == 0 else ())
        for index in range(total_pages)
    )
    return DocumentRecord(
        source_pdf=source_pdf,
        pages=page_records,
        assets=assets,
        warnings=warnings,
        text_fidelity=text_fidelity,
    )
def _run_text_fidelity_checks(
    source_pdf: Path,
    markdown: str,
    *,
    page_count: int,
    engine_options: dict[str, Any],
    source_text_pages: tuple[str, ...] | None = None,
) -> TextFidelityResult:
    """Run the text fidelity check for ``markdown`` against ``source_pdf``.

    This only forwards its arguments, unchanged, to ``check_text_fidelity``.
    """
    fidelity = check_text_fidelity(
        source_pdf,
        markdown,
        page_count=page_count,
        engine_options=engine_options,
        source_text_pages=source_text_pages,
    )
    return fidelity
def _merge_option_warnings(
    option_warnings: tuple[WarningRecord, ...],
    adapter_warnings: tuple[WarningRecord, ...],
) -> tuple[WarningRecord, ...]:
    """Return the adapter warnings preceded by option warnings they do not already contain."""
    not_yet_reported = [warning for warning in option_warnings if warning not in adapter_warnings]
    return tuple(not_yet_reported) + adapter_warnings
def _run_quality_checks(
    markdown: str,
    *,
    markdown_dir: Path,
    asset_root: Path,
    math_checker: MathChecker | None,
) -> QualityResult:
    """Check asset links and, when the Markdown contains math, math renderability.

    Without math only the asset-link result is returned. With math, the
    default math checker is created when none was supplied and both results
    are merged.
    """
    asset_quality = check_asset_links(markdown, markdown_dir=markdown_dir, asset_root=asset_root)
    if not _has_math(markdown):
        return asset_quality

    checker = create_default_math_checker() if math_checker is None else math_checker
    math_quality = check_math_renderability_details(markdown, checker).quality
    return merge_quality_results(asset_quality, math_quality)
def _prepare_markdown_for_output(
    markdown: str,
    *,
    markdown_dir: Path,
    asset_root: Path,
    math_checker: MathChecker | None,
) -> _PreparedMarkdown:
    """Quality-check the Markdown and repair math that fails to render.

    Without math only asset links are checked. With math, render failures
    are handed to the repair step. When repairs were applied, the repaired
    Markdown is re-checked and returned together with the repair warnings;
    otherwise the original Markdown and the initial quality are returned.
    """
    asset_quality = check_asset_links(markdown, markdown_dir=markdown_dir, asset_root=asset_root)
    if not _has_math(markdown):
        return _PreparedMarkdown(markdown=markdown, quality=asset_quality)

    checker = create_default_math_checker() if math_checker is None else math_checker
    details = check_math_renderability_details(markdown, checker)
    unrepaired = _PreparedMarkdown(
        markdown=markdown,
        quality=merge_quality_results(asset_quality, details.quality),
    )

    # Nothing to repair, or no checker available to validate a repair.
    if checker is None or not details.failures:
        return unrepaired

    repair = repair_math_render_failures(markdown, details.failures, checker)
    if not repair.repairs:
        return unrepaired

    rechecked_quality = _run_quality_checks(
        repair.markdown,
        markdown_dir=markdown_dir,
        asset_root=asset_root,
        math_checker=checker,
    )
    repair_notes = QualityResult(warnings=repair.warnings)
    return _PreparedMarkdown(
        markdown=repair.markdown,
        quality=merge_quality_results(rechecked_quality, repair_notes),
    )
def _has_math(markdown: str) -> bool:
    """Return whether the Markdown matches the display-math or inline-math pattern."""
    if _DISPLAY_MATH_RE.search(markdown) is not None:
        return True
    return _INLINE_MATH_RE.search(markdown) is not None
def _formula_blocks(markdown: str) -> tuple[BlockRecord, ...]:
    """Locate formula blocks in the Markdown; all are attributed to page 0.

    Display formulas are recorded first. Inline formulas are then searched
    only in the text between display formulas, and a match whose stripped
    body is empty or starts with a digit is ignored. When no formula is
    found, a single paragraph block is returned instead.
    """
    display_spans = [match.span() for match in _DISPLAY_MATH_RE.finditer(markdown)]
    blocks = [
        BlockRecord(BlockType.DISPLAY_FORMULA, page_index=0, markdown_span=span)
        for span in display_spans
    ]

    # Split the text into the stretches outside display formulas, remembering
    # where each stretch starts so inline spans can be reported in absolute terms.
    gaps: list[tuple[int, str]] = []
    cursor = 0
    for span_start, span_end in display_spans:
        gaps.append((cursor, markdown[cursor:span_start]))
        cursor = span_end
    gaps.append((cursor, markdown[cursor:]))

    for gap_offset, gap_text in gaps:
        for match in _INLINE_MATH_RE.finditer(gap_text):
            body = match.group("body").strip()
            if not body or body[0].isdigit():
                continue
            absolute_span = (gap_offset + match.start(), gap_offset + match.end())
            blocks.append(BlockRecord(BlockType.INLINE_FORMULA, page_index=0, markdown_span=absolute_span))

    if blocks:
        return tuple(blocks)
    return (BlockRecord(BlockType.PARAGRAPH, page_index=0),)
def _page_count(raw_structured: object | None) -> int:
if isinstance(raw_structured, dict):
pages = raw_structured.get("pages")
if isinstance(pages, list):
return max(1, len(pages))
if isinstance(pages, int):
return max(1, pages)
if isinstance(pages, dict):
return max(1, len(pages))
pdf_info = raw_structured.get("pdf_info")
if isinstance(pdf_info, list):
return max(1, len(pdf_info))
page_info = raw_structured.get("page_info")
if isinstance(page_info, list):
return max(1, len(page_info))
page_indexes = tuple(_page_indexes(raw_structured))
if page_indexes:
return max(1, max(page_indexes) + 1)
return 1
def _page_indexes(value: object) -> tuple[int, ...]:
    """Collect every non-negative integer ``page_idx``/``page_index`` in a nested structure.

    Dicts contribute their own two keys first and are then searched through
    their values; lists are searched element by element. Any other value
    contributes nothing.
    """
    found: list[int] = []
    if isinstance(value, dict):
        found.extend(
            candidate
            for candidate in (value.get("page_idx"), value.get("page_index"))
            if isinstance(candidate, int) and candidate >= 0
        )
        children = value.values()
    elif isinstance(value, list):
        children = value
    else:
        children = ()

    for child in children:
        found.extend(_page_indexes(child))
    return tuple(found)
def _failed_result(
    plan: PlannedOutput,
    *,
    warnings: tuple[WarningRecord, ...],
    engine: str = ENGINE_NAME,
    engine_version: str = "unknown",
    source_pdf: Path | None = None,
    metadata_source_pdf: Path | None = None,
    metadata_source_sha256: str | None = None,
    engine_options: dict[str, Any] | None = None,
    clock: Clock | None = None,
) -> ConversionResult:
    """Build the ``ConversionResult`` for a conversion that produced no Markdown.

    When a ``clock`` is given, metadata for a single empty page is built and
    a report is written to ``plan.report_path``; without a clock nothing is
    written. The result always has final status ``"failed"`` and no metadata
    path.
    """
    reported_source = source_pdf or plan.source_pdf
    hashed_source = metadata_source_pdf or reported_source
    empty_quality = QualityResult()
    failure_metadata: dict[str, Any] | None = None

    if clock is not None:
        placeholder_document = DocumentRecord(
            source_pdf=hashed_source,
            pages=(PageRecord(page_index=0),),
            assets=(),
            warnings=warnings,
        )
        failure_metadata = build_metadata(
            document=placeholder_document,
            source_sha256=metadata_source_sha256 or _sha256(hashed_source),
            created_at=_format_timestamp(clock()),
            engine=engine,
            engine_version=engine_version,
            engine_options=engine_options or {},
        )
        failure_report = render_report(
            failure_metadata,
            quality=empty_quality,
            markdown_path=None,
            metadata_path=None,
            report_path=plan.report_path,
        )
        _write_text(plan.report_path, failure_report)

    if failure_metadata is None:
        pages_processed = 0
    else:
        pages_processed = int(failure_metadata["summary"]["pages_processed"])

    return ConversionResult(
        source_pdf=reported_source,
        markdown_path=plan.markdown_path,
        metadata_path=None,
        report_path=plan.report_path,
        assets_dir=plan.assets_dir,
        raw_dir=plan.raw_dir,
        engine=engine,
        engine_version=engine_version,
        final_status="failed",
        warning_count=len(warnings),
        warnings=warnings,
        pages_processed=pages_processed,
        _report_metadata=failure_metadata,
        _report_quality=empty_quality,
    )
def _clear_planned_outputs(plan: PlannedOutput) -> None:
    """Delete every path returned by ``plan.planned_paths()`` that is present.

    Directories are removed recursively and files are unlinked. Symbolic
    links are unlinked rather than followed: ``shutil.rmtree`` refuses to
    operate on a symlink to a directory, and a dangling symlink reports
    ``exists() == False`` and would otherwise be left in place of the output.
    """
    for path in plan.planned_paths():
        if path.is_symlink():
            # Remove the link itself; never touch whatever it points at.
            path.unlink()
        elif path.is_dir():
            shutil.rmtree(path)
        elif path.exists():
            path.unlink()
def _clear_task_outputs(tasks: tuple[_ConversionTask, ...]) -> None:
    """Delete every unique output path of ``tasks`` that is present.

    Directories are removed recursively and files are unlinked. Symbolic
    links are unlinked rather than followed: ``shutil.rmtree`` refuses to
    operate on a symlink to a directory, and a dangling symlink reports
    ``exists() == False`` and would otherwise be left in place of the output.
    """
    for path in _unique_task_output_paths(tasks):
        if path.is_symlink():
            # Remove the link itself; never touch whatever it points at.
            path.unlink()
        elif path.is_dir():
            shutil.rmtree(path)
        elif path.exists():
            path.unlink()
def _copy_group_raw_outputs(raw_dir: Path | None, artifacts: tuple[_PageConversionArtifact, ...]) -> None:
    """Copy each page's raw output directory into ``raw_dir / "page-NNN"``.

    Does nothing when ``raw_dir`` is ``None``. Pages without an existing raw
    directory are skipped, and an existing destination is replaced.
    """
    if raw_dir is None:
        return

    for artifact in artifacts:
        page_raw_dir = artifact.result.raw_dir
        if page_raw_dir is not None and page_raw_dir.exists():
            target = raw_dir / f"page-{artifact.source_page_number:03d}"
            if target.exists():
                shutil.rmtree(target)
            target.parent.mkdir(parents=True, exist_ok=True)
            shutil.copytree(page_raw_dir, target)
def _write_text(path: Path, text: str) -> None:
    """Write ``text`` to ``path`` as UTF-8, creating missing parent directories."""
    parent_dir = path.parent
    parent_dir.mkdir(parents=True, exist_ok=True)
    path.write_text(text, encoding="utf-8")
def _sha256(path: Path) -> str:
    """Return the hexadecimal SHA-256 digest of the file at ``path``.

    The file is read in 1 MiB chunks so it is never held in memory whole.
    """
    chunk_size = 1024 * 1024
    hasher = hashlib.sha256()
    with path.open("rb") as stream:
        while True:
            chunk = stream.read(chunk_size)
            if chunk == b"":
                break
            hasher.update(chunk)
    return hasher.hexdigest()
def _path_key(path: Path) -> str:
    """Return a comparison key for ``path``: resolved, normalised, and case-normalised."""
    resolved = path.resolve(strict=False)
    normalized = os.path.normpath(str(resolved))
    return os.path.normcase(normalized)
def _format_timestamp(value: datetime) -> str:
    """Format ``value`` as an ISO 8601 UTC timestamp ending in ``Z``.

    A naive datetime is taken to be UTC already; an aware one is converted.
    """
    aware = value if value.tzinfo is not None else value.replace(tzinfo=timezone.utc)
    iso_text = aware.astimezone(timezone.utc).isoformat()
    return iso_text.replace("+00:00", "Z")
def _utc_now() -> datetime:
    """Return the current time as a timezone-aware UTC ``datetime``."""
    return datetime.now(tz=timezone.utc)
def _unwrap_angle_target(target: str) -> str:
    """Strip one pair of enclosing ``<`` ``>`` from a link target, trimming the inside.

    A target that is not wrapped on both sides is returned unchanged.
    """
    is_wrapped = target.startswith("<") and target.endswith(">")
    return target[1:-1].strip() if is_wrapped else target
def _warning(code: WarningCode, message: str) -> WarningRecord:
    """Create a ``WarningRecord`` of severity ``WARNING`` for ``code`` and ``message``."""
    severity = WarningSeverity.WARNING
    return WarningRecord(code, severity, message)
def _raise_if_strict_local_disabled(strict_local: bool) -> None:
    """Reject any attempt to run with strict-local mode turned off.

    Raises:
        StrictLocalViolationError: if ``strict_local`` is falsy.
    """
    if strict_local:
        return
    raise StrictLocalViolationError("strict-local execution cannot be disabled in v1.")