feat: mitigate MathJax formula warnings

This commit is contained in:
NINI
2026-05-11 02:08:46 +09:00
parent 005f17bac1
commit 71e6fbcc51
12 changed files with 625 additions and 41 deletions
+52 -6
View File
@@ -25,6 +25,7 @@ from pdf2md.ir import (
)
from pdf2md.markdown import normalize_markdown
from pdf2md.math_render import create_default_math_checker
from pdf2md.math_repair import repair_math_render_failures
from pdf2md.metadata import build_metadata
from pdf2md.mineru_adapter import (
ENGINE_NAME,
@@ -35,7 +36,7 @@ from pdf2md.mineru_adapter import (
)
from pdf2md.paths import DiscoveredPdf, PathLike, PlannedOutput, discover_pdfs, plan_outputs
from pdf2md.pdf_splitter import PdfChunkPlan, plan_pdf_chunks, write_pdf_chunk
from pdf2md.quality import MathChecker, QualityResult, check_asset_links, check_math_renderability, merge_quality_results
from pdf2md.quality import MathChecker, QualityResult, check_asset_links, check_math_renderability_details, merge_quality_results
from pdf2md.report import FinalStatus, determine_final_status, render_report
@@ -101,12 +102,19 @@ class _ConversionTask:
original_source_sha256: str | None = None
@dataclass(frozen=True)
class _PreparedMarkdown:
    """Immutable pairing of markdown text with the quality result computed for it."""

    # Markdown text to hand to the caller (callers in this module write it to disk).
    markdown: str
    # Quality-check outcome that accompanies ``markdown``.
    quality: QualityResult
# Markdown image link ``![alt](target)``: ``alt`` may be empty, ``target`` may not,
# and neither part may contain a newline.
_IMAGE_LINK_RE = re.compile(r"!\[(?P<alt>[^\]\n]*)\]\((?P<target>[^)\n]+)\)")
# Display math ``$$ ... $$`` whose delimiters are not preceded by a backslash;
# DOTALL lets the (non-greedy) body span multiple lines.
_DISPLAY_MATH_RE = re.compile(r"(?<!\\)\$\$(?P<body>.*?)(?<!\\)\$\$", re.DOTALL)
# Inline math ``$ ... $`` on a single line: delimiters are not preceded by a
# backslash and the (non-greedy, non-empty) body contains no ``$`` or newline.
_INLINE_MATH_RE = re.compile(r"(?<!\\)\$(?P<body>[^\n$]+?)(?<!\\)\$")
_RECHECKED_WARNING_CODES = frozenset(
{
WarningCode.MATH_RENDER_FAILED,
WarningCode.MATH_RENDER_REPAIRED,
WarningCode.ASSET_LINK_MISSING,
WarningCode.ASSET_LINK_INVALID,
}
@@ -240,12 +248,14 @@ def recheck_markdown(
markdown = markdown_file.read_text(encoding="utf-8")
assets_dir = markdown_file.with_suffix(".assets")
assets = _assets_from_metadata(existing_metadata)
quality = _run_quality_checks(
prepared = _prepare_markdown_for_output(
markdown,
markdown_dir=markdown_file.parent,
asset_root=assets_dir,
math_checker=math_checker,
)
markdown = prepared.markdown
quality = prepared.quality
warnings = _preserved_metadata_warnings(existing_metadata) + quality.warnings
document = _build_document(
source_pdf=Path(_metadata_text(existing_metadata, "source_pdf")),
@@ -276,6 +286,7 @@ def recheck_markdown(
)
final_status = determine_final_status(metadata_data, report_quality)
_write_text(markdown_file, markdown)
_write_text(metadata_path, json.dumps(metadata_data, indent=2, ensure_ascii=False, sort_keys=True) + "\n")
_write_text(report_path, report_text)
@@ -641,16 +652,17 @@ def _convert_in_work_dir(
asset_root=plan.assets_dir,
check_assets=False,
)
quality = _run_quality_checks(
prepared = _prepare_markdown_for_output(
normalized.markdown,
markdown_dir=plan.markdown_path.parent,
asset_root=plan.assets_dir,
math_checker=math_checker,
)
quality = prepared.quality
warnings = adapter_result.warnings + assets.warnings + normalized.warnings + quality.warnings
document = _build_document(
source_pdf=metadata_source,
markdown=normalized.markdown,
markdown=prepared.markdown,
assets=assets.records,
warnings=warnings,
raw_structured=adapter_result.raw_structured,
@@ -679,7 +691,7 @@ def _convert_in_work_dir(
)
final_status = determine_final_status(metadata_data, report_quality)
_write_text(plan.markdown_path, normalized.markdown)
_write_text(plan.markdown_path, prepared.markdown)
if metadata_enabled and plan.metadata_path is not None:
_write_text(plan.metadata_path, json.dumps(metadata_data, indent=2, ensure_ascii=False, sort_keys=True) + "\n")
_write_text(plan.report_path, report_text)
@@ -824,10 +836,44 @@ def _run_quality_checks(
return asset_quality
if math_checker is None:
math_checker = create_default_math_checker()
math_quality = check_math_renderability(markdown, math_checker)
math_quality = check_math_renderability_details(markdown, math_checker).quality
return merge_quality_results(asset_quality, math_quality)
def _prepare_markdown_for_output(
    markdown: str,
    *,
    markdown_dir: Path,
    asset_root: Path,
    math_checker: MathChecker | None,
) -> _PreparedMarkdown:
    """Quality-check *markdown* and attempt to repair math that fails to render.

    Asset links are always checked. Math is checked only when the text contains
    a math span. If the math check reports failures, a repair pass is attempted;
    when it applies at least one repair, the repaired text is re-checked and
    returned together with the repair warnings.

    Args:
        markdown: Markdown text to check.
        markdown_dir: Directory passed to the asset-link check as ``markdown_dir``.
        asset_root: Directory passed to the asset-link check as ``asset_root``.
        math_checker: Checker to use; when ``None`` the default checker is created.

    Returns:
        The original markdown with its quality result when there is no math,
        no usable checker, no render failures, or no repair was applied.
        Otherwise the repaired markdown with the re-checked quality merged with
        the repair warnings.
    """
    link_quality = check_asset_links(
        markdown,
        markdown_dir=markdown_dir,
        asset_root=asset_root,
    )
    if not _has_math(markdown):
        # Nothing to render-check: only the asset-link findings apply.
        return _PreparedMarkdown(markdown=markdown, quality=link_quality)

    active_checker = create_default_math_checker() if math_checker is None else math_checker
    details = check_math_renderability_details(markdown, active_checker)
    unrepaired_quality = merge_quality_results(link_quality, details.quality)

    repair_possible = active_checker is not None and bool(details.failures)
    if not repair_possible:
        return _PreparedMarkdown(markdown=markdown, quality=unrepaired_quality)

    outcome = repair_math_render_failures(markdown, details.failures, active_checker)
    if not outcome.repairs:
        # The repair pass changed nothing, so keep the original text and findings.
        return _PreparedMarkdown(markdown=markdown, quality=unrepaired_quality)

    # The text changed, so the full quality checks are run again on the repaired
    # markdown before the repair warnings are appended.
    rechecked_quality = _run_quality_checks(
        outcome.markdown,
        markdown_dir=markdown_dir,
        asset_root=asset_root,
        math_checker=active_checker,
    )
    repair_warnings = QualityResult(warnings=outcome.warnings)
    combined_quality = merge_quality_results(rechecked_quality, repair_warnings)
    return _PreparedMarkdown(markdown=outcome.markdown, quality=combined_quality)
def _has_math(markdown: str) -> bool:
    """Return True when *markdown* contains a display or inline math span."""
    if _DISPLAY_MATH_RE.search(markdown) is not None:
        return True
    return _INLINE_MATH_RE.search(markdown) is not None