feat: mitigate MathJax formula warnings
This commit is contained in:
@@ -25,6 +25,7 @@ from pdf2md.ir import (
|
||||
)
|
||||
from pdf2md.markdown import normalize_markdown
|
||||
from pdf2md.math_render import create_default_math_checker
|
||||
from pdf2md.math_repair import repair_math_render_failures
|
||||
from pdf2md.metadata import build_metadata
|
||||
from pdf2md.mineru_adapter import (
|
||||
ENGINE_NAME,
|
||||
@@ -35,7 +36,7 @@ from pdf2md.mineru_adapter import (
|
||||
)
|
||||
from pdf2md.paths import DiscoveredPdf, PathLike, PlannedOutput, discover_pdfs, plan_outputs
|
||||
from pdf2md.pdf_splitter import PdfChunkPlan, plan_pdf_chunks, write_pdf_chunk
|
||||
from pdf2md.quality import MathChecker, QualityResult, check_asset_links, check_math_renderability, merge_quality_results
|
||||
from pdf2md.quality import MathChecker, QualityResult, check_asset_links, check_math_renderability_details, merge_quality_results
|
||||
from pdf2md.report import FinalStatus, determine_final_status, render_report
|
||||
|
||||
|
||||
@@ -101,12 +102,19 @@ class _ConversionTask:
|
||||
original_source_sha256: str | None = None
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class _PreparedMarkdown:
    """Immutable pairing of Markdown text with the quality result computed for it."""

    # Markdown text that should be written to the output file.
    markdown: str
    # Quality-check outcome (warnings) associated with ``markdown``.
    quality: QualityResult
|
||||
|
||||
|
||||
# Markdown image link ``![alt](target)``: ``alt`` may be empty and contains no
# ``]`` or newline; ``target`` is non-empty and contains no ``)`` or newline.
_IMAGE_LINK_RE = re.compile(r"!\[(?P<alt>[^\]\n]*)\]\((?P<target>[^)\n]+)\)")
|
||||
# Display math ``$$ ... $$``: both delimiters must not be preceded by a
# backslash; the body is matched lazily and may span lines (DOTALL).
_DISPLAY_MATH_RE = re.compile(r"(?<!\\)\$\$(?P<body>.*?)(?<!\\)\$\$", re.DOTALL)
|
||||
# Inline math ``$ ... $``: both delimiters must not be preceded by a backslash;
# the body is non-empty, lazy, and contains neither a newline nor ``$``.
_INLINE_MATH_RE = re.compile(r"(?<!\\)\$(?P<body>[^\n$]+?)(?<!\\)\$")
|
||||
_RECHECKED_WARNING_CODES = frozenset(
|
||||
{
|
||||
WarningCode.MATH_RENDER_FAILED,
|
||||
WarningCode.MATH_RENDER_REPAIRED,
|
||||
WarningCode.ASSET_LINK_MISSING,
|
||||
WarningCode.ASSET_LINK_INVALID,
|
||||
}
|
||||
@@ -240,12 +248,14 @@ def recheck_markdown(
|
||||
markdown = markdown_file.read_text(encoding="utf-8")
|
||||
assets_dir = markdown_file.with_suffix(".assets")
|
||||
assets = _assets_from_metadata(existing_metadata)
|
||||
quality = _run_quality_checks(
|
||||
prepared = _prepare_markdown_for_output(
|
||||
markdown,
|
||||
markdown_dir=markdown_file.parent,
|
||||
asset_root=assets_dir,
|
||||
math_checker=math_checker,
|
||||
)
|
||||
markdown = prepared.markdown
|
||||
quality = prepared.quality
|
||||
warnings = _preserved_metadata_warnings(existing_metadata) + quality.warnings
|
||||
document = _build_document(
|
||||
source_pdf=Path(_metadata_text(existing_metadata, "source_pdf")),
|
||||
@@ -276,6 +286,7 @@ def recheck_markdown(
|
||||
)
|
||||
final_status = determine_final_status(metadata_data, report_quality)
|
||||
|
||||
_write_text(markdown_file, markdown)
|
||||
_write_text(metadata_path, json.dumps(metadata_data, indent=2, ensure_ascii=False, sort_keys=True) + "\n")
|
||||
_write_text(report_path, report_text)
|
||||
|
||||
@@ -641,16 +652,17 @@ def _convert_in_work_dir(
|
||||
asset_root=plan.assets_dir,
|
||||
check_assets=False,
|
||||
)
|
||||
quality = _run_quality_checks(
|
||||
prepared = _prepare_markdown_for_output(
|
||||
normalized.markdown,
|
||||
markdown_dir=plan.markdown_path.parent,
|
||||
asset_root=plan.assets_dir,
|
||||
math_checker=math_checker,
|
||||
)
|
||||
quality = prepared.quality
|
||||
warnings = adapter_result.warnings + assets.warnings + normalized.warnings + quality.warnings
|
||||
document = _build_document(
|
||||
source_pdf=metadata_source,
|
||||
markdown=normalized.markdown,
|
||||
markdown=prepared.markdown,
|
||||
assets=assets.records,
|
||||
warnings=warnings,
|
||||
raw_structured=adapter_result.raw_structured,
|
||||
@@ -679,7 +691,7 @@ def _convert_in_work_dir(
|
||||
)
|
||||
final_status = determine_final_status(metadata_data, report_quality)
|
||||
|
||||
_write_text(plan.markdown_path, normalized.markdown)
|
||||
_write_text(plan.markdown_path, prepared.markdown)
|
||||
if metadata_enabled and plan.metadata_path is not None:
|
||||
_write_text(plan.metadata_path, json.dumps(metadata_data, indent=2, ensure_ascii=False, sort_keys=True) + "\n")
|
||||
_write_text(plan.report_path, report_text)
|
||||
@@ -824,10 +836,44 @@ def _run_quality_checks(
|
||||
return asset_quality
|
||||
if math_checker is None:
|
||||
math_checker = create_default_math_checker()
|
||||
math_quality = check_math_renderability(markdown, math_checker)
|
||||
math_quality = check_math_renderability_details(markdown, math_checker).quality
|
||||
return merge_quality_results(asset_quality, math_quality)
|
||||
|
||||
|
||||
def _prepare_markdown_for_output(
    markdown: str,
    *,
    markdown_dir: Path,
    asset_root: Path,
    math_checker: MathChecker | None,
) -> _PreparedMarkdown:
    """Quality-check *markdown* and repair failing math when possible.

    Asset links are always checked. When the text contains math, its
    renderability is checked as well (using the default checker if
    ``math_checker`` is None) and any reported failures are handed to
    ``repair_math_render_failures``. If at least one repair was made, the
    repaired text is re-checked and the repair warnings are merged into the
    result; otherwise the original text is returned with the initial quality.

    Args:
        markdown: Markdown text to check.
        markdown_dir: Directory passed to the asset-link check as ``markdown_dir``.
        asset_root: Directory passed to the asset-link check as ``asset_root``.
        math_checker: Checker to use for math; None selects the default one.

    Returns:
        The (possibly repaired) Markdown together with its quality result.
    """
    link_quality = check_asset_links(markdown, markdown_dir=markdown_dir, asset_root=asset_root)
    if not _has_math(markdown):
        # No formulas: only the asset-link findings apply.
        return _PreparedMarkdown(markdown=markdown, quality=link_quality)

    checker = create_default_math_checker() if math_checker is None else math_checker
    details = check_math_renderability_details(markdown, checker)
    unrepaired_quality = merge_quality_results(link_quality, details.quality)

    repair_possible = checker is not None and bool(details.failures)
    if not repair_possible:
        return _PreparedMarkdown(markdown=markdown, quality=unrepaired_quality)

    outcome = repair_math_render_failures(markdown, details.failures, checker)
    if not outcome.repairs:
        # Nothing was changed, so the initial findings still describe the text.
        return _PreparedMarkdown(markdown=markdown, quality=unrepaired_quality)

    # The text changed: re-run the checks against the repaired Markdown and
    # append the warnings produced by the repair step itself.
    rechecked_quality = _run_quality_checks(
        outcome.markdown,
        markdown_dir=markdown_dir,
        asset_root=asset_root,
        math_checker=checker,
    )
    repair_warnings = QualityResult(warnings=outcome.warnings)
    final_quality = merge_quality_results(rechecked_quality, repair_warnings)
    return _PreparedMarkdown(markdown=outcome.markdown, quality=final_quality)
|
||||
|
||||
|
||||
def _has_math(markdown: str) -> bool:
    """Return True when *markdown* contains a display or inline math span."""
    # Display math is tried first; ``any`` stops at the first pattern that matches.
    math_patterns = (_DISPLAY_MATH_RE, _INLINE_MATH_RE)
    return any(pattern.search(markdown) is not None for pattern in math_patterns)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user