import json
import re
from collections.abc import Generator
from typing import Any

import fitz
from dify_plugin import Tool
from dify_plugin.entities.tool import ToolInvokeMessage


class PdfToMarkdownTool(Tool):
    """Convert PDF to Markdown using an external catalog array."""

    def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
        """Convert the uploaded PDF to Markdown, grouped by the supplied catalog.

        Reads two entries from ``tool_parameters``:

        * ``file``: the uploaded file object; its ``blob`` attribute is handed
          to PyMuPDF as the PDF byte stream.
        * ``catalog``: JSON text describing the chapters (see
          ``_parse_catalog`` for the accepted shape).

        Yields:
            On invalid input or an unreadable PDF, a single text message that
            starts with ``"Error:"``. Otherwise a text message holding the
            Markdown, followed by a blob message with the same content
            encoded as UTF-8 and tagged ``text/markdown``.
        """
        file = tool_parameters.get("file")
        catalog_text = (tool_parameters.get("catalog") or "").strip()
        if not file:
            yield self.create_text_message("Error: file is required")
            return
        if not catalog_text:
            yield self.create_text_message("Error: catalog is required")
            return

        catalog = self._parse_catalog(catalog_text)
        if not catalog:
            yield self.create_text_message("Error: catalog must be a JSON array with title and page indexes")
            return

        # Opening is the step most likely to fail on user input (corrupt or
        # non-PDF upload). Report it like the other input errors instead of
        # letting the raw exception escape the tool.
        try:
            doc = fitz.open(stream=file.blob, filetype="pdf")
        except Exception as exc:
            yield self.create_text_message(f"Error: failed to open PDF: {exc}")
            return

        try:
            num_pages = len(doc)
            hf_texts = self._detect_headers_footers(doc, num_pages)
            page_mds = [self._page_to_markdown(doc[index], hf_texts) for index in range(num_pages)]
            final_md = self._assemble_by_catalog(catalog, page_mds, num_pages)

            yield self.create_text_message(final_md)
            yield self.create_blob_message(
                blob=final_md.encode("utf-8"),
                meta={"mime_type": "text/markdown"},
            )
        finally:
            # Always release the document, even if the consumer stops
            # iterating the generator early or a page fails to convert.
            doc.close()

    def _parse_catalog(self, catalog_text: str) -> list[dict[str, Any]]:
        try:
            raw = json.loads(catalog_text)
        except Exception:
            return []

        if not isinstance(raw, list):
            return []

        result: list[dict[str, Any]] = []
        for item in raw:
            if not isinstance(item, dict):
                continue

            title = str(item.get("title") or "").strip() or "Untitled"
            start_index = self._to_int(item.get("page_start_index"), None)
            end_index = self._to_int(item.get("page_end_index"), start_index)

            if start_index is None:
                start = self._to_int(item.get("start"), None)
                end = self._to_int(item.get("end"), start)
                if start is None:
                    continue
                start_index = max(0, start - 1)
                end_index = max(start_index, (end if end is not None else start) - 1)

            if end_index is None:
                end_index = start_index

            result.append(
                {
                    "title": title,
                    "page_start_index": max(0, start_index),
                    "page_end_index": max(start_index, end_index),
                }
            )
        return result

    def _detect_headers_footers(self, doc: fitz.Document, num_pages: int) -> set[str]:
        margin_ratio = 0.08
        sample_count = min(num_pages, 30)
        text_counts: dict[str, int] = {}

        for idx in range(sample_count):
            page = doc[idx]
            page_height = page.rect.height
            top_limit = page_height * margin_ratio
            bottom_limit = page_height * (1 - margin_ratio)
            try:
                blocks = page.get_text("blocks", sort=True) or []
            except Exception:
                continue

            seen: set[str] = set()
            for block in blocks:
                if len(block) < 7 or block[6] != 0:
                    continue
                y0, y1 = block[1], block[3]
                text = (block[4] or "").strip()
                if not text or len(text) < 2 or text in seen:
                    continue
                if y1 <= top_limit or y0 >= bottom_limit:
                    seen.add(text)
                    text_counts[text] = text_counts.get(text, 0) + 1

        threshold = max(3, sample_count * 0.35)
        return {text for text, count in text_counts.items() if count >= threshold}

    def _page_to_markdown(self, page: fitz.Page, hf_texts: set[str]) -> str:
        """Render one page as Markdown: text blocks first, then tables.

        Args:
            page: The page to convert.
            hf_texts: Repeated header/footer texts to suppress (as produced
                by ``_detect_headers_footers``).

        Returns:
            The page's text blocks followed by its Markdown tables, joined by
            blank lines. Skipped content: text blocks that substantially
            overlap an emitted or suppressed table, blocks in the top/bottom
            6% of the page that contain a known header/footer text, and
            blocks consisting solely of a 1-4 digit number (typically page
            numbers).
        """
        parts: list[str] = []
        page_height = page.rect.height
        top_margin = page_height * 0.06
        bottom_margin = page_height * 0.94

        # Table detection is best effort: older PyMuPDF builds have no
        # ``find_tables`` and the finder may fail on unusual pages. In either
        # case the page is rendered as plain text only.
        try:
            find_tables = getattr(page, "find_tables", None)
            tables = list(getattr(find_tables(), "tables", []) or []) if callable(find_tables) else []
        except Exception:
            tables = []

        table_rects: list[fitz.Rect] = []
        table_mds: list[str] = []
        for table in tables[:5]:  # at most five tables per page are handled
            # Failures are isolated per table so one bad table neither aborts
            # the remaining ones nor hides the text underneath it.
            try:
                cells = table.extract() or []
                if len(cells) < 2:
                    # Not a real table; leave its text to the block pass.
                    continue

                is_header_footer = False
                if hf_texts and len(cells) <= 3:
                    flat = " ".join(str(cell or "") for row in cells for cell in row)
                    is_header_footer = any(hf in flat for hf in hf_texts)

                md_table = "" if is_header_footer else self._cells_to_md_table(cells)
            except Exception:
                continue

            if not is_header_footer and not md_table:
                continue

            # Record the rectangle only once the table is either emitted or
            # deliberately suppressed. Recording it for a discarded table
            # would silently drop the overlapping text from the output.
            try:
                table_rects.append(fitz.Rect(table.bbox))
            except Exception:
                pass
            if md_table:
                table_mds.append(md_table)

        try:
            blocks = page.get_text("blocks", sort=True) or []
        except Exception:
            blocks = []

        for block in blocks:
            # Block tuples need at least 7 fields; index 6 is the block type
            # and only type 0 is rendered (text blocks per the PyMuPDF docs).
            if len(block) < 7 or block[6] != 0:
                continue
            x0, y0, x1, y1 = block[:4]
            text = (block[4] or "").strip()
            if not text:
                continue

            block_rect = fitz.Rect(x0, y0, x1, y1)
            if any(self._rects_overlap(block_rect, table_rect) for table_rect in table_rects):
                continue
            if hf_texts and (y1 <= top_margin or y0 >= bottom_margin):
                if any(hf in text for hf in hf_texts):
                    continue
            if re.fullmatch(r"\s*\d{1,4}\s*", text):
                continue

            parts.append(text)

        parts.extend(table_mds)
        return "\n\n".join(parts)

    def _assemble_by_catalog(self, catalog: list[dict[str, Any]], page_mds: list[str], num_pages: int) -> str:
        parts: list[str] = []
        used_pages: set[int] = set()

        for item in catalog:
            start = max(0, min(int(item["page_start_index"]), num_pages - 1))
            end = max(start, min(int(item["page_end_index"]), num_pages - 1))

            chapter_parts = [f"# {item['title']}\n"]
            for idx in range(start, end + 1):
                if idx < len(page_mds) and page_mds[idx].strip() and idx not in used_pages:
                    chapter_parts.append(page_mds[idx])
                    used_pages.add(idx)

            if len(chapter_parts) > 1:
                parts.append("\n\n".join(chapter_parts))

        if parts:
            return "\n\n---\n\n".join(parts)
        return "\n\n---\n\n".join(m for m in page_mds if m.strip())

    @staticmethod
    def _rects_overlap(block_rect: fitz.Rect, table_rect: fitz.Rect) -> bool:
        """Tell whether a text block lies substantially inside a table.

        Returns:
            ``True`` when the intersection of the two rectangles covers at
            least 30% of the block's own area. ``False`` when the
            intersection is empty or the block has no positive area.
        """
        overlap = block_rect & table_rect
        # A zero area doubles as the "no overlap" marker so that both
        # rejection cases share the single guard below.
        block_area = 0 if overlap.is_empty else block_rect.width * block_rect.height
        if block_area <= 0:
            return False
        covered_fraction = overlap.width * overlap.height / block_area
        return covered_fraction >= 0.3

    @staticmethod
    def _cells_to_md_table(cells: list) -> str:
        if not cells:
            return ""

        header = cells[0]
        ncols = len(header)
        if ncols == 0:
            return ""

        def clean(value: Any) -> str:
            return str(value or "").replace("|", "\\|").replace("\n", " ").strip()

        lines = [
            "| " + " | ".join(clean(cell) for cell in header) + " |",
            "| " + " | ".join("---" for _ in range(ncols)) + " |",
        ]
        for row in cells[1:]:
            padded = list(row) + [""] * max(0, ncols - len(row))
            lines.append("| " + " | ".join(clean(cell) for cell in padded[:ncols]) + " |")
        return "\n".join(lines)

    @staticmethod
    def _to_int(value: Any, default: int | None) -> int | None:
        try:
            if value is None or value == "":
                return default
            return int(value)
        except Exception:
            return default