2025-12-01 17:21:38 +08:00
parent 32fee2b8ab
commit fab8c13cb3
7511 changed files with 996300 additions and 0 deletions


@@ -0,0 +1,144 @@
"""Schema for Blobs and Blob Loaders.
The goal is to facilitate decoupling of content loading from content parsing code.
In addition, content loading code should provide a lazy loading interface by default.
"""
from __future__ import annotations
import contextlib
import mimetypes
from collections.abc import Generator, Mapping
from io import BufferedReader, BytesIO
from pathlib import Path, PurePath
from typing import Any, Union
from pydantic import BaseModel, ConfigDict, model_validator
PathLike = Union[str, PurePath]
class Blob(BaseModel):
"""A blob is used to represent raw data by either reference or value.
Provides an interface to materialize the blob in different representations, and
helps to decouple the development of data loaders from the downstream parsing of
the raw data.
Inspired by: https://developer.mozilla.org/en-US/docs/Web/API/Blob
"""
data: Union[bytes, str, None] = None # Raw data
mimetype: str | None = None # Not to be confused with a file extension
encoding: str = "utf-8" # Use utf-8 as default encoding, if decoding to string
# Location where the original content was found
# Represent location on the local file system
# Useful for situations where downstream code assumes it must work with file paths
# rather than in-memory content.
path: PathLike | None = None
model_config = ConfigDict(arbitrary_types_allowed=True, frozen=True)
@property
def source(self) -> str | None:
"""The source location of the blob as string if known otherwise none."""
return str(self.path) if self.path else None
@model_validator(mode="before")
@classmethod
def check_blob_is_valid(cls, values: Mapping[str, Any]) -> Mapping[str, Any]:
"""Verify that either data or path is provided."""
if "data" not in values and "path" not in values:
raise ValueError("Either data or path must be provided")
return values
def as_string(self) -> str:
"""Read data as a string."""
if self.data is None and self.path:
return Path(str(self.path)).read_text(encoding=self.encoding)
elif isinstance(self.data, bytes):
return self.data.decode(self.encoding)
elif isinstance(self.data, str):
return self.data
else:
raise ValueError(f"Unable to get string for blob {self}")
def as_bytes(self) -> bytes:
"""Read data as bytes."""
if isinstance(self.data, bytes):
return self.data
elif isinstance(self.data, str):
return self.data.encode(self.encoding)
elif self.data is None and self.path:
return Path(str(self.path)).read_bytes()
else:
raise ValueError(f"Unable to get bytes for blob {self}")
@contextlib.contextmanager
def as_bytes_io(self) -> Generator[Union[BytesIO, BufferedReader], None, None]:
"""Read data as a byte stream."""
if isinstance(self.data, bytes):
yield BytesIO(self.data)
elif self.data is None and self.path:
with open(str(self.path), "rb") as f:
yield f
else:
raise NotImplementedError(f"Unable to convert blob {self}")
@classmethod
def from_path(
cls,
path: PathLike,
*,
encoding: str = "utf-8",
mime_type: str | None = None,
guess_type: bool = True,
) -> Blob:
"""Load the blob from a path like object.
Args:
path: path like object to file to be read
encoding: Encoding to use if decoding the bytes into a string
mime_type: if provided, will be set as the mime-type of the data
guess_type: If True, the mimetype will be guessed from the file extension,
if a mime-type was not provided
Returns:
Blob instance
"""
if mime_type is None and guess_type:
_mimetype = mimetypes.guess_type(path)[0]
else:
_mimetype = mime_type
# We do not load the data immediately, instead we treat the blob as a
# reference to the underlying data.
return cls(data=None, mimetype=_mimetype, encoding=encoding, path=path)
@classmethod
def from_data(
cls,
data: Union[str, bytes],
*,
encoding: str = "utf-8",
mime_type: str | None = None,
path: str | None = None,
) -> Blob:
"""Initialize the blob from in-memory data.
Args:
data: the in-memory data associated with the blob
encoding: Encoding to use if decoding the bytes into a string
mime_type: if provided, will be set as the mime-type of the data
path: if provided, will be set as the source from which the data came
Returns:
Blob instance
"""
return cls(data=data, mimetype=mime_type, encoding=encoding, path=path)
def __repr__(self) -> str:
"""Define the blob representation."""
str_repr = f"Blob {id(self)}"
if self.source:
str_repr += f" {self.source}"
return str_repr
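
A minimal usage sketch of the Blob interface (the paths and values below are hypothetical): a path-backed blob defers all I/O until one of the accessors is called, while from_data wraps content that is already in memory.

blob = Blob.from_path("docs/report.pdf")  # hypothetical path; no I/O happens here
assert blob.mimetype == "application/pdf"  # guessed from the file extension
with blob.as_bytes_io() as stream:  # the file is only opened here
    header = stream.read(4)

inline = Blob.from_data("hello", mime_type="text/plain")
assert inline.as_bytes() == b"hello"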


@@ -0,0 +1,77 @@
"""Abstract interface for document loader implementations."""
import csv
import pandas as pd
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.extractor.helpers import detect_file_encodings
from core.rag.models.document import Document
class CSVExtractor(BaseExtractor):
"""Load CSV files.
Args:
file_path: Path to the file to load.
"""
def __init__(
self,
file_path: str,
encoding: str | None = None,
autodetect_encoding: bool = False,
source_column: str | None = None,
csv_args: dict | None = None,
):
"""Initialize with file path."""
self._file_path = file_path
self._encoding = encoding
self._autodetect_encoding = autodetect_encoding
self.source_column = source_column
self.csv_args = csv_args or {}
def extract(self) -> list[Document]:
"""Load data into document objects."""
docs = []
try:
with open(self._file_path, newline="", encoding=self._encoding) as csvfile:
docs = self._read_from_file(csvfile)
except UnicodeDecodeError as e:
if self._autodetect_encoding:
detected_encodings = detect_file_encodings(self._file_path)
for encoding in detected_encodings:
try:
with open(self._file_path, newline="", encoding=encoding.encoding) as csvfile:
docs = self._read_from_file(csvfile)
break
except UnicodeDecodeError:
continue
else:
raise RuntimeError(f"Error loading {self._file_path}") from e
return docs
def _read_from_file(self, csvfile) -> list[Document]:
docs = []
try:
# load csv file into pandas dataframe
df = pd.read_csv(csvfile, on_bad_lines="skip", **self.csv_args)
# check source column exists
if self.source_column and self.source_column not in df.columns:
raise ValueError(f"Source column '{self.source_column}' not found in CSV file.")
# create document objects
for i, row in df.iterrows():
content = ";".join(f"{col.strip()}: {str(row[col]).strip()}" for col in df.columns)
source = row[self.source_column] if self.source_column else ""
metadata = {"source": source, "row": i}
doc = Document(page_content=content, metadata=metadata)
docs.append(doc)
except csv.Error as e:
raise e
return docs
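
A usage sketch with assumed inputs (the file name and column names are hypothetical): each CSV row becomes one Document whose page_content is a ";"-joined list of "column: value" pairs, and source_column, when given, fills metadata["source"].

extractor = CSVExtractor(
    "data/products.csv",  # hypothetical path
    autodetect_encoding=True,
    source_column="url",  # must exist in the CSV, otherwise a ValueError is raised
    csv_args={"delimiter": ","},  # forwarded to pandas.read_csv
)
for doc in extractor.extract()[:3]:
    print(doc.metadata["row"], doc.page_content)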


@@ -0,0 +1,7 @@
from enum import StrEnum
class DatasourceType(StrEnum):
FILE = "upload_file"
NOTION = "notion_import"
WEBSITE = "website_crawl"


@@ -0,0 +1,46 @@
from pydantic import BaseModel, ConfigDict
from models.dataset import Document
from models.model import UploadFile
class NotionInfo(BaseModel):
"""
Notion import info.
"""
credential_id: str | None = None
notion_workspace_id: str
notion_obj_id: str
notion_page_type: str
document: Document | None = None
tenant_id: str
model_config = ConfigDict(arbitrary_types_allowed=True)
class WebsiteInfo(BaseModel):
"""
Website import info.
"""
model_config = ConfigDict(arbitrary_types_allowed=True)
provider: str
job_id: str
url: str
mode: str
tenant_id: str
only_main_content: bool = False
class ExtractSetting(BaseModel):
"""
Model class for document extraction settings.
"""
datasource_type: str
upload_file: UploadFile | None = None
notion_info: NotionInfo | None = None
website_info: WebsiteInfo | None = None
document_model: str | None = None
model_config = ConfigDict(arbitrary_types_allowed=True)
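
A sketch of how these settings might be populated for a website crawl (all field values are placeholders); ExtractProcessor.extract dispatches on datasource_type and reads the matching *_info object.

setting = ExtractSetting(
    datasource_type="website_crawl",
    website_info=WebsiteInfo(
        provider="firecrawl",
        job_id="job-123",  # placeholder
        url="https://example.com",
        mode="crawl",
        tenant_id="tenant-1",  # placeholder
        only_main_content=True,
    ),
    document_model="text_model",
)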


@@ -0,0 +1,77 @@
"""Abstract interface for document loader implementations."""
import os
from typing import cast
import pandas as pd
from openpyxl import load_workbook
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
class ExcelExtractor(BaseExtractor):
"""Load Excel files.
Args:
file_path: Path to the file to load.
"""
def __init__(self, file_path: str, encoding: str | None = None, autodetect_encoding: bool = False):
"""Initialize with file path."""
self._file_path = file_path
self._encoding = encoding
self._autodetect_encoding = autodetect_encoding
def extract(self) -> list[Document]:
"""Load from Excel file in xls or xlsx format using Pandas and openpyxl."""
documents = []
file_extension = os.path.splitext(self._file_path)[-1].lower()
if file_extension == ".xlsx":
wb = load_workbook(self._file_path, data_only=True)
for sheet_name in wb.sheetnames:
sheet = wb[sheet_name]
data = sheet.values
cols = next(data, None)
if cols is None:
continue
df = pd.DataFrame(data, columns=cols)
df.dropna(how="all", inplace=True)
for index, row in df.iterrows():
page_content = []
for col_index, (k, v) in enumerate(row.items()):
if pd.notna(v):
cell = sheet.cell(
row=cast(int, index) + 2, column=col_index + 1
) # +2 to account for header and 1-based index
if cell.hyperlink:
value = f"[{v}]({cell.hyperlink.target})"
page_content.append(f'"{k}":"{value}"')
else:
page_content.append(f'"{k}":"{v}"')
documents.append(
Document(page_content=";".join(page_content), metadata={"source": self._file_path})
)
elif file_extension == ".xls":
excel_file = pd.ExcelFile(self._file_path, engine="xlrd")
for excel_sheet_name in excel_file.sheet_names:
df = excel_file.parse(sheet_name=excel_sheet_name)
df.dropna(how="all", inplace=True)
for _, row in df.iterrows():
page_content = []
for k, v in row.items():
if pd.notna(v):
page_content.append(f'"{k}":"{v}"')
documents.append(
Document(page_content=";".join(page_content), metadata={"source": self._file_path})
)
else:
raise ValueError(f"Unsupported file extension: {file_extension}")
return documents
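
A brief usage sketch (the workbook path is hypothetical): every non-empty row becomes one Document made of "header":"value" pairs, and for .xlsx files cell hyperlinks are rewritten as markdown links.

extractor = ExcelExtractor("data/catalog.xlsx")  # .xls files are routed through xlrd instead
for doc in extractor.extract():
    print(doc.metadata["source"], doc.page_content[:80])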


@@ -0,0 +1,209 @@
import re
import tempfile
from pathlib import Path
from typing import Union
from urllib.parse import unquote
from configs import dify_config
from core.helper import ssrf_proxy
from core.rag.extractor.csv_extractor import CSVExtractor
from core.rag.extractor.entity.datasource_type import DatasourceType
from core.rag.extractor.entity.extract_setting import ExtractSetting
from core.rag.extractor.excel_extractor import ExcelExtractor
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.extractor.firecrawl.firecrawl_web_extractor import FirecrawlWebExtractor
from core.rag.extractor.html_extractor import HtmlExtractor
from core.rag.extractor.jina_reader_extractor import JinaReaderWebExtractor
from core.rag.extractor.markdown_extractor import MarkdownExtractor
from core.rag.extractor.notion_extractor import NotionExtractor
from core.rag.extractor.pdf_extractor import PdfExtractor
from core.rag.extractor.text_extractor import TextExtractor
from core.rag.extractor.unstructured.unstructured_doc_extractor import UnstructuredWordExtractor
from core.rag.extractor.unstructured.unstructured_eml_extractor import UnstructuredEmailExtractor
from core.rag.extractor.unstructured.unstructured_epub_extractor import UnstructuredEpubExtractor
from core.rag.extractor.unstructured.unstructured_markdown_extractor import UnstructuredMarkdownExtractor
from core.rag.extractor.unstructured.unstructured_msg_extractor import UnstructuredMsgExtractor
from core.rag.extractor.unstructured.unstructured_ppt_extractor import UnstructuredPPTExtractor
from core.rag.extractor.unstructured.unstructured_pptx_extractor import UnstructuredPPTXExtractor
from core.rag.extractor.unstructured.unstructured_xml_extractor import UnstructuredXmlExtractor
from core.rag.extractor.watercrawl.extractor import WaterCrawlWebExtractor
from core.rag.extractor.word_extractor import WordExtractor
from core.rag.models.document import Document
from extensions.ext_storage import storage
from models.model import UploadFile
SUPPORT_URL_CONTENT_TYPES = ["application/pdf", "text/plain", "application/json"]
USER_AGENT = (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124"
" Safari/537.36"
)
class ExtractProcessor:
@classmethod
def load_from_upload_file(
cls, upload_file: UploadFile, return_text: bool = False, is_automatic: bool = False
) -> Union[list[Document], str]:
extract_setting = ExtractSetting(
datasource_type=DatasourceType.FILE, upload_file=upload_file, document_model="text_model"
)
if return_text:
delimiter = "\n"
return delimiter.join([document.page_content for document in cls.extract(extract_setting, is_automatic)])
else:
return cls.extract(extract_setting, is_automatic)
@classmethod
def load_from_url(cls, url: str, return_text: bool = False) -> Union[list[Document], str]:
response = ssrf_proxy.get(url, headers={"User-Agent": USER_AGENT})
with tempfile.TemporaryDirectory() as temp_dir:
suffix = Path(url).suffix
if not suffix and suffix != ".":
# get content-type
if response.headers.get("Content-Type"):
suffix = "." + response.headers.get("Content-Type").split("/")[-1]
else:
content_disposition = response.headers.get("Content-Disposition")
filename_match = re.search(r'filename="([^"]+)"', content_disposition)
if filename_match:
filename = unquote(filename_match.group(1))
match = re.search(r"\.(\w+)$", filename)
if match:
suffix = "." + match.group(1)
else:
suffix = ""
# https://stackoverflow.com/questions/26541416/generate-temporary-file-names-without-creating-actual-file-in-python#comment90414256_26541521
file_path = f"{temp_dir}/{tempfile.gettempdir()}{suffix}"
Path(file_path).write_bytes(response.content)
extract_setting = ExtractSetting(datasource_type=DatasourceType.FILE, document_model="text_model")
if return_text:
delimiter = "\n"
return delimiter.join(
[
document.page_content
for document in cls.extract(extract_setting=extract_setting, file_path=file_path)
]
)
else:
return cls.extract(extract_setting=extract_setting, file_path=file_path)
@classmethod
def extract(
cls, extract_setting: ExtractSetting, is_automatic: bool = False, file_path: str | None = None
) -> list[Document]:
if extract_setting.datasource_type == DatasourceType.FILE:
with tempfile.TemporaryDirectory() as temp_dir:
if not file_path:
assert extract_setting.upload_file is not None, "upload_file is required"
upload_file: UploadFile = extract_setting.upload_file
suffix = Path(upload_file.key).suffix
# FIXME mypy: Cannot determine type of 'tempfile._get_candidate_names' better not use it here
file_path = f"{temp_dir}/{next(tempfile._get_candidate_names())}{suffix}" # type: ignore
storage.download(upload_file.key, file_path)
input_file = Path(file_path)
file_extension = input_file.suffix.lower()
etl_type = dify_config.ETL_TYPE
extractor: BaseExtractor | None = None
if etl_type == "Unstructured":
unstructured_api_url = dify_config.UNSTRUCTURED_API_URL or ""
unstructured_api_key = dify_config.UNSTRUCTURED_API_KEY or ""
if file_extension in {".xlsx", ".xls"}:
extractor = ExcelExtractor(file_path)
elif file_extension == ".pdf":
extractor = PdfExtractor(file_path)
elif file_extension in {".md", ".markdown", ".mdx"}:
extractor = (
UnstructuredMarkdownExtractor(file_path, unstructured_api_url, unstructured_api_key)
if is_automatic
else MarkdownExtractor(file_path, autodetect_encoding=True)
)
elif file_extension in {".htm", ".html"}:
extractor = HtmlExtractor(file_path)
elif file_extension == ".docx":
extractor = WordExtractor(file_path, upload_file.tenant_id, upload_file.created_by)
elif file_extension == ".doc":
extractor = UnstructuredWordExtractor(file_path, unstructured_api_url, unstructured_api_key)
elif file_extension == ".csv":
extractor = CSVExtractor(file_path, autodetect_encoding=True)
elif file_extension == ".msg":
extractor = UnstructuredMsgExtractor(file_path, unstructured_api_url, unstructured_api_key)
elif file_extension == ".eml":
extractor = UnstructuredEmailExtractor(file_path, unstructured_api_url, unstructured_api_key)
elif file_extension == ".ppt":
extractor = UnstructuredPPTExtractor(file_path, unstructured_api_url, unstructured_api_key)
# Note: the Unstructured API key/URL must be configured,
# because .ppt documents can only be parsed via the Unstructured API
elif file_extension == ".pptx":
extractor = UnstructuredPPTXExtractor(file_path, unstructured_api_url, unstructured_api_key)
elif file_extension == ".xml":
extractor = UnstructuredXmlExtractor(file_path, unstructured_api_url, unstructured_api_key)
elif file_extension == ".epub":
extractor = UnstructuredEpubExtractor(file_path, unstructured_api_url, unstructured_api_key)
else:
# txt
extractor = TextExtractor(file_path, autodetect_encoding=True)
else:
if file_extension in {".xlsx", ".xls"}:
extractor = ExcelExtractor(file_path)
elif file_extension == ".pdf":
extractor = PdfExtractor(file_path)
elif file_extension in {".md", ".markdown", ".mdx"}:
extractor = MarkdownExtractor(file_path, autodetect_encoding=True)
elif file_extension in {".htm", ".html"}:
extractor = HtmlExtractor(file_path)
elif file_extension == ".docx":
extractor = WordExtractor(file_path, upload_file.tenant_id, upload_file.created_by)
elif file_extension == ".csv":
extractor = CSVExtractor(file_path, autodetect_encoding=True)
elif file_extension == ".epub":
extractor = UnstructuredEpubExtractor(file_path)
else:
# txt
extractor = TextExtractor(file_path, autodetect_encoding=True)
return extractor.extract()
elif extract_setting.datasource_type == DatasourceType.NOTION:
assert extract_setting.notion_info is not None, "notion_info is required"
extractor = NotionExtractor(
notion_workspace_id=extract_setting.notion_info.notion_workspace_id,
notion_obj_id=extract_setting.notion_info.notion_obj_id,
notion_page_type=extract_setting.notion_info.notion_page_type,
document_model=extract_setting.notion_info.document,
tenant_id=extract_setting.notion_info.tenant_id,
credential_id=extract_setting.notion_info.credential_id,
)
return extractor.extract()
elif extract_setting.datasource_type == DatasourceType.WEBSITE:
assert extract_setting.website_info is not None, "website_info is required"
if extract_setting.website_info.provider == "firecrawl":
extractor = FirecrawlWebExtractor(
url=extract_setting.website_info.url,
job_id=extract_setting.website_info.job_id,
tenant_id=extract_setting.website_info.tenant_id,
mode=extract_setting.website_info.mode,
only_main_content=extract_setting.website_info.only_main_content,
)
return extractor.extract()
elif extract_setting.website_info.provider == "watercrawl":
extractor = WaterCrawlWebExtractor(
url=extract_setting.website_info.url,
job_id=extract_setting.website_info.job_id,
tenant_id=extract_setting.website_info.tenant_id,
mode=extract_setting.website_info.mode,
only_main_content=extract_setting.website_info.only_main_content,
)
return extractor.extract()
elif extract_setting.website_info.provider == "jinareader":
extractor = JinaReaderWebExtractor(
url=extract_setting.website_info.url,
job_id=extract_setting.website_info.job_id,
tenant_id=extract_setting.website_info.tenant_id,
mode=extract_setting.website_info.mode,
only_main_content=extract_setting.website_info.only_main_content,
)
return extractor.extract()
else:
raise ValueError(f"Unsupported website provider: {extract_setting.website_info.provider}")
else:
raise ValueError(f"Unsupported datasource type: {extract_setting.datasource_type}")
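
A sketch of the two common entry points (the upload_file object and the URL are placeholders); which concrete extractor runs depends on the file extension and on dify_config.ETL_TYPE as shown above.

# From a stored upload (upload_file is an UploadFile row from the models layer):
documents = ExtractProcessor.load_from_upload_file(upload_file)
text = ExtractProcessor.load_from_upload_file(upload_file, return_text=True)

# Directly from a URL, downloaded through the SSRF proxy into a temporary file:
documents = ExtractProcessor.load_from_url("https://example.com/paper.pdf")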


@@ -0,0 +1,11 @@
"""Abstract interface for document loader implementations."""
from abc import ABC, abstractmethod
class BaseExtractor(ABC):
"""Interface for extract files."""
@abstractmethod
def extract(self):
raise NotImplementedError
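
A minimal custom extractor, sketched only to show the contract: a subclass implements extract() and returns a list of Document objects.

from core.rag.models.document import Document

class UppercaseTextExtractor(BaseExtractor):
    """Toy extractor used purely for illustration."""

    def __init__(self, file_path: str):
        self._file_path = file_path

    def extract(self) -> list[Document]:
        with open(self._file_path, encoding="utf-8") as f:
            return [Document(page_content=f.read().upper())]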


@@ -0,0 +1,173 @@
import json
import time
from typing import Any, cast
import httpx
from extensions.ext_storage import storage
class FirecrawlApp:
def __init__(self, api_key=None, base_url=None):
self.api_key = api_key
self.base_url = base_url or "https://api.firecrawl.dev"
if self.api_key is None and self.base_url == "https://api.firecrawl.dev":
raise ValueError("No API key provided")
def scrape_url(self, url, params=None) -> dict[str, Any]:
# Documentation: https://docs.firecrawl.dev/api-reference/endpoint/scrape
headers = self._prepare_headers()
json_data = {
"url": url,
"formats": ["markdown"],
"onlyMainContent": True,
"timeout": 30000,
}
if params:
json_data.update(params)
response = self._post_request(f"{self.base_url}/v2/scrape", json_data, headers)
if response.status_code == 200:
response_data = response.json()
data = response_data["data"]
return self._extract_common_fields(data)
elif response.status_code in {402, 409, 500, 429, 408}:
self._handle_error(response, "scrape URL")
return {} # Avoid additional exception after handling error
else:
raise Exception(f"Failed to scrape URL. Status code: {response.status_code}")
def crawl_url(self, url, params=None) -> str:
# Documentation: https://docs.firecrawl.dev/api-reference/endpoint/crawl-post
headers = self._prepare_headers()
json_data = {"url": url}
if params:
json_data.update(params)
response = self._post_request(f"{self.base_url}/v2/crawl", json_data, headers)
if response.status_code == 200:
# There's also another two fields in the response: "success" (bool) and "url" (str)
job_id = response.json().get("id")
return cast(str, job_id)
else:
self._handle_error(response, "start crawl job")
return "" # unreachable
def map(self, url: str, params: dict[str, Any] | None = None) -> dict[str, Any]:
# Documentation: https://docs.firecrawl.dev/api-reference/endpoint/map
headers = self._prepare_headers()
json_data: dict[str, Any] = {"url": url, "integration": "dify"}
if params:
# Pass through provided params, including optional "sitemap": "only" | "include" | "skip"
json_data.update(params)
response = self._post_request(f"{self.base_url}/v2/map", json_data, headers)
if response.status_code == 200:
return cast(dict[str, Any], response.json())
elif response.status_code in {402, 409, 500, 429, 408}:
self._handle_error(response, "start map job")
return {}
else:
raise Exception(f"Failed to start map job. Status code: {response.status_code}")
def check_crawl_status(self, job_id) -> dict[str, Any]:
headers = self._prepare_headers()
response = self._get_request(f"{self.base_url}/v2/crawl/{job_id}", headers)
if response.status_code == 200:
crawl_status_response = response.json()
if crawl_status_response.get("status") == "completed":
total = crawl_status_response.get("total", 0)
if total == 0:
raise Exception("Failed to check crawl status. Error: No page found")
data = crawl_status_response.get("data", [])
url_data_list = []
for item in data:
if isinstance(item, dict) and "metadata" in item and "markdown" in item:
url_data = self._extract_common_fields(item)
url_data_list.append(url_data)
if url_data_list:
file_key = "website_files/" + job_id + ".txt"
try:
if storage.exists(file_key):
storage.delete(file_key)
storage.save(file_key, json.dumps(url_data_list).encode("utf-8"))
except Exception as e:
raise Exception(f"Error saving crawl data: {e}")
return self._format_crawl_status_response("completed", crawl_status_response, url_data_list)
else:
return self._format_crawl_status_response(
crawl_status_response.get("status"), crawl_status_response, []
)
else:
self._handle_error(response, "check crawl status")
return {} # unreachable
def _format_crawl_status_response(
self, status: str, crawl_status_response: dict[str, Any], url_data_list: list[dict[str, Any]]
) -> dict[str, Any]:
return {
"status": status,
"total": crawl_status_response.get("total"),
"current": crawl_status_response.get("completed"),
"data": url_data_list,
}
def _extract_common_fields(self, item: dict[str, Any]) -> dict[str, Any]:
return {
"title": item.get("metadata", {}).get("title"),
"description": item.get("metadata", {}).get("description"),
"source_url": item.get("metadata", {}).get("sourceURL"),
"markdown": item.get("markdown"),
}
def _prepare_headers(self) -> dict[str, Any]:
return {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}"}
def _post_request(self, url, data, headers, retries=3, backoff_factor=0.5) -> httpx.Response:
for attempt in range(retries):
response = httpx.post(url, headers=headers, json=data)
if response.status_code == 502:
time.sleep(backoff_factor * (2**attempt))
else:
return response
return response
def _get_request(self, url, headers, retries=3, backoff_factor=0.5) -> httpx.Response:
for attempt in range(retries):
response = httpx.get(url, headers=headers)
if response.status_code == 502:
time.sleep(backoff_factor * (2**attempt))
else:
return response
return response
def _handle_error(self, response, action):
error_message = response.json().get("error", "Unknown error occurred")
raise Exception(f"Failed to {action}. Status code: {response.status_code}. Error: {error_message}") # type: ignore[return]
def search(self, query: str, params: dict[str, Any] | None = None) -> dict[str, Any]:
# Documentation: https://docs.firecrawl.dev/api-reference/endpoint/search
headers = self._prepare_headers()
json_data = {
"query": query,
"limit": 5,
"lang": "en",
"country": "us",
"timeout": 60000,
"ignoreInvalidURLs": True,
"scrapeOptions": {},
"sources": [
{"type": "web"},
],
"integration": "dify",
}
if params:
json_data.update(params)
response = self._post_request(f"{self.base_url}/v2/search", json_data, headers)
if response.status_code == 200:
response_data = response.json()
if not response_data.get("success"):
raise Exception(f"Search failed. Error: {response_data.get('warning', 'Unknown error')}")
return cast(dict[str, Any], response_data)
elif response.status_code in {402, 409, 500, 429, 408}:
self._handle_error(response, "perform search")
return {} # Avoid additional exception after handling error
else:
raise Exception(f"Failed to perform search. Status code: {response.status_code}")
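
A usage sketch against the v2 endpoints wrapped above (the API key, URL, and params are placeholders): scrape_url returns the normalized fields immediately, while crawl_url only returns a job id that check_crawl_status polls later.

app = FirecrawlApp(api_key="fc-...")  # placeholder key; base_url defaults to https://api.firecrawl.dev
page = app.scrape_url("https://example.com", params={"onlyMainContent": True})
print(page["title"], len(page["markdown"] or ""))

job_id = app.crawl_url("https://example.com", params={"limit": 10})  # illustrative param
status = app.check_crawl_status(job_id)  # {"status", "total", "current", "data"}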


@@ -0,0 +1,63 @@
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
from services.website_service import WebsiteService
class FirecrawlWebExtractor(BaseExtractor):
"""
Crawl and scrape websites and return content in clean, LLM-ready markdown.
Args:
url: The URL to scrape.
job_id: The crawl job id.
tenant_id: The tenant id.
mode: The mode of operation. Defaults to 'scrape'. Options are 'crawl', 'scrape' and 'crawl_return_urls'.
only_main_content: Only return the main content of the page excluding headers, navs, footers, etc.
"""
def __init__(
self,
url: str,
job_id: str,
tenant_id: str,
mode: str = "crawl",
only_main_content: bool = True,
):
"""Initialize with url, api_key, base_url and mode."""
self._url = url
self.job_id = job_id
self.tenant_id = tenant_id
self.mode = mode
self.only_main_content = only_main_content
def extract(self) -> list[Document]:
"""Extract content from the URL."""
documents = []
if self.mode == "crawl":
crawl_data = WebsiteService.get_crawl_url_data(self.job_id, "firecrawl", self._url, self.tenant_id)
if crawl_data is None:
return []
document = Document(
page_content=crawl_data.get("markdown", ""),
metadata={
"source_url": crawl_data.get("source_url"),
"description": crawl_data.get("description"),
"title": crawl_data.get("title"),
},
)
documents.append(document)
elif self.mode == "scrape":
scrape_data = WebsiteService.get_scrape_url_data(
"firecrawl", self._url, self.tenant_id, self.only_main_content
)
document = Document(
page_content=scrape_data.get("markdown", ""),
metadata={
"source_url": scrape_data.get("source_url"),
"description": scrape_data.get("description"),
"title": scrape_data.get("title"),
},
)
documents.append(document)
return documents


@@ -0,0 +1,48 @@
"""Document loader helpers."""
import concurrent.futures
from typing import NamedTuple, cast
class FileEncoding(NamedTuple):
"""A file encoding as the NamedTuple."""
encoding: str | None
"""The encoding of the file."""
confidence: float
"""The confidence of the encoding."""
language: str | None
"""The language of the file."""
def detect_file_encodings(file_path: str, timeout: int = 5, sample_size: int = 1024 * 1024) -> list[FileEncoding]:
"""Try to detect the file encoding.
Returns a list of `FileEncoding` tuples with the detected encodings ordered
by confidence.
Args:
file_path: The path to the file to detect the encoding for.
timeout: The timeout in seconds for the encoding detection.
sample_size: The number of bytes to read for encoding detection. Default is 1MB.
For large files, reading only a sample is sufficient and prevents timeout.
"""
import chardet
def read_and_detect(file_path: str):
with open(file_path, "rb") as f:
# Read only a sample of the file for encoding detection
# This prevents timeout on large files while still providing accurate encoding detection
rawdata = f.read(sample_size)
return cast(list[dict], chardet.detect_all(rawdata))
with concurrent.futures.ThreadPoolExecutor() as executor:
future = executor.submit(read_and_detect, file_path)
try:
encodings = future.result(timeout=timeout)
except concurrent.futures.TimeoutError:
raise TimeoutError(f"Timeout reached while detecting encoding for {file_path}")
if all(encoding["encoding"] is None for encoding in encodings):
raise RuntimeError(f"Could not detect encoding for {file_path}")
return [FileEncoding(**enc) for enc in encodings if enc["encoding"] is not None]
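
A usage sketch (the path is hypothetical): detection runs in a worker thread so it can be abandoned after timeout seconds, and only the first sample_size bytes are handed to chardet.

from pathlib import Path

encodings = detect_file_encodings("data/legacy.txt", timeout=5)
best = encodings[0]  # entries are ordered by chardet confidence
text = Path("data/legacy.txt").read_text(encoding=best.encoding)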


@@ -0,0 +1,32 @@
"""Abstract interface for document loader implementations."""
from bs4 import BeautifulSoup
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
class HtmlExtractor(BaseExtractor):
"""
Load html files.
Args:
file_path: Path to the file to load.
"""
def __init__(self, file_path: str):
"""Initialize with file path."""
self._file_path = file_path
def extract(self) -> list[Document]:
return [Document(page_content=self._load_as_text())]
def _load_as_text(self) -> str:
text: str = ""
with open(self._file_path, "rb") as fp:
soup = BeautifulSoup(fp, "html.parser")
text = soup.get_text()
text = text.strip() if text else ""
return text


@@ -0,0 +1,42 @@
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
from services.website_service import WebsiteService
class JinaReaderWebExtractor(BaseExtractor):
"""
Crawl and scrape websites and return content in clean, LLM-ready markdown.
"""
def __init__(
self,
url: str,
job_id: str,
tenant_id: str,
mode: str = "crawl",
only_main_content: bool = False,
):
"""Initialize with url, api_key, base_url and mode."""
self._url = url
self.job_id = job_id
self.tenant_id = tenant_id
self.mode = mode
self.only_main_content = only_main_content
def extract(self) -> list[Document]:
"""Extract content from the URL."""
documents = []
if self.mode == "crawl":
crawl_data = WebsiteService.get_crawl_url_data(self.job_id, "jinareader", self._url, self.tenant_id)
if crawl_data is None:
return []
document = Document(
page_content=crawl_data.get("content", ""),
metadata={
"source_url": crawl_data.get("url"),
"description": crawl_data.get("description"),
"title": crawl_data.get("title"),
},
)
documents.append(document)
return documents


@@ -0,0 +1,121 @@
"""Abstract interface for document loader implementations."""
import re
from pathlib import Path
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.extractor.helpers import detect_file_encodings
from core.rag.models.document import Document
class MarkdownExtractor(BaseExtractor):
"""Load Markdown files.
Args:
file_path: Path to the file to load.
"""
def __init__(
self,
file_path: str,
remove_hyperlinks: bool = False,
remove_images: bool = False,
encoding: str | None = None,
autodetect_encoding: bool = True,
):
"""Initialize with file path."""
self._file_path = file_path
self._remove_hyperlinks = remove_hyperlinks
self._remove_images = remove_images
self._encoding = encoding
self._autodetect_encoding = autodetect_encoding
def extract(self) -> list[Document]:
"""Load from file path."""
tups = self.parse_tups(self._file_path)
documents = []
for header, value in tups:
value = value.strip()
if header is None:
documents.append(Document(page_content=value))
else:
documents.append(Document(page_content=f"\n\n{header}\n{value}"))
return documents
def markdown_to_tups(self, markdown_text: str) -> list[tuple[str | None, str]]:
"""Convert a markdown file to a dictionary.
The keys are the headers and the values are the text under each header.
"""
markdown_tups: list[tuple[str | None, str]] = []
lines = markdown_text.split("\n")
current_header = None
current_text = ""
code_block_flag = False
for line in lines:
if line.startswith("```"):
code_block_flag = not code_block_flag
current_text += line + "\n"
continue
if code_block_flag:
current_text += line + "\n"
continue
header_match = re.match(r"^#+\s", line)
if header_match:
markdown_tups.append((current_header, current_text))
current_header = line
current_text = ""
else:
current_text += line + "\n"
markdown_tups.append((current_header, current_text))
markdown_tups = [
(re.sub(r"#", "", key).strip() if key else None, re.sub(r"<.*?>", "", value))
for key, value in markdown_tups
]
return markdown_tups
def remove_images(self, content: str) -> str:
"""Get a dictionary of a markdown file from its path."""
pattern = r"!{1}\[\[(.*)\]\]"
content = re.sub(pattern, "", content)
return content
def remove_hyperlinks(self, content: str) -> str:
"""Get a dictionary of a markdown file from its path."""
pattern = r"\[(.*?)\]\((.*?)\)"
content = re.sub(pattern, r"\1", content)
return content
def parse_tups(self, filepath: str) -> list[tuple[str | None, str]]:
"""Parse file into tuples."""
content = ""
try:
content = Path(filepath).read_text(encoding=self._encoding)
except UnicodeDecodeError as e:
if self._autodetect_encoding:
detected_encodings = detect_file_encodings(filepath)
for encoding in detected_encodings:
try:
content = Path(filepath).read_text(encoding=encoding.encoding)
break
except UnicodeDecodeError:
continue
else:
raise RuntimeError(f"Error loading {filepath}") from e
except Exception as e:
raise RuntimeError(f"Error loading {filepath}") from e
if self._remove_hyperlinks:
content = self.remove_hyperlinks(content)
if self._remove_images:
content = self.remove_images(content)
return self.markdown_to_tups(content)
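
A sketch of the header-splitting behaviour on an in-memory string (the content is made up): markdown_to_tups groups text under the nearest preceding heading and leaves fenced code blocks intact, and extract() then turns each tuple into a Document.

extractor = MarkdownExtractor("notes.md")  # placeholder path; markdown_to_tups does not read it
tups = extractor.markdown_to_tups("# Intro\nhello\n\n## Usage\nrun it\n")
# -> [(None, ''), ('Intro', 'hello\n\n'), ('Usage', 'run it\n\n')]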


@@ -0,0 +1,386 @@
import json
import logging
import operator
from typing import Any, cast
import httpx
from configs import dify_config
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
from extensions.ext_database import db
from models.dataset import Document as DocumentModel
from services.datasource_provider_service import DatasourceProviderService
logger = logging.getLogger(__name__)
BLOCK_CHILD_URL_TMPL = "https://api.notion.com/v1/blocks/{block_id}/children"
DATABASE_URL_TMPL = "https://api.notion.com/v1/databases/{database_id}/query"
SEARCH_URL = "https://api.notion.com/v1/search"
RETRIEVE_PAGE_URL_TMPL = "https://api.notion.com/v1/pages/{page_id}"
RETRIEVE_DATABASE_URL_TMPL = "https://api.notion.com/v1/databases/{database_id}"
# if the user wants to split by headings, use the corresponding splitter
HEADING_SPLITTER = {
"heading_1": "# ",
"heading_2": "## ",
"heading_3": "### ",
}
class NotionExtractor(BaseExtractor):
def __init__(
self,
notion_workspace_id: str,
notion_obj_id: str,
notion_page_type: str,
tenant_id: str,
document_model: DocumentModel | None = None,
notion_access_token: str | None = None,
credential_id: str | None = None,
):
self._notion_access_token = None
self._document_model = document_model
self._notion_workspace_id = notion_workspace_id
self._notion_obj_id = notion_obj_id
self._notion_page_type = notion_page_type
self._credential_id = credential_id
if notion_access_token:
self._notion_access_token = notion_access_token
else:
self._notion_access_token = self._get_access_token(tenant_id, self._credential_id)
if not self._notion_access_token:
integration_token = dify_config.NOTION_INTEGRATION_TOKEN
if integration_token is None:
raise ValueError(
"Must specify `integration_token` or set environment variable `NOTION_INTEGRATION_TOKEN`."
)
self._notion_access_token = integration_token
def extract(self) -> list[Document]:
self.update_last_edited_time(self._document_model)
text_docs = self._load_data_as_documents(self._notion_obj_id, self._notion_page_type)
return text_docs
def _load_data_as_documents(self, notion_obj_id: str, notion_page_type: str) -> list[Document]:
docs = []
if notion_page_type == "database":
# get all the pages in the database
page_text_documents = self._get_notion_database_data(notion_obj_id)
docs.extend(page_text_documents)
elif notion_page_type == "page":
page_text_list = self._get_notion_block_data(notion_obj_id)
docs.append(Document(page_content="\n".join(page_text_list)))
else:
raise ValueError("notion page type not supported")
return docs
def _get_notion_database_data(self, database_id: str, query_dict: dict[str, Any] = {}) -> list[Document]:
"""Get all the pages from a Notion database."""
assert self._notion_access_token is not None, "Notion access token is required"
database_content = []
next_cursor = None
has_more = True
while has_more:
current_query = query_dict.copy()
if next_cursor:
current_query["start_cursor"] = next_cursor
res = httpx.post(
DATABASE_URL_TMPL.format(database_id=database_id),
headers={
"Authorization": "Bearer " + self._notion_access_token,
"Content-Type": "application/json",
"Notion-Version": "2022-06-28",
},
json=current_query,
)
response_data = res.json()
if "results" not in response_data or response_data["results"] is None:
break
for result in response_data["results"]:
properties = result["properties"]
data = {}
value: Any
for property_name, property_value in properties.items():
type = property_value["type"]
if type == "multi_select":
value = []
multi_select_list = property_value[type]
for multi_select in multi_select_list:
value.append(multi_select["name"])
elif type in {"rich_text", "title"}:
if len(property_value[type]) > 0:
value = property_value[type][0]["plain_text"]
else:
value = ""
elif type in {"select", "status"}:
if property_value[type]:
value = property_value[type]["name"]
else:
value = ""
else:
value = property_value[type]
data[property_name] = value
row_dict = {k: v for k, v in data.items() if v}
row_content = ""
for key, value in sorted(row_dict.items(), key=operator.itemgetter(0)):
if isinstance(value, dict):
value_dict = {k: v for k, v in value.items() if v}
value_content = "".join(f"{k}:{v} " for k, v in value_dict.items())
row_content = row_content + f"{key}:{value_content}\n"
else:
row_content = row_content + f"{key}:{value}\n"
if "url" in result:
row_content = row_content + f"Row Page URL:{result.get('url', '')}\n"
database_content.append(row_content)
has_more = response_data.get("has_more", False)
next_cursor = response_data.get("next_cursor")
if not database_content:
return []
return [Document(page_content="\n".join(database_content))]
def _get_notion_block_data(self, page_id: str) -> list[str]:
assert self._notion_access_token is not None, "Notion access token is required"
result_lines_arr = []
start_cursor = None
block_url = BLOCK_CHILD_URL_TMPL.format(block_id=page_id)
while True:
query_dict: dict[str, Any] = {} if not start_cursor else {"start_cursor": start_cursor}
try:
res = httpx.request(
"GET",
block_url,
headers={
"Authorization": "Bearer " + self._notion_access_token,
"Content-Type": "application/json",
"Notion-Version": "2022-06-28",
},
params=query_dict,
)
if res.status_code != 200:
raise ValueError(f"Error fetching Notion block data: {res.text}")
data = res.json()
except httpx.HTTPError as e:
raise ValueError("Error fetching Notion block data") from e
if "results" not in data or not isinstance(data["results"], list):
raise ValueError("Error fetching Notion block data")
for result in data["results"]:
result_type = result["type"]
result_obj = result[result_type]
cur_result_text_arr = []
if result_type == "table":
result_block_id = result["id"]
text = self._read_table_rows(result_block_id)
text += "\n\n"
result_lines_arr.append(text)
else:
if "rich_text" in result_obj:
for rich_text in result_obj["rich_text"]:
# skip if doesn't have text object
if "text" in rich_text:
text = rich_text["text"]["content"]
cur_result_text_arr.append(text)
result_block_id = result["id"]
has_children = result["has_children"]
block_type = result["type"]
if has_children and block_type != "child_page":
children_text = self._read_block(result_block_id, num_tabs=1)
cur_result_text_arr.append(children_text)
cur_result_text = "\n".join(cur_result_text_arr)
if result_type in HEADING_SPLITTER:
result_lines_arr.append(f"{HEADING_SPLITTER[result_type]}{cur_result_text}")
else:
result_lines_arr.append(cur_result_text + "\n\n")
if data["next_cursor"] is None:
break
else:
start_cursor = data["next_cursor"]
return result_lines_arr
def _read_block(self, block_id: str, num_tabs: int = 0) -> str:
"""Read a block."""
assert self._notion_access_token is not None, "Notion access token is required"
result_lines_arr = []
start_cursor = None
block_url = BLOCK_CHILD_URL_TMPL.format(block_id=block_id)
while True:
query_dict: dict[str, Any] = {} if not start_cursor else {"start_cursor": start_cursor}
res = httpx.request(
"GET",
block_url,
headers={
"Authorization": "Bearer " + self._notion_access_token,
"Content-Type": "application/json",
"Notion-Version": "2022-06-28",
},
params=query_dict,
)
data = res.json()
if "results" not in data or data["results"] is None:
break
for result in data["results"]:
result_type = result["type"]
result_obj = result[result_type]
cur_result_text_arr = []
if result_type == "table":
result_block_id = result["id"]
text = self._read_table_rows(result_block_id)
result_lines_arr.append(text)
else:
if "rich_text" in result_obj:
for rich_text in result_obj["rich_text"]:
# skip if doesn't have text object
if "text" in rich_text:
text = rich_text["text"]["content"]
prefix = "\t" * num_tabs
cur_result_text_arr.append(prefix + text)
result_block_id = result["id"]
has_children = result["has_children"]
block_type = result["type"]
if has_children and block_type != "child_page":
children_text = self._read_block(result_block_id, num_tabs=num_tabs + 1)
cur_result_text_arr.append(children_text)
cur_result_text = "\n".join(cur_result_text_arr)
if result_type in HEADING_SPLITTER:
result_lines_arr.append(f"{HEADING_SPLITTER[result_type]}{cur_result_text}")
else:
result_lines_arr.append(cur_result_text + "\n\n")
if data["next_cursor"] is None:
break
else:
start_cursor = data["next_cursor"]
result_lines = "\n".join(result_lines_arr)
return result_lines
def _read_table_rows(self, block_id: str) -> str:
"""Read table rows."""
assert self._notion_access_token is not None, "Notion access token is required"
done = False
result_lines_arr = []
start_cursor = None
block_url = BLOCK_CHILD_URL_TMPL.format(block_id=block_id)
while not done:
query_dict: dict[str, Any] = {} if not start_cursor else {"start_cursor": start_cursor}
res = httpx.request(
"GET",
block_url,
headers={
"Authorization": "Bearer " + self._notion_access_token,
"Content-Type": "application/json",
"Notion-Version": "2022-06-28",
},
params=query_dict,
)
data = res.json()
# get table headers text
table_header_cell_texts = []
table_header_cells = data["results"][0]["table_row"]["cells"]
for table_header_cell in table_header_cells:
if table_header_cell:
for table_header_cell_text in table_header_cell:
text = table_header_cell_text["text"]["content"]
table_header_cell_texts.append(text)
else:
table_header_cell_texts.append("")
# Initialize Markdown table with headers
markdown_table = "| " + " | ".join(table_header_cell_texts) + " |\n"
markdown_table += "| " + " | ".join(["---"] * len(table_header_cell_texts)) + " |\n"
# Process data to format each row in Markdown table format
results = data["results"]
for i in range(len(results) - 1):
column_texts = []
table_column_cells = data["results"][i + 1]["table_row"]["cells"]
for j in range(len(table_column_cells)):
if table_column_cells[j]:
for table_column_cell_text in table_column_cells[j]:
column_text = table_column_cell_text["text"]["content"]
column_texts.append(column_text)
# Add row to Markdown table
markdown_table += "| " + " | ".join(column_texts) + " |\n"
result_lines_arr.append(markdown_table)
if data["next_cursor"] is None:
done = True
break
else:
start_cursor = data["next_cursor"]
result_lines = "\n".join(result_lines_arr)
return result_lines
def update_last_edited_time(self, document_model: DocumentModel | None):
if not document_model:
return
last_edited_time = self.get_notion_last_edited_time()
data_source_info = document_model.data_source_info_dict
if data_source_info:
data_source_info["last_edited_time"] = last_edited_time
db.session.query(DocumentModel).filter_by(id=document_model.id).update(
{DocumentModel.data_source_info: json.dumps(data_source_info)}
) # type: ignore
db.session.commit()
def get_notion_last_edited_time(self) -> str:
assert self._notion_access_token is not None, "Notion access token is required"
obj_id = self._notion_obj_id
page_type = self._notion_page_type
if page_type == "database":
retrieve_page_url = RETRIEVE_DATABASE_URL_TMPL.format(database_id=obj_id)
else:
retrieve_page_url = RETRIEVE_PAGE_URL_TMPL.format(page_id=obj_id)
query_dict: dict[str, Any] = {}
res = httpx.request(
"GET",
retrieve_page_url,
headers={
"Authorization": "Bearer " + self._notion_access_token,
"Content-Type": "application/json",
"Notion-Version": "2022-06-28",
},
json=query_dict,
)
data = res.json()
return cast(str, data["last_edited_time"])
@classmethod
def _get_access_token(cls, tenant_id: str, credential_id: str | None) -> str:
# get credential from tenant_id and credential_id
if not credential_id:
raise Exception(f"No credential id found for tenant {tenant_id}")
datasource_provider_service = DatasourceProviderService()
credential = datasource_provider_service.get_datasource_credentials(
tenant_id=tenant_id,
credential_id=credential_id,
provider="notion_datasource",
plugin_id="langgenius/notion_datasource",
)
if not credential:
raise Exception(f"No notion credential found for tenant {tenant_id} and credential {credential_id}")
return cast(str, credential["integration_secret"])
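
A construction sketch (all identifiers are placeholders): when notion_access_token is not passed, the token is resolved from the tenant's datasource credentials, falling back to NOTION_INTEGRATION_TOKEN.

extractor = NotionExtractor(
    notion_workspace_id="ws-1",  # placeholders throughout
    notion_obj_id="page-or-database-id",
    notion_page_type="page",  # or "database"
    tenant_id="tenant-1",
    credential_id="cred-1",
)
docs = extractor.extract()  # one Document per page, or one per database dump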


@@ -0,0 +1,66 @@
"""Abstract interface for document loader implementations."""
import contextlib
from collections.abc import Iterator
from core.rag.extractor.blob.blob import Blob
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
from extensions.ext_storage import storage
class PdfExtractor(BaseExtractor):
"""Load pdf files.
Args:
file_path: Path to the file to load.
"""
def __init__(self, file_path: str, file_cache_key: str | None = None):
"""Initialize with file path."""
self._file_path = file_path
self._file_cache_key = file_cache_key
def extract(self) -> list[Document]:
plaintext_file_exists = False
if self._file_cache_key:
with contextlib.suppress(FileNotFoundError):
text = storage.load(self._file_cache_key).decode("utf-8")
plaintext_file_exists = True
return [Document(page_content=text)]
documents = list(self.load())
text_list = []
for document in documents:
text_list.append(document.page_content)
text = "\n\n".join(text_list)
# save plaintext file for caching
if not plaintext_file_exists and self._file_cache_key:
storage.save(self._file_cache_key, text.encode("utf-8"))
return documents
def load(
self,
) -> Iterator[Document]:
"""Lazy load given path as pages."""
blob = Blob.from_path(self._file_path)
yield from self.parse(blob)
def parse(self, blob: Blob) -> Iterator[Document]:
"""Lazily parse the blob."""
import pypdfium2 # type: ignore
with blob.as_bytes_io() as file_path:
pdf_reader = pypdfium2.PdfDocument(file_path, autoclose=True)
try:
for page_number, page in enumerate(pdf_reader):
text_page = page.get_textpage()
content = text_page.get_text_range()
text_page.close()
page.close()
metadata = {"source": blob.source, "page": page_number}
yield Document(page_content=content, metadata=metadata)
finally:
pdf_reader.close()
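
A usage sketch (path and cache key are placeholders): pages are parsed lazily through pypdfium2, and when a file_cache_key is supplied the concatenated plaintext is stored so a later run can skip parsing entirely.

extractor = PdfExtractor("docs/report.pdf", file_cache_key="pdf_cache/report.txt")
pages = extractor.extract()  # one Document per page; metadata["page"] is 0-based
first_page = next(extractor.load())  # or stream pages without building the full list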


@@ -0,0 +1,48 @@
"""Abstract interface for document loader implementations."""
from pathlib import Path
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.extractor.helpers import detect_file_encodings
from core.rag.models.document import Document
class TextExtractor(BaseExtractor):
"""Load text files.
Args:
file_path: Path to the file to load.
"""
def __init__(self, file_path: str, encoding: str | None = None, autodetect_encoding: bool = False):
"""Initialize with file path."""
self._file_path = file_path
self._encoding = encoding
self._autodetect_encoding = autodetect_encoding
def extract(self) -> list[Document]:
"""Load from file path."""
text = ""
try:
text = Path(self._file_path).read_text(encoding=self._encoding)
except UnicodeDecodeError as e:
if self._autodetect_encoding:
detected_encodings = detect_file_encodings(self._file_path)
for encoding in detected_encodings:
try:
text = Path(self._file_path).read_text(encoding=encoding.encoding)
break
except UnicodeDecodeError:
continue
else:
raise RuntimeError(
f"Decode failed: {self._file_path}, all detected encodings failed. Original error: {e}"
)
else:
raise RuntimeError(f"Decode failed: {self._file_path}, specified encoding failed. Original error: {e}")
except Exception as e:
raise RuntimeError(f"Error loading {self._file_path}") from e
metadata = {"source": self._file_path}
return [Document(page_content=text, metadata=metadata)]
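
A usage sketch of the encoding fallback (the path is hypothetical): the explicit encoding is tried first, and only on UnicodeDecodeError are the detected encodings attempted in confidence order.

docs = TextExtractor("data/notes.txt", autodetect_encoding=True).extract()
print(docs[0].metadata["source"], len(docs[0].page_content))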


@@ -0,0 +1,59 @@
import logging
import os
from configs import dify_config
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
logger = logging.getLogger(__name__)
class UnstructuredWordExtractor(BaseExtractor):
"""Loader that uses unstructured to load word documents."""
def __init__(self, file_path: str, api_url: str, api_key: str = ""):
"""Initialize with file path."""
self._file_path = file_path
self._api_url = api_url
self._api_key = api_key
def extract(self) -> list[Document]:
from unstructured.__version__ import __version__ as __unstructured_version__
from unstructured.file_utils.filetype import FileType, detect_filetype
unstructured_version = tuple(int(x) for x in __unstructured_version__.split("."))
# check the file extension
try:
import magic # noqa: F401
is_doc = detect_filetype(self._file_path) == FileType.DOC
except ImportError:
_, extension = os.path.splitext(str(self._file_path))
is_doc = extension == ".doc"
if is_doc and unstructured_version < (0, 4, 11):
raise ValueError(
f"You are on unstructured version {__unstructured_version__}. "
"Partitioning .doc files is only supported in unstructured>=0.4.11. "
"Please upgrade the unstructured package and try again."
)
if is_doc:
from unstructured.partition.api import partition_via_api
elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
else:
from unstructured.partition.docx import partition_docx
elements = partition_docx(filename=self._file_path)
from unstructured.chunking.title import chunk_by_title
max_characters = dify_config.INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH
chunks = chunk_by_title(elements, max_characters=max_characters, combine_text_under_n_chars=max_characters)
documents = []
for chunk in chunks:
text = chunk.text.strip()
documents.append(Document(page_content=text))
return documents
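
The Unstructured-based extractors above and below all follow the same pattern; a sketch for .doc files (the path, endpoint, and key are placeholders), where partitioned elements are regrouped into title-based chunks capped at INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH characters.

extractor = UnstructuredWordExtractor(
    "files/contract.doc",  # placeholder path
    api_url="https://unstructured.example.com/general/v0/general",  # placeholder endpoint
    api_key="...",  # required when partitioning goes through the hosted API
)
documents = extractor.extract()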


@@ -0,0 +1,56 @@
import base64
import contextlib
import logging
from bs4 import BeautifulSoup
from configs import dify_config
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
logger = logging.getLogger(__name__)
class UnstructuredEmailExtractor(BaseExtractor):
"""Load eml files.
Args:
file_path: Path to the file to load.
"""
def __init__(self, file_path: str, api_url: str | None = None, api_key: str = ""):
"""Initialize with file path."""
self._file_path = file_path
self._api_url = api_url
self._api_key = api_key
def extract(self) -> list[Document]:
if self._api_url:
from unstructured.partition.api import partition_via_api
elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
else:
from unstructured.partition.email import partition_email
elements = partition_email(filename=self._file_path)
# noinspection PyBroadException
with contextlib.suppress(Exception):
for element in elements:
element_text = element.text.strip()
padding_needed = 4 - len(element_text) % 4
element_text += "=" * padding_needed
element_decode = base64.b64decode(element_text)
soup = BeautifulSoup(element_decode.decode("utf-8"), "html.parser")
element.text = soup.get_text()
from unstructured.chunking.title import chunk_by_title
max_characters = dify_config.INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH
chunks = chunk_by_title(elements, max_characters=max_characters, combine_text_under_n_chars=max_characters)
documents = []
for chunk in chunks:
text = chunk.text.strip()
documents.append(Document(page_content=text))
return documents


@@ -0,0 +1,51 @@
import logging
import pypandoc # type: ignore
from configs import dify_config
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
logger = logging.getLogger(__name__)
class UnstructuredEpubExtractor(BaseExtractor):
"""Load epub files.
Args:
file_path: Path to the file to load.
"""
def __init__(
self,
file_path: str,
api_url: str | None = None,
api_key: str = "",
):
"""Initialize with file path."""
self._file_path = file_path
self._api_url = api_url
self._api_key = api_key
def extract(self) -> list[Document]:
if self._api_url:
from unstructured.partition.api import partition_via_api
elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
else:
from unstructured.partition.epub import partition_epub
pypandoc.download_pandoc()
elements = partition_epub(filename=self._file_path, xml_keep_tags=True)
from unstructured.chunking.title import chunk_by_title
max_characters = dify_config.INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH
chunks = chunk_by_title(elements, max_characters=max_characters, combine_text_under_n_chars=max_characters)
documents = []
for chunk in chunks:
text = chunk.text.strip()
documents.append(Document(page_content=text))
return documents


@@ -0,0 +1,43 @@
import logging
from configs import dify_config
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
logger = logging.getLogger(__name__)
class UnstructuredMarkdownExtractor(BaseExtractor):
"""Load md files.
Args:
file_path: Path to the file to load.
"""
def __init__(self, file_path: str, api_url: str | None = None, api_key: str = ""):
"""Initialize with file path."""
self._file_path = file_path
self._api_url = api_url
self._api_key = api_key
def extract(self) -> list[Document]:
if self._api_url:
from unstructured.partition.api import partition_via_api
elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
else:
from unstructured.partition.md import partition_md
elements = partition_md(filename=self._file_path)
from unstructured.chunking.title import chunk_by_title
max_characters = dify_config.INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH
chunks = chunk_by_title(elements, max_characters=max_characters, combine_text_under_n_chars=max_characters)
documents = []
for chunk in chunks:
text = chunk.text.strip()
documents.append(Document(page_content=text))
return documents


@@ -0,0 +1,42 @@
import logging
from configs import dify_config
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
logger = logging.getLogger(__name__)
class UnstructuredMsgExtractor(BaseExtractor):
"""Load msg files.
Args:
file_path: Path to the file to load.
"""
def __init__(self, file_path: str, api_url: str | None = None, api_key: str = ""):
"""Initialize with file path."""
self._file_path = file_path
self._api_url = api_url
self._api_key = api_key
def extract(self) -> list[Document]:
if self._api_url:
from unstructured.partition.api import partition_via_api
elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
else:
from unstructured.partition.msg import partition_msg
elements = partition_msg(filename=self._file_path)
from unstructured.chunking.title import chunk_by_title
max_characters = dify_config.INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH
chunks = chunk_by_title(elements, max_characters=max_characters, combine_text_under_n_chars=max_characters)
documents = []
for chunk in chunks:
text = chunk.text.strip()
documents.append(Document(page_content=text))
return documents


@@ -0,0 +1,46 @@
import logging
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
logger = logging.getLogger(__name__)
class UnstructuredPPTExtractor(BaseExtractor):
"""Load ppt files.
Args:
file_path: Path to the file to load.
"""
def __init__(self, file_path: str, api_url: str | None = None, api_key: str = ""):
"""Initialize with file path."""
self._file_path = file_path
self._api_url = api_url
self._api_key = api_key
def extract(self) -> list[Document]:
if self._api_url:
from unstructured.partition.api import partition_via_api
elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
else:
raise NotImplementedError("Unstructured API Url is not configured")
text_by_page: dict[int, str] = {}
for element in elements:
page = element.metadata.page_number
if page is None:
continue
text = element.text
if page in text_by_page:
text_by_page[page] += "\n" + text
else:
text_by_page[page] = text
combined_texts = list(text_by_page.values())
documents = []
for combined_text in combined_texts:
text = combined_text.strip()
documents.append(Document(page_content=text))
return documents
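# Usage sketch: legacy .ppt files have no local partition path here, so an Unstructured API
# endpoint is required; the URL and key below are placeholders.
if __name__ == "__main__":
    ppt_extractor = UnstructuredPPTExtractor(
        "/tmp/slides.ppt",
        api_url="https://api.unstructuredapp.io/general/v0/general",
        api_key="your-unstructured-api-key",
    )
    for doc in ppt_extractor.extract():
        print(doc.page_content[:80])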

View File

@@ -0,0 +1,48 @@
import logging
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
logger = logging.getLogger(__name__)
class UnstructuredPPTXExtractor(BaseExtractor):
"""Load pptx files.
Args:
file_path: Path to the file to load.
"""
def __init__(self, file_path: str, api_url: str | None = None, api_key: str = ""):
"""Initialize with file path."""
self._file_path = file_path
self._api_url = api_url
self._api_key = api_key
def extract(self) -> list[Document]:
if self._api_url:
from unstructured.partition.api import partition_via_api
elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
else:
from unstructured.partition.pptx import partition_pptx
elements = partition_pptx(filename=self._file_path)
text_by_page: dict[int, str] = {}
for element in elements:
page = element.metadata.page_number
text = element.text
if page is not None:
if page in text_by_page:
text_by_page[page] += "\n" + text
else:
text_by_page[page] = text
combined_texts = list(text_by_page.values())
documents = []
for combined_text in combined_texts:
text = combined_text.strip()
documents.append(Document(page_content=text))
return documents
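# Offline sketch of the page-grouping step above, using synthetic elements
# (assumes the `unstructured` package for Text/ElementMetadata; values are made up).
if __name__ == "__main__":
    from unstructured.documents.elements import ElementMetadata, Text

    fake_elements = [
        Text("Slide one, first text box", metadata=ElementMetadata(page_number=1)),
        Text("Slide one, second text box", metadata=ElementMetadata(page_number=1)),
        Text("Slide two", metadata=ElementMetadata(page_number=2)),
    ]
    pages: dict[int, str] = {}
    for el in fake_elements:
        page_number = el.metadata.page_number
        if page_number is None:
            continue
        pages[page_number] = (pages[page_number] + "\n" + el.text) if page_number in pages else el.text
    print(pages)  # {1: 'Slide one, first text box\nSlide one, second text box', 2: 'Slide two'}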

View File

@@ -0,0 +1,43 @@
import logging
from configs import dify_config
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
logger = logging.getLogger(__name__)
class UnstructuredXmlExtractor(BaseExtractor):
"""Load xml files.
Args:
file_path: Path to the file to load.
"""
def __init__(self, file_path: str, api_url: str | None = None, api_key: str = ""):
"""Initialize with file path."""
self._file_path = file_path
self._api_url = api_url
self._api_key = api_key
def extract(self) -> list[Document]:
if self._api_url:
from unstructured.partition.api import partition_via_api
elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
else:
from unstructured.partition.xml import partition_xml
elements = partition_xml(filename=self._file_path, xml_keep_tags=True)
from unstructured.chunking.title import chunk_by_title
max_characters = dify_config.INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH
chunks = chunk_by_title(elements, max_characters=max_characters, combine_text_under_n_chars=max_characters)
documents = []
for chunk in chunks:
text = chunk.text.strip()
documents.append(Document(page_content=text))
return documents

View File

@@ -0,0 +1,199 @@
import json
from collections.abc import Generator
from typing import Union
from urllib.parse import urljoin
import httpx
from httpx import Response
from core.rag.extractor.watercrawl.exceptions import (
WaterCrawlAuthenticationError,
WaterCrawlBadRequestError,
WaterCrawlPermissionError,
)
class BaseAPIClient:
def __init__(self, api_key, base_url):
self.api_key = api_key
self.base_url = base_url
self.session = self.init_session()
def init_session(self):
headers = {
"X-API-Key": self.api_key,
"Content-Type": "application/json",
"Accept": "application/json",
"User-Agent": "WaterCrawl-Plugin",
"Accept-Language": "en-US",
}
return httpx.Client(headers=headers, timeout=None)
def _request(
self,
method: str,
endpoint: str,
query_params: dict | None = None,
data: dict | None = None,
**kwargs,
) -> Response:
stream = kwargs.pop("stream", False)
url = urljoin(self.base_url, endpoint)
if stream:
request = self.session.build_request(method, url, params=query_params, json=data)
return self.session.send(request, stream=True, **kwargs)
return self.session.request(method, url, params=query_params, json=data, **kwargs)
def _get(self, endpoint: str, query_params: dict | None = None, **kwargs):
return self._request("GET", endpoint, query_params=query_params, **kwargs)
def _post(self, endpoint: str, query_params: dict | None = None, data: dict | None = None, **kwargs):
return self._request("POST", endpoint, query_params=query_params, data=data, **kwargs)
def _put(self, endpoint: str, query_params: dict | None = None, data: dict | None = None, **kwargs):
return self._request("PUT", endpoint, query_params=query_params, data=data, **kwargs)
def _delete(self, endpoint: str, query_params: dict | None = None, **kwargs):
return self._request("DELETE", endpoint, query_params=query_params, **kwargs)
def _patch(self, endpoint: str, query_params: dict | None = None, data: dict | None = None, **kwargs):
return self._request("PATCH", endpoint, query_params=query_params, data=data, **kwargs)
class WaterCrawlAPIClient(BaseAPIClient):
    def __init__(self, api_key, base_url: str | None = "https://app.watercrawl.dev/"):
        # Treat an explicit base_url=None the same as the default public endpoint.
        super().__init__(api_key, base_url or "https://app.watercrawl.dev/")
def process_eventstream(self, response: Response, download: bool = False) -> Generator:
try:
for raw_line in response.iter_lines():
line = raw_line.decode("utf-8") if isinstance(raw_line, bytes) else raw_line
if line.startswith("data:"):
line = line[5:].strip()
data = json.loads(line)
if data["type"] == "result" and download:
data["data"] = self.download_result(data["data"])
yield data
finally:
response.close()
def process_response(self, response: Response) -> dict | bytes | list | None | Generator:
if response.status_code == 401:
raise WaterCrawlAuthenticationError(response)
if response.status_code == 403:
raise WaterCrawlPermissionError(response)
if 400 <= response.status_code < 500:
raise WaterCrawlBadRequestError(response)
response.raise_for_status()
if response.status_code == 204:
return None
if response.headers.get("Content-Type") == "application/json":
return response.json() or {}
if response.headers.get("Content-Type") == "application/octet-stream":
return response.content
if response.headers.get("Content-Type") == "text/event-stream":
return self.process_eventstream(response)
raise Exception(f"Unknown response type: {response.headers.get('Content-Type')}")
def get_crawl_requests_list(self, page: int | None = None, page_size: int | None = None):
query_params = {"page": page or 1, "page_size": page_size or 10}
return self.process_response(
self._get(
"/api/v1/core/crawl-requests/",
query_params=query_params,
)
)
def get_crawl_request(self, item_id: str):
return self.process_response(
self._get(
f"/api/v1/core/crawl-requests/{item_id}/",
)
)
def create_crawl_request(
self,
url: Union[list, str] | None = None,
spider_options: dict | None = None,
page_options: dict | None = None,
plugin_options: dict | None = None,
):
data = {
# 'urls': url if isinstance(url, list) else [url],
"url": url,
"options": {
"spider_options": spider_options or {},
"page_options": page_options or {},
"plugin_options": plugin_options or {},
},
}
return self.process_response(
self._post(
"/api/v1/core/crawl-requests/",
data=data,
)
)
def stop_crawl_request(self, item_id: str):
return self.process_response(
self._delete(
f"/api/v1/core/crawl-requests/{item_id}/",
)
)
def download_crawl_request(self, item_id: str):
return self.process_response(
self._get(
f"/api/v1/core/crawl-requests/{item_id}/download/",
)
)
def monitor_crawl_request(self, item_id: str, prefetched=False) -> Generator:
query_params = {"prefetched": str(prefetched).lower()}
generator = self.process_response(
self._get(f"/api/v1/core/crawl-requests/{item_id}/status/", stream=True, query_params=query_params),
)
if not isinstance(generator, Generator):
raise ValueError("Generator expected")
yield from generator
def get_crawl_request_results(
self, item_id: str, page: int = 1, page_size: int = 25, query_params: dict | None = None
):
query_params = query_params or {}
query_params.update({"page": page or 1, "page_size": page_size or 25})
return self.process_response(
self._get(f"/api/v1/core/crawl-requests/{item_id}/results/", query_params=query_params)
)
def scrape_url(
self,
url: str,
page_options: dict | None = None,
plugin_options: dict | None = None,
sync: bool = True,
prefetched: bool = True,
):
response_result = self.create_crawl_request(url=url, page_options=page_options, plugin_options=plugin_options)
if not sync:
return response_result
for event_data in self.monitor_crawl_request(response_result["uuid"], prefetched):
if event_data["type"] == "result":
return event_data["data"]
def download_result(self, result_object: dict):
response = httpx.get(result_object["result"], timeout=None)
try:
response.raise_for_status()
result_object["result"] = response.json()
finally:
response.close()
return result_object
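# Offline sketch of the event-stream handling above: feed one synthetic SSE line through
# process_eventstream (the API key is a placeholder and no HTTP request is sent).
if __name__ == "__main__":
    demo_client = WaterCrawlAPIClient("wc-placeholder-key")
    fake_response = Response(200, text='data: {"type": "state", "data": {"status": "running"}}\n')
    for event in demo_client.process_eventstream(fake_response):
        print(event)  # {'type': 'state', 'data': {'status': 'running'}}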

View File

@@ -0,0 +1,32 @@
import json
class WaterCrawlError(Exception):
pass
class WaterCrawlBadRequestError(WaterCrawlError):
def __init__(self, response):
self.status_code = response.status_code
self.response = response
        try:
            data = response.json()
        except ValueError:
            # Non-JSON error bodies (e.g. HTML from a proxy) must not mask the original failure.
            data = {}
        self.message = data.get("message", "Unknown error occurred")
        self.errors = data.get("errors", {})
super().__init__(self.message)
@property
def flat_errors(self):
return json.dumps(self.errors)
def __str__(self):
return f"WaterCrawlBadRequestError: {self.message} \n {self.flat_errors}"
class WaterCrawlPermissionError(WaterCrawlBadRequestError):
def __str__(self):
return f"You are exceeding your WaterCrawl API limits. {self.message}"
class WaterCrawlAuthenticationError(WaterCrawlBadRequestError):
def __str__(self):
return "WaterCrawl API key is invalid or expired. Please check your API key and try again."

View File

@@ -0,0 +1,64 @@
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
from services.website_service import WebsiteService
class WaterCrawlWebExtractor(BaseExtractor):
"""
    Crawl or scrape websites with WaterCrawl and return the content as clean, LLM-ready markdown.
    Args:
        url: The URL to crawl or scrape.
        job_id: The WaterCrawl crawl job ID (used in 'crawl' mode).
        tenant_id: The tenant the extraction runs under.
        mode: The mode of operation. Defaults to 'crawl'. Options are 'crawl' and 'scrape'.
        only_main_content: Only return the main content of the page, excluding headers, navs, footers, etc.
"""
def __init__(
self,
url: str,
job_id: str,
tenant_id: str,
mode: str = "crawl",
only_main_content: bool = True,
):
"""Initialize with url, api_key, base_url and mode."""
self._url = url
self.job_id = job_id
self.tenant_id = tenant_id
self.mode = mode
self.only_main_content = only_main_content
def extract(self) -> list[Document]:
"""Extract content from the URL."""
documents = []
if self.mode == "crawl":
crawl_data = WebsiteService.get_crawl_url_data(self.job_id, "watercrawl", self._url, self.tenant_id)
if crawl_data is None:
return []
document = Document(
page_content=crawl_data.get("markdown", ""),
metadata={
"source_url": crawl_data.get("source_url"),
"description": crawl_data.get("description"),
"title": crawl_data.get("title"),
},
)
documents.append(document)
elif self.mode == "scrape":
scrape_data = WebsiteService.get_scrape_url_data(
"watercrawl", self._url, self.tenant_id, self.only_main_content
)
document = Document(
page_content=scrape_data.get("markdown", ""),
metadata={
"source_url": scrape_data.get("source_url"),
"description": scrape_data.get("description"),
"title": scrape_data.get("title"),
},
)
documents.append(document)
return documents

View File

@@ -0,0 +1,117 @@
from collections.abc import Generator
from datetime import datetime
from typing import Any
from core.rag.extractor.watercrawl.client import WaterCrawlAPIClient
class WaterCrawlProvider:
def __init__(self, api_key, base_url: str | None = None):
self.client = WaterCrawlAPIClient(api_key, base_url)
def crawl_url(self, url, options: dict | Any | None = None):
options = options or {}
spider_options = {
"max_depth": 1,
"page_limit": 1,
"allowed_domains": [],
"exclude_paths": [],
"include_paths": [],
}
if options.get("crawl_sub_pages", True):
spider_options["page_limit"] = options.get("limit", 1)
spider_options["max_depth"] = options.get("max_depth", 1)
spider_options["include_paths"] = options.get("includes", "").split(",") if options.get("includes") else []
spider_options["exclude_paths"] = options.get("excludes", "").split(",") if options.get("excludes") else []
wait_time = options.get("wait_time", 1000)
page_options = {
"exclude_tags": options.get("exclude_tags", "").split(",") if options.get("exclude_tags") else [],
"include_tags": options.get("include_tags", "").split(",") if options.get("include_tags") else [],
"wait_time": max(1000, wait_time), # minimum wait time is 1 second
"include_html": False,
"only_main_content": options.get("only_main_content", True),
"include_links": False,
"timeout": 15000,
"accept_cookies_selector": "#cookies-accept",
"locale": "en-US",
"actions": [],
}
result = self.client.create_crawl_request(url=url, spider_options=spider_options, page_options=page_options)
return {"status": "active", "job_id": result.get("uuid")}
def get_crawl_status(self, crawl_request_id):
response = self.client.get_crawl_request(crawl_request_id)
data = []
if response["status"] in ["new", "running"]:
status = "active"
else:
status = "completed"
data = list(self._get_results(crawl_request_id))
time_str = response.get("duration")
time_consuming: float = 0
if time_str:
time_obj = datetime.strptime(time_str, "%H:%M:%S.%f")
time_consuming = (
time_obj.hour * 3600 + time_obj.minute * 60 + time_obj.second + time_obj.microsecond / 1_000_000
)
return {
"status": status,
"job_id": response.get("uuid"),
"total": response.get("options", {}).get("spider_options", {}).get("page_limit", 1),
"current": response.get("number_of_documents", 0),
"data": data,
"time_consuming": time_consuming,
}
def get_crawl_url_data(self, job_id, url) -> dict | None:
if not job_id:
return self.scrape_url(url)
for result in self._get_results(
job_id,
{
# filter by url
"url": url
},
):
return result
return None
def scrape_url(self, url: str):
response = self.client.scrape_url(url=url, sync=True, prefetched=True)
return self._structure_data(response)
def _structure_data(self, result_object: dict):
        if isinstance(result_object.get("result", {}), str):
            # Without prefetching/downloading, "result" is still a download URL rather than the page payload.
            raise ValueError("Invalid result object: expected a prefetched result dictionary, got a URL string.")
metadata = result_object.get("result", {}).get("metadata", {})
return {
"title": metadata.get("og:title") or metadata.get("title"),
"description": metadata.get("description"),
"source_url": result_object.get("url"),
"markdown": result_object.get("result", {}).get("markdown"),
}
def _get_results(self, crawl_request_id: str, query_params: dict | None = None) -> Generator[dict, None, None]:
page = 0
page_size = 100
query_params = query_params or {}
query_params.update({"prefetched": "true"})
while True:
page += 1
response = self.client.get_crawl_request_results(crawl_request_id, page, page_size, query_params)
if not response["results"]:
break
for result in response["results"]:
yield self._structure_data(result)
if response["next"] is None:
break
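# Worked sketch of the duration parsing in get_crawl_status above: WaterCrawl reports
# "duration" as "H:MM:SS.ffffff"; the sample value below is made up.
if __name__ == "__main__":
    sample_duration = "0:01:23.500000"
    parsed = datetime.strptime(sample_duration, "%H:%M:%S.%f")
    elapsed_seconds = parsed.hour * 3600 + parsed.minute * 60 + parsed.second + parsed.microsecond / 1_000_000
    print(elapsed_seconds)  # 83.5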

View File

@@ -0,0 +1,295 @@
"""Abstract interface for document loader implementations."""
import logging
import mimetypes
import os
import re
import tempfile
import uuid
from urllib.parse import urlparse
from xml.etree import ElementTree
import httpx
from docx import Document as DocxDocument
from configs import dify_config
from core.helper import ssrf_proxy
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
from extensions.ext_database import db
from extensions.ext_storage import storage
from libs.datetime_utils import naive_utc_now
from models.enums import CreatorUserRole
from models.model import UploadFile
logger = logging.getLogger(__name__)
class WordExtractor(BaseExtractor):
"""Load docx files.
Args:
file_path: Path to the file to load.
"""
def __init__(self, file_path: str, tenant_id: str, user_id: str):
"""Initialize with file path."""
self.file_path = file_path
self.tenant_id = tenant_id
self.user_id = user_id
if "~" in self.file_path:
self.file_path = os.path.expanduser(self.file_path)
# If the file is a web path, download it to a temporary file, and use that
if not os.path.isfile(self.file_path) and self._is_valid_url(self.file_path):
response = httpx.get(self.file_path, timeout=None)
if response.status_code != 200:
response.close()
raise ValueError(f"Check the url of your file; returned status code {response.status_code}")
self.web_path = self.file_path
# TODO: use a better way to handle the file
self.temp_file = tempfile.NamedTemporaryFile() # noqa SIM115
try:
self.temp_file.write(response.content)
finally:
response.close()
self.file_path = self.temp_file.name
elif not os.path.isfile(self.file_path):
raise ValueError(f"File path {self.file_path} is not a valid file or url")
def __del__(self):
if hasattr(self, "temp_file"):
self.temp_file.close()
def extract(self) -> list[Document]:
"""Load given path as single page."""
content = self.parse_docx(self.file_path)
return [
Document(
page_content=content,
metadata={"source": self.file_path},
)
]
@staticmethod
def _is_valid_url(url: str) -> bool:
"""Check if the url is valid."""
parsed = urlparse(url)
return bool(parsed.netloc) and bool(parsed.scheme)
def _extract_images_from_docx(self, doc):
image_count = 0
image_map = {}
for rel in doc.part.rels.values():
if "image" in rel.target_ref:
image_count += 1
if rel.is_external:
url = rel.target_ref
response = ssrf_proxy.get(url)
if response.status_code == 200:
                        image_ext = mimetypes.guess_extension(response.headers["Content-Type"])
                        if image_ext is None:
                            continue
                        # guess_extension() returns the extension with a leading dot; strip it to
                        # avoid building keys like "<uuid>..png".
                        image_ext = image_ext.lstrip(".")
                        file_uuid = str(uuid.uuid4())
                        file_key = "image_files/" + self.tenant_id + "/" + file_uuid + "." + image_ext
mime_type, _ = mimetypes.guess_type(file_key)
storage.save(file_key, response.content)
else:
continue
else:
image_ext = rel.target_ref.split(".")[-1]
if image_ext is None:
continue
                    # use a uuid as the file name
file_uuid = str(uuid.uuid4())
file_key = "image_files/" + self.tenant_id + "/" + file_uuid + "." + image_ext
mime_type, _ = mimetypes.guess_type(file_key)
storage.save(file_key, rel.target_part.blob)
# save file to db
upload_file = UploadFile(
tenant_id=self.tenant_id,
storage_type=dify_config.STORAGE_TYPE,
key=file_key,
name=file_key,
size=0,
extension=str(image_ext),
mime_type=mime_type or "",
created_by=self.user_id,
created_by_role=CreatorUserRole.ACCOUNT,
created_at=naive_utc_now(),
used=True,
used_by=self.user_id,
used_at=naive_utc_now(),
)
db.session.add(upload_file)
db.session.commit()
image_map[rel.target_part] = f"![image]({dify_config.FILES_URL}/files/{upload_file.id}/file-preview)"
return image_map
def _table_to_markdown(self, table, image_map):
markdown = []
# calculate the total number of columns
total_cols = max(len(row.cells) for row in table.rows)
header_row = table.rows[0]
headers = self._parse_row(header_row, image_map, total_cols)
markdown.append("| " + " | ".join(headers) + " |")
markdown.append("| " + " | ".join(["---"] * total_cols) + " |")
for row in table.rows[1:]:
row_cells = self._parse_row(row, image_map, total_cols)
markdown.append("| " + " | ".join(row_cells) + " |")
return "\n".join(markdown)
def _parse_row(self, row, image_map, total_cols):
# Initialize a row, all of which are empty by default
row_cells = [""] * total_cols
col_index = 0
while col_index < len(row.cells):
            # skip columns already filled by a previous cell's colspan
            while col_index < len(row.cells) and row_cells[col_index] != "":
                col_index += 1
            # if col_index is now out of range, stop processing this row
if col_index >= len(row.cells):
break
# get the correct cell
cell = row.cells[col_index]
cell_content = self._parse_cell(cell, image_map).strip()
cell_colspan = cell.grid_span or 1
for i in range(cell_colspan):
if col_index + i < total_cols:
row_cells[col_index + i] = cell_content if i == 0 else ""
col_index += cell_colspan
return row_cells
def _parse_cell(self, cell, image_map):
cell_content = []
for paragraph in cell.paragraphs:
parsed_paragraph = self._parse_cell_paragraph(paragraph, image_map)
if parsed_paragraph:
cell_content.append(parsed_paragraph)
unique_content = list(dict.fromkeys(cell_content))
return " ".join(unique_content)
def _parse_cell_paragraph(self, paragraph, image_map):
paragraph_content = []
for run in paragraph.runs:
if run.element.xpath(".//a:blip"):
for blip in run.element.xpath(".//a:blip"):
image_id = blip.get("{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed")
if not image_id:
continue
image_part = paragraph.part.rels[image_id].target_part
if image_part in image_map:
image_link = image_map[image_part]
paragraph_content.append(image_link)
else:
paragraph_content.append(run.text)
return "".join(paragraph_content).strip()
def parse_docx(self, docx_path):
doc = DocxDocument(docx_path)
content = []
image_map = self._extract_images_from_docx(doc)
hyperlinks_url = None
        # Match both http and https URLs (the previous pattern only matched http URLs ending in "//").
        url_pattern = re.compile(r"https?://\S+")
for para in doc.paragraphs:
for run in para.runs:
if run.text and hyperlinks_url:
result = f" [{run.text}]({hyperlinks_url}) "
run.text = result
hyperlinks_url = None
if "HYPERLINK" in run.element.xml:
try:
xml = ElementTree.XML(run.element.xml)
x_child = [c for c in xml.iter() if c is not None]
for x in x_child:
if x is None:
continue
if x.tag.endswith("instrText"):
if x.text is None:
continue
for i in url_pattern.findall(x.text):
hyperlinks_url = str(i)
except Exception:
logger.exception("Failed to parse HYPERLINK xml")
def parse_paragraph(paragraph):
paragraph_content = []
for run in paragraph.runs:
if hasattr(run.element, "tag") and isinstance(run.element.tag, str) and run.element.tag.endswith("r"):
# Process drawing type images
drawing_elements = run.element.findall(
".//{http://schemas.openxmlformats.org/wordprocessingml/2006/main}drawing"
)
has_drawing = False
for drawing in drawing_elements:
blip_elements = drawing.findall(
".//{http://schemas.openxmlformats.org/drawingml/2006/main}blip"
)
for blip in blip_elements:
embed_id = blip.get(
"{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed"
)
if embed_id:
image_part = doc.part.related_parts.get(embed_id)
if image_part in image_map:
has_drawing = True
paragraph_content.append(image_map[image_part])
# Process pict type images
shape_elements = run.element.findall(
".//{http://schemas.openxmlformats.org/wordprocessingml/2006/main}pict"
)
for shape in shape_elements:
# Find image data in VML
shape_image = shape.find(
".//{http://schemas.openxmlformats.org/wordprocessingml/2006/main}binData"
)
if shape_image is not None and shape_image.text:
image_id = shape_image.get(
"{http://schemas.openxmlformats.org/officeDocument/2006/relationships}id"
)
if image_id and image_id in doc.part.rels:
image_part = doc.part.rels[image_id].target_part
if image_part in image_map and not has_drawing:
paragraph_content.append(image_map[image_part])
# Find imagedata element in VML
image_data = shape.find(".//{urn:schemas-microsoft-com:vml}imagedata")
if image_data is not None:
image_id = image_data.get("id") or image_data.get(
"{http://schemas.openxmlformats.org/officeDocument/2006/relationships}id"
)
if image_id and image_id in doc.part.rels:
image_part = doc.part.rels[image_id].target_part
if image_part in image_map and not has_drawing:
paragraph_content.append(image_map[image_part])
if run.text.strip():
paragraph_content.append(run.text.strip())
return "".join(paragraph_content) if paragraph_content else ""
paragraphs = doc.paragraphs.copy()
tables = doc.tables.copy()
for element in doc.element.body:
if hasattr(element, "tag"):
if isinstance(element.tag, str) and element.tag.endswith("p"): # paragraph
para = paragraphs.pop(0)
parsed_paragraph = parse_paragraph(para)
if parsed_paragraph.strip():
content.append(parsed_paragraph)
else:
content.append("\n")
elif isinstance(element.tag, str) and element.tag.endswith("tbl"): # table
table = tables.pop(0)
content.append(self._table_to_markdown(table, image_map))
return "\n".join(content)