2025-12-01 17:21:38 +08:00
parent 32fee2b8ab
commit fab8c13cb3
7511 changed files with 996300 additions and 0 deletions


@@ -0,0 +1,144 @@
"""Schema for Blobs and Blob Loaders.
The goal is to facilitate decoupling of content loading from content parsing code.
In addition, content loading code should provide a lazy loading interface by default.
"""
from __future__ import annotations
import contextlib
import mimetypes
from collections.abc import Generator, Mapping
from io import BufferedReader, BytesIO
from pathlib import Path, PurePath
from typing import Any, Union
from pydantic import BaseModel, ConfigDict, model_validator
PathLike = Union[str, PurePath]
class Blob(BaseModel):
"""A blob is used to represent raw data by either reference or value.
Provides an interface to materialize the blob in different representations, and
helps to decouple the development of data loaders from the downstream parsing of
the raw data.
Inspired by: https://developer.mozilla.org/en-US/docs/Web/API/Blob
"""
data: Union[bytes, str, None] = None # Raw data
mimetype: str | None = None # Not to be confused with a file extension
encoding: str = "utf-8" # Use utf-8 as default encoding, if decoding to string
# Location where the original content was found
# Represent location on the local file system
# Useful for situations where downstream code assumes it must work with file paths
# rather than in-memory content.
path: PathLike | None = None
model_config = ConfigDict(arbitrary_types_allowed=True, frozen=True)
@property
def source(self) -> str | None:
"""The source location of the blob as string if known otherwise none."""
return str(self.path) if self.path else None
@model_validator(mode="before")
@classmethod
def check_blob_is_valid(cls, values: Mapping[str, Any]) -> Mapping[str, Any]:
"""Verify that either data or path is provided."""
if "data" not in values and "path" not in values:
raise ValueError("Either data or path must be provided")
return values
def as_string(self) -> str:
"""Read data as a string."""
if self.data is None and self.path:
return Path(str(self.path)).read_text(encoding=self.encoding)
elif isinstance(self.data, bytes):
return self.data.decode(self.encoding)
elif isinstance(self.data, str):
return self.data
else:
raise ValueError(f"Unable to get string for blob {self}")
def as_bytes(self) -> bytes:
"""Read data as bytes."""
if isinstance(self.data, bytes):
return self.data
elif isinstance(self.data, str):
return self.data.encode(self.encoding)
elif self.data is None and self.path:
return Path(str(self.path)).read_bytes()
else:
raise ValueError(f"Unable to get bytes for blob {self}")
@contextlib.contextmanager
def as_bytes_io(self) -> Generator[Union[BytesIO, BufferedReader], None, None]:
"""Read data as a byte stream."""
if isinstance(self.data, bytes):
yield BytesIO(self.data)
elif self.data is None and self.path:
with open(str(self.path), "rb") as f:
yield f
else:
raise NotImplementedError(f"Unable to convert blob {self}")
@classmethod
def from_path(
cls,
path: PathLike,
*,
encoding: str = "utf-8",
mime_type: str | None = None,
guess_type: bool = True,
) -> Blob:
"""Load the blob from a path like object.
Args:
path: path like object to file to be read
encoding: Encoding to use if decoding the bytes into a string
mime_type: if provided, will be set as the mime-type of the data
guess_type: If True, the mimetype will be guessed from the file extension,
if a mime-type was not provided
Returns:
Blob instance
"""
if mime_type is None and guess_type:
_mimetype = mimetypes.guess_type(path)[0]
else:
_mimetype = mime_type
# We do not load the data immediately, instead we treat the blob as a
# reference to the underlying data.
return cls(data=None, mimetype=_mimetype, encoding=encoding, path=path)
@classmethod
def from_data(
cls,
data: Union[str, bytes],
*,
encoding: str = "utf-8",
mime_type: str | None = None,
path: str | None = None,
) -> Blob:
"""Initialize the blob from in-memory data.
Args:
data: the in-memory data associated with the blob
encoding: Encoding to use if decoding the bytes into a string
mime_type: if provided, will be set as the mime-type of the data
path: if provided, will be set as the source from which the data came
Returns:
Blob instance
"""
return cls(data=data, mimetype=mime_type, encoding=encoding, path=path)
def __repr__(self) -> str:
"""Define the blob representation."""
str_repr = f"Blob {id(self)}"
if self.source:
str_repr += f" {self.source}"
return str_repr
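
A minimal usage sketch of the Blob interface (the paths and values below are hypothetical): a path-backed blob defers all I/O until one of the accessors is called, while from_data wraps content that is already in memory.

blob = Blob.from_path("docs/report.pdf")  # hypothetical path; no I/O happens here
assert blob.mimetype == "application/pdf"  # guessed from the file extension
with blob.as_bytes_io() as stream:  # the file is only opened here
    header = stream.read(4)

inline = Blob.from_data("hello", mime_type="text/plain")
assert inline.as_bytes() == b"hello"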


@@ -0,0 +1,77 @@
"""Abstract interface for document loader implementations."""
import csv
import pandas as pd
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.extractor.helpers import detect_file_encodings
from core.rag.models.document import Document
class CSVExtractor(BaseExtractor):
"""Load CSV files.
Args:
file_path: Path to the file to load.
"""
def __init__(
self,
file_path: str,
encoding: str | None = None,
autodetect_encoding: bool = False,
source_column: str | None = None,
csv_args: dict | None = None,
):
"""Initialize with file path."""
self._file_path = file_path
self._encoding = encoding
self._autodetect_encoding = autodetect_encoding
self.source_column = source_column
self.csv_args = csv_args or {}
def extract(self) -> list[Document]:
"""Load data into document objects."""
docs = []
try:
with open(self._file_path, newline="", encoding=self._encoding) as csvfile:
docs = self._read_from_file(csvfile)
except UnicodeDecodeError as e:
if self._autodetect_encoding:
detected_encodings = detect_file_encodings(self._file_path)
for encoding in detected_encodings:
try:
with open(self._file_path, newline="", encoding=encoding.encoding) as csvfile:
docs = self._read_from_file(csvfile)
break
except UnicodeDecodeError:
continue
else:
raise RuntimeError(f"Error loading {self._file_path}") from e
return docs
def _read_from_file(self, csvfile) -> list[Document]:
docs = []
try:
# load csv file into pandas dataframe
df = pd.read_csv(csvfile, on_bad_lines="skip", **self.csv_args)
# check source column exists
if self.source_column and self.source_column not in df.columns:
raise ValueError(f"Source column '{self.source_column}' not found in CSV file.")
# create document objects
for i, row in df.iterrows():
content = ";".join(f"{col.strip()}: {str(row[col]).strip()}" for col in df.columns)
source = row[self.source_column] if self.source_column else ""
metadata = {"source": source, "row": i}
doc = Document(page_content=content, metadata=metadata)
docs.append(doc)
except csv.Error as e:
raise e
return docs
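
A usage sketch with assumed inputs (the file name and column names are hypothetical): each CSV row becomes one Document whose page_content is a ";"-joined list of "column: value" pairs, and source_column, when given, fills metadata["source"].

extractor = CSVExtractor(
    "data/products.csv",  # hypothetical path
    autodetect_encoding=True,
    source_column="url",  # must exist in the CSV, otherwise a ValueError is raised
    csv_args={"delimiter": ","},  # forwarded to pandas.read_csv
)
for doc in extractor.extract()[:3]:
    print(doc.metadata["row"], doc.page_content)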


@@ -0,0 +1,7 @@
from enum import StrEnum
class DatasourceType(StrEnum):
FILE = "upload_file"
NOTION = "notion_import"
WEBSITE = "website_crawl"


@@ -0,0 +1,46 @@
from pydantic import BaseModel, ConfigDict
from models.dataset import Document
from models.model import UploadFile
class NotionInfo(BaseModel):
"""
Notion import info.
"""
credential_id: str | None = None
notion_workspace_id: str
notion_obj_id: str
notion_page_type: str
document: Document | None = None
tenant_id: str
model_config = ConfigDict(arbitrary_types_allowed=True)
class WebsiteInfo(BaseModel):
"""
Website import info.
"""
model_config = ConfigDict(arbitrary_types_allowed=True)
provider: str
job_id: str
url: str
mode: str
tenant_id: str
only_main_content: bool = False
class ExtractSetting(BaseModel):
"""
Model class for document extraction settings.
"""
datasource_type: str
upload_file: UploadFile | None = None
notion_info: NotionInfo | None = None
website_info: WebsiteInfo | None = None
document_model: str | None = None
model_config = ConfigDict(arbitrary_types_allowed=True)
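
A sketch of how these settings might be populated for a website crawl (all field values are placeholders); ExtractProcessor.extract dispatches on datasource_type and reads the matching *_info object.

setting = ExtractSetting(
    datasource_type="website_crawl",
    website_info=WebsiteInfo(
        provider="firecrawl",
        job_id="job-123",  # placeholder
        url="https://example.com",
        mode="crawl",
        tenant_id="tenant-1",  # placeholder
        only_main_content=True,
    ),
    document_model="text_model",
)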


@@ -0,0 +1,77 @@
"""Abstract interface for document loader implementations."""
import os
from typing import cast
import pandas as pd
from openpyxl import load_workbook
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
class ExcelExtractor(BaseExtractor):
"""Load Excel files.
Args:
file_path: Path to the file to load.
"""
def __init__(self, file_path: str, encoding: str | None = None, autodetect_encoding: bool = False):
"""Initialize with file path."""
self._file_path = file_path
self._encoding = encoding
self._autodetect_encoding = autodetect_encoding
def extract(self) -> list[Document]:
"""Load from Excel file in xls or xlsx format using Pandas and openpyxl."""
documents = []
file_extension = os.path.splitext(self._file_path)[-1].lower()
if file_extension == ".xlsx":
wb = load_workbook(self._file_path, data_only=True)
for sheet_name in wb.sheetnames:
sheet = wb[sheet_name]
data = sheet.values
cols = next(data, None)
if cols is None:
continue
df = pd.DataFrame(data, columns=cols)
df.dropna(how="all", inplace=True)
for index, row in df.iterrows():
page_content = []
for col_index, (k, v) in enumerate(row.items()):
if pd.notna(v):
cell = sheet.cell(
row=cast(int, index) + 2, column=col_index + 1
) # +2 to account for header and 1-based index
if cell.hyperlink:
value = f"[{v}]({cell.hyperlink.target})"
page_content.append(f'"{k}":"{value}"')
else:
page_content.append(f'"{k}":"{v}"')
documents.append(
Document(page_content=";".join(page_content), metadata={"source": self._file_path})
)
elif file_extension == ".xls":
excel_file = pd.ExcelFile(self._file_path, engine="xlrd")
for excel_sheet_name in excel_file.sheet_names:
df = excel_file.parse(sheet_name=excel_sheet_name)
df.dropna(how="all", inplace=True)
for _, row in df.iterrows():
page_content = []
for k, v in row.items():
if pd.notna(v):
page_content.append(f'"{k}":"{v}"')
documents.append(
Document(page_content=";".join(page_content), metadata={"source": self._file_path})
)
else:
raise ValueError(f"Unsupported file extension: {file_extension}")
return documents
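
A brief usage sketch (the workbook path is hypothetical): every non-empty row becomes one Document made of "header":"value" pairs, and for .xlsx files cell hyperlinks are rewritten as markdown links.

extractor = ExcelExtractor("data/catalog.xlsx")  # .xls files are routed through xlrd instead
for doc in extractor.extract():
    print(doc.metadata["source"], doc.page_content[:80])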


@@ -0,0 +1,209 @@
import re
import tempfile
from pathlib import Path
from typing import Union
from urllib.parse import unquote
from configs import dify_config
from core.helper import ssrf_proxy
from core.rag.extractor.csv_extractor import CSVExtractor
from core.rag.extractor.entity.datasource_type import DatasourceType
from core.rag.extractor.entity.extract_setting import ExtractSetting
from core.rag.extractor.excel_extractor import ExcelExtractor
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.extractor.firecrawl.firecrawl_web_extractor import FirecrawlWebExtractor
from core.rag.extractor.html_extractor import HtmlExtractor
from core.rag.extractor.jina_reader_extractor import JinaReaderWebExtractor
from core.rag.extractor.markdown_extractor import MarkdownExtractor
from core.rag.extractor.notion_extractor import NotionExtractor
from core.rag.extractor.pdf_extractor import PdfExtractor
from core.rag.extractor.text_extractor import TextExtractor
from core.rag.extractor.unstructured.unstructured_doc_extractor import UnstructuredWordExtractor
from core.rag.extractor.unstructured.unstructured_eml_extractor import UnstructuredEmailExtractor
from core.rag.extractor.unstructured.unstructured_epub_extractor import UnstructuredEpubExtractor
from core.rag.extractor.unstructured.unstructured_markdown_extractor import UnstructuredMarkdownExtractor
from core.rag.extractor.unstructured.unstructured_msg_extractor import UnstructuredMsgExtractor
from core.rag.extractor.unstructured.unstructured_ppt_extractor import UnstructuredPPTExtractor
from core.rag.extractor.unstructured.unstructured_pptx_extractor import UnstructuredPPTXExtractor
from core.rag.extractor.unstructured.unstructured_xml_extractor import UnstructuredXmlExtractor
from core.rag.extractor.watercrawl.extractor import WaterCrawlWebExtractor
from core.rag.extractor.word_extractor import WordExtractor
from core.rag.models.document import Document
from extensions.ext_storage import storage
from models.model import UploadFile
SUPPORT_URL_CONTENT_TYPES = ["application/pdf", "text/plain", "application/json"]
USER_AGENT = (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124"
" Safari/537.36"
)
class ExtractProcessor:
@classmethod
def load_from_upload_file(
cls, upload_file: UploadFile, return_text: bool = False, is_automatic: bool = False
) -> Union[list[Document], str]:
extract_setting = ExtractSetting(
datasource_type=DatasourceType.FILE, upload_file=upload_file, document_model="text_model"
)
if return_text:
delimiter = "\n"
return delimiter.join([document.page_content for document in cls.extract(extract_setting, is_automatic)])
else:
return cls.extract(extract_setting, is_automatic)
@classmethod
def load_from_url(cls, url: str, return_text: bool = False) -> Union[list[Document], str]:
response = ssrf_proxy.get(url, headers={"User-Agent": USER_AGENT})
with tempfile.TemporaryDirectory() as temp_dir:
suffix = Path(url).suffix
if not suffix and suffix != ".":
# get content-type
if response.headers.get("Content-Type"):
suffix = "." + response.headers.get("Content-Type").split("/")[-1]
else:
content_disposition = response.headers.get("Content-Disposition")
filename_match = re.search(r'filename="([^"]+)"', content_disposition)
if filename_match:
filename = unquote(filename_match.group(1))
match = re.search(r"\.(\w+)$", filename)
if match:
suffix = "." + match.group(1)
else:
suffix = ""
# https://stackoverflow.com/questions/26541416/generate-temporary-file-names-without-creating-actual-file-in-python#comment90414256_26541521
file_path = f"{temp_dir}/{tempfile.gettempdir()}{suffix}"
Path(file_path).write_bytes(response.content)
extract_setting = ExtractSetting(datasource_type=DatasourceType.FILE, document_model="text_model")
if return_text:
delimiter = "\n"
return delimiter.join(
[
document.page_content
for document in cls.extract(extract_setting=extract_setting, file_path=file_path)
]
)
else:
return cls.extract(extract_setting=extract_setting, file_path=file_path)
@classmethod
def extract(
cls, extract_setting: ExtractSetting, is_automatic: bool = False, file_path: str | None = None
) -> list[Document]:
if extract_setting.datasource_type == DatasourceType.FILE:
with tempfile.TemporaryDirectory() as temp_dir:
if not file_path:
assert extract_setting.upload_file is not None, "upload_file is required"
upload_file: UploadFile = extract_setting.upload_file
suffix = Path(upload_file.key).suffix
# FIXME mypy: Cannot determine type of 'tempfile._get_candidate_names' better not use it here
file_path = f"{temp_dir}/{next(tempfile._get_candidate_names())}{suffix}" # type: ignore
storage.download(upload_file.key, file_path)
input_file = Path(file_path)
file_extension = input_file.suffix.lower()
etl_type = dify_config.ETL_TYPE
extractor: BaseExtractor | None = None
if etl_type == "Unstructured":
unstructured_api_url = dify_config.UNSTRUCTURED_API_URL or ""
unstructured_api_key = dify_config.UNSTRUCTURED_API_KEY or ""
if file_extension in {".xlsx", ".xls"}:
extractor = ExcelExtractor(file_path)
elif file_extension == ".pdf":
extractor = PdfExtractor(file_path)
elif file_extension in {".md", ".markdown", ".mdx"}:
extractor = (
UnstructuredMarkdownExtractor(file_path, unstructured_api_url, unstructured_api_key)
if is_automatic
else MarkdownExtractor(file_path, autodetect_encoding=True)
)
elif file_extension in {".htm", ".html"}:
extractor = HtmlExtractor(file_path)
elif file_extension == ".docx":
extractor = WordExtractor(file_path, upload_file.tenant_id, upload_file.created_by)
elif file_extension == ".doc":
extractor = UnstructuredWordExtractor(file_path, unstructured_api_url, unstructured_api_key)
elif file_extension == ".csv":
extractor = CSVExtractor(file_path, autodetect_encoding=True)
elif file_extension == ".msg":
extractor = UnstructuredMsgExtractor(file_path, unstructured_api_url, unstructured_api_key)
elif file_extension == ".eml":
extractor = UnstructuredEmailExtractor(file_path, unstructured_api_url, unstructured_api_key)
elif file_extension == ".ppt":
extractor = UnstructuredPPTExtractor(file_path, unstructured_api_url, unstructured_api_key)
# Note: the Unstructured API key/URL must be configured,
# because .ppt documents can only be parsed via the Unstructured API
elif file_extension == ".pptx":
extractor = UnstructuredPPTXExtractor(file_path, unstructured_api_url, unstructured_api_key)
elif file_extension == ".xml":
extractor = UnstructuredXmlExtractor(file_path, unstructured_api_url, unstructured_api_key)
elif file_extension == ".epub":
extractor = UnstructuredEpubExtractor(file_path, unstructured_api_url, unstructured_api_key)
else:
# txt
extractor = TextExtractor(file_path, autodetect_encoding=True)
else:
if file_extension in {".xlsx", ".xls"}:
extractor = ExcelExtractor(file_path)
elif file_extension == ".pdf":
extractor = PdfExtractor(file_path)
elif file_extension in {".md", ".markdown", ".mdx"}:
extractor = MarkdownExtractor(file_path, autodetect_encoding=True)
elif file_extension in {".htm", ".html"}:
extractor = HtmlExtractor(file_path)
elif file_extension == ".docx":
extractor = WordExtractor(file_path, upload_file.tenant_id, upload_file.created_by)
elif file_extension == ".csv":
extractor = CSVExtractor(file_path, autodetect_encoding=True)
elif file_extension == ".epub":
extractor = UnstructuredEpubExtractor(file_path)
else:
# txt
extractor = TextExtractor(file_path, autodetect_encoding=True)
return extractor.extract()
elif extract_setting.datasource_type == DatasourceType.NOTION:
assert extract_setting.notion_info is not None, "notion_info is required"
extractor = NotionExtractor(
notion_workspace_id=extract_setting.notion_info.notion_workspace_id,
notion_obj_id=extract_setting.notion_info.notion_obj_id,
notion_page_type=extract_setting.notion_info.notion_page_type,
document_model=extract_setting.notion_info.document,
tenant_id=extract_setting.notion_info.tenant_id,
credential_id=extract_setting.notion_info.credential_id,
)
return extractor.extract()
elif extract_setting.datasource_type == DatasourceType.WEBSITE:
assert extract_setting.website_info is not None, "website_info is required"
if extract_setting.website_info.provider == "firecrawl":
extractor = FirecrawlWebExtractor(
url=extract_setting.website_info.url,
job_id=extract_setting.website_info.job_id,
tenant_id=extract_setting.website_info.tenant_id,
mode=extract_setting.website_info.mode,
only_main_content=extract_setting.website_info.only_main_content,
)
return extractor.extract()
elif extract_setting.website_info.provider == "watercrawl":
extractor = WaterCrawlWebExtractor(
url=extract_setting.website_info.url,
job_id=extract_setting.website_info.job_id,
tenant_id=extract_setting.website_info.tenant_id,
mode=extract_setting.website_info.mode,
only_main_content=extract_setting.website_info.only_main_content,
)
return extractor.extract()
elif extract_setting.website_info.provider == "jinareader":
extractor = JinaReaderWebExtractor(
url=extract_setting.website_info.url,
job_id=extract_setting.website_info.job_id,
tenant_id=extract_setting.website_info.tenant_id,
mode=extract_setting.website_info.mode,
only_main_content=extract_setting.website_info.only_main_content,
)
return extractor.extract()
else:
raise ValueError(f"Unsupported website provider: {extract_setting.website_info.provider}")
else:
raise ValueError(f"Unsupported datasource type: {extract_setting.datasource_type}")
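
A sketch of the two common entry points (the upload_file object and the URL are placeholders); which concrete extractor runs depends on the file extension and on dify_config.ETL_TYPE as shown above.

# From a stored upload (upload_file is an UploadFile row from the models layer):
documents = ExtractProcessor.load_from_upload_file(upload_file)
text = ExtractProcessor.load_from_upload_file(upload_file, return_text=True)

# Directly from a URL, downloaded through the SSRF proxy into a temporary file:
documents = ExtractProcessor.load_from_url("https://example.com/paper.pdf")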


@@ -0,0 +1,11 @@
"""Abstract interface for document loader implementations."""
from abc import ABC, abstractmethod
class BaseExtractor(ABC):
"""Interface for extract files."""
@abstractmethod
def extract(self):
raise NotImplementedError
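
A minimal custom extractor, sketched only to show the contract: a subclass implements extract() and returns a list of Document objects.

from core.rag.models.document import Document

class UppercaseTextExtractor(BaseExtractor):
    """Toy extractor used purely for illustration."""

    def __init__(self, file_path: str):
        self._file_path = file_path

    def extract(self) -> list[Document]:
        with open(self._file_path, encoding="utf-8") as f:
            return [Document(page_content=f.read().upper())]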


@@ -0,0 +1,173 @@
import json
import time
from typing import Any, cast
import httpx
from extensions.ext_storage import storage
class FirecrawlApp:
def __init__(self, api_key=None, base_url=None):
self.api_key = api_key
self.base_url = base_url or "https://api.firecrawl.dev"
if self.api_key is None and self.base_url == "https://api.firecrawl.dev":
raise ValueError("No API key provided")
def scrape_url(self, url, params=None) -> dict[str, Any]:
# Documentation: https://docs.firecrawl.dev/api-reference/endpoint/scrape
headers = self._prepare_headers()
json_data = {
"url": url,
"formats": ["markdown"],
"onlyMainContent": True,
"timeout": 30000,
}
if params:
json_data.update(params)
response = self._post_request(f"{self.base_url}/v2/scrape", json_data, headers)
if response.status_code == 200:
response_data = response.json()
data = response_data["data"]
return self._extract_common_fields(data)
elif response.status_code in {402, 409, 500, 429, 408}:
self._handle_error(response, "scrape URL")
return {} # Avoid additional exception after handling error
else:
raise Exception(f"Failed to scrape URL. Status code: {response.status_code}")
def crawl_url(self, url, params=None) -> str:
# Documentation: https://docs.firecrawl.dev/api-reference/endpoint/crawl-post
headers = self._prepare_headers()
json_data = {"url": url}
if params:
json_data.update(params)
response = self._post_request(f"{self.base_url}/v2/crawl", json_data, headers)
if response.status_code == 200:
# There's also another two fields in the response: "success" (bool) and "url" (str)
job_id = response.json().get("id")
return cast(str, job_id)
else:
self._handle_error(response, "start crawl job")
return "" # unreachable
def map(self, url: str, params: dict[str, Any] | None = None) -> dict[str, Any]:
# Documentation: https://docs.firecrawl.dev/api-reference/endpoint/map
headers = self._prepare_headers()
json_data: dict[str, Any] = {"url": url, "integration": "dify"}
if params:
# Pass through provided params, including optional "sitemap": "only" | "include" | "skip"
json_data.update(params)
response = self._post_request(f"{self.base_url}/v2/map", json_data, headers)
if response.status_code == 200:
return cast(dict[str, Any], response.json())
elif response.status_code in {402, 409, 500, 429, 408}:
self._handle_error(response, "start map job")
return {}
else:
raise Exception(f"Failed to start map job. Status code: {response.status_code}")
def check_crawl_status(self, job_id) -> dict[str, Any]:
headers = self._prepare_headers()
response = self._get_request(f"{self.base_url}/v2/crawl/{job_id}", headers)
if response.status_code == 200:
crawl_status_response = response.json()
if crawl_status_response.get("status") == "completed":
total = crawl_status_response.get("total", 0)
if total == 0:
raise Exception("Failed to check crawl status. Error: No page found")
data = crawl_status_response.get("data", [])
url_data_list = []
for item in data:
if isinstance(item, dict) and "metadata" in item and "markdown" in item:
url_data = self._extract_common_fields(item)
url_data_list.append(url_data)
if url_data_list:
file_key = "website_files/" + job_id + ".txt"
try:
if storage.exists(file_key):
storage.delete(file_key)
storage.save(file_key, json.dumps(url_data_list).encode("utf-8"))
except Exception as e:
raise Exception(f"Error saving crawl data: {e}")
return self._format_crawl_status_response("completed", crawl_status_response, url_data_list)
else:
return self._format_crawl_status_response(
crawl_status_response.get("status"), crawl_status_response, []
)
else:
self._handle_error(response, "check crawl status")
return {} # unreachable
def _format_crawl_status_response(
self, status: str, crawl_status_response: dict[str, Any], url_data_list: list[dict[str, Any]]
) -> dict[str, Any]:
return {
"status": status,
"total": crawl_status_response.get("total"),
"current": crawl_status_response.get("completed"),
"data": url_data_list,
}
def _extract_common_fields(self, item: dict[str, Any]) -> dict[str, Any]:
return {
"title": item.get("metadata", {}).get("title"),
"description": item.get("metadata", {}).get("description"),
"source_url": item.get("metadata", {}).get("sourceURL"),
"markdown": item.get("markdown"),
}
def _prepare_headers(self) -> dict[str, Any]:
return {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}"}
def _post_request(self, url, data, headers, retries=3, backoff_factor=0.5) -> httpx.Response:
for attempt in range(retries):
response = httpx.post(url, headers=headers, json=data)
if response.status_code == 502:
time.sleep(backoff_factor * (2**attempt))
else:
return response
return response
def _get_request(self, url, headers, retries=3, backoff_factor=0.5) -> httpx.Response:
for attempt in range(retries):
response = httpx.get(url, headers=headers)
if response.status_code == 502:
time.sleep(backoff_factor * (2**attempt))
else:
return response
return response
def _handle_error(self, response, action):
error_message = response.json().get("error", "Unknown error occurred")
raise Exception(f"Failed to {action}. Status code: {response.status_code}. Error: {error_message}") # type: ignore[return]
def search(self, query: str, params: dict[str, Any] | None = None) -> dict[str, Any]:
# Documentation: https://docs.firecrawl.dev/api-reference/endpoint/search
headers = self._prepare_headers()
json_data = {
"query": query,
"limit": 5,
"lang": "en",
"country": "us",
"timeout": 60000,
"ignoreInvalidURLs": True,
"scrapeOptions": {},
"sources": [
{"type": "web"},
],
"integration": "dify",
}
if params:
json_data.update(params)
response = self._post_request(f"{self.base_url}/v2/search", json_data, headers)
if response.status_code == 200:
response_data = response.json()
if not response_data.get("success"):
raise Exception(f"Search failed. Error: {response_data.get('warning', 'Unknown error')}")
return cast(dict[str, Any], response_data)
elif response.status_code in {402, 409, 500, 429, 408}:
self._handle_error(response, "perform search")
return {} # Avoid additional exception after handling error
else:
raise Exception(f"Failed to perform search. Status code: {response.status_code}")
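
A usage sketch against the v2 endpoints wrapped above (the API key, URL, and params are placeholders): scrape_url returns the normalized fields immediately, while crawl_url only returns a job id that check_crawl_status polls later.

app = FirecrawlApp(api_key="fc-...")  # placeholder key; base_url defaults to https://api.firecrawl.dev
page = app.scrape_url("https://example.com", params={"onlyMainContent": True})
print(page["title"], len(page["markdown"] or ""))

job_id = app.crawl_url("https://example.com", params={"limit": 10})  # illustrative param
status = app.check_crawl_status(job_id)  # {"status", "total", "current", "data"}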


@@ -0,0 +1,63 @@
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
from services.website_service import WebsiteService
class FirecrawlWebExtractor(BaseExtractor):
"""
Crawl and scrape websites and return content in clean, LLM-ready markdown.
Args:
url: The URL to scrape.
job_id: The crawl job id.
tenant_id: The tenant id.
mode: The mode of operation. Defaults to 'scrape'. Options are 'crawl', 'scrape' and 'crawl_return_urls'.
only_main_content: Only return the main content of the page excluding headers, navs, footers, etc.
"""
def __init__(
self,
url: str,
job_id: str,
tenant_id: str,
mode: str = "crawl",
only_main_content: bool = True,
):
"""Initialize with url, api_key, base_url and mode."""
self._url = url
self.job_id = job_id
self.tenant_id = tenant_id
self.mode = mode
self.only_main_content = only_main_content
def extract(self) -> list[Document]:
"""Extract content from the URL."""
documents = []
if self.mode == "crawl":
crawl_data = WebsiteService.get_crawl_url_data(self.job_id, "firecrawl", self._url, self.tenant_id)
if crawl_data is None:
return []
document = Document(
page_content=crawl_data.get("markdown", ""),
metadata={
"source_url": crawl_data.get("source_url"),
"description": crawl_data.get("description"),
"title": crawl_data.get("title"),
},
)
documents.append(document)
elif self.mode == "scrape":
scrape_data = WebsiteService.get_scrape_url_data(
"firecrawl", self._url, self.tenant_id, self.only_main_content
)
document = Document(
page_content=scrape_data.get("markdown", ""),
metadata={
"source_url": scrape_data.get("source_url"),
"description": scrape_data.get("description"),
"title": scrape_data.get("title"),
},
)
documents.append(document)
return documents


@@ -0,0 +1,48 @@
"""Document loader helpers."""
import concurrent.futures
from typing import NamedTuple, cast
class FileEncoding(NamedTuple):
"""A file encoding as the NamedTuple."""
encoding: str | None
"""The encoding of the file."""
confidence: float
"""The confidence of the encoding."""
language: str | None
"""The language of the file."""
def detect_file_encodings(file_path: str, timeout: int = 5, sample_size: int = 1024 * 1024) -> list[FileEncoding]:
"""Try to detect the file encoding.
Returns a list of `FileEncoding` tuples with the detected encodings ordered
by confidence.
Args:
file_path: The path to the file to detect the encoding for.
timeout: The timeout in seconds for the encoding detection.
sample_size: The number of bytes to read for encoding detection. Default is 1MB.
For large files, reading only a sample is sufficient and prevents timeout.
"""
import chardet
def read_and_detect(file_path: str):
with open(file_path, "rb") as f:
# Read only a sample of the file for encoding detection
# This prevents timeout on large files while still providing accurate encoding detection
rawdata = f.read(sample_size)
return cast(list[dict], chardet.detect_all(rawdata))
with concurrent.futures.ThreadPoolExecutor() as executor:
future = executor.submit(read_and_detect, file_path)
try:
encodings = future.result(timeout=timeout)
except concurrent.futures.TimeoutError:
raise TimeoutError(f"Timeout reached while detecting encoding for {file_path}")
if all(encoding["encoding"] is None for encoding in encodings):
raise RuntimeError(f"Could not detect encoding for {file_path}")
return [FileEncoding(**enc) for enc in encodings if enc["encoding"] is not None]
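
A usage sketch (the path is hypothetical): detection runs in a worker thread so it can be abandoned after timeout seconds, and only the first sample_size bytes are handed to chardet.

from pathlib import Path

encodings = detect_file_encodings("data/legacy.txt", timeout=5)
best = encodings[0]  # entries are ordered by chardet confidence
text = Path("data/legacy.txt").read_text(encoding=best.encoding)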


@@ -0,0 +1,32 @@
"""Abstract interface for document loader implementations."""
from bs4 import BeautifulSoup
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
class HtmlExtractor(BaseExtractor):
"""
Load html files.
Args:
file_path: Path to the file to load.
"""
def __init__(self, file_path: str):
"""Initialize with file path."""
self._file_path = file_path
def extract(self) -> list[Document]:
return [Document(page_content=self._load_as_text())]
def _load_as_text(self) -> str:
text: str = ""
with open(self._file_path, "rb") as fp:
soup = BeautifulSoup(fp, "html.parser")
text = soup.get_text()
text = text.strip() if text else ""
return text


@@ -0,0 +1,42 @@
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
from services.website_service import WebsiteService
class JinaReaderWebExtractor(BaseExtractor):
"""
Crawl and scrape websites and return content in clean, LLM-ready markdown.
"""
def __init__(
self,
url: str,
job_id: str,
tenant_id: str,
mode: str = "crawl",
only_main_content: bool = False,
):
"""Initialize with url, api_key, base_url and mode."""
self._url = url
self.job_id = job_id
self.tenant_id = tenant_id
self.mode = mode
self.only_main_content = only_main_content
def extract(self) -> list[Document]:
"""Extract content from the URL."""
documents = []
if self.mode == "crawl":
crawl_data = WebsiteService.get_crawl_url_data(self.job_id, "jinareader", self._url, self.tenant_id)
if crawl_data is None:
return []
document = Document(
page_content=crawl_data.get("content", ""),
metadata={
"source_url": crawl_data.get("url"),
"description": crawl_data.get("description"),
"title": crawl_data.get("title"),
},
)
documents.append(document)
return documents


@@ -0,0 +1,121 @@
"""Abstract interface for document loader implementations."""
import re
from pathlib import Path
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.extractor.helpers import detect_file_encodings
from core.rag.models.document import Document
class MarkdownExtractor(BaseExtractor):
"""Load Markdown files.
Args:
file_path: Path to the file to load.
"""
def __init__(
self,
file_path: str,
remove_hyperlinks: bool = False,
remove_images: bool = False,
encoding: str | None = None,
autodetect_encoding: bool = True,
):
"""Initialize with file path."""
self._file_path = file_path
self._remove_hyperlinks = remove_hyperlinks
self._remove_images = remove_images
self._encoding = encoding
self._autodetect_encoding = autodetect_encoding
def extract(self) -> list[Document]:
"""Load from file path."""
tups = self.parse_tups(self._file_path)
documents = []
for header, value in tups:
value = value.strip()
if header is None:
documents.append(Document(page_content=value))
else:
documents.append(Document(page_content=f"\n\n{header}\n{value}"))
return documents
def markdown_to_tups(self, markdown_text: str) -> list[tuple[str | None, str]]:
"""Convert a markdown file to a dictionary.
The keys are the headers and the values are the text under each header.
"""
markdown_tups: list[tuple[str | None, str]] = []
lines = markdown_text.split("\n")
current_header = None
current_text = ""
code_block_flag = False
for line in lines:
if line.startswith("```"):
code_block_flag = not code_block_flag
current_text += line + "\n"
continue
if code_block_flag:
current_text += line + "\n"
continue
header_match = re.match(r"^#+\s", line)
if header_match:
markdown_tups.append((current_header, current_text))
current_header = line
current_text = ""
else:
current_text += line + "\n"
markdown_tups.append((current_header, current_text))
markdown_tups = [
(re.sub(r"#", "", key).strip() if key else None, re.sub(r"<.*?>", "", value))
for key, value in markdown_tups
]
return markdown_tups
def remove_images(self, content: str) -> str:
"""Get a dictionary of a markdown file from its path."""
pattern = r"!{1}\[\[(.*)\]\]"
content = re.sub(pattern, "", content)
return content
def remove_hyperlinks(self, content: str) -> str:
"""Get a dictionary of a markdown file from its path."""
pattern = r"\[(.*?)\]\((.*?)\)"
content = re.sub(pattern, r"\1", content)
return content
def parse_tups(self, filepath: str) -> list[tuple[str | None, str]]:
"""Parse file into tuples."""
content = ""
try:
content = Path(filepath).read_text(encoding=self._encoding)
except UnicodeDecodeError as e:
if self._autodetect_encoding:
detected_encodings = detect_file_encodings(filepath)
for encoding in detected_encodings:
try:
content = Path(filepath).read_text(encoding=encoding.encoding)
break
except UnicodeDecodeError:
continue
else:
raise RuntimeError(f"Error loading {filepath}") from e
except Exception as e:
raise RuntimeError(f"Error loading {filepath}") from e
if self._remove_hyperlinks:
content = self.remove_hyperlinks(content)
if self._remove_images:
content = self.remove_images(content)
return self.markdown_to_tups(content)
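
A sketch of the header-splitting behaviour on an in-memory string (the content is made up): markdown_to_tups groups text under the nearest preceding heading and leaves fenced code blocks intact, and extract() then turns each tuple into a Document.

extractor = MarkdownExtractor("notes.md")  # placeholder path; markdown_to_tups does not read it
tups = extractor.markdown_to_tups("# Intro\nhello\n\n## Usage\nrun it\n")
# -> [(None, ''), ('Intro', 'hello\n\n'), ('Usage', 'run it\n\n')]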


@@ -0,0 +1,386 @@
import json
import logging
import operator
from typing import Any, cast
import httpx
from configs import dify_config
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
from extensions.ext_database import db
from models.dataset import Document as DocumentModel
from services.datasource_provider_service import DatasourceProviderService
logger = logging.getLogger(__name__)
BLOCK_CHILD_URL_TMPL = "https://api.notion.com/v1/blocks/{block_id}/children"
DATABASE_URL_TMPL = "https://api.notion.com/v1/databases/{database_id}/query"
SEARCH_URL = "https://api.notion.com/v1/search"
RETRIEVE_PAGE_URL_TMPL = "https://api.notion.com/v1/pages/{page_id}"
RETRIEVE_DATABASE_URL_TMPL = "https://api.notion.com/v1/databases/{database_id}"
# if the user wants to split by headings, use the corresponding splitter
HEADING_SPLITTER = {
"heading_1": "# ",
"heading_2": "## ",
"heading_3": "### ",
}
class NotionExtractor(BaseExtractor):
def __init__(
self,
notion_workspace_id: str,
notion_obj_id: str,
notion_page_type: str,
tenant_id: str,
document_model: DocumentModel | None = None,
notion_access_token: str | None = None,
credential_id: str | None = None,
):
self._notion_access_token = None
self._document_model = document_model
self._notion_workspace_id = notion_workspace_id
self._notion_obj_id = notion_obj_id
self._notion_page_type = notion_page_type
self._credential_id = credential_id
if notion_access_token:
self._notion_access_token = notion_access_token
else:
self._notion_access_token = self._get_access_token(tenant_id, self._credential_id)
if not self._notion_access_token:
integration_token = dify_config.NOTION_INTEGRATION_TOKEN
if integration_token is None:
raise ValueError(
"Must specify `integration_token` or set environment variable `NOTION_INTEGRATION_TOKEN`."
)
self._notion_access_token = integration_token
def extract(self) -> list[Document]:
self.update_last_edited_time(self._document_model)
text_docs = self._load_data_as_documents(self._notion_obj_id, self._notion_page_type)
return text_docs
def _load_data_as_documents(self, notion_obj_id: str, notion_page_type: str) -> list[Document]:
docs = []
if notion_page_type == "database":
# get all the pages in the database
page_text_documents = self._get_notion_database_data(notion_obj_id)
docs.extend(page_text_documents)
elif notion_page_type == "page":
page_text_list = self._get_notion_block_data(notion_obj_id)
docs.append(Document(page_content="\n".join(page_text_list)))
else:
raise ValueError("notion page type not supported")
return docs
def _get_notion_database_data(self, database_id: str, query_dict: dict[str, Any] = {}) -> list[Document]:
"""Get all the pages from a Notion database."""
assert self._notion_access_token is not None, "Notion access token is required"
database_content = []
next_cursor = None
has_more = True
while has_more:
current_query = query_dict.copy()
if next_cursor:
current_query["start_cursor"] = next_cursor
res = httpx.post(
DATABASE_URL_TMPL.format(database_id=database_id),
headers={
"Authorization": "Bearer " + self._notion_access_token,
"Content-Type": "application/json",
"Notion-Version": "2022-06-28",
},
json=current_query,
)
response_data = res.json()
if "results" not in response_data or response_data["results"] is None:
break
for result in response_data["results"]:
properties = result["properties"]
data = {}
value: Any
for property_name, property_value in properties.items():
type = property_value["type"]
if type == "multi_select":
value = []
multi_select_list = property_value[type]
for multi_select in multi_select_list:
value.append(multi_select["name"])
elif type in {"rich_text", "title"}:
if len(property_value[type]) > 0:
value = property_value[type][0]["plain_text"]
else:
value = ""
elif type in {"select", "status"}:
if property_value[type]:
value = property_value[type]["name"]
else:
value = ""
else:
value = property_value[type]
data[property_name] = value
row_dict = {k: v for k, v in data.items() if v}
row_content = ""
for key, value in sorted(row_dict.items(), key=operator.itemgetter(0)):
if isinstance(value, dict):
value_dict = {k: v for k, v in value.items() if v}
value_content = "".join(f"{k}:{v} " for k, v in value_dict.items())
row_content = row_content + f"{key}:{value_content}\n"
else:
row_content = row_content + f"{key}:{value}\n"
if "url" in result:
row_content = row_content + f"Row Page URL:{result.get('url', '')}\n"
database_content.append(row_content)
has_more = response_data.get("has_more", False)
next_cursor = response_data.get("next_cursor")
if not database_content:
return []
return [Document(page_content="\n".join(database_content))]
def _get_notion_block_data(self, page_id: str) -> list[str]:
assert self._notion_access_token is not None, "Notion access token is required"
result_lines_arr = []
start_cursor = None
block_url = BLOCK_CHILD_URL_TMPL.format(block_id=page_id)
while True:
query_dict: dict[str, Any] = {} if not start_cursor else {"start_cursor": start_cursor}
try:
res = httpx.request(
"GET",
block_url,
headers={
"Authorization": "Bearer " + self._notion_access_token,
"Content-Type": "application/json",
"Notion-Version": "2022-06-28",
},
params=query_dict,
)
if res.status_code != 200:
raise ValueError(f"Error fetching Notion block data: {res.text}")
data = res.json()
except httpx.HTTPError as e:
raise ValueError("Error fetching Notion block data") from e
if "results" not in data or not isinstance(data["results"], list):
raise ValueError("Error fetching Notion block data")
for result in data["results"]:
result_type = result["type"]
result_obj = result[result_type]
cur_result_text_arr = []
if result_type == "table":
result_block_id = result["id"]
text = self._read_table_rows(result_block_id)
text += "\n\n"
result_lines_arr.append(text)
else:
if "rich_text" in result_obj:
for rich_text in result_obj["rich_text"]:
# skip if doesn't have text object
if "text" in rich_text:
text = rich_text["text"]["content"]
cur_result_text_arr.append(text)
result_block_id = result["id"]
has_children = result["has_children"]
block_type = result["type"]
if has_children and block_type != "child_page":
children_text = self._read_block(result_block_id, num_tabs=1)
cur_result_text_arr.append(children_text)
cur_result_text = "\n".join(cur_result_text_arr)
if result_type in HEADING_SPLITTER:
result_lines_arr.append(f"{HEADING_SPLITTER[result_type]}{cur_result_text}")
else:
result_lines_arr.append(cur_result_text + "\n\n")
if data["next_cursor"] is None:
break
else:
start_cursor = data["next_cursor"]
return result_lines_arr
def _read_block(self, block_id: str, num_tabs: int = 0) -> str:
"""Read a block."""
assert self._notion_access_token is not None, "Notion access token is required"
result_lines_arr = []
start_cursor = None
block_url = BLOCK_CHILD_URL_TMPL.format(block_id=block_id)
while True:
query_dict: dict[str, Any] = {} if not start_cursor else {"start_cursor": start_cursor}
res = httpx.request(
"GET",
block_url,
headers={
"Authorization": "Bearer " + self._notion_access_token,
"Content-Type": "application/json",
"Notion-Version": "2022-06-28",
},
params=query_dict,
)
data = res.json()
if "results" not in data or data["results"] is None:
break
for result in data["results"]:
result_type = result["type"]
result_obj = result[result_type]
cur_result_text_arr = []
if result_type == "table":
result_block_id = result["id"]
text = self._read_table_rows(result_block_id)
result_lines_arr.append(text)
else:
if "rich_text" in result_obj:
for rich_text in result_obj["rich_text"]:
# skip if doesn't have text object
if "text" in rich_text:
text = rich_text["text"]["content"]
prefix = "\t" * num_tabs
cur_result_text_arr.append(prefix + text)
result_block_id = result["id"]
has_children = result["has_children"]
block_type = result["type"]
if has_children and block_type != "child_page":
children_text = self._read_block(result_block_id, num_tabs=num_tabs + 1)
cur_result_text_arr.append(children_text)
cur_result_text = "\n".join(cur_result_text_arr)
if result_type in HEADING_SPLITTER:
result_lines_arr.append(f"{HEADING_SPLITTER[result_type]}{cur_result_text}")
else:
result_lines_arr.append(cur_result_text + "\n\n")
if data["next_cursor"] is None:
break
else:
start_cursor = data["next_cursor"]
result_lines = "\n".join(result_lines_arr)
return result_lines
def _read_table_rows(self, block_id: str) -> str:
"""Read table rows."""
assert self._notion_access_token is not None, "Notion access token is required"
done = False
result_lines_arr = []
start_cursor = None
block_url = BLOCK_CHILD_URL_TMPL.format(block_id=block_id)
while not done:
query_dict: dict[str, Any] = {} if not start_cursor else {"start_cursor": start_cursor}
res = httpx.request(
"GET",
block_url,
headers={
"Authorization": "Bearer " + self._notion_access_token,
"Content-Type": "application/json",
"Notion-Version": "2022-06-28",
},
params=query_dict,
)
data = res.json()
# get table headers text
table_header_cell_texts = []
table_header_cells = data["results"][0]["table_row"]["cells"]
for table_header_cell in table_header_cells:
if table_header_cell:
for table_header_cell_text in table_header_cell:
text = table_header_cell_text["text"]["content"]
table_header_cell_texts.append(text)
else:
table_header_cell_texts.append("")
# Initialize Markdown table with headers
markdown_table = "| " + " | ".join(table_header_cell_texts) + " |\n"
markdown_table += "| " + " | ".join(["---"] * len(table_header_cell_texts)) + " |\n"
# Process data to format each row in Markdown table format
results = data["results"]
for i in range(len(results) - 1):
column_texts = []
table_column_cells = data["results"][i + 1]["table_row"]["cells"]
for j in range(len(table_column_cells)):
if table_column_cells[j]:
for table_column_cell_text in table_column_cells[j]:
column_text = table_column_cell_text["text"]["content"]
column_texts.append(column_text)
# Add row to Markdown table
markdown_table += "| " + " | ".join(column_texts) + " |\n"
result_lines_arr.append(markdown_table)
if data["next_cursor"] is None:
done = True
break
else:
start_cursor = data["next_cursor"]
result_lines = "\n".join(result_lines_arr)
return result_lines
def update_last_edited_time(self, document_model: DocumentModel | None):
if not document_model:
return
last_edited_time = self.get_notion_last_edited_time()
data_source_info = document_model.data_source_info_dict
if data_source_info:
data_source_info["last_edited_time"] = last_edited_time
db.session.query(DocumentModel).filter_by(id=document_model.id).update(
{DocumentModel.data_source_info: json.dumps(data_source_info)}
) # type: ignore
db.session.commit()
def get_notion_last_edited_time(self) -> str:
assert self._notion_access_token is not None, "Notion access token is required"
obj_id = self._notion_obj_id
page_type = self._notion_page_type
if page_type == "database":
retrieve_page_url = RETRIEVE_DATABASE_URL_TMPL.format(database_id=obj_id)
else:
retrieve_page_url = RETRIEVE_PAGE_URL_TMPL.format(page_id=obj_id)
query_dict: dict[str, Any] = {}
res = httpx.request(
"GET",
retrieve_page_url,
headers={
"Authorization": "Bearer " + self._notion_access_token,
"Content-Type": "application/json",
"Notion-Version": "2022-06-28",
},
json=query_dict,
)
data = res.json()
return cast(str, data["last_edited_time"])
@classmethod
def _get_access_token(cls, tenant_id: str, credential_id: str | None) -> str:
# get credential from tenant_id and credential_id
if not credential_id:
raise Exception(f"No credential id found for tenant {tenant_id}")
datasource_provider_service = DatasourceProviderService()
credential = datasource_provider_service.get_datasource_credentials(
tenant_id=tenant_id,
credential_id=credential_id,
provider="notion_datasource",
plugin_id="langgenius/notion_datasource",
)
if not credential:
raise Exception(f"No notion credential found for tenant {tenant_id} and credential {credential_id}")
return cast(str, credential["integration_secret"])
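
A construction sketch (all identifiers are placeholders): when notion_access_token is not passed, the token is resolved from the tenant's datasource credentials, falling back to NOTION_INTEGRATION_TOKEN.

extractor = NotionExtractor(
    notion_workspace_id="ws-1",  # placeholders throughout
    notion_obj_id="page-or-database-id",
    notion_page_type="page",  # or "database"
    tenant_id="tenant-1",
    credential_id="cred-1",
)
docs = extractor.extract()  # one Document per page, or one per database dump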


@@ -0,0 +1,66 @@
"""Abstract interface for document loader implementations."""
import contextlib
from collections.abc import Iterator
from core.rag.extractor.blob.blob import Blob
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
from extensions.ext_storage import storage
class PdfExtractor(BaseExtractor):
"""Load pdf files.
Args:
file_path: Path to the file to load.
"""
def __init__(self, file_path: str, file_cache_key: str | None = None):
"""Initialize with file path."""
self._file_path = file_path
self._file_cache_key = file_cache_key
def extract(self) -> list[Document]:
plaintext_file_exists = False
if self._file_cache_key:
with contextlib.suppress(FileNotFoundError):
text = storage.load(self._file_cache_key).decode("utf-8")
plaintext_file_exists = True
return [Document(page_content=text)]
documents = list(self.load())
text_list = []
for document in documents:
text_list.append(document.page_content)
text = "\n\n".join(text_list)
# save plaintext file for caching
if not plaintext_file_exists and self._file_cache_key:
storage.save(self._file_cache_key, text.encode("utf-8"))
return documents
def load(
self,
) -> Iterator[Document]:
"""Lazy load given path as pages."""
blob = Blob.from_path(self._file_path)
yield from self.parse(blob)
def parse(self, blob: Blob) -> Iterator[Document]:
"""Lazily parse the blob."""
import pypdfium2 # type: ignore
with blob.as_bytes_io() as file_path:
pdf_reader = pypdfium2.PdfDocument(file_path, autoclose=True)
try:
for page_number, page in enumerate(pdf_reader):
text_page = page.get_textpage()
content = text_page.get_text_range()
text_page.close()
page.close()
metadata = {"source": blob.source, "page": page_number}
yield Document(page_content=content, metadata=metadata)
finally:
pdf_reader.close()
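
A usage sketch (path and cache key are placeholders): pages are parsed lazily through pypdfium2, and when a file_cache_key is supplied the concatenated plaintext is stored so a later run can skip parsing entirely.

extractor = PdfExtractor("docs/report.pdf", file_cache_key="pdf_cache/report.txt")
pages = extractor.extract()  # one Document per page; metadata["page"] is 0-based
first_page = next(extractor.load())  # or stream pages without building the full list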


@@ -0,0 +1,48 @@
"""Abstract interface for document loader implementations."""
from pathlib import Path
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.extractor.helpers import detect_file_encodings
from core.rag.models.document import Document
class TextExtractor(BaseExtractor):
"""Load text files.
Args:
file_path: Path to the file to load.
"""
def __init__(self, file_path: str, encoding: str | None = None, autodetect_encoding: bool = False):
"""Initialize with file path."""
self._file_path = file_path
self._encoding = encoding
self._autodetect_encoding = autodetect_encoding
def extract(self) -> list[Document]:
"""Load from file path."""
text = ""
try:
text = Path(self._file_path).read_text(encoding=self._encoding)
except UnicodeDecodeError as e:
if self._autodetect_encoding:
detected_encodings = detect_file_encodings(self._file_path)
for encoding in detected_encodings:
try:
text = Path(self._file_path).read_text(encoding=encoding.encoding)
break
except UnicodeDecodeError:
continue
else:
raise RuntimeError(
f"Decode failed: {self._file_path}, all detected encodings failed. Original error: {e}"
)
else:
raise RuntimeError(f"Decode failed: {self._file_path}, specified encoding failed. Original error: {e}")
except Exception as e:
raise RuntimeError(f"Error loading {self._file_path}") from e
metadata = {"source": self._file_path}
return [Document(page_content=text, metadata=metadata)]
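
A usage sketch of the encoding fallback (the path is hypothetical): the explicit encoding is tried first, and only on UnicodeDecodeError are the detected encodings attempted in confidence order.

docs = TextExtractor("data/notes.txt", autodetect_encoding=True).extract()
print(docs[0].metadata["source"], len(docs[0].page_content))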


@@ -0,0 +1,59 @@
import logging
import os
from configs import dify_config
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
logger = logging.getLogger(__name__)
class UnstructuredWordExtractor(BaseExtractor):
"""Loader that uses unstructured to load word documents."""
def __init__(self, file_path: str, api_url: str, api_key: str = ""):
"""Initialize with file path."""
self._file_path = file_path
self._api_url = api_url
self._api_key = api_key
def extract(self) -> list[Document]:
from unstructured.__version__ import __version__ as __unstructured_version__
from unstructured.file_utils.filetype import FileType, detect_filetype
unstructured_version = tuple(int(x) for x in __unstructured_version__.split("."))
# check the file extension
try:
import magic # noqa: F401
is_doc = detect_filetype(self._file_path) == FileType.DOC
except ImportError:
_, extension = os.path.splitext(str(self._file_path))
is_doc = extension == ".doc"
if is_doc and unstructured_version < (0, 4, 11):
raise ValueError(
f"You are on unstructured version {__unstructured_version__}. "
"Partitioning .doc files is only supported in unstructured>=0.4.11. "
"Please upgrade the unstructured package and try again."
)
if is_doc:
from unstructured.partition.api import partition_via_api
elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
else:
from unstructured.partition.docx import partition_docx
elements = partition_docx(filename=self._file_path)
from unstructured.chunking.title import chunk_by_title
max_characters = dify_config.INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH
chunks = chunk_by_title(elements, max_characters=max_characters, combine_text_under_n_chars=max_characters)
documents = []
for chunk in chunks:
text = chunk.text.strip()
documents.append(Document(page_content=text))
return documents
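
The Unstructured-based extractors above and below all follow the same pattern; a sketch for .doc files (the path, endpoint, and key are placeholders), where partitioned elements are regrouped into title-based chunks capped at INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH characters.

extractor = UnstructuredWordExtractor(
    "files/contract.doc",  # placeholder path
    api_url="https://unstructured.example.com/general/v0/general",  # placeholder endpoint
    api_key="...",  # required when partitioning goes through the hosted API
)
documents = extractor.extract()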


@@ -0,0 +1,56 @@
import base64
import contextlib
import logging
from bs4 import BeautifulSoup
from configs import dify_config
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
logger = logging.getLogger(__name__)
class UnstructuredEmailExtractor(BaseExtractor):
"""Load eml files.
Args:
file_path: Path to the file to load.
"""
def __init__(self, file_path: str, api_url: str | None = None, api_key: str = ""):
"""Initialize with file path."""
self._file_path = file_path
self._api_url = api_url
self._api_key = api_key
def extract(self) -> list[Document]:
if self._api_url:
from unstructured.partition.api import partition_via_api
elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
else:
from unstructured.partition.email import partition_email
elements = partition_email(filename=self._file_path)
# noinspection PyBroadException
with contextlib.suppress(Exception):
for element in elements:
element_text = element.text.strip()
padding_needed = 4 - len(element_text) % 4
element_text += "=" * padding_needed
element_decode = base64.b64decode(element_text)
soup = BeautifulSoup(element_decode.decode("utf-8"), "html.parser")
element.text = soup.get_text()
from unstructured.chunking.title import chunk_by_title
max_characters = dify_config.INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH
chunks = chunk_by_title(elements, max_characters=max_characters, combine_text_under_n_chars=max_characters)
documents = []
for chunk in chunks:
text = chunk.text.strip()
documents.append(Document(page_content=text))
return documents


@@ -0,0 +1,51 @@
import logging
import pypandoc # type: ignore
from configs import dify_config
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
logger = logging.getLogger(__name__)
class UnstructuredEpubExtractor(BaseExtractor):
"""Load epub files.
Args:
file_path: Path to the file to load.
"""
def __init__(
self,
file_path: str,
api_url: str | None = None,
api_key: str = "",
):
"""Initialize with file path."""
self._file_path = file_path
self._api_url = api_url
self._api_key = api_key
def extract(self) -> list[Document]:
if self._api_url:
from unstructured.partition.api import partition_via_api
elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
else:
from unstructured.partition.epub import partition_epub
pypandoc.download_pandoc()
elements = partition_epub(filename=self._file_path, xml_keep_tags=True)
from unstructured.chunking.title import chunk_by_title
max_characters = dify_config.INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH
chunks = chunk_by_title(elements, max_characters=max_characters, combine_text_under_n_chars=max_characters)
documents = []
for chunk in chunks:
text = chunk.text.strip()
documents.append(Document(page_content=text))
return documents


@@ -0,0 +1,43 @@
import logging
from configs import dify_config
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
logger = logging.getLogger(__name__)
class UnstructuredMarkdownExtractor(BaseExtractor):
"""Load md files.
Args:
file_path: Path to the file to load.
"""
def __init__(self, file_path: str, api_url: str | None = None, api_key: str = ""):
"""Initialize with file path."""
self._file_path = file_path
self._api_url = api_url
self._api_key = api_key
def extract(self) -> list[Document]:
if self._api_url:
from unstructured.partition.api import partition_via_api
elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
else:
from unstructured.partition.md import partition_md
elements = partition_md(filename=self._file_path)
from unstructured.chunking.title import chunk_by_title
max_characters = dify_config.INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH
chunks = chunk_by_title(elements, max_characters=max_characters, combine_text_under_n_chars=max_characters)
documents = []
for chunk in chunks:
text = chunk.text.strip()
documents.append(Document(page_content=text))
return documents


@@ -0,0 +1,42 @@
import logging
from configs import dify_config
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
logger = logging.getLogger(__name__)
class UnstructuredMsgExtractor(BaseExtractor):
"""Load msg files.
Args:
file_path: Path to the file to load.
"""
def __init__(self, file_path: str, api_url: str | None = None, api_key: str = ""):
"""Initialize with file path."""
self._file_path = file_path
self._api_url = api_url
self._api_key = api_key
def extract(self) -> list[Document]:
if self._api_url:
from unstructured.partition.api import partition_via_api
elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
else:
from unstructured.partition.msg import partition_msg
elements = partition_msg(filename=self._file_path)
from unstructured.chunking.title import chunk_by_title
max_characters = dify_config.INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH
chunks = chunk_by_title(elements, max_characters=max_characters, combine_text_under_n_chars=max_characters)
documents = []
for chunk in chunks:
text = chunk.text.strip()
documents.append(Document(page_content=text))
return documents


@@ -0,0 +1,46 @@
import logging
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
logger = logging.getLogger(__name__)
class UnstructuredPPTExtractor(BaseExtractor):
"""Load ppt files.
Args:
file_path: Path to the file to load.
"""
def __init__(self, file_path: str, api_url: str | None = None, api_key: str = ""):
"""Initialize with file path."""
self._file_path = file_path
self._api_url = api_url
self._api_key = api_key
def extract(self) -> list[Document]:
if self._api_url:
from unstructured.partition.api import partition_via_api
elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
else:
raise NotImplementedError("Unstructured API Url is not configured")
text_by_page: dict[int, str] = {}
for element in elements:
page = element.metadata.page_number
if page is None:
continue
text = element.text
if page in text_by_page:
text_by_page[page] += "\n" + text
else:
text_by_page[page] = text
combined_texts = list(text_by_page.values())
documents = []
for combined_text in combined_texts:
text = combined_text.strip()
documents.append(Document(page_content=text))
return documents
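# Usage sketch: legacy .ppt files have no local partition path here, so an Unstructured API
# endpoint is required; the URL and key below are placeholders.
if __name__ == "__main__":
    ppt_extractor = UnstructuredPPTExtractor(
        "/tmp/slides.ppt",
        api_url="https://api.unstructuredapp.io/general/v0/general",
        api_key="your-unstructured-api-key",
    )
    for doc in ppt_extractor.extract():
        print(doc.page_content[:80])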

View File

@@ -0,0 +1,48 @@
import logging
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
logger = logging.getLogger(__name__)
class UnstructuredPPTXExtractor(BaseExtractor):
"""Load pptx files.
Args:
file_path: Path to the file to load.
"""
def __init__(self, file_path: str, api_url: str | None = None, api_key: str = ""):
"""Initialize with file path."""
self._file_path = file_path
self._api_url = api_url
self._api_key = api_key
def extract(self) -> list[Document]:
if self._api_url:
from unstructured.partition.api import partition_via_api
elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
else:
from unstructured.partition.pptx import partition_pptx
elements = partition_pptx(filename=self._file_path)
text_by_page: dict[int, str] = {}
for element in elements:
page = element.metadata.page_number
text = element.text
if page is not None:
if page in text_by_page:
text_by_page[page] += "\n" + text
else:
text_by_page[page] = text
combined_texts = list(text_by_page.values())
documents = []
for combined_text in combined_texts:
text = combined_text.strip()
documents.append(Document(page_content=text))
return documents
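# Offline sketch of the page-grouping step above, using synthetic elements
# (assumes the `unstructured` package for Text/ElementMetadata; values are made up).
if __name__ == "__main__":
    from unstructured.documents.elements import ElementMetadata, Text

    fake_elements = [
        Text("Slide one, first text box", metadata=ElementMetadata(page_number=1)),
        Text("Slide one, second text box", metadata=ElementMetadata(page_number=1)),
        Text("Slide two", metadata=ElementMetadata(page_number=2)),
    ]
    pages: dict[int, str] = {}
    for el in fake_elements:
        page_number = el.metadata.page_number
        if page_number is None:
            continue
        pages[page_number] = (pages[page_number] + "\n" + el.text) if page_number in pages else el.text
    print(pages)  # {1: 'Slide one, first text box\nSlide one, second text box', 2: 'Slide two'}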

View File

@@ -0,0 +1,43 @@
import logging
from configs import dify_config
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
logger = logging.getLogger(__name__)
class UnstructuredXmlExtractor(BaseExtractor):
"""Load xml files.
Args:
file_path: Path to the file to load.
"""
def __init__(self, file_path: str, api_url: str | None = None, api_key: str = ""):
"""Initialize with file path."""
self._file_path = file_path
self._api_url = api_url
self._api_key = api_key
def extract(self) -> list[Document]:
if self._api_url:
from unstructured.partition.api import partition_via_api
elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
else:
from unstructured.partition.xml import partition_xml
elements = partition_xml(filename=self._file_path, xml_keep_tags=True)
from unstructured.chunking.title import chunk_by_title
max_characters = dify_config.INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH
chunks = chunk_by_title(elements, max_characters=max_characters, combine_text_under_n_chars=max_characters)
documents = []
for chunk in chunks:
text = chunk.text.strip()
documents.append(Document(page_content=text))
return documents

View File

@@ -0,0 +1,199 @@
import json
from collections.abc import Generator
from typing import Union
from urllib.parse import urljoin
import httpx
from httpx import Response
from core.rag.extractor.watercrawl.exceptions import (
WaterCrawlAuthenticationError,
WaterCrawlBadRequestError,
WaterCrawlPermissionError,
)
class BaseAPIClient:
def __init__(self, api_key, base_url):
self.api_key = api_key
self.base_url = base_url
self.session = self.init_session()
def init_session(self):
headers = {
"X-API-Key": self.api_key,
"Content-Type": "application/json",
"Accept": "application/json",
"User-Agent": "WaterCrawl-Plugin",
"Accept-Language": "en-US",
}
return httpx.Client(headers=headers, timeout=None)
def _request(
self,
method: str,
endpoint: str,
query_params: dict | None = None,
data: dict | None = None,
**kwargs,
) -> Response:
stream = kwargs.pop("stream", False)
url = urljoin(self.base_url, endpoint)
if stream:
request = self.session.build_request(method, url, params=query_params, json=data)
return self.session.send(request, stream=True, **kwargs)
return self.session.request(method, url, params=query_params, json=data, **kwargs)
def _get(self, endpoint: str, query_params: dict | None = None, **kwargs):
return self._request("GET", endpoint, query_params=query_params, **kwargs)
def _post(self, endpoint: str, query_params: dict | None = None, data: dict | None = None, **kwargs):
return self._request("POST", endpoint, query_params=query_params, data=data, **kwargs)
def _put(self, endpoint: str, query_params: dict | None = None, data: dict | None = None, **kwargs):
return self._request("PUT", endpoint, query_params=query_params, data=data, **kwargs)
def _delete(self, endpoint: str, query_params: dict | None = None, **kwargs):
return self._request("DELETE", endpoint, query_params=query_params, **kwargs)
def _patch(self, endpoint: str, query_params: dict | None = None, data: dict | None = None, **kwargs):
return self._request("PATCH", endpoint, query_params=query_params, data=data, **kwargs)
class WaterCrawlAPIClient(BaseAPIClient):
    def __init__(self, api_key, base_url: str | None = "https://app.watercrawl.dev/"):
        # Treat an explicit base_url=None the same as the default public endpoint.
        super().__init__(api_key, base_url or "https://app.watercrawl.dev/")
def process_eventstream(self, response: Response, download: bool = False) -> Generator:
try:
for raw_line in response.iter_lines():
line = raw_line.decode("utf-8") if isinstance(raw_line, bytes) else raw_line
if line.startswith("data:"):
line = line[5:].strip()
data = json.loads(line)
if data["type"] == "result" and download:
data["data"] = self.download_result(data["data"])
yield data
finally:
response.close()
def process_response(self, response: Response) -> dict | bytes | list | None | Generator:
if response.status_code == 401:
raise WaterCrawlAuthenticationError(response)
if response.status_code == 403:
raise WaterCrawlPermissionError(response)
if 400 <= response.status_code < 500:
raise WaterCrawlBadRequestError(response)
response.raise_for_status()
if response.status_code == 204:
return None
if response.headers.get("Content-Type") == "application/json":
return response.json() or {}
if response.headers.get("Content-Type") == "application/octet-stream":
return response.content
if response.headers.get("Content-Type") == "text/event-stream":
return self.process_eventstream(response)
raise Exception(f"Unknown response type: {response.headers.get('Content-Type')}")
def get_crawl_requests_list(self, page: int | None = None, page_size: int | None = None):
query_params = {"page": page or 1, "page_size": page_size or 10}
return self.process_response(
self._get(
"/api/v1/core/crawl-requests/",
query_params=query_params,
)
)
def get_crawl_request(self, item_id: str):
return self.process_response(
self._get(
f"/api/v1/core/crawl-requests/{item_id}/",
)
)
def create_crawl_request(
self,
url: Union[list, str] | None = None,
spider_options: dict | None = None,
page_options: dict | None = None,
plugin_options: dict | None = None,
):
data = {
# 'urls': url if isinstance(url, list) else [url],
"url": url,
"options": {
"spider_options": spider_options or {},
"page_options": page_options or {},
"plugin_options": plugin_options or {},
},
}
return self.process_response(
self._post(
"/api/v1/core/crawl-requests/",
data=data,
)
)
def stop_crawl_request(self, item_id: str):
return self.process_response(
self._delete(
f"/api/v1/core/crawl-requests/{item_id}/",
)
)
def download_crawl_request(self, item_id: str):
return self.process_response(
self._get(
f"/api/v1/core/crawl-requests/{item_id}/download/",
)
)
def monitor_crawl_request(self, item_id: str, prefetched=False) -> Generator:
query_params = {"prefetched": str(prefetched).lower()}
generator = self.process_response(
self._get(f"/api/v1/core/crawl-requests/{item_id}/status/", stream=True, query_params=query_params),
)
if not isinstance(generator, Generator):
raise ValueError("Generator expected")
yield from generator
def get_crawl_request_results(
self, item_id: str, page: int = 1, page_size: int = 25, query_params: dict | None = None
):
query_params = query_params or {}
query_params.update({"page": page or 1, "page_size": page_size or 25})
return self.process_response(
self._get(f"/api/v1/core/crawl-requests/{item_id}/results/", query_params=query_params)
)
def scrape_url(
self,
url: str,
page_options: dict | None = None,
plugin_options: dict | None = None,
sync: bool = True,
prefetched: bool = True,
):
response_result = self.create_crawl_request(url=url, page_options=page_options, plugin_options=plugin_options)
if not sync:
return response_result
for event_data in self.monitor_crawl_request(response_result["uuid"], prefetched):
if event_data["type"] == "result":
return event_data["data"]
def download_result(self, result_object: dict):
response = httpx.get(result_object["result"], timeout=None)
try:
response.raise_for_status()
result_object["result"] = response.json()
finally:
response.close()
return result_object
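# Offline sketch of the event-stream handling above: feed one synthetic SSE line through
# process_eventstream (the API key is a placeholder and no HTTP request is sent).
if __name__ == "__main__":
    demo_client = WaterCrawlAPIClient("wc-placeholder-key")
    fake_response = Response(200, text='data: {"type": "state", "data": {"status": "running"}}\n')
    for event in demo_client.process_eventstream(fake_response):
        print(event)  # {'type': 'state', 'data': {'status': 'running'}}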

View File

@@ -0,0 +1,32 @@
import json
class WaterCrawlError(Exception):
pass
class WaterCrawlBadRequestError(WaterCrawlError):
def __init__(self, response):
self.status_code = response.status_code
self.response = response
        try:
            data = response.json()
        except ValueError:
            # Non-JSON error bodies (e.g. HTML from a proxy) must not mask the original failure.
            data = {}
        self.message = data.get("message", "Unknown error occurred")
        self.errors = data.get("errors", {})
super().__init__(self.message)
@property
def flat_errors(self):
return json.dumps(self.errors)
def __str__(self):
return f"WaterCrawlBadRequestError: {self.message} \n {self.flat_errors}"
class WaterCrawlPermissionError(WaterCrawlBadRequestError):
def __str__(self):
return f"You are exceeding your WaterCrawl API limits. {self.message}"
class WaterCrawlAuthenticationError(WaterCrawlBadRequestError):
def __str__(self):
return "WaterCrawl API key is invalid or expired. Please check your API key and try again."

View File

@@ -0,0 +1,64 @@
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
from services.website_service import WebsiteService
class WaterCrawlWebExtractor(BaseExtractor):
"""
    Crawl or scrape websites with WaterCrawl and return the content as clean, LLM-ready markdown.
    Args:
        url: The URL to crawl or scrape.
        job_id: The WaterCrawl crawl job ID (used in 'crawl' mode).
        tenant_id: The tenant the extraction runs under.
        mode: The mode of operation. Defaults to 'crawl'. Options are 'crawl' and 'scrape'.
        only_main_content: Only return the main content of the page, excluding headers, navs, footers, etc.
"""
def __init__(
self,
url: str,
job_id: str,
tenant_id: str,
mode: str = "crawl",
only_main_content: bool = True,
):
"""Initialize with url, api_key, base_url and mode."""
self._url = url
self.job_id = job_id
self.tenant_id = tenant_id
self.mode = mode
self.only_main_content = only_main_content
def extract(self) -> list[Document]:
"""Extract content from the URL."""
documents = []
if self.mode == "crawl":
crawl_data = WebsiteService.get_crawl_url_data(self.job_id, "watercrawl", self._url, self.tenant_id)
if crawl_data is None:
return []
document = Document(
page_content=crawl_data.get("markdown", ""),
metadata={
"source_url": crawl_data.get("source_url"),
"description": crawl_data.get("description"),
"title": crawl_data.get("title"),
},
)
documents.append(document)
elif self.mode == "scrape":
scrape_data = WebsiteService.get_scrape_url_data(
"watercrawl", self._url, self.tenant_id, self.only_main_content
)
document = Document(
page_content=scrape_data.get("markdown", ""),
metadata={
"source_url": scrape_data.get("source_url"),
"description": scrape_data.get("description"),
"title": scrape_data.get("title"),
},
)
documents.append(document)
return documents

View File

@@ -0,0 +1,117 @@
from collections.abc import Generator
from datetime import datetime
from typing import Any
from core.rag.extractor.watercrawl.client import WaterCrawlAPIClient
class WaterCrawlProvider:
def __init__(self, api_key, base_url: str | None = None):
self.client = WaterCrawlAPIClient(api_key, base_url)
def crawl_url(self, url, options: dict | Any | None = None):
options = options or {}
spider_options = {
"max_depth": 1,
"page_limit": 1,
"allowed_domains": [],
"exclude_paths": [],
"include_paths": [],
}
if options.get("crawl_sub_pages", True):
spider_options["page_limit"] = options.get("limit", 1)
spider_options["max_depth"] = options.get("max_depth", 1)
spider_options["include_paths"] = options.get("includes", "").split(",") if options.get("includes") else []
spider_options["exclude_paths"] = options.get("excludes", "").split(",") if options.get("excludes") else []
wait_time = options.get("wait_time", 1000)
page_options = {
"exclude_tags": options.get("exclude_tags", "").split(",") if options.get("exclude_tags") else [],
"include_tags": options.get("include_tags", "").split(",") if options.get("include_tags") else [],
"wait_time": max(1000, wait_time), # minimum wait time is 1 second
"include_html": False,
"only_main_content": options.get("only_main_content", True),
"include_links": False,
"timeout": 15000,
"accept_cookies_selector": "#cookies-accept",
"locale": "en-US",
"actions": [],
}
result = self.client.create_crawl_request(url=url, spider_options=spider_options, page_options=page_options)
return {"status": "active", "job_id": result.get("uuid")}
def get_crawl_status(self, crawl_request_id):
response = self.client.get_crawl_request(crawl_request_id)
data = []
if response["status"] in ["new", "running"]:
status = "active"
else:
status = "completed"
data = list(self._get_results(crawl_request_id))
time_str = response.get("duration")
time_consuming: float = 0
if time_str:
time_obj = datetime.strptime(time_str, "%H:%M:%S.%f")
time_consuming = (
time_obj.hour * 3600 + time_obj.minute * 60 + time_obj.second + time_obj.microsecond / 1_000_000
)
return {
"status": status,
"job_id": response.get("uuid"),
"total": response.get("options", {}).get("spider_options", {}).get("page_limit", 1),
"current": response.get("number_of_documents", 0),
"data": data,
"time_consuming": time_consuming,
}
def get_crawl_url_data(self, job_id, url) -> dict | None:
if not job_id:
return self.scrape_url(url)
for result in self._get_results(
job_id,
{
# filter by url
"url": url
},
):
return result
return None
def scrape_url(self, url: str):
response = self.client.scrape_url(url=url, sync=True, prefetched=True)
return self._structure_data(response)
def _structure_data(self, result_object: dict):
        if isinstance(result_object.get("result", {}), str):
            # Without prefetching/downloading, "result" is still a download URL rather than the page payload.
            raise ValueError("Invalid result object: expected a prefetched result dictionary, got a URL string.")
metadata = result_object.get("result", {}).get("metadata", {})
return {
"title": metadata.get("og:title") or metadata.get("title"),
"description": metadata.get("description"),
"source_url": result_object.get("url"),
"markdown": result_object.get("result", {}).get("markdown"),
}
def _get_results(self, crawl_request_id: str, query_params: dict | None = None) -> Generator[dict, None, None]:
page = 0
page_size = 100
query_params = query_params or {}
query_params.update({"prefetched": "true"})
while True:
page += 1
response = self.client.get_crawl_request_results(crawl_request_id, page, page_size, query_params)
if not response["results"]:
break
for result in response["results"]:
yield self._structure_data(result)
if response["next"] is None:
break
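# Worked sketch of the duration parsing in get_crawl_status above: WaterCrawl reports
# "duration" as "H:MM:SS.ffffff"; the sample value below is made up.
if __name__ == "__main__":
    sample_duration = "0:01:23.500000"
    parsed = datetime.strptime(sample_duration, "%H:%M:%S.%f")
    elapsed_seconds = parsed.hour * 3600 + parsed.minute * 60 + parsed.second + parsed.microsecond / 1_000_000
    print(elapsed_seconds)  # 83.5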

View File

@@ -0,0 +1,295 @@
"""Abstract interface for document loader implementations."""
import logging
import mimetypes
import os
import re
import tempfile
import uuid
from urllib.parse import urlparse
from xml.etree import ElementTree
import httpx
from docx import Document as DocxDocument
from configs import dify_config
from core.helper import ssrf_proxy
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
from extensions.ext_database import db
from extensions.ext_storage import storage
from libs.datetime_utils import naive_utc_now
from models.enums import CreatorUserRole
from models.model import UploadFile
logger = logging.getLogger(__name__)
class WordExtractor(BaseExtractor):
"""Load docx files.
Args:
file_path: Path to the file to load.
"""
def __init__(self, file_path: str, tenant_id: str, user_id: str):
"""Initialize with file path."""
self.file_path = file_path
self.tenant_id = tenant_id
self.user_id = user_id
if "~" in self.file_path:
self.file_path = os.path.expanduser(self.file_path)
# If the file is a web path, download it to a temporary file, and use that
if not os.path.isfile(self.file_path) and self._is_valid_url(self.file_path):
response = httpx.get(self.file_path, timeout=None)
if response.status_code != 200:
response.close()
raise ValueError(f"Check the url of your file; returned status code {response.status_code}")
self.web_path = self.file_path
# TODO: use a better way to handle the file
self.temp_file = tempfile.NamedTemporaryFile() # noqa SIM115
try:
self.temp_file.write(response.content)
finally:
response.close()
self.file_path = self.temp_file.name
elif not os.path.isfile(self.file_path):
raise ValueError(f"File path {self.file_path} is not a valid file or url")
def __del__(self):
if hasattr(self, "temp_file"):
self.temp_file.close()
def extract(self) -> list[Document]:
"""Load given path as single page."""
content = self.parse_docx(self.file_path)
return [
Document(
page_content=content,
metadata={"source": self.file_path},
)
]
@staticmethod
def _is_valid_url(url: str) -> bool:
"""Check if the url is valid."""
parsed = urlparse(url)
return bool(parsed.netloc) and bool(parsed.scheme)
def _extract_images_from_docx(self, doc):
image_count = 0
image_map = {}
for rel in doc.part.rels.values():
if "image" in rel.target_ref:
image_count += 1
if rel.is_external:
url = rel.target_ref
response = ssrf_proxy.get(url)
if response.status_code == 200:
                        image_ext = mimetypes.guess_extension(response.headers["Content-Type"])
                        if image_ext is None:
                            continue
                        # guess_extension() returns the extension with a leading dot; strip it to
                        # avoid building keys like "<uuid>..png".
                        image_ext = image_ext.lstrip(".")
                        file_uuid = str(uuid.uuid4())
                        file_key = "image_files/" + self.tenant_id + "/" + file_uuid + "." + image_ext
mime_type, _ = mimetypes.guess_type(file_key)
storage.save(file_key, response.content)
else:
continue
else:
image_ext = rel.target_ref.split(".")[-1]
if image_ext is None:
continue
                    # use a uuid as the file name
file_uuid = str(uuid.uuid4())
file_key = "image_files/" + self.tenant_id + "/" + file_uuid + "." + image_ext
mime_type, _ = mimetypes.guess_type(file_key)
storage.save(file_key, rel.target_part.blob)
# save file to db
upload_file = UploadFile(
tenant_id=self.tenant_id,
storage_type=dify_config.STORAGE_TYPE,
key=file_key,
name=file_key,
size=0,
extension=str(image_ext),
mime_type=mime_type or "",
created_by=self.user_id,
created_by_role=CreatorUserRole.ACCOUNT,
created_at=naive_utc_now(),
used=True,
used_by=self.user_id,
used_at=naive_utc_now(),
)
db.session.add(upload_file)
db.session.commit()
image_map[rel.target_part] = f"![image]({dify_config.FILES_URL}/files/{upload_file.id}/file-preview)"
return image_map
def _table_to_markdown(self, table, image_map):
markdown = []
# calculate the total number of columns
total_cols = max(len(row.cells) for row in table.rows)
header_row = table.rows[0]
headers = self._parse_row(header_row, image_map, total_cols)
markdown.append("| " + " | ".join(headers) + " |")
markdown.append("| " + " | ".join(["---"] * total_cols) + " |")
for row in table.rows[1:]:
row_cells = self._parse_row(row, image_map, total_cols)
markdown.append("| " + " | ".join(row_cells) + " |")
return "\n".join(markdown)
def _parse_row(self, row, image_map, total_cols):
# Initialize a row, all of which are empty by default
row_cells = [""] * total_cols
col_index = 0
while col_index < len(row.cells):
            # skip columns already filled by a previous cell's colspan
            while col_index < len(row.cells) and row_cells[col_index] != "":
                col_index += 1
            # if col_index is now out of range, stop processing this row
if col_index >= len(row.cells):
break
# get the correct cell
cell = row.cells[col_index]
cell_content = self._parse_cell(cell, image_map).strip()
cell_colspan = cell.grid_span or 1
for i in range(cell_colspan):
if col_index + i < total_cols:
row_cells[col_index + i] = cell_content if i == 0 else ""
col_index += cell_colspan
return row_cells
def _parse_cell(self, cell, image_map):
cell_content = []
for paragraph in cell.paragraphs:
parsed_paragraph = self._parse_cell_paragraph(paragraph, image_map)
if parsed_paragraph:
cell_content.append(parsed_paragraph)
unique_content = list(dict.fromkeys(cell_content))
return " ".join(unique_content)
def _parse_cell_paragraph(self, paragraph, image_map):
paragraph_content = []
for run in paragraph.runs:
if run.element.xpath(".//a:blip"):
for blip in run.element.xpath(".//a:blip"):
image_id = blip.get("{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed")
if not image_id:
continue
image_part = paragraph.part.rels[image_id].target_part
if image_part in image_map:
image_link = image_map[image_part]
paragraph_content.append(image_link)
else:
paragraph_content.append(run.text)
return "".join(paragraph_content).strip()
def parse_docx(self, docx_path):
doc = DocxDocument(docx_path)
content = []
image_map = self._extract_images_from_docx(doc)
hyperlinks_url = None
        # Match both http and https URLs (the previous pattern only matched http URLs ending in "//").
        url_pattern = re.compile(r"https?://\S+")
for para in doc.paragraphs:
for run in para.runs:
if run.text and hyperlinks_url:
result = f" [{run.text}]({hyperlinks_url}) "
run.text = result
hyperlinks_url = None
if "HYPERLINK" in run.element.xml:
try:
xml = ElementTree.XML(run.element.xml)
x_child = [c for c in xml.iter() if c is not None]
for x in x_child:
if x is None:
continue
if x.tag.endswith("instrText"):
if x.text is None:
continue
for i in url_pattern.findall(x.text):
hyperlinks_url = str(i)
except Exception:
logger.exception("Failed to parse HYPERLINK xml")
def parse_paragraph(paragraph):
paragraph_content = []
for run in paragraph.runs:
if hasattr(run.element, "tag") and isinstance(run.element.tag, str) and run.element.tag.endswith("r"):
# Process drawing type images
drawing_elements = run.element.findall(
".//{http://schemas.openxmlformats.org/wordprocessingml/2006/main}drawing"
)
has_drawing = False
for drawing in drawing_elements:
blip_elements = drawing.findall(
".//{http://schemas.openxmlformats.org/drawingml/2006/main}blip"
)
for blip in blip_elements:
embed_id = blip.get(
"{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed"
)
if embed_id:
image_part = doc.part.related_parts.get(embed_id)
if image_part in image_map:
has_drawing = True
paragraph_content.append(image_map[image_part])
# Process pict type images
shape_elements = run.element.findall(
".//{http://schemas.openxmlformats.org/wordprocessingml/2006/main}pict"
)
for shape in shape_elements:
# Find image data in VML
shape_image = shape.find(
".//{http://schemas.openxmlformats.org/wordprocessingml/2006/main}binData"
)
if shape_image is not None and shape_image.text:
image_id = shape_image.get(
"{http://schemas.openxmlformats.org/officeDocument/2006/relationships}id"
)
if image_id and image_id in doc.part.rels:
image_part = doc.part.rels[image_id].target_part
if image_part in image_map and not has_drawing:
paragraph_content.append(image_map[image_part])
# Find imagedata element in VML
image_data = shape.find(".//{urn:schemas-microsoft-com:vml}imagedata")
if image_data is not None:
image_id = image_data.get("id") or image_data.get(
"{http://schemas.openxmlformats.org/officeDocument/2006/relationships}id"
)
if image_id and image_id in doc.part.rels:
image_part = doc.part.rels[image_id].target_part
if image_part in image_map and not has_drawing:
paragraph_content.append(image_map[image_part])
if run.text.strip():
paragraph_content.append(run.text.strip())
return "".join(paragraph_content) if paragraph_content else ""
paragraphs = doc.paragraphs.copy()
tables = doc.tables.copy()
for element in doc.element.body:
if hasattr(element, "tag"):
if isinstance(element.tag, str) and element.tag.endswith("p"): # paragraph
para = paragraphs.pop(0)
parsed_paragraph = parse_paragraph(para)
if parsed_paragraph.strip():
content.append(parsed_paragraph)
else:
content.append("\n")
elif isinstance(element.tag, str) and element.tag.endswith("tbl"): # table
table = tables.pop(0)
content.append(self._table_to_markdown(table, image_map))
return "\n".join(content)