dify
144
dify/api/core/rag/extractor/blob/blob.py
Normal file
@@ -0,0 +1,144 @@
"""Schema for Blobs and Blob Loaders.

The goal is to facilitate decoupling of content loading from content parsing code.

In addition, content loading code should provide a lazy loading interface by default.
"""

from __future__ import annotations

import contextlib
import mimetypes
from collections.abc import Generator, Mapping
from io import BufferedReader, BytesIO
from pathlib import Path, PurePath
from typing import Any, Union

from pydantic import BaseModel, ConfigDict, model_validator

PathLike = Union[str, PurePath]


class Blob(BaseModel):
    """A blob is used to represent raw data by either reference or value.

    Provides an interface to materialize the blob in different representations, and
    help to decouple the development of data loaders from the downstream parsing of
    the raw data.

    Inspired by: https://developer.mozilla.org/en-US/docs/Web/API/Blob
    """

    data: Union[bytes, str, None] = None  # Raw data
    mimetype: str | None = None  # Not to be confused with a file extension
    encoding: str = "utf-8"  # Use utf-8 as default encoding, if decoding to string
    # Location where the original content was found
    # Represent location on the local file system
    # Useful for situations where downstream code assumes it must work with file paths
    # rather than in-memory content.
    path: PathLike | None = None
    model_config = ConfigDict(arbitrary_types_allowed=True, frozen=True)

    @property
    def source(self) -> str | None:
        """The source location of the blob as string if known otherwise none."""
        return str(self.path) if self.path else None

    @model_validator(mode="before")
    @classmethod
    def check_blob_is_valid(cls, values: Mapping[str, Any]) -> Mapping[str, Any]:
        """Verify that either data or path is provided."""
        if "data" not in values and "path" not in values:
            raise ValueError("Either data or path must be provided")
        return values

    def as_string(self) -> str:
        """Read data as a string."""
        if self.data is None and self.path:
            return Path(str(self.path)).read_text(encoding=self.encoding)
        elif isinstance(self.data, bytes):
            return self.data.decode(self.encoding)
        elif isinstance(self.data, str):
            return self.data
        else:
            raise ValueError(f"Unable to get string for blob {self}")

    def as_bytes(self) -> bytes:
        """Read data as bytes."""
        if isinstance(self.data, bytes):
            return self.data
        elif isinstance(self.data, str):
            return self.data.encode(self.encoding)
        elif self.data is None and self.path:
            return Path(str(self.path)).read_bytes()
        else:
            raise ValueError(f"Unable to get bytes for blob {self}")

    @contextlib.contextmanager
    def as_bytes_io(self) -> Generator[Union[BytesIO, BufferedReader], None, None]:
        """Read data as a byte stream."""
        if isinstance(self.data, bytes):
            yield BytesIO(self.data)
        elif self.data is None and self.path:
            with open(str(self.path), "rb") as f:
                yield f
        else:
            raise NotImplementedError(f"Unable to convert blob {self}")

    @classmethod
    def from_path(
        cls,
        path: PathLike,
        *,
        encoding: str = "utf-8",
        mime_type: str | None = None,
        guess_type: bool = True,
    ) -> Blob:
        """Load the blob from a path like object.

        Args:
            path: path like object to file to be read
            encoding: Encoding to use if decoding the bytes into a string
            mime_type: if provided, will be set as the mime-type of the data
            guess_type: If True, the mimetype will be guessed from the file extension,
                if a mime-type was not provided

        Returns:
            Blob instance
        """
        if mime_type is None and guess_type:
            _mimetype = mimetypes.guess_type(path)[0]
        else:
            _mimetype = mime_type
        # We do not load the data immediately, instead we treat the blob as a
        # reference to the underlying data.
        return cls(data=None, mimetype=_mimetype, encoding=encoding, path=path)

    @classmethod
    def from_data(
        cls,
        data: Union[str, bytes],
        *,
        encoding: str = "utf-8",
        mime_type: str | None = None,
        path: str | None = None,
    ) -> Blob:
        """Initialize the blob from in-memory data.

        Args:
            data: the in-memory data associated with the blob
            encoding: Encoding to use if decoding the bytes into a string
            mime_type: if provided, will be set as the mime-type of the data
            path: if provided, will be set as the source from which the data came

        Returns:
            Blob instance
        """
        return cls(data=data, mimetype=mime_type, encoding=encoding, path=path)

    def __repr__(self) -> str:
        """Define the blob representation."""
        str_repr = f"Blob {id(self)}"
        if self.source:
            str_repr += f" {self.source}"
        return str_repr
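
A minimal usage sketch of the Blob API above (illustrative only, not part of the commit; assumes a local file named example.txt exists):

from core.rag.extractor.blob.blob import Blob

blob = Blob.from_path("example.txt")   # lazy: the file is not read yet
print(blob.mimetype)                   # guessed from the extension, e.g. "text/plain"
print(blob.as_string())                # read and decoded with the blob's encoding on demand
with blob.as_bytes_io() as stream:     # byte-stream access without loading the file twice
    header = stream.read(16)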
77
dify/api/core/rag/extractor/csv_extractor.py
Normal file
@@ -0,0 +1,77 @@
"""Abstract interface for document loader implementations."""

import csv

import pandas as pd

from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.extractor.helpers import detect_file_encodings
from core.rag.models.document import Document


class CSVExtractor(BaseExtractor):
    """Load CSV files.


    Args:
        file_path: Path to the file to load.
    """

    def __init__(
        self,
        file_path: str,
        encoding: str | None = None,
        autodetect_encoding: bool = False,
        source_column: str | None = None,
        csv_args: dict | None = None,
    ):
        """Initialize with file path."""
        self._file_path = file_path
        self._encoding = encoding
        self._autodetect_encoding = autodetect_encoding
        self.source_column = source_column
        self.csv_args = csv_args or {}

    def extract(self) -> list[Document]:
        """Load data into document objects."""
        docs = []
        try:
            with open(self._file_path, newline="", encoding=self._encoding) as csvfile:
                docs = self._read_from_file(csvfile)
        except UnicodeDecodeError as e:
            if self._autodetect_encoding:
                detected_encodings = detect_file_encodings(self._file_path)
                for encoding in detected_encodings:
                    try:
                        with open(self._file_path, newline="", encoding=encoding.encoding) as csvfile:
                            docs = self._read_from_file(csvfile)
                            break
                    except UnicodeDecodeError:
                        continue
            else:
                raise RuntimeError(f"Error loading {self._file_path}") from e

        return docs

    def _read_from_file(self, csvfile) -> list[Document]:
        docs = []
        try:
            # load csv file into pandas dataframe
            df = pd.read_csv(csvfile, on_bad_lines="skip", **self.csv_args)

            # check source column exists
            if self.source_column and self.source_column not in df.columns:
                raise ValueError(f"Source column '{self.source_column}' not found in CSV file.")

            # create document objects

            for i, row in df.iterrows():
                content = ";".join(f"{col.strip()}: {str(row[col]).strip()}" for col in df.columns)
                source = row[self.source_column] if self.source_column else ""
                metadata = {"source": source, "row": i}
                doc = Document(page_content=content, metadata=metadata)
                docs.append(doc)
        except csv.Error as e:
            raise e

        return docs
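
A hedged usage sketch of CSVExtractor (the file name and column are placeholders; the CSV is assumed to contain a "url" column when source_column is set):

from core.rag.extractor.csv_extractor import CSVExtractor

extractor = CSVExtractor("data.csv", autodetect_encoding=True, source_column="url", csv_args={"sep": ","})
for doc in extractor.extract():
    # each row becomes one Document; metadata carries the source column value and the row index
    print(doc.metadata["source"], doc.metadata["row"], doc.page_content[:80])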
7
dify/api/core/rag/extractor/entity/datasource_type.py
Normal file
@@ -0,0 +1,7 @@
from enum import StrEnum


class DatasourceType(StrEnum):
    FILE = "upload_file"
    NOTION = "notion_import"
    WEBSITE = "website_crawl"
46
dify/api/core/rag/extractor/entity/extract_setting.py
Normal file
@@ -0,0 +1,46 @@
from pydantic import BaseModel, ConfigDict

from models.dataset import Document
from models.model import UploadFile


class NotionInfo(BaseModel):
    """
    Notion import info.
    """

    credential_id: str | None = None
    notion_workspace_id: str
    notion_obj_id: str
    notion_page_type: str
    document: Document | None = None
    tenant_id: str
    model_config = ConfigDict(arbitrary_types_allowed=True)


class WebsiteInfo(BaseModel):
    """
    Website import info.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    provider: str
    job_id: str
    url: str
    mode: str
    tenant_id: str
    only_main_content: bool = False


class ExtractSetting(BaseModel):
    """
    Model class for provider response.
    """

    datasource_type: str
    upload_file: UploadFile | None = None
    notion_info: NotionInfo | None = None
    website_info: WebsiteInfo | None = None
    document_model: str | None = None
    model_config = ConfigDict(arbitrary_types_allowed=True)
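
To illustrate how these models fit together, a sketch of building an ExtractSetting for a website crawl (all values are placeholders):

from core.rag.extractor.entity.extract_setting import ExtractSetting, WebsiteInfo

setting = ExtractSetting(
    datasource_type="website_crawl",
    website_info=WebsiteInfo(
        provider="firecrawl",
        job_id="job-123",            # placeholder crawl job id
        url="https://example.com",
        mode="scrape",
        tenant_id="tenant-1",
        only_main_content=True,
    ),
    document_model="text_model",
)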
77
dify/api/core/rag/extractor/excel_extractor.py
Normal file
@@ -0,0 +1,77 @@
"""Abstract interface for document loader implementations."""

import os
from typing import cast

import pandas as pd
from openpyxl import load_workbook

from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document


class ExcelExtractor(BaseExtractor):
    """Load Excel files.


    Args:
        file_path: Path to the file to load.
    """

    def __init__(self, file_path: str, encoding: str | None = None, autodetect_encoding: bool = False):
        """Initialize with file path."""
        self._file_path = file_path
        self._encoding = encoding
        self._autodetect_encoding = autodetect_encoding

    def extract(self) -> list[Document]:
        """Load from Excel file in xls or xlsx format using Pandas and openpyxl."""
        documents = []
        file_extension = os.path.splitext(self._file_path)[-1].lower()

        if file_extension == ".xlsx":
            wb = load_workbook(self._file_path, data_only=True)
            for sheet_name in wb.sheetnames:
                sheet = wb[sheet_name]
                data = sheet.values
                cols = next(data, None)
                if cols is None:
                    continue
                df = pd.DataFrame(data, columns=cols)

                df.dropna(how="all", inplace=True)

                for index, row in df.iterrows():
                    page_content = []
                    for col_index, (k, v) in enumerate(row.items()):
                        if pd.notna(v):
                            cell = sheet.cell(
                                row=cast(int, index) + 2, column=col_index + 1
                            )  # +2 to account for header and 1-based index
                            if cell.hyperlink:
                                value = f"[{v}]({cell.hyperlink.target})"
                                page_content.append(f'"{k}":"{value}"')
                            else:
                                page_content.append(f'"{k}":"{v}"')
                    documents.append(
                        Document(page_content=";".join(page_content), metadata={"source": self._file_path})
                    )

        elif file_extension == ".xls":
            excel_file = pd.ExcelFile(self._file_path, engine="xlrd")
            for excel_sheet_name in excel_file.sheet_names:
                df = excel_file.parse(sheet_name=excel_sheet_name)
                df.dropna(how="all", inplace=True)

                for _, row in df.iterrows():
                    page_content = []
                    for k, v in row.items():
                        if pd.notna(v):
                            page_content.append(f'"{k}":"{v}"')
                    documents.append(
                        Document(page_content=";".join(page_content), metadata={"source": self._file_path})
                    )
        else:
            raise ValueError(f"Unsupported file extension: {file_extension}")

        return documents
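
A brief usage sketch of ExcelExtractor (workbook name is a placeholder; each non-empty row becomes one Document):

from core.rag.extractor.excel_extractor import ExcelExtractor

docs = ExcelExtractor("report.xlsx").extract()
for doc in docs:
    print(doc.metadata["source"], doc.page_content)   # e.g. "name":"Alice";"age":"30"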
209
dify/api/core/rag/extractor/extract_processor.py
Normal file
@@ -0,0 +1,209 @@
import re
import tempfile
from pathlib import Path
from typing import Union
from urllib.parse import unquote

from configs import dify_config
from core.helper import ssrf_proxy
from core.rag.extractor.csv_extractor import CSVExtractor
from core.rag.extractor.entity.datasource_type import DatasourceType
from core.rag.extractor.entity.extract_setting import ExtractSetting
from core.rag.extractor.excel_extractor import ExcelExtractor
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.extractor.firecrawl.firecrawl_web_extractor import FirecrawlWebExtractor
from core.rag.extractor.html_extractor import HtmlExtractor
from core.rag.extractor.jina_reader_extractor import JinaReaderWebExtractor
from core.rag.extractor.markdown_extractor import MarkdownExtractor
from core.rag.extractor.notion_extractor import NotionExtractor
from core.rag.extractor.pdf_extractor import PdfExtractor
from core.rag.extractor.text_extractor import TextExtractor
from core.rag.extractor.unstructured.unstructured_doc_extractor import UnstructuredWordExtractor
from core.rag.extractor.unstructured.unstructured_eml_extractor import UnstructuredEmailExtractor
from core.rag.extractor.unstructured.unstructured_epub_extractor import UnstructuredEpubExtractor
from core.rag.extractor.unstructured.unstructured_markdown_extractor import UnstructuredMarkdownExtractor
from core.rag.extractor.unstructured.unstructured_msg_extractor import UnstructuredMsgExtractor
from core.rag.extractor.unstructured.unstructured_ppt_extractor import UnstructuredPPTExtractor
from core.rag.extractor.unstructured.unstructured_pptx_extractor import UnstructuredPPTXExtractor
from core.rag.extractor.unstructured.unstructured_xml_extractor import UnstructuredXmlExtractor
from core.rag.extractor.watercrawl.extractor import WaterCrawlWebExtractor
from core.rag.extractor.word_extractor import WordExtractor
from core.rag.models.document import Document
from extensions.ext_storage import storage
from models.model import UploadFile

SUPPORT_URL_CONTENT_TYPES = ["application/pdf", "text/plain", "application/json"]
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124"
    " Safari/537.36"
)


class ExtractProcessor:
    @classmethod
    def load_from_upload_file(
        cls, upload_file: UploadFile, return_text: bool = False, is_automatic: bool = False
    ) -> Union[list[Document], str]:
        extract_setting = ExtractSetting(
            datasource_type=DatasourceType.FILE, upload_file=upload_file, document_model="text_model"
        )
        if return_text:
            delimiter = "\n"
            return delimiter.join([document.page_content for document in cls.extract(extract_setting, is_automatic)])
        else:
            return cls.extract(extract_setting, is_automatic)

    @classmethod
    def load_from_url(cls, url: str, return_text: bool = False) -> Union[list[Document], str]:
        response = ssrf_proxy.get(url, headers={"User-Agent": USER_AGENT})

        with tempfile.TemporaryDirectory() as temp_dir:
            suffix = Path(url).suffix
            if not suffix and suffix != ".":
                # get content-type
                if response.headers.get("Content-Type"):
                    suffix = "." + response.headers.get("Content-Type").split("/")[-1]
                else:
                    content_disposition = response.headers.get("Content-Disposition")
                    filename_match = re.search(r'filename="([^"]+)"', content_disposition)
                    if filename_match:
                        filename = unquote(filename_match.group(1))
                        match = re.search(r"\.(\w+)$", filename)
                        if match:
                            suffix = "." + match.group(1)
                        else:
                            suffix = ""
            # https://stackoverflow.com/questions/26541416/generate-temporary-file-names-without-creating-actual-file-in-python#comment90414256_26541521
            file_path = f"{temp_dir}/{tempfile.gettempdir()}{suffix}"
            Path(file_path).write_bytes(response.content)
            extract_setting = ExtractSetting(datasource_type=DatasourceType.FILE, document_model="text_model")
            if return_text:
                delimiter = "\n"
                return delimiter.join(
                    [
                        document.page_content
                        for document in cls.extract(extract_setting=extract_setting, file_path=file_path)
                    ]
                )
            else:
                return cls.extract(extract_setting=extract_setting, file_path=file_path)

    @classmethod
    def extract(
        cls, extract_setting: ExtractSetting, is_automatic: bool = False, file_path: str | None = None
    ) -> list[Document]:
        if extract_setting.datasource_type == DatasourceType.FILE:
            with tempfile.TemporaryDirectory() as temp_dir:
                if not file_path:
                    assert extract_setting.upload_file is not None, "upload_file is required"
                    upload_file: UploadFile = extract_setting.upload_file
                    suffix = Path(upload_file.key).suffix
                    # FIXME mypy: Cannot determine type of 'tempfile._get_candidate_names' better not use it here
                    file_path = f"{temp_dir}/{next(tempfile._get_candidate_names())}{suffix}"  # type: ignore
                    storage.download(upload_file.key, file_path)
                input_file = Path(file_path)
                file_extension = input_file.suffix.lower()
                etl_type = dify_config.ETL_TYPE
                extractor: BaseExtractor | None = None
                if etl_type == "Unstructured":
                    unstructured_api_url = dify_config.UNSTRUCTURED_API_URL or ""
                    unstructured_api_key = dify_config.UNSTRUCTURED_API_KEY or ""

                    if file_extension in {".xlsx", ".xls"}:
                        extractor = ExcelExtractor(file_path)
                    elif file_extension == ".pdf":
                        extractor = PdfExtractor(file_path)
                    elif file_extension in {".md", ".markdown", ".mdx"}:
                        extractor = (
                            UnstructuredMarkdownExtractor(file_path, unstructured_api_url, unstructured_api_key)
                            if is_automatic
                            else MarkdownExtractor(file_path, autodetect_encoding=True)
                        )
                    elif file_extension in {".htm", ".html"}:
                        extractor = HtmlExtractor(file_path)
                    elif file_extension == ".docx":
                        extractor = WordExtractor(file_path, upload_file.tenant_id, upload_file.created_by)
                    elif file_extension == ".doc":
                        extractor = UnstructuredWordExtractor(file_path, unstructured_api_url, unstructured_api_key)
                    elif file_extension == ".csv":
                        extractor = CSVExtractor(file_path, autodetect_encoding=True)
                    elif file_extension == ".msg":
                        extractor = UnstructuredMsgExtractor(file_path, unstructured_api_url, unstructured_api_key)
                    elif file_extension == ".eml":
                        extractor = UnstructuredEmailExtractor(file_path, unstructured_api_url, unstructured_api_key)
                    elif file_extension == ".ppt":
                        extractor = UnstructuredPPTExtractor(file_path, unstructured_api_url, unstructured_api_key)
                        # You must first specify the API key
                        # because unstructured_api_key is necessary to parse .ppt documents
                    elif file_extension == ".pptx":
                        extractor = UnstructuredPPTXExtractor(file_path, unstructured_api_url, unstructured_api_key)
                    elif file_extension == ".xml":
                        extractor = UnstructuredXmlExtractor(file_path, unstructured_api_url, unstructured_api_key)
                    elif file_extension == ".epub":
                        extractor = UnstructuredEpubExtractor(file_path, unstructured_api_url, unstructured_api_key)
                    else:
                        # txt
                        extractor = TextExtractor(file_path, autodetect_encoding=True)
                else:
                    if file_extension in {".xlsx", ".xls"}:
                        extractor = ExcelExtractor(file_path)
                    elif file_extension == ".pdf":
                        extractor = PdfExtractor(file_path)
                    elif file_extension in {".md", ".markdown", ".mdx"}:
                        extractor = MarkdownExtractor(file_path, autodetect_encoding=True)
                    elif file_extension in {".htm", ".html"}:
                        extractor = HtmlExtractor(file_path)
                    elif file_extension == ".docx":
                        extractor = WordExtractor(file_path, upload_file.tenant_id, upload_file.created_by)
                    elif file_extension == ".csv":
                        extractor = CSVExtractor(file_path, autodetect_encoding=True)
                    elif file_extension == ".epub":
                        extractor = UnstructuredEpubExtractor(file_path)
                    else:
                        # txt
                        extractor = TextExtractor(file_path, autodetect_encoding=True)
                return extractor.extract()
        elif extract_setting.datasource_type == DatasourceType.NOTION:
            assert extract_setting.notion_info is not None, "notion_info is required"
            extractor = NotionExtractor(
                notion_workspace_id=extract_setting.notion_info.notion_workspace_id,
                notion_obj_id=extract_setting.notion_info.notion_obj_id,
                notion_page_type=extract_setting.notion_info.notion_page_type,
                document_model=extract_setting.notion_info.document,
                tenant_id=extract_setting.notion_info.tenant_id,
                credential_id=extract_setting.notion_info.credential_id,
            )
            return extractor.extract()
        elif extract_setting.datasource_type == DatasourceType.WEBSITE:
            assert extract_setting.website_info is not None, "website_info is required"
            if extract_setting.website_info.provider == "firecrawl":
                extractor = FirecrawlWebExtractor(
                    url=extract_setting.website_info.url,
                    job_id=extract_setting.website_info.job_id,
                    tenant_id=extract_setting.website_info.tenant_id,
                    mode=extract_setting.website_info.mode,
                    only_main_content=extract_setting.website_info.only_main_content,
                )
                return extractor.extract()
            elif extract_setting.website_info.provider == "watercrawl":
                extractor = WaterCrawlWebExtractor(
                    url=extract_setting.website_info.url,
                    job_id=extract_setting.website_info.job_id,
                    tenant_id=extract_setting.website_info.tenant_id,
                    mode=extract_setting.website_info.mode,
                    only_main_content=extract_setting.website_info.only_main_content,
                )
                return extractor.extract()
            elif extract_setting.website_info.provider == "jinareader":
                extractor = JinaReaderWebExtractor(
                    url=extract_setting.website_info.url,
                    job_id=extract_setting.website_info.job_id,
                    tenant_id=extract_setting.website_info.tenant_id,
                    mode=extract_setting.website_info.mode,
                    only_main_content=extract_setting.website_info.only_main_content,
                )
                return extractor.extract()
            else:
                raise ValueError(f"Unsupported website provider: {extract_setting.website_info.provider}")
        else:
            raise ValueError(f"Unsupported datasource type: {extract_setting.datasource_type}")
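
A hedged sketch of driving the dispatcher above directly with a local file, bypassing the storage download (the path is a placeholder; a PDF avoids the branches that need an UploadFile):

from core.rag.extractor.entity.datasource_type import DatasourceType
from core.rag.extractor.entity.extract_setting import ExtractSetting
from core.rag.extractor.extract_processor import ExtractProcessor

setting = ExtractSetting(datasource_type=DatasourceType.FILE, document_model="text_model")
documents = ExtractProcessor.extract(extract_setting=setting, file_path="/tmp/example.pdf")
text = "\n".join(doc.page_content for doc in documents)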
11
dify/api/core/rag/extractor/extractor_base.py
Normal file
@@ -0,0 +1,11 @@
"""Abstract interface for document loader implementations."""

from abc import ABC, abstractmethod


class BaseExtractor(ABC):
    """Interface for extracting files."""

    @abstractmethod
    def extract(self):
        raise NotImplementedError
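
For context, a minimal hypothetical subclass showing the contract BaseExtractor imposes (not part of the commit):

from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document


class UpperCaseExtractor(BaseExtractor):
    """Toy extractor: returns the file contents upper-cased as a single Document."""

    def __init__(self, file_path: str):
        self._file_path = file_path

    def extract(self) -> list[Document]:
        with open(self._file_path, encoding="utf-8") as f:
            return [Document(page_content=f.read().upper())]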
173
dify/api/core/rag/extractor/firecrawl/firecrawl_app.py
Normal file
@@ -0,0 +1,173 @@
import json
import time
from typing import Any, cast

import httpx

from extensions.ext_storage import storage


class FirecrawlApp:
    def __init__(self, api_key=None, base_url=None):
        self.api_key = api_key
        self.base_url = base_url or "https://api.firecrawl.dev"
        if self.api_key is None and self.base_url == "https://api.firecrawl.dev":
            raise ValueError("No API key provided")

    def scrape_url(self, url, params=None) -> dict[str, Any]:
        # Documentation: https://docs.firecrawl.dev/api-reference/endpoint/scrape
        headers = self._prepare_headers()
        json_data = {
            "url": url,
            "formats": ["markdown"],
            "onlyMainContent": True,
            "timeout": 30000,
        }
        if params:
            json_data.update(params)
        response = self._post_request(f"{self.base_url}/v2/scrape", json_data, headers)
        if response.status_code == 200:
            response_data = response.json()
            data = response_data["data"]
            return self._extract_common_fields(data)
        elif response.status_code in {402, 409, 500, 429, 408}:
            self._handle_error(response, "scrape URL")
            return {}  # Avoid additional exception after handling error
        else:
            raise Exception(f"Failed to scrape URL. Status code: {response.status_code}")

    def crawl_url(self, url, params=None) -> str:
        # Documentation: https://docs.firecrawl.dev/api-reference/endpoint/crawl-post
        headers = self._prepare_headers()
        json_data = {"url": url}
        if params:
            json_data.update(params)
        response = self._post_request(f"{self.base_url}/v2/crawl", json_data, headers)
        if response.status_code == 200:
            # There's also another two fields in the response: "success" (bool) and "url" (str)
            job_id = response.json().get("id")
            return cast(str, job_id)
        else:
            self._handle_error(response, "start crawl job")
            return ""  # unreachable

    def map(self, url: str, params: dict[str, Any] | None = None) -> dict[str, Any]:
        # Documentation: https://docs.firecrawl.dev/api-reference/endpoint/map
        headers = self._prepare_headers()
        json_data: dict[str, Any] = {"url": url, "integration": "dify"}
        if params:
            # Pass through provided params, including optional "sitemap": "only" | "include" | "skip"
            json_data.update(params)
        response = self._post_request(f"{self.base_url}/v2/map", json_data, headers)
        if response.status_code == 200:
            return cast(dict[str, Any], response.json())
        elif response.status_code in {402, 409, 500, 429, 408}:
            self._handle_error(response, "start map job")
            return {}
        else:
            raise Exception(f"Failed to start map job. Status code: {response.status_code}")

    def check_crawl_status(self, job_id) -> dict[str, Any]:
        headers = self._prepare_headers()
        response = self._get_request(f"{self.base_url}/v2/crawl/{job_id}", headers)
        if response.status_code == 200:
            crawl_status_response = response.json()
            if crawl_status_response.get("status") == "completed":
                total = crawl_status_response.get("total", 0)
                if total == 0:
                    raise Exception("Failed to check crawl status. Error: No page found")
                data = crawl_status_response.get("data", [])
                url_data_list = []
                for item in data:
                    if isinstance(item, dict) and "metadata" in item and "markdown" in item:
                        url_data = self._extract_common_fields(item)
                        url_data_list.append(url_data)
                if url_data_list:
                    file_key = "website_files/" + job_id + ".txt"
                    try:
                        if storage.exists(file_key):
                            storage.delete(file_key)
                        storage.save(file_key, json.dumps(url_data_list).encode("utf-8"))
                    except Exception as e:
                        raise Exception(f"Error saving crawl data: {e}")
                return self._format_crawl_status_response("completed", crawl_status_response, url_data_list)
            else:
                return self._format_crawl_status_response(
                    crawl_status_response.get("status"), crawl_status_response, []
                )
        else:
            self._handle_error(response, "check crawl status")
            return {}  # unreachable

    def _format_crawl_status_response(
        self, status: str, crawl_status_response: dict[str, Any], url_data_list: list[dict[str, Any]]
    ) -> dict[str, Any]:
        return {
            "status": status,
            "total": crawl_status_response.get("total"),
            "current": crawl_status_response.get("completed"),
            "data": url_data_list,
        }

    def _extract_common_fields(self, item: dict[str, Any]) -> dict[str, Any]:
        return {
            "title": item.get("metadata", {}).get("title"),
            "description": item.get("metadata", {}).get("description"),
            "source_url": item.get("metadata", {}).get("sourceURL"),
            "markdown": item.get("markdown"),
        }

    def _prepare_headers(self) -> dict[str, Any]:
        return {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}"}

    def _post_request(self, url, data, headers, retries=3, backoff_factor=0.5) -> httpx.Response:
        for attempt in range(retries):
            response = httpx.post(url, headers=headers, json=data)
            if response.status_code == 502:
                time.sleep(backoff_factor * (2**attempt))
            else:
                return response
        return response

    def _get_request(self, url, headers, retries=3, backoff_factor=0.5) -> httpx.Response:
        for attempt in range(retries):
            response = httpx.get(url, headers=headers)
            if response.status_code == 502:
                time.sleep(backoff_factor * (2**attempt))
            else:
                return response
        return response

    def _handle_error(self, response, action):
        error_message = response.json().get("error", "Unknown error occurred")
        raise Exception(f"Failed to {action}. Status code: {response.status_code}. Error: {error_message}")  # type: ignore[return]

    def search(self, query: str, params: dict[str, Any] | None = None) -> dict[str, Any]:
        # Documentation: https://docs.firecrawl.dev/api-reference/endpoint/search
        headers = self._prepare_headers()
        json_data = {
            "query": query,
            "limit": 5,
            "lang": "en",
            "country": "us",
            "timeout": 60000,
            "ignoreInvalidURLs": True,
            "scrapeOptions": {},
            "sources": [
                {"type": "web"},
            ],
            "integration": "dify",
        }
        if params:
            json_data.update(params)
        response = self._post_request(f"{self.base_url}/v2/search", json_data, headers)
        if response.status_code == 200:
            response_data = response.json()
            if not response_data.get("success"):
                raise Exception(f"Search failed. Error: {response_data.get('warning', 'Unknown error')}")
            return cast(dict[str, Any], response_data)
        elif response.status_code in {402, 409, 500, 429, 408}:
            self._handle_error(response, "perform search")
            return {}  # Avoid additional exception after handling error
        else:
            raise Exception(f"Failed to perform search. Status code: {response.status_code}")
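
A hedged usage sketch of the client above (API key and URLs are placeholders; check_crawl_status would normally be polled until it reports "completed"):

from core.rag.extractor.firecrawl.firecrawl_app import FirecrawlApp

app = FirecrawlApp(api_key="fc-...")                       # placeholder key
page = app.scrape_url("https://example.com")               # dict with title/description/source_url/markdown
job_id = app.crawl_url("https://example.com", params={"limit": 10})
status = app.check_crawl_status(job_id)                    # {"status": ..., "total": ..., "current": ..., "data": [...]}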
63
dify/api/core/rag/extractor/firecrawl/firecrawl_web_extractor.py
Normal file
@@ -0,0 +1,63 @@
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
from services.website_service import WebsiteService


class FirecrawlWebExtractor(BaseExtractor):
    """
    Crawl and scrape websites and return content in clean llm-ready markdown.

    Args:
        url: The URL to scrape.
        job_id: The crawl job id.
        tenant_id: The tenant id.
        mode: The mode of operation. Defaults to 'scrape'. Options are 'crawl', 'scrape' and 'crawl_return_urls'.
        only_main_content: Only return the main content of the page excluding headers, navs, footers, etc.
    """

    def __init__(
        self,
        url: str,
        job_id: str,
        tenant_id: str,
        mode: str = "crawl",
        only_main_content: bool = True,
    ):
        """Initialize with url, api_key, base_url and mode."""
        self._url = url
        self.job_id = job_id
        self.tenant_id = tenant_id
        self.mode = mode
        self.only_main_content = only_main_content

    def extract(self) -> list[Document]:
        """Extract content from the URL."""
        documents = []
        if self.mode == "crawl":
            crawl_data = WebsiteService.get_crawl_url_data(self.job_id, "firecrawl", self._url, self.tenant_id)
            if crawl_data is None:
                return []
            document = Document(
                page_content=crawl_data.get("markdown", ""),
                metadata={
                    "source_url": crawl_data.get("source_url"),
                    "description": crawl_data.get("description"),
                    "title": crawl_data.get("title"),
                },
            )
            documents.append(document)
        elif self.mode == "scrape":
            scrape_data = WebsiteService.get_scrape_url_data(
                "firecrawl", self._url, self.tenant_id, self.only_main_content
            )

            document = Document(
                page_content=scrape_data.get("markdown", ""),
                metadata={
                    "source_url": scrape_data.get("source_url"),
                    "description": scrape_data.get("description"),
                    "title": scrape_data.get("title"),
                },
            )
            documents.append(document)
        return documents
48
dify/api/core/rag/extractor/helpers.py
Normal file
@@ -0,0 +1,48 @@
"""Document loader helpers."""

import concurrent.futures
from typing import NamedTuple, cast


class FileEncoding(NamedTuple):
    """A file encoding as the NamedTuple."""

    encoding: str | None
    """The encoding of the file."""
    confidence: float
    """The confidence of the encoding."""
    language: str | None
    """The language of the file."""


def detect_file_encodings(file_path: str, timeout: int = 5, sample_size: int = 1024 * 1024) -> list[FileEncoding]:
    """Try to detect the file encoding.

    Returns a list of `FileEncoding` tuples with the detected encodings ordered
    by confidence.

    Args:
        file_path: The path to the file to detect the encoding for.
        timeout: The timeout in seconds for the encoding detection.
        sample_size: The number of bytes to read for encoding detection. Default is 1MB.
            For large files, reading only a sample is sufficient and prevents timeout.
    """
    import chardet

    def read_and_detect(file_path: str):
        with open(file_path, "rb") as f:
            # Read only a sample of the file for encoding detection
            # This prevents timeout on large files while still providing accurate encoding detection
            rawdata = f.read(sample_size)
        return cast(list[dict], chardet.detect_all(rawdata))

    with concurrent.futures.ThreadPoolExecutor() as executor:
        future = executor.submit(read_and_detect, file_path)
        try:
            encodings = future.result(timeout=timeout)
        except concurrent.futures.TimeoutError:
            raise TimeoutError(f"Timeout reached while detecting encoding for {file_path}")

    if all(encoding["encoding"] is None for encoding in encodings):
        raise RuntimeError(f"Could not detect encoding for {file_path}")
    return [FileEncoding(**enc) for enc in encodings if enc["encoding"] is not None]
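
A short usage sketch of detect_file_encodings (the file name is a placeholder):

from core.rag.extractor.helpers import detect_file_encodings

candidates = detect_file_encodings("legacy.csv", timeout=5)
best = candidates[0]                              # candidates are ordered by chardet confidence
with open("legacy.csv", encoding=best.encoding) as f:
    text = f.read()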
32
dify/api/core/rag/extractor/html_extractor.py
Normal file
@@ -0,0 +1,32 @@
"""Abstract interface for document loader implementations."""

from bs4 import BeautifulSoup

from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document


class HtmlExtractor(BaseExtractor):
    """
    Load html files.


    Args:
        file_path: Path to the file to load.
    """

    def __init__(self, file_path: str):
        """Initialize with file path."""
        self._file_path = file_path

    def extract(self) -> list[Document]:
        return [Document(page_content=self._load_as_text())]

    def _load_as_text(self) -> str:
        text: str = ""
        with open(self._file_path, "rb") as fp:
            soup = BeautifulSoup(fp, "html.parser")
            text = soup.get_text()
            text = text.strip() if text else ""

        return text
42
dify/api/core/rag/extractor/jina_reader_extractor.py
Normal file
@@ -0,0 +1,42 @@
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
from services.website_service import WebsiteService


class JinaReaderWebExtractor(BaseExtractor):
    """
    Crawl and scrape websites and return content in clean llm-ready markdown.
    """

    def __init__(
        self,
        url: str,
        job_id: str,
        tenant_id: str,
        mode: str = "crawl",
        only_main_content: bool = False,
    ):
        """Initialize with url, api_key, base_url and mode."""
        self._url = url
        self.job_id = job_id
        self.tenant_id = tenant_id
        self.mode = mode
        self.only_main_content = only_main_content

    def extract(self) -> list[Document]:
        """Extract content from the URL."""
        documents = []
        if self.mode == "crawl":
            crawl_data = WebsiteService.get_crawl_url_data(self.job_id, "jinareader", self._url, self.tenant_id)
            if crawl_data is None:
                return []
            document = Document(
                page_content=crawl_data.get("content", ""),
                metadata={
                    "source_url": crawl_data.get("url"),
                    "description": crawl_data.get("description"),
                    "title": crawl_data.get("title"),
                },
            )
            documents.append(document)
        return documents
121
dify/api/core/rag/extractor/markdown_extractor.py
Normal file
@@ -0,0 +1,121 @@
"""Abstract interface for document loader implementations."""

import re
from pathlib import Path

from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.extractor.helpers import detect_file_encodings
from core.rag.models.document import Document


class MarkdownExtractor(BaseExtractor):
    """Load Markdown files.


    Args:
        file_path: Path to the file to load.
    """

    def __init__(
        self,
        file_path: str,
        remove_hyperlinks: bool = False,
        remove_images: bool = False,
        encoding: str | None = None,
        autodetect_encoding: bool = True,
    ):
        """Initialize with file path."""
        self._file_path = file_path
        self._remove_hyperlinks = remove_hyperlinks
        self._remove_images = remove_images
        self._encoding = encoding
        self._autodetect_encoding = autodetect_encoding

    def extract(self) -> list[Document]:
        """Load from file path."""
        tups = self.parse_tups(self._file_path)
        documents = []
        for header, value in tups:
            value = value.strip()
            if header is None:
                documents.append(Document(page_content=value))
            else:
                documents.append(Document(page_content=f"\n\n{header}\n{value}"))

        return documents

    def markdown_to_tups(self, markdown_text: str) -> list[tuple[str | None, str]]:
        """Convert markdown text into (header, text) tuples.

        The first element of each tuple is the header and the second is the text under that header.
        """
        markdown_tups: list[tuple[str | None, str]] = []
        lines = markdown_text.split("\n")

        current_header = None
        current_text = ""
        code_block_flag = False

        for line in lines:
            if line.startswith("```"):
                code_block_flag = not code_block_flag
                current_text += line + "\n"
                continue
            if code_block_flag:
                current_text += line + "\n"
                continue
            header_match = re.match(r"^#+\s", line)
            if header_match:
                markdown_tups.append((current_header, current_text))
                current_header = line
                current_text = ""
            else:
                current_text += line + "\n"
        markdown_tups.append((current_header, current_text))

        markdown_tups = [
            (re.sub(r"#", "", key).strip() if key else None, re.sub(r"<.*?>", "", value))
            for key, value in markdown_tups
        ]

        return markdown_tups

    def remove_images(self, content: str) -> str:
        """Remove image references of the form ![[...]] from markdown content."""
        pattern = r"!{1}\[\[(.*)\]\]"
        content = re.sub(pattern, "", content)
        return content

    def remove_hyperlinks(self, content: str) -> str:
        """Replace markdown hyperlinks with their anchor text."""
        pattern = r"\[(.*?)\]\((.*?)\)"
        content = re.sub(pattern, r"\1", content)
        return content

    def parse_tups(self, filepath: str) -> list[tuple[str | None, str]]:
        """Parse file into tuples."""
        content = ""
        try:
            content = Path(filepath).read_text(encoding=self._encoding)
        except UnicodeDecodeError as e:
            if self._autodetect_encoding:
                detected_encodings = detect_file_encodings(filepath)
                for encoding in detected_encodings:
                    try:
                        content = Path(filepath).read_text(encoding=encoding.encoding)
                        break
                    except UnicodeDecodeError:
                        continue
            else:
                raise RuntimeError(f"Error loading {filepath}") from e
        except Exception as e:
            raise RuntimeError(f"Error loading {filepath}") from e

        if self._remove_hyperlinks:
            content = self.remove_hyperlinks(content)

        if self._remove_images:
            content = self.remove_images(content)

        return self.markdown_to_tups(content)
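
A brief usage sketch of MarkdownExtractor (the path is a placeholder):

from core.rag.extractor.markdown_extractor import MarkdownExtractor

extractor = MarkdownExtractor("README.md", remove_hyperlinks=True, remove_images=True)
docs = extractor.extract()          # one Document per heading section, links reduced to their anchor text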
386
dify/api/core/rag/extractor/notion_extractor.py
Normal file
@@ -0,0 +1,386 @@
|
||||
import json
|
||||
import logging
|
||||
import operator
|
||||
from typing import Any, cast
|
||||
|
||||
import httpx
|
||||
|
||||
from configs import dify_config
|
||||
from core.rag.extractor.extractor_base import BaseExtractor
|
||||
from core.rag.models.document import Document
|
||||
from extensions.ext_database import db
|
||||
from models.dataset import Document as DocumentModel
|
||||
from services.datasource_provider_service import DatasourceProviderService
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
BLOCK_CHILD_URL_TMPL = "https://api.notion.com/v1/blocks/{block_id}/children"
|
||||
DATABASE_URL_TMPL = "https://api.notion.com/v1/databases/{database_id}/query"
|
||||
SEARCH_URL = "https://api.notion.com/v1/search"
|
||||
|
||||
RETRIEVE_PAGE_URL_TMPL = "https://api.notion.com/v1/pages/{page_id}"
|
||||
RETRIEVE_DATABASE_URL_TMPL = "https://api.notion.com/v1/databases/{database_id}"
|
||||
# if user want split by headings, use the corresponding splitter
|
||||
HEADING_SPLITTER = {
|
||||
"heading_1": "# ",
|
||||
"heading_2": "## ",
|
||||
"heading_3": "### ",
|
||||
}
|
||||
|
||||
|
||||
class NotionExtractor(BaseExtractor):
|
||||
def __init__(
|
||||
self,
|
||||
notion_workspace_id: str,
|
||||
notion_obj_id: str,
|
||||
notion_page_type: str,
|
||||
tenant_id: str,
|
||||
document_model: DocumentModel | None = None,
|
||||
notion_access_token: str | None = None,
|
||||
credential_id: str | None = None,
|
||||
):
|
||||
self._notion_access_token = None
|
||||
self._document_model = document_model
|
||||
self._notion_workspace_id = notion_workspace_id
|
||||
self._notion_obj_id = notion_obj_id
|
||||
self._notion_page_type = notion_page_type
|
||||
self._credential_id = credential_id
|
||||
if notion_access_token:
|
||||
self._notion_access_token = notion_access_token
|
||||
else:
|
||||
self._notion_access_token = self._get_access_token(tenant_id, self._credential_id)
|
||||
if not self._notion_access_token:
|
||||
integration_token = dify_config.NOTION_INTEGRATION_TOKEN
|
||||
if integration_token is None:
|
||||
raise ValueError(
|
||||
"Must specify `integration_token` or set environment variable `NOTION_INTEGRATION_TOKEN`."
|
||||
)
|
||||
|
||||
self._notion_access_token = integration_token
|
||||
|
||||
def extract(self) -> list[Document]:
|
||||
self.update_last_edited_time(self._document_model)
|
||||
|
||||
text_docs = self._load_data_as_documents(self._notion_obj_id, self._notion_page_type)
|
||||
|
||||
return text_docs
|
||||
|
||||
def _load_data_as_documents(self, notion_obj_id: str, notion_page_type: str) -> list[Document]:
|
||||
docs = []
|
||||
if notion_page_type == "database":
|
||||
# get all the pages in the database
|
||||
page_text_documents = self._get_notion_database_data(notion_obj_id)
|
||||
docs.extend(page_text_documents)
|
||||
elif notion_page_type == "page":
|
||||
page_text_list = self._get_notion_block_data(notion_obj_id)
|
||||
docs.append(Document(page_content="\n".join(page_text_list)))
|
||||
else:
|
||||
raise ValueError("notion page type not supported")
|
||||
|
||||
return docs
|
||||
|
||||
def _get_notion_database_data(self, database_id: str, query_dict: dict[str, Any] = {}) -> list[Document]:
|
||||
"""Get all the pages from a Notion database."""
|
||||
assert self._notion_access_token is not None, "Notion access token is required"
|
||||
|
||||
database_content = []
|
||||
next_cursor = None
|
||||
has_more = True
|
||||
|
||||
while has_more:
|
||||
current_query = query_dict.copy()
|
||||
if next_cursor:
|
||||
current_query["start_cursor"] = next_cursor
|
||||
|
||||
res = httpx.post(
|
||||
DATABASE_URL_TMPL.format(database_id=database_id),
|
||||
headers={
|
||||
"Authorization": "Bearer " + self._notion_access_token,
|
||||
"Content-Type": "application/json",
|
||||
"Notion-Version": "2022-06-28",
|
||||
},
|
||||
json=current_query,
|
||||
)
|
||||
|
||||
response_data = res.json()
|
||||
|
||||
if "results" not in response_data or response_data["results"] is None:
|
||||
break
|
||||
|
||||
for result in response_data["results"]:
|
||||
properties = result["properties"]
|
||||
data = {}
|
||||
value: Any
|
||||
for property_name, property_value in properties.items():
|
||||
type = property_value["type"]
|
||||
if type == "multi_select":
|
||||
value = []
|
||||
multi_select_list = property_value[type]
|
||||
for multi_select in multi_select_list:
|
||||
value.append(multi_select["name"])
|
||||
elif type in {"rich_text", "title"}:
|
||||
if len(property_value[type]) > 0:
|
||||
value = property_value[type][0]["plain_text"]
|
||||
else:
|
||||
value = ""
|
||||
elif type in {"select", "status"}:
|
||||
if property_value[type]:
|
||||
value = property_value[type]["name"]
|
||||
else:
|
||||
value = ""
|
||||
else:
|
||||
value = property_value[type]
|
||||
data[property_name] = value
|
||||
row_dict = {k: v for k, v in data.items() if v}
|
||||
row_content = ""
|
||||
for key, value in sorted(row_dict.items(), key=operator.itemgetter(0)):
|
||||
if isinstance(value, dict):
|
||||
value_dict = {k: v for k, v in value.items() if v}
|
||||
value_content = "".join(f"{k}:{v} " for k, v in value_dict.items())
|
||||
row_content = row_content + f"{key}:{value_content}\n"
|
||||
else:
|
||||
row_content = row_content + f"{key}:{value}\n"
|
||||
if "url" in result:
|
||||
row_content = row_content + f"Row Page URL:{result.get('url', '')}\n"
|
||||
database_content.append(row_content)
|
||||
|
||||
has_more = response_data.get("has_more", False)
|
||||
next_cursor = response_data.get("next_cursor")
|
||||
|
||||
if not database_content:
|
||||
return []
|
||||
|
||||
return [Document(page_content="\n".join(database_content))]
|
||||
|
||||
def _get_notion_block_data(self, page_id: str) -> list[str]:
|
||||
assert self._notion_access_token is not None, "Notion access token is required"
|
||||
result_lines_arr = []
|
||||
start_cursor = None
|
||||
block_url = BLOCK_CHILD_URL_TMPL.format(block_id=page_id)
|
||||
while True:
|
||||
query_dict: dict[str, Any] = {} if not start_cursor else {"start_cursor": start_cursor}
|
||||
try:
|
||||
res = httpx.request(
|
||||
"GET",
|
||||
block_url,
|
||||
headers={
|
||||
"Authorization": "Bearer " + self._notion_access_token,
|
||||
"Content-Type": "application/json",
|
||||
"Notion-Version": "2022-06-28",
|
||||
},
|
||||
params=query_dict,
|
||||
)
|
||||
if res.status_code != 200:
|
||||
raise ValueError(f"Error fetching Notion block data: {res.text}")
|
||||
data = res.json()
|
||||
except httpx.HTTPError as e:
|
||||
raise ValueError("Error fetching Notion block data") from e
|
||||
if "results" not in data or not isinstance(data["results"], list):
|
||||
raise ValueError("Error fetching Notion block data")
|
||||
for result in data["results"]:
|
||||
result_type = result["type"]
|
||||
result_obj = result[result_type]
|
||||
cur_result_text_arr = []
|
||||
if result_type == "table":
|
||||
result_block_id = result["id"]
|
||||
text = self._read_table_rows(result_block_id)
|
||||
text += "\n\n"
|
||||
result_lines_arr.append(text)
|
||||
else:
|
||||
if "rich_text" in result_obj:
|
||||
for rich_text in result_obj["rich_text"]:
|
||||
# skip if doesn't have text object
|
||||
if "text" in rich_text:
|
||||
text = rich_text["text"]["content"]
|
||||
cur_result_text_arr.append(text)
|
||||
|
||||
result_block_id = result["id"]
|
||||
has_children = result["has_children"]
|
||||
block_type = result["type"]
|
||||
if has_children and block_type != "child_page":
|
||||
children_text = self._read_block(result_block_id, num_tabs=1)
|
||||
cur_result_text_arr.append(children_text)
|
||||
|
||||
cur_result_text = "\n".join(cur_result_text_arr)
|
||||
if result_type in HEADING_SPLITTER:
|
||||
result_lines_arr.append(f"{HEADING_SPLITTER[result_type]}{cur_result_text}")
|
||||
else:
|
||||
result_lines_arr.append(cur_result_text + "\n\n")
|
||||
|
||||
if data["next_cursor"] is None:
|
||||
break
|
||||
else:
|
||||
start_cursor = data["next_cursor"]
|
||||
return result_lines_arr
|
||||
|
||||
def _read_block(self, block_id: str, num_tabs: int = 0) -> str:
|
||||
"""Read a block."""
|
||||
assert self._notion_access_token is not None, "Notion access token is required"
|
||||
result_lines_arr = []
|
||||
start_cursor = None
|
||||
block_url = BLOCK_CHILD_URL_TMPL.format(block_id=block_id)
|
||||
while True:
|
||||
query_dict: dict[str, Any] = {} if not start_cursor else {"start_cursor": start_cursor}
|
||||
|
||||
res = httpx.request(
|
||||
"GET",
|
||||
block_url,
|
||||
headers={
|
||||
"Authorization": "Bearer " + self._notion_access_token,
|
||||
"Content-Type": "application/json",
|
||||
"Notion-Version": "2022-06-28",
|
||||
},
|
||||
params=query_dict,
|
||||
)
|
||||
data = res.json()
|
||||
if "results" not in data or data["results"] is None:
|
||||
break
|
||||
for result in data["results"]:
|
||||
result_type = result["type"]
|
||||
result_obj = result[result_type]
|
||||
cur_result_text_arr = []
|
||||
if result_type == "table":
|
||||
result_block_id = result["id"]
|
||||
text = self._read_table_rows(result_block_id)
|
||||
result_lines_arr.append(text)
|
||||
else:
|
||||
if "rich_text" in result_obj:
|
||||
for rich_text in result_obj["rich_text"]:
|
||||
# skip if doesn't have text object
|
||||
if "text" in rich_text:
|
||||
text = rich_text["text"]["content"]
|
||||
prefix = "\t" * num_tabs
|
||||
cur_result_text_arr.append(prefix + text)
|
||||
result_block_id = result["id"]
|
||||
has_children = result["has_children"]
|
||||
block_type = result["type"]
|
||||
if has_children and block_type != "child_page":
|
||||
children_text = self._read_block(result_block_id, num_tabs=num_tabs + 1)
|
||||
cur_result_text_arr.append(children_text)
|
||||
|
||||
cur_result_text = "\n".join(cur_result_text_arr)
|
||||
if result_type in HEADING_SPLITTER:
|
||||
result_lines_arr.append(f"{HEADING_SPLITTER[result_type]}{cur_result_text}")
|
||||
else:
|
||||
result_lines_arr.append(cur_result_text + "\n\n")
|
||||
|
||||
if data["next_cursor"] is None:
|
||||
break
|
||||
else:
|
||||
start_cursor = data["next_cursor"]
|
||||
|
||||
result_lines = "\n".join(result_lines_arr)
|
||||
return result_lines
|
||||
|
||||
def _read_table_rows(self, block_id: str) -> str:
|
||||
"""Read table rows."""
|
||||
assert self._notion_access_token is not None, "Notion access token is required"
|
||||
done = False
|
||||
result_lines_arr = []
|
||||
start_cursor = None
|
||||
block_url = BLOCK_CHILD_URL_TMPL.format(block_id=block_id)
|
||||
        while not done:
            query_dict: dict[str, Any] = {} if not start_cursor else {"start_cursor": start_cursor}

            res = httpx.request(
                "GET",
                block_url,
                headers={
                    "Authorization": "Bearer " + self._notion_access_token,
                    "Content-Type": "application/json",
                    "Notion-Version": "2022-06-28",
                },
                params=query_dict,
            )
            data = res.json()
            # get table headers text
            table_header_cell_texts = []
            table_header_cells = data["results"][0]["table_row"]["cells"]
            for table_header_cell in table_header_cells:
                if table_header_cell:
                    for table_header_cell_text in table_header_cell:
                        text = table_header_cell_text["text"]["content"]
                        table_header_cell_texts.append(text)
                else:
                    table_header_cell_texts.append("")
            # Initialize Markdown table with headers
            markdown_table = "| " + " | ".join(table_header_cell_texts) + " |\n"
            markdown_table += "| " + " | ".join(["---"] * len(table_header_cell_texts)) + " |\n"

            # Process data to format each row in Markdown table format
            results = data["results"]
            for i in range(len(results) - 1):
                column_texts = []
                table_column_cells = data["results"][i + 1]["table_row"]["cells"]
                for j in range(len(table_column_cells)):
                    if table_column_cells[j]:
                        for table_column_cell_text in table_column_cells[j]:
                            column_text = table_column_cell_text["text"]["content"]
                            column_texts.append(column_text)
                # Add row to Markdown table
                markdown_table += "| " + " | ".join(column_texts) + " |\n"
            result_lines_arr.append(markdown_table)
            if data["next_cursor"] is None:
                done = True
                break
            else:
                start_cursor = data["next_cursor"]

        result_lines = "\n".join(result_lines_arr)
        return result_lines

    def update_last_edited_time(self, document_model: DocumentModel | None):
        if not document_model:
            return

        last_edited_time = self.get_notion_last_edited_time()
        data_source_info = document_model.data_source_info_dict
        if data_source_info:
            data_source_info["last_edited_time"] = last_edited_time

        db.session.query(DocumentModel).filter_by(id=document_model.id).update(
            {DocumentModel.data_source_info: json.dumps(data_source_info)}
        )  # type: ignore
        db.session.commit()

    def get_notion_last_edited_time(self) -> str:
        assert self._notion_access_token is not None, "Notion access token is required"
        obj_id = self._notion_obj_id
        page_type = self._notion_page_type
        if page_type == "database":
            retrieve_page_url = RETRIEVE_DATABASE_URL_TMPL.format(database_id=obj_id)
        else:
            retrieve_page_url = RETRIEVE_PAGE_URL_TMPL.format(page_id=obj_id)

        query_dict: dict[str, Any] = {}

        res = httpx.request(
            "GET",
            retrieve_page_url,
            headers={
                "Authorization": "Bearer " + self._notion_access_token,
                "Content-Type": "application/json",
                "Notion-Version": "2022-06-28",
            },
            json=query_dict,
        )

        data = res.json()
        return cast(str, data["last_edited_time"])

    @classmethod
    def _get_access_token(cls, tenant_id: str, credential_id: str | None) -> str:
        # get credential from tenant_id and credential_id
        if not credential_id:
            raise Exception(f"No credential id found for tenant {tenant_id}")
        datasource_provider_service = DatasourceProviderService()
        credential = datasource_provider_service.get_datasource_credentials(
            tenant_id=tenant_id,
            credential_id=credential_id,
            provider="notion_datasource",
            plugin_id="langgenius/notion_datasource",
        )
        if not credential:
            raise Exception(f"No notion credential found for tenant {tenant_id} and credential {credential_id}")

        return cast(str, credential["integration_secret"])
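For reference, a standalone sketch of the Markdown assembly performed by the table loop above, using invented cell texts rather than a live Notion response:

# Illustrative only: header row, divider row, then one data row per remaining result.
rows = [["Name", "Role"], ["Ada", "Engineer"]]  # hypothetical cell texts
markdown_table = "| " + " | ".join(rows[0]) + " |\n"
markdown_table += "| " + " | ".join(["---"] * len(rows[0])) + " |\n"
for row in rows[1:]:
    markdown_table += "| " + " | ".join(row) + " |\n"
# -> | Name | Role |
#    | --- | --- |
#    | Ada | Engineer |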
66
dify/api/core/rag/extractor/pdf_extractor.py
Normal file
@@ -0,0 +1,66 @@
"""Abstract interface for document loader implementations."""

import contextlib
from collections.abc import Iterator

from core.rag.extractor.blob.blob import Blob
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
from extensions.ext_storage import storage


class PdfExtractor(BaseExtractor):
    """Load pdf files.

    Args:
        file_path: Path to the file to load.
    """

    def __init__(self, file_path: str, file_cache_key: str | None = None):
        """Initialize with file path."""
        self._file_path = file_path
        self._file_cache_key = file_cache_key

    def extract(self) -> list[Document]:
        plaintext_file_exists = False
        if self._file_cache_key:
            with contextlib.suppress(FileNotFoundError):
                text = storage.load(self._file_cache_key).decode("utf-8")
                plaintext_file_exists = True
                return [Document(page_content=text)]
        documents = list(self.load())
        text_list = []
        for document in documents:
            text_list.append(document.page_content)
        text = "\n\n".join(text_list)

        # save plaintext file for caching
        if not plaintext_file_exists and self._file_cache_key:
            storage.save(self._file_cache_key, text.encode("utf-8"))

        return documents

    def load(
        self,
    ) -> Iterator[Document]:
        """Lazy load given path as pages."""
        blob = Blob.from_path(self._file_path)
        yield from self.parse(blob)

    def parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob."""
        import pypdfium2  # type: ignore

        with blob.as_bytes_io() as file_path:
            pdf_reader = pypdfium2.PdfDocument(file_path, autoclose=True)
            try:
                for page_number, page in enumerate(pdf_reader):
                    text_page = page.get_textpage()
                    content = text_page.get_text_range()
                    text_page.close()
                    page.close()
                    metadata = {"source": blob.source, "page": page_number}
                    yield Document(page_content=content, metadata=metadata)
            finally:
                pdf_reader.close()
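A brief usage sketch; the file path and cache key are hypothetical, and the cache path assumes the storage extension is configured:

# Illustrative usage sketch (hypothetical path and cache key).
extractor = PdfExtractor("/tmp/example.pdf", file_cache_key="plaintext/example.txt")
docs = extractor.extract()          # parses every page and caches the joined plaintext
for page_doc in extractor.load():   # or consume pages lazily
    print(page_doc.metadata["page"], len(page_doc.page_content))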
48
dify/api/core/rag/extractor/text_extractor.py
Normal file
@@ -0,0 +1,48 @@
"""Abstract interface for document loader implementations."""

from pathlib import Path

from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.extractor.helpers import detect_file_encodings
from core.rag.models.document import Document


class TextExtractor(BaseExtractor):
    """Load text files.

    Args:
        file_path: Path to the file to load.
    """

    def __init__(self, file_path: str, encoding: str | None = None, autodetect_encoding: bool = False):
        """Initialize with file path."""
        self._file_path = file_path
        self._encoding = encoding
        self._autodetect_encoding = autodetect_encoding

    def extract(self) -> list[Document]:
        """Load from file path."""
        text = ""
        try:
            text = Path(self._file_path).read_text(encoding=self._encoding)
        except UnicodeDecodeError as e:
            if self._autodetect_encoding:
                detected_encodings = detect_file_encodings(self._file_path)
                for encoding in detected_encodings:
                    try:
                        text = Path(self._file_path).read_text(encoding=encoding.encoding)
                        break
                    except UnicodeDecodeError:
                        continue
                else:
                    raise RuntimeError(
                        f"Decode failed: {self._file_path}, all detected encodings failed. Original error: {e}"
                    )
            else:
                raise RuntimeError(f"Decode failed: {self._file_path}, specified encoding failed. Original error: {e}")
        except Exception as e:
            raise RuntimeError(f"Error loading {self._file_path}") from e

        metadata = {"source": self._file_path}
        return [Document(page_content=text, metadata=metadata)]
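A brief usage sketch; the file path is hypothetical and the autodetect path only triggers on decode errors:

# Illustrative usage sketch (hypothetical path); falls back to detected encodings if UTF-8 fails.
extractor = TextExtractor("/tmp/notes.txt", autodetect_encoding=True)
docs = extractor.extract()
print(docs[0].metadata["source"], len(docs[0].page_content))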
@@ -0,0 +1,59 @@
import logging
import os

from configs import dify_config
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document

logger = logging.getLogger(__name__)


class UnstructuredWordExtractor(BaseExtractor):
    """Loader that uses unstructured to load word documents."""

    def __init__(self, file_path: str, api_url: str, api_key: str = ""):
        """Initialize with file path."""
        self._file_path = file_path
        self._api_url = api_url
        self._api_key = api_key

    def extract(self) -> list[Document]:
        from unstructured.__version__ import __version__ as __unstructured_version__
        from unstructured.file_utils.filetype import FileType, detect_filetype

        unstructured_version = tuple(int(x) for x in __unstructured_version__.split("."))
        # check the file extension
        try:
            import magic  # noqa: F401

            is_doc = detect_filetype(self._file_path) == FileType.DOC
        except ImportError:
            _, extension = os.path.splitext(str(self._file_path))
            is_doc = extension == ".doc"

        if is_doc and unstructured_version < (0, 4, 11):
            raise ValueError(
                f"You are on unstructured version {__unstructured_version__}. "
                "Partitioning .doc files is only supported in unstructured>=0.4.11. "
                "Please upgrade the unstructured package and try again."
            )

        if is_doc:
            from unstructured.partition.api import partition_via_api

            elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)

        else:
            from unstructured.partition.docx import partition_docx

            elements = partition_docx(filename=self._file_path)

        from unstructured.chunking.title import chunk_by_title

        max_characters = dify_config.INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH
        chunks = chunk_by_title(elements, max_characters=max_characters, combine_text_under_n_chars=max_characters)
        documents = []
        for chunk in chunks:
            text = chunk.text.strip()
            documents.append(Document(page_content=text))
        return documents
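A small standalone sketch of the version gate used above; the installed version string is invented for illustration:

# Illustrative only: .doc partitioning requires unstructured >= 0.4.11.
__unstructured_version__ = "0.10.30"  # hypothetical installed version
unstructured_version = tuple(int(x) for x in __unstructured_version__.split("."))
print(unstructured_version >= (0, 4, 11))  # True -> .doc files can be partitioned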
@@ -0,0 +1,56 @@
import base64
import contextlib
import logging

from bs4 import BeautifulSoup

from configs import dify_config
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document

logger = logging.getLogger(__name__)


class UnstructuredEmailExtractor(BaseExtractor):
    """Load eml files.

    Args:
        file_path: Path to the file to load.
    """

    def __init__(self, file_path: str, api_url: str | None = None, api_key: str = ""):
        """Initialize with file path."""
        self._file_path = file_path
        self._api_url = api_url
        self._api_key = api_key

    def extract(self) -> list[Document]:
        if self._api_url:
            from unstructured.partition.api import partition_via_api

            elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
        else:
            from unstructured.partition.email import partition_email

            elements = partition_email(filename=self._file_path)

        # noinspection PyBroadException
        with contextlib.suppress(Exception):
            for element in elements:
                element_text = element.text.strip()

                padding_needed = 4 - len(element_text) % 4
                element_text += "=" * padding_needed

                element_decode = base64.b64decode(element_text)
                soup = BeautifulSoup(element_decode.decode("utf-8"), "html.parser")
                element.text = soup.get_text()

        from unstructured.chunking.title import chunk_by_title

        max_characters = dify_config.INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH
        chunks = chunk_by_title(elements, max_characters=max_characters, combine_text_under_n_chars=max_characters)
        documents = []
        for chunk in chunks:
            text = chunk.text.strip()
            documents.append(Document(page_content=text))
        return documents
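For reference, the padding arithmetic above in isolation; the sample base64 string is invented:

# Illustrative only: pad a base64 payload to a multiple of 4 characters before decoding.
import base64

element_text = "aGVsbG8"                      # hypothetical 7-char base64 body
padding_needed = 4 - len(element_text) % 4    # 4 - 3 = 1
element_text += "=" * padding_needed          # "aGVsbG8="
print(base64.b64decode(element_text))         # b'hello'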
@@ -0,0 +1,51 @@
import logging

import pypandoc  # type: ignore

from configs import dify_config
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document

logger = logging.getLogger(__name__)


class UnstructuredEpubExtractor(BaseExtractor):
    """Load epub files.

    Args:
        file_path: Path to the file to load.
    """

    def __init__(
        self,
        file_path: str,
        api_url: str | None = None,
        api_key: str = "",
    ):
        """Initialize with file path."""
        self._file_path = file_path
        self._api_url = api_url
        self._api_key = api_key

    def extract(self) -> list[Document]:
        if self._api_url:
            from unstructured.partition.api import partition_via_api

            elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
        else:
            from unstructured.partition.epub import partition_epub

            pypandoc.download_pandoc()
            elements = partition_epub(filename=self._file_path, xml_keep_tags=True)

        from unstructured.chunking.title import chunk_by_title

        max_characters = dify_config.INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH
        chunks = chunk_by_title(elements, max_characters=max_characters, combine_text_under_n_chars=max_characters)
        documents = []
        for chunk in chunks:
            text = chunk.text.strip()
            documents.append(Document(page_content=text))

        return documents
@@ -0,0 +1,43 @@
import logging

from configs import dify_config
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document

logger = logging.getLogger(__name__)


class UnstructuredMarkdownExtractor(BaseExtractor):
    """Load md files.

    Args:
        file_path: Path to the file to load.
    """

    def __init__(self, file_path: str, api_url: str | None = None, api_key: str = ""):
        """Initialize with file path."""
        self._file_path = file_path
        self._api_url = api_url
        self._api_key = api_key

    def extract(self) -> list[Document]:
        if self._api_url:
            from unstructured.partition.api import partition_via_api

            elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
        else:
            from unstructured.partition.md import partition_md

            elements = partition_md(filename=self._file_path)
        from unstructured.chunking.title import chunk_by_title

        max_characters = dify_config.INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH
        chunks = chunk_by_title(elements, max_characters=max_characters, combine_text_under_n_chars=max_characters)
        documents = []
        for chunk in chunks:
            text = chunk.text.strip()
            documents.append(Document(page_content=text))

        return documents
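Since the unstructured-based extractors all share the same chunking step, here is that step in isolation as a minimal sketch; the markdown path and the 500-character limit are hypothetical stand-ins for dify_config.INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH:

# Illustrative only: the shared partition + chunk_by_title step, run outside the extractor classes.
from unstructured.partition.md import partition_md
from unstructured.chunking.title import chunk_by_title

elements = partition_md(filename="/tmp/guide.md")  # hypothetical file
chunks = chunk_by_title(elements, max_characters=500, combine_text_under_n_chars=500)
texts = [chunk.text.strip() for chunk in chunks]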
@@ -0,0 +1,42 @@
import logging

from configs import dify_config
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document

logger = logging.getLogger(__name__)


class UnstructuredMsgExtractor(BaseExtractor):
    """Load msg files.

    Args:
        file_path: Path to the file to load.
    """

    def __init__(self, file_path: str, api_url: str | None = None, api_key: str = ""):
        """Initialize with file path."""
        self._file_path = file_path
        self._api_url = api_url
        self._api_key = api_key

    def extract(self) -> list[Document]:
        if self._api_url:
            from unstructured.partition.api import partition_via_api

            elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
        else:
            from unstructured.partition.msg import partition_msg

            elements = partition_msg(filename=self._file_path)
        from unstructured.chunking.title import chunk_by_title

        max_characters = dify_config.INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH
        chunks = chunk_by_title(elements, max_characters=max_characters, combine_text_under_n_chars=max_characters)
        documents = []
        for chunk in chunks:
            text = chunk.text.strip()
            documents.append(Document(page_content=text))

        return documents
@@ -0,0 +1,46 @@
import logging

from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document

logger = logging.getLogger(__name__)


class UnstructuredPPTExtractor(BaseExtractor):
    """Load ppt files.

    Args:
        file_path: Path to the file to load.
    """

    def __init__(self, file_path: str, api_url: str | None = None, api_key: str = ""):
        """Initialize with file path."""
        self._file_path = file_path
        self._api_url = api_url
        self._api_key = api_key

    def extract(self) -> list[Document]:
        if self._api_url:
            from unstructured.partition.api import partition_via_api

            elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
        else:
            raise NotImplementedError("Unstructured API Url is not configured")
        text_by_page: dict[int, str] = {}
        for element in elements:
            page = element.metadata.page_number
            if page is None:
                continue
            text = element.text
            if page in text_by_page:
                text_by_page[page] += "\n" + text
            else:
                text_by_page[page] = text

        combined_texts = list(text_by_page.values())
        documents = []
        for combined_text in combined_texts:
            text = combined_text.strip()
            documents.append(Document(page_content=text))
        return documents
@@ -0,0 +1,48 @@
import logging

from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document

logger = logging.getLogger(__name__)


class UnstructuredPPTXExtractor(BaseExtractor):
    """Load pptx files.

    Args:
        file_path: Path to the file to load.
    """

    def __init__(self, file_path: str, api_url: str | None = None, api_key: str = ""):
        """Initialize with file path."""
        self._file_path = file_path
        self._api_url = api_url
        self._api_key = api_key

    def extract(self) -> list[Document]:
        if self._api_url:
            from unstructured.partition.api import partition_via_api

            elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
        else:
            from unstructured.partition.pptx import partition_pptx

            elements = partition_pptx(filename=self._file_path)
        text_by_page: dict[int, str] = {}
        for element in elements:
            page = element.metadata.page_number
            text = element.text
            if page is not None:
                if page in text_by_page:
                    text_by_page[page] += "\n" + text
                else:
                    text_by_page[page] = text

        combined_texts = list(text_by_page.values())
        documents = []
        for combined_text in combined_texts:
            text = combined_text.strip()
            documents.append(Document(page_content=text))

        return documents
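For reference, the per-page grouping used by the PPT and PPTX extractors in isolation; the element list is faked with simple (page, text) tuples rather than unstructured elements:

# Illustrative only: group text by page number; elements without a page number are skipped.
elements = [(1, "Title slide"), (1, "Subtitle"), (2, "Agenda"), (None, "Speaker notes")]
text_by_page: dict[int, str] = {}
for page, text in elements:
    if page is None:
        continue
    text_by_page[page] = text if page not in text_by_page else text_by_page[page] + "\n" + text
combined = [t.strip() for t in text_by_page.values()]  # ['Title slide\nSubtitle', 'Agenda']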
@@ -0,0 +1,43 @@
import logging

from configs import dify_config
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document

logger = logging.getLogger(__name__)


class UnstructuredXmlExtractor(BaseExtractor):
    """Load xml files.

    Args:
        file_path: Path to the file to load.
    """

    def __init__(self, file_path: str, api_url: str | None = None, api_key: str = ""):
        """Initialize with file path."""
        self._file_path = file_path
        self._api_url = api_url
        self._api_key = api_key

    def extract(self) -> list[Document]:
        if self._api_url:
            from unstructured.partition.api import partition_via_api

            elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
        else:
            from unstructured.partition.xml import partition_xml

            elements = partition_xml(filename=self._file_path, xml_keep_tags=True)

        from unstructured.chunking.title import chunk_by_title

        max_characters = dify_config.INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH
        chunks = chunk_by_title(elements, max_characters=max_characters, combine_text_under_n_chars=max_characters)
        documents = []
        for chunk in chunks:
            text = chunk.text.strip()
            documents.append(Document(page_content=text))

        return documents
199
dify/api/core/rag/extractor/watercrawl/client.py
Normal file
@@ -0,0 +1,199 @@
import json
from collections.abc import Generator
from typing import Union
from urllib.parse import urljoin

import httpx
from httpx import Response

from core.rag.extractor.watercrawl.exceptions import (
    WaterCrawlAuthenticationError,
    WaterCrawlBadRequestError,
    WaterCrawlPermissionError,
)


class BaseAPIClient:
    def __init__(self, api_key, base_url):
        self.api_key = api_key
        self.base_url = base_url
        self.session = self.init_session()

    def init_session(self):
        headers = {
            "X-API-Key": self.api_key,
            "Content-Type": "application/json",
            "Accept": "application/json",
            "User-Agent": "WaterCrawl-Plugin",
            "Accept-Language": "en-US",
        }
        return httpx.Client(headers=headers, timeout=None)

    def _request(
        self,
        method: str,
        endpoint: str,
        query_params: dict | None = None,
        data: dict | None = None,
        **kwargs,
    ) -> Response:
        stream = kwargs.pop("stream", False)
        url = urljoin(self.base_url, endpoint)
        if stream:
            request = self.session.build_request(method, url, params=query_params, json=data)
            return self.session.send(request, stream=True, **kwargs)

        return self.session.request(method, url, params=query_params, json=data, **kwargs)

    def _get(self, endpoint: str, query_params: dict | None = None, **kwargs):
        return self._request("GET", endpoint, query_params=query_params, **kwargs)

    def _post(self, endpoint: str, query_params: dict | None = None, data: dict | None = None, **kwargs):
        return self._request("POST", endpoint, query_params=query_params, data=data, **kwargs)

    def _put(self, endpoint: str, query_params: dict | None = None, data: dict | None = None, **kwargs):
        return self._request("PUT", endpoint, query_params=query_params, data=data, **kwargs)

    def _delete(self, endpoint: str, query_params: dict | None = None, **kwargs):
        return self._request("DELETE", endpoint, query_params=query_params, **kwargs)

    def _patch(self, endpoint: str, query_params: dict | None = None, data: dict | None = None, **kwargs):
        return self._request("PATCH", endpoint, query_params=query_params, data=data, **kwargs)


class WaterCrawlAPIClient(BaseAPIClient):
    def __init__(self, api_key, base_url: str | None = "https://app.watercrawl.dev/"):
        super().__init__(api_key, base_url)

    def process_eventstream(self, response: Response, download: bool = False) -> Generator:
        try:
            for raw_line in response.iter_lines():
                line = raw_line.decode("utf-8") if isinstance(raw_line, bytes) else raw_line
                if line.startswith("data:"):
                    line = line[5:].strip()
                    data = json.loads(line)
                    if data["type"] == "result" and download:
                        data["data"] = self.download_result(data["data"])
                    yield data
        finally:
            response.close()

    def process_response(self, response: Response) -> dict | bytes | list | None | Generator:
        if response.status_code == 401:
            raise WaterCrawlAuthenticationError(response)

        if response.status_code == 403:
            raise WaterCrawlPermissionError(response)

        if 400 <= response.status_code < 500:
            raise WaterCrawlBadRequestError(response)

        response.raise_for_status()
        if response.status_code == 204:
            return None
        if response.headers.get("Content-Type") == "application/json":
            return response.json() or {}

        if response.headers.get("Content-Type") == "application/octet-stream":
            return response.content

        if response.headers.get("Content-Type") == "text/event-stream":
            return self.process_eventstream(response)

        raise Exception(f"Unknown response type: {response.headers.get('Content-Type')}")

    def get_crawl_requests_list(self, page: int | None = None, page_size: int | None = None):
        query_params = {"page": page or 1, "page_size": page_size or 10}
        return self.process_response(
            self._get(
                "/api/v1/core/crawl-requests/",
                query_params=query_params,
            )
        )

    def get_crawl_request(self, item_id: str):
        return self.process_response(
            self._get(
                f"/api/v1/core/crawl-requests/{item_id}/",
            )
        )

    def create_crawl_request(
        self,
        url: Union[list, str] | None = None,
        spider_options: dict | None = None,
        page_options: dict | None = None,
        plugin_options: dict | None = None,
    ):
        data = {
            # 'urls': url if isinstance(url, list) else [url],
            "url": url,
            "options": {
                "spider_options": spider_options or {},
                "page_options": page_options or {},
                "plugin_options": plugin_options or {},
            },
        }
        return self.process_response(
            self._post(
                "/api/v1/core/crawl-requests/",
                data=data,
            )
        )

    def stop_crawl_request(self, item_id: str):
        return self.process_response(
            self._delete(
                f"/api/v1/core/crawl-requests/{item_id}/",
            )
        )

    def download_crawl_request(self, item_id: str):
        return self.process_response(
            self._get(
                f"/api/v1/core/crawl-requests/{item_id}/download/",
            )
        )

    def monitor_crawl_request(self, item_id: str, prefetched=False) -> Generator:
        query_params = {"prefetched": str(prefetched).lower()}
        generator = self.process_response(
            self._get(f"/api/v1/core/crawl-requests/{item_id}/status/", stream=True, query_params=query_params),
        )
        if not isinstance(generator, Generator):
            raise ValueError("Generator expected")
        yield from generator

    def get_crawl_request_results(
        self, item_id: str, page: int = 1, page_size: int = 25, query_params: dict | None = None
    ):
        query_params = query_params or {}
        query_params.update({"page": page or 1, "page_size": page_size or 25})
        return self.process_response(
            self._get(f"/api/v1/core/crawl-requests/{item_id}/results/", query_params=query_params)
        )

    def scrape_url(
        self,
        url: str,
        page_options: dict | None = None,
        plugin_options: dict | None = None,
        sync: bool = True,
        prefetched: bool = True,
    ):
        response_result = self.create_crawl_request(url=url, page_options=page_options, plugin_options=plugin_options)
        if not sync:
            return response_result

        for event_data in self.monitor_crawl_request(response_result["uuid"], prefetched):
            if event_data["type"] == "result":
                return event_data["data"]

    def download_result(self, result_object: dict):
        response = httpx.get(result_object["result"], timeout=None)
        try:
            response.raise_for_status()
            result_object["result"] = response.json()
        finally:
            response.close()
        return result_object
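A minimal usage sketch of the client above; the API key is a placeholder and the target URL is only an example, not a live crawl:

# Illustrative usage sketch: synchronous scrape via the WaterCrawl client.
client = WaterCrawlAPIClient(api_key="wc-xxxxxxxx")  # placeholder credential
result = client.scrape_url("https://example.com", sync=True, prefetched=True)
# With prefetched=True the final "result" event should already carry the page payload,
# so `result` is the result object that the provider later flattens into markdown/metadata.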
32
dify/api/core/rag/extractor/watercrawl/exceptions.py
Normal file
@@ -0,0 +1,32 @@
import json


class WaterCrawlError(Exception):
    pass


class WaterCrawlBadRequestError(WaterCrawlError):
    def __init__(self, response):
        self.status_code = response.status_code
        self.response = response
        data = response.json()
        self.message = data.get("message", "Unknown error occurred")
        self.errors = data.get("errors", {})
        super().__init__(self.message)

    @property
    def flat_errors(self):
        return json.dumps(self.errors)

    def __str__(self):
        return f"WaterCrawlBadRequestError: {self.message} \n {self.flat_errors}"


class WaterCrawlPermissionError(WaterCrawlBadRequestError):
    def __str__(self):
        return f"You are exceeding your WaterCrawl API limits. {self.message}"


class WaterCrawlAuthenticationError(WaterCrawlBadRequestError):
    def __str__(self):
        return "WaterCrawl API key is invalid or expired. Please check your API key and try again."
64
dify/api/core/rag/extractor/watercrawl/extractor.py
Normal file
@@ -0,0 +1,64 @@
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
from services.website_service import WebsiteService


class WaterCrawlWebExtractor(BaseExtractor):
    """
    Crawl and scrape websites and return content in clean llm-ready markdown.

    Args:
        url: The URL to scrape.
        api_key: The API key for WaterCrawl.
        base_url: The base URL for the WaterCrawl API. Defaults to 'https://app.watercrawl.dev/'.
        mode: The mode of operation. Defaults to 'scrape'. Options are 'crawl', 'scrape' and 'crawl_return_urls'.
        only_main_content: Only return the main content of the page excluding headers, navs, footers, etc.
    """

    def __init__(
        self,
        url: str,
        job_id: str,
        tenant_id: str,
        mode: str = "crawl",
        only_main_content: bool = True,
    ):
        """Initialize with url, job_id, tenant_id and mode."""
        self._url = url
        self.job_id = job_id
        self.tenant_id = tenant_id
        self.mode = mode
        self.only_main_content = only_main_content

    def extract(self) -> list[Document]:
        """Extract content from the URL."""
        documents = []
        if self.mode == "crawl":
            crawl_data = WebsiteService.get_crawl_url_data(self.job_id, "watercrawl", self._url, self.tenant_id)
            if crawl_data is None:
                return []
            document = Document(
                page_content=crawl_data.get("markdown", ""),
                metadata={
                    "source_url": crawl_data.get("source_url"),
                    "description": crawl_data.get("description"),
                    "title": crawl_data.get("title"),
                },
            )
            documents.append(document)
        elif self.mode == "scrape":
            scrape_data = WebsiteService.get_scrape_url_data(
                "watercrawl", self._url, self.tenant_id, self.only_main_content
            )

            document = Document(
                page_content=scrape_data.get("markdown", ""),
                metadata={
                    "source_url": scrape_data.get("source_url"),
                    "description": scrape_data.get("description"),
                    "title": scrape_data.get("title"),
                },
            )
            documents.append(document)
        return documents
117
dify/api/core/rag/extractor/watercrawl/provider.py
Normal file
@@ -0,0 +1,117 @@
from collections.abc import Generator
from datetime import datetime
from typing import Any

from core.rag.extractor.watercrawl.client import WaterCrawlAPIClient


class WaterCrawlProvider:
    def __init__(self, api_key, base_url: str | None = None):
        self.client = WaterCrawlAPIClient(api_key, base_url)

    def crawl_url(self, url, options: dict | Any | None = None):
        options = options or {}
        spider_options = {
            "max_depth": 1,
            "page_limit": 1,
            "allowed_domains": [],
            "exclude_paths": [],
            "include_paths": [],
        }
        if options.get("crawl_sub_pages", True):
            spider_options["page_limit"] = options.get("limit", 1)
            spider_options["max_depth"] = options.get("max_depth", 1)
            spider_options["include_paths"] = options.get("includes", "").split(",") if options.get("includes") else []
            spider_options["exclude_paths"] = options.get("excludes", "").split(",") if options.get("excludes") else []

        wait_time = options.get("wait_time", 1000)
        page_options = {
            "exclude_tags": options.get("exclude_tags", "").split(",") if options.get("exclude_tags") else [],
            "include_tags": options.get("include_tags", "").split(",") if options.get("include_tags") else [],
            "wait_time": max(1000, wait_time),  # minimum wait time is 1 second
            "include_html": False,
            "only_main_content": options.get("only_main_content", True),
            "include_links": False,
            "timeout": 15000,
            "accept_cookies_selector": "#cookies-accept",
            "locale": "en-US",
            "actions": [],
        }
        result = self.client.create_crawl_request(url=url, spider_options=spider_options, page_options=page_options)

        return {"status": "active", "job_id": result.get("uuid")}

    def get_crawl_status(self, crawl_request_id):
        response = self.client.get_crawl_request(crawl_request_id)
        data = []
        if response["status"] in ["new", "running"]:
            status = "active"
        else:
            status = "completed"
            data = list(self._get_results(crawl_request_id))

        time_str = response.get("duration")
        time_consuming: float = 0
        if time_str:
            time_obj = datetime.strptime(time_str, "%H:%M:%S.%f")
            time_consuming = (
                time_obj.hour * 3600 + time_obj.minute * 60 + time_obj.second + time_obj.microsecond / 1_000_000
            )

        return {
            "status": status,
            "job_id": response.get("uuid"),
            "total": response.get("options", {}).get("spider_options", {}).get("page_limit", 1),
            "current": response.get("number_of_documents", 0),
            "data": data,
            "time_consuming": time_consuming,
        }

    def get_crawl_url_data(self, job_id, url) -> dict | None:
        if not job_id:
            return self.scrape_url(url)

        for result in self._get_results(
            job_id,
            {
                # filter by url
                "url": url
            },
        ):
            return result

        return None

    def scrape_url(self, url: str):
        response = self.client.scrape_url(url=url, sync=True, prefetched=True)
        return self._structure_data(response)

    def _structure_data(self, result_object: dict):
        if isinstance(result_object.get("result", {}), str):
            raise ValueError("Invalid result object. Expected a dictionary.")

        metadata = result_object.get("result", {}).get("metadata", {})
        return {
            "title": metadata.get("og:title") or metadata.get("title"),
            "description": metadata.get("description"),
            "source_url": result_object.get("url"),
            "markdown": result_object.get("result", {}).get("markdown"),
        }

    def _get_results(self, crawl_request_id: str, query_params: dict | None = None) -> Generator[dict, None, None]:
        page = 0
        page_size = 100

        query_params = query_params or {}
        query_params.update({"prefetched": "true"})
        while True:
            page += 1
            response = self.client.get_crawl_request_results(crawl_request_id, page, page_size, query_params)
            if not response["results"]:
                break

            for result in response["results"]:
                yield self._structure_data(result)

            if response["next"] is None:
                break
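A quick worked example of the duration-to-seconds conversion in get_crawl_status above; the duration string is invented:

# Illustrative only: convert a WaterCrawl "duration" string to seconds.
from datetime import datetime

time_obj = datetime.strptime("0:02:07.500000", "%H:%M:%S.%f")  # hypothetical value
seconds = time_obj.hour * 3600 + time_obj.minute * 60 + time_obj.second + time_obj.microsecond / 1_000_000
print(seconds)  # 127.5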
295
dify/api/core/rag/extractor/word_extractor.py
Normal file
@@ -0,0 +1,295 @@
"""Abstract interface for document loader implementations."""

import logging
import mimetypes
import os
import re
import tempfile
import uuid
from urllib.parse import urlparse
from xml.etree import ElementTree

import httpx
from docx import Document as DocxDocument

from configs import dify_config
from core.helper import ssrf_proxy
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
from extensions.ext_database import db
from extensions.ext_storage import storage
from libs.datetime_utils import naive_utc_now
from models.enums import CreatorUserRole
from models.model import UploadFile

logger = logging.getLogger(__name__)


class WordExtractor(BaseExtractor):
    """Load docx files.

    Args:
        file_path: Path to the file to load.
    """

    def __init__(self, file_path: str, tenant_id: str, user_id: str):
        """Initialize with file path."""
        self.file_path = file_path
        self.tenant_id = tenant_id
        self.user_id = user_id

        if "~" in self.file_path:
            self.file_path = os.path.expanduser(self.file_path)

        # If the file is a web path, download it to a temporary file, and use that
        if not os.path.isfile(self.file_path) and self._is_valid_url(self.file_path):
            response = httpx.get(self.file_path, timeout=None)

            if response.status_code != 200:
                response.close()
                raise ValueError(f"Check the url of your file; returned status code {response.status_code}")

            self.web_path = self.file_path
            # TODO: use a better way to handle the file
            self.temp_file = tempfile.NamedTemporaryFile()  # noqa SIM115
            try:
                self.temp_file.write(response.content)
            finally:
                response.close()
            self.file_path = self.temp_file.name
        elif not os.path.isfile(self.file_path):
            raise ValueError(f"File path {self.file_path} is not a valid file or url")

    def __del__(self):
        if hasattr(self, "temp_file"):
            self.temp_file.close()

    def extract(self) -> list[Document]:
        """Load given path as single page."""
        content = self.parse_docx(self.file_path)
        return [
            Document(
                page_content=content,
                metadata={"source": self.file_path},
            )
        ]

    @staticmethod
    def _is_valid_url(url: str) -> bool:
        """Check if the url is valid."""
        parsed = urlparse(url)
        return bool(parsed.netloc) and bool(parsed.scheme)

    def _extract_images_from_docx(self, doc):
        image_count = 0
        image_map = {}

        for rel in doc.part.rels.values():
            if "image" in rel.target_ref:
                image_count += 1
                if rel.is_external:
                    url = rel.target_ref
                    response = ssrf_proxy.get(url)
                    if response.status_code == 200:
                        image_ext = mimetypes.guess_extension(response.headers["Content-Type"])
                        if image_ext is None:
                            continue
                        file_uuid = str(uuid.uuid4())
                        file_key = "image_files/" + self.tenant_id + "/" + file_uuid + "." + image_ext
                        mime_type, _ = mimetypes.guess_type(file_key)
                        storage.save(file_key, response.content)
                    else:
                        continue
                else:
                    image_ext = rel.target_ref.split(".")[-1]
                    if image_ext is None:
                        continue
                    # user uuid as file name
                    file_uuid = str(uuid.uuid4())
                    file_key = "image_files/" + self.tenant_id + "/" + file_uuid + "." + image_ext
                    mime_type, _ = mimetypes.guess_type(file_key)

                    storage.save(file_key, rel.target_part.blob)
                # save file to db
                upload_file = UploadFile(
                    tenant_id=self.tenant_id,
                    storage_type=dify_config.STORAGE_TYPE,
                    key=file_key,
                    name=file_key,
                    size=0,
                    extension=str(image_ext),
                    mime_type=mime_type or "",
                    created_by=self.user_id,
                    created_by_role=CreatorUserRole.ACCOUNT,
                    created_at=naive_utc_now(),
                    used=True,
                    used_by=self.user_id,
                    used_at=naive_utc_now(),
                )

                db.session.add(upload_file)
                db.session.commit()
                image_map[rel.target_part] = f""

        return image_map

    def _table_to_markdown(self, table, image_map):
        markdown = []
        # calculate the total number of columns
        total_cols = max(len(row.cells) for row in table.rows)

        header_row = table.rows[0]
        headers = self._parse_row(header_row, image_map, total_cols)
        markdown.append("| " + " | ".join(headers) + " |")
        markdown.append("| " + " | ".join(["---"] * total_cols) + " |")

        for row in table.rows[1:]:
            row_cells = self._parse_row(row, image_map, total_cols)
            markdown.append("| " + " | ".join(row_cells) + " |")
        return "\n".join(markdown)

    def _parse_row(self, row, image_map, total_cols):
        # Initialize a row, all of which are empty by default
        row_cells = [""] * total_cols
        col_index = 0
        while col_index < len(row.cells):
            # make sure the col_index is not out of range
            while col_index < len(row.cells) and row_cells[col_index] != "":
                col_index += 1
            # if col_index is out of range the loop is jumped
            if col_index >= len(row.cells):
                break
            # get the correct cell
            cell = row.cells[col_index]
            cell_content = self._parse_cell(cell, image_map).strip()
            cell_colspan = cell.grid_span or 1
            for i in range(cell_colspan):
                if col_index + i < total_cols:
                    row_cells[col_index + i] = cell_content if i == 0 else ""
            col_index += cell_colspan
        return row_cells

    def _parse_cell(self, cell, image_map):
        cell_content = []
        for paragraph in cell.paragraphs:
            parsed_paragraph = self._parse_cell_paragraph(paragraph, image_map)
            if parsed_paragraph:
                cell_content.append(parsed_paragraph)
        unique_content = list(dict.fromkeys(cell_content))
        return " ".join(unique_content)

    def _parse_cell_paragraph(self, paragraph, image_map):
        paragraph_content = []
        for run in paragraph.runs:
            if run.element.xpath(".//a:blip"):
                for blip in run.element.xpath(".//a:blip"):
                    image_id = blip.get("{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed")
                    if not image_id:
                        continue
                    image_part = paragraph.part.rels[image_id].target_part

                    if image_part in image_map:
                        image_link = image_map[image_part]
                        paragraph_content.append(image_link)
            else:
                paragraph_content.append(run.text)
        return "".join(paragraph_content).strip()

    def parse_docx(self, docx_path):
        doc = DocxDocument(docx_path)

        content = []

        image_map = self._extract_images_from_docx(doc)

        hyperlinks_url = None
        url_pattern = re.compile(r"http://[^\s+]+//|https://[^\s+]+")
        for para in doc.paragraphs:
            for run in para.runs:
                if run.text and hyperlinks_url:
                    result = f" [{run.text}]({hyperlinks_url}) "
                    run.text = result
                    hyperlinks_url = None
                if "HYPERLINK" in run.element.xml:
                    try:
                        xml = ElementTree.XML(run.element.xml)
                        x_child = [c for c in xml.iter() if c is not None]
                        for x in x_child:
                            if x is None:
                                continue
                            if x.tag.endswith("instrText"):
                                if x.text is None:
                                    continue
                                for i in url_pattern.findall(x.text):
                                    hyperlinks_url = str(i)
                    except Exception:
                        logger.exception("Failed to parse HYPERLINK xml")

        def parse_paragraph(paragraph):
            paragraph_content = []
            for run in paragraph.runs:
                if hasattr(run.element, "tag") and isinstance(run.element.tag, str) and run.element.tag.endswith("r"):
                    # Process drawing type images
                    drawing_elements = run.element.findall(
                        ".//{http://schemas.openxmlformats.org/wordprocessingml/2006/main}drawing"
                    )
                    has_drawing = False
                    for drawing in drawing_elements:
                        blip_elements = drawing.findall(
                            ".//{http://schemas.openxmlformats.org/drawingml/2006/main}blip"
                        )
                        for blip in blip_elements:
                            embed_id = blip.get(
                                "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed"
                            )
                            if embed_id:
                                image_part = doc.part.related_parts.get(embed_id)
                                if image_part in image_map:
                                    has_drawing = True
                                    paragraph_content.append(image_map[image_part])
                    # Process pict type images
                    shape_elements = run.element.findall(
                        ".//{http://schemas.openxmlformats.org/wordprocessingml/2006/main}pict"
                    )
                    for shape in shape_elements:
                        # Find image data in VML
                        shape_image = shape.find(
                            ".//{http://schemas.openxmlformats.org/wordprocessingml/2006/main}binData"
                        )
                        if shape_image is not None and shape_image.text:
                            image_id = shape_image.get(
                                "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}id"
                            )
                            if image_id and image_id in doc.part.rels:
                                image_part = doc.part.rels[image_id].target_part
                                if image_part in image_map and not has_drawing:
                                    paragraph_content.append(image_map[image_part])
                        # Find imagedata element in VML
                        image_data = shape.find(".//{urn:schemas-microsoft-com:vml}imagedata")
                        if image_data is not None:
                            image_id = image_data.get("id") or image_data.get(
                                "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}id"
                            )
                            if image_id and image_id in doc.part.rels:
                                image_part = doc.part.rels[image_id].target_part
                                if image_part in image_map and not has_drawing:
                                    paragraph_content.append(image_map[image_part])
                if run.text.strip():
                    paragraph_content.append(run.text.strip())
            return "".join(paragraph_content) if paragraph_content else ""

        paragraphs = doc.paragraphs.copy()
        tables = doc.tables.copy()
        for element in doc.element.body:
            if hasattr(element, "tag"):
                if isinstance(element.tag, str) and element.tag.endswith("p"):  # paragraph
                    para = paragraphs.pop(0)
                    parsed_paragraph = parse_paragraph(para)
                    if parsed_paragraph.strip():
                        content.append(parsed_paragraph)
                    else:
                        content.append("\n")
                elif isinstance(element.tag, str) and element.tag.endswith("tbl"):  # table
                    table = tables.pop(0)
                    content.append(self._table_to_markdown(table, image_map))
        return "\n".join(content)
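For reference, a simplified stand-in for the colspan handling in _parse_row above, using a fake Cell class instead of python-docx cells (it omits the occupied-column scan of the real method):

# Illustrative only: spread a merged (colspan) cell across a fixed-width Markdown row.
class Cell:
    def __init__(self, text, grid_span=1):
        self.text, self.grid_span = text, grid_span


def parse_row(cells, total_cols):
    row = [""] * total_cols
    col = 0
    for cell in cells:
        row[col] = cell.text            # content goes in the first spanned column
        col += cell.grid_span or 1      # remaining spanned columns stay empty
    return row


print(parse_row([Cell("Q1", grid_span=2), Cell("Q2")], 3))  # ['Q1', '', 'Q2']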