dify
This commit is contained in:
0
dify/api/core/rag/rerank/__init__.py
Normal file
0
dify/api/core/rag/rerank/__init__.py
Normal file
21
dify/api/core/rag/rerank/entity/weight.py
Normal file
21
dify/api/core/rag/rerank/entity/weight.py
Normal file
@@ -0,0 +1,21 @@
|
||||
from pydantic import BaseModel
|
||||
|
||||
|
||||
class VectorSetting(BaseModel):
|
||||
vector_weight: float
|
||||
|
||||
embedding_provider_name: str
|
||||
|
||||
embedding_model_name: str
|
||||
|
||||
|
||||
class KeywordSetting(BaseModel):
|
||||
keyword_weight: float
|
||||
|
||||
|
||||
class Weights(BaseModel):
|
||||
"""Model for weighted rerank."""
|
||||
|
||||
vector_setting: VectorSetting
|
||||
|
||||
keyword_setting: KeywordSetting
|
||||
25
dify/api/core/rag/rerank/rerank_base.py
Normal file
25
dify/api/core/rag/rerank/rerank_base.py
Normal file
@@ -0,0 +1,25 @@
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
from core.rag.models.document import Document
|
||||
|
||||
|
||||
class BaseRerankRunner(ABC):
|
||||
@abstractmethod
|
||||
def run(
|
||||
self,
|
||||
query: str,
|
||||
documents: list[Document],
|
||||
score_threshold: float | None = None,
|
||||
top_n: int | None = None,
|
||||
user: str | None = None,
|
||||
) -> list[Document]:
|
||||
"""
|
||||
Run rerank model
|
||||
:param query: search query
|
||||
:param documents: documents for reranking
|
||||
:param score_threshold: score threshold
|
||||
:param top_n: top n
|
||||
:param user: unique user id if needed
|
||||
:return:
|
||||
"""
|
||||
raise NotImplementedError
|
||||
16
dify/api/core/rag/rerank/rerank_factory.py
Normal file
16
dify/api/core/rag/rerank/rerank_factory.py
Normal file
@@ -0,0 +1,16 @@
|
||||
from core.rag.rerank.rerank_base import BaseRerankRunner
|
||||
from core.rag.rerank.rerank_model import RerankModelRunner
|
||||
from core.rag.rerank.rerank_type import RerankMode
|
||||
from core.rag.rerank.weight_rerank import WeightRerankRunner
|
||||
|
||||
|
||||
class RerankRunnerFactory:
|
||||
@staticmethod
|
||||
def create_rerank_runner(runner_type: str, *args, **kwargs) -> BaseRerankRunner:
|
||||
match runner_type:
|
||||
case RerankMode.RERANKING_MODEL:
|
||||
return RerankModelRunner(*args, **kwargs)
|
||||
case RerankMode.WEIGHTED_SCORE:
|
||||
return WeightRerankRunner(*args, **kwargs)
|
||||
case _:
|
||||
raise ValueError(f"Unknown runner type: {runner_type}")
|
||||
65
dify/api/core/rag/rerank/rerank_model.py
Normal file
65
dify/api/core/rag/rerank/rerank_model.py
Normal file
@@ -0,0 +1,65 @@
|
||||
from core.model_manager import ModelInstance
|
||||
from core.rag.models.document import Document
|
||||
from core.rag.rerank.rerank_base import BaseRerankRunner
|
||||
|
||||
|
||||
class RerankModelRunner(BaseRerankRunner):
|
||||
def __init__(self, rerank_model_instance: ModelInstance):
|
||||
self.rerank_model_instance = rerank_model_instance
|
||||
|
||||
def run(
|
||||
self,
|
||||
query: str,
|
||||
documents: list[Document],
|
||||
score_threshold: float | None = None,
|
||||
top_n: int | None = None,
|
||||
user: str | None = None,
|
||||
) -> list[Document]:
|
||||
"""
|
||||
Run rerank model
|
||||
:param query: search query
|
||||
:param documents: documents for reranking
|
||||
:param score_threshold: score threshold
|
||||
:param top_n: top n
|
||||
:param user: unique user id if needed
|
||||
:return:
|
||||
"""
|
||||
docs = []
|
||||
doc_ids = set()
|
||||
unique_documents = []
|
||||
for document in documents:
|
||||
if (
|
||||
document.provider == "dify"
|
||||
and document.metadata is not None
|
||||
and document.metadata["doc_id"] not in doc_ids
|
||||
):
|
||||
doc_ids.add(document.metadata["doc_id"])
|
||||
docs.append(document.page_content)
|
||||
unique_documents.append(document)
|
||||
elif document.provider == "external":
|
||||
if document not in unique_documents:
|
||||
docs.append(document.page_content)
|
||||
unique_documents.append(document)
|
||||
|
||||
documents = unique_documents
|
||||
|
||||
rerank_result = self.rerank_model_instance.invoke_rerank(
|
||||
query=query, docs=docs, score_threshold=score_threshold, top_n=top_n, user=user
|
||||
)
|
||||
|
||||
rerank_documents = []
|
||||
|
||||
for result in rerank_result.docs:
|
||||
if score_threshold is None or result.score >= score_threshold:
|
||||
# format document
|
||||
rerank_document = Document(
|
||||
page_content=result.text,
|
||||
metadata=documents[result.index].metadata,
|
||||
provider=documents[result.index].provider,
|
||||
)
|
||||
if rerank_document.metadata is not None:
|
||||
rerank_document.metadata["score"] = result.score
|
||||
rerank_documents.append(rerank_document)
|
||||
|
||||
rerank_documents.sort(key=lambda x: x.metadata.get("score", 0.0), reverse=True)
|
||||
return rerank_documents[:top_n] if top_n else rerank_documents
|
||||
6
dify/api/core/rag/rerank/rerank_type.py
Normal file
6
dify/api/core/rag/rerank/rerank_type.py
Normal file
@@ -0,0 +1,6 @@
|
||||
from enum import StrEnum
|
||||
|
||||
|
||||
class RerankMode(StrEnum):
|
||||
RERANKING_MODEL = "reranking_model"
|
||||
WEIGHTED_SCORE = "weighted_score"
|
||||
191
dify/api/core/rag/rerank/weight_rerank.py
Normal file
191
dify/api/core/rag/rerank/weight_rerank.py
Normal file
@@ -0,0 +1,191 @@
|
||||
import math
|
||||
from collections import Counter
|
||||
|
||||
import numpy as np
|
||||
|
||||
from core.model_manager import ModelManager
|
||||
from core.model_runtime.entities.model_entities import ModelType
|
||||
from core.rag.datasource.keyword.jieba.jieba_keyword_table_handler import JiebaKeywordTableHandler
|
||||
from core.rag.embedding.cached_embedding import CacheEmbedding
|
||||
from core.rag.models.document import Document
|
||||
from core.rag.rerank.entity.weight import VectorSetting, Weights
|
||||
from core.rag.rerank.rerank_base import BaseRerankRunner
|
||||
|
||||
|
||||
class WeightRerankRunner(BaseRerankRunner):
|
||||
def __init__(self, tenant_id: str, weights: Weights):
|
||||
self.tenant_id = tenant_id
|
||||
self.weights = weights
|
||||
|
||||
def run(
|
||||
self,
|
||||
query: str,
|
||||
documents: list[Document],
|
||||
score_threshold: float | None = None,
|
||||
top_n: int | None = None,
|
||||
user: str | None = None,
|
||||
) -> list[Document]:
|
||||
"""
|
||||
Run rerank model
|
||||
:param query: search query
|
||||
:param documents: documents for reranking
|
||||
:param score_threshold: score threshold
|
||||
:param top_n: top n
|
||||
:param user: unique user id if needed
|
||||
|
||||
:return:
|
||||
"""
|
||||
unique_documents = []
|
||||
doc_ids = set()
|
||||
for document in documents:
|
||||
if (
|
||||
document.provider == "dify"
|
||||
and document.metadata is not None
|
||||
and document.metadata["doc_id"] not in doc_ids
|
||||
):
|
||||
doc_ids.add(document.metadata["doc_id"])
|
||||
unique_documents.append(document)
|
||||
else:
|
||||
if document not in unique_documents:
|
||||
unique_documents.append(document)
|
||||
|
||||
documents = unique_documents
|
||||
|
||||
query_scores = self._calculate_keyword_score(query, documents)
|
||||
query_vector_scores = self._calculate_cosine(self.tenant_id, query, documents, self.weights.vector_setting)
|
||||
|
||||
rerank_documents = []
|
||||
for document, query_score, query_vector_score in zip(documents, query_scores, query_vector_scores):
|
||||
score = (
|
||||
self.weights.vector_setting.vector_weight * query_vector_score
|
||||
+ self.weights.keyword_setting.keyword_weight * query_score
|
||||
)
|
||||
if score_threshold and score < score_threshold:
|
||||
continue
|
||||
if document.metadata is not None:
|
||||
document.metadata["score"] = score
|
||||
rerank_documents.append(document)
|
||||
|
||||
rerank_documents.sort(key=lambda x: x.metadata["score"] if x.metadata else 0, reverse=True)
|
||||
return rerank_documents[:top_n] if top_n else rerank_documents
|
||||
|
||||
def _calculate_keyword_score(self, query: str, documents: list[Document]) -> list[float]:
|
||||
"""
|
||||
Calculate BM25 scores
|
||||
:param query: search query
|
||||
:param documents: documents for reranking
|
||||
|
||||
:return:
|
||||
"""
|
||||
keyword_table_handler = JiebaKeywordTableHandler()
|
||||
query_keywords = keyword_table_handler.extract_keywords(query, None)
|
||||
documents_keywords = []
|
||||
for document in documents:
|
||||
# get the document keywords
|
||||
document_keywords = keyword_table_handler.extract_keywords(document.page_content, None)
|
||||
if document.metadata is not None:
|
||||
document.metadata["keywords"] = document_keywords
|
||||
documents_keywords.append(document_keywords)
|
||||
|
||||
# Counter query keywords(TF)
|
||||
query_keyword_counts = Counter(query_keywords)
|
||||
|
||||
# total documents
|
||||
total_documents = len(documents)
|
||||
|
||||
# calculate all documents' keywords IDF
|
||||
all_keywords = set()
|
||||
for document_keywords in documents_keywords:
|
||||
all_keywords.update(document_keywords)
|
||||
|
||||
keyword_idf = {}
|
||||
for keyword in all_keywords:
|
||||
# calculate include query keywords' documents
|
||||
doc_count_containing_keyword = sum(1 for doc_keywords in documents_keywords if keyword in doc_keywords)
|
||||
# IDF
|
||||
keyword_idf[keyword] = math.log((1 + total_documents) / (1 + doc_count_containing_keyword)) + 1
|
||||
|
||||
query_tfidf = {}
|
||||
|
||||
for keyword, count in query_keyword_counts.items():
|
||||
tf = count
|
||||
idf = keyword_idf.get(keyword, 0)
|
||||
query_tfidf[keyword] = tf * idf
|
||||
|
||||
# calculate all documents' TF-IDF
|
||||
documents_tfidf = []
|
||||
for document_keywords in documents_keywords:
|
||||
document_keyword_counts = Counter(document_keywords)
|
||||
document_tfidf = {}
|
||||
for keyword, count in document_keyword_counts.items():
|
||||
tf = count
|
||||
idf = keyword_idf.get(keyword, 0)
|
||||
document_tfidf[keyword] = tf * idf
|
||||
documents_tfidf.append(document_tfidf)
|
||||
|
||||
def cosine_similarity(vec1, vec2):
|
||||
intersection = set(vec1.keys()) & set(vec2.keys())
|
||||
numerator = sum(vec1[x] * vec2[x] for x in intersection)
|
||||
|
||||
sum1 = sum(vec1[x] ** 2 for x in vec1)
|
||||
sum2 = sum(vec2[x] ** 2 for x in vec2)
|
||||
denominator = math.sqrt(sum1) * math.sqrt(sum2)
|
||||
|
||||
if not denominator:
|
||||
return 0.0
|
||||
else:
|
||||
return float(numerator) / denominator
|
||||
|
||||
similarities = []
|
||||
for document_tfidf in documents_tfidf:
|
||||
similarity = cosine_similarity(query_tfidf, document_tfidf)
|
||||
similarities.append(similarity)
|
||||
|
||||
# for idx, similarity in enumerate(similarities):
|
||||
# print(f"Document {idx + 1} similarity: {similarity}")
|
||||
|
||||
return similarities
|
||||
|
||||
def _calculate_cosine(
|
||||
self, tenant_id: str, query: str, documents: list[Document], vector_setting: VectorSetting
|
||||
) -> list[float]:
|
||||
"""
|
||||
Calculate Cosine scores
|
||||
:param query: search query
|
||||
:param documents: documents for reranking
|
||||
|
||||
:return:
|
||||
"""
|
||||
query_vector_scores = []
|
||||
|
||||
model_manager = ModelManager()
|
||||
|
||||
embedding_model = model_manager.get_model_instance(
|
||||
tenant_id=tenant_id,
|
||||
provider=vector_setting.embedding_provider_name,
|
||||
model_type=ModelType.TEXT_EMBEDDING,
|
||||
model=vector_setting.embedding_model_name,
|
||||
)
|
||||
cache_embedding = CacheEmbedding(embedding_model)
|
||||
query_vector = cache_embedding.embed_query(query)
|
||||
for document in documents:
|
||||
# calculate cosine similarity
|
||||
if document.metadata and "score" in document.metadata:
|
||||
query_vector_scores.append(document.metadata["score"])
|
||||
else:
|
||||
# transform to NumPy
|
||||
vec1 = np.array(query_vector)
|
||||
vec2 = np.array(document.vector)
|
||||
|
||||
# calculate dot product
|
||||
dot_product = np.dot(vec1, vec2)
|
||||
|
||||
# calculate norm
|
||||
norm_vec1 = np.linalg.norm(vec1)
|
||||
norm_vec2 = np.linalg.norm(vec2)
|
||||
|
||||
# calculate cosine similarity
|
||||
cosine_sim = dot_product / (norm_vec1 * norm_vec2)
|
||||
query_vector_scores.append(cosine_sim)
|
||||
|
||||
return query_vector_scores
|
||||
Reference in New Issue
Block a user