Initial commit: AI 知识库文档智能分块工具
This commit is contained in:
62
parsers/base.py
Normal file
62
parsers/base.py
Normal file
@@ -0,0 +1,62 @@
|
||||
"""BaseParser 抽象基类和 ParserRegistry 解析器注册表"""
|
||||
|
||||
import os
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import List
|
||||
|
||||
from exceptions import ParseError, UnsupportedFormatError
|
||||
|
||||
|
||||
class BaseParser(ABC):
    """Abstract base class that every file parser implements."""

    @abstractmethod
    def supported_extensions(self) -> List[str]:
        """Return the file extensions this parser handles, e.g. ``['.pdf']``."""

    @abstractmethod
    def parse(self, file_path: str) -> str:
        """Parse a file and return its content as plain text / Markdown.

        Args:
            file_path: Path of the file to parse.

        Returns:
            The extracted text content (Markdown is preferred).

        Raises:
            ParseError: If the file is corrupted, its format is not
                supported, or its encoding cannot be recognised.
        """
|
||||
|
||||
|
||||
class ParserRegistry:
    """Registry that selects a parser for a file based on its extension.

    Parsers are consulted in registration order; the first one whose
    ``supported_extensions()`` contains the file's extension is returned.
    """

    def __init__(self) -> None:
        # Registered parsers, kept in registration order.
        self._parsers: List[BaseParser] = []

    def register(self, parser: BaseParser) -> None:
        """Register a parser so that ``get_parser`` can return it.

        Args:
            parser: Parser instance to add to the end of the candidate list.
        """
        self._parsers.append(parser)

    def get_parser(self, file_path: str) -> BaseParser:
        """Return the first registered parser that handles ``file_path``.

        The comparison is case-insensitive on both sides: the file's
        extension and every extension declared by a parser are lower-cased
        before being compared.

        Args:
            file_path: Path of the file that needs a parser.

        Returns:
            The matching parser instance.

        Raises:
            UnsupportedFormatError: If no registered parser declares the
                file's extension. Raised with the file's base name and its
                lower-cased extension.
        """
        ext = os.path.splitext(file_path)[1].lower()
        for parser in self._parsers:
            # Lower-case the declared extensions as well; otherwise a parser
            # that declares '.PDF' could never match the lower-cased ``ext``.
            if any(ext == declared.lower()
                   for declared in parser.supported_extensions()):
                return parser
        raise UnsupportedFormatError(os.path.basename(file_path), ext)
|
||||
Reference in New Issue
Block a user