63 lines
1.6 KiB
Python
63 lines
1.6 KiB
Python
|
|
"""BaseParser 抽象基类和 ParserRegistry 解析器注册表"""
|
|||
|
|
|
|||
|
|
import os
|
|||
|
|
from abc import ABC, abstractmethod
|
|||
|
|
from typing import List
|
|||
|
|
|
|||
|
|
from exceptions import ParseError, UnsupportedFormatError
|
|||
|
|
|
|||
|
|
|
|||
|
|
class BaseParser(ABC):
    """Abstract base class for file parsers.

    Subclasses declare which file extensions they handle and implement
    the conversion of a file into plain text / Markdown.
    """

    @abstractmethod
    def supported_extensions(self) -> List[str]:
        """Return the list of supported file extensions, e.g. ``['.pdf']``."""

    @abstractmethod
    def parse(self, file_path: str) -> str:
        """Parse a file and return its plain-text / Markdown content.

        Args:
            file_path: Path of the file to parse.

        Returns:
            The extracted text content (Markdown format preferred).

        Raises:
            ParseError: If the file is corrupted, its format is not
                supported, or its encoding cannot be recognised.
        """
|
|||
|
|
|
|||
|
|
|
|||
|
|
class ParserRegistry:
    """Select a suitable parser automatically from a file's extension."""

    def __init__(self):
        # Kept in registration order; get_parser returns the first match.
        self._parsers: List["BaseParser"] = []

    def register(self, parser: "BaseParser") -> None:
        """Register a parser.

        Args:
            parser: Parser instance to append to the lookup list.
        """
        self._parsers.append(parser)

    def get_parser(self, file_path: str) -> "BaseParser":
        """Return the parser that handles the extension of ``file_path``.

        Matching is case-insensitive: both the file's extension and the
        extensions declared by each parser are lower-cased before being
        compared. Parsers are tried in registration order and the first
        match wins.

        Args:
            file_path: Path of the file.

        Returns:
            The matching parser instance.

        Raises:
            UnsupportedFormatError: If no registered parser matches.
        """
        ext = os.path.splitext(file_path)[1].lower()
        for parser in self._parsers:
            # Normalise the declared side too; otherwise a parser that
            # declares '.PDF' could never match the lower-cased `ext`.
            if any(ext == declared.lower()
                   for declared in parser.supported_extensions()):
                return parser
        raise UnsupportedFormatError(os.path.basename(file_path), ext)
|