Files
bigwo/parsers/base.py
2026-03-02 17:38:28 +08:00

63 lines
1.6 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""BaseParser 抽象基类和 ParserRegistry 解析器注册表"""
import os
from abc import ABC, abstractmethod
from typing import List
from exceptions import ParseError, UnsupportedFormatError
class BaseParser(ABC):
    """Abstract base class that every file parser implements."""

    @abstractmethod
    def supported_extensions(self) -> List[str]:
        """Return the file extensions this parser handles, e.g. ['.pdf']."""

    @abstractmethod
    def parse(self, file_path: str) -> str:
        """Parse a file and return its content as plain text / Markdown.

        Args:
            file_path: Path of the file to parse.

        Returns:
            The extracted text content (Markdown format preferred).

        Raises:
            ParseError: If the file is corrupted, its format is not
                supported, or its encoding cannot be recognised.
        """
class ParserRegistry:
    """Select a suitable parser for a file based on its extension.

    Parsers are consulted in the order they were registered; the first one
    whose ``supported_extensions()`` contains the file's extension is used.
    """

    def __init__(self):
        # Registered parsers, in registration order.
        self._parsers: List[BaseParser] = []

    def register(self, parser: BaseParser) -> None:
        """Register a parser by appending it to the candidate list."""
        self._parsers.append(parser)

    def get_parser(self, file_path: str) -> BaseParser:
        """Return the parser that matches the extension of ``file_path``.

        Extensions are compared case-insensitively on both sides.

        Args:
            file_path: Path of the file to find a parser for.

        Returns:
            The first registered parser instance that supports the extension.

        Raises:
            UnsupportedFormatError: If no registered parser matches.
        """
        ext = os.path.splitext(file_path)[1].lower()
        for parser in self._parsers:
            # The file's extension is lower-cased above, so normalise the
            # declared extensions as well; otherwise a parser declaring
            # '.PDF' could never be selected.
            declared = (candidate.lower() for candidate in parser.supported_extensions())
            if ext in declared:
                return parser
        raise UnsupportedFormatError(os.path.basename(file_path), ext)