Files
yiliao/backend/services/ocr_service.py

223 lines
8.8 KiB
Python
Raw Permalink Normal View History

import os
import sys
from pathlib import Path
from typing import Union
import tempfile
import shutil
class OCRService:
    """OCR service backed by Baidu Cloud OCR, MinerU, or PaddleOCR.

    Exactly one engine is selected at construction time (see
    ``_detect_ocr_type`` for the priority order). There is no demo/mock
    fallback: construction raises ``RuntimeError`` when no engine is usable.

    Recognition methods do not raise on engine failures; they return a
    human-readable (Chinese) error string instead, so callers always get
    a ``str`` back from ``extract_text``.
    """

    # Default MinerU checkout location; can be overridden by setting the
    # environment variable named in _MINERU_PATH_ENV.
    _DEFAULT_MINERU_PATH = Path(r"c:\Users\UI\Desktop\MinerU-master")
    _MINERU_PATH_ENV = "MINERU_PATH"

    # Message returned by the OCR engines when nothing was recognised.
    _NO_TEXT = "未识别到文本内容"

    def __init__(self):
        """Pick an available OCR engine and initialise it.

        Raises:
            RuntimeError: if no engine is available or initialisation fails.
        """
        self.ocr_type = self._detect_ocr_type()
        self._initialize_ocr()

    def _mineru_path(self) -> Path:
        """Return the MinerU root directory (env override, else the default)."""
        override = os.getenv(self._MINERU_PATH_ENV)
        return Path(override) if override else self._DEFAULT_MINERU_PATH

    def _detect_ocr_type(self) -> str:
        """Return the name of the first available OCR engine.

        Priority: ``"baidu_cloud"`` (all three BAIDU_OCR_* variables set),
        then ``"mineru"``, then ``"paddleocr"``.

        Raises:
            RuntimeError: if none of the engines is available.
        """
        # Baidu Cloud OCR first: it only needs credentials in the environment.
        baidu_vars = ("BAIDU_OCR_APP_ID", "BAIDU_OCR_API_KEY", "BAIDU_OCR_SECRET_KEY")
        if all(os.getenv(name) for name in baidu_vars):
            return "baidu_cloud"
        # Then MinerU (local document parser).
        if self._check_mineru():
            return "mineru"
        # Then PaddleOCR (local, offline).
        if self._check_paddleocr():
            return "paddleocr"
        # Nothing usable.
        raise RuntimeError(
            "❌ 没有可用的OCR引擎请至少配置以下一种\n"
            "1. MinerU - 将 MinerU-master 文件夹放在桌面\n"
            "2. 百度OCR - 配置环境变量 BAIDU_OCR_*\n"
            "3. PaddleOCR - 运行 pip install paddleocr paddlepaddle"
        )

    def _initialize_ocr(self):
        """Initialise the engine named by ``self.ocr_type``.

        Unknown engine names are ignored here; ``_extract_from_image``
        reports them when recognition is attempted.

        Raises:
            RuntimeError: if the selected engine fails to initialise.
        """
        initializers = {
            "mineru": self._init_mineru,
            "baidu_cloud": self._init_baidu_cloud,
            "paddleocr": self._init_paddleocr,
        }
        initializer = initializers.get(self.ocr_type)
        if initializer is not None:
            initializer()

    def _init_mineru(self):
        """Import MinerU's ``parse_doc`` and store it as ``self.mineru_parse``."""
        try:
            # Make the MinerU checkout importable.
            mineru_path = self._mineru_path()
            if mineru_path.exists() and str(mineru_path) not in sys.path:
                sys.path.insert(0, str(mineru_path))
            from demo.demo import parse_doc
            self.mineru_parse = parse_doc
            try:
                # Register the YOLOv10 layout model class with torch's
                # safe-globals list (presumably required for weights-only
                # checkpoint loading on newer torch versions).
                from torch.serialization import add_safe_globals
                from doclayout_yolo.nn.tasks import YOLOv10DetectionModel
                add_safe_globals([YOLOv10DetectionModel])
            except Exception:
                # Deliberately best-effort: these packages / this API may
                # be absent, and MinerU may still work without the call.
                pass
            print("✓ 使用 MinerU 引擎(高精度文档解析)")
        except Exception as e:
            raise RuntimeError(
                f"❌ MinerU 初始化失败: {e}\n请安装完整依赖或使用其他OCR引擎"
            ) from e

    def _init_baidu_cloud(self):
        """Create the Baidu ``AipOcr`` client from the BAIDU_OCR_* variables."""
        try:
            from aip import AipOcr
            app_id = os.getenv("BAIDU_OCR_APP_ID")
            api_key = os.getenv("BAIDU_OCR_API_KEY")
            secret_key = os.getenv("BAIDU_OCR_SECRET_KEY")
            self.baidu_client = AipOcr(app_id, api_key, secret_key)
            print("✓ 使用百度云OCR API高精度")
        except Exception as e:
            raise RuntimeError(
                f"❌ 百度云OCR初始化失败: {e}\n请检查环境变量配置"
            ) from e

    def _init_paddleocr(self):
        """Create the local ``PaddleOCR`` instance (Chinese, angle classifier on)."""
        try:
            from paddleocr import PaddleOCR
            self.paddle_ocr = PaddleOCR(use_angle_cls=True, lang="ch", show_log=False)
            print("✓ 使用 PaddleOCR 引擎(本地离线)")
        except Exception as e:
            raise RuntimeError(
                f"❌ PaddleOCR 初始化失败: {e}\n请运行: pip install paddleocr paddlepaddle"
            ) from e

    def _check_mineru(self) -> bool:
        """Return True if the MinerU directory contains ``demo/demo.py``."""
        try:
            mineru_path = self._mineru_path()
            return mineru_path.exists() and (mineru_path / "demo" / "demo.py").exists()
        except (OSError, ValueError):
            # Unreadable or malformed path: treat MinerU as unavailable.
            return False

    def _check_paddleocr(self) -> bool:
        """Return True if the ``paddleocr`` package can be imported."""
        try:
            import paddleocr  # noqa: F401  (imported only to probe availability)
            return True
        except ImportError:
            return False

    def extract_text(self, file_path: Union[str, Path]) -> str:
        """Extract text from an image or a PDF file.

        Args:
            file_path: path to the input; a ``.pdf`` suffix (any case)
                selects the PDF path, anything else is treated as an image.

        Returns:
            The recognised text, or a message string describing why no
            text could be produced.

        Raises:
            RuntimeError: for a non-PDF input when ``self.ocr_type`` is
                not one of the known engines.
        """
        file_path = str(file_path)
        if Path(file_path).suffix.lower() == '.pdf':
            return self._extract_from_pdf(file_path)
        return self._extract_from_image(file_path)

    def _extract_from_image(self, image_path: str) -> str:
        """Run the active OCR engine on an image file.

        Raises:
            RuntimeError: if ``self.ocr_type`` is not a known engine.
        """
        engines = {
            "mineru": self._extract_with_mineru,
            "baidu_cloud": self._extract_with_baidu_cloud,
            "paddleocr": self._extract_with_paddleocr,
        }
        engine = engines.get(self.ocr_type)
        if engine is None:
            raise RuntimeError("OCR引擎未正确初始化")
        return engine(image_path)

    def _extract_with_mineru(self, file_path: str) -> str:
        """Extract text with MinerU (used for both PDFs and images).

        MinerU writes its output into a temporary directory; the first
        generated Markdown file (preferring ones whose name does not
        contain ``layout``/``span``/``origin``) is read back and returned.
        Any failure is returned as an error string rather than raised.
        """
        try:
            # Scratch output directory for this single parse.
            temp_dir = tempfile.mkdtemp(prefix="mineru_")
            try:
                self.mineru_parse(
                    path_list=[Path(file_path)],
                    output_dir=temp_dir,
                    lang="ch",           # Chinese
                    backend="pipeline",  # pipeline backend
                    method="auto"        # automatic method detection
                )
                # Sorted so the chosen file is deterministic across runs.
                md_files = sorted(Path(temp_dir).rglob("*.md"))
                if not md_files:
                    return self._NO_TEXT
                # Prefer main content over layout / span / origin side files.
                content_files = [
                    f for f in md_files
                    if not any(x in f.stem for x in ['layout', 'span', 'origin'])
                ]
                target = (content_files or md_files)[0]
                with open(target, 'r', encoding='utf-8') as f:
                    content = f.read()
                return content if content.strip() else self._NO_TEXT
            finally:
                # Best-effort cleanup; a leftover temp dir must not mask the result.
                shutil.rmtree(temp_dir, ignore_errors=True)
        except Exception as e:
            return f"MinerU识别出错: {str(e)}"

    def _extract_with_baidu_cloud(self, image_path: str) -> str:
        """Extract text with the Baidu Cloud OCR API (``accurateBasic``).

        API-level errors (``error_code`` in the response) and local
        exceptions are both returned as error strings rather than raised.
        """
        try:
            with open(image_path, 'rb') as f:
                image_data = f.read()
            # General text recognition, high-accuracy endpoint.
            result = self.baidu_client.accurateBasic(image_data)
            if 'error_code' in result:
                return f"百度OCR错误 ({result['error_code']}): {result.get('error_msg', '未知错误')}"
            if 'words_result' in result:
                text_lines = [item['words'] for item in result['words_result']]
                return "\n".join(text_lines) if text_lines else self._NO_TEXT
            return self._NO_TEXT
        except Exception as e:
            return f"百度云OCR识别出错: {str(e)}"

    def _extract_with_paddleocr(self, image_path: str) -> str:
        """Extract text with the local PaddleOCR instance.

        Only the first element of the result is read; each entry in it
        contributes ``entry[1][0]`` as one output line. Failures are
        returned as an error string rather than raised.
        """
        try:
            result = self.paddle_ocr.ocr(image_path, cls=True)
            if not result or not result[0]:
                return self._NO_TEXT
            # entry[1][0] is the recognised text of one detected line.
            text_lines = [
                entry[1][0] for entry in result[0]
                if entry and len(entry) >= 2
            ]
            return "\n".join(text_lines) if text_lines else self._NO_TEXT
        except Exception as e:
            return f"PaddleOCR识别出错: {str(e)}"

    def _extract_from_pdf(self, pdf_path: str) -> str:
        """Extract text from a PDF.

        Uses MinerU when it is the active engine; otherwise reads the
        PDF's embedded text layer with ``pdfplumber`` (no OCR is run on
        the pages in that case). Failures are returned as message strings.
        """
        # MinerU handles PDFs directly.
        if self.ocr_type == "mineru":
            return self._extract_with_mineru(pdf_path)
        # Fallback: embedded text via pdfplumber.
        try:
            import pdfplumber
            with pdfplumber.open(pdf_path) as pdf:
                page_texts = [page.extract_text() for page in pdf.pages]
            text_content = [text for text in page_texts if text]
            return "\n\n".join(text_content) if text_content else "未提取到文本内容"
        except ImportError:
            # pdfplumber is not installed; tell the caller how to get it.
            return "PDF处理需要安装 pdfplumber 库\n可以运行: pip install pdfplumber"
        except Exception as e:
            return f"PDF处理出错: {str(e)}"