Files
yiliao/backend/services/ocr_service.py

223 lines
8.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os
import sys
from pathlib import Path
from typing import Union
import tempfile
import shutil
class OCRService:
    """OCR service backed by Baidu Cloud OCR, MinerU, or PaddleOCR.

    Exactly one engine is selected when the service is constructed, in this
    priority order: Baidu Cloud OCR (when its three environment variables are
    set), then MinerU (when a local checkout is found), then PaddleOCR (when
    the package is importable). There is no demo/fallback mode: construction
    fails with ``RuntimeError`` when no engine is available.

    The ``_extract_*`` methods report recognition failures by *returning* a
    human-readable message string instead of raising, so callers always get a
    ``str`` back from :meth:`extract_text`.
    """

    # Default location of the MinerU checkout. Can be overridden at runtime
    # with the MINERU_PATH environment variable.
    DEFAULT_MINERU_PATH = r"c:\Users\UI\Desktop\MinerU-master"

    # Message returned when an engine ran but produced no text.
    _NO_TEXT = "未识别到文本内容"

    def __init__(self):
        """Detect the best available engine and initialize it.

        Raises:
            RuntimeError: If no engine is available, or the selected engine
                fails to initialize.
        """
        self.ocr_type = self._detect_ocr_type()
        self._initialize_ocr()

    def _mineru_path(self) -> Path:
        """Return the MinerU checkout directory.

        Uses the ``MINERU_PATH`` environment variable when it is set and
        non-empty, otherwise :attr:`DEFAULT_MINERU_PATH`.
        """
        return Path(os.getenv("MINERU_PATH") or self.DEFAULT_MINERU_PATH)

    def _detect_ocr_type(self) -> str:
        """Return the identifier of the first available OCR engine.

        Returns:
            One of ``"baidu_cloud"``, ``"mineru"`` or ``"paddleocr"``.

        Raises:
            RuntimeError: If none of the engines is available.
        """
        # Baidu Cloud OCR has top priority; it is selected purely on the
        # presence of its three credentials in the environment.
        baidu_vars = (
            "BAIDU_OCR_APP_ID",
            "BAIDU_OCR_API_KEY",
            "BAIDU_OCR_SECRET_KEY",
        )
        if all(os.getenv(name) for name in baidu_vars):
            return "baidu_cloud"
        # MinerU is second choice.
        if self._check_mineru():
            return "mineru"
        # PaddleOCR is the last option.
        if self._check_paddleocr():
            return "paddleocr"
        # Nothing usable was found.
        raise RuntimeError(
            "❌ 没有可用的OCR引擎请至少配置以下一种\n"
            "1. MinerU - 将 MinerU-master 文件夹放在桌面\n"
            "2. 百度OCR - 配置环境变量 BAIDU_OCR_*\n"
            "3. PaddleOCR - 运行 pip install paddleocr paddlepaddle"
        )

    def _initialize_ocr(self):
        """Import and set up the engine named by ``self.ocr_type``.

        Sets ``self.mineru_parse``, ``self.baidu_client`` or
        ``self.paddle_ocr`` depending on the engine.

        Raises:
            RuntimeError: If the engine cannot be imported or constructed.
                The underlying exception is attached as ``__cause__``.
        """
        if self.ocr_type == "mineru":
            try:
                # Make the MinerU checkout importable.
                mineru_path = self._mineru_path()
                if mineru_path.exists() and str(mineru_path) not in sys.path:
                    sys.path.insert(0, str(mineru_path))
                from demo.demo import parse_doc
                self.mineru_parse = parse_doc
                try:
                    # Best effort: register the YOLO model class as a safe
                    # global for torch deserialization. Not every
                    # torch/doclayout_yolo version provides these names, so a
                    # failure here is deliberately ignored.
                    from torch.serialization import add_safe_globals
                    from doclayout_yolo.nn.tasks import YOLOv10DetectionModel
                    add_safe_globals([YOLOv10DetectionModel])
                except Exception:
                    pass
                print("✓ 使用 MinerU 引擎(高精度文档解析)")
            except Exception as e:
                raise RuntimeError(
                    f"❌ MinerU 初始化失败: {e}\n请安装完整依赖或使用其他OCR引擎"
                ) from e
        elif self.ocr_type == "baidu_cloud":
            try:
                from aip import AipOcr
                app_id = os.getenv("BAIDU_OCR_APP_ID")
                api_key = os.getenv("BAIDU_OCR_API_KEY")
                secret_key = os.getenv("BAIDU_OCR_SECRET_KEY")
                self.baidu_client = AipOcr(app_id, api_key, secret_key)
                print("✓ 使用百度云OCR API高精度")
            except Exception as e:
                raise RuntimeError(
                    f"❌ 百度云OCR初始化失败: {e}\n请检查环境变量配置"
                ) from e
        elif self.ocr_type == "paddleocr":
            try:
                from paddleocr import PaddleOCR
                self.paddle_ocr = PaddleOCR(
                    use_angle_cls=True, lang="ch", show_log=False
                )
                print("✓ 使用 PaddleOCR 引擎(本地离线)")
            except Exception as e:
                raise RuntimeError(
                    f"❌ PaddleOCR 初始化失败: {e}\n"
                    "请运行: pip install paddleocr paddlepaddle"
                ) from e

    def _check_mineru(self) -> bool:
        """Return True when a MinerU checkout with ``demo/demo.py`` exists."""
        try:
            mineru_path = self._mineru_path()
            return (
                mineru_path.exists()
                and (mineru_path / "demo" / "demo.py").exists()
            )
        except OSError:
            # An unreadable or invalid path counts as "not available".
            return False

    def _check_paddleocr(self) -> bool:
        """Return True when the ``paddleocr`` package can be imported."""
        try:
            import paddleocr  # noqa: F401 -- imported only to probe availability
            return True
        except ImportError:
            return False

    def extract_text(self, file_path: Union[str, Path]) -> str:
        """Extract text from an image or a PDF file.

        Args:
            file_path: Path of the file. A ``.pdf`` suffix (case-insensitive)
                selects the PDF path; anything else is treated as an image.

        Returns:
            The extracted text, or a message string describing why nothing
            could be extracted.

        Raises:
            RuntimeError: If ``self.ocr_type`` is not a known engine and the
                file is not a PDF.
        """
        file_path = str(file_path)
        if Path(file_path).suffix.lower() == '.pdf':
            return self._extract_from_pdf(file_path)
        return self._extract_from_image(file_path)

    def _extract_from_image(self, image_path: str) -> str:
        """Run the active engine on an image file.

        Raises:
            RuntimeError: If ``self.ocr_type`` is not a known engine.
        """
        handlers = {
            "mineru": self._extract_with_mineru,
            "baidu_cloud": self._extract_with_baidu_cloud,
            "paddleocr": self._extract_with_paddleocr,
        }
        handler = handlers.get(self.ocr_type)
        if handler is None:
            raise RuntimeError("OCR引擎未正确初始化")
        return handler(image_path)

    def _extract_with_mineru(self, file_path: str) -> str:
        """Extract text with MinerU (used for both PDFs and images).

        MinerU writes its output into a temporary directory; the first
        markdown file found there is returned as the result. Any exception is
        converted into an error-message string.
        """
        try:
            # Scratch directory for MinerU's output files.
            temp_dir = tempfile.mkdtemp(prefix="mineru_")
            try:
                self.mineru_parse(
                    path_list=[Path(file_path)],
                    output_dir=temp_dir,
                    lang="ch",           # Chinese
                    backend="pipeline",  # use the pipeline backend
                    method="auto",       # auto-detect the parse method
                )
                # Sorted so the chosen file does not depend on the directory
                # iteration order of the filesystem.
                md_files = sorted(Path(temp_dir).rglob("*.md"))
                if not md_files:
                    return self._NO_TEXT
                # Prefer the main content file over layout / span / origin
                # auxiliary outputs; fall back to any markdown file.
                content_files = [
                    f for f in md_files
                    if not any(x in f.stem for x in ['layout', 'span', 'origin'])
                ]
                target = (content_files or md_files)[0]
                content = target.read_text(encoding='utf-8')
                return content if content.strip() else self._NO_TEXT
            finally:
                # Best-effort cleanup; a leftover temp dir must not mask the
                # OCR result.
                shutil.rmtree(temp_dir, ignore_errors=True)
        except Exception as e:
            return f"MinerU识别出错: {str(e)}"

    def _extract_with_baidu_cloud(self, image_path: str) -> str:
        """Extract text with the Baidu Cloud OCR API.

        Calls ``accurateBasic`` on the raw image bytes. API-level errors
        (an ``error_code`` key in the response) and exceptions are both
        reported as message strings.
        """
        try:
            with open(image_path, 'rb') as f:
                image_data = f.read()
            result = self.baidu_client.accurateBasic(image_data)
            if 'error_code' in result:
                return f"百度OCR错误 ({result['error_code']}): {result.get('error_msg', '未知错误')}"
            if 'words_result' in result:
                text_lines = [item['words'] for item in result['words_result']]
                return "\n".join(text_lines) if text_lines else self._NO_TEXT
            return self._NO_TEXT
        except Exception as e:
            return f"百度云OCR识别出错: {str(e)}"

    def _extract_with_paddleocr(self, image_path: str) -> str:
        """Extract text with PaddleOCR.

        Reads the recognized string from ``entry[1][0]`` of every entry in
        the first element of the ``ocr()`` result, skipping empty or short
        entries. Exceptions are reported as a message string.
        """
        try:
            result = self.paddle_ocr.ocr(image_path, cls=True)
            if not result or not result[0]:
                return self._NO_TEXT
            text_lines = [
                line[1][0]
                for line in result[0]
                if line and len(line) >= 2
            ]
            return "\n".join(text_lines) if text_lines else self._NO_TEXT
        except Exception as e:
            return f"PaddleOCR识别出错: {str(e)}"

    def _extract_from_pdf(self, pdf_path: str) -> str:
        """Extract text from a PDF.

        With the MinerU engine the PDF is parsed by MinerU. For the other
        engines the embedded text layer is read with ``pdfplumber``; no OCR
        is performed on page images in that case.
        """
        # MinerU handles PDFs directly.
        if self.ocr_type == "mineru":
            return self._extract_with_mineru(pdf_path)
        # Fallback: read the text layer with pdfplumber.
        try:
            import pdfplumber
            text_content = []
            with pdfplumber.open(pdf_path) as pdf:
                for page in pdf.pages:
                    text = page.extract_text()
                    if text:
                        text_content.append(text)
            return "\n\n".join(text_content) if text_content else "未提取到文本内容"
        except ImportError:
            # pdfplumber is not installed; tell the caller how to get it.
            return "PDF处理需要安装 pdfplumber 库\n可以运行: pip install pdfplumber"
        except Exception as e:
            return f"PDF处理出错: {str(e)}"