初始化医疗报告生成项目,添加核心代码文件
This commit is contained in:
222
backend/services/ocr_service.py
Normal file
222
backend/services/ocr_service.py
Normal file
@@ -0,0 +1,222 @@
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Union
|
||||
import tempfile
|
||||
import shutil
|
||||
|
||||
class OCRService:
    """OCR service backed by Baidu Cloud OCR, MinerU, or PaddleOCR.

    Exactly one engine is selected when the service is constructed, in this
    priority order: Baidu Cloud OCR (when all three ``BAIDU_OCR_*``
    environment variables are set), then MinerU (when its checkout is found
    on disk), then PaddleOCR (when the package is importable).  There is no
    demo fallback: construction raises ``RuntimeError`` when no engine is
    available or the selected engine fails to initialise.

    The extraction methods never raise for recognition failures; they return
    a human-readable message string instead, so callers that need to tell
    success from failure must inspect the returned text.
    """

    # Default location of the MinerU checkout.  It can be overridden with the
    # MINERU_PATH environment variable so the service is not tied to one
    # developer machine.
    _DEFAULT_MINERU_PATH = r"c:\Users\UI\Desktop\MinerU-master"
    _MINERU_PATH_ENV = "MINERU_PATH"

    # Credentials that must all be present to select Baidu Cloud OCR.
    _BAIDU_ENV_VARS = (
        "BAIDU_OCR_APP_ID",
        "BAIDU_OCR_API_KEY",
        "BAIDU_OCR_SECRET_KEY",
    )

    # Message returned when an engine ran but produced no text.
    _NO_TEXT = "未识别到文本内容"

    # Substrings marking MinerU's auxiliary Markdown outputs.
    _MINERU_AUX_MARKERS = ("layout", "span", "origin")

    def __init__(self):
        """Select the best available engine and initialise it.

        Raises:
            RuntimeError: If no engine is available or initialisation fails.
        """
        self.ocr_type = self._detect_ocr_type()
        self._initialize_ocr()

    def _mineru_root(self) -> Path:
        """Return the MinerU checkout directory (env override or default)."""
        return Path(os.getenv(self._MINERU_PATH_ENV) or self._DEFAULT_MINERU_PATH)

    def _detect_ocr_type(self) -> str:
        """Return the name of the highest-priority available engine.

        Returns:
            ``"baidu_cloud"``, ``"mineru"`` or ``"paddleocr"``.

        Raises:
            RuntimeError: If none of the engines is available.
        """
        # Baidu Cloud OCR is checked first, then MinerU, then PaddleOCR.
        if all(os.getenv(name) for name in self._BAIDU_ENV_VARS):
            return "baidu_cloud"
        if self._check_mineru():
            return "mineru"
        if self._check_paddleocr():
            return "paddleocr"
        raise RuntimeError(
            "❌ 没有可用的OCR引擎!请至少配置以下一种:\n"
            "1. MinerU - 将 MinerU-master 文件夹放在桌面\n"
            "2. 百度OCR - 配置环境变量 BAIDU_OCR_*\n"
            "3. PaddleOCR - 运行 pip install paddleocr paddlepaddle"
        )

    def _initialize_ocr(self):
        """Import and set up the engine named by ``self.ocr_type``.

        Sets ``self.mineru_parse``, ``self.baidu_client`` or
        ``self.paddle_ocr`` depending on the engine.

        Raises:
            RuntimeError: If the engine cannot be imported or constructed;
                the underlying exception is chained as ``__cause__``.
        """
        if self.ocr_type == "mineru":
            try:
                # Make the MinerU checkout importable.
                mineru_path = self._mineru_root()
                if mineru_path.exists() and str(mineru_path) not in sys.path:
                    sys.path.insert(0, str(mineru_path))

                from demo.demo import parse_doc
                self.mineru_parse = parse_doc

                # Best effort: register the layout model class with torch's
                # safe-globals list.  Presumably required for weights-only
                # checkpoint loading on newer torch versions - TODO confirm.
                # Failure is deliberately ignored so older torch still works.
                try:
                    from torch.serialization import add_safe_globals
                    from doclayout_yolo.nn.tasks import YOLOv10DetectionModel
                    add_safe_globals([YOLOv10DetectionModel])
                except Exception:
                    pass

                print("✓ 使用 MinerU 引擎(高精度文档解析)")
            except Exception as e:
                raise RuntimeError(
                    f"❌ MinerU 初始化失败: {e}\n请安装完整依赖或使用其他OCR引擎"
                ) from e

        elif self.ocr_type == "baidu_cloud":
            try:
                from aip import AipOcr
                app_id = os.getenv("BAIDU_OCR_APP_ID")
                api_key = os.getenv("BAIDU_OCR_API_KEY")
                secret_key = os.getenv("BAIDU_OCR_SECRET_KEY")
                self.baidu_client = AipOcr(app_id, api_key, secret_key)
                print("✓ 使用百度云OCR API(高精度)")
            except Exception as e:
                raise RuntimeError(
                    f"❌ 百度云OCR初始化失败: {e}\n请检查环境变量配置"
                ) from e

        elif self.ocr_type == "paddleocr":
            try:
                from paddleocr import PaddleOCR
                self.paddle_ocr = PaddleOCR(
                    use_angle_cls=True, lang="ch", show_log=False
                )
                print("✓ 使用 PaddleOCR 引擎(本地离线)")
            except Exception as e:
                raise RuntimeError(
                    f"❌ PaddleOCR 初始化失败: {e}\n"
                    "请运行: pip install paddleocr paddlepaddle"
                ) from e

    def _check_mineru(self) -> bool:
        """Return True when the MinerU checkout contains ``demo/demo.py``."""
        try:
            mineru_path = self._mineru_root()
            return mineru_path.exists() and (mineru_path / "demo" / "demo.py").exists()
        except (OSError, ValueError):
            # An unreadable or malformed path counts as "not available".
            return False

    def _check_paddleocr(self) -> bool:
        """Return True when the ``paddleocr`` package can be imported."""
        try:
            import paddleocr  # noqa: F401 - imported only to probe availability
            return True
        except ImportError:
            return False

    def extract_text(self, file_path: Union[str, Path]) -> str:
        """Extract text from an image or PDF file.

        Files whose suffix is ``.pdf`` (case-insensitive) go through the PDF
        path; everything else is treated as an image.

        Args:
            file_path: Path of the file to process.

        Returns:
            The recognised text, or a message string describing why no text
            could be produced.

        Raises:
            RuntimeError: If an image is processed while ``self.ocr_type``
                is not one of the known engine names.
        """
        file_path = str(file_path)
        if Path(file_path).suffix.lower() == '.pdf':
            return self._extract_from_pdf(file_path)
        return self._extract_from_image(file_path)

    def _extract_from_image(self, image_path: str) -> str:
        """Run the selected engine on an image file.

        Raises:
            RuntimeError: If ``self.ocr_type`` is not a known engine name.
        """
        if self.ocr_type == "mineru":
            return self._extract_with_mineru(image_path)
        if self.ocr_type == "baidu_cloud":
            return self._extract_with_baidu_cloud(image_path)
        if self.ocr_type == "paddleocr":
            return self._extract_with_paddleocr(image_path)
        raise RuntimeError("OCR引擎未正确初始化")

    def _extract_with_mineru(self, file_path: str) -> str:
        """Extract text with MinerU (used for both PDFs and images).

        MinerU writes its results into a temporary directory; the first
        Markdown file that is not an auxiliary layout/span/origin file is
        read back and returned.  The temporary directory is always removed.

        Returns:
            The Markdown content, the "no text" message, or an error message
            string if anything fails.
        """
        try:
            temp_dir = tempfile.mkdtemp(prefix="mineru_")
            try:
                self.mineru_parse(
                    path_list=[Path(file_path)],
                    output_dir=temp_dir,
                    lang="ch",           # Chinese
                    backend="pipeline",  # pipeline mode
                    method="auto",       # auto-detect
                )

                # Sorted so the chosen file is deterministic when MinerU
                # emits more than one Markdown file.
                md_files = sorted(Path(temp_dir).rglob("*.md"))
                if not md_files:
                    return self._NO_TEXT

                # Prefer real content over auxiliary files; fall back to
                # whatever exists if every file looks auxiliary.
                content_files = [
                    md for md in md_files
                    if not any(tag in md.stem for tag in self._MINERU_AUX_MARKERS)
                ]
                target = (content_files or md_files)[0]
                with open(target, 'r', encoding='utf-8') as f:
                    content = f.read()
                return content if content.strip() else self._NO_TEXT
            finally:
                # Best-effort cleanup; a leftover temp dir must not mask the
                # extraction result.
                shutil.rmtree(temp_dir, ignore_errors=True)
        except Exception as e:
            return f"MinerU识别出错: {str(e)}"

    def _extract_with_baidu_cloud(self, image_path: str) -> str:
        """Extract text via ``self.baidu_client.accurateBasic``.

        Returns:
            Recognised lines joined by newlines, the "no text" message, or an
            error message string (API error code or local failure).
        """
        try:
            with open(image_path, 'rb') as f:
                image_data = f.read()

            result = self.baidu_client.accurateBasic(image_data)

            if 'error_code' in result:
                error_code = result['error_code']
                error_msg = result.get('error_msg', '未知错误')
                return f"百度OCR错误 ({error_code}): {error_msg}"

            if 'words_result' in result:
                text_lines = [item['words'] for item in result['words_result']]
                return "\n".join(text_lines) if text_lines else self._NO_TEXT

            return self._NO_TEXT
        except Exception as e:
            return f"百度云OCR识别出错: {str(e)}"

    def _extract_with_paddleocr(self, image_path: str) -> str:
        """Extract text via ``self.paddle_ocr.ocr``.

        Only the first element of the result is read; each entry in it is
        expected to hold the text at index ``[1][0]``.

        Returns:
            Recognised lines joined by newlines, the "no text" message, or an
            error message string.
        """
        try:
            result = self.paddle_ocr.ocr(image_path, cls=True)
            if not result or not result[0]:
                return self._NO_TEXT

            text_lines = [
                line[1][0] for line in result[0] if line and len(line) >= 2
            ]
            return "\n".join(text_lines) if text_lines else self._NO_TEXT
        except Exception as e:
            return f"PaddleOCR识别出错: {str(e)}"

    def _extract_from_pdf(self, pdf_path: str) -> str:
        """Extract text from a PDF.

        MinerU handles the PDF itself when it is the selected engine.  For
        the other engines the text is read with ``pdfplumber``'s
        ``extract_text``; no OCR is run on the pages in that case.

        Returns:
            The extracted text (pages separated by blank lines), or a message
            string when nothing was extracted, ``pdfplumber`` is missing, or
            processing failed.
        """
        if self.ocr_type == "mineru":
            return self._extract_with_mineru(pdf_path)

        try:
            import pdfplumber

            with pdfplumber.open(pdf_path) as pdf:
                page_texts = [page.extract_text() for page in pdf.pages]
            text_content = [text for text in page_texts if text]
            return "\n\n".join(text_content) if text_content else "未提取到文本内容"
        except ImportError:
            return "PDF处理需要安装 pdfplumber 库\n可以运行: pip install pdfplumber"
        except Exception as e:
            return f"PDF处理出错: {str(e)}"
|
||||
|
||||
Reference in New Issue
Block a user