import os
import shutil
import sys
import tempfile
from pathlib import Path
from typing import Union


class OCRService:
    """OCR service backed by Baidu Cloud OCR, MinerU, or PaddleOCR.

    On construction the service picks the first available engine (see
    ``_detect_ocr_type`` for the priority order) and initializes it. There is
    no demo/fallback mode: if no engine is available, construction raises
    ``RuntimeError``.

    The extraction methods report recognition failures by *returning* a
    human-readable message string rather than raising, so callers always get
    a ``str`` back from ``extract_text``.
    """

    # Optional environment variable overriding the MinerU checkout location.
    MINERU_PATH_ENV = "MINERU_PATH"
    # Historical default location of the MinerU checkout (used when the
    # environment variable above is not set).
    DEFAULT_MINERU_PATH = r"c:\Users\UI\Desktop\MinerU-master"

    def __init__(self):
        """Detect the available OCR engine and initialize it.

        Raises:
            RuntimeError: if no engine is available or initialization fails.
        """
        self.ocr_type = self._detect_ocr_type()
        self._initialize_ocr()

    def _mineru_root(self) -> Path:
        """Return the MinerU checkout directory.

        The ``MINERU_PATH`` environment variable takes precedence; otherwise
        ``DEFAULT_MINERU_PATH`` is used.
        """
        return Path(os.getenv(self.MINERU_PATH_ENV) or self.DEFAULT_MINERU_PATH)

    def _detect_ocr_type(self) -> str:
        """Return the identifier of the first available OCR engine.

        Priority: ``"baidu_cloud"`` (all three ``BAIDU_OCR_*`` environment
        variables set), then ``"mineru"``, then ``"paddleocr"``.

        Raises:
            RuntimeError: if none of the engines is available.
        """
        # Baidu Cloud OCR is preferred whenever its credentials are configured.
        if (
            os.getenv("BAIDU_OCR_APP_ID")
            and os.getenv("BAIDU_OCR_API_KEY")
            and os.getenv("BAIDU_OCR_SECRET_KEY")
        ):
            return "baidu_cloud"
        # Next choice: a local MinerU checkout.
        if self._check_mineru():
            return "mineru"
        # Last choice: an installed PaddleOCR package.
        if self._check_paddleocr():
            return "paddleocr"
        raise RuntimeError(
            "❌ 没有可用的OCR引擎!请至少配置以下一种:\n"
            "1. MinerU - 将 MinerU-master 文件夹放在桌面\n"
            "2. 百度OCR - 配置环境变量 BAIDU_OCR_*\n"
            "3. PaddleOCR - 运行 pip install paddleocr paddlepaddle"
        )

    def _initialize_ocr(self):
        """Initialize the engine selected in ``self.ocr_type``.

        Sets ``self.mineru_parse``, ``self.baidu_client`` or
        ``self.paddle_ocr`` depending on the engine.

        Raises:
            RuntimeError: if the engine cannot be imported or constructed;
                the original exception is attached as ``__cause__``.
        """
        if self.ocr_type == "mineru":
            try:
                # Make the MinerU checkout importable.
                mineru_path = self._mineru_root()
                if mineru_path.exists() and str(mineru_path) not in sys.path:
                    sys.path.insert(0, str(mineru_path))

                from demo.demo import parse_doc

                self.mineru_parse = parse_doc

                try:
                    from torch.serialization import add_safe_globals
                    from doclayout_yolo.nn.tasks import YOLOv10DetectionModel

                    add_safe_globals([YOLOv10DetectionModel])
                except Exception:
                    # Deliberately best-effort: registering the safe global is
                    # optional, and failure here must not block startup.
                    pass

                print("✓ 使用 MinerU 引擎(高精度文档解析)")
            except Exception as e:
                raise RuntimeError(
                    f"❌ MinerU 初始化失败: {e}\n请安装完整依赖或使用其他OCR引擎"
                ) from e

        elif self.ocr_type == "baidu_cloud":
            try:
                from aip import AipOcr

                app_id = os.getenv("BAIDU_OCR_APP_ID")
                api_key = os.getenv("BAIDU_OCR_API_KEY")
                secret_key = os.getenv("BAIDU_OCR_SECRET_KEY")

                self.baidu_client = AipOcr(app_id, api_key, secret_key)
                print("✓ 使用百度云OCR API(高精度)")
            except Exception as e:
                raise RuntimeError(
                    f"❌ 百度云OCR初始化失败: {e}\n请检查环境变量配置"
                ) from e

        elif self.ocr_type == "paddleocr":
            try:
                from paddleocr import PaddleOCR

                self.paddle_ocr = PaddleOCR(
                    use_angle_cls=True, lang="ch", show_log=False
                )
                print("✓ 使用 PaddleOCR 引擎(本地离线)")
            except Exception as e:
                raise RuntimeError(
                    f"❌ PaddleOCR 初始化失败: {e}\n请运行: pip install paddleocr paddlepaddle"
                ) from e

    def _check_mineru(self) -> bool:
        """Return True if the MinerU checkout and its ``demo/demo.py`` exist."""
        try:
            mineru_path = self._mineru_root()
            return mineru_path.exists() and (mineru_path / "demo" / "demo.py").exists()
        except OSError:
            # An unreadable location counts as "not available".
            return False

    def _check_paddleocr(self) -> bool:
        """Return True if the ``paddleocr`` package can be imported."""
        try:
            import paddleocr  # noqa: F401  (imported only to probe availability)

            return True
        except ImportError:
            return False

    def extract_text(self, file_path: Union[str, Path]) -> str:
        """Extract text from an image or PDF file.

        Files whose suffix is ``.pdf`` (case-insensitive) go through the PDF
        path; every other file is treated as an image.

        Args:
            file_path: path of the file to process.

        Returns:
            The extracted text, or a message string describing why no text
            was produced.
        """
        file_path = str(file_path)
        if Path(file_path).suffix.lower() == ".pdf":
            return self._extract_from_pdf(file_path)
        return self._extract_from_image(file_path)

    def _extract_from_image(self, image_path: str) -> str:
        """Dispatch image extraction to the active engine.

        Raises:
            RuntimeError: if ``self.ocr_type`` is not a known engine.
        """
        if self.ocr_type == "mineru":
            return self._extract_with_mineru(image_path)
        if self.ocr_type == "baidu_cloud":
            return self._extract_with_baidu_cloud(image_path)
        if self.ocr_type == "paddleocr":
            return self._extract_with_paddleocr(image_path)
        raise RuntimeError("OCR引擎未正确初始化")

    def _extract_with_mineru(self, file_path: str) -> str:
        """Extract text with MinerU (used for both PDFs and images).

        MinerU writes its output into a temporary directory; the first
        markdown file found there (preferring files whose stem does not
        contain ``layout``, ``span`` or ``origin``) is returned as text.
        Any exception is converted into an error message string.
        """
        try:
            temp_dir = tempfile.mkdtemp(prefix="mineru_")
            try:
                self.mineru_parse(
                    path_list=[Path(file_path)],
                    output_dir=temp_dir,
                    lang="ch",
                    backend="pipeline",
                    method="auto",
                )

                # Sorted so that the chosen file is deterministic when several
                # markdown files are produced.
                md_files = sorted(Path(temp_dir).rglob("*.md"))
                if not md_files:
                    return "未识别到文本内容"

                # Prefer the main content file over auxiliary outputs.
                content_files = [
                    f
                    for f in md_files
                    if not any(x in f.stem for x in ("layout", "span", "origin"))
                ]
                target_files = content_files or md_files
                content = target_files[0].read_text(encoding="utf-8")
                return content if content.strip() else "未识别到文本内容"
            finally:
                # Best-effort cleanup; a failure to delete must not mask the result.
                shutil.rmtree(temp_dir, ignore_errors=True)
        except Exception as e:
            return f"MinerU识别出错: {str(e)}"

    def _extract_with_baidu_cloud(self, image_path: str) -> str:
        """Extract text from an image with the Baidu Cloud OCR client.

        Calls ``accurateBasic`` on the image bytes. An ``error_code`` in the
        response, or any exception, is converted into a message string.
        """
        try:
            with open(image_path, "rb") as f:
                image_data = f.read()

            result = self.baidu_client.accurateBasic(image_data)

            if "error_code" in result:
                return f"百度OCR错误 ({result['error_code']}): {result.get('error_msg', '未知错误')}"

            if "words_result" in result:
                text_lines = [item["words"] for item in result["words_result"]]
                return "\n".join(text_lines) if text_lines else "未识别到文本内容"

            return "未识别到文本内容"
        except Exception as e:
            return f"百度云OCR识别出错: {str(e)}"

    def _extract_with_paddleocr(self, image_path: str) -> str:
        """Extract text from an image with PaddleOCR.

        Reads the text from element ``[1][0]`` of every entry in the first
        element of the result. Any exception is converted into a message
        string.
        """
        try:
            result = self.paddle_ocr.ocr(image_path, cls=True)

            if not result or not result[0]:
                return "未识别到文本内容"

            # Skip malformed entries that lack the text part.
            text_lines = [
                line[1][0] for line in result[0] if line and len(line) >= 2
            ]
            return "\n".join(text_lines) if text_lines else "未识别到文本内容"
        except Exception as e:
            return f"PaddleOCR识别出错: {str(e)}"

    def _extract_from_pdf(self, pdf_path: str) -> str:
        """Extract text from a PDF.

        With the MinerU engine the PDF is handed to MinerU. Otherwise the
        embedded text of each page is read with ``pdfplumber`` and the pages
        are joined with blank lines; no OCR is applied on this path.
        """
        if self.ocr_type == "mineru":
            return self._extract_with_mineru(pdf_path)

        try:
            import pdfplumber

            text_content = []
            with pdfplumber.open(pdf_path) as pdf:
                for page in pdf.pages:
                    text = page.extract_text()
                    if text:
                        text_content.append(text)

            return "\n\n".join(text_content) if text_content else "未提取到文本内容"
        except ImportError:
            # pdfplumber is not installed: return installation guidance.
            return "PDF处理需要安装 pdfplumber 库\n可以运行: pip install pdfplumber"
        except Exception as e:
            return f"PDF处理出错: {str(e)}"