人民日报增加域

2025-11-19 16:41:41 +08:00
parent 0e436e31f3
commit 1ad118b0d3
7 changed files with 56 additions and 28 deletions
--- a/schoolNewsCrawler/crawler/rmrb/RmrbCrawler.py
+++ b/schoolNewsCrawler/crawler/rmrb/RmrbCrawler.py
@@ -6,7 +6,7 @@ from loguru import logger
 import re
 import chardet
 from datetime import datetime, timedelta
-from bs4.element import NavigableString
+from bs4.element import NavigableString, Tag
 from urllib.parse import urlparse
 import json

@@ -70,6 +70,7 @@ class RmrbCrawler(BaseCrawler):
            "politics": self.parse_base_news_detail,
            "finance": self.parse_base_news_detail,
            "cpc": self.parse_cpc_news_detail,
+            "theory": self.parse_cpc_news_detail,
        }
    
    def search(self, key: str, total: int, news_type: int = 0) -> ResultDomain:
@@ -518,13 +519,11 @@ class RmrbCrawler(BaseCrawler):

            # 遍历 show_text 下的所有直接子节点（保持顺序）
            for child in content_div.children:
-                # 跳过纯文本节点（如换行、空格）
-                if isinstance(child, NavigableString):
+                # 只处理 Tag 类型的节点，跳过文本节点、注释等
+                if not isinstance(child, Tag):
                    continue
-
+                
                tag_name = child.name
-                if tag_name is None:
-                    continue

                # 情况1：检测是否是视频容器（根据 id 特征或内部结构）
                video_tag = child.find('video') if tag_name != 'video' else child
@@ -578,7 +577,7 @@ class RmrbCrawler(BaseCrawler):
                        continue

                    # 情况2：检查是否包含人民网的 showPlayer 脚本（动态视频）
-                    script_tags = child.find_all('script', string=True)
+                    script_tags = child.find_all('script')
                    video_src = None
                    poster_url = None

--- a/schoolNewsCrawler/crawler/rmrb/RmrbHotPoint.py
+++ b/schoolNewsCrawler/crawler/rmrb/RmrbHotPoint.py
@@ -11,7 +11,7 @@ import sys
 from pathlib import Path

 # Add project root directory to path to import crawler
-sys.path.insert(0, str(Path(__file__).parent.parent))
+sys.path.insert(0, str(Path(__file__).parent.parent.parent))

 from crawler.rmrb.RmrbCrawler import RmrbCrawler
 from loguru import logger
--- a/schoolNewsCrawler/crawler/rmrb/RmrbSearch.py
+++ b/schoolNewsCrawler/crawler/rmrb/RmrbSearch.py
@@ -10,8 +10,8 @@ import json
 import sys
 from pathlib import Path

-# Add parent directory to path to import crawler
-sys.path.insert(0, str(Path(__file__).parent.parent))
+# Add project root directory to path to import crawler
+sys.path.insert(0, str(Path(__file__).parent.parent.parent))

 from crawler.rmrb.RmrbCrawler import RmrbCrawler
 from loguru import logger
@@ -89,11 +89,13 @@ def main():
            "data": None,
            "dataList": [item.model_dump() for item in result.dataList] if result.dataList else []
        }
-        result = None
-        with open("F:\Project\schoolNews\schoolNewsCrawler\output\output.json", "r", encoding="utf-8") as f:
-            result = json.load(f)
-        print(result)
-        output = result
+        # result = None
+        # with open("F:\Project\schoolNews\schoolNewsCrawler\output\output.json", "r", encoding="utf-8") as f:
+        #     result = json.load(f)
+        # print(result)
+        # output = result
+
+        
        if output_file:
            output_path = Path(output_file)
            output_path.parent.mkdir(parents=True, exist_ok=True)
@@ -101,13 +103,10 @@ def main():
                json.dump(output, f, ensure_ascii=False, indent=2)
            logger.info(f"结果已保存到: {output_file}")

-    
        crawler.close()
-        # sys.exit(0 if result.success else 1)
+        sys.exit(0 if result.success else 1)
        # print(json.dumps(output, ensure_ascii=False, indent=2))
-
-        sys.exit(0 if result["success"] else 1)
-
+        # sys.exit(0 if result["success"] else 1)
    except Exception as e:
        logger.error(f"执行失败: {str(e)}")
        error_output = {
--- a/schoolNewsCrawler/crawler/rmrb/RmrbTrending.py
+++ b/schoolNewsCrawler/crawler/rmrb/RmrbTrending.py
@@ -14,7 +14,7 @@ from datetime import datetime, timedelta
 from pathlib import Path

 # Add project root directory to path to import crawler
-sys.path.insert(0, str(Path(__file__).parent.parent))
+sys.path.insert(0, str(Path(__file__).parent.parent.parent))

 from crawler.rmrb.RmrbCrawler import RmrbCrawler
 from loguru import logger
--- a/schoolNewsCrawler/crawler/rmrb/init.py
+++ b/schoolNewsCrawler/crawler/rmrb/init.py