diff --git a/schoolNewsCrawler/crawler/xhw/XhwCrawler.py b/schoolNewsCrawler/crawler/xhw/XhwCrawler.py index 629fcf3..d9a8b0a 100644 --- a/schoolNewsCrawler/crawler/xhw/XhwCrawler.py +++ b/schoolNewsCrawler/crawler/xhw/XhwCrawler.py @@ -56,7 +56,27 @@ class XhwCrawler(BaseCrawler): 'sec-ch-ua-platform': '"Windows"' } ), - + "hot_point": UrlConfig( + url="https://xhsz.news.cn/focus_news", + method="GET", + params={}, + headers={ + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', + 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', + 'Accept-Encoding': 'gzip, deflate, br', + 'Connection': 'keep-alive', + 'Upgrade-Insecure-Requests': '1', + 'Sec-Fetch-Dest': 'document', + 'Sec-Fetch-Mode': 'navigate', + 'Sec-Fetch-Site': 'none', + 'Cache-Control': 'max-age=0', + 'Referer': 'https://xhsz.news.cn/', + 'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"', + 'sec-ch-ua-mobile': '?0', + 'sec-ch-ua-platform': '"Windows"' + } + ) }, ) super().__init__(config) @@ -318,37 +338,47 @@ class XhwCrawler(BaseCrawler): for child in children: try: tag_name = child.tag_name.lower() - if tag_name == "p": + if tag_name == "p" or tag_name == "div": text = child.text.strip().replace("\xa0", "") if not text and len(child.find_elements(By.TAG_NAME, "img")) == 0: continue - # 图片 - try: - img = child.find_element(By.TAG_NAME, "img") - src = img.get_attribute("src") - if src and not src.startswith("http"): - src = self.config.base_url + src - news_item.contentRows.append({"tag": "img", "content": f""}) - continue - except: - pass # 视频 try: video = child.find_element(By.TAG_NAME, "video") src = video.get_attribute("src") if src and not src.startswith("http"): - src = self.config.base_url + src + src = self._normalize_url(src) news_item.contentRows.append({"tag": "video", "content": f"