修复浙江爬虫日期过滤:客户端精确过滤+智能翻页逻辑;补充python-dotenv依赖
This commit is contained in:
@@ -3,3 +3,4 @@ beautifulsoup4>=4.12.0
|
|||||||
lxml>=4.9.0
|
lxml>=4.9.0
|
||||||
pdfplumber>=0.10.0
|
pdfplumber>=0.10.0
|
||||||
python-docx>=1.0.0
|
python-docx>=1.0.0
|
||||||
|
python-dotenv>=1.0.0
|
||||||
|
|||||||
@@ -251,13 +251,16 @@ class ZhejiangSpider(BaseSpider):
|
|||||||
page_size = self.spider_config.get("page_size", 20)
|
page_size = self.spider_config.get("page_size", 20)
|
||||||
|
|
||||||
# 日期范围
|
# 日期范围
|
||||||
|
target_date = None # 客户端精确过滤用
|
||||||
if date_filter == "yesterday":
|
if date_filter == "yesterday":
|
||||||
d = datetime.now() - timedelta(days=1)
|
d = datetime.now() - timedelta(days=1)
|
||||||
start_date = end_date = d.strftime("%Y-%m-%d")
|
target_date = d.strftime("%Y-%m-%d")
|
||||||
logger.info(f"过滤日期: {start_date}(昨天)")
|
start_date = end_date = target_date
|
||||||
|
logger.info(f"过滤日期: {target_date}(昨天)")
|
||||||
elif date_filter:
|
elif date_filter:
|
||||||
|
target_date = date_filter
|
||||||
start_date = end_date = date_filter
|
start_date = end_date = date_filter
|
||||||
logger.info(f"过滤日期: {start_date}")
|
logger.info(f"过滤日期: {target_date}")
|
||||||
else:
|
else:
|
||||||
# 默认近一个月
|
# 默认近一个月
|
||||||
end_date = datetime.now().strftime("%Y-%m-%d")
|
end_date = datetime.now().strftime("%Y-%m-%d")
|
||||||
@@ -294,11 +297,15 @@ class ZhejiangSpider(BaseSpider):
|
|||||||
break
|
break
|
||||||
|
|
||||||
count = 0
|
count = 0
|
||||||
|
has_older = False
|
||||||
for rec in records:
|
for rec in records:
|
||||||
# 客户端日期二次过滤:跳过不在目标日期范围内的记录
|
# 客户端日期精确过滤
|
||||||
rec_date = rec.get("webdate", "").split(" ")[0]
|
rec_date = rec.get("webdate", "").split(" ")[0]
|
||||||
if date_filter and rec_date and rec_date != start_date:
|
if target_date and rec_date:
|
||||||
continue
|
if rec_date != target_date:
|
||||||
|
if rec_date < target_date:
|
||||||
|
has_older = True
|
||||||
|
continue
|
||||||
|
|
||||||
link = rec.get("linkurl", "")
|
link = rec.get("linkurl", "")
|
||||||
if link and not link.startswith("http"):
|
if link and not link.startswith("http"):
|
||||||
@@ -338,8 +345,14 @@ class ZhejiangSpider(BaseSpider):
|
|||||||
logger.info(f" 获取 {count} 条数据")
|
logger.info(f" 获取 {count} 条数据")
|
||||||
|
|
||||||
if count == 0:
|
if count == 0:
|
||||||
logger.info("当前页无新数据,停止翻页")
|
if not target_date or has_older:
|
||||||
break
|
logger.info("当前页无新数据,停止翻页")
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
# 当前页全是比目标日期更新的数据,继续翻页
|
||||||
|
logger.info(" 当前页均为更新日期的数据,继续翻页")
|
||||||
|
self.delay()
|
||||||
|
continue
|
||||||
|
|
||||||
self.delay()
|
self.delay()
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user