From ca70f94db0692180c015bb6fd87c606cc675970c Mon Sep 17 00:00:00 2001 From: Étienne Loks Date: Mon, 12 Aug 2019 18:02:15 +0200 Subject: Manage too young for calendars --- commcrawler/scrapy.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py index ef406e9..b729a1c 100644 --- a/commcrawler/scrapy.py +++ b/commcrawler/scrapy.py @@ -33,6 +33,7 @@ CALENDAR_KEYS = ["agenda", "calendar"] DATE_REG = re.compile(r'20[0-9]{2}-[0-9]{2}-[0-9]{2}') DATE_FORMATS = ['%Y-%m-%d', '%Y/%m/%d', '%d/%m/%Y'] DATE_MONTH_DELAY = 1 +DATE_MONTH_FUTUR_DELAY = 2 MAX_LINKS = None # if None no max TIMEOUT = datetime.timedelta(minutes=settings.CRAWL_TIMEOUT) @@ -160,7 +161,7 @@ class DefaultSpider: if url is None or url in self.links_reached: continue match = DATE_REG.match(full_url) - calendar_view, too_old = False, False + calendar_view, too_old, too_young = False, False, False if match: for calendar_key in CALENDAR_KEYS: if calendar_key in url: @@ -174,9 +175,13 @@ class DefaultSpider: datetime.timedelta( 31 * DATE_MONTH_DELAY): too_old = True + if d - datetime.date.today() > \ + datetime.timedelta( + 31 * DATE_MONTH_FUTUR_DELAY): + too_young = True except ValueError: pass - if too_old: + if too_old or too_young: continue is_internal = False for domain in self.allowed_domains: -- cgit v1.2.3