diff options
author | Étienne Loks <etienne.loks@iggdrasil.net> | 2019-08-12 18:02:15 +0200 |
---|---|---|
committer | Étienne Loks <etienne.loks@iggdrasil.net> | 2019-08-12 18:02:15 +0200 |
commit | ca70f94db0692180c015bb6fd87c606cc675970c (patch) | |
tree | 1049f13cc9a2e3aa0a4822a2df9bce1d9c425bb1 | |
parent | a688056311899039fdb92f08e8e505ab9eaccd82 (diff) | |
download | Comm-on-net-ca70f94db0692180c015bb6fd87c606cc675970c.tar.bz2 Comm-on-net-ca70f94db0692180c015bb6fd87c606cc675970c.zip |
Skip calendar links whose date is too far in the future ("too young")
-rw-r--r-- | commcrawler/scrapy.py | 9 |
1 file changed, 7 insertions, 2 deletions
```diff
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index ef406e9..b729a1c 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -33,6 +33,7 @@ CALENDAR_KEYS = ["agenda", "calendar"]
 DATE_REG = re.compile(r'20[0-9]{2}-[0-9]{2}-[0-9]{2}')
 DATE_FORMATS = ['%Y-%m-%d', '%Y/%m/%d', '%d/%m/%Y']
 DATE_MONTH_DELAY = 1
+DATE_MONTH_FUTUR_DELAY = 2
 MAX_LINKS = None  # if None no max
 TIMEOUT = datetime.timedelta(minutes=settings.CRAWL_TIMEOUT)
 
@@ -160,7 +161,7 @@ class DefaultSpider:
             if url is None or url in self.links_reached:
                 continue
             match = DATE_REG.match(full_url)
-            calendar_view, too_old = False, False
+            calendar_view, too_old, too_young = False, False, False
             if match:
                 for calendar_key in CALENDAR_KEYS:
                     if calendar_key in url:
@@ -174,9 +175,13 @@ class DefaultSpider:
                                     datetime.timedelta(
                                         31 * DATE_MONTH_DELAY):
                                 too_old = True
+                            if d - datetime.date.today() > \
+                                    datetime.timedelta(
+                                        31 * DATE_MONTH_FUTUR_DELAY):
+                                too_young = True
                     except ValueError:
                         pass
-            if too_old:
+            if too_old or too_young:
                 continue
             is_internal = False
             for domain in self.allowed_domains:
```