summary refs log tree commit diff
diff options
context:
space:
mode:
author Étienne Loks <etienne.loks@iggdrasil.net> 2019-08-12 18:02:15 +0200
committer Étienne Loks <etienne.loks@iggdrasil.net> 2019-08-12 18:02:15 +0200
commit ca70f94db0692180c015bb6fd87c606cc675970c (patch)
tree 1049f13cc9a2e3aa0a4822a2df9bce1d9c425bb1
parent a688056311899039fdb92f08e8e505ab9eaccd82 (diff)
download Comm-on-net-ca70f94db0692180c015bb6fd87c606cc675970c.tar.bz2
Comm-on-net-ca70f94db0692180c015bb6fd87c606cc675970c.zip
Manage dates that are too far in the future ("too young") for calendars
-rw-r--r-- commcrawler/scrapy.py 9
1 files changed, 7 insertions, 2 deletions
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index ef406e9..b729a1c 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -33,6 +33,7 @@ CALENDAR_KEYS = ["agenda", "calendar"]
DATE_REG = re.compile(r'20[0-9]{2}-[0-9]{2}-[0-9]{2}')
DATE_FORMATS = ['%Y-%m-%d', '%Y/%m/%d', '%d/%m/%Y']
DATE_MONTH_DELAY = 1
+DATE_MONTH_FUTUR_DELAY = 2
MAX_LINKS = None # if None no max
TIMEOUT = datetime.timedelta(minutes=settings.CRAWL_TIMEOUT)
@@ -160,7 +161,7 @@ class DefaultSpider:
if url is None or url in self.links_reached:
continue
match = DATE_REG.match(full_url)
- calendar_view, too_old = False, False
+ calendar_view, too_old, too_young = False, False, False
if match:
for calendar_key in CALENDAR_KEYS:
if calendar_key in url:
@@ -174,9 +175,13 @@ class DefaultSpider:
datetime.timedelta(
31 * DATE_MONTH_DELAY):
too_old = True
+ if d - datetime.date.today() > \
+ datetime.timedelta(
+ 31 * DATE_MONTH_FUTUR_DELAY):
+ too_young = True
except ValueError:
pass
- if too_old:
+ if too_old or too_young:
continue
is_internal = False
for domain in self.allowed_domains: