summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- commcrawler/scrapy.py 9
1 files changed, 7 insertions, 2 deletions
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index ef406e9..b729a1c 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -33,6 +33,7 @@ CALENDAR_KEYS = ["agenda", "calendar"]
DATE_REG = re.compile(r'20[0-9]{2}-[0-9]{2}-[0-9]{2}')
DATE_FORMATS = ['%Y-%m-%d', '%Y/%m/%d', '%d/%m/%Y']
DATE_MONTH_DELAY = 1
+DATE_MONTH_FUTUR_DELAY = 2
MAX_LINKS = None # if None no max
TIMEOUT = datetime.timedelta(minutes=settings.CRAWL_TIMEOUT)
@@ -160,7 +161,7 @@ class DefaultSpider:
if url is None or url in self.links_reached:
continue
match = DATE_REG.match(full_url)
- calendar_view, too_old = False, False
+ calendar_view, too_old, too_young = False, False, False
if match:
for calendar_key in CALENDAR_KEYS:
if calendar_key in url:
@@ -174,9 +175,13 @@ class DefaultSpider:
datetime.timedelta(
31 * DATE_MONTH_DELAY):
too_old = True
+ if d - datetime.date.today() > \
+ datetime.timedelta(
+ 31 * DATE_MONTH_FUTUR_DELAY):
+ too_young = True
except ValueError:
pass
- if too_old:
+ if too_old or too_young:
continue
is_internal = False
for domain in self.allowed_domains: