1 files changed, 24 insertions, 0 deletions
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index 8c78e61..ef406e9 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -1,6 +1,7 @@
 import datetime
 import time
 from random import randint
+import re
 import requests
 
 import scrapy
@@ -29,6 +30,9 @@ AUDIO_EXTS = (".aac", ".flac", ".m4a", ".mp3", ".ogg", ".oga", ".opus",
 OFFICE_EXTS = (".csv", ".doc", ".docx", ".odt", ".rtf", ".ods", ".xls", ".xlsx")
 
 CALENDAR_KEYS = ["agenda", "calendar"]
+DATE_REG = re.compile(r'20\d{2}[-/]\d{2}[-/]\d{2}|\d{2}/\d{2}/20\d{2}', re.A)
+DATE_FORMATS = ['%Y-%m-%d', '%Y/%m/%d', '%d/%m/%Y']  # must mirror DATE_REG
+DATE_MONTH_DELAY = 1  # months; older dated calendar links are skipped
 
 MAX_LINKS = None  # if None no max
 TIMEOUT = datetime.timedelta(minutes=settings.CRAWL_TIMEOUT)
@@ -151,9 +155,29 @@ class DefaultSpider:
                 self._parse_image(response, result)
                 self._parse_iframe(response, result)
                 for link in LinkExtractor().extract_links(response):
+                    full_url = link.url
                     url = clean_url(link.url)
                     if url is None or url in self.links_reached:
                         continue
+                    # search(), not match(): a URL starts with its scheme, so
+                    # a date can never sit at position 0
+                    match = DATE_REG.search(full_url)
+                    too_old = False
+                    if match and any(key in url for key in CALENDAR_KEYS):
+                        limit = datetime.date.today() - datetime.timedelta(
+                            31 * DATE_MONTH_DELAY)
+                        for date_format in DATE_FORMATS:
+                            try:
+                                d = datetime.datetime.strptime(
+                                    match.group(), date_format).date()
+                            except ValueError:
+                                continue  # this format does not fit
+                            # date vs date: "date - datetime" is a TypeError
+                            if d < limit:
+                                too_old = True
+                                break
+                    if too_old:
+                        continue
                     is_internal = False
                     for domain in self.allowed_domains:
                         if domain in url: