summary refs log tree commit diff
path: root/commcrawler
diff options
context:
space:
mode:
Diffstat (limited to 'commcrawler')
-rw-r--r-- commcrawler/scrapy.py 24
1 files changed, 24 insertions, 0 deletions
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index 8c78e61..ef406e9 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -1,6 +1,7 @@
import datetime
import time
from random import randint
+import re
import requests
import scrapy
@@ -29,6 +30,9 @@ AUDIO_EXTS = (".aac", ".flac", ".m4a", ".mp3", ".ogg", ".oga", ".opus",
OFFICE_EXTS = (".csv", ".doc", ".docx", ".odt", ".rtf", ".ods", ".xls", ".xlsx")
CALENDAR_KEYS = ["agenda", "calendar"]
+DATE_REG = re.compile(r'20[0-9]{2}-[0-9]{2}-[0-9]{2}')
+DATE_FORMATS = ['%Y-%m-%d', '%Y/%m/%d', '%d/%m/%Y']
+DATE_MONTH_DELAY = 1
MAX_LINKS = None # if None no max
TIMEOUT = datetime.timedelta(minutes=settings.CRAWL_TIMEOUT)
@@ -151,9 +155,29 @@ class DefaultSpider:
self._parse_image(response, result)
self._parse_iframe(response, result)
for link in LinkExtractor().extract_links(response):
+ full_url = link.url
url = clean_url(link.url)
if url is None or url in self.links_reached:
continue
+ match = DATE_REG.match(full_url)
+ calendar_view, too_old = False, False
+ if match:
+ for calendar_key in CALENDAR_KEYS:
+ if calendar_key in url:
+ calendar_view = True
+ if calendar_view:
+ for date_format in DATE_FORMATS:
+ try:
+ d = datetime.datetime.strptime(match.group(),
+ date_format)
+ if datetime.date.today() - d > \
+ datetime.timedelta(
+ 31 * DATE_MONTH_DELAY):
+ too_old = True
+ except ValueError:
+ pass
+ if too_old:
+ continue
is_internal = False
for domain in self.allowed_domains:
if domain in url: