summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Étienne Loks <etienne.loks@iggdrasil.net> 2019-08-12 17:58:47 +0200
committer: Étienne Loks <etienne.loks@iggdrasil.net> 2019-08-12 17:58:47 +0200
commit: a688056311899039fdb92f08e8e505ab9eaccd82 (patch)
tree: a6a27585b433a9044010eb2ad3bc27ede9199422
parent: 391cb54e98dc2c1661a4e0ae13739ed710297d02 (diff)
download: Comm-on-net-a688056311899039fdb92f08e8e505ab9eaccd82.tar.bz2
Comm-on-net-a688056311899039fdb92f08e8e505ab9eaccd82.zip
Manage calendars
-rw-r--r-- commcrawler/scrapy.py 24
1 files changed, 24 insertions, 0 deletions
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index 8c78e61..ef406e9 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -1,6 +1,7 @@
import datetime
import time
from random import randint
+import re
import requests
import scrapy
@@ -29,6 +30,9 @@ AUDIO_EXTS = (".aac", ".flac", ".m4a", ".mp3", ".ogg", ".oga", ".opus",
OFFICE_EXTS = (".csv", ".doc", ".docx", ".odt", ".rtf", ".ods", ".xls", ".xlsx")
CALENDAR_KEYS = ["agenda", "calendar"]
+DATE_REG = re.compile(r'20[0-9]{2}-[0-9]{2}-[0-9]{2}')
+DATE_FORMATS = ['%Y-%m-%d', '%Y/%m/%d', '%d/%m/%Y']
+DATE_MONTH_DELAY = 1
MAX_LINKS = None # if None no max
TIMEOUT = datetime.timedelta(minutes=settings.CRAWL_TIMEOUT)
@@ -151,9 +155,29 @@ class DefaultSpider:
self._parse_image(response, result)
self._parse_iframe(response, result)
for link in LinkExtractor().extract_links(response):
+ full_url = link.url
url = clean_url(link.url)
if url is None or url in self.links_reached:
continue
+ match = DATE_REG.match(full_url)
+ calendar_view, too_old = False, False
+ if match:
+ for calendar_key in CALENDAR_KEYS:
+ if calendar_key in url:
+ calendar_view = True
+ if calendar_view:
+ for date_format in DATE_FORMATS:
+ try:
+ d = datetime.datetime.strptime(match.group(),
+ date_format)
+ if datetime.date.today() - d > \
+ datetime.timedelta(
+ 31 * DATE_MONTH_DELAY):
+ too_old = True
+ except ValueError:
+ pass
+ if too_old:
+ continue
is_internal = False
for domain in self.allowed_domains:
if domain in url: