Manage calendars: skip calendar links whose URL date is more than one month old

author: Étienne Loks <etienne.loks@iggdrasil.net> 2019-08-12 17:58:47 +0200
committer: Étienne Loks <etienne.loks@iggdrasil.net> 2019-08-12 17:58:47 +0200
commit: a688056311899039fdb92f08e8e505ab9eaccd82 (patch)
tree: a6a27585b433a9044010eb2ad3bc27ede9199422
parent: 391cb54e98dc2c1661a4e0ae13739ed710297d02 (diff)
download: Comm-on-net-a688056311899039fdb92f08e8e505ab9eaccd82.tar.bz2
Comm-on-net-a688056311899039fdb92f08e8e505ab9eaccd82.zip
1 file changed, 24 insertions, 0 deletions
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index 8c78e61..ef406e9 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -1,6 +1,7 @@
 import datetime
 import time
 from random import randint
+import re
 import requests
 
 import scrapy
@@ -29,6 +30,9 @@ AUDIO_EXTS = (".aac", ".flac", ".m4a", ".mp3", ".ogg", ".oga", ".opus",
 OFFICE_EXTS = (".csv", ".doc", ".docx", ".odt", ".rtf", ".ods", ".xls", ".xlsx")
 
 CALENDAR_KEYS = ["agenda", "calendar"]
+DATE_REG = re.compile(r'20[0-9]{2}-[0-9]{2}-[0-9]{2}')
+DATE_FORMATS = ['%Y-%m-%d', '%Y/%m/%d', '%d/%m/%Y']
+DATE_MONTH_DELAY = 1
 
 MAX_LINKS = None  # if None no max
 TIMEOUT = datetime.timedelta(minutes=settings.CRAWL_TIMEOUT)
@@ -151,9 +155,29 @@ class DefaultSpider:
                 self._parse_image(response, result)
                 self._parse_iframe(response, result)
                 for link in LinkExtractor().extract_links(response):
+                    full_url = link.url
                     url = clean_url(link.url)
                     if url is None or url in self.links_reached:
                         continue
+                    match = DATE_REG.match(full_url)
+                    calendar_view, too_old = False, False
+                    if match:
+                        for calendar_key in CALENDAR_KEYS:
+                            if calendar_key in url:
+                                calendar_view = True
+                    if calendar_view:
+                        for date_format in DATE_FORMATS:
+                            try:
+                                d = datetime.datetime.strptime(match.group(),
+                                                               date_format)
+                                if datetime.date.today() - d > \
+                                        datetime.timedelta(
+                                            31 * DATE_MONTH_DELAY):
+                                    too_old = True
+                            except ValueError:
+                                pass
+                    if too_old:
+                        continue
                     is_internal = False
                     for domain in self.allowed_domains:
                         if domain in url:
author	Étienne Loks <etienne.loks@iggdrasil.net>	2019-08-12 17:58:47 +0200
committer	Étienne Loks <etienne.loks@iggdrasil.net>	2019-08-12 17:58:47 +0200
commit	a688056311899039fdb92f08e8e505ab9eaccd82 (patch)
tree	a6a27585b433a9044010eb2ad3bc27ede9199422
parent	391cb54e98dc2c1661a4e0ae13739ed710297d02 (diff)
download	Comm-on-net-a688056311899039fdb92f08e8e505ab9eaccd82.tar.bz2 Comm-on-net-a688056311899039fdb92f08e8e505ab9eaccd82.zip