diff options
Diffstat (limited to 'commcrawler/scrapy.py')
-rw-r--r-- | commcrawler/scrapy.py | 24 |
1 file changed, 24 insertions, 0 deletions
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py index 8c78e61..ef406e9 100644 --- a/commcrawler/scrapy.py +++ b/commcrawler/scrapy.py @@ -1,6 +1,7 @@ import datetime import time from random import randint +import re import requests import scrapy @@ -29,6 +30,9 @@ AUDIO_EXTS = (".aac", ".flac", ".m4a", ".mp3", ".ogg", ".oga", ".opus", OFFICE_EXTS = (".csv", ".doc", ".docx", ".odt", ".rtf", ".ods", ".xls", ".xlsx") CALENDAR_KEYS = ["agenda", "calendar"] +DATE_REG = re.compile(r'20[0-9]{2}-[0-9]{2}-[0-9]{2}') +DATE_FORMATS = ['%Y-%m-%d', '%Y/%m/%d', '%d/%m/%Y'] +DATE_MONTH_DELAY = 1 MAX_LINKS = None # if None no max TIMEOUT = datetime.timedelta(minutes=settings.CRAWL_TIMEOUT) @@ -151,9 +155,29 @@ class DefaultSpider: self._parse_image(response, result) self._parse_iframe(response, result) for link in LinkExtractor().extract_links(response): + full_url = link.url url = clean_url(link.url) if url is None or url in self.links_reached: continue + match = DATE_REG.match(full_url) + calendar_view, too_old = False, False + if match: + for calendar_key in CALENDAR_KEYS: + if calendar_key in url: + calendar_view = True + if calendar_view: + for date_format in DATE_FORMATS: + try: + d = datetime.datetime.strptime(match.group(), + date_format) + if datetime.date.today() - d > \ + datetime.timedelta( + 31 * DATE_MONTH_DELAY): + too_old = True + except ValueError: + pass + if too_old: + continue is_internal = False for domain in self.allowed_domains: if domain in url: |