diff options
Diffstat (limited to 'commcrawler')
-rw-r--r-- | commcrawler/scrapy.py | 9 |
1 files changed, 7 insertions, 2 deletions
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py index ef406e9..b729a1c 100644 --- a/commcrawler/scrapy.py +++ b/commcrawler/scrapy.py @@ -33,6 +33,7 @@ CALENDAR_KEYS = ["agenda", "calendar"] DATE_REG = re.compile(r'20[0-9]{2}-[0-9]{2}-[0-9]{2}') DATE_FORMATS = ['%Y-%m-%d', '%Y/%m/%d', '%d/%m/%Y'] DATE_MONTH_DELAY = 1 +DATE_MONTH_FUTUR_DELAY = 2 MAX_LINKS = None # if None no max TIMEOUT = datetime.timedelta(minutes=settings.CRAWL_TIMEOUT) @@ -160,7 +161,7 @@ class DefaultSpider: if url is None or url in self.links_reached: continue match = DATE_REG.match(full_url) - calendar_view, too_old = False, False + calendar_view, too_old, too_young = False, False, False if match: for calendar_key in CALENDAR_KEYS: if calendar_key in url: @@ -174,9 +175,13 @@ class DefaultSpider: datetime.timedelta( 31 * DATE_MONTH_DELAY): too_old = True + if d - datetime.date.today() > \ + datetime.timedelta( + 31 * DATE_MONTH_FUTUR_DELAY): + too_young = True except ValueError: pass - if too_old: + if too_old or too_young: continue is_internal = False for domain in self.allowed_domains: |