diff options
author | Étienne Loks <etienne.loks@iggdrasil.net> | 2019-08-12 15:49:58 +0200 |
---|---|---|
committer | Étienne Loks <etienne.loks@iggdrasil.net> | 2019-08-12 15:49:58 +0200 |
commit | b2b06f6ca9e1128fb355848cd5dd7db0e7ba9b0f (patch) | |
tree | 36293ae352aea89f79eb08c4666e9fbf334be1e7 /commcrawler | |
parent | c37796e9fa54ccc4f9a41635644c724854ede06d (diff) | |
download | Comm-on-net-b2b06f6ca9e1128fb355848cd5dd7db0e7ba9b0f.tar.bz2 Comm-on-net-b2b06f6ca9e1128fb355848cd5dd7db0e7ba9b0f.zip |
Do not process if no URL is reached
Diffstat (limited to 'commcrawler')
-rw-r--r-- | commcrawler/scrapy.py | 7 |
1 file changed, 5 insertions, 2 deletions
```diff
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index 213f28d..40fc3b7 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -335,6 +335,7 @@ def launch_crawl(crawl_item, excluded_domains=None):
  process = CrawlerProcess(settings=scrap_settings)
  idx = 0
  current_idx = page * 50
+ has_url_to_process = False
  while current_idx < total and idx < 50:
  target = models.Target.objects.filter(pk=targets[current_idx]['id'])
  idx += 1
@@ -373,8 +374,9 @@ def launch_crawl(crawl_item, excluded_domains=None):
  else:
  url = target.url
+ has_url_to_process = True
  redirect = None
- if getattr(response, 'history', None):
+ if response and getattr(response, 'history', None):
  url = response.url
  redirect = url
  domain = get_domain(url)
@@ -392,7 +394,8 @@ def launch_crawl(crawl_item, excluded_domains=None):
  redirect
  )
  )
- process.start()
+ if has_url_to_process:
+ process.start()
  crawl_item.crawl_ended = timezone.now()
  crawl_item.status = "M"
  crawl_item.save()
```