Do not start the crawler process if there is no URL to process

author: Étienne Loks <etienne.loks@iggdrasil.net> 2019-08-12 15:49:58 +0200
committer: Étienne Loks <etienne.loks@iggdrasil.net> 2019-08-12 15:49:58 +0200
commit: b2b06f6ca9e1128fb355848cd5dd7db0e7ba9b0f (patch)
tree: 36293ae352aea89f79eb08c4666e9fbf334be1e7 /commcrawler
parent: c37796e9fa54ccc4f9a41635644c724854ede06d (diff)
download: Comm-on-net-b2b06f6ca9e1128fb355848cd5dd7db0e7ba9b0f.tar.bz2
Comm-on-net-b2b06f6ca9e1128fb355848cd5dd7db0e7ba9b0f.zip
1 file changed, 5 insertions, 2 deletions
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index 213f28d..40fc3b7 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -335,6 +335,7 @@ def launch_crawl(crawl_item, excluded_domains=None):
         process = CrawlerProcess(settings=scrap_settings)
         idx = 0
         current_idx = page * 50
+        has_url_to_process = False
         while current_idx < total and idx < 50:
             target = models.Target.objects.filter(pk=targets[current_idx]['id'])
             idx += 1
@@ -373,8 +374,9 @@ def launch_crawl(crawl_item, excluded_domains=None):
             else:
                 url = target.url
 
+            has_url_to_process = True
             redirect = None
-            if getattr(response, 'history', None):
+            if response and getattr(response, 'history', None):
                 url = response.url
                 redirect = url
                 domain = get_domain(url)
@@ -392,7 +394,8 @@ def launch_crawl(crawl_item, excluded_domains=None):
                     redirect
                 )
             )
-        process.start()
+        if has_url_to_process:
+            process.start()
     crawl_item.crawl_ended = timezone.now()
     crawl_item.status = "M"
     crawl_item.save()
author	Étienne Loks <etienne.loks@iggdrasil.net>	2019-08-12 15:49:58 +0200
committer	Étienne Loks <etienne.loks@iggdrasil.net>	2019-08-12 15:49:58 +0200
commit	b2b06f6ca9e1128fb355848cd5dd7db0e7ba9b0f (patch)
tree	36293ae352aea89f79eb08c4666e9fbf334be1e7 /commcrawler
parent	c37796e9fa54ccc4f9a41635644c724854ede06d (diff)
download	Comm-on-net-b2b06f6ca9e1128fb355848cd5dd7db0e7ba9b0f.tar.bz2 Comm-on-net-b2b06f6ca9e1128fb355848cd5dd7db0e7ba9b0f.zip