summary refs log tree commit diff
path: root/commcrawler
diff options
context:
space:
mode:
author Étienne Loks <etienne.loks@iggdrasil.net> 2019-08-12 15:49:58 +0200
committer Étienne Loks <etienne.loks@iggdrasil.net> 2019-08-12 15:49:58 +0200
commit b2b06f6ca9e1128fb355848cd5dd7db0e7ba9b0f (patch)
tree 36293ae352aea89f79eb08c4666e9fbf334be1e7 /commcrawler
parent c37796e9fa54ccc4f9a41635644c724854ede06d (diff)
download Comm-on-net-b2b06f6ca9e1128fb355848cd5dd7db0e7ba9b0f.tar.bz2
Comm-on-net-b2b06f6ca9e1128fb355848cd5dd7db0e7ba9b0f.zip
Do not process if no URL is reached
Diffstat (limited to 'commcrawler')
-rw-r--r-- commcrawler/scrapy.py 7
1 files changed, 5 insertions, 2 deletions
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index 213f28d..40fc3b7 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -335,6 +335,7 @@ def launch_crawl(crawl_item, excluded_domains=None):
process = CrawlerProcess(settings=scrap_settings)
idx = 0
current_idx = page * 50
+ has_url_to_process = False
while current_idx < total and idx < 50:
target = models.Target.objects.filter(pk=targets[current_idx]['id'])
idx += 1
@@ -373,8 +374,9 @@ def launch_crawl(crawl_item, excluded_domains=None):
else:
url = target.url
+ has_url_to_process = True
redirect = None
- if getattr(response, 'history', None):
+ if response and getattr(response, 'history', None):
url = response.url
redirect = url
domain = get_domain(url)
@@ -392,7 +394,8 @@ def launch_crawl(crawl_item, excluded_domains=None):
redirect
)
)
- process.start()
+ if has_url_to_process:
+ process.start()
crawl_item.crawl_ended = timezone.now()
crawl_item.status = "M"
crawl_item.save()