summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Étienne Loks <etienne.loks@iggdrasil.net> 2019-08-20 12:48:21 +0200
committer Étienne Loks <etienne.loks@iggdrasil.net> 2019-08-20 12:48:21 +0200
commit eecc7eac9112389c0b76e71cc831b21e889f1ab6 (patch)
tree 71d3f438c5ab16c991070f277411c9ee1acd5883
parent d9c27b5bacc00fd5f0c6012c47688d5ff1722a29 (diff)
download Comm-on-net-eecc7eac9112389c0b76e71cc831b21e889f1ab6.tar.bz2
download Comm-on-net-eecc7eac9112389c0b76e71cc831b21e889f1ab6.zip
Crawl: pass already done
-rw-r--r-- commcrawler/scrapy.py 12
1 files changed, 10 insertions, 2 deletions
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index e69268b..ff8e83a 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -369,20 +369,28 @@ def launch_crawl(crawl_item, excluded_domains=None):
page_number = total // NUMBER_PER_PAGE
while page <= page_number and not (ONLY_FIRST_PAGE and page):
process = CrawlerProcess(settings=scrap_settings)
- idx = 0
+ idx, delta = 0, 0
current_idx = page * NUMBER_PER_PAGE
has_url_to_process = False
while current_idx < total and idx < NUMBER_PER_PAGE:
target = models.Target.objects.filter(pk=targets[current_idx]['id'])
idx += 1
- current_idx = idx + page * NUMBER_PER_PAGE
+ current_idx = idx + delta + page * NUMBER_PER_PAGE
if not target.count(): # target has disappear
+ idx -= 1
+ delta += 1
continue
target = target.all()[0]
result_dct = {
"crawl_id": crawl_item.pk,
"target_id": target.pk,
}
+ q = models.CrawlResult.objects.filter(**result_dct)
+ if q.count(): # already managed
+ idx -= 1
+ delta += 1
+ continue
+
response, verify_ssl = None, True
url = target.url
while response is None: