From eecc7eac9112389c0b76e71cc831b21e889f1ab6 Mon Sep 17 00:00:00 2001
From: Étienne Loks
Date: Tue, 20 Aug 2019 12:48:21 +0200
Subject: Crawl: pass already done

---
 commcrawler/scrapy.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'commcrawler/scrapy.py')

diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index e69268b..ff8e83a 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -369,20 +369,28 @@ def launch_crawl(crawl_item, excluded_domains=None):
     page_number = total // NUMBER_PER_PAGE
     while page <= page_number and not (ONLY_FIRST_PAGE and page):
         process = CrawlerProcess(settings=scrap_settings)
-        idx = 0
+        idx, delta = 0, 0
         current_idx = page * NUMBER_PER_PAGE
         has_url_to_process = False
         while current_idx < total and idx < NUMBER_PER_PAGE:
             target = models.Target.objects.filter(
                 pk=targets[current_idx]['id'])
             idx += 1
-            current_idx = idx + page * NUMBER_PER_PAGE
+            current_idx = idx + delta + page * NUMBER_PER_PAGE
             if not target.count():  # target has disappeared
+                idx -= 1
+                delta += 1
                 continue
             target = target.all()[0]
             result_dct = {
                 "crawl_id": crawl_item.pk, "target_id": target.pk,
             }
+            q = models.CrawlResult.objects.filter(**result_dct)
+            if q.count():  # already managed
+                idx -= 1
+                delta += 1
+                continue
+
             response, verify_ssl = None, True
             url = target.url
             while response is None:
--
cgit v1.2.3
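
For context, below is a minimal, self-contained sketch of the idx/delta bookkeeping this commit introduces; it is not the project's actual code. NUMBER_PER_PAGE is taken from the diff, while schedule_page, is_gone and already_crawled are hypothetical stand-ins for the Django ORM lookups (models.Target / models.CrawlResult) used in launch_crawl.

# A minimal sketch of the idx/delta bookkeeping added by this commit --
# NOT the project's actual code. schedule_page, is_gone and already_crawled
# are hypothetical stand-ins for the ORM queries used in launch_crawl.

NUMBER_PER_PAGE = 3


def schedule_page(targets, page, is_gone, already_crawled):
    """Collect up to NUMBER_PER_PAGE crawlable targets for one page.

    idx counts targets actually scheduled on this page, delta counts
    skipped entries; a skipped target therefore no longer consumes a
    page slot, which is the point of the patch.
    """
    scheduled = []
    idx, delta = 0, 0
    current_idx = page * NUMBER_PER_PAGE
    while current_idx < len(targets) and idx < NUMBER_PER_PAGE:
        target = targets[current_idx]
        idx += 1
        current_idx = idx + delta + page * NUMBER_PER_PAGE
        if is_gone(target) or already_crawled(target):
            # Give the slot back and step past the skipped entry,
            # mirroring the patch's idx -= 1 / delta += 1.
            idx -= 1
            delta += 1
            continue
        scheduled.append(target)
    return scheduled


# Targets 1 and 3 already have a result: page 0 still fills its three
# slots with 0, 2 and 4 instead of spending two of them on skipped targets.
done = {1, 3}
print(schedule_page(list(range(6)), page=0,
                    is_gone=lambda t: False,
                    already_crawled=lambda t: t in done))
# -> [0, 2, 4]

A side effect worth noting: since each page now walks past its nominal boundary when entries are skipped, a later page may re-visit indices an earlier page already consumed; those presumably get filtered out again by the same "already managed" check on CrawlResult, which keeps the pagination self-correcting.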