author     Étienne Loks <etienne.loks@iggdrasil.net>   2019-08-20 12:48:21 +0200
committer  Étienne Loks <etienne.loks@iggdrasil.net>   2019-08-20 12:48:21 +0200
commit     eecc7eac9112389c0b76e71cc831b21e889f1ab6 (patch)
tree       71d3f438c5ab16c991070f277411c9ee1acd5883
parent     d9c27b5bacc00fd5f0c6012c47688d5ff1722a29 (diff)
Crawl: pass already done
-rw-r--r--  commcrawler/scrapy.py  12
1 file changed, 10 insertions(+), 2 deletions(-)
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index e69268b..ff8e83a 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -369,20 +369,28 @@ def launch_crawl(crawl_item, excluded_domains=None):
     page_number = total // NUMBER_PER_PAGE
     while page <= page_number and not (ONLY_FIRST_PAGE and page):
         process = CrawlerProcess(settings=scrap_settings)
-        idx = 0
+        idx, delta = 0, 0
         current_idx = page * NUMBER_PER_PAGE
         has_url_to_process = False
         while current_idx < total and idx < NUMBER_PER_PAGE:
             target = models.Target.objects.filter(pk=targets[current_idx]['id'])
             idx += 1
-            current_idx = idx + page * NUMBER_PER_PAGE
+            current_idx = idx + delta + page * NUMBER_PER_PAGE
             if not target.count():  # target has disappear
+                idx -= 1
+                delta += 1
                 continue
             target = target.all()[0]
             result_dct = {
                 "crawl_id": crawl_item.pk,
                 "target_id": target.pk,
             }
+            q = models.CrawlResult.objects.filter(**result_dct)
+            if q.count():  # already managed
+                idx -= 1
+                delta += 1
+                continue
+
             response, verify_ssl = None, True
             url = target.url
             while response is None:
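
For context on what the patch does: the crawl schedules targets in pages of NUMBER_PER_PAGE, and this commit makes skipped entries (a target that has disappeared, or one that already has a CrawlResult) not count against the page quota by tracking them in a separate delta counter. Below is a minimal, self-contained sketch of that bookkeeping; schedule_page, targets, and already_done are illustrative stand-ins, not the project's API, which goes through the models.Target / models.CrawlResult querysets shown in the diff.

# Sketch (hedged): the idx/delta bookkeeping from the patch, reduced to plain
# Python so it runs standalone. Skipped entries bump `delta` instead of `idx`,
# so they still advance `current_idx` through the list without eating into
# the NUMBER_PER_PAGE quota.

NUMBER_PER_PAGE = 3

def schedule_page(targets, already_done, page):
    """Return the targets that should be crawled on one page.

    `targets` is a flat list where None marks a target that has
    disappeared; `already_done` is a set of targets that already have a
    crawl result. Both are stand-ins for the querysets in the real code.
    """
    total = len(targets)
    idx, delta = 0, 0
    current_idx = page * NUMBER_PER_PAGE
    scheduled = []
    while current_idx < total and idx < NUMBER_PER_PAGE:
        target = targets[current_idx]
        idx += 1
        current_idx = idx + delta + page * NUMBER_PER_PAGE
        if target is None or target in already_done:  # disappeared / already managed
            idx -= 1
            delta += 1
            continue
        scheduled.append(target)
    return scheduled

# "b" is already crawled and slot 3 has disappeared, yet the first page
# still gets three fresh targets because skips are not counted.
print(schedule_page(["a", "b", "c", None, "d", "e"], already_done={"b"}, page=0))
# -> ['a', 'c', 'd']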