author | Étienne Loks <etienne.loks@iggdrasil.net> | 2019-08-12 15:34:59 +0200 |
---|---|---|
committer | Étienne Loks <etienne.loks@iggdrasil.net> | 2019-08-12 15:34:59 +0200 |
commit | 5835ead75454e387cd7eaacac2ce7658719848dc (patch) | |
tree | d84590c3d85d8493cbc2d729789731f7e0ceeeeb /commcrawler/scrapy.py | |
parent | 8377087d7572729f4e227d40c0a3f32fb5b2720a (diff) | |
download | Comm-on-net-5835ead75454e387cd7eaacac2ce7658719848dc.tar.bz2 Comm-on-net-5835ead75454e387cd7eaacac2ce7658719848dc.zip |
Slicing the process
Diffstat (limited to 'commcrawler/scrapy.py')
-rw-r--r-- | commcrawler/scrapy.py | 125 |
1 file changed, 65 insertions, 60 deletions
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index 7702ea6..d01c5c6 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -43,7 +43,6 @@ class DefaultSpider:
     crawl_result = None
     links_reached = set()
     redirect = None
-    is_timeout = False
 
     def start_requests(self):
         q = {
@@ -129,15 +128,13 @@ class DefaultSpider:
                 pk=self.crawl_result.pk)
             result.status = "T"
             result.save()
-            self.is_timeout = True
             raise CloseSpider('timeout')
 
     def parse(self, response):
         result = {
             "url": response.url,
         }
-        if self.is_timeout or self.timeout():
-            raise CloseSpider('timeout')
+        self.timeout()
         for domain in self.excluded_domains:
             if domain in response.url:
                 result["is_online"] = False
@@ -327,67 +324,75 @@ def launch_crawl(crawl_item, excluded_domains=None):
     crawl_item.crawl_ended = None
     crawl_item.ended = None
     crawl_item.progression = 0
-    crawl_item.status = "W"
+    crawl_item.status = "P"
     crawl_item.save()
-    for target in crawl_item.targets.all():
-        crawl_item.progression += 1
-        crawl_item.save()
-        result_dct = {
-            "crawl_id": crawl_item.pk,
-            "target_id": target.pk,
-        }
-        response, verify_ssl = None, True
-        while response is None:
-            try:
-                response = requests.get(target.url, verify=verify_ssl,
-                                        timeout=45)
-            except requests.exceptions.SSLError:
-                if not verify_ssl:  # new error on SSL
-                    response = "Try..."  # scrapy is more permissive
-                else:
-                    update_db_result(result_dct, {"bad_ssl": True})
-                    verify_ssl = False
-                    time.sleep(
-                        settings.SCRAPPY_SETTINGS["DOWNLOAD_DELAY"] - 1
-                        + randint(0, 20)/10)
-            except requests.exceptions.RequestException:
+    q = crawl_item.targets
+    # slice
+    total = q.count()
+    targets = q.values("id")
+    page = 0
+    page_number = total // 50
+    while page >= page_number:
+        idx = 0
+        current_idx = page * 50
+        while current_idx < total and idx < 50:
+            idx += 1
+            current_idx = idx + page * 50
+            target = models.Target.objects.filter(pk=targets['id'])
+            if not target.count():  # target has disappear
+                continue
+            target = target.all()[0]
+            result_dct = {
+                "crawl_id": crawl_item.pk,
+                "target_id": target.pk,
+            }
+            response, verify_ssl = None, True
+            while response is None:
+                try:
+                    response = requests.get(target.url, verify=verify_ssl,
+                                            timeout=45)
+                except requests.exceptions.SSLError:
+                    if not verify_ssl:  # new error on SSL
+                        response = False
+                    else:
+                        update_db_result(result_dct, {"bad_ssl": True})
+                        verify_ssl = False
+                        time.sleep(
+                            settings.SCRAPPY_SETTINGS["DOWNLOAD_DELAY"] - 1
+                            + randint(0, 20) / 10)
+                except requests.exceptions.RequestException:
+                    update_db_result(result_dct, {"is_online": False,
+                                                  "status": "F"})
+                    response = False
+            if response is False:  # scrapy is more permissive - try it
+                pass
+            elif response.status_code == 404:
                 update_db_result(result_dct, {"is_online": False,
                                               "status": "F"})
-                response = False
-        if response is False:
-            continue
-        if response == "Try...":
-            pass
-        elif response.status_code == 404:
-            update_db_result(result_dct, {"is_online": False,
                                          "status": "F"})
-            continue
-        else:
-            url = target.url
-
-        redirect = None
-        if getattr(response, 'history', None):
-            url = response.url
-            redirect = url
-        domain = get_domain(url)
-        if domain in excluded_domains:
-            update_db_result(
-                result_dct, {"is_online": False, "status": "F",
-                             "redirection": redirect})
                 continue
-        process.crawl(
-            create_spider(
-                "Crawl{}Target{}".format(crawl_item.pk, target.pk),
-                [url],
-                crawl_item, target,
-                excluded_domains,
-                redirect
+            else:
+                url = target.url
+
+            redirect = None
+            if getattr(response, 'history', None):
+                url = response.url
+                redirect = url
+            domain = get_domain(url)
+            if domain in excluded_domains:
+                update_db_result(
+                    result_dct, {"is_online": False, "status": "F",
+                                 "redirection": redirect})
+                continue
+            process.crawl(
+                create_spider(
+                    "Crawl{}Target{}".format(crawl_item.pk, target.pk),
+                    [url],
+                    crawl_item, target,
+                    excluded_domains,
+                    redirect
+                )
             )
-        )
-    crawl_item.pre_crawl_ended = timezone.now()
-    crawl_item.status = "P"
-    crawl_item.save()
-    process.start()
+        process.start()
     crawl_item.crawl_ended = timezone.now()
     crawl_item.status = "M"
     crawl_item.save()
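The heart of the change is the new slicing loop in launch_crawl: instead of scheduling every target in a single pass, targets are now fetched in pages of 50 and the crawl run is started once per slice. Below is a minimal sketch of that batching pattern, illustrative only; a plain Python list stands in for the Django queryset, and the names PAGE_SIZE, crawl_in_slices and handle_target are assumptions, not identifiers from the repository.

```python
# Illustrative sketch of the "slicing" idea from this commit (hypothetical
# names; the real code pages a Django queryset of targets and calls
# process.start() once per slice of 50).
PAGE_SIZE = 50

def crawl_in_slices(targets, handle_target):
    """Process `targets` in consecutive slices of PAGE_SIZE items."""
    total = len(targets)
    pages = (total + PAGE_SIZE - 1) // PAGE_SIZE  # number of slices needed
    for page in range(pages):
        batch = targets[page * PAGE_SIZE:(page + 1) * PAGE_SIZE]
        for target in batch:
            handle_target(target)  # pre-crawl checks + spider scheduling
        # the real launch_crawl starts the scheduled spiders here,
        # once per slice, before moving on to the next page

# Example: 120 targets yield three slices of 50, 50 and 20 items.
crawl_in_slices(list(range(120)), lambda target: None)
```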