From 507e0ce240f739a1d4580405ce2d189390c9f68b Mon Sep 17 00:00:00 2001
From: Étienne Loks
Date: Fri, 9 Aug 2019 19:39:46 +0200
Subject: Scrap: add a condition on timeout

---
 commcrawler/admin.py      | 10 ++++++++--
 commcrawler/models.py     |  6 ++++++
 commcrawler/scrapy.py     | 18 +++++++++---------
 fixtures/commcrawler.json |  6 ++++++
 4 files changed, 29 insertions(+), 11 deletions(-)

diff --git a/commcrawler/admin.py b/commcrawler/admin.py
index 6365e65..e475325 100644
--- a/commcrawler/admin.py
+++ b/commcrawler/admin.py
@@ -30,8 +30,8 @@ admin_site.register(models.Crawl, CrawlAdmin)
 class CrawlResultAdmin(admin.ModelAdmin):
     model = models.CrawlResult
     list_display = (
-        "url", "crawl", "started", "duration", "status", "is_online",
-        "bad_ssl", "nb_external_link", "nb_internal_link",
+        "short_name", "open_link", "crawl", "started", "duration", "status",
+        "is_online", "bad_ssl", "nb_external_link", "nb_internal_link",
         "nb_images", "nb_facebook", "nb_twitter", "nb_instagram",
         "nb_youtube", "nb_dailymotion", "nb_vimeo", "nb_video", "nb_audio",
         "nb_internal_pdf", "nb_external_pdf", "nb_internal_office", "nb_external_office"
@@ -49,6 +49,12 @@ class CrawlResultAdmin(admin.ModelAdmin):
     exclude = ("crawl_result",)
     form = make_ajax_form(model, {'target': 'target'})
+    def open_link(self, obj):
+        url = obj.url()
+        if not url:
+            return "-"
+        return mark_safe('<a href="{}">{}</a>'.format(url, url))
+
     def crawl_result_prettified(self, instance):
         response = json.dumps(instance.crawl_result, sort_keys=True,
                               indent=2)
         formatter = HtmlFormatter(style='colorful')
diff --git a/commcrawler/models.py b/commcrawler/models.py
index 62d12eb..7a80b3b 100644
--- a/commcrawler/models.py
+++ b/commcrawler/models.py
@@ -141,6 +141,12 @@ class CrawlResult(models.Model):
     def __str__(self):
         return "{} - {}".format(self.crawl, self.target)
 
+    def short_name(self):
+        LEN = 50
+        if len(self.target.name) < LEN:
+            return self.target.name
+        return self.target.name[:LEN] + "..."
+
     def url(self):
         return self.target.url
 
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index 7147949..bdd28c3 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -43,6 +43,7 @@ class DefaultSpider:
     crawl_result = None
     links_reached = set()
     redirect = None
+    is_timeout = False
 
     def start_requests(self):
         q = {
@@ -128,13 +129,14 @@ class DefaultSpider:
                 pk=self.crawl_result.pk)
             result.status = "T"
             result.save()
+            self.is_timeout = True
             return True
 
     def parse(self, response):
         result = {
             "url": response.url,
         }
-        if self.timeout():
+        if self.is_timeout or self.timeout():
             return []
         for domain in self.excluded_domains:
             if domain in response.url:
@@ -156,14 +158,12 @@ class DefaultSpider:
                     is_internal = True
                     self.links_reached.add(url)
                     is_file = self._parse_internal_files(url, result)
-                    if is_file:
-                        pass
-                    elif not MAX_LINKS or \
-                            len(self.links_reached) < MAX_LINKS:
+                    if not is_file and \
+                            not self.is_timeout and \
+                            not self.timeout() and (
+                                not MAX_LINKS or
+                                len(self.links_reached) < MAX_LINKS):
                         yield response.follow(link.url, self.parse)
-                    else:
-                        print("MAX", self.allowed_domains,
-                              self.links_reached)
             if not is_internal:
                 current_domain = get_domain(url)
                 if current_domain in FACEBOOK_DOMAINS:
@@ -339,7 +339,7 @@ def launch_crawl(crawl_item, excluded_domains=None):
     while response is None:
         try:
             response = requests.get(target.url, verify=verify_ssl,
-                                    timeout=20)
+                                    timeout=45)
         except requests.exceptions.SSLError:
             if not verify_ssl:  # new error on SSL
                 response = "Try..."  # scrapy is more permissive
diff --git a/fixtures/commcrawler.json b/fixtures/commcrawler.json
index 8ec419b..143e8b5 100644
--- a/fixtures/commcrawler.json
+++ b/fixtures/commcrawler.json
@@ -4,5 +4,11 @@
   "fields": {
     "domain": "hugedomains.com"
   }
+},
+{
+  "model": "commcrawler.exludeddomains",
+  "fields": {
+    "domain": "pp.auto.ke.orange.fr"
+  }
 }
 ]
-- 
cgit v1.2.3
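
Taken together, the scrapy.py hunks cache the timeout verdict: once a crawl
exceeds its budget, timeout() records status "T" on the CrawlResult, parse()
starts discarding responses via the is_timeout flag, and the link-follow
condition stops scheduling new requests. A minimal sketch of that guard
pattern follows; CRAWL_MAX_SECONDS and should_follow() are hypothetical
stand-ins, while timeout(), is_timeout, MAX_LINKS and links_reached mirror
names used in the patch.

    # Sketch only, not part of the commit above.
    import time

    MAX_LINKS = 2000          # assumption: a falsy value disables the cap
    CRAWL_MAX_SECONDS = 3600  # hypothetical per-crawl time budget


    class SpiderSketch:
        def __init__(self):
            self.started = time.time()
            self.is_timeout = False  # cached verdict, checked before the clock
            self.links_reached = set()

        def timeout(self):
            # the first trip flips the flag; the real spider also saves
            # status "T" on the CrawlResult row at this point
            if time.time() - self.started > CRAWL_MAX_SECONDS:
                self.is_timeout = True
                return True
            return False

        def should_follow(self, is_file):
            # follow only pages (not files), only while the crawl has not
            # timed out, and only below the link cap when one is set
            return (not is_file
                    and not self.is_timeout
                    and not self.timeout()
                    and (not MAX_LINKS
                         or len(self.links_reached) < MAX_LINKS))

Testing the cached flag before calling timeout() means an already-timed-out
crawl never re-reads the clock or re-saves the database row, which is why
parse() now checks self.is_timeout or self.timeout().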