author     Étienne Loks <etienne.loks@iggdrasil.net>  2019-08-09 19:39:46 +0200
committer  Étienne Loks <etienne.loks@iggdrasil.net>  2019-08-09 19:39:46 +0200
commit     507e0ce240f739a1d4580405ce2d189390c9f68b (patch)
tree       a5a10e8876395dd545433787fc2b036560207c3e /commcrawler
parent     f7fe0a457eb92611731ba93959f3fca0ceb16528 (diff)
Scrap: add a condition on timeout
Diffstat (limited to 'commcrawler')
-rw-r--r--  commcrawler/admin.py  | 10
-rw-r--r--  commcrawler/models.py |  6
-rw-r--r--  commcrawler/scrapy.py | 18
3 files changed, 23 insertions(+), 11 deletions(-)
diff --git a/commcrawler/admin.py b/commcrawler/admin.py
index 6365e65..e475325 100644
--- a/commcrawler/admin.py
+++ b/commcrawler/admin.py
@@ -30,8 +30,8 @@ admin_site.register(models.Crawl, CrawlAdmin)
 class CrawlResultAdmin(admin.ModelAdmin):
     model = models.CrawlResult
     list_display = (
-        "url", "crawl", "started", "duration", "status", "is_online",
-        "bad_ssl", "nb_external_link", "nb_internal_link",
+        "short_name", "open_link", "crawl", "started", "duration", "status",
+        "is_online", "bad_ssl", "nb_external_link", "nb_internal_link",
         "nb_images", "nb_facebook", "nb_twitter", "nb_instagram",
         "nb_youtube", "nb_dailymotion", "nb_vimeo", "nb_video", "nb_audio",
         "nb_internal_pdf", "nb_external_pdf", "nb_internal_office", "nb_external_office"
@@ -49,6 +49,12 @@ class CrawlResultAdmin(admin.ModelAdmin):
     exclude = ("crawl_result",)
     form = make_ajax_form(model, {'target': 'target'})
 
+    def open_link(self, obj):
+        url = obj.url()
+        if not url:
+            return "-"
+        return mark_safe("<a href='{}' target='blank_'>{}</a>".format(url, url))
+
     def crawl_result_prettified(self, instance):
         response = json.dumps(instance.crawl_result, sort_keys=True, indent=2)
         formatter = HtmlFormatter(style='colorful')
diff --git a/commcrawler/models.py b/commcrawler/models.py
index 62d12eb..7a80b3b 100644
--- a/commcrawler/models.py
+++ b/commcrawler/models.py
@@ -141,6 +141,12 @@ class CrawlResult(models.Model):
     def __str__(self):
         return "{} - {}".format(self.crawl, self.target)
 
+    def short_name(self):
+        LEN = 50
+        if len(self.target.name) < LEN:
+            return self.target.name
+        return self.target.name[:LEN] + "..."
+
     def url(self):
         return self.target.url
 
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index 7147949..bdd28c3 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -43,6 +43,7 @@ class DefaultSpider:
     crawl_result = None
     links_reached = set()
     redirect = None
+    is_timeout = False
 
     def start_requests(self):
         q = {
@@ -128,13 +129,14 @@
                 pk=self.crawl_result.pk)
             result.status = "T"
             result.save()
+            self.is_timeout = True
             return True
 
     def parse(self, response):
         result = {
             "url": response.url,
         }
-        if self.timeout():
+        if self.is_timeout or self.timeout():
             return []
         for domain in self.excluded_domains:
             if domain in response.url:
@@ -156,14 +158,12 @@
                     is_internal = True
                     self.links_reached.add(url)
                     is_file = self._parse_internal_files(url, result)
-                    if is_file:
-                        pass
-                    elif not MAX_LINKS or \
-                            len(self.links_reached) < MAX_LINKS:
+                    if not is_file and \
+                            not self.is_timeout and \
+                            not self.timeout() and (
+                                not MAX_LINKS or
+                                len(self.links_reached) < MAX_LINKS):
                         yield response.follow(link.url, self.parse)
-                    else:
-                        print("MAX", self.allowed_domains,
-                              self.links_reached)
                 if not is_internal:
                     current_domain = get_domain(url)
                     if current_domain in FACEBOOK_DOMAINS:
@@ -339,7 +339,7 @@ def launch_crawl(crawl_item, excluded_domains=None):
     while response is None:
         try:
             response = requests.get(target.url, verify=verify_ssl,
-                                    timeout=20)
+                                    timeout=45)
         except requests.exceptions.SSLError:
             if not verify_ssl:  # new error on SSL
                 response = "Try..."  # scrapy is more permissive
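
The crawl-side change boils down to caching the timeout state: timeout() hits the database to mark the CrawlResult with status "T", so once it has fired, later checks should consult a cheap flag instead. Below is a minimal, self-contained sketch of that pattern, not the project's code: CRAWL_TIMEOUT, SpiderSketch, and the outlinks parameter are illustrative stand-ins, and the real timeout() does the database write described above.

    import time

    CRAWL_TIMEOUT = 45  # seconds; mirrors the raised requests.get timeout

    class SpiderSketch:
        def __init__(self):
            self.started = time.time()
            self.is_timeout = False      # cached flag, as added to DefaultSpider
            self.links_reached = set()

        def timeout(self):
            # The expensive check; caching the result into is_timeout lets
            # every later call short-circuit on the flag alone.
            if not self.is_timeout and time.time() - self.started > CRAWL_TIMEOUT:
                self.is_timeout = True
            return self.is_timeout

        def parse(self, url, outlinks):
            # Mirrors the patched parse(): a previously detected timeout
            # (is_timeout) bails out before the costly timeout() call.
            if self.is_timeout or self.timeout():
                return []
            to_follow = []
            for link in outlinks:
                if link in self.links_reached:
                    continue
                self.links_reached.add(link)
                if not self.is_timeout and not self.timeout():
                    to_follow.append(link)  # response.follow() in Scrapy
            return to_follow

The guard appears twice for a reason: many parse() calls can already be queued when the deadline passes, so both the entry point and the link-following loop re-check the cached flag before scheduling more requests.

On the admin side, the patch relies on Django's convention that list_display entries may name callables (short_name on the model, open_link on the ModelAdmin). A hedged sketch, assuming mark_safe is imported from django.utils.safestring, since the hunk does not show the import block:

    from django.contrib import admin
    from django.utils.safestring import mark_safe

    class CrawlResultAdminSketch(admin.ModelAdmin):
        # "short_name" resolves to a model method, "open_link" to the
        # ModelAdmin method below; both render as list columns.
        list_display = ("short_name", "open_link")

        def open_link(self, obj):
            url = obj.url()
            if not url:
                return "-"
            # target='blank_' follows the patch verbatim; the conventional
            # HTML value is '_blank'.
            return mark_safe("<a href='{}' target='blank_'>{}</a>".format(url, url))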