author     Étienne Loks <etienne.loks@iggdrasil.net>  2019-08-09 19:39:46 +0200
committer  Étienne Loks <etienne.loks@iggdrasil.net>  2019-08-09 19:39:46 +0200
commit     507e0ce240f739a1d4580405ce2d189390c9f68b (patch)
tree       a5a10e8876395dd545433787fc2b036560207c3e /commcrawler
parent     f7fe0a457eb92611731ba93959f3fca0ceb16528 (diff)
download   Comm-on-net-507e0ce240f739a1d4580405ce2d189390c9f68b.tar.bz2
           Comm-on-net-507e0ce240f739a1d4580405ce2d189390c9f68b.zip
Scrap: add a condition on timeout
Diffstat (limited to 'commcrawler')
-rw-r--r--  commcrawler/admin.py    10
-rw-r--r--  commcrawler/models.py    6
-rw-r--r--  commcrawler/scrapy.py   18
3 files changed, 23 insertions, 11 deletions
diff --git a/commcrawler/admin.py b/commcrawler/admin.py
index 6365e65..e475325 100644
--- a/commcrawler/admin.py
+++ b/commcrawler/admin.py
@@ -30,8 +30,8 @@ admin_site.register(models.Crawl, CrawlAdmin)
 class CrawlResultAdmin(admin.ModelAdmin):
     model = models.CrawlResult
     list_display = (
-        "url", "crawl", "started", "duration", "status", "is_online",
-        "bad_ssl", "nb_external_link", "nb_internal_link",
+        "short_name", "open_link", "crawl", "started", "duration", "status",
+        "is_online", "bad_ssl", "nb_external_link", "nb_internal_link",
         "nb_images", "nb_facebook", "nb_twitter", "nb_instagram", "nb_youtube",
         "nb_dailymotion", "nb_vimeo", "nb_video", "nb_audio", "nb_internal_pdf",
         "nb_external_pdf", "nb_internal_office", "nb_external_office"
@@ -49,6 +49,12 @@ class CrawlResultAdmin(admin.ModelAdmin):
     exclude = ("crawl_result",)
     form = make_ajax_form(model, {'target': 'target'})
 
+    def open_link(self, obj):
+        url = obj.url()
+        if not url:
+            return "-"
+        return mark_safe("<a href='{}' target='blank_'>{}</a>".format(url, url))
+
     def crawl_result_prettified(self, instance):
         response = json.dumps(instance.crawl_result, sort_keys=True, indent=2)
         formatter = HtmlFormatter(style='colorful')
diff --git a/commcrawler/models.py b/commcrawler/models.py
index 62d12eb..7a80b3b 100644
--- a/commcrawler/models.py
+++ b/commcrawler/models.py
@@ -141,6 +141,12 @@ class CrawlResult(models.Model):
     def __str__(self):
         return "{} - {}".format(self.crawl, self.target)
 
+    def short_name(self):
+        LEN = 50
+        if len(self.target.name) < LEN:
+            return self.target.name
+        return self.target.name[:LEN] + "..."
+
     def url(self):
         return self.target.url
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index 7147949..bdd28c3 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -43,6 +43,7 @@ class DefaultSpider:
     crawl_result = None
     links_reached = set()
     redirect = None
+    is_timeout = False
 
     def start_requests(self):
         q = {
@@ -128,13 +129,14 @@ class DefaultSpider:
                 pk=self.crawl_result.pk)
             result.status = "T"
             result.save()
+            self.is_timeout = True
             return True
 
     def parse(self, response):
         result = {
             "url": response.url,
         }
-        if self.timeout():
+        if self.is_timeout or self.timeout():
             return []
         for domain in self.excluded_domains:
             if domain in response.url:
@@ -156,14 +158,12 @@ class DefaultSpider:
                     is_internal = True
                     self.links_reached.add(url)
                     is_file = self._parse_internal_files(url, result)
-                    if is_file:
-                        pass
-                    elif not MAX_LINKS or \
-                            len(self.links_reached) < MAX_LINKS:
+                    if not is_file and \
+                            not self.is_timeout and \
+                            not self.timeout() and (
+                                not MAX_LINKS or
+                                len(self.links_reached) < MAX_LINKS):
                         yield response.follow(link.url, self.parse)
-                    else:
-                        print("MAX", self.allowed_domains,
-                              self.links_reached)
             if not is_internal:
                 current_domain = get_domain(url)
                 if current_domain in FACEBOOK_DOMAINS:
@@ -339,7 +339,7 @@ def launch_crawl(crawl_item, excluded_domains=None):
         while response is None:
             try:
                 response = requests.get(target.url, verify=verify_ssl,
-                                        timeout=20)
+                                        timeout=45)
             except requests.exceptions.SSLError:
                 if not verify_ssl:  # new error on SSL
                     response = "Try..."  # scrapy is more permissive