author     Étienne Loks <etienne.loks@iggdrasil.net>  2019-08-09 19:39:46 +0200
committer  Étienne Loks <etienne.loks@iggdrasil.net>  2019-08-09 19:39:46 +0200
commit     507e0ce240f739a1d4580405ce2d189390c9f68b (patch)
tree       a5a10e8876395dd545433787fc2b036560207c3e /commcrawler
parent     f7fe0a457eb92611731ba93959f3fca0ceb16528 (diff)
download   Comm-on-net-507e0ce240f739a1d4580405ce2d189390c9f68b.tar.bz2
           Comm-on-net-507e0ce240f739a1d4580405ce2d189390c9f68b.zip
Scrap: add a condition on timeout
Diffstat (limited to 'commcrawler')
-rw-r--r--  commcrawler/admin.py    10
-rw-r--r--  commcrawler/models.py    6
-rw-r--r--  commcrawler/scrapy.py   18
3 files changed, 23 insertions, 11 deletions
diff --git a/commcrawler/admin.py b/commcrawler/admin.py
index 6365e65..e475325 100644
--- a/commcrawler/admin.py
+++ b/commcrawler/admin.py
@@ -30,8 +30,8 @@ admin_site.register(models.Crawl, CrawlAdmin)
 class CrawlResultAdmin(admin.ModelAdmin):
     model = models.CrawlResult
     list_display = (
-        "url", "crawl", "started", "duration", "status", "is_online",
-        "bad_ssl", "nb_external_link", "nb_internal_link",
+        "short_name", "open_link", "crawl", "started", "duration", "status",
+        "is_online", "bad_ssl", "nb_external_link", "nb_internal_link",
         "nb_images", "nb_facebook", "nb_twitter", "nb_instagram", "nb_youtube",
         "nb_dailymotion", "nb_vimeo", "nb_video", "nb_audio", "nb_internal_pdf",
         "nb_external_pdf", "nb_internal_office", "nb_external_office"
@@ -49,6 +49,12 @@ class CrawlResultAdmin(admin.ModelAdmin):
     exclude = ("crawl_result",)
     form = make_ajax_form(model, {'target': 'target'})
 
+    def open_link(self, obj):
+        url = obj.url()
+        if not url:
+            return "-"
+        return mark_safe("<a href='{}' target='blank_'>{}</a>".format(url, url))
+
     def crawl_result_prettified(self, instance):
         response = json.dumps(instance.crawl_result, sort_keys=True, indent=2)
         formatter = HtmlFormatter(style='colorful')
diff --git a/commcrawler/models.py b/commcrawler/models.py
index 62d12eb..7a80b3b 100644
--- a/commcrawler/models.py
+++ b/commcrawler/models.py
@@ -141,6 +141,12 @@ class CrawlResult(models.Model):
     def __str__(self):
         return "{} - {}".format(self.crawl, self.target)
 
+    def short_name(self):
+        LEN = 50
+        if len(self.target.name) < LEN:
+            return self.target.name
+        return self.target.name[:LEN] + "..."
+
     def url(self):
         return self.target.url
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index 7147949..bdd28c3 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -43,6 +43,7 @@ class DefaultSpider:
     crawl_result = None
     links_reached = set()
     redirect = None
+    is_timeout = False
 
     def start_requests(self):
         q = {
@@ -128,13 +129,14 @@ class DefaultSpider:
                 pk=self.crawl_result.pk)
             result.status = "T"
             result.save()
+            self.is_timeout = True
             return True
 
     def parse(self, response):
         result = {
             "url": response.url,
         }
-        if self.timeout():
+        if self.is_timeout or self.timeout():
             return []
         for domain in self.excluded_domains:
             if domain in response.url:
@@ -156,14 +158,12 @@ class DefaultSpider:
                     is_internal = True
                     self.links_reached.add(url)
                     is_file = self._parse_internal_files(url, result)
-                    if is_file:
-                        pass
-                    elif not MAX_LINKS or \
-                            len(self.links_reached) < MAX_LINKS:
+                    if not is_file and \
+                            not self.is_timeout and \
+                            not self.timeout() and (
+                                not MAX_LINKS or
+                                len(self.links_reached) < MAX_LINKS):
                         yield response.follow(link.url, self.parse)
-                    else:
-                        print("MAX", self.allowed_domains,
-                              self.links_reached)
             if not is_internal:
                 current_domain = get_domain(url)
                 if current_domain in FACEBOOK_DOMAINS:
@@ -339,7 +339,7 @@ def launch_crawl(crawl_item, excluded_domains=None):
         while response is None:
             try:
                 response = requests.get(target.url, verify=verify_ssl,
-                                        timeout=20)
+                                        timeout=45)
             except requests.exceptions.SSLError:
                 if not verify_ssl:  # new error on SSL
                     response = "Try..."  # scrapy is more permissive