author    | Étienne Loks <etienne.loks@iggdrasil.net> | 2019-08-07 22:30:21 +0200
committer | Étienne Loks <etienne.loks@iggdrasil.net> | 2019-08-07 22:30:21 +0200
commit    | 37036fde028b0c1c5f9db06b71bbd4bc9a287e51 (patch)
tree      | 986ec9a9d8da3571f6f5685ef53d6510d4964a91
parent    | d737e04553f464966f54739ba37f9f06dab44586 (diff)
download  | Comm-on-net-37036fde028b0c1c5f9db06b71bbd4bc9a287e51.tar.bz2
          | Comm-on-net-37036fde028b0c1c5f9db06b71bbd4bc9a287e51.zip
Manage timeout
-rw-r--r-- | commcrawler/admin.py  | 10
-rw-r--r-- | commcrawler/models.py |  1
-rw-r--r-- | commcrawler/scrapy.py | 39
-rw-r--r-- | commonnet/settings.py |  2
4 files changed, 41 insertions, 11 deletions
diff --git a/commcrawler/admin.py b/commcrawler/admin.py
index 857962c..a89c871 100644
--- a/commcrawler/admin.py
+++ b/commcrawler/admin.py
@@ -30,8 +30,14 @@ class CrawlResultAdmin(admin.ModelAdmin):
         "is_online")
     list_filter = ("status", "crawl")
     search_fields = ("target__name",)
-    readonly_fields = ("started", "duration", "status",
-                       "crawl_result_prettified")
+
+    readonly_fields = (
+        "started", "duration", "status", "nb_external_link", "nb_internal_link",
+        "nb_images", "nb_facebook", "nb_twitter", "nb_instagram", "nb_youtube",
+        "nb_dailymotion", "nb_vimeo", "nb_video", "nb_audio", "nb_internal_pdf",
+        "nb_external_pdf", "nb_internal_office", "nb_external_office",
+        "is_online", "redirection",
+    )
     exclude = ("crawl_result",)
     form = make_ajax_form(model, {'target': 'target'})
diff --git a/commcrawler/models.py b/commcrawler/models.py
index 9a98b89..be371ca 100644
--- a/commcrawler/models.py
+++ b/commcrawler/models.py
@@ -54,6 +54,7 @@ class Crawl(models.Model):
 class CrawlResult(models.Model):
     STATUS = (
         ('P', _("In progress")),
+        ('T', _("Time out")),
         ('F', _("Finished"))
     )
     crawl = models.ForeignKey(Crawl, verbose_name=_("Crawl"))
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index b0c4fe4..47a0521 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -1,3 +1,4 @@
+import datetime
 import tldextract
 
 import scrapy
@@ -33,6 +34,7 @@ CrawlLink
 """
 
 MAX_LINKS = 500
+TIMEOUT = datetime.timedelta(minutes=settings.CRAWL_TIMEOUT)
 
 
 class DefaultSpider:
@@ -42,17 +44,18 @@ class DefaultSpider:
     excluded_domains = []
     crawl_id = None
     target_id = None
+    crawl_result = None
     links_reached = set()
 
     def start_requests(self):
         q = {
             "crawl_id": self.crawl_id,
             "target_id": self.target_id,
-            "status": "F"
+            "status__in": ["F", "T"],
         }
         if models.CrawlResult.objects.filter(**q).count():
             return []
-        q.pop("status")
+        q.pop("status__in")
         if models.CrawlResult.objects.filter(**q).count():
             # delete a previous interrupted attempt
             res = models.CrawlResult.objects.get(**q)
@@ -74,10 +77,31 @@
                 continue
             result["images"].append(src)
 
+    def timeout(self):
+        if not self.crawl_result:
+            q = {
+                "crawl_id": self.crawl_id,
+                "target_id": self.target_id,
+            }
+            if not models.CrawlResult.objects.filter(**q).count():
+                return
+            self.crawl_result = models.CrawlResult.objects.get(**q)
+        duration = timezone.now() - self.crawl_result.started
+        if duration < TIMEOUT:
+            return
+        with transaction.atomic():
+            result = models.CrawlResult.objects.select_for_update().get(
+                pk=self.crawl_result.pk)
+            result.status = "T"
+            result.save()
+        return True
+
     def parse(self, response):
         result = {
             "url": response.url,
         }
+        if self.timeout():
+            return []
         for domain in self.excluded_domains:
             if domain in response.url:
                 result["is_online"] = False
@@ -104,10 +128,6 @@
         yield result
 
     def closed(self, reason):
-        result = {
-            "crawl_id": self.crawl_id,
-            "target_id": self.target_id,
-        }
         DbPipeline().close(self)
@@ -185,9 +205,10 @@
         with transaction.atomic():
             result = models.CrawlResult.objects.select_for_update().get(
                 pk=result_pk)
-            result.status = "F"
-            result.duration = timezone.now() - result.started
-            result.save()
+            if result.status == "P":
+                result.status = "F"
+                result.duration = (timezone.now() - result.started)
+                result.save()
 
 
 def get_domain(url):
diff --git a/commonnet/settings.py b/commonnet/settings.py
index 40e3335..d2048b2 100644
--- a/commonnet/settings.py
+++ b/commonnet/settings.py
@@ -121,6 +121,8 @@
 DATA_UPLOAD_MAX_NUMBER_FIELDS = 5000
 
 STATIC_URL = '/static/'
 
+CRAWL_TIMEOUT = 30  # timeout for each website crawl in minutes
+
 try:
     from .local_settings import *
 except ImportError:
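
Note on the mechanism introduced by this commit: the timeout check compares the elapsed time since CrawlResult.started against a datetime.timedelta built from the new CRAWL_TIMEOUT setting. A minimal, self-contained sketch of that check, outside Django and using plain datetime instead of django.utils.timezone, with a hypothetical has_timed_out() helper and started timestamp:

    import datetime

    CRAWL_TIMEOUT = 30  # minutes, mirrors the new setting in commonnet/settings.py
    TIMEOUT = datetime.timedelta(minutes=CRAWL_TIMEOUT)


    def has_timed_out(started, now=None):
        """Return True when a crawl that began at `started` has exceeded TIMEOUT.

        `started` plays the role of CrawlResult.started; the real spider uses
        django.utils.timezone.now() and marks the result with status "T".
        """
        now = now or datetime.datetime.now()
        return (now - started) >= TIMEOUT


    # Example: a crawl started 31 minutes ago is flagged as timed out.
    started = datetime.datetime.now() - datetime.timedelta(minutes=31)
    print(has_timed_out(started))  # True

In the spider itself, timeout() additionally flips the CrawlResult row to the new "T" status inside a select_for_update() transaction, parse() stops yielding results once it returns True, start_requests() skips targets whose result is already "F" or "T", and DbPipeline.close() only overwrites the status with "F" when it is still "P".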