diff options
Diffstat (limited to 'commcrawler/scrapy.py')
-rw-r--r-- | commcrawler/scrapy.py | 39 |
1 files changed, 30 insertions, 9 deletions
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index b0c4fe4..47a0521 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -1,3 +1,4 @@
+import datetime
 import tldextract
 
 import scrapy
@@ -33,6 +34,7 @@ CrawlLink
 """
 
 MAX_LINKS = 500
+TIMEOUT = datetime.timedelta(minutes=settings.CRAWL_TIMEOUT)
 
 
 class DefaultSpider:
@@ -42,17 +44,18 @@ class DefaultSpider:
     excluded_domains = []
     crawl_id = None
     target_id = None
+    crawl_result = None
     links_reached = set()
 
     def start_requests(self):
         q = {
             "crawl_id": self.crawl_id,
             "target_id": self.target_id,
-            "status": "F"
+            "status__in": ["F", "T"],
         }
         if models.CrawlResult.objects.filter(**q).count():
             return []
-        q.pop("status")
+        q.pop("status__in")
         if models.CrawlResult.objects.filter(**q).count():
             # delete a previous interrupted attempt
             res = models.CrawlResult.objects.get(**q)
@@ -74,10 +77,31 @@ class DefaultSpider:
                 continue
             result["images"].append(src)
 
+    def timeout(self):
+        if not self.crawl_result:
+            q = {
+                "crawl_id": self.crawl_id,
+                "target_id": self.target_id,
+            }
+            if not models.CrawlResult.objects.filter(**q).count():
+                return
+            self.crawl_result = models.CrawlResult.objects.get(**q)
+        duration = timezone.now() - self.crawl_result.started
+        if duration < TIMEOUT:
+            return
+        with transaction.atomic():
+            result = models.CrawlResult.objects.select_for_update().get(
+                pk=self.crawl_result.pk)
+            result.status = "T"
+            result.save()
+        return True
+
     def parse(self, response):
         result = {
             "url": response.url,
         }
+        if self.timeout():
+            return []
         for domain in self.excluded_domains:
             if domain in response.url:
                 result["is_online"] = False
@@ -104,10 +128,6 @@ class DefaultSpider:
         yield result
 
     def closed(self, reason):
-        result = {
-            "crawl_id": self.crawl_id,
-            "target_id": self.target_id,
-        }
         DbPipeline().close(self)
 
 
@@ -185,9 +205,10 @@ class DbPipeline:
         with transaction.atomic():
             result = models.CrawlResult.objects.select_for_update().get(
                 pk=result_pk)
-            result.status = "F"
-            result.duration = timezone.now() - result.started
-            result.save()
+            if result.status == "P":
+                result.status = "F"
+                result.duration = (timezone.now() - result.started)
+                result.save()
 
 
 def get_domain(url):