author    Étienne Loks <etienne.loks@iggdrasil.net>    2019-08-07 22:30:21 +0200
committer Étienne Loks <etienne.loks@iggdrasil.net>    2019-08-07 22:30:21 +0200
commit    37036fde028b0c1c5f9db06b71bbd4bc9a287e51 (patch)
tree      986ec9a9d8da3571f6f5685ef53d6510d4964a91 /commcrawler/scrapy.py
parent    d737e04553f464966f54739ba37f9f06dab44586 (diff)
Manage timeout
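
This commit adds timeout handling to the crawler: a TIMEOUT delta is built from settings.CRAWL_TIMEOUT (expressed in minutes), parse() stops early once the crawl has run longer than that and marks the CrawlResult with status "T", start_requests() now treats "T" like "F" when deciding whether a target was already crawled, and the pipeline's close step only marks a result "F" while it is still "P". A minimal sketch of the same elapsed-time check outside Django/Scrapy (the CRAWL_TIMEOUT value below is an assumed example; in the project it comes from settings.CRAWL_TIMEOUT):

    import datetime

    CRAWL_TIMEOUT = 30  # assumed example value, in minutes
    TIMEOUT = datetime.timedelta(minutes=CRAWL_TIMEOUT)

    def has_timed_out(started, now=None):
        # True once the crawl has run for at least TIMEOUT, mirroring the
        # "if duration < TIMEOUT: return" guard in DefaultSpider.timeout()
        if now is None:
            now = datetime.datetime.now(datetime.timezone.utc)
        return now - started >= TIMEOUT
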
Diffstat (limited to 'commcrawler/scrapy.py')
-rw-r--r--  commcrawler/scrapy.py | 39
1 file changed, 30 insertions(+), 9 deletions(-)
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index b0c4fe4..47a0521 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -1,3 +1,4 @@
+import datetime
import tldextract
import scrapy
@@ -33,6 +34,7 @@ CrawlLink
"""
MAX_LINKS = 500
+TIMEOUT = datetime.timedelta(minutes=settings.CRAWL_TIMEOUT)
class DefaultSpider:
@@ -42,17 +44,18 @@ class DefaultSpider:
excluded_domains = []
crawl_id = None
target_id = None
+ crawl_result = None
links_reached = set()
def start_requests(self):
q = {
"crawl_id": self.crawl_id,
"target_id": self.target_id,
- "status": "F"
+ "status__in": ["F", "T"],
}
if models.CrawlResult.objects.filter(**q).count():
return []
- q.pop("status")
+ q.pop("status__in")
if models.CrawlResult.objects.filter(**q).count():
# delete a previous interrupted attempt
res = models.CrawlResult.objects.get(**q)
@@ -74,10 +77,31 @@ class DefaultSpider:
continue
result["images"].append(src)
+ def timeout(self):
+ if not self.crawl_result:
+ q = {
+ "crawl_id": self.crawl_id,
+ "target_id": self.target_id,
+ }
+ if not models.CrawlResult.objects.filter(**q).count():
+ return
+ self.crawl_result = models.CrawlResult.objects.get(**q)
+ duration = timezone.now() - self.crawl_result.started
+ if duration < TIMEOUT:
+ return
+ with transaction.atomic():
+ result = models.CrawlResult.objects.select_for_update().get(
+ pk=self.crawl_result.pk)
+ result.status = "T"
+ result.save()
+ return True
+
def parse(self, response):
result = {
"url": response.url,
}
+ if self.timeout():
+ return []
for domain in self.excluded_domains:
if domain in response.url:
result["is_online"] = False
@@ -104,10 +128,6 @@ class DefaultSpider:
yield result
def closed(self, reason):
- result = {
- "crawl_id": self.crawl_id,
- "target_id": self.target_id,
- }
DbPipeline().close(self)
@@ -185,9 +205,10 @@ class DbPipeline:
with transaction.atomic():
result = models.CrawlResult.objects.select_for_update().get(
pk=result_pk)
- result.status = "F"
- result.duration = timezone.now() - result.started
- result.save()
+ if result.status == "P":
+ result.status = "F"
+ result.duration = (timezone.now() - result.started)
+ result.save()
def get_domain(url):
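
Note on the status codes (meanings inferred from the diff, so treat them as assumptions): "P" appears to mean a crawl still in progress, "F" finished and "T" timed out. The guarded save in DbPipeline means a timeout recorded by the spider is not overwritten when the spider closes; a tiny sketch of that transition logic under those assumptions:

    def on_timeout(status):
        # DefaultSpider.timeout() marks a running crawl as timed out
        return "T"

    def on_close(status):
        # DbPipeline.close() only promotes a still-running crawl to finished,
        # so an earlier "T" is preserved
        return "F" if status == "P" else status

    assert on_close(on_timeout("P")) == "T"
    assert on_close("P") == "F"
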