-rw-r--r--  commcrawler/admin.py   | 10
-rw-r--r--  commcrawler/models.py  |  1
-rw-r--r--  commcrawler/scrapy.py  | 39
-rw-r--r--  commonnet/settings.py  |  2
4 files changed, 41 insertions(+), 11 deletions(-)
diff --git a/commcrawler/admin.py b/commcrawler/admin.py
index 857962c..a89c871 100644
--- a/commcrawler/admin.py
+++ b/commcrawler/admin.py
@@ -30,8 +30,14 @@ class CrawlResultAdmin(admin.ModelAdmin):
"is_online")
list_filter = ("status", "crawl")
search_fields = ("target__name",)
- readonly_fields = ("started", "duration", "status",
- "crawl_result_prettified")
+
+ readonly_fields = (
+ "started", "duration", "status", "nb_external_link", "nb_internal_link",
+ "nb_images", "nb_facebook", "nb_twitter", "nb_instagram", "nb_youtube",
+ "nb_dailymotion", "nb_vimeo", "nb_video", "nb_audio", "nb_internal_pdf",
+ "nb_external_pdf", "nb_internal_office", "nb_external_office",
+ "is_online", "redirection",
+ )
exclude = ("crawl_result",)
form = make_ajax_form(model, {'target': 'target'})
diff --git a/commcrawler/models.py b/commcrawler/models.py
index 9a98b89..be371ca 100644
--- a/commcrawler/models.py
+++ b/commcrawler/models.py
@@ -54,6 +54,7 @@ class Crawl(models.Model):
class CrawlResult(models.Model):
STATUS = (
('P', _("In progress")),
+ ('T', _("Time out")),
('F', _("Finished"))
)
crawl = models.ForeignKey(Crawl, verbose_name=_("Crawl"))
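
The new "T" ("Time out") status sits alongside "P" (in progress) and "F" (finished), so timed-out targets can be queried on their own. A minimal usage sketch, assuming a Crawl instance named crawl (hypothetical, not part of this commit):

    from commcrawler import models
    # results whose spider was stopped by the new timeout check
    timed_out = models.CrawlResult.objects.filter(crawl=crawl, status="T")
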
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index b0c4fe4..47a0521 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -1,3 +1,4 @@
+import datetime
import tldextract
import scrapy
@@ -33,6 +34,7 @@ CrawlLink
"""
MAX_LINKS = 500
+TIMEOUT = datetime.timedelta(minutes=settings.CRAWL_TIMEOUT)
class DefaultSpider:
@@ -42,17 +44,18 @@ class DefaultSpider:
excluded_domains = []
crawl_id = None
target_id = None
+ crawl_result = None
links_reached = set()
def start_requests(self):
q = {
"crawl_id": self.crawl_id,
"target_id": self.target_id,
- "status": "F"
+ "status__in": ["F", "T"],
}
if models.CrawlResult.objects.filter(**q).count():
return []
- q.pop("status")
+ q.pop("status__in")
if models.CrawlResult.objects.filter(**q).count():
# delete a previous interrupted attempt
res = models.CrawlResult.objects.get(**q)
@@ -74,10 +77,31 @@ class DefaultSpider:
continue
result["images"].append(src)
+ def timeout(self):
+ if not self.crawl_result:
+ q = {
+ "crawl_id": self.crawl_id,
+ "target_id": self.target_id,
+ }
+ if not models.CrawlResult.objects.filter(**q).count():
+ return
+ self.crawl_result = models.CrawlResult.objects.get(**q)
+ duration = timezone.now() - self.crawl_result.started
+ if duration < TIMEOUT:
+ return
+ with transaction.atomic():
+ result = models.CrawlResult.objects.select_for_update().get(
+ pk=self.crawl_result.pk)
+ result.status = "T"
+ result.save()
+ return True
+
def parse(self, response):
result = {
"url": response.url,
}
+ if self.timeout():
+ return []
for domain in self.excluded_domains:
if domain in response.url:
result["is_online"] = False
@@ -104,10 +128,6 @@ class DefaultSpider:
yield result
def closed(self, reason):
- result = {
- "crawl_id": self.crawl_id,
- "target_id": self.target_id,
- }
DbPipeline().close(self)
@@ -185,9 +205,10 @@ class DbPipeline:
with transaction.atomic():
result = models.CrawlResult.objects.select_for_update().get(
pk=result_pk)
- result.status = "F"
- result.duration = timezone.now() - result.started
- result.save()
+ if result.status == "P":
+ result.status = "F"
+ result.duration = (timezone.now() - result.started)
+ result.save()
def get_domain(url):
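
The new timeout check compares the elapsed time since CrawlResult.started against TIMEOUT and, once exceeded, marks the result "T" so parse() stops yielding further requests. A minimal sketch of that comparison outside Django (plain datetime in place of django.utils.timezone; names are illustrative, not from the commit):

    import datetime

    CRAWL_TIMEOUT = 30  # minutes, mirroring settings.CRAWL_TIMEOUT
    TIMEOUT = datetime.timedelta(minutes=CRAWL_TIMEOUT)

    def has_timed_out(started, now=None):
        # True once a crawl started at `started` has run for TIMEOUT or longer
        now = now or datetime.datetime.now()
        return now - started >= TIMEOUT

    # a crawl started 45 minutes ago is past the 30-minute budget
    print(has_timed_out(datetime.datetime.now() - datetime.timedelta(minutes=45)))  # True

Note that DbPipeline.close() now only promotes "P" to "F", so a "T" status set by the spider is left in place.
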
diff --git a/commonnet/settings.py b/commonnet/settings.py
index 40e3335..d2048b2 100644
--- a/commonnet/settings.py
+++ b/commonnet/settings.py
@@ -121,6 +121,8 @@ DATA_UPLOAD_MAX_NUMBER_FIELDS = 5000
STATIC_URL = '/static/'
+CRAWL_TIMEOUT = 30 # timeout for each website crawl in minutes
+
try:
from .local_settings import *
except ImportError:
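
Since CRAWL_TIMEOUT is defined before the local_settings import, a deployment can override it there. A hedged sketch of such an override (local_settings.py itself is not part of this commit):

    # local_settings.py (hypothetical override)
    CRAWL_TIMEOUT = 60  # allow up to an hour per website crawl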