author     Étienne Loks <etienne.loks@iggdrasil.net>  2019-08-12 15:34:59 +0200
committer  Étienne Loks <etienne.loks@iggdrasil.net>  2019-08-12 15:34:59 +0200
commit     5835ead75454e387cd7eaacac2ce7658719848dc (patch)
tree       d84590c3d85d8493cbc2d729789731f7e0ceeeeb
parent     8377087d7572729f4e227d40c0a3f32fb5b2720a (diff)
download   Comm-on-net-5835ead75454e387cd7eaacac2ce7658719848dc.tar.bz2
           Comm-on-net-5835ead75454e387cd7eaacac2ce7658719848dc.zip
Slicing the process
-rw-r--r--  commcrawler/scrapy.py  125
1 file changed, 65 insertions(+), 60 deletions(-)
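
The commit below splits the crawl launch into fixed-size pages of targets: instead of scheduling every spider and then calling process.start() once at the end, each page of roughly 50 targets is pre-checked and crawled in its own pass. A minimal, self-contained sketch of that slicing idea (PAGE_SIZE, iter_pages and launch_in_pages are illustrative names, not part of the project code):

    PAGE_SIZE = 50  # one batch of targets per crawl pass

    def iter_pages(ids, page_size=PAGE_SIZE):
        """Yield successive slices of at most page_size ids."""
        for start in range(0, len(ids), page_size):
            yield ids[start:start + page_size]

    def launch_in_pages(target_ids):
        for page in iter_pages(target_ids):
            for target_id in page:
                # the real code schedules one spider per target here
                # (process.crawl(...)); a target may have been deleted in the
                # meantime, so a failed lookup is simply skipped
                print("would crawl target", target_id)
            # one blocking run per page (process.start() in the real code)
            print("crawling a page of", len(page), "targets")

    if __name__ == "__main__":
        launch_in_pages(list(range(1, 121)))  # 120 fake ids -> 3 pages of <= 50
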
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index 7702ea6..d01c5c6 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -43,7 +43,6 @@ class DefaultSpider:
crawl_result = None
links_reached = set()
redirect = None
- is_timeout = False
def start_requests(self):
q = {
@@ -129,15 +128,13 @@ class DefaultSpider:
pk=self.crawl_result.pk)
result.status = "T"
result.save()
- self.is_timeout = True
raise CloseSpider('timeout')
def parse(self, response):
result = {
"url": response.url,
}
- if self.is_timeout or self.timeout():
- raise CloseSpider('timeout')
+ self.timeout()
for domain in self.excluded_domains:
if domain in response.url:
result["is_online"] = False
@@ -327,67 +324,75 @@ def launch_crawl(crawl_item, excluded_domains=None):
crawl_item.crawl_ended = None
crawl_item.ended = None
crawl_item.progression = 0
- crawl_item.status = "W"
+ crawl_item.status = "P"
crawl_item.save()
- for target in crawl_item.targets.all():
- crawl_item.progression += 1
- crawl_item.save()
- result_dct = {
- "crawl_id": crawl_item.pk,
- "target_id": target.pk,
- }
- response, verify_ssl = None, True
- while response is None:
- try:
- response = requests.get(target.url, verify=verify_ssl,
- timeout=45)
- except requests.exceptions.SSLError:
- if not verify_ssl: # new error on SSL
- response = "Try..." # scrapy is more permissive
- else:
- update_db_result(result_dct, {"bad_ssl": True})
- verify_ssl = False
- time.sleep(
- settings.SCRAPPY_SETTINGS["DOWNLOAD_DELAY"] - 1
- + randint(0, 20)/10)
- except requests.exceptions.RequestException:
+ q = crawl_item.targets
+ # slice
+ total = q.count()
+ targets = q.values("id")
+ page = 0
+ page_number = total // 50
+ while page >= page_number:
+ idx = 0
+ current_idx = page * 50
+ while current_idx < total and idx < 50:
+ idx += 1
+ current_idx = idx + page * 50
+ target = models.Target.objects.filter(pk=targets['id'])
+            if not target.count(): # target has disappeared
+ continue
+ target = target.all()[0]
+ result_dct = {
+ "crawl_id": crawl_item.pk,
+ "target_id": target.pk,
+ }
+ response, verify_ssl = None, True
+ while response is None:
+ try:
+ response = requests.get(target.url, verify=verify_ssl,
+ timeout=45)
+ except requests.exceptions.SSLError:
+ if not verify_ssl: # new error on SSL
+ response = False
+ else:
+ update_db_result(result_dct, {"bad_ssl": True})
+ verify_ssl = False
+ time.sleep(
+ settings.SCRAPPY_SETTINGS["DOWNLOAD_DELAY"] - 1
+ + randint(0, 20) / 10)
+ except requests.exceptions.RequestException:
+ update_db_result(result_dct, {"is_online": False,
+ "status": "F"})
+ response = False
+ if response is False: # scrapy is more permissive - try it
+ pass
+ elif response.status_code == 404:
update_db_result(result_dct, {"is_online": False,
"status": "F"})
- response = False
- if response is False:
- continue
- if response == "Try...":
- pass
- elif response.status_code == 404:
- update_db_result(result_dct, {"is_online": False,
- "status": "F"})
- continue
- else:
- url = target.url
-
- redirect = None
- if getattr(response, 'history', None):
- url = response.url
- redirect = url
- domain = get_domain(url)
- if domain in excluded_domains:
- update_db_result(
- result_dct, {"is_online": False, "status": "F",
- "redirection": redirect})
continue
- process.crawl(
- create_spider(
- "Crawl{}Target{}".format(crawl_item.pk, target.pk),
- [url],
- crawl_item, target,
- excluded_domains,
- redirect
+ else:
+ url = target.url
+
+ redirect = None
+ if getattr(response, 'history', None):
+ url = response.url
+ redirect = url
+ domain = get_domain(url)
+ if domain in excluded_domains:
+ update_db_result(
+ result_dct, {"is_online": False, "status": "F",
+ "redirection": redirect})
+ continue
+ process.crawl(
+ create_spider(
+ "Crawl{}Target{}".format(crawl_item.pk, target.pk),
+ [url],
+ crawl_item, target,
+ excluded_domains,
+ redirect
+ )
)
- )
- crawl_item.pre_crawl_ended = timezone.now()
- crawl_item.status = "P"
- crawl_item.save()
- process.start()
+ process.start()
crawl_item.crawl_ended = timezone.now()
crawl_item.status = "M"
crawl_item.save()
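
On the spider side, the timeout handling is simplified: timeout() now raises CloseSpider itself when the budget is exceeded, so parse() only calls self.timeout() instead of tracking an is_timeout flag and re-raising. A minimal sketch of that pattern, assuming a standard Scrapy spider (TimedSpider, MAX_DURATION and the started attribute are illustrative, not the project's actual code):

    import time

    import scrapy
    from scrapy.exceptions import CloseSpider

    MAX_DURATION = 600  # illustrative crawl budget, in seconds

    class TimedSpider(scrapy.Spider):
        name = "timed"
        start_urls = ["https://example.org/"]

        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            self.started = time.time()

        def timeout(self):
            # raising here closes the spider immediately; no flag to check later
            if time.time() - self.started > MAX_DURATION:
                raise CloseSpider("timeout")

        def parse(self, response):
            self.timeout()
            yield {"url": response.url}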