summary | refs | log | tree | commit | diff
path: root/commcrawler/scrapy.py
diff options
context:
space:
mode:
Diffstat (limited to 'commcrawler/scrapy.py')
-rw-r--r-- commcrawler/scrapy.py | 41
1 files changed, 22 insertions, 19 deletions
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index 5fc9fce..a430f0e 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -312,6 +312,14 @@ def launch_match(crawl_item):
crawl_item.save()
+def update_db_result(result_dct, values):
+ result, __ = models.CrawlResult.objects.get_or_create(
+ **result_dct)
+ for k in values.keys():
+ setattr(result, k, values[k])
+ result.save()
+
+
def launch_crawl(crawl_item, excluded_domains=None):
scrap_settings = settings.SCRAPPY_SETTINGS.copy()
process = CrawlerProcess(settings=scrap_settings)
@@ -324,24 +332,22 @@ def launch_crawl(crawl_item, excluded_domains=None):
"crawl_id": crawl_item.pk,
"target_id": target.pk,
}
- response, verify_ssl = None, True
- while not response:
+ response, verify_ssl, retry = None, True, 5
+ while response is None:
try:
response = requests.get(target.url, verify=verify_ssl)
except requests.exceptions.SSLError:
+ update_db_result(result_dct, {"bad_ssl": True})
verify_ssl = False
- result, __ = models.CrawlResult.objects.get_or_create(
- **result_dct)
- result.bad_ssl = True
- result.save()
except requests.exceptions.RequestException:
- result, __ = models.CrawlResult.objects.get_or_create(
- **result_dct)
- result.is_online = False
- result.status = "F"
- result.save()
- break
- if not response:
+ update_db_result(result_dct, {"is_online": False,
+ "status": "F"})
+ response = False
+ if response is False:
+ continue
+ if response.status_code == 404:
+ update_db_result(result_dct, {"is_online": False,
+ "status": "F"})
continue
redirect = None
@@ -351,12 +357,9 @@ def launch_crawl(crawl_item, excluded_domains=None):
redirect = url
domain = get_domain(url)
if domain in excluded_domains:
- result, __ = models.CrawlResult.objects.get_or_create(
- **result_dct)
- result.redirection = redirect
- result.is_online = False
- result.status = "F"
- result.save()
+ update_db_result(
+ result_dct, {"is_online": False, "status": "F",
+ "redirection": redirect})
continue
process.crawl(
create_spider(