summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r--  commcrawler/lookups.py  2
-rw-r--r--  commcrawler/scrapy.py   18
-rwxr-xr-x  install.sh              4
3 files changed, 15 insertions, 9 deletions
diff --git a/commcrawler/lookups.py b/commcrawler/lookups.py
index 106a69a..396490b 100644
--- a/commcrawler/lookups.py
+++ b/commcrawler/lookups.py
@@ -14,7 +14,7 @@ class TargetLookup(LookupChannel):
def get_query(self, q, request):
query = Q()
for term in q.strip().split(' '):
- subquery = Q(name__icontains=term)
+ subquery = Q(name__icontains=term) | Q(url__icontains=term)
query &= subquery
return self.model.objects.filter(query).order_by('name')[:20]
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index 39e3a3e..e3782d6 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -337,27 +337,33 @@ def launch_crawl(crawl_item, excluded_domains=None):
"crawl_id": crawl_item.pk,
"target_id": target.pk,
}
- response, verify_ssl, retry = None, True, 5
+ response, verify_ssl = None, True
while response is None:
try:
response = requests.get(target.url, verify=verify_ssl)
except requests.exceptions.SSLError:
- update_db_result(result_dct, {"bad_ssl": True})
- verify_ssl = False
+ if not verify_ssl: # new error on SSL
+ response = "Try..." # scrapy is more permissive
+ else:
+ update_db_result(result_dct, {"bad_ssl": True})
+ verify_ssl = False
except requests.exceptions.RequestException:
update_db_result(result_dct, {"is_online": False,
"status": "F"})
response = False
if response is False:
continue
- if response.status_code == 404:
+ if response == "Try...":
+ pass
+ elif response.status_code == 404:
update_db_result(result_dct, {"is_online": False,
"status": "F"})
continue
+ else:
+ url = target.url
redirect = None
- url = target.url
- if response.history:
+ if getattr(response, 'history', None):
url = response.url
redirect = url
domain = get_domain(url)
diff --git a/install.sh b/install.sh
index 414bc1f..0005cd2 100755
--- a/install.sh
+++ b/install.sh
@@ -17,10 +17,10 @@ APT_OPTIONS=" -y -q "
apt-get install $APT_OPTIONS git nginx uwsgi uwsgi-plugin-python3 postgresql apg sed gettext > /dev/null
apt-get install $APT_OPTIONS python3 python3-pip python3-psycopg2 python3-pygments python3-service-identity > /dev/null
-apt-get install $APT_OPTIONS -t stretch-backports python3-django python3-requests > /dev/null
+apt-get install $APT_OPTIONS -t stretch-backports python3-django > /dev/null
# buster/bulleyes: apt install python3-django python3-requests
-pip3 install scrapy==1.7 tldextract==2.2 django-ajax-selects==1.6.0 service_identity==18.1 > /dev/null
+pip3 install scrapy==1.7 tldextract==2.2 django-ajax-selects==1.6.0 service_identity==18.1 requests[security] > /dev/null
# buster: apt install python3-tldextract django-ajax-selects
# buster: apt install -t buster-backports python3-service-identity
# bullseye: apt install python3-scrapy python3-tldextract django-ajax-selects python3-service-identity