-rw-r--r--  .gitignore                                        2
-rw-r--r--  commcrawler/management/__init__.py                0
-rw-r--r--  commcrawler/management/commands/__init__.py       0
-rw-r--r--  commcrawler/management/commands/launch_crawl.py  39
-rw-r--r--  commcrawler/scrapy.py                            41
-rw-r--r--  commonnet/scrapy_setting.py                       1
-rw-r--r--  commorganization/admin.py                         5
-rw-r--r--  requirements.txt                                  3
8 files changed, 83 insertions(+), 8 deletions(-)
diff --git a/.gitignore b/.gitignore
index 8ff3fee..e78f46c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,5 +4,7 @@
 *.mo
 *~
 .idea
+.scrapy
 *.sqlite3
 local_settings.py
+result.json
diff --git a/commcrawler/management/__init__.py b/commcrawler/management/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/commcrawler/management/__init__.py
diff --git a/commcrawler/management/commands/__init__.py b/commcrawler/management/commands/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/commcrawler/management/commands/__init__.py
diff --git a/commcrawler/management/commands/launch_crawl.py b/commcrawler/management/commands/launch_crawl.py
new file mode 100644
index 0000000..050a54d
--- /dev/null
+++ b/commcrawler/management/commands/launch_crawl.py
@@ -0,0 +1,39 @@
+import csv
+import sys
+
+from django.core.management.base import BaseCommand
+
+from commcrawler.models import Crawl
+from commcrawler.scrapy import launch_crawl
+
+
+class Command(BaseCommand):
+    help = 'Launch a crawl'
+
+    def add_arguments(self, parser):
+        parser.add_argument(
+            '--quiet', dest='quiet', action='store_true',
+            help='Quiet output')
+
+    def handle(self, *args, **options):
+        quiet = options['quiet']
+        q = Crawl.objects.filter(status="C")
+        if not q.count():
+            sys.stdout.write('No crawl waiting. Exit.\n')
+            return
+
+        crawls = dict([(c.pk, c) for c in q.all()])
+        available_ids = crawls.keys()
+        c_id = None
+        while c_id not in available_ids:
+            sys.stdout.write('Which crawl to launch (type the number):\n')
+            for crawl_id, crawl in crawls.items():
+                sys.stdout.write('* {} - {}\n'.format(crawl_id, crawl))
+            sys.stdout.flush()
+            try:
+                c_id = int(input("# "))
+            except ValueError:
+                c_id = None
+        current_crawl = crawls[c_id]
+        launch_crawl(current_crawl)
+
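
Usage note (not part of the patch): the command runs through manage.py and prompts interactively; the crawl shown below is invented for illustration.

    $ python manage.py launch_crawl
    Which crawl to launch (type the number):
    * 3 - June 2019 crawl
    # 3

As committed, --quiet only populates options['quiet']; handle() does not act on it yet.
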
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index 77dafe9..ea58164 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -1,22 +1,53 @@
 import scrapy
 from scrapy.crawler import CrawlerProcess
+from scrapy.linkextractors import LinkExtractor
+import tldextract
 
 from django.conf import settings
 
 
 class DefaultSpider:
-    pass
+    def _parse_image(self, response, result):
+        for __ in response.css('img'):
+            if 'nb_images' not in result:
+                result["nb_images"] = 0
+            result["nb_images"] += 1
+
+    def parse(self, response):
+        result = {
+            "url": response.url,
+            "target_id": self.target_id
+        }
+        self._parse_image(response, result)
+
+        yield result
+
+        for link in LinkExtractor().extract_links(response):
+            if link.url is not None:
+                yield response.follow(link.url, self.parse)
+
+
+def get_domain(url):
+    ext = tldextract.extract(url)
+    return '{}.{}'.format(ext.domain, ext.suffix)
 
 
 def create_spider(name, urls, target=None):
     return type(
-        name, (scrapy.Spider, DefaultSpider),
-        {"name": name, "start_urls": urls, "target": target}
+        name, (DefaultSpider, scrapy.Spider),
+        {"name": name, "start_urls": urls,
+         "allowed_domains": [get_domain(url) for url in urls],
+         "target_id": target.pk}
     )
 
 
-def crawl(crawl_item):
-    process = CrawlerProcess(settings=settings.SCRAPPY_SETTINGS)
+def launch_crawl(crawl_item):
+    scrap_settings = settings.SCRAPPY_SETTINGS.copy()
+    scrap_settings.update({
+        'FEED_FORMAT': 'json',
+        'FEED_URI': 'result.json'
+    })
+    process = CrawlerProcess(settings=scrap_settings)
     for target in crawl_item.targets.all():
         process.crawl(
             create_spider("Target{}".format(target.pk),
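
Two details in commcrawler/scrapy.py deserve a gloss. get_domain() relies on tldextract, which splits hostnames against the public-suffix list, so multi-part suffixes such as co.uk survive where a naive split on dots would not. And the reordered bases in create_spider() are a genuine fix: method resolution must reach DefaultSpider.parse() before scrapy.Spider.parse(), which only raises NotImplementedError. A standalone sketch of both points (illustrative, not part of the patch):

    import scrapy
    import tldextract

    def get_domain(url):
        # Public-suffix aware: "co.uk" is kept as a single suffix.
        ext = tldextract.extract(url)
        return '{}.{}'.format(ext.domain, ext.suffix)

    assert get_domain("https://blog.example.co.uk/post") == "example.co.uk"

    class DefaultSpider:
        def parse(self, response):
            yield {"url": response.url}

    # Same type() trick as create_spider(): base-class order decides
    # which parse() the MRO finds first.
    Fixed = type("Fixed", (DefaultSpider, scrapy.Spider), {"name": "fixed"})
    Broken = type("Broken", (scrapy.Spider, DefaultSpider), {"name": "broken"})
    assert Fixed.parse is DefaultSpider.parse
    assert Broken.parse is scrapy.Spider.parse  # raises NotImplementedError if called
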
diff --git a/commonnet/scrapy_setting.py b/commonnet/scrapy_setting.py
index 6330705..2d2b7b7 100644
--- a/commonnet/scrapy_setting.py
+++ b/commonnet/scrapy_setting.py
@@ -10,6 +10,7 @@ SCRAPPY_SETTINGS = {
     # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
     # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
     "BOT_NAME": 'commonnet-scraper',
+    "DEPTH_LIMIT": 5,
 
     # Obey robots.txt rules
     "ROBOTSTXT_OBEY": True,
diff --git a/commorganization/admin.py b/commorganization/admin.py
index 2b3c042..b26ad7e 100644
--- a/commorganization/admin.py
+++ b/commorganization/admin.py
@@ -9,6 +9,7 @@ from django.utils.translation import ugettext_lazy as _
 
 from commonnet.admin_site import admin_site
 from commorganization import models
+from commcrawler.models import Crawl
 
 
 admin_site.register(User, UserAdmin)
@@ -60,7 +61,7 @@ class AddToCrawlForm(forms.Form):
         super(AddToCrawlForm, self).__init__(*args, **kwargs)
         self.fields["crawl"].choices = [(None, "--")] + [
             (c.pk, str(c))
-            for c in models.Crawl.objects.filter(status="C").all()
+            for c in Crawl.objects.filter(status="C").all()
         ]
 
 
@@ -77,7 +78,7 @@ class TargetAdmin(admin.ModelAdmin):
         if form.is_valid():
             crawl = None
             try:
-                crawl = models.Crawl.objects.get(
+                crawl = Crawl.objects.get(
                     pk=form.cleaned_data["crawl"],
                     status="C"
                 )
diff --git a/requirements.txt b/requirements.txt
index 43068f9..bf39a7d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,5 @@
+scrapy==1.7
+tldextract
 django==1.11
-scrapy==1.5
 django-ajax-selects==1.6.0
 # https://splash.readthedocs.io/
\ No newline at end of file