author    | Étienne Loks <etienne.loks@iggdrasil.net> | 2019-07-31 17:56:53 +0200
committer | Étienne Loks <etienne@peacefrogs.net>     | 2019-07-31 17:56:53 +0200
commit    | 108b5514fe795e3bbf4c76245047f5ea054c3d20 (patch)
tree      | 8bb5ded34e2205583b8cb12101bc3f945252ea1d
parent    | dd2dd640aa649c715a843fa431621fd955ca6767 (diff)
download  | Comm-on-net-108b5514fe795e3bbf4c76245047f5ea054c3d20.tar.bz2
          | Comm-on-net-108b5514fe795e3bbf4c76245047f5ea054c3d20.zip
Basic crawling
-rw-r--r-- | .gitignore                                      |  2
-rw-r--r-- | commcrawler/management/__init__.py              |  0
-rw-r--r-- | commcrawler/management/commands/__init__.py     |  0
-rw-r--r-- | commcrawler/management/commands/launch_crawl.py | 39
-rw-r--r-- | commcrawler/scrapy.py                           | 41
-rw-r--r-- | commonnet/scrapy_setting.py                     |  1
-rw-r--r-- | commorganization/admin.py                       |  5
-rw-r--r-- | requirements.txt                                |  3
8 files changed, 83 insertions, 8 deletions
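The core of this commit is the new launch_crawl management command and the spider factory shown in the diff below. Since it is a regular Django management command (Django discovers it from commcrawler/management/commands/launch_crawl.py), it can be run through manage.py or called programmatically. The snippet below is only a minimal sketch of the programmatic route, not part of the commit; the settings module path is assumed.

    # Hypothetical driver script; only the "launch_crawl" command name and its
    # --quiet flag come from the diff below.
    import os
    import django
    from django.core.management import call_command

    # Assumed settings module path, not taken from the commit.
    os.environ.setdefault("DJANGO_SETTINGS_MODULE", "commonnet.settings")
    django.setup()

    call_command("launch_crawl")   # prompts for which waiting crawl (status "C") to run
    # call_command("launch_crawl", quiet=True) would pass the --quiet flag defined below.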
diff --git a/.gitignore b/.gitignore
--- a/.gitignore
+++ b/.gitignore
@@ -4,5 +4,7 @@
 *.mo
 *~
 .idea
+.scrapy
 *.sqlite3
 local_settings.py
+result.json
diff --git a/commcrawler/management/__init__.py b/commcrawler/management/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/commcrawler/management/__init__.py
diff --git a/commcrawler/management/commands/__init__.py b/commcrawler/management/commands/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/commcrawler/management/commands/__init__.py
diff --git a/commcrawler/management/commands/launch_crawl.py b/commcrawler/management/commands/launch_crawl.py
new file mode 100644
index 0000000..050a54d
--- /dev/null
+++ b/commcrawler/management/commands/launch_crawl.py
@@ -0,0 +1,39 @@
+import csv
+import sys
+
+from django.core.management.base import BaseCommand
+
+from commcrawler.models import Crawl
+from commcrawler.scrapy import launch_crawl
+
+
+class Command(BaseCommand):
+    help = 'Launch a crawl'
+
+    def add_arguments(self, parser):
+        parser.add_argument(
+            '--quiet', dest='quiet', action='store_true',
+            help='Quiet output')
+
+    def handle(self, *args, **options):
+        quiet = options['quiet']
+        q = Crawl.objects.filter(status="C")
+        if not q.count():
+            sys.stdout.write('No crawl waiting. Exit.\n')
+            return
+
+        crawls = dict([(c.pk, c) for c in q.all()])
+        available_ids = crawls.keys()
+        c_id = None
+        while c_id not in available_ids:
+            sys.stdout.write('Which crawl to launch (type the number):\n')
+            for crawl_id, crawl in crawls.items():
+                sys.stdout.write('* {} - {}\n'.format(crawl_id, crawl))
+            sys.stdout.flush()
+            try:
+                c_id = int(input("# "))
+            except ValueError:
+                c_id = None
+        current_crawl = crawls[c_id]
+        launch_crawl(current_crawl)
+
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index 77dafe9..ea58164 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -1,22 +1,53 @@
 import scrapy
 from scrapy.crawler import CrawlerProcess
+from scrapy.linkextractors import LinkExtractor
+import tldextract
 
 from django.conf import settings
 
 
 class DefaultSpider:
-    pass
+    def _parse_image(self, response, result):
+        for __ in response.css('img'):
+            if 'nb_images' not in result:
+                result["nb_images"] = 0
+            result["nb_images"] += 1
+
+    def parse(self, response):
+        result = {
+            "url": response.url,
+            "target_id": self.target_id
+        }
+        self._parse_image(response, result)
+
+        yield result
+
+        for link in LinkExtractor().extract_links(response):
+            if link.url is not None:
+                yield response.follow(link.url, self.parse)
+
+
+def get_domain(url):
+    ext = tldextract.extract(url)
+    return '{}.{}'.format(ext.domain, ext.suffix)
 
 
 def create_spider(name, urls, target=None):
     return type(
-        name, (scrapy.Spider, DefaultSpider),
-        {"name": name, "start_urls": urls, "target": target}
+        name, (DefaultSpider, scrapy.Spider),
+        {"name": name, "start_urls": urls,
+         "allowed_domains": [get_domain(url) for url in urls],
+         "target_id": target.pk}
     )
 
 
-def crawl(crawl_item):
-    process = CrawlerProcess(settings=settings.SCRAPPY_SETTINGS)
+def launch_crawl(crawl_item):
+    scrap_settings = settings.SCRAPPY_SETTINGS.copy()
+    scrap_settings.update({
+        'FEED_FORMAT': 'json',
+        'FEED_URI': 'result.json'
+    })
+    process = CrawlerProcess(settings=scrap_settings)
     for target in crawl_item.targets.all():
         process.crawl(
             create_spider("Target{}".format(target.pk),
diff --git a/commonnet/scrapy_setting.py b/commonnet/scrapy_setting.py
index 6330705..2d2b7b7 100644
--- a/commonnet/scrapy_setting.py
+++ b/commonnet/scrapy_setting.py
@@ -10,6 +10,7 @@ SCRAPPY_SETTINGS = {
     # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
     # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
     "BOT_NAME": 'commonnet-scraper',
+    "DEPTH_LIMIT": 5,
 
     # Obey robots.txt rules
     "ROBOTSTXT_OBEY": True,
diff --git a/commorganization/admin.py b/commorganization/admin.py
index 2b3c042..b26ad7e 100644
--- a/commorganization/admin.py
+++ b/commorganization/admin.py
@@ -9,6 +9,7 @@ from django.utils.translation import ugettext_lazy as _
 
 from commonnet.admin_site import admin_site
 from commorganization import models
+from commcrawler.models import Crawl
 
 admin_site.register(User, UserAdmin)
 
@@ -60,7 +61,7 @@ class AddToCrawlForm(forms.Form):
         super(AddToCrawlForm, self).__init__(*args, **kwargs)
         self.fields["crawl"].choices = [(None, "--")] + [
             (c.pk, str(c))
-            for c in models.Crawl.objects.filter(status="C").all()
+            for c in Crawl.objects.filter(status="C").all()
         ]
 
 
@@ -77,7 +78,7 @@ class TargetAdmin(admin.ModelAdmin):
         if form.is_valid():
             crawl = None
             try:
-                crawl = models.Crawl.objects.get(
+                crawl = Crawl.objects.get(
                     pk=form.cleaned_data["crawl"],
                     status="C"
                 )
diff --git a/requirements.txt b/requirements.txt
index 43068f9..bf39a7d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,5 @@
+scrapy==1.7
+tldextract
 django==1.11
-scrapy==1.5
 django-ajax-selects==1.6.0
 # https://splash.readthedocs.io/
\ No newline at end of file
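Two details in the updated commcrawler/scrapy.py are easy to miss when reading the diff above: get_domain() relies on tldextract to derive allowed_domains from each target's start URL, and create_spider() now lists DefaultSpider before scrapy.Spider so that the mixin's parse() wins in the method resolution order of the dynamically built class. The sketch below replays both ideas outside Scrapy; the two toy classes are hypothetical stand-ins, and only the get_domain() body mirrors the committed code.

    import tldextract


    def get_domain(url):
        # tldextract splits a URL into subdomain / domain / suffix and knows
        # about multi-part public suffixes such as "co.uk".
        ext = tldextract.extract(url)
        return '{}.{}'.format(ext.domain, ext.suffix)


    print(get_domain("https://www.example.co.uk/page"))  # example.co.uk


    # Hypothetical stand-ins showing why the base-class order matters:
    class FrameworkSpider:                  # plays the role of scrapy.Spider
        def parse(self, response):
            raise NotImplementedError


    class CrawlMixin:                       # plays the role of DefaultSpider
        def parse(self, response):
            return {"url": response}


    # type(name, bases, attrs) builds a class on the fly, as create_spider() does.
    Working = type("Working", (CrawlMixin, FrameworkSpider), {"name": "working"})
    Broken = type("Broken", (FrameworkSpider, CrawlMixin), {"name": "broken"})

    print(Working().parse("http://example.org"))  # {'url': 'http://example.org'}
    # Broken().parse(...) would reach FrameworkSpider.parse and raise NotImplementedError.

The same ordering is what lets every generated TargetN spider share DefaultSpider.parse() while still inheriting the crawling machinery from scrapy.Spider. The result.json feed and the .scrapy cache directory left behind by a run are the artefacts covered by the new .gitignore entries.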