author    | Étienne Loks <etienne.loks@iggdrasil.net> | 2019-07-31 17:56:53 +0200
committer | Étienne Loks <etienne@peacefrogs.net>     | 2019-07-31 17:56:53 +0200
commit    | 108b5514fe795e3bbf4c76245047f5ea054c3d20 (patch)
tree      | 8bb5ded34e2205583b8cb12101bc3f945252ea1d
parent    | dd2dd640aa649c715a843fa431621fd955ca6767 (diff)
download  | Comm-on-net-108b5514fe795e3bbf4c76245047f5ea054c3d20.tar.bz2
          | Comm-on-net-108b5514fe795e3bbf4c76245047f5ea054c3d20.zip
Basic crawling
-rw-r--r-- | .gitignore                                      |  2
-rw-r--r-- | commcrawler/management/__init__.py              |  0
-rw-r--r-- | commcrawler/management/commands/__init__.py     |  0
-rw-r--r-- | commcrawler/management/commands/launch_crawl.py | 39
-rw-r--r-- | commcrawler/scrapy.py                           | 41
-rw-r--r-- | commonnet/scrapy_setting.py                     |  1
-rw-r--r-- | commorganization/admin.py                       |  5
-rw-r--r-- | requirements.txt                                |  3
8 files changed, 83 insertions, 8 deletions
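The core of this commit is the new launch_crawl management command and the spider factory shown in the diff below. Since it is a regular Django management command (Django discovers it from commcrawler/management/commands/launch_crawl.py), it can be run through manage.py or called programmatically. The snippet below is only a minimal sketch of the programmatic route, not part of the commit; the settings module path is assumed.

    # Hypothetical driver script; only the "launch_crawl" command name and its
    # --quiet flag come from the diff below.
    import os
    import django
    from django.core.management import call_command

    # Assumed settings module path, not taken from the commit.
    os.environ.setdefault("DJANGO_SETTINGS_MODULE", "commonnet.settings")
    django.setup()

    call_command("launch_crawl")   # prompts for which waiting crawl (status "C") to run
    # call_command("launch_crawl", quiet=True) would pass the --quiet flag defined below.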
diff --git a/.gitignore b/.gitignore
--- a/.gitignore
+++ b/.gitignore
@@ -4,5 +4,7 @@
 *.mo
 *~
 .idea
+.scrapy
 *.sqlite3
 local_settings.py
+result.json
diff --git a/commcrawler/management/__init__.py b/commcrawler/management/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/commcrawler/management/__init__.py
diff --git a/commcrawler/management/commands/__init__.py b/commcrawler/management/commands/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/commcrawler/management/commands/__init__.py
diff --git a/commcrawler/management/commands/launch_crawl.py b/commcrawler/management/commands/launch_crawl.py
new file mode 100644
index 0000000..050a54d
--- /dev/null
+++ b/commcrawler/management/commands/launch_crawl.py
@@ -0,0 +1,39 @@
+import csv
+import sys
+
+from django.core.management.base import BaseCommand
+
+from commcrawler.models import Crawl
+from commcrawler.scrapy import launch_crawl
+
+
+class Command(BaseCommand):
+    help = 'Launch a crawl'
+
+    def add_arguments(self, parser):
+        parser.add_argument(
+            '--quiet', dest='quiet', action='store_true',
+            help='Quiet output')
+
+    def handle(self, *args, **options):
+        quiet = options['quiet']
+        q = Crawl.objects.filter(status="C")
+        if not q.count():
+            sys.stdout.write('No crawl waiting. Exit.\n')
+            return
+
+        crawls = dict([(c.pk, c) for c in q.all()])
+        available_ids = crawls.keys()
+        c_id = None
+        while c_id not in available_ids:
+            sys.stdout.write('Which crawl to launch (type the number):\n')
+            for crawl_id, crawl in crawls.items():
+                sys.stdout.write('* {} - {}\n'.format(crawl_id, crawl))
+            sys.stdout.flush()
+            try:
+                c_id = int(input("# "))
+            except ValueError:
+                c_id = None
+        current_crawl = crawls[c_id]
+        launch_crawl(current_crawl)
+
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index 77dafe9..ea58164 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -1,22 +1,53 @@
 import scrapy
 from scrapy.crawler import CrawlerProcess
+from scrapy.linkextractors import LinkExtractor
+import tldextract
 
 from django.conf import settings
 
 
 class DefaultSpider:
-    pass
+    def _parse_image(self, response, result):
+        for __ in response.css('img'):
+            if 'nb_images' not in result:
+                result["nb_images"] = 0
+            result["nb_images"] += 1
+
+    def parse(self, response):
+        result = {
+            "url": response.url,
+            "target_id": self.target_id
+        }
+        self._parse_image(response, result)
+
+        yield result
+
+        for link in LinkExtractor().extract_links(response):
+            if link.url is not None:
+                yield response.follow(link.url, self.parse)
+
+
+def get_domain(url):
+    ext = tldextract.extract(url)
+    return '{}.{}'.format(ext.domain, ext.suffix)
 
 
 def create_spider(name, urls, target=None):
     return type(
-        name, (scrapy.Spider, DefaultSpider),
-        {"name": name, "start_urls": urls, "target": target}
+        name, (DefaultSpider, scrapy.Spider),
+        {"name": name, "start_urls": urls,
+         "allowed_domains": [get_domain(url) for url in urls],
+         "target_id": target.pk}
     )
 
 
-def crawl(crawl_item):
-    process = CrawlerProcess(settings=settings.SCRAPPY_SETTINGS)
+def launch_crawl(crawl_item):
+    scrap_settings = settings.SCRAPPY_SETTINGS.copy()
+    scrap_settings.update({
+        'FEED_FORMAT': 'json',
+        'FEED_URI': 'result.json'
+    })
+    process = CrawlerProcess(settings=scrap_settings)
     for target in crawl_item.targets.all():
         process.crawl(
             create_spider("Target{}".format(target.pk),
diff --git a/commonnet/scrapy_setting.py b/commonnet/scrapy_setting.py
index 6330705..2d2b7b7 100644
--- a/commonnet/scrapy_setting.py
+++ b/commonnet/scrapy_setting.py
@@ -10,6 +10,7 @@ SCRAPPY_SETTINGS = {
     # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
     # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
     "BOT_NAME": 'commonnet-scraper',
+    "DEPTH_LIMIT": 5,
 
     # Obey robots.txt rules
     "ROBOTSTXT_OBEY": True,
diff --git a/commorganization/admin.py b/commorganization/admin.py
index 2b3c042..b26ad7e 100644
--- a/commorganization/admin.py
+++ b/commorganization/admin.py
@@ -9,6 +9,7 @@ from django.utils.translation import ugettext_lazy as _
 
 from commonnet.admin_site import admin_site
 from commorganization import models
+from commcrawler.models import Crawl
 
 admin_site.register(User, UserAdmin)
 
@@ -60,7 +61,7 @@ class AddToCrawlForm(forms.Form):
         super(AddToCrawlForm, self).__init__(*args, **kwargs)
         self.fields["crawl"].choices = [(None, "--")] + [
             (c.pk, str(c))
-            for c in models.Crawl.objects.filter(status="C").all()
+            for c in Crawl.objects.filter(status="C").all()
         ]
 
 
@@ -77,7 +78,7 @@ class TargetAdmin(admin.ModelAdmin):
         if form.is_valid():
             crawl = None
             try:
-                crawl = models.Crawl.objects.get(
+                crawl = Crawl.objects.get(
                     pk=form.cleaned_data["crawl"],
                     status="C"
                 )
diff --git a/requirements.txt b/requirements.txt
index 43068f9..bf39a7d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,5 @@
+scrapy==1.7
+tldextract
 django==1.11
-scrapy==1.5
 django-ajax-selects==1.6.0
 # https://splash.readthedocs.io/
\ No newline at end of file
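Two details in the updated commcrawler/scrapy.py are easy to miss when reading the diff above: get_domain() relies on tldextract to derive allowed_domains from each target's start URL, and create_spider() now lists DefaultSpider before scrapy.Spider so that the mixin's parse() wins in the method resolution order of the dynamically built class. The sketch below replays both ideas outside Scrapy; the two toy classes are hypothetical stand-ins, and only the get_domain() body mirrors the committed code.

    import tldextract


    def get_domain(url):
        # tldextract splits a URL into subdomain / domain / suffix and knows
        # about multi-part public suffixes such as "co.uk".
        ext = tldextract.extract(url)
        return '{}.{}'.format(ext.domain, ext.suffix)


    print(get_domain("https://www.example.co.uk/page"))  # example.co.uk


    # Hypothetical stand-ins showing why the base-class order matters:
    class FrameworkSpider:                  # plays the role of scrapy.Spider
        def parse(self, response):
            raise NotImplementedError


    class CrawlMixin:                       # plays the role of DefaultSpider
        def parse(self, response):
            return {"url": response}


    # type(name, bases, attrs) builds a class on the fly, as create_spider() does.
    Working = type("Working", (CrawlMixin, FrameworkSpider), {"name": "working"})
    Broken = type("Broken", (FrameworkSpider, CrawlMixin), {"name": "broken"})

    print(Working().parse("http://example.org"))  # {'url': 'http://example.org'}
    # Broken().parse(...) would reach FrameworkSpider.parse and raise NotImplementedError.

The same ordering is what lets every generated TargetN spider share DefaultSpider.parse() while still inheriting the crawling machinery from scrapy.Spider. The result.json feed and the .scrapy cache directory left behind by a run are the artefacts covered by the new .gitignore entries.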