author     Étienne Loks <etienne.loks@iggdrasil.net>   2019-07-31 17:56:53 +0200
committer  Étienne Loks <etienne@peacefrogs.net>       2019-07-31 17:56:53 +0200
commit     108b5514fe795e3bbf4c76245047f5ea054c3d20
tree       8bb5ded34e2205583b8cb12101bc3f945252ea1d /commcrawler/scrapy.py
parent     dd2dd640aa649c715a843fa431621fd955ca6767
Basic crawling
Diffstat (limited to 'commcrawler/scrapy.py')
-rw-r--r--  commcrawler/scrapy.py | 41 ++++++++++++++++++++++++++++++++++++-----
1 file changed, 36 insertions(+), 5 deletions(-)
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index 77dafe9..ea58164 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -1,22 +1,53 @@
 import scrapy
 from scrapy.crawler import CrawlerProcess
+from scrapy.linkextractors import LinkExtractor
+import tldextract
 
 from django.conf import settings
 
 
 class DefaultSpider:
-    pass
+    def _parse_image(self, response, result):
+        for __ in response.css('img'):
+            if 'nb_images' not in result:
+                result["nb_images"] = 0
+            result["nb_images"] += 1
+
+    def parse(self, response):
+        result = {
+            "url": response.url,
+            "target_id": self.target_id
+        }
+        self._parse_image(response, result)
+
+        yield result
+
+        for link in LinkExtractor().extract_links(response):
+            if link.url is not None:
+                yield response.follow(link.url, self.parse)
+
+
+def get_domain(url):
+    ext = tldextract.extract(url)
+    return '{}.{}'.format(ext.domain, ext.suffix)
 
 
 def create_spider(name, urls, target=None):
     return type(
-        name, (scrapy.Spider, DefaultSpider),
-        {"name": name, "start_urls": urls, "target": target}
+        name, (DefaultSpider, scrapy.Spider),
+        {"name": name, "start_urls": urls,
+         "allowed_domains": [get_domain(url) for url in urls],
+         "target_id": target.pk}
     )
 
 
-def crawl(crawl_item):
-    process = CrawlerProcess(settings=settings.SCRAPPY_SETTINGS)
+def launch_crawl(crawl_item):
+    scrap_settings = settings.SCRAPPY_SETTINGS.copy()
+    scrap_settings.update({
+        'FEED_FORMAT': 'json',
+        'FEED_URI': 'result.json'
+    })
+    process = CrawlerProcess(settings=scrap_settings)
     for target in crawl_item.targets.all():
         process.crawl(
             create_spider("Target{}".format(target.pk),
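A note on the new get_domain() helper: it uses tldextract rather than naive string splitting, so subdomains are stripped while multi-part public suffixes survive intact. A minimal, runnable sketch of its behaviour, assuming tldextract is installed; the example URLs are illustrative, not taken from the project:

import tldextract

def get_domain(url):
    # tldextract matches the URL against the public-suffix list and
    # returns (subdomain, domain, suffix) as separate parts.
    ext = tldextract.extract(url)
    return '{}.{}'.format(ext.domain, ext.suffix)

print(get_domain("https://www.example.co.uk/page"))  # example.co.uk
print(get_domain("http://blog.example.org/feed"))    # example.org

Feeding these registrable domains into allowed_domains matters because the parse() method above follows every extracted link: Scrapy's OffsiteMiddleware drops any request whose host falls outside allowed_domains, keeping each dynamically built spider on its own site. Listing DefaultSpider before scrapy.Spider in the type() call is likewise deliberate, as it places DefaultSpider.parse ahead of scrapy.Spider's parse (which raises NotImplementedError) in the method resolution order.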