summary refs log tree commit diff
path: root/commcrawler/scrapy.py
diff options
context:
space:
mode:
Diffstat (limited to 'commcrawler/scrapy.py')
-rw-r--r-- commcrawler/scrapy.py 41
1 files changed, 36 insertions, 5 deletions
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index 77dafe9..ea58164 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -1,22 +1,53 @@
import scrapy
from scrapy.crawler import CrawlerProcess
+from scrapy.linkextractors import LinkExtractor
+import tldextract
from django.conf import settings
class DefaultSpider:
- pass
+ def _parse_image(self, response, result):
+ for __ in response.css('img'):
+ if 'nb_images' not in result:
+ result["nb_images"] = 0
+ result["nb_images"] += 1
+
+ def parse(self, response):
+ result = {
+ "url": response.url,
+ "target_id": self.target_id
+ }
+ self._parse_image(response, result)
+
+ yield result
+
+ for link in LinkExtractor().extract_links(response):
+ if link.url is not None:
+ yield response.follow(link.url, self.parse)
+
+
+def get_domain(url):
+ ext = tldextract.extract(url)
+ return '{}.{}'.format(ext.domain, ext.suffix)
def create_spider(name, urls, target=None):
return type(
- name, (scrapy.Spider, DefaultSpider),
- {"name": name, "start_urls": urls, "target": target}
+ name, (DefaultSpider, scrapy.Spider),
+ {"name": name, "start_urls": urls,
+ "allowed_domains": [get_domain(url) for url in urls],
+ "target_id": target.pk}
)
-def crawl(crawl_item):
- process = CrawlerProcess(settings=settings.SCRAPPY_SETTINGS)
+def launch_crawl(crawl_item):
+ scrap_settings = settings.SCRAPPY_SETTINGS.copy()
+ scrap_settings.update({
+ 'FEED_FORMAT': 'json',
+ 'FEED_URI': 'result.json'
+ })
+ process = CrawlerProcess(settings=scrap_settings)
for target in crawl_item.targets.all():
process.crawl(
create_spider("Target{}".format(target.pk),