import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.linkextractors import LinkExtractor
import tldextract
from django.conf import settings


class DefaultSpider:
    """Mixin holding the crawl logic shared by every generated spider."""

    target_id = None

    def _parse_image(self, response, result):
        # Count the <img> tags on the page; always set the key so every
        # exported item carries the same fields.
        result["nb_images"] = len(response.css('img'))

    def parse(self, response):
        result = {
            "url": response.url,
            "target_id": self.target_id,
        }
        self._parse_image(response, result)
        # Yield the item so the JSON feed exporter actually writes it out.
        yield result
        # Follow every extracted link; Scrapy's offsite filtering keeps the
        # crawl inside allowed_domains.
        for link in LinkExtractor().extract_links(response):
            yield response.follow(link.url, callback=self.parse)


def get_domain(url):
    """Return the registered domain of a URL, e.g. 'example.co.uk'."""
    ext = tldextract.extract(url)
    return '{}.{}'.format(ext.domain, ext.suffix)


def create_spider(name, urls, target=None):
    # Build a Spider subclass dynamically so each target gets its own
    # name, start URLs and allowed domains.
    return type(
        name, (DefaultSpider, scrapy.Spider),
        {"name": name,
         "start_urls": urls,
         "allowed_domains": [get_domain(url) for url in urls],
         "target_id": target.pk if target is not None else None}
    )
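
# For illustration only (hypothetical values): the call
#     create_spider("Target3", ["https://example.com"], some_target)
# builds roughly the same class as writing it by hand:
#
#     class Target3(DefaultSpider, scrapy.Spider):
#         name = "Target3"
#         start_urls = ["https://example.com"]
#         allowed_domains = ["example.com"]
#         target_id = 3  # some_target.pk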


def launch_crawl(crawl_item):
    scrap_settings = settings.SCRAPPY_SETTINGS.copy()
    # FEED_FORMAT/FEED_URI configure the JSON export; note that Scrapy 2.1+
    # prefers the consolidated FEEDS setting for this.
    scrap_settings.update({
        'FEED_FORMAT': 'json',
        'FEED_URI': 'result.json',
    })
    process = CrawlerProcess(settings=scrap_settings)
    # Register one dynamically created spider per crawl target; they all
    # run concurrently in the same Twisted reactor.
    for target in crawl_item.targets.all():
        process.crawl(
            create_spider("Target{}".format(target.pk),
                          [target.url],
                          target)
        )
    # Blocks until every registered spider has finished.
    process.start()
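
# --- Usage sketch (assumptions, not part of the module above) ---
# Assumes a hypothetical Django model `Crawl` whose `targets` related
# manager yields objects exposing `url` and `pk`, which is the shape
# launch_crawl expects. Because CrawlerProcess.start() runs the Twisted
# reactor, it blocks and can only be started once per process, so call
# this from a management command or a background worker, not from a
# request handler:
#
#     crawl_item = Crawl.objects.get(pk=1)
#     launch_crawl(crawl_item)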