Diffstat (limited to 'commcrawler/scrapy.py')
-rw-r--r--    commcrawler/scrapy.py    50
1 file changed, 40 insertions, 10 deletions
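The patch below builds each spider's allowed_domains with the existing get_domain() helper, whose body is only partly visible in the last hunk. As a quick reference, a minimal sketch of its behaviour; the `ext = tldextract.extract(url)` line is an assumption (it falls outside the hunk context), the return line is taken from the diff:

    import tldextract

    def get_domain(url):
        # assumed first line, not shown in the hunk context below
        ext = tldextract.extract(url)
        # registered domain + public suffix, as in the diff
        return '{}.{}'.format(ext.domain, ext.suffix)

    get_domain("https://blog.example.co.uk/post")  # -> "example.co.uk"
    get_domain("http://www.python.org")            # -> "python.org"
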
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index e821a31..d218648 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -1,13 +1,21 @@
import scrapy
from scrapy.crawler import CrawlerProcess
+from scrapy.exceptions import NotSupported
from scrapy.linkextractors import LinkExtractor
import tldextract
from django.conf import settings
+MAX_LINKS = 500
+
class DefaultSpider:
+ name = None
+ start_urls = None
+ allowed_domains = []
+ excluded_domains = []
target_id = None
+ links_reached = set()
def _parse_image(self, response, result):
for __ in response.css('img'):
@@ -20,10 +28,27 @@ class DefaultSpider:
"url": response.url,
"target_id": self.target_id
}
- self._parse_image(response, result)
- for link in LinkExtractor().extract_links(response):
- if link.url is not None:
- yield response.follow(link.url, self.parse)
+ for domain in self.excluded_domains:
+ if domain in response.url:
+ result["offline"] = True
+ yield result
+ try:
+ self._parse_image(response, result)
+ for link in LinkExtractor().extract_links(response):
+ url = link.url
+ if url is None or url in self.links_reached:
+ continue
+ for domain in self.allowed_domains:
+ if domain in url:
+ self.links_reached.add(link.url)
+ if len(self.links_reached) < MAX_LINKS:
+ yield response.follow(link.url, self.parse)
+ else:
+ print("MAX", self.allowed_domains,
+ self.links_reached)
+ except NotSupported:
+ print("No response", response.url)
+ yield result
def get_domain(url):
@@ -31,16 +56,19 @@ def get_domain(url):
return '{}.{}'.format(ext.domain, ext.suffix)
-def create_spider(name, urls, target=None):
+def create_spider(name, urls, target, excluded_domains=None):
+ if not excluded_domains:
+ excluded_domains = []
return type(
name, (DefaultSpider, scrapy.Spider),
{"name": name, "start_urls": urls,
"allowed_domains": [get_domain(url) for url in urls],
- "target_id": target.pk}
+ "target_id": target.pk, "links_reached": set(),
+ "excluded_domains": excluded_domains}
)
-def launch_crawl(crawl_item):
+def launch_crawl(crawl_item, excluded_domains=None):
scrap_settings = settings.SCRAPPY_SETTINGS.copy()
scrap_settings.update({
'FEED_FORMAT': 'json',
@@ -49,8 +77,10 @@ def launch_crawl(crawl_item):
process = CrawlerProcess(settings=scrap_settings)
for target in crawl_item.targets.all():
process.crawl(
- create_spider("Target{}".format(target.pk),
- [target.url],
- target)
+ create_spider(
+ "Target{}".format(target.pk),
+ [target.url], target,
+ excluded_domains
+ )
)
process.start()
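
For reference, a minimal sketch of how the patched launch_crawl() might be driven from a Django shell or management command. The Crawl model import path is an assumption inferred from crawl_item.targets in this diff, not confirmed elsewhere in the repository; settings.SCRAPPY_SETTINGS must exist as the code above expects:

    # Hypothetical driver; assumes DJANGO_SETTINGS_MODULE is already set and
    # that each target exposes .pk and .url as used in the diff above.
    import django
    django.setup()

    from commcrawler.models import Crawl        # assumed model location
    from commcrawler.scrapy import launch_crawl

    crawl = Crawl.objects.get(pk=1)              # item whose .targets will be crawled
    launch_crawl(
        crawl,
        # responses whose URL contains one of these are flagged "offline"
        excluded_domains=["example-ads.com"],
    )

Note that CrawlerProcess.start() blocks until every spider has finished, and Scrapy's Twisted reactor cannot be restarted, so this is meant to run once per process.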