diff options
Diffstat (limited to 'commcrawler/scrapy.py')
-rw-r--r-- | commcrawler/scrapy.py | 26 |
1 files changed, 19 insertions, 7 deletions
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index 47a0521..7f7bea5 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -1,5 +1,6 @@
 import datetime
 import tldextract
+from urllib.parse import urldefrag
 
 import scrapy
 from scrapy.crawler import CrawlerProcess
@@ -13,9 +14,6 @@ from django.utils import timezone
 from . import models
 
 """
-nb_external_link
-nb_internal_link
-nb_images
 nb_facebook
 nb_twitter
 nb_instagram
@@ -33,7 +31,13 @@ redirection
 CrawlLink
 """
 
-MAX_LINKS = 500
+
+def clean_url(url):
+    url, __ = urldefrag(url)  # remove anchors
+    return url
+
+
+MAX_LINKS = None  # if None no max
 TIMEOUT = datetime.timedelta(minutes=settings.CRAWL_TIMEOUT)
 
 
@@ -112,17 +116,24 @@ class DefaultSpider:
         try:
             self._parse_image(response, result)
             for link in LinkExtractor().extract_links(response):
-                url = link.url
+                url = clean_url(link.url)
                 if url is None or url in self.links_reached:
                     continue
+                is_internal = False
                 for domain in self.allowed_domains:
                     if domain in url:
+                        is_internal = True
                         self.links_reached.add(link.url)
-                        if len(self.links_reached) < MAX_LINKS:
+                        if not MAX_LINKS or \
+                                len(self.links_reached) < MAX_LINKS:
                             yield response.follow(link.url, self.parse)
                         else:
                             print("MAX", self.allowed_domains,
                                   self.links_reached)
+                if not is_internal:
+                    if "external_link" not in result:
+                        result["external_link"] = []
+                    result["external_link"].append(url)
         except NotSupported:
             print("No response", response.url)
         yield result
@@ -137,7 +148,7 @@ class DbPipeline:
         "facebook", "twitter", "instagram", "youtube",
         "dailymotion", "vimeo", "video", "audio",
         "internal_pdf", "external_pdf", "internal_office",
-        "external_office", ]
+        "external_office"]
 
     def _get_result_pk(self, spider):
         """
@@ -179,6 +190,7 @@ class DbPipeline:
         if url in crawl_result["urls"]:
             return
         crawl_result["urls"].append(url)
+        result.nb_internal_links = len(crawl_result["urls"]) - 1
         for k, value in item.items():
             if k == "is_online":
                 if result_created:  # only update on the first link