From 9fb538feb0989df7bcd3538ae178165cc10cc184 Mon Sep 17 00:00:00 2001 From: Étienne Loks Date: Sat, 10 Aug 2019 12:28:09 +0200 Subject: Better management of timeout in crawl... --- commcrawler/locale/fr/LC_MESSAGES/django.po | 92 +++++++++++++----------- commcrawler/scrapy.py | 8 +-- commorganization/locale/fr/LC_MESSAGES/django.po | 2 +- 3 files changed, 55 insertions(+), 47 deletions(-) diff --git a/commcrawler/locale/fr/LC_MESSAGES/django.po b/commcrawler/locale/fr/LC_MESSAGES/django.po index 081304b..2145e0d 100644 --- a/commcrawler/locale/fr/LC_MESSAGES/django.po +++ b/commcrawler/locale/fr/LC_MESSAGES/django.po @@ -2,7 +2,7 @@ msgid "" msgstr "" "Project-Id-Version: PACKAGE VERSION\n" "Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2019-08-09 15:40+0200\n" +"POT-Creation-Date: 2019-08-09 20:06+0200\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language-Team: LANGUAGE \n" @@ -12,7 +12,7 @@ msgstr "" "Content-Transfer-Encoding: 8bit\n" "Plural-Forms: nplurals=2; plural=(n > 1);\n" -#: admin.py:58 models.py:95 models.py:134 +#: admin.py:65 models.py:98 models.py:137 msgid "Crawl result" msgstr "Résultat d'indexation" @@ -41,166 +41,174 @@ msgid "Planned" msgstr "Planifié" #: models.py:31 +msgid "Pre-crawl in progress" +msgstr "Pré-indexation en cours" + +#: models.py:32 msgid "Crawl in progress" msgstr "Indexation en cours" -#: models.py:32 +#: models.py:33 msgid "Match link in progress" msgstr "Mise en correspondance des liens en cours" -#: models.py:33 models.py:83 +#: models.py:34 models.py:86 msgid "Finished" msgstr "Fini" -#: models.py:35 +#: models.py:36 msgid "Name" msgstr "Nom" -#: models.py:37 +#: models.py:38 msgid "Creation" msgstr "Création" -#: models.py:39 +#: models.py:40 msgid "Started" msgstr "Début" -#: models.py:41 +#: models.py:42 +msgid "Pre-crawl end" +msgstr "Fin de la pré-indexation" + +#: models.py:44 msgid "Crawl end" msgstr "Fin d'indexation" -#: models.py:43 +#: models.py:46 msgid "Ended" msgstr "Fin" -#: models.py:45 models.py:93 +#: models.py:48 models.py:96 msgid "Status" msgstr "État" -#: models.py:49 +#: models.py:52 msgid "Progression" msgstr "Progression" -#: models.py:52 models.py:85 models.py:143 +#: models.py:55 models.py:88 models.py:155 msgid "Crawl" msgstr "Session d'indexation" -#: models.py:53 +#: models.py:56 msgid "Crawls" msgstr "Session d'indexation" -#: models.py:81 +#: models.py:84 msgid "In progress" msgstr "En cours" -#: models.py:82 +#: models.py:85 msgid "Time out" msgstr "Délai expiré" -#: models.py:87 +#: models.py:90 msgid "Target" msgstr "Cible" -#: models.py:89 +#: models.py:92 msgid "Start date" msgstr "Date de début" -#: models.py:91 +#: models.py:94 msgid "Duration" msgstr "Durée" -#: models.py:97 +#: models.py:100 msgid "External links" msgstr "Liens externes" -#: models.py:99 +#: models.py:102 msgid "Internal links" msgstr "Liens internes" -#: models.py:101 +#: models.py:104 msgid "Images" msgstr "Images" -#: models.py:103 +#: models.py:106 msgid "Facebook links" msgstr "Liens Facebook" -#: models.py:105 +#: models.py:108 msgid "Twitter links" msgstr "Liens Twitter" -#: models.py:107 +#: models.py:110 msgid "Instagram links" msgstr "Liens Instagram" -#: models.py:109 +#: models.py:112 msgid "Youtube links" msgstr "Liens Youtube" -#: models.py:111 +#: models.py:114 msgid "Dailymotion links" msgstr "Liens Dailymotion" -#: models.py:113 +#: models.py:116 msgid "Vimeo links" msgstr "Liens Vimeo" -#: models.py:115 +#: models.py:118 msgid "Internal videos" msgstr "Vidéos internes" -#: models.py:117 +#: models.py:120 msgid "Internal audios" msgstr "Audios internes" -#: models.py:119 +#: models.py:122 msgid "Internal PDF" msgstr "PDF internes" -#: models.py:121 +#: models.py:124 msgid "External PDF" msgstr "PDF externes" -#: models.py:123 +#: models.py:126 msgid "Internal office documents" msgstr "Document office internes" -#: models.py:125 +#: models.py:128 msgid "External office documents" msgstr "Document office externes" -#: models.py:127 +#: models.py:130 msgid "Website is online" msgstr "Site en ligne" -#: models.py:129 +#: models.py:132 msgid "Bad SSL certificate" msgstr "Mauvais certificat SSL" -#: models.py:131 +#: models.py:134 msgid "Redirection" msgstr "Redirection" -#: models.py:135 +#: models.py:138 msgid "Crawl results" msgstr "Résultats d'indexation" -#: models.py:144 +#: models.py:156 msgid "Source" msgstr "Source" -#: models.py:146 +#: models.py:158 msgid "Destination" msgstr "Destination" -#: models.py:148 +#: models.py:160 msgid "Number" msgstr "Nombre" -#: models.py:151 +#: models.py:163 msgid "Crawl relation" msgstr "Indexation - Relation" -#: models.py:152 +#: models.py:164 msgid "Crawl relations" msgstr "Indexations - Relations" diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py index bdd28c3..767827a 100644 --- a/commcrawler/scrapy.py +++ b/commcrawler/scrapy.py @@ -5,7 +5,7 @@ import requests import scrapy from scrapy.crawler import CrawlerProcess -from scrapy.exceptions import NotSupported +from scrapy.exceptions import NotSupported, CloseSpider from scrapy.linkextractors import LinkExtractor from django.conf import settings @@ -129,15 +129,15 @@ class DefaultSpider: pk=self.crawl_result.pk) result.status = "T" result.save() - self.is_timeout = True - return True + self.is_timeout = True + raise CloseSpider('timeout') def parse(self, response): result = { "url": response.url, } if self.is_timeout or self.timeout(): - return [] + raise CloseSpider('timeout') for domain in self.excluded_domains: if domain in response.url: result["is_online"] = False diff --git a/commorganization/locale/fr/LC_MESSAGES/django.po b/commorganization/locale/fr/LC_MESSAGES/django.po index e527790..c12a5b0 100644 --- a/commorganization/locale/fr/LC_MESSAGES/django.po +++ b/commorganization/locale/fr/LC_MESSAGES/django.po @@ -2,7 +2,7 @@ msgid "" msgstr "" "Project-Id-Version: PACKAGE VERSION\n" "Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2019-08-09 15:40+0200\n" +"POT-Creation-Date: 2019-08-09 20:06+0200\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language-Team: LANGUAGE \n" -- cgit v1.2.3