import csv
import sys

from django.core.management.base import BaseCommand

from commcrawler.models import Crawl, ExludedDomains
from commcrawler.scrapy import launch_crawl


class Command(BaseCommand):
    """Launch a crawl selected by ID, first-available, or an interactive menu."""
    help = 'Launch a crawl'

    def add_arguments(self, parser):
        """Register command-line options for crawl selection and verbosity."""
        parser.add_argument('--crawl-id', default=None, type=int,
                            dest="crawl_id")
        parser.add_argument(
            '--first-available', dest="first_available", action='store_true',
            help="Crawl the first available crawler"
        )
        parser.add_argument(
            '--quiet', dest='quiet', action='store_true',
            help='Quiet output')

    def handle(self, *args, **options):
        """Pick a waiting crawl (status "C") and hand it to launch_crawl().

        Selection priority: --crawl-id, then --first-available (highest pk),
        otherwise prompt the user with a numbered menu on stdout.
        """
        crawl_id = options['crawl_id']
        first_available = options['first_available']
        # `is not None` (not truthiness) so an explicit id of 0 is still
        # treated as provided rather than falling through to the menu.
        if crawl_id is not None and first_available:
            sys.stdout.write('--first-available and --crawl-id are '
                             'incompatible. Exit.\n')
            return
        q = Crawl.objects.filter(status="C")
        if not q.exists():
            sys.stdout.write('No crawl waiting. Exit.\n')
            return
        if first_available:
            # "First available" = the most recently created waiting crawl.
            current_crawl = q.order_by("-pk").first()
        elif crawl_id is not None:
            current_crawl = q.filter(pk=crawl_id).first()
            if current_crawl is None:
                sys.stdout.write('Crawl with this ID does not exist. Exit.\n')
                return
        else:
            # Interactive selection: re-print the menu until a valid pk is typed.
            crawls = {c.pk: c for c in q.all()}
            c_id = None
            while c_id not in crawls:
                sys.stdout.write('Which crawl to launch (type the number):\n')
                for pk, crawl in crawls.items():
                    sys.stdout.write('* {} - {}\n'.format(pk, crawl))
                sys.stdout.flush()
                try:
                    c_id = int(input("# "))
                except ValueError:
                    c_id = None  # non-numeric input: loop and ask again
            current_crawl = crawls[c_id]
        # BUG FIX: the original called .split() directly on ExludedDomains model
        # instances (AttributeError) and indexed [1], which raised IndexError
        # for values stored without a "scheme://" prefix. str() relies on the
        # model's __str__ returning the domain/URL — TODO confirm against the
        # ExludedDomains model definition. [-1] is the part after "://" when a
        # scheme is present, and the unchanged value otherwise.
        excluded = [str(domain).split("://")[-1]
                    for domain in ExludedDomains.objects.all()]
        launch_crawl(current_crawl, excluded_domains=excluded)