summaryrefslogtreecommitdiff
path: root/commcrawler/management/commands/launch_crawl.py
blob: 883c035f75ab6e53e00084f9998dbca9bf9e478c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import csv
import sys

from django.core.management.base import BaseCommand

from commcrawler.models import Crawl, ExludedDomains
from commcrawler.scrapy import launch_crawl


class Command(BaseCommand):
    """Management command that launches one pending crawl.

    The crawl to run is selected one of three ways:
    explicitly via ``--crawl-id``, automatically via ``--first-available``
    (most recently created waiting crawl), or interactively from a
    numbered menu when neither flag is given.
    """
    help = 'Launch a crawl'

    def add_arguments(self, parser):
        # Explicit crawl selection; mutually exclusive with
        # --first-available (enforced in handle()).
        parser.add_argument('--crawl-id', default=None, type=int,
                            dest="crawl_id")
        parser.add_argument(
            '--first-available', dest="first_available", action='store_true',
            help="Crawl the first available crawler"
        )
        # NOTE(review): --quiet is accepted but never consulted below —
        # confirm whether it should suppress the informational messages.
        parser.add_argument(
            '--quiet', dest='quiet', action='store_true',
            help='Quiet output')

    def handle(self, *args, **options):
        """Resolve which waiting crawl to run, then launch it.

        Writes a short message and returns early when the options are
        inconsistent or no matching crawl exists.
        """
        crawl_id = options['crawl_id']
        first_available = options['first_available']
        if crawl_id and first_available:
            sys.stdout.write('--first-available and --crawl-id are '
                             'incompatible. Exit.\n')
            return

        # Only crawls whose status marks them as waiting ("A") are eligible.
        queryset = Crawl.objects.filter(status="A")
        # exists() is cheaper than count() for a pure existence test.
        if not queryset.exists():
            sys.stdout.write('No crawl waiting. Exit.\n')
            return
        if first_available:
            # Highest pk first: launch the most recently created crawl.
            current_crawl = queryset.order_by("-pk").first()
        elif crawl_id:
            # first() returns None when no waiting crawl has this pk,
            # so one query replaces the count()-then-index pair.
            current_crawl = queryset.filter(pk=crawl_id).first()
            if current_crawl is None:
                sys.stdout.write('Crawl with this ID do not exist. Exit.\n')
                return
        else:
            # Interactive selection: re-display the menu until the user
            # types the pk of one of the waiting crawls.
            crawls = {crawl.pk: crawl for crawl in queryset}
            c_id = None
            while c_id not in crawls:
                sys.stdout.write('Which crawl to launch (type the number):\n')
                # Loop variable deliberately named pk, not crawl_id, to
                # avoid shadowing the option value above.
                for pk, crawl in crawls.items():
                    sys.stdout.write('* {} - {}\n'.format(pk, crawl))
                    sys.stdout.flush()
                try:
                    c_id = int(input("# "))
                except ValueError:
                    c_id = None  # non-numeric input: show the menu again
            current_crawl = crawls[c_id]
        # Domains to skip during the crawl ("Exluded" typo is in the
        # project model name, not fixable here).
        excluded = [domain.domain for domain in ExludedDomains.objects.all()]
        launch_crawl(current_crawl, excluded_domains=excluded)