summaryrefslogtreecommitdiff
path: root/commcrawler/models.py
blob: e71540866ef4bff822cbb27bd4d80d66273710ed (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import datetime

from django.contrib.postgres.fields import JSONField
from django.contrib.sites.models import _simple_domain_name_validator
from django.db import models
from django.utils.translation import ugettext_lazy as _

from commorganization.models import Target


class ExludedDomains(models.Model):
    """A domain name registered as excluded.

    Each row stores one unique domain name (at most 100 characters),
    validated with Django's ``_simple_domain_name_validator``.

    NOTE(review): presumably these domains are skipped by the crawler --
    confirm against the crawling code.
    """
    domain = models.CharField(
        _("Domain name"), max_length=100,
        validators=[_simple_domain_name_validator], unique=True)

    class Meta:
        verbose_name = _("Excluded domain")
        verbose_name_plural = _("Excluded domains")

    def __str__(self):
        """Return the stored domain name."""
        # This model has no ``url`` attribute; ``domain`` is its only field.
        return self.domain


class Crawl(models.Model):
    """A named crawl over a set of targets.

    A crawl has a unique name, a creation date, optional start and end
    dates, a one-letter status (see ``STATUS``) and an optional set of
    ``Target`` objects.
    """
    # Allowed values for ``status``: (stored code, translated label).
    STATUS = (
        ('C', _("Created")),
        ('P', _("In progress")),
        ('F', _("Finished")),
    )

    name = models.CharField(
        verbose_name=_("Name"),
        max_length=200,
        unique=True,
    )
    # The callable is evaluated each time a new instance needs a default.
    created = models.DateTimeField(
        verbose_name=_("Creation date"),
        default=datetime.datetime.now,
    )
    started = models.DateTimeField(
        verbose_name=_("Start date"),
        blank=True,
        null=True,
    )
    ended = models.DateTimeField(
        verbose_name=_("End date"),
        blank=True,
        null=True,
    )
    status = models.CharField(
        verbose_name=_("Status"),
        max_length=1,
        choices=STATUS,
        default='C',
    )
    targets = models.ManyToManyField(Target, blank=True)

    class Meta:
        verbose_name = _("Crawl")
        verbose_name_plural = _("Crawls")
        ordering = ("created", "name")

    def __str__(self):
        """Return the crawl name."""
        return self.name

    @property
    def target_nb(self):
        """Number of targets attached to this crawl (one COUNT query)."""
        linked_targets = self.targets
        return linked_targets.count()


class CrawlResult(models.Model):
    """Result of one crawl for one target.

    Holds the start date, duration and status of the run, a JSON payload
    (``crawl_result``), a set of integer counters (``nb_*``), an online
    flag and an optional redirection URL.  At most one row exists per
    (crawl, target) pair (see ``Meta.unique_together``).
    """
    # Allowed values for ``status``: (stored code, translated label).
    STATUS = (
        ('P', _("In progress")),
        ('T', _("Time out")),
        ('F', _("Finished"))
    )
    # ``on_delete`` is spelled out: CASCADE is what Django < 2.0 applied
    # implicitly, and the argument is mandatory from Django 2.0 onwards.
    crawl = models.ForeignKey(Crawl, verbose_name=_("Crawl"),
                              on_delete=models.CASCADE)
    target = models.ForeignKey(Target, verbose_name=_("Target"),
                               on_delete=models.CASCADE)
    started = models.DateTimeField(
        verbose_name=_("Start date"), default=datetime.datetime.now)
    duration = models.DurationField(
        verbose_name=_("Duration"), blank=True, null=True)
    status = models.CharField(
        verbose_name=_("Status"),
        max_length=1, choices=STATUS, default='P')
    # ``default=list`` is a callable, so each row gets its own empty list.
    crawl_result = JSONField(verbose_name=_("Crawl result"), default=list)
    # Counters, all defaulting to 0; what each one counts is given by its
    # verbose name.
    nb_external_link = models.IntegerField(
        verbose_name=_("External links"), default=0)
    nb_internal_link = models.IntegerField(
        verbose_name=_("Internal links"), default=0)
    nb_images = models.IntegerField(
        verbose_name=_("Images"), default=0)
    nb_facebook = models.IntegerField(
        verbose_name=_("Facebook links"), default=0)
    nb_twitter = models.IntegerField(
        verbose_name=_("Twitter links"), default=0)
    nb_instagram = models.IntegerField(
        verbose_name=_("Instagram links"), default=0)
    nb_youtube = models.IntegerField(
        verbose_name=_("Youtube links"), default=0)
    nb_dailymotion = models.IntegerField(
        verbose_name=_("Dailymotion links"), default=0)
    nb_vimeo = models.IntegerField(
        verbose_name=_("Vimeo links"), default=0)
    nb_video = models.IntegerField(
        verbose_name=_("Internal videos"), default=0)
    nb_audio = models.IntegerField(
        verbose_name=_("Internal audios"), default=0)
    nb_internal_pdf = models.IntegerField(
        verbose_name=_("Internal PDF"), default=0)
    nb_external_pdf = models.IntegerField(
        verbose_name=_("External PDF"), default=0)
    nb_internal_office = models.IntegerField(
        verbose_name=_("Internal office documents"), default=0)
    nb_external_office = models.IntegerField(
        verbose_name=_("External office documents"), default=0)
    is_online = models.BooleanField(
        verbose_name=_("Website is online"), default=False)
    redirection = models.URLField(
        verbose_name=_("Redirection"), blank=True, null=True)

    class Meta:
        verbose_name = _("Crawl result")
        verbose_name_plural = _("Crawl results")
        unique_together = ("crawl", "target")

    def __str__(self):
        """Return ``"<crawl> - <target>"``."""
        return "{} - {}".format(self.crawl, self.target)


class CrawlLink(models.Model):
    """A single URL attached to a crawl result."""
    # ``on_delete`` is spelled out: CASCADE is what Django < 2.0 applied
    # implicitly, and the argument is mandatory from Django 2.0 onwards.
    result = models.ForeignKey(CrawlResult, verbose_name=_("Result"),
                               on_delete=models.CASCADE)
    link = models.URLField(verbose_name=_("Link"))

    class Meta:
        verbose_name = _("Crawl link")
        verbose_name_plural = _("Crawl links")

    def __str__(self):
        """Return ``"<result> - <link>"``."""
        return "{} - {}".format(self.result, self.link)


class CrawlRelation(models.Model):
    """Relation from a source target to a destination target in a crawl.

    At most one row exists per (crawl, source, destination) triple (see
    ``Meta.unique_together``); ``number`` is an integer defaulting to 1.

    NOTE(review): presumably ``number`` counts the links found from
    source to destination -- confirm against the code that fills it.
    """
    # ``on_delete`` is spelled out: CASCADE is what Django < 2.0 applied
    # implicitly, and the argument is mandatory from Django 2.0 onwards.
    crawl = models.ForeignKey(Crawl, verbose_name=_("Crawl"),
                              on_delete=models.CASCADE)
    source = models.ForeignKey(Target, verbose_name=_("Source"),
                               related_name="relation_source",
                               on_delete=models.CASCADE)
    destination = models.ForeignKey(Target, verbose_name=_("Destination"),
                                    related_name="relation_destination",
                                    on_delete=models.CASCADE)
    number = models.IntegerField(verbose_name=_("Number"), default=1)

    class Meta:
        verbose_name = _("Crawl relation")
        verbose_name_plural = _("Crawl relations")
        unique_together = ("crawl", "source", "destination")

    def __str__(self):
        """Return ``"<crawl> - <source> - <destination>"``."""
        # Three placeholders for three arguments: with only two, the
        # destination was silently left out of the label.
        return "{} - {} - {}".format(
            self.crawl, self.source, self.destination)