summaryrefslogtreecommitdiff
path: root/chimere/utils.py
diff options
context:
space:
mode:
Diffstat (limited to 'chimere/utils.py')
-rw-r--r-- chimere/utils.py 275
1 file changed, 268 insertions, 7 deletions
diff --git a/chimere/utils.py b/chimere/utils.py
index 974f8f3..790fd56 100644
--- a/chimere/utils.py
+++ b/chimere/utils.py
@@ -1,6 +1,6 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
-# Copyright (C) 2012-2013 Étienne Loks <etienne.loks_AT_peacefrogsDOTnet>
+# Copyright (C) 2012-2015 Étienne Loks <etienne.loks_AT_peacefrogsDOTnet>
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as
@@ -69,13 +69,13 @@ class ImportManager(object):
'name').all()])
def get(self):
- pass
+ raise NotImplementedError
def put(self, extra_args={}):
- pass
+ raise NotImplementedError
def create_or_update_item(self, cls, values, import_key, version=None,
- key='', pk=None):
+ key='', pk=None, category=None):
updated, created, item = False, False, None
import_key = unicode(import_key).replace(':', '^')
if not values.get('name'):
@@ -87,6 +87,7 @@ class ImportManager(object):
dct_import = {
'import_key__icontains':'%s:%s;' % (key, import_key),
'import_source':self.importer_instance.source}
+ ref_item = cls.objects.filter(**dct_import)
try:
item = None
if pk:
@@ -125,7 +126,8 @@ class ImportManager(object):
self.importer_instance.default_description
values.update({
'import_source':self.importer_instance.source})
- values['status'] = 'I'
+ values['status'] = 'I' \
+ if not self.importer_instance.automatic_update else 'A'
if not self.importer_instance.associate_marker_to_way\
and cls.__name__ == 'Route':
values['has_associated_marker'] = False
@@ -138,8 +140,11 @@ class ImportManager(object):
if import_key:
item.set_key(key, import_key)
item.categories.clear()
- for cat in self.importer_instance.categories.all():
- item.categories.add(cat)
+ if category:
+ item.categories.add(category)
+ else:
+ for cat in self.importer_instance.categories.all():
+ item.categories.add(cat)
return item, updated, created
@classmethod
@@ -866,3 +871,259 @@ class OSMManager(ImportManager):
item.save()
api.ChangesetClose()
return idx+1, None
+
+import urllib2, chardet, HTMLParser
+from BeautifulSoup import BeautifulSoup
+from lxml import etree
+
+# (regex, replacement) pairs applied in order to every imported field value
+# (see HtmlXsltManager.get).  Each replacement string is %-interpolated with
+# {'base_url': ...} before re.sub; only the third entry contains the
+# placeholder, so interpolation is a no-op for the first two:
+#   1. drop newlines and strip leading/trailing runs of spaces/newlines
+#   2. collapse repeated spaces into a single space
+#   3. make relative <a href="..."> targets absolute under base_url
+#      (the negative lookahead skips links that already start with http/https)
+RE_CLEANS = ((re.compile('(\n)*|^( )*(\n)*( )*|( )*(\n)*( )*$'), ''),
+             (re.compile(' ( )*'), ' '),
+             (re.compile(r"""<a href=["'](?!https?)(.*)["']"""),
+              '<a href="%(base_url)s\\1"'),
+             )
+
+from calendar import TimeEncoding, month_name
+
+def get_month_name(month_no, locale):
+    # Return the name of month ``month_no`` (1-12) localised for ``locale``
+    # (e.g. "fr_FR.UTF-8").  calendar.TimeEncoding (Python 2 API) temporarily
+    # switches LC_TIME and yields the locale's character encoding.
+    # NOTE(review): locale state is process-global, so this is presumably not
+    # thread-safe — confirm it only runs at import time (see MONTH_NAMES).
+    with TimeEncoding(locale) as encoding:
+        s = month_name[month_no]
+        if encoding is not None:
+            # month_name yields encoded bytes; decode to unicode
+            s = s.decode(encoding)
+        return s
+
+# locale -> list of the 12 localised month names, built once at import time.
+# Only French textual dates are supported for now (see DATE_PARSINGS).
+MONTH_NAMES = {locale:[get_month_name(no_month, locale+'.UTF-8')
+    for no_month in xrange(1, 13)] for locale in ['fr_FR']}
+
+# Unicode copies of MONTH_NAMES, used to build the date regexps below.
+# Under Python 2, .decode('utf-8') on a value that is *already* unicode
+# implicitly encodes it to ASCII first, raising UnicodeEncodeError on
+# accented names (e.g. "février"); in that case the names are kept as is.
+try:
+    UNI_MONTH_NAMES = {locale:[m.decode('utf-8') for m in MONTH_NAMES[locale]]
+        for locale in MONTH_NAMES}
+except UnicodeEncodeError:
+    UNI_MONTH_NAMES = {locale:[m for m in MONTH_NAMES[locale]]
+        for locale in MONTH_NAMES}
+
+# locale -> ordered list of compiled regexps tried by
+# HtmlXsltManager.parse_date.  Each pattern must expose day1/month1 (and
+# optionally year1) groups; range patterns additionally expose
+# day2/month2/year2 and must come *before* the single-date fallback so a
+# range is not half-matched.  'fr_FR' matches textual month names
+# ("12 juin 2014"); 'en' matches ISO-8601 dates/timestamps.  The
+# hour/minut/second groups are captured but never read by parse_date
+# (NOTE: group names spell "minut", sic — keep in sync if ever used).
+DATE_PARSINGS = {'fr_FR':[
+    re.compile(r'(?P<day1>\d{1,2}) '\
+               r'(?P<month1>'+ '|'.join(UNI_MONTH_NAMES['fr_FR']) +') '\
+               r'(?P<year1>\d{4})?[^\d]*'\
+               r'(?P<day2>\d{1,2}) '\
+               r'(?P<month2>'+ '|'.join(UNI_MONTH_NAMES['fr_FR']) +') *'\
+               r'(?P<year2>\d{4})?.*'),
+    re.compile(r'(?P<day1>\d{1,2}) '\
+               r'(?P<month1>'+ '|'.join(UNI_MONTH_NAMES['fr_FR']) +') *'\
+               r'(?P<year1>\d{4})?')
+    ],
+    'en':[
+    re.compile(r'(?P<year1>\d{4})-'\
+               r'(?P<month1>\d{2})-'\
+               r'(?P<day1>\d{2})'\
+               r'(?:T'\
+               r'(?P<hour1>\d{2})?:'\
+               r'(?P<minut1>\d{2})?:'\
+               r'(?P<second1>\d{2})'\
+               r')?.*'\
+               r'(?P<year2>\d{4})-'\
+               r'(?P<month2>\d{2})-'\
+               r'(?P<day2>\d{2})'\
+               r'(?:T'\
+               r'(?P<hour2>\d{2})?:'\
+               r'(?P<minut2>\d{2})?:'\
+               r'(?P<second2>\d{2})'\
+               r')?.*'
+               ),
+    re.compile(r'(?P<year1>\d{4})-'\
+               r'(?P<month1>\d{2})-'\
+               r'(?P<day1>\d{2})'\
+               r'(?:T'\
+               r'(?P<hour1>\d{2})?:'\
+               r'(?P<minut1>\d{2})?:'\
+               r'(?P<second1>\d{2})'\
+               r')?'
+               )
+    ],
+    }
+
def clean_field(value):
    """Return *value* stripped of leading and trailing whitespace.

    Applied to every text node extracted by the XSLT transforms before the
    value is stored in an item dictionary.
    """
    cleaned = value.strip()
    return cleaned
+
+class HtmlXsltManager(ImportManager):
+    # Import manager that scrapes an HTML page: the page is fetched,
+    # normalised with BeautifulSoup, transformed with the importer's XSLT
+    # stylesheet into a flat list of <item> elements, and each item is
+    # turned into a Marker via ImportManager.create_or_update_item.
+    # An optional second stylesheet (source_file_alt) enriches each item
+    # from its linked detail page.
+    #
+    # lxml parser class name, resolved with getattr(etree, ...) in get();
+    # overridden by XMLXsltManager below.
+    PARSER = 'HTMLParser'
+    def get(self):
+        u"""
+        Get data from the source.
+
+        Fetch the source page, apply the configured XSLT stylesheet (and,
+        when available, the alternate stylesheet to each item's linked
+        page), clean the resulting field values and feed each item dict to
+        add_dct_item.
+
+        Return a tuple with:
+         - new items;
+         - updated items;
+         - error detail on error.
+        """
+        # deferred import to avoid a circular import with models
+        from models import Marker
+        self.marker_cls = Marker
+        try:
+            main_page = urllib2.urlopen(self.importer_instance.source)
+            assert main_page.getcode() == 200
+        except (urllib2.URLError, AssertionError):
+            return (0, 0, _(u"Source page is unreachable."))
+        data = main_page.read()
+        # chardet guesses the page encoding; decode bytes to unicode
+        encoding = chardet.detect(data)
+        data = data.decode(encoding['encoding'])
+
+        # BeautifulSoup.prettify() normalises the (possibly broken) markup
+        # before handing it to lxml
+        soup = BeautifulSoup(data)
+        main_page = soup.prettify()
+        # convert it to valid XHTML
+        #doc, errors = tidy_document(main_page)
+        doc = main_page
+        dom = etree.HTML(doc, getattr(etree, self.PARSER)())
+        try:
+            xslt = etree.parse(self.importer_instance.source_file)
+            # rewind the uploaded file so it can be read again later
+            self.importer_instance.source_file.seek(0)
+            transform = etree.XSLT(xslt)
+        except (etree.XSLTParseError, etree.XMLSyntaxError, TypeError):
+            return (0, 0, _(u"The source file is not a valid XSLT file."))
+        newdom = transform(dom)
+        items = []
+        # load an alternate xslt file to apply to linked page
+        transform_child = None
+        if self.importer_instance.source_file_alt:
+            try:
+                alt_xslt = etree.parse(self.importer_instance.source_file_alt)
+                self.importer_instance.source_file_alt.seek(0)
+                transform_child = etree.XSLT(alt_xslt)
+            except (etree.XSLTParseError, etree.XMLSyntaxError, TypeError):
+                return (0, 0,
+                        _(u"The alt source file is not a valid XSLT file."))
+        # directory of the source URL, used to absolutise relative links
+        base_url = u"/".join(self.importer_instance.source.split(u'/')[:-1])
+        base_url += u"/"
+        for item in newdom.getroot():
+            # one dict per transformed <item>: child tag -> stripped text
+            c_item = {child.tag:clean_field(child.text)
+                      for child in item.getchildren() if child.text}
+            # try to have more information on the linked page
+            if transform_child and 'link' in c_item:
+                # not an absolute address
+                if not c_item['link'].startswith('http://') and \
+                   not c_item['link'].startswith('https://'):
+                    c_item['link'] = base_url + c_item['link']
+                try:
+                    child_page = urllib2.urlopen(c_item['link'])
+                    assert child_page.getcode() == 200
+                except (urllib2.URLError, AssertionError):
+                    # don't stop the export for a bad link
+                    items.append(c_item)
+                    continue
+                data = child_page.read()
+                encoding = chardet.detect(data)
+                data = data.decode(encoding['encoding'])
+                child_page = BeautifulSoup(data).prettify()
+                # NOTE(review): child pages are always parsed with
+                # etree.HTMLParser, even in XMLXsltManager — confirm this
+                # is intentional and not meant to use self.PARSER
+                child_dom = etree.HTML(child_page, etree.HTMLParser())
+                extra_keys = transform_child(child_dom).getroot()
+                if len(extra_keys):
+                    # extra fields keep their serialised XML markup
+                    c_item.update({extra.tag:etree.tostring(extra)
+                                   for extra in extra_keys[0].getchildren()})
+            items.append(c_item)
+        # change relative link to full link, simplify, unescape HTML entities
+        html_unescape = HTMLParser.HTMLParser().unescape
+        for item in items:
+            for k in item:
+                val = item[k]
+                for r, replaced in RE_CLEANS:
+                    # only the third RE_CLEANS replacement actually uses
+                    # %(base_url)s; interpolation is a no-op for the others
+                    val = re.sub(r, replaced % {'base_url':base_url}, val)
+                item[k] = html_unescape(val)
+        # category label -> Category, as configured on the importer
+        self.key_categories = self.importer_instance.get_key_category_dict()
+        self.missing_cats = set()
+        self.updated_item, self.new_item = 0, 0
+        for item in items:
+            self.add_dct_item(item)
+        msg = ''
+        if self.missing_cats:
+            msg = _(u"Names \"%s\" doesn't match existing categories. "
+              u"Modify the import to match theses names with categories.") % (
+              u'", "'.join(self.missing_cats))
+        return (self.new_item, self.updated_item, msg)
+
+    @classmethod
+    def _internal_parse_date(cls, locale, year, month, day):
+        # Build a datetime.date from raw regexp captures; return None on any
+        # unparsable component.  A missing year defaults to the current year.
+        try:
+            year = datetime.date.today().year if not year else int(year)
+        except ValueError:
+            return
+        # encode to bytes to compare against the (encoded) MONTH_NAMES
+        month = month.encode('utf-8')
+        if locale in MONTH_NAMES and month in MONTH_NAMES[locale]:
+            # textual month name -> 1-based month number
+            month = MONTH_NAMES[locale].index(month) + 1
+        else:
+            try:
+                month = int(month)
+            except ValueError:
+                return
+        try:
+            day = int(day)
+        except ValueError:
+            return
+        try:
+            return datetime.date(year, month, day)
+        except ValueError:
+            # e.g. day out of range for the month
+            return
+
+    def parse_date(self, date):
+        # Parse the free-form ``date`` string with the DATE_PARSINGS
+        # patterns and return a dict with 'start_date' (and 'end_date' for
+        # ranges) as datetime.date values — empty dict when nothing matches.
+        # Note: ``date`` is rebound to the parsed value inside the loop.
+        dct = {}
+        has_dates = False
+        for locale in DATE_PARSINGS:
+            if has_dates:
+                break
+            for r in DATE_PARSINGS[locale]:
+                m = r.search(date)
+                if not m:
+                    continue
+                values = m.groupdict()
+                date = self._internal_parse_date(locale,
+                       'year1' in values and values['year1'],
+                       values['month1'], values['day1'])
+                if not date:
+                    continue
+                dct['start_date'] = date
+                has_dates = True
+                if 'day2' not in values:
+                    break
+                date = self._internal_parse_date(locale,
+                       'year2' in values and values['year2'],
+                       values['month2'], values['day2'])
+                if date:
+                    dct['end_date'] = date
+                break
+        return dct
+
+    def add_dct_item(self, item):
+        # Convert one scraped item dict into a Marker via
+        # create_or_update_item, updating the new/updated counters.
+        # Expects 'link', 'name', 'description' and 'key' keys to be present
+        # (KeyError otherwise) — presumably guaranteed by the XSLT output;
+        # TODO(review): confirm or guard.
+        #
+        # skip items with no usable localisation at all
+        if not self.importer_instance.default_localisation and \
+           not "point" in item and not ("lat" in item and item['lat']):
+            return
+        # NOTE(review): dead assignment — cls is unconditionally rebound to
+        # self.marker_cls below
+        cls = None
+        dct = {'origin':"<a href='%s'>%s</a>" % (item['link'],
+                                self.importer_instance.origin),
+               'license':self.importer_instance.license,
+               'name':item['name']}
+        category = None
+        if 'category' in item and item['category']:
+            if item['category'] in self.key_categories:
+                category = self.key_categories[item['category']]
+            else:
+                # remembered so get() can report unmatched category names
+                self.missing_cats.add(item['category'])
+        cls = self.marker_cls
+        if 'point' in item:
+            # assumes the source supplies "x,y" in lon,lat (WKT) order —
+            # TODO(review): confirm against the stylesheets in use
+            x, y = item['point'].split(",")
+            dct['point'] = 'SRID=4326;POINT(%s %s)' % (x, y)
+        elif 'lat' in item and item['lat']:
+            # WKT wants "POINT(lon lat)"
+            dct['point'] = 'SRID=4326;POINT(%s %s)' % (item['lon'],
+                                                       item['lat'])
+        else:
+            dct['point'] = self.importer_instance.default_localisation
+        dct['description'] = item['description']
+        if 'date' in item:
+            dct.update(self.parse_date(item['date']))
+        key = item['key']
+        it, updated, created = self.create_or_update_item(cls, dct, key,
+                                                          category=category)
+        if updated:
+            self.updated_item += 1
+        if created:
+            self.new_item += 1
+
+class XMLXsltManager(HtmlXsltManager):
+    # Same pipeline as HtmlXsltManager, but the main document is parsed
+    # with lxml's stricter XMLParser (get() resolves this name with
+    # getattr(etree, self.PARSER)).
+    PARSER = 'XMLParser'