summaryrefslogtreecommitdiff
path: root/chimere/utils.py
diff options
context:
space:
mode:
Diffstat (limited to 'chimere/utils.py')
-rw-r--r-- chimere/utils.py 275
1 file changed, 268 insertions, 7 deletions
diff --git a/chimere/utils.py b/chimere/utils.py
index 974f8f3..790fd56 100644
--- a/chimere/utils.py
+++ b/chimere/utils.py
@@ -1,6 +1,6 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
-# Copyright (C) 2012-2013 Étienne Loks <etienne.loks_AT_peacefrogsDOTnet>
+# Copyright (C) 2012-2015 Étienne Loks <etienne.loks_AT_peacefrogsDOTnet>
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as
@@ -69,13 +69,13 @@ class ImportManager(object):
'name').all()])
def get(self):
- pass
+ raise NotImplementedError
def put(self, extra_args={}):
- pass
+ raise NotImplementedError
def create_or_update_item(self, cls, values, import_key, version=None,
- key='', pk=None):
+ key='', pk=None, category=None):
updated, created, item = False, False, None
import_key = unicode(import_key).replace(':', '^')
if not values.get('name'):
@@ -87,6 +87,7 @@ class ImportManager(object):
dct_import = {
'import_key__icontains':'%s:%s;' % (key, import_key),
'import_source':self.importer_instance.source}
+ ref_item = cls.objects.filter(**dct_import)
try:
item = None
if pk:
@@ -125,7 +126,8 @@ class ImportManager(object):
self.importer_instance.default_description
values.update({
'import_source':self.importer_instance.source})
- values['status'] = 'I'
+ values['status'] = 'I' \
+ if not self.importer_instance.automatic_update else 'A'
if not self.importer_instance.associate_marker_to_way\
and cls.__name__ == 'Route':
values['has_associated_marker'] = False
@@ -138,8 +140,11 @@ class ImportManager(object):
if import_key:
item.set_key(key, import_key)
item.categories.clear()
- for cat in self.importer_instance.categories.all():
- item.categories.add(cat)
+ if category:
+ item.categories.add(category)
+ else:
+ for cat in self.importer_instance.categories.all():
+ item.categories.add(cat)
return item, updated, created
@classmethod
@@ -866,3 +871,259 @@ class OSMManager(ImportManager):
item.save()
api.ChangesetClose()
return idx+1, None
+
+import urllib2, chardet, HTMLParser
+from BeautifulSoup import BeautifulSoup
+from lxml import etree
+
+# (regex, replacement) pairs applied in order to every imported field value
+# (see HtmlXsltManager.get).  Each replacement string is %-interpolated with
+# {'base_url': ...} before re.sub; only the third entry contains the
+# placeholder, so interpolation is a no-op for the first two:
+#   1. drop newlines and strip leading/trailing runs of spaces/newlines
+#   2. collapse repeated spaces into a single space
+#   3. make relative <a href="..."> targets absolute under base_url
+#      (the negative lookahead skips links that already start with http/https)
+RE_CLEANS = ((re.compile('(\n)*|^( )*(\n)*( )*|( )*(\n)*( )*$'), ''),
+             (re.compile(' ( )*'), ' '),
+             (re.compile(r"""<a href=["'](?!https?)(.*)["']"""),
+              '<a href="%(base_url)s\\1"'),
+             )
+
+from calendar import TimeEncoding, month_name
+
+def get_month_name(month_no, locale):
+    # Return the name of month ``month_no`` (1-12) localised for ``locale``
+    # (e.g. "fr_FR.UTF-8").  calendar.TimeEncoding (Python 2 API) temporarily
+    # switches LC_TIME and yields the locale's character encoding.
+    # NOTE(review): locale state is process-global, so this is presumably not
+    # thread-safe — confirm it only runs at import time (see MONTH_NAMES).
+    with TimeEncoding(locale) as encoding:
+        s = month_name[month_no]
+        if encoding is not None:
+            # month_name yields encoded bytes; decode to unicode
+            s = s.decode(encoding)
+        return s
+
+# locale -> list of the 12 localised month names, built once at import time.
+# Only French textual dates are supported for now (see DATE_PARSINGS).
+MONTH_NAMES = {locale:[get_month_name(no_month, locale+'.UTF-8')
+    for no_month in xrange(1, 13)] for locale in ['fr_FR']}
+
+# Unicode copies of MONTH_NAMES, used to build the date regexps below.
+# Under Python 2, .decode('utf-8') on a value that is *already* unicode
+# implicitly encodes it to ASCII first, raising UnicodeEncodeError on
+# accented names (e.g. "février"); in that case the names are kept as is.
+try:
+    UNI_MONTH_NAMES = {locale:[m.decode('utf-8') for m in MONTH_NAMES[locale]]
+        for locale in MONTH_NAMES}
+except UnicodeEncodeError:
+    UNI_MONTH_NAMES = {locale:[m for m in MONTH_NAMES[locale]]
+        for locale in MONTH_NAMES}
+
+# locale -> ordered list of compiled regexps tried by
+# HtmlXsltManager.parse_date.  Each pattern must expose day1/month1 (and
+# optionally year1) groups; range patterns additionally expose
+# day2/month2/year2 and must come *before* the single-date fallback so a
+# range is not half-matched.  'fr_FR' matches textual month names
+# ("12 juin 2014"); 'en' matches ISO-8601 dates/timestamps.  The
+# hour/minut/second groups are captured but never read by parse_date
+# (NOTE: group names spell "minut", sic — keep in sync if ever used).
+DATE_PARSINGS = {'fr_FR':[
+    re.compile(r'(?P<day1>\d{1,2}) '\
+               r'(?P<month1>'+ '|'.join(UNI_MONTH_NAMES['fr_FR']) +') '\
+               r'(?P<year1>\d{4})?[^\d]*'\
+               r'(?P<day2>\d{1,2}) '\
+               r'(?P<month2>'+ '|'.join(UNI_MONTH_NAMES['fr_FR']) +') *'\
+               r'(?P<year2>\d{4})?.*'),
+    re.compile(r'(?P<day1>\d{1,2}) '\
+               r'(?P<month1>'+ '|'.join(UNI_MONTH_NAMES['fr_FR']) +') *'\
+               r'(?P<year1>\d{4})?')
+    ],
+    'en':[
+    re.compile(r'(?P<year1>\d{4})-'\
+               r'(?P<month1>\d{2})-'\
+               r'(?P<day1>\d{2})'\
+               r'(?:T'\
+               r'(?P<hour1>\d{2})?:'\
+               r'(?P<minut1>\d{2})?:'\
+               r'(?P<second1>\d{2})'\
+               r')?.*'\
+               r'(?P<year2>\d{4})-'\
+               r'(?P<month2>\d{2})-'\
+               r'(?P<day2>\d{2})'\
+               r'(?:T'\
+               r'(?P<hour2>\d{2})?:'\
+               r'(?P<minut2>\d{2})?:'\
+               r'(?P<second2>\d{2})'\
+               r')?.*'
+               ),
+    re.compile(r'(?P<year1>\d{4})-'\
+               r'(?P<month1>\d{2})-'\
+               r'(?P<day1>\d{2})'\
+               r'(?:T'\
+               r'(?P<hour1>\d{2})?:'\
+               r'(?P<minut1>\d{2})?:'\
+               r'(?P<second1>\d{2})'\
+               r')?'
+               )
+    ],
+    }
+
def clean_field(value):
    """Return *value* stripped of leading and trailing whitespace.

    Applied to every text node extracted by the XSLT transforms before the
    value is stored in an item dictionary.
    """
    cleaned = value.strip()
    return cleaned
+
+class HtmlXsltManager(ImportManager):
+    # Import manager that scrapes an HTML page: the page is fetched,
+    # normalised with BeautifulSoup, transformed with the importer's XSLT
+    # stylesheet into a flat list of <item> elements, and each item is
+    # turned into a Marker via ImportManager.create_or_update_item.
+    # An optional second stylesheet (source_file_alt) enriches each item
+    # from its linked detail page.
+    #
+    # lxml parser class name, resolved with getattr(etree, ...) in get();
+    # overridden by XMLXsltManager below.
+    PARSER = 'HTMLParser'
+    def get(self):
+        u"""
+        Get data from the source.
+
+        Fetch the source page, apply the configured XSLT stylesheet (and,
+        when available, the alternate stylesheet to each item's linked
+        page), clean the resulting field values and feed each item dict to
+        add_dct_item.
+
+        Return a tuple with:
+         - new items;
+         - updated items;
+         - error detail on error.
+        """
+        # deferred import to avoid a circular import with models
+        from models import Marker
+        self.marker_cls = Marker
+        try:
+            main_page = urllib2.urlopen(self.importer_instance.source)
+            assert main_page.getcode() == 200
+        except (urllib2.URLError, AssertionError):
+            return (0, 0, _(u"Source page is unreachable."))
+        data = main_page.read()
+        # chardet guesses the page encoding; decode bytes to unicode
+        encoding = chardet.detect(data)
+        data = data.decode(encoding['encoding'])
+
+        # BeautifulSoup.prettify() normalises the (possibly broken) markup
+        # before handing it to lxml
+        soup = BeautifulSoup(data)
+        main_page = soup.prettify()
+        # convert it to valid XHTML
+        #doc, errors = tidy_document(main_page)
+        doc = main_page
+        dom = etree.HTML(doc, getattr(etree, self.PARSER)())
+        try:
+            xslt = etree.parse(self.importer_instance.source_file)
+            # rewind the uploaded file so it can be read again later
+            self.importer_instance.source_file.seek(0)
+            transform = etree.XSLT(xslt)
+        except (etree.XSLTParseError, etree.XMLSyntaxError, TypeError):
+            return (0, 0, _(u"The source file is not a valid XSLT file."))
+        newdom = transform(dom)
+        items = []
+        # load an alternate xslt file to apply to linked page
+        transform_child = None
+        if self.importer_instance.source_file_alt:
+            try:
+                alt_xslt = etree.parse(self.importer_instance.source_file_alt)
+                self.importer_instance.source_file_alt.seek(0)
+                transform_child = etree.XSLT(alt_xslt)
+            except (etree.XSLTParseError, etree.XMLSyntaxError, TypeError):
+                return (0, 0,
+                        _(u"The alt source file is not a valid XSLT file."))
+        # directory of the source URL, used to absolutise relative links
+        base_url = u"/".join(self.importer_instance.source.split(u'/')[:-1])
+        base_url += u"/"
+        for item in newdom.getroot():
+            # one dict per transformed <item>: child tag -> stripped text
+            c_item = {child.tag:clean_field(child.text)
+                      for child in item.getchildren() if child.text}
+            # try to have more information on the linked page
+            if transform_child and 'link' in c_item:
+                # not an absolute address
+                if not c_item['link'].startswith('http://') and \
+                   not c_item['link'].startswith('https://'):
+                    c_item['link'] = base_url + c_item['link']
+                try:
+                    child_page = urllib2.urlopen(c_item['link'])
+                    assert child_page.getcode() == 200
+                except (urllib2.URLError, AssertionError):
+                    # don't stop the export for a bad link
+                    items.append(c_item)
+                    continue
+                data = child_page.read()
+                encoding = chardet.detect(data)
+                data = data.decode(encoding['encoding'])
+                child_page = BeautifulSoup(data).prettify()
+                # NOTE(review): child pages are always parsed with
+                # etree.HTMLParser, even in XMLXsltManager — confirm this
+                # is intentional and not meant to use self.PARSER
+                child_dom = etree.HTML(child_page, etree.HTMLParser())
+                extra_keys = transform_child(child_dom).getroot()
+                if len(extra_keys):
+                    # extra fields keep their serialised XML markup
+                    c_item.update({extra.tag:etree.tostring(extra)
+                                   for extra in extra_keys[0].getchildren()})
+            items.append(c_item)
+        # change relative link to full link, simplify, unescape HTML entities
+        html_unescape = HTMLParser.HTMLParser().unescape
+        for item in items:
+            for k in item:
+                val = item[k]
+                for r, replaced in RE_CLEANS:
+                    # only the third RE_CLEANS replacement actually uses
+                    # %(base_url)s; interpolation is a no-op for the others
+                    val = re.sub(r, replaced % {'base_url':base_url}, val)
+                item[k] = html_unescape(val)
+        # category label -> Category, as configured on the importer
+        self.key_categories = self.importer_instance.get_key_category_dict()
+        self.missing_cats = set()
+        self.updated_item, self.new_item = 0, 0
+        for item in items:
+            self.add_dct_item(item)
+        msg = ''
+        if self.missing_cats:
+            msg = _(u"Names \"%s\" doesn't match existing categories. "
+              u"Modify the import to match theses names with categories.") % (
+              u'", "'.join(self.missing_cats))
+        return (self.new_item, self.updated_item, msg)
+
+    @classmethod
+    def _internal_parse_date(cls, locale, year, month, day):
+        # Build a datetime.date from raw regexp captures; return None on any
+        # unparsable component.  A missing year defaults to the current year.
+        try:
+            year = datetime.date.today().year if not year else int(year)
+        except ValueError:
+            return
+        # encode to bytes to compare against the (encoded) MONTH_NAMES
+        month = month.encode('utf-8')
+        if locale in MONTH_NAMES and month in MONTH_NAMES[locale]:
+            # textual month name -> 1-based month number
+            month = MONTH_NAMES[locale].index(month) + 1
+        else:
+            try:
+                month = int(month)
+            except ValueError:
+                return
+        try:
+            day = int(day)
+        except ValueError:
+            return
+        try:
+            return datetime.date(year, month, day)
+        except ValueError:
+            # e.g. day out of range for the month
+            return
+
+    def parse_date(self, date):
+        # Parse the free-form ``date`` string with the DATE_PARSINGS
+        # patterns and return a dict with 'start_date' (and 'end_date' for
+        # ranges) as datetime.date values — empty dict when nothing matches.
+        # Note: ``date`` is rebound to the parsed value inside the loop.
+        dct = {}
+        has_dates = False
+        for locale in DATE_PARSINGS:
+            if has_dates:
+                break
+            for r in DATE_PARSINGS[locale]:
+                m = r.search(date)
+                if not m:
+                    continue
+                values = m.groupdict()
+                date = self._internal_parse_date(locale,
+                       'year1' in values and values['year1'],
+                       values['month1'], values['day1'])
+                if not date:
+                    continue
+                dct['start_date'] = date
+                has_dates = True
+                if 'day2' not in values:
+                    break
+                date = self._internal_parse_date(locale,
+                       'year2' in values and values['year2'],
+                       values['month2'], values['day2'])
+                if date:
+                    dct['end_date'] = date
+                break
+        return dct
+
+    def add_dct_item(self, item):
+        # Convert one scraped item dict into a Marker via
+        # create_or_update_item, updating the new/updated counters.
+        # Expects 'link', 'name', 'description' and 'key' keys to be present
+        # (KeyError otherwise) — presumably guaranteed by the XSLT output;
+        # TODO(review): confirm or guard.
+        #
+        # skip items with no usable localisation at all
+        if not self.importer_instance.default_localisation and \
+           not "point" in item and not ("lat" in item and item['lat']):
+            return
+        # NOTE(review): dead assignment — cls is unconditionally rebound to
+        # self.marker_cls below
+        cls = None
+        dct = {'origin':"<a href='%s'>%s</a>" % (item['link'],
+                                self.importer_instance.origin),
+               'license':self.importer_instance.license,
+               'name':item['name']}
+        category = None
+        if 'category' in item and item['category']:
+            if item['category'] in self.key_categories:
+                category = self.key_categories[item['category']]
+            else:
+                # remembered so get() can report unmatched category names
+                self.missing_cats.add(item['category'])
+        cls = self.marker_cls
+        if 'point' in item:
+            # assumes the source supplies "x,y" in lon,lat (WKT) order —
+            # TODO(review): confirm against the stylesheets in use
+            x, y = item['point'].split(",")
+            dct['point'] = 'SRID=4326;POINT(%s %s)' % (x, y)
+        elif 'lat' in item and item['lat']:
+            # WKT wants "POINT(lon lat)"
+            dct['point'] = 'SRID=4326;POINT(%s %s)' % (item['lon'],
+                                                       item['lat'])
+        else:
+            dct['point'] = self.importer_instance.default_localisation
+        dct['description'] = item['description']
+        if 'date' in item:
+            dct.update(self.parse_date(item['date']))
+        key = item['key']
+        it, updated, created = self.create_or_update_item(cls, dct, key,
+                                                          category=category)
+        if updated:
+            self.updated_item += 1
+        if created:
+            self.new_item += 1
+
+class XMLXsltManager(HtmlXsltManager):
+    # Same pipeline as HtmlXsltManager, but the main document is parsed
+    # with lxml's stricter XMLParser (get() resolves this name with
+    # getattr(etree, self.PARSER)).
+    PARSER = 'XMLParser'