author | Étienne Loks <etienne.loks@proxience.com> | 2015-02-14 21:42:27 +0100 |
---|---|---|
committer | Étienne Loks <etienne.loks@proxience.com> | 2015-02-14 21:42:27 +0100 |
commit | 660ce29ac2cf781c8e9607a837b9832e1692e156 (patch) | |
tree | dfc74f2056cfc9a1ce12eb17adf8bc3771510495 /chimere/utils.py | |
parent | 3daa945c334f719e7edb086021bfcc93880eb7f7 (diff) | |
parent | 57bbba43a75a72eeacd44f1ce5fcd6f203dc321c (diff) | |
Merge branch 'master' into nef
Conflicts:
chimere/admin.py
chimere/forms.py
chimere/models.py
chimere/settings.sample.py
chimere/static/chimere/js/jquery.chimere-ol.js
chimere/templates/chimere/base.html
chimere/templates/chimere/blocks/head_chimere.html
chimere/templates/chimere/blocks/map.html
chimere/templates/chimere/main_map.html
chimere/templatetags/chimere_tags.py
chimere/tests.py
chimere/urls.py
chimere/views.py
chimere/widgets.py
Diffstat (limited to 'chimere/utils.py')
-rw-r--r-- | chimere/utils.py | 275 |
1 file changed, 268 insertions(+), 7 deletions(-)
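For context on the patch below: the new `HtmlXsltManager` fetches an HTML page, applies a user-supplied XSLT stylesheet to it with lxml, and turns each resulting `<item>` element into a dict keyed by tag name (name, link, category, point, lat/lon, description, date, key). The following is a minimal, self-contained sketch of that lxml pipeline, not code from the commit; the HTML snippet and the stylesheet are invented examples of what an importer's source page and XSLT file might look like.

```python
from lxml import etree

# Invented example of a source page an importer could point at.
HTML = """
<html><body>
  <div class="event">
    <h2>Concert at the park</h2>
    <a href="/events/42">details</a>
  </div>
</body></html>
"""

# Hypothetical stylesheet: one <item> per div.event, with <name> and <link>
# children -- the kind of flat tags HtmlXsltManager.get() iterates over.
XSLT = """
<xsl:stylesheet version="1.0"
    xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
  <xsl:template match="/">
    <items>
      <xsl:for-each select="//div[@class='event']">
        <item>
          <name><xsl:value-of select="h2"/></name>
          <link><xsl:value-of select="a/@href"/></link>
        </item>
      </xsl:for-each>
    </items>
  </xsl:template>
</xsl:stylesheet>
"""

dom = etree.HTML(HTML, etree.HTMLParser())   # parse the scraped page
transform = etree.XSLT(etree.XML(XSLT))      # compile the stylesheet
newdom = transform(dom)                      # -> <items><item>...</item></items>

items = []
for item in newdom.getroot():
    # same shape as c_item in HtmlXsltManager.get(): {tag: text}
    items.append({child.tag: child.text.strip()
                  for child in item if child.text})

print(items)  # -> [{'name': 'Concert at the park', 'link': '/events/42'}]
```

The real importer additionally fetches linked detail pages with an alternate stylesheet, normalises whitespace and relative links via RE_CLEANS, parses free-form dates, and feeds each dict to create_or_update_item().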
diff --git a/chimere/utils.py b/chimere/utils.py
index 974f8f3..790fd56 100644
--- a/chimere/utils.py
+++ b/chimere/utils.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
-# Copyright (C) 2012-2013 Étienne Loks <etienne.loks_AT_peacefrogsDOTnet>
+# Copyright (C) 2012-2015 Étienne Loks <etienne.loks_AT_peacefrogsDOTnet>
 
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as
@@ -69,13 +69,13 @@ class ImportManager(object):
             'name').all()])
 
     def get(self):
-        pass
+        raise NotImplementedError
 
     def put(self, extra_args={}):
-        pass
+        raise NotImplementedError
 
     def create_or_update_item(self, cls, values, import_key, version=None,
-                              key='', pk=None):
+                              key='', pk=None, category=None):
         updated, created, item = False, False, None
         import_key = unicode(import_key).replace(':', '^')
         if not values.get('name'):
@@ -87,6 +87,7 @@ class ImportManager(object):
         dct_import = {
             'import_key__icontains':'%s:%s;' % (key, import_key),
             'import_source':self.importer_instance.source}
+        ref_item = cls.objects.filter(**dct_import)
         try:
             item = None
             if pk:
@@ -125,7 +126,8 @@
                 self.importer_instance.default_description
         values.update({
             'import_source':self.importer_instance.source})
-        values['status'] = 'I'
+        values['status'] = 'I' \
+            if not self.importer_instance.automatic_update else 'A'
         if not self.importer_instance.associate_marker_to_way\
            and cls.__name__ == 'Route':
             values['has_associated_marker'] = False
@@ -138,8 +140,11 @@
         if import_key:
             item.set_key(key, import_key)
         item.categories.clear()
-        for cat in self.importer_instance.categories.all():
-            item.categories.add(cat)
+        if category:
+            item.categories.add(category)
+        else:
+            for cat in self.importer_instance.categories.all():
+                item.categories.add(cat)
         return item, updated, created
 
     @classmethod
@@ -866,3 +871,259 @@ class OSMManager(ImportManager):
             item.save()
         api.ChangesetClose()
         return idx+1, None
+
+import urllib2, chardet, HTMLParser
+from BeautifulSoup import BeautifulSoup
+from lxml import etree
+
+RE_CLEANS = ((re.compile('(\n)*|^( )*(\n)*( )*|( )*(\n)*( )*$'), ''),
+             (re.compile(' ( )*'), ' '),
+             (re.compile(r"""<a href=["'](?!https?)(.*)["']"""),
+              '<a href="%(base_url)s\\1"'),
+             )
+
+from calendar import TimeEncoding, month_name
+
+def get_month_name(month_no, locale):
+    with TimeEncoding(locale) as encoding:
+        s = month_name[month_no]
+        if encoding is not None:
+            s = s.decode(encoding)
+        return s
+
+MONTH_NAMES = {locale:[get_month_name(no_month, locale+'.UTF-8')
+                       for no_month in xrange(1, 13)] for locale in ['fr_FR']}
+
+try:
+    UNI_MONTH_NAMES = {locale:[m.decode('utf-8') for m in MONTH_NAMES[locale]]
+                       for locale in MONTH_NAMES}
+except UnicodeEncodeError:
+    UNI_MONTH_NAMES = {locale:[m for m in MONTH_NAMES[locale]]
+                       for locale in MONTH_NAMES}
+
+DATE_PARSINGS = {'fr_FR':[
+    re.compile(r'(?P<day1>\d{1,2}) '\
+               r'(?P<month1>'+ '|'.join(UNI_MONTH_NAMES['fr_FR']) +') '\
+               r'(?P<year1>\d{4})?[^\d]*'\
+               r'(?P<day2>\d{1,2}) '\
+               r'(?P<month2>'+ '|'.join(UNI_MONTH_NAMES['fr_FR']) +') *'\
+               r'(?P<year2>\d{4})?.*'),
+    re.compile(r'(?P<day1>\d{1,2}) '\
+               r'(?P<month1>'+ '|'.join(UNI_MONTH_NAMES['fr_FR']) +') *'\
+               r'(?P<year1>\d{4})?')
+    ],
+    'en':[
+    re.compile(r'(?P<year1>\d{4})-'\
+               r'(?P<month1>\d{2})-'\
+               r'(?P<day1>\d{2})'\
+               r'(?:T'\
+               r'(?P<hour1>\d{2})?:'\
+               r'(?P<minut1>\d{2})?:'\
+               r'(?P<second1>\d{2})'\
+               r')?.*'\
+               r'(?P<year2>\d{4})-'\
+               r'(?P<month2>\d{2})-'\
+               r'(?P<day2>\d{2})'\
+               r'(?:T'\
+               r'(?P<hour2>\d{2})?:'\
+               r'(?P<minut2>\d{2})?:'\
+               r'(?P<second2>\d{2})'\
+               r')?.*'
+               ),
+    re.compile(r'(?P<year1>\d{4})-'\
+               r'(?P<month1>\d{2})-'\
+               r'(?P<day1>\d{2})'\
+               r'(?:T'\
+               r'(?P<hour1>\d{2})?:'\
+               r'(?P<minut1>\d{2})?:'\
+               r'(?P<second1>\d{2})'\
+               r')?'
+               )
+    ],
+    }
+
+def clean_field(value):
+    return value.strip()
+
+class HtmlXsltManager(ImportManager):
+    PARSER = 'HTMLParser'
+    def get(self):
+        u"""
+        Get data from the source
+
+        Return a tuple with:
+         - new items;
+         - updated items;
+         - error detail on error.
+        """
+        from models import Marker
+        self.marker_cls = Marker
+        try:
+            main_page = urllib2.urlopen(self.importer_instance.source)
+            assert main_page.getcode() == 200
+        except (urllib2.URLError, AssertionError):
+            return (0, 0, _(u"Source page is unreachable."))
+        data = main_page.read()
+        encoding = chardet.detect(data)
+        data = data.decode(encoding['encoding'])
+
+        soup = BeautifulSoup(data)
+        main_page = soup.prettify()
+        # convert it to valid XHTML
+        #doc, errors = tidy_document(main_page)
+        doc = main_page
+        dom = etree.HTML(doc, getattr(etree, self.PARSER)())
+        try:
+            xslt = etree.parse(self.importer_instance.source_file)
+            self.importer_instance.source_file.seek(0)
+            transform = etree.XSLT(xslt)
+        except (etree.XSLTParseError, etree.XMLSyntaxError, TypeError):
+            return (0, 0, _(u"The source file is not a valid XSLT file."))
+        newdom = transform(dom)
+        items = []
+        # load an alternate xslt file to apply to linked page
+        transform_child = None
+        if self.importer_instance.source_file_alt:
+            try:
+                alt_xslt = etree.parse(self.importer_instance.source_file_alt)
+                self.importer_instance.source_file_alt.seek(0)
+                transform_child = etree.XSLT(alt_xslt)
+            except (etree.XSLTParseError, etree.XMLSyntaxError, TypeError):
+                return (0, 0,
+                        _(u"The alt source file is not a valid XSLT file."))
+        base_url = u"/".join(self.importer_instance.source.split(u'/')[:-1])
+        base_url += u"/"
+        for item in newdom.getroot():
+            c_item = {child.tag:clean_field(child.text)
+                      for child in item.getchildren() if child.text}
+            # try to have more information on the linked page
+            if transform_child and 'link' in c_item:
+                # not an absolute address
+                if not c_item['link'].startswith('http://') and \
+                   not c_item['link'].startswith('https://'):
+                    c_item['link'] = base_url + c_item['link']
+                try:
+                    child_page = urllib2.urlopen(c_item['link'])
+                    assert child_page.getcode() == 200
+                except (urllib2.URLError, AssertionError):
+                    # don't stop the export for a bad link
+                    items.append(c_item)
+                    continue
+                data = child_page.read()
+                encoding = chardet.detect(data)
+                data = data.decode(encoding['encoding'])
+                child_page = BeautifulSoup(data).prettify()
+                child_dom = etree.HTML(child_page, etree.HTMLParser())
+                extra_keys = transform_child(child_dom).getroot()
+                if len(extra_keys):
+                    c_item.update({extra.tag:etree.tostring(extra)
+                                   for extra in extra_keys[0].getchildren()})
+            items.append(c_item)
+        # change relative link to full link, simplify, unescape HTML entities
+        html_unescape = HTMLParser.HTMLParser().unescape
+        for item in items:
+            for k in item:
+                val = item[k]
+                for r, replaced in RE_CLEANS:
+                    val = re.sub(r, replaced % {'base_url':base_url}, val)
+                item[k] = html_unescape(val)
+        self.key_categories = self.importer_instance.get_key_category_dict()
+        self.missing_cats = set()
+        self.updated_item, self.new_item = 0, 0
+        for item in items:
+            self.add_dct_item(item)
+        msg = ''
+        if self.missing_cats:
+            msg = _(u"Names \"%s\" doesn't match existing categories. "
+                    u"Modify the import to match theses names with categories.") % (
+                    u'", "'.join(self.missing_cats))
+        return (self.new_item, self.updated_item, msg)
+
+    @classmethod
+    def _internal_parse_date(cls, locale, year, month, day):
+        try:
+            year = datetime.date.today().year if not year else int(year)
+        except ValueError:
+            return
+        month = month.encode('utf-8')
+        if locale in MONTH_NAMES and month in MONTH_NAMES[locale]:
+            month = MONTH_NAMES[locale].index(month) + 1
+        else:
+            try:
+                month = int(month)
+            except ValueError:
+                return
+        try:
+            day = int(day)
+        except ValueError:
+            return
+        try:
+            return datetime.date(year, month, day)
+        except ValueError:
+            return
+
+    def parse_date(self, date):
+        dct = {}
+        has_dates = False
+        for locale in DATE_PARSINGS:
+            if has_dates:
+                break
+            for r in DATE_PARSINGS[locale]:
+                m = r.search(date)
+                if not m:
+                    continue
+                values = m.groupdict()
+                date = self._internal_parse_date(locale,
+                    'year1' in values and values['year1'],
+                    values['month1'], values['day1'])
+                if not date:
+                    continue
+                dct['start_date'] = date
+                has_dates = True
+                if 'day2' not in values:
+                    break
+                date = self._internal_parse_date(locale,
+                    'year2' in values and values['year2'],
+                    values['month2'], values['day2'])
+                if date:
+                    dct['end_date'] = date
+                break
+        return dct
+
+    def add_dct_item(self, item):
+        if not self.importer_instance.default_localisation and \
+           not "point" in item and not ("lat" in item and item['lat']):
+            return
+        cls = None
+        dct = {'origin':"<a href='%s'>%s</a>" % (item['link'],
+                                        self.importer_instance.origin),
+               'license':self.importer_instance.license,
+               'name':item['name']}
+        category = None
+        if 'category' in item and item['category']:
+            if item['category'] in self.key_categories:
+                category = self.key_categories[item['category']]
+            else:
+                self.missing_cats.add(item['category'])
+        cls = self.marker_cls
+        if 'point' in item:
+            x, y = item['point'].split(",")
+            dct['point'] = 'SRID=4326;POINT(%s %s)' % (x, y)
+        elif 'lat' in item and item['lat']:
+            dct['point'] = 'SRID=4326;POINT(%s %s)' % (item['lon'],
+                                                       item['lat'])
+        else:
+            dct['point'] = self.importer_instance.default_localisation
+        dct['description'] = item['description']
+        if 'date' in item:
+            dct.update(self.parse_date(item['date']))
+        key = item['key']
+        it, updated, created = self.create_or_update_item(cls, dct, key,
+                                                          category=category)
+        if updated:
+            self.updated_item += 1
+        if created:
+            self.new_item += 1
+
+class XMLXsltManager(HtmlXsltManager):
+    PARSER = 'XMLParser'