author      Étienne Loks <etienne.loks@proxience.com>    2015-02-14 14:33:53 +0100
committer   Étienne Loks <etienne.loks@proxience.com>    2015-02-14 14:33:53 +0100
commit      06008ace42c6f972b96532e76ef69cf35b0b9eea (patch)
tree        d7860b8022fe36d85ab1fe2fcf8b69cc1d1a074c /chimere/utils.py
parent      72f4ae70dee56b5e532a579aeae7f5cc22f49813 (diff)
parent      afbcd9cf0578f70ac25afac0199446a43d317b52 (diff)
Merge branch 'v2.1'
Diffstat (limited to 'chimere/utils.py')
-rw-r--r--   chimere/utils.py   207
1 file changed, 134 insertions, 73 deletions
diff --git a/chimere/utils.py b/chimere/utils.py
index 73e38ba..55fc45c 100644
--- a/chimere/utils.py
+++ b/chimere/utils.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
-# Copyright (C) 2012-2013 Étienne Loks <etienne.loks_AT_peacefrogsDOTnet>
+# Copyright (C) 2012-2015 Étienne Loks <etienne.loks_AT_peacefrogsDOTnet>
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as
@@ -870,17 +870,45 @@ except UnicodeEncodeError:
                    for locale in MONTH_NAMES}
 DATE_PARSINGS = {'fr_FR':[
-    re.compile(r'(?P<day1>\d{1,2}) '\
-        r'(?P<month1>'+ '|'.join(UNI_MONTH_NAMES['fr_FR']) +') '\
-        r'(?P<year1>\d{4})?[^\d]*'\
-        r'(?P<day2>\d{1,2}) '\
-        r'(?P<month2>'+ '|'.join(UNI_MONTH_NAMES['fr_FR']) +') *'\
-        r'(?P<year2>\d{4})?.*'),
-    re.compile(r'(?P<day1>\d{1,2}) '\
-        r'(?P<month1>'+ '|'.join(UNI_MONTH_NAMES['fr_FR']) +') *'\
-        r'(?P<year1>\d{4})?')
-    ]
-  }
+        re.compile(r'(?P<day1>\d{1,2}) '\
+            r'(?P<month1>'+ '|'.join(UNI_MONTH_NAMES['fr_FR']) +') '\
+            r'(?P<year1>\d{4})?[^\d]*'\
+            r'(?P<day2>\d{1,2}) '\
+            r'(?P<month2>'+ '|'.join(UNI_MONTH_NAMES['fr_FR']) +') *'\
+            r'(?P<year2>\d{4})?.*'),
+        re.compile(r'(?P<day1>\d{1,2}) '\
+            r'(?P<month1>'+ '|'.join(UNI_MONTH_NAMES['fr_FR']) +') *'\
+            r'(?P<year1>\d{4})?')
+        ],
+    'en':[
+        re.compile(r'(?P<year1>\d{4})-'\
+            r'(?P<month1>\d{2})-'\
+            r'(?P<day1>\d{2})'\
+            r'(?:T'\
+            r'(?P<hour1>\d{2})?:'\
+            r'(?P<minut1>\d{2})?:'\
+            r'(?P<second1>\d{2})'\
+            r')?.*'\
+            r'(?P<year2>\d{4})-'\
+            r'(?P<month2>\d{2})-'\
+            r'(?P<day2>\d{2})'\
+            r'(?:T'\
+            r'(?P<hour2>\d{2})?:'\
+            r'(?P<minut2>\d{2})?:'\
+            r'(?P<second2>\d{2})'\
+            r')?.*'
+        ),
+        re.compile(r'(?P<year1>\d{4})-'\
+            r'(?P<month1>\d{2})-'\
+            r'(?P<day1>\d{2})'\
+            r'(?:T'\
+            r'(?P<hour1>\d{2})?:'\
+            r'(?P<minut1>\d{2})?:'\
+            r'(?P<second1>\d{2})'\
+            r')?'
+        )
+        ],
+    }
 
 
 def clean_field(value):
     return value.strip()
@@ -897,6 +925,7 @@ class HtmlXsltManager(ImportManager):
         - error detail on error.
         """
         from models import Marker
+        self.marker_cls = Marker
         try:
             main_page = urllib2.urlopen(self.importer_instance.source)
             assert main_page.getcode() == 200
@@ -966,71 +995,103 @@ class HtmlXsltManager(ImportManager):
                 for r, replaced in RE_CLEANS:
                     val = re.sub(r, replaced % {'base_url':base_url}, val)
                 item[k] = html_unescape(val)
-        updated_item, new_item = 0, 0
-        key_categories = self.importer_instance.get_key_category_dict()
-        missing_cats = set()
+        self.key_categories = self.importer_instance.get_key_category_dict()
+        self.missing_cats = set()
+        self.updated_item, self.new_item = 0, 0
         for item in items:
-            if not self.importer_instance.default_localisation and \
-               not "point" in item and not ("lat" in item and item['lat']):
-                continue
-            cls = None
-            dct = {'origin':"<a href='%s'>%s</a>" % (item['link'],
-                                        self.importer_instance.origin),
-                   'license':self.importer_instance.license,
-                   'name':item['name']}
-            category = None
-            if 'category' in item and item['category']:
-                if item['category'] in key_categories:
-                    category = key_categories[item['category']]
-                else:
-                    missing_cats.add(item['category'])
-            cls = Marker
-            if 'point' in item:
-                x, y = item['point'].split(",")
-                dct['point'] = 'SRID=4326;POINT(%s %s)' % (x, y)
-            elif 'lat' in item and item['lat']:
-                dct['point'] = 'SRID=4326;POINT(%s %s)' % (item['lon'],
-                                                           item['lat'])
-            else:
-                dct['point'] = self.importer_instance.default_localisation
-            dct['description'] = item['description']
-            if 'date' in item:
-                has_dates = False
-                for locale in DATE_PARSINGS:
-                    if has_dates:
-                        break
-                    for r in DATE_PARSINGS[locale]:
-                        m = r.search(item['date'])
-                        if not m:
-                            continue
-                        has_dates = True
-                        values = m.groupdict()
-                        year1 = datetime.date.today().year if 'year1' not in values \
-                                else int(values['year1'])
-                        dct['start_date'] = datetime.date(year1,
-                            MONTH_NAMES[locale].index(values['month1'].encode('utf-8')) + 1,
-                            int(values['day1']))
-                        if 'day2' not in values:
-                            break
-                        year2 = datetime.date.today().year if 'year2' not in values \
-                                else int(values['year2'])
-                        dct['end_date'] = datetime.date(year2,
-                            MONTH_NAMES[locale].index(values['month2'].encode('utf-8')) + 1,
-                            int(values['day2']))
-                        break
-            key = item['key']
-            it, updated, created = self.create_or_update_item(cls, dct, key,
-                                                      category=category)
-            if updated:
-                updated_item += 1
-            if created:
-                new_item += 1
+            self.add_dct_item(item)
         msg = ''
-        if missing_cats:
+        if self.missing_cats:
             msg = _(u"Names \"%s\" doesn't match existing categories. "
                 u"Modify the import to match theses names with categories.") % (
-                u'", "'.join(missing_cats))
-        return (new_item, updated_item, msg)
+                u'", "'.join(self.missing_cats))
+        return (self.new_item, self.updated_item, msg)
+
+    @classmethod
+    def _internal_parse_date(cls, locale, year, month, day):
+        try:
+            year = datetime.date.today().year if not year else int(year)
+        except ValueError:
+            return
+        month = month.encode('utf-8')
+        if locale in MONTH_NAMES and month in MONTH_NAMES[locale]:
+            month = MONTH_NAMES[locale].index(month) + 1
+        else:
+            try:
+                month = int(month)
+            except ValueError:
+                return
+        try:
+            day = int(day)
+        except ValueError:
+            return
+        try:
+            return datetime.date(year, month, day)
+        except ValueError:
+            return
+
+    def parse_date(self, date):
+        dct = {}
+        has_dates = False
+        for locale in DATE_PARSINGS:
+            if has_dates:
+                break
+            for r in DATE_PARSINGS[locale]:
+                m = r.search(date)
+                if not m:
+                    continue
+                values = m.groupdict()
+                date = self._internal_parse_date(locale,
+                    'year1' in values and values['year1'],
+                    values['month1'], values['day1'])
+                if not date:
+                    continue
+                dct['start_date'] = date
+                has_dates = True
+                if 'day2' not in values:
+                    break
+                date = self._internal_parse_date(locale,
+                    'year2' in values and values['year2'],
+                    values['month2'], values['day2'])
+                if date:
+                    dct['end_date'] = date
+                break
+        return dct
+
+    def add_dct_item(self, item):
+        if not self.importer_instance.default_localisation and \
+           not "point" in item and not ("lat" in item and item['lat']):
+            return
+        cls = None
+        dct = {'origin':"<a href='%s'>%s</a>" % (item['link'],
+                                    self.importer_instance.origin),
+               'license':self.importer_instance.license,
+               'name':item['name']}
+        category = None
+        if 'category' in item and item['category']:
+            if item['category'] in self.key_categories:
+                category = self.key_categories[item['category']]
+            else:
+                self.missing_cats.add(item['category'])
+        cls = self.marker_cls
+        if 'point' in item:
+            x, y = item['point'].split(",")
+            dct['point'] = 'SRID=4326;POINT(%s %s)' % (x, y)
+        elif 'lat' in item and item['lat']:
+            dct['point'] = 'SRID=4326;POINT(%s %s)' % (item['lon'],
+                                                       item['lat'])
+        else:
+            dct['point'] = self.importer_instance.default_localisation
+        dct['description'] = item['description']
+        if 'date' in item:
+            dct.update(self.parse_date(item['date']))
+        key = item['key']
+        it, updated, created = self.create_or_update_item(cls, dct, key,
                                                  category=category)
+        if updated:
+            self.updated_item += 1
+        if created:
+            self.new_item += 1
 
 class XMLXsltManager(HtmlXsltManager):
     PARSER = 'XMLParser'