summary | refs | log | tree | commit | diff
path: root/chimere/utils.py
diff options
context:
space:
mode:
author: Étienne Loks <etienne.loks@proxience.com>, 2015-02-14 14:33:53 +0100
committer: Étienne Loks <etienne.loks@proxience.com>, 2015-02-14 14:33:53 +0100
commit: 06008ace42c6f972b96532e76ef69cf35b0b9eea (patch)
tree: d7860b8022fe36d85ab1fe2fcf8b69cc1d1a074c /chimere/utils.py
parent: 72f4ae70dee56b5e532a579aeae7f5cc22f49813 (diff)
parent: afbcd9cf0578f70ac25afac0199446a43d317b52 (diff)
download: Chimère-06008ace42c6f972b96532e76ef69cf35b0b9eea.tar.bz2
download: Chimère-06008ace42c6f972b96532e76ef69cf35b0b9eea.zip
Merge branch 'v2.1'
Diffstat (limited to 'chimere/utils.py')
-rw-r--r-- chimere/utils.py 207
1 files changed, 134 insertions, 73 deletions
diff --git a/chimere/utils.py b/chimere/utils.py
index 73e38ba..55fc45c 100644
--- a/chimere/utils.py
+++ b/chimere/utils.py
@@ -1,6 +1,6 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
-# Copyright (C) 2012-2013 Étienne Loks <etienne.loks_AT_peacefrogsDOTnet>
+# Copyright (C) 2012-2015 Étienne Loks <etienne.loks_AT_peacefrogsDOTnet>
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as
@@ -870,17 +870,45 @@ except UnicodeEncodeError:
for locale in MONTH_NAMES}
+ # Compiled regular expressions used to find dates in a text, grouped by
+ # locale key. Groups suffixed by "1" capture the first date of the text,
+ # groups suffixed by "2" (when the pattern has them) capture a second date.
DATE_PARSINGS = {'fr_FR':[
- re.compile(r'(?P<day1>\d{1,2}) '\
- r'(?P<month1>'+ '|'.join(UNI_MONTH_NAMES['fr_FR']) +') '\
- r'(?P<year1>\d{4})?[^\d]*'\
- r'(?P<day2>\d{1,2}) '\
- r'(?P<month2>'+ '|'.join(UNI_MONTH_NAMES['fr_FR']) +') *'\
- r'(?P<year2>\d{4})?.*'),
- re.compile(r'(?P<day1>\d{1,2}) '\
- r'(?P<month1>'+ '|'.join(UNI_MONTH_NAMES['fr_FR']) +') *'\
- r'(?P<year1>\d{4})?')
- ]
- }
+ # two dates, each one written "<day> <month name> [<4 digits year>]"
+ re.compile(r'(?P<day1>\d{1,2}) '\
+ r'(?P<month1>'+ '|'.join(UNI_MONTH_NAMES['fr_FR']) +') '\
+ r'(?P<year1>\d{4})?[^\d]*'\
+ r'(?P<day2>\d{1,2}) '\
+ r'(?P<month2>'+ '|'.join(UNI_MONTH_NAMES['fr_FR']) +') *'\
+ r'(?P<year2>\d{4})?.*'),
+ # single date written "<day> <month name> [<4 digits year>]"
+ re.compile(r'(?P<day1>\d{1,2}) '\
+ r'(?P<month1>'+ '|'.join(UNI_MONTH_NAMES['fr_FR']) +') *'\
+ r'(?P<year1>\d{4})?')
+ ],
+ 'en':[
+ # two dates written "YYYY-MM-DD", each one optionally followed by a
+ # time part "T[hh]:[mm]:ss" (both colons are required, the hour and
+ # minute digits are not)
+ re.compile(r'(?P<year1>\d{4})-'\
+ r'(?P<month1>\d{2})-'\
+ r'(?P<day1>\d{2})'\
+ r'(?:T'\
+ r'(?P<hour1>\d{2})?:'\
+ r'(?P<minut1>\d{2})?:'\
+ r'(?P<second1>\d{2})'\
+ r')?.*'\
+ r'(?P<year2>\d{4})-'\
+ r'(?P<month2>\d{2})-'\
+ r'(?P<day2>\d{2})'\
+ r'(?:T'\
+ r'(?P<hour2>\d{2})?:'\
+ r'(?P<minut2>\d{2})?:'\
+ r'(?P<second2>\d{2})'\
+ r')?.*'
+ ),
+ # single date written "YYYY-MM-DD" with the same optional time part
+ re.compile(r'(?P<year1>\d{4})-'\
+ r'(?P<month1>\d{2})-'\
+ r'(?P<day1>\d{2})'\
+ r'(?:T'\
+ r'(?P<hour1>\d{2})?:'\
+ r'(?P<minut1>\d{2})?:'\
+ r'(?P<second1>\d{2})'\
+ r')?'
+ )
+ ],
+ }
def clean_field(value):
return value.strip()
@@ -897,6 +925,7 @@ class HtmlXsltManager(ImportManager):
- error detail on error.
"""
from models import Marker
+ self.marker_cls = Marker
try:
main_page = urllib2.urlopen(self.importer_instance.source)
assert main_page.getcode() == 200
@@ -966,71 +995,103 @@ class HtmlXsltManager(ImportManager):
for r, replaced in RE_CLEANS:
val = re.sub(r, replaced % {'base_url':base_url}, val)
item[k] = html_unescape(val)
- updated_item, new_item = 0, 0
- key_categories = self.importer_instance.get_key_category_dict()
- missing_cats = set()
+ self.key_categories = self.importer_instance.get_key_category_dict()
+ self.missing_cats = set()
+ self.updated_item, self.new_item = 0, 0
for item in items:
- if not self.importer_instance.default_localisation and \
- not "point" in item and not ("lat" in item and item['lat']):
- continue
- cls = None
- dct = {'origin':"<a href='%s'>%s</a>" % (item['link'],
- self.importer_instance.origin),
- 'license':self.importer_instance.license,
- 'name':item['name']}
- category = None
- if 'category' in item and item['category']:
- if item['category'] in key_categories:
- category = key_categories[item['category']]
- else:
- missing_cats.add(item['category'])
- cls = Marker
- if 'point' in item:
- x, y = item['point'].split(",")
- dct['point'] = 'SRID=4326;POINT(%s %s)' % (x, y)
- elif 'lat' in item and item['lat']:
- dct['point'] = 'SRID=4326;POINT(%s %s)' % (item['lon'],
- item['lat'])
- else:
- dct['point'] = self.importer_instance.default_localisation
- dct['description'] = item['description']
- if 'date' in item:
- has_dates = False
- for locale in DATE_PARSINGS:
- if has_dates:
- break
- for r in DATE_PARSINGS[locale]:
- m = r.search(item['date'])
- if not m:
- continue
- has_dates = True
- values = m.groupdict()
- year1 = datetime.date.today().year if 'year1' not in values \
- else int(values['year1'])
- dct['start_date'] = datetime.date(year1,
- MONTH_NAMES[locale].index(values['month1'].encode('utf-8')) + 1,
- int(values['day1']))
- if 'day2' not in values:
- break
- year2 = datetime.date.today().year if 'year2' not in values \
- else int(values['year2'])
- dct['end_date'] = datetime.date(year2,
- MONTH_NAMES[locale].index(values['month2'].encode('utf-8')) + 1,
- int(values['day2']))
- break
- key = item['key']
- it, updated, created = self.create_or_update_item(cls, dct, key,
- category=category)
- if updated:
- updated_item += 1
- if created:
- new_item += 1
+ self.add_dct_item(item)
msg = ''
- if missing_cats:
+ if self.missing_cats:
msg = _(u"Names \"%s\" doesn't match existing categories. "
u"Modify the import to match theses names with categories.") % (
- u'", "'.join(missing_cats))
- return (new_item, updated_item, msg)
+ u'", "'.join(self.missing_cats))
+ return (self.new_item, self.updated_item, msg)
+
+    @classmethod
+    def _internal_parse_date(cls, locale, year, month, day):
+        """
+        Build a datetime.date from the raw pieces captured by a date pattern.
+
+        - locale: key used to look the month name up in MONTH_NAMES;
+        - year: year as text, a false value stands for the current year;
+        - month: month name (UTF-8 encoded before the lookup in
+          MONTH_NAMES[locale]) or month number as text;
+        - day: day number as text.
+
+        Return None when int() or datetime.date() raises a ValueError on
+        one of these pieces.
+        """
+        try:
+            if year:
+                year = int(year)
+            else:
+                year = datetime.date.today().year
+        except ValueError:
+            return None
+        month = month.encode('utf-8')
+        if locale not in MONTH_NAMES or month not in MONTH_NAMES[locale]:
+            # not a known month name for this locale: try it as a number
+            try:
+                month = int(month)
+            except ValueError:
+                return None
+        else:
+            # month names are listed in calendar order, index 0 is January
+            # NOTE(review): ordering assumed from the "+ 1" - confirm
+            month = MONTH_NAMES[locale].index(month) + 1
+        try:
+            # int() and datetime.date() both signal bad values with ValueError
+            return datetime.date(year, month, int(day))
+        except ValueError:
+            return None
+
+    def parse_date(self, date):
+        """
+        Extract a start date and, when available, an end date from a text.
+
+        Every pattern of DATE_PARSINGS is tried against the text; the first
+        pattern giving a valid start date is used and the search stops.
+
+        - date: text to search dates in.
+
+        Return a dict with an optional 'start_date' key and an optional
+        'end_date' key (datetime.date values). The dict is empty when no
+        valid date has been found.
+        """
+        dct = {}
+        for locale in DATE_PARSINGS:
+            for r in DATE_PARSINGS[locale]:
+                m = r.search(date)
+                if not m:
+                    continue
+                values = m.groupdict()
+                # the parsed value must not be stored in "date": the text is
+                # still needed to try the next patterns when this one gives
+                # an invalid date
+                start_date = self._internal_parse_date(
+                    locale, values.get('year1'),
+                    values['month1'], values['day1'])
+                if not start_date:
+                    continue
+                dct['start_date'] = start_date
+                # only the "two dates" patterns define the *2 groups
+                if 'day2' in values:
+                    end_date = self._internal_parse_date(
+                        locale, values.get('year2'),
+                        values['month2'], values['day2'])
+                    if end_date:
+                        dct['end_date'] = end_date
+                return dct
+        return dct
+
+    def add_dct_item(self, item):
+        """
+        Create or update an item from the values extracted by the import.
+
+        - item: dict of extracted values. 'link', 'name', 'description' and
+          'key' are always read; 'category', 'point', 'lat'/'lon' and 'date'
+          are used when available.
+
+        An item with no position is ignored when the importer has no default
+        localisation. Unknown categories are stored in self.missing_cats and
+        the counters self.new_item and self.updated_item are updated from
+        the result of self.create_or_update_item.
+        """
+        importer = self.importer_instance
+        has_lat = 'lat' in item and item['lat']
+        if not (importer.default_localisation or 'point' in item or has_lat):
+            # nothing to locate the item with
+            return
+        dct = {
+            'origin': "<a href='%s'>%s</a>" % (item['link'], importer.origin),
+            'license': importer.license,
+            'name': item['name'],
+        }
+        category = None
+        category_key = item.get('category')
+        if category_key:
+            if category_key not in self.key_categories:
+                self.missing_cats.add(category_key)
+            else:
+                category = self.key_categories[category_key]
+        wkt_point = 'SRID=4326;POINT(%s %s)'
+        if 'point' in item:
+            # "point" is a single "x,y" text
+            x, y = item['point'].split(",")
+            dct['point'] = wkt_point % (x, y)
+        elif has_lat:
+            dct['point'] = wkt_point % (item['lon'], item['lat'])
+        else:
+            dct['point'] = importer.default_localisation
+        dct['description'] = item['description']
+        if 'date' in item:
+            dct.update(self.parse_date(item['date']))
+        key = item['key']
+        _item, updated, created = self.create_or_update_item(
+            self.marker_cls, dct, key, category=category)
+        if updated:
+            self.updated_item += 1
+        if created:
+            self.new_item += 1
class XMLXsltManager(HtmlXsltManager):
PARSER = 'XMLParser'