diff options
Diffstat (limited to 'chimere/utils.py')
-rw-r--r-- | chimere/utils.py | 516 |
1 files changed, 300 insertions, 216 deletions
diff --git a/chimere/utils.py b/chimere/utils.py index 790fd56..8066255 100644 --- a/chimere/utils.py +++ b/chimere/utils.py @@ -22,14 +22,14 @@ Utilitaries """ import csv +import collections import datetime import feedparser -import simplejson as json +import json import os import re import StringIO import tempfile -from urllib import urlencode import urllib2 import unicodedata import zipfile @@ -47,26 +47,30 @@ from django.utils.translation import ugettext_lazy as _ from chimere import get_version from external_utils import OsmApi + def unicode_normalize(string): if type(string) == str: string = unicode(string.decode('utf-8')) return ''.join( (c for c in unicodedata.normalize('NFD', string) - if unicodedata.category(c) != 'Mn')) + if unicodedata.category(c) != 'Mn')) + class ImportManager(object): u""" Generic class for specific importers """ default_source = None + def __init__(self, importer_instance): self.importer_instance = importer_instance if self.importer_instance.default_name: self.default_name = self.importer_instance.default_name else: - self.default_name = " - ".join([cat.name + self.default_name = " - ".join([ + cat.name for cat in self.importer_instance.categories.order_by( - 'name').all()]) + 'name').all()]) def get(self): raise NotImplementedError @@ -85,8 +89,8 @@ class ImportManager(object): item = None if import_key or pk: dct_import = { - 'import_key__icontains':'%s:%s;' % (key, import_key), - 'import_source':self.importer_instance.source} + 'import_key__icontains': '%s:%s;' % (key, import_key), + 'import_source': self.importer_instance.source} ref_item = cls.objects.filter(**dct_import) try: item = None @@ -102,7 +106,7 @@ class ImportManager(object): return ref_item, None, None if not self.importer_instance.overwrite \ and ref_item.modified_since_import: - dct_import['ref_item'] = ref_item + return ref_item, None, None else: item = ref_item for k in values: @@ -123,16 +127,17 @@ class ImportManager(object): if not self.importer_instance.get_description and \ self.importer_instance.default_description: values['description'] = \ - self.importer_instance.default_description + self.importer_instance.default_description values.update({ - 'import_source':self.importer_instance.source}) - values['status'] = 'I' \ - if not self.importer_instance.automatic_update else 'A' + 'import_source': self.importer_instance.source}) + values['status'] = self.importer_instance.default_status if not self.importer_instance.associate_marker_to_way\ - and cls.__name__ == 'Route': + and cls.__name__ == 'Route': values['has_associated_marker'] = False try: item = cls.objects.create(**values) + item.modified_since_import = False + item.save() except TypeError: # error on data source return None, False, False @@ -159,8 +164,8 @@ class ImportManager(object): current_file_name = None for name in namelist: if name.endswith(suffix) \ - or name.endswith(suffix.lower()) \ - or name.endswith(suffix.upper()): + or name.endswith(suffix.lower()) \ + or name.endswith(suffix.upper()): current_file_name = name filenames.append(current_file_name) files = [] @@ -181,7 +186,7 @@ class ImportManager(object): if not hasattr(source, 'read'): if not source: source = self.importer_instance.source \ - if self.importer_instance.source else self.default_source + if self.importer_instance.source else self.default_source try: url = source if extra_url: @@ -208,6 +213,7 @@ class ImportManager(object): source = files[0] if len(suffixes) == 1 else files return (source, None) + class KMLManager(ImportManager): u""" KML importer @@ -216,6 +222,7 @@ class KMLManager(ImportManager): """ XPATH = '//kml:Folder/kml:name[text()="%s"]/../kml:Placemark' DEFAULT_XPATH = '//kml:Placemark' + def __init__(self, importer_instance, ns=''): super(KMLManager, self).__init__(importer_instance) self.ns = ns @@ -250,9 +257,9 @@ class KMLManager(ImportManager): if not self.ns: self.ns = tree.getroot().nsmap[None] xpath = self.XPATH % self.importer_instance.filtr \ - if self.importer_instance.filtr else self.DEFAULT_XPATH + if self.importer_instance.filtr else self.DEFAULT_XPATH for placemark in tree.xpath(xpath, - namespaces={'kml':self.ns}): + namespaces={'kml': self.ns}): name, point, line = None, None, None pl_id = placemark.attrib.get('id') pl_key = 'kml-%d' % self.importer_instance.pk @@ -280,10 +287,10 @@ class KMLManager(ImportManager): for p in points if p]) line = 'SRID=4326;LINESTRING(%s)' % points cls = None - dct = {'description':description, - 'name':name, - 'origin':self.importer_instance.origin, - 'license':self.importer_instance.license} + dct = {'description': description, + 'name': name, + 'origin': self.importer_instance.origin, + 'license': self.importer_instance.license} if point: dct['point'] = point cls = Marker @@ -293,7 +300,7 @@ class KMLManager(ImportManager): cls = Route if cls: item, updated, created = self.create_or_update_item( - cls, dct, pl_id, key=pl_key) + cls, dct, pl_id, key=pl_key) if updated: updated_item += 1 if created: @@ -302,15 +309,17 @@ class KMLManager(ImportManager): @classmethod def export(cls, queryset): - dct = {'name':settings.PROJECT_NAME, - 'description':unicode(datetime.date.today()), - 'locations':queryset.all() + dct = { + 'name': settings.PROJECT_NAME, + 'description': unicode(datetime.date.today()), + 'locations': queryset.all() } - filename = unicode_normalize(settings.PROJECT_NAME + dct['description']\ + filename = unicode_normalize(settings.PROJECT_NAME + dct['description'] + '.kml') result = render_to_response('chimere/export.kml', dct) return filename, result + class ShapefileManager(ImportManager): u""" Shapefile importer @@ -352,7 +361,7 @@ class ShapefileManager(ImportManager): srid = settings.CHIMERE_EPSG_DISPLAY_PROJECTION msg = _(u"SRID cannot be guessed. The default SRID (%s) has " u"been used.") % srid - #If imported items are not well located " + # If imported items are not well located " # u"ask your data provider for the SRID to use.") % srid shapefilename = tmpdir + os.sep + sources[0] ds = DataSource(shapefilename) @@ -375,7 +384,7 @@ class ShapefileManager(ImportManager): u"is not managed by Chimère.") % lyr.geom_type) geom_key = 'point' if lyr.geom_type == 'Point' else 'route' geom_cls = Marker if lyr.geom_type == 'Point' else Route - indexes = [] + # indexes = [] for idx, feat in enumerate(lyr): name = unicode(idx) if lbl_name: @@ -385,7 +394,7 @@ class ShapefileManager(ImportManager): except UnicodeDecodeError: try: name = unicode( - name.decode(settings.CHIMERE_SHAPEFILE_ENCODING)) + name.decode(settings.CHIMERE_SHAPEFILE_ENCODING)) except: continue try: @@ -394,15 +403,17 @@ class ShapefileManager(ImportManager): return (0, 0, _(u"Bad Shapefile")) if feat.geom.geom_type == 'MultiLineString': geoms = [geom.wkt for geom in feat.geom] - import_key = feat.get(id_name) if id_name and len(geoms) == 1 else '' + import_key = feat.get(id_name) if id_name and len(geoms) == 1 \ + else '' for geom in geoms: - dct = {geom_key:'SRID=%s;%s' % (srid, geom), - 'name':name, - 'origin':self.importer_instance.origin, - 'license':self.importer_instance.license - } + dct = { + geom_key: 'SRID=%s;%s' % (srid, geom), + 'name': name, + 'origin': self.importer_instance.origin, + 'license': self.importer_instance.license + } item, updated, created = self.create_or_update_item( - geom_cls, dct, import_key) + geom_cls, dct, import_key) if updated: updated_item += 1 if created: @@ -427,8 +438,9 @@ class ShapefileManager(ImportManager): tmp_name = tmp.name field_names = [field.name for field in queryset.model._meta.fields] - geo_field = getattr(queryset.model, - 'point' if 'point' in field_names else 'route')._field + geo_field = getattr( + queryset.model, + 'point' if 'point' in field_names else 'route')._field dr = ogr.GetDriverByName('ESRI Shapefile') ds = dr.CreateDataSource(tmp_name) @@ -454,7 +466,7 @@ class ShapefileManager(ImportManager): feat = ogr.Feature(feature_def) feat.SetField('name', str(unicode_normalize(item.name)[:80])) feat.SetField('category', - str(unicode_normalize(category.name)[:80])) + str(unicode_normalize(category.name)[:80])) geom = getattr(item, geo_field.name) if not geom: @@ -480,6 +492,7 @@ class ShapefileManager(ImportManager): buff.close() return filename, zip_stream + class CSVManager(ImportManager): u""" CSV importer @@ -490,9 +503,8 @@ class CSVManager(ImportManager): # (label, getter, setter) COLS = [("Id", 'pk', 'pk'), (_(u"Name"), 'name', 'name'), - (_(u"Categories"), lambda obj:", ".join( - [c.name for c in obj.categories.all()]), - set_categories), + (_(u"Categories"), lambda obj: ", ".join( + [c.name for c in obj.categories.all()]), set_categories), (_(u"State"), 'status', lambda x: x), (_(u"Description"), 'description', 'description'), (_(u"Localisation"), 'geometry', 'geometry')] @@ -512,40 +524,32 @@ class CSVManager(ImportManager): if msg: return (0, 0, msg) reader = csv.reader(source, delimiter=';', quotechar='"') - prop_cols, nominatim_fields = [], {} - reverse_nominatim_dct = dict((v, k) - for k, v in settings.CHIMERE_NOMINATIM_FIELDS.iteritems()) - nominatim_default_query = settings.CHIMERE_NOMINATIM_FIELDS - for idx, pm in enumerate(Marker.all_properties()): + prop_cols = [] + for pm in Marker.all_properties(): prop_cols.append((pm.name, pm.getAttrName(), - pm.getAttrName()+'_set')) - if settings.CHIMERE_NOMINATIM_FIELDS and \ - pm.slug in reverse_nominatim_dct: - nominatim_fields[idx+len(self.COLS)] = \ - reverse_nominatim_dct[pm.slug] - nominatim_default_query.pop(reverse_nominatim_dct[pm.slug]) + pm.getAttrName() + '_set')) cols = list(self.COLS) + prop_cols - datas = [] + # datas = [] for idx, row in enumerate(reader): - if not idx: # first row + if not idx: # first row try: assert(len(row) >= len(cols)) except AssertionError: - return (0, 0, _(u"Invalid CSV format - not enough columns " - u"check a reference CSV file")) + return (0, 0, _(u"Invalid CSV format")) continue if len(row) < len(cols): continue - pk, name, cats, state = row[0], row[1], row[2], row[3] + # pk, name, cats, state = row[0], row[1], row[2], row[3] + pk, name = row[0], row[1] geom = row[5] description = '' if self.importer_instance.get_description: description = row[4] COL_INDEX = 6 - dct = {'description':description, - 'name':name, - 'origin':self.importer_instance.origin, - 'license':self.importer_instance.license} + dct = {'description': description, + 'name': name, + 'origin': self.importer_instance.origin, + 'license': self.importer_instance.license} cls = None if 'POINT' in geom: cls = Marker @@ -553,27 +557,11 @@ class CSVManager(ImportManager): elif 'LINE' in geom: cls = Route dct['route'] = geom - elif settings.CHIMERE_NOMINATIM_FIELDS: - nominatim_query = settings.NOMINATIM_URL + "?" - nominatim_keys = nominatim_default_query.copy() - nominatim_keys['format'] = 'json' - for idx in nominatim_fields: - nominatim_keys[nominatim_fields[idx]] = row[idx] - nominatim_query += urlencode(nominatim_keys) - remotehandle = urllib2.urlopen(nominatim_query) - result = StringIO.StringIO(remotehandle.read()) - remotehandle.close() - result = json.load(result) - if not result: - continue - result = result[0] - cls = Marker - dct['point'] = "POINT(%s %s)" % (result['lon'], result['lat']) else: continue import_key = pk if pk else name.decode('utf-8') - item, updated, created = self.create_or_update_item(cls, dct, - import_key, pk=pk) + item, updated, created = self.create_or_update_item( + cls, dct, import_key, pk=pk) if updated: updated_item += 1 if created: @@ -581,19 +569,17 @@ class CSVManager(ImportManager): for idx, col in enumerate(cols[COL_INDEX:]): name, getter, setter_val = col setter = getattr(item, setter_val) - val = row[idx+COL_INDEX] + val = row[idx + COL_INDEX] setter(item, val) return (new_item, updated_item, msg) @classmethod - def export(cls, queryset, cols=[]): - dct = {'description':unicode(datetime.date.today()), 'data':[]} - cls_name = queryset.model.__name__.lower() - if not cols: - cols = list(cls.COLS) - if hasattr(queryset.model, 'all_properties'): - for pm in queryset.model.all_properties(): - cols.append((pm.name, pm.getAttrName(), pm.getAttrName()+'_set')) + def export(cls, queryset): + dct = {'description': unicode(datetime.date.today()), 'data': []} + # cls_name = queryset.model.__name__.lower() + cols = list(cls.COLS) + for pm in queryset.model.all_properties(): + cols.append((pm.name, pm.getAttrName(), pm.getAttrName() + '_set')) header = [col[0] for col in cols] dct['data'].append(header) for item in queryset.all(): @@ -602,16 +588,14 @@ class CSVManager(ImportManager): if callable(attr): data.append(attr(item)) else: - v = getattr(item, attr) - if v == None: - v = '' - data.append(v) + data.append(getattr(item, attr)) dct['data'].append(data) - filename = unicode_normalize(settings.PROJECT_NAME + dct['description']\ + filename = unicode_normalize(settings.PROJECT_NAME + dct['description'] + '.csv') result = render_to_response('chimere/export.csv', dct) return filename, result + class GeoRSSManager(ImportManager): u""" RSS importer. @@ -627,19 +611,19 @@ class GeoRSSManager(ImportManager): - number of item updated ; - error detail on error """ - from models import Marker + from models import Marker, Route new_item, updated_item, msg = 0, 0, '' feed = feedparser.parse(self.importer_instance.source) - if feed['bozo'] and not isinstance(feed['bozo_exception'], - feedparser.CharacterEncodingOverride): + if feed['bozo'] and not isinstance( + feed['bozo_exception'], feedparser.CharacterEncodingOverride): return (0, 0, _(u"RSS feed is not well formed")) for item in feed['items']: if "georss_point" not in item and 'georss_line' not in item \ and not ("geo_lat" in item and "geo_long" in item): continue cls = None - dct = {'origin':self.importer_instance.origin, - 'license':self.importer_instance.license} + dct = {'origin': self.importer_instance.origin, + 'license': self.importer_instance.license} if 'georss_point' in item or "geo_lat" in item: cls = Marker if 'georss_point' in item: @@ -661,11 +645,11 @@ class GeoRSSManager(ImportManager): points = item['georss_line'].split(' ') reordered_points = [] # lat, lon -> x, y - for idx in xrange(len(points)/2): - reordered_points.append("%s %s" % (points[idx*2+1], - points[idx*2])) + for idx in xrange(len(points) / 2): + reordered_points.append("%s %s" % (points[idx * 2 + 1], + points[idx * 2])) dct['route'] = 'SRID=4326;LINESTRING(%s)' % \ - ",".join(reordered_points) + ",".join(reordered_points) dct['name'] = item['title'] pl_id = item['id'] if 'id' in item else item['title'] @@ -676,10 +660,101 @@ class GeoRSSManager(ImportManager): new_item += 1 return (new_item, updated_item, msg) + +class JsonManager(ImportManager): + u""" + Json importer. + This manager only gets and do not produce Json feed + """ + + def get(self): + u""" + Get data from a json simple source + + Return a tuple with: + - number of new item ; + - number of item updated ; + - error detail on error + """ + from models import Marker + new_item, updated_item, msg = 0, 0, '' + source, msg = self.get_source_file(['.json']) + if msg: + return (0, 0, msg) + + vals = source.read().replace('\n', ' ') + try: + values = json.JSONDecoder( + object_pairs_hook=collections.OrderedDict).decode(vals) + except ValueError as e: + return (new_item, updated_item, + _(u"JSON file is not well formed: " + e.message)) + # configuration in filtr + try: + filtr = json.JSONDecoder().decode(self.importer_instance.filtr) + except ValueError: + return ( + new_item, updated_item, + _(u"Bad configuration: filter field must be a valid " + u"JSON string")) + + vls = filtr.values() + for k in ('name', 'id', 'description'): + if k not in vls: + return ( + new_item, updated_item, + _(u"A key must be associated to \"%s\" in the " + u"filter.") % k) + + default_dct = {'origin': self.importer_instance.origin, + 'license': self.importer_instance.license} + if 'prefix_name' in filtr: + default_dct['name'] = filtr.pop('prefix_name') + if 'prefix_description' in filtr: + default_dct['description'] = filtr.pop('prefix_description') + if self.importer_instance.default_localisation: + default_dct['point'] = self.importer_instance.default_localisation + + for item in values: + dct = default_dct.copy() + for k in filtr: + if k in item and item[k]: + if filtr[k] not in dct: + dct[filtr[k]] = "" + else: + if filtr[k] == 'description': + dct[filtr[k]] += "<br/>" + else: + dct[filtr[k]] += " " + dct[filtr[k]] += item[k] + if 'point' in item: + x, y = item['point'].split(",") + dct['point'] = 'SRID=4326;POINT(%s %s)' % (x, y) + elif 'lat' in item and item['lat'] \ + and 'lon' in item and item['lon']: + dct['point'] = 'SRID=4326;POINT(%s %s)' % (item['lon'], + item['lat']) + elif 'x' in item and item['x'] \ + and 'y' in item and item['y']: + dct['point'] = 'SRID=4326;POINT(%s %s)' % (item['x'], + item['y']) + if not dct['point']: + continue + cls = Marker + pl_id = (dct.pop('id') if 'id' in dct else dct['name']) \ + + "-" + unicode(self.importer_instance.pk) + it, updated, created = self.create_or_update_item(cls, dct, pl_id) + if updated: + updated_item += 1 + if created: + new_item += 1 + return (new_item, updated_item, msg) + RE_HOOK = re.compile('\[([^\]]*)\]') # TODO: manage deleted item from OSM + class OSMManager(ImportManager): u""" OSM importer/exporter @@ -697,8 +772,8 @@ class OSMManager(ImportManager): - updated items; - error detail on error. """ - source, msg = self.get_source_file(['.osm'], - extra_url=self.importer_instance.filtr) + source, msg = self.get_source_file( + ['.osm'], extra_url=self.importer_instance.filtr) if not source: return (0, 0, msg) @@ -711,8 +786,8 @@ class OSMManager(ImportManager): return 0, 0, _(u"Nothing to import") def import_ways(self, tree): - from chimere.models import Marker, Route - msg, items, new_item, updated_item = "", [], 0 , 0 + from chimere.models import Route + msg, items, new_item, updated_item = "", [], 0, 0 nodes = {} for node in tree.xpath('//node'): node_id = node.attrib.get('id') @@ -734,17 +809,17 @@ class OSMManager(ImportManager): points.append(item.get('ref')) if not points: continue - wkt = 'SRID=4326;LINESTRING(%s)' % ",".join([nodes[point_id] - for point_id in points if point_id in nodes]) - dct = {'route':wkt, - 'name':name, - 'origin':self.importer_instance.origin \ - or u'OpenStreetMap.org', - 'license':self.importer_instance.license \ - or u'ODbL', - 'import_version':version} + wkt = 'SRID=4326;LINESTRING(%s)' % ",".join( + [nodes[point_id] for point_id in points if point_id in nodes]) + dct = {'route': wkt, + 'name': name, + 'origin': self.importer_instance.origin + or u'OpenStreetMap.org', + 'license': self.importer_instance.license + or u'ODbL', + 'import_version': version} item, updated, created = self.create_or_update_item( - Route, dct, node_id, version) + Route, dct, node_id, version) if updated: updated_item += 1 if created: @@ -754,7 +829,7 @@ class OSMManager(ImportManager): def import_nodes(self, tree): from chimere.models import Marker - msg, items, new_item, updated_item = "", [], 0 , 0 + msg, items, new_item, updated_item = "", [], 0, 0 for node in tree.xpath('//node'): name = None node_id = node.attrib.get('id') @@ -767,15 +842,15 @@ class OSMManager(ImportManager): name = item.attrib.get('v') point = 'SRID=4326;POINT(%s %s)' % (node.get('lon'), node.get('lat')) - dct = {'point':point, - 'name':name, - 'origin':self.importer_instance.origin \ - or u'OpenStreetMap.org', - 'license':self.importer_instance.license \ - or u'ODbL', - 'import_version':version} + dct = {'point': point, + 'name': name, + 'origin': self.importer_instance.origin + or u'OpenStreetMap.org', + 'license': self.importer_instance.license + or u'ODbL', + 'import_version': version} item, updated, created = self.create_or_update_item( - Marker, dct, node_id, version) + Marker, dct, node_id, version) if updated: updated_item += 1 if created: @@ -810,8 +885,8 @@ class OSMManager(ImportManager): username = username.encode('latin1') password = password.encode('latin1') api = OsmApi.OsmApi(api=api, username=username, password=password) - api.ChangesetCreate({u"comment": u"Import from Chimère %s" % \ - get_version()}) + api.ChangesetCreate({u"comment": u"Import from Chimère %s" % + get_version()}) hooks = RE_HOOK.findall(self.importer_instance.filtr) if not hooks: hooks = RE_HOOK.findall(self.importer_instance.source) @@ -825,28 +900,31 @@ class OSMManager(ImportManager): continue if key == 'bbox': x1, y1, x2, y2 = [float(val) for val in value.split(',')] - bbox = GEOSGeometry( + bbox = GEOSGeometry( 'POLYGON((%f %f,%f %f,%f %f,%f %f,%f %f))' % ( - x1, y1, x2, y1, x2, y2, x1, y2, x1, y1), srid=4326) + x1, y1, x2, y1, x2, y2, x1, y2, x1, y1), srid=4326) continue tags[key] = value if not tags: return 0, _(u"No non ambigious tag is defined in the XAPI request") if not bbox: - return 0, _(u"No bounding box is defined in the XAPI request."\ - u"If you are sure to manage the entire planet set the bounding box"\ - u" to -180,-90,180,90") - default_dct = {'tag':tags, - 'import_source':self.importer_instance.source} + return 0, _( + u"No bounding box is defined in the XAPI request." + u"If you are sure to manage the entire planet set the " + u"bounding box to -180,-90,180,90") + default_dct = {'tag': tags, + 'import_source': self.importer_instance.source} idx = -1 - for idx, item in enumerate(Marker.objects.filter(status='A', - point__contained=bbox, - categories=self.importer_instance.categories.all(), - not_for_osm=False, modified_since_import=True, - route=None).all()): + for idx, item in enumerate( + Marker.objects.filter( + status='A', + point__contained=bbox, + categories=self.importer_instance.categories.all(), + not_for_osm=False, modified_since_import=True, + route=None).all()): dct = default_dct.copy() - dct.update({'lon':item.point.x, - 'lat':item.point.y}) + dct.update({'lon': item.point.x, + 'lat': item.point.y}) dct['tag']['name'] = item.name node = None import_key = item.get_key('OSM') @@ -861,7 +939,7 @@ class OSMManager(ImportManager): if error.status == 404: dct.pop('id') dct.pop('version') - pass # if the node doesn't exist it is created + pass # if the node doesn't exist it is created else: raise if not updated: @@ -870,20 +948,23 @@ class OSMManager(ImportManager): item.import_version = node['version'] item.save() api.ChangesetClose() - return idx+1, None + return idx + 1, None + -import urllib2, chardet, HTMLParser +import chardet +import HTMLParser from BeautifulSoup import BeautifulSoup -from lxml import etree + RE_CLEANS = ((re.compile('(\n)*|^( )*(\n)*( )*|( )*(\n)*( )*$'), ''), (re.compile(' ( )*'), ' '), (re.compile(r"""<a href=["'](?!https?)(.*)["']"""), - '<a href="%(base_url)s\\1"'), + '<a href="%(base_url)s\\1"'), ) from calendar import TimeEncoding, month_name + def get_month_name(month_no, locale): with TimeEncoding(locale) as encoding: s = month_name[month_no] @@ -891,62 +972,62 @@ def get_month_name(month_no, locale): s = s.decode(encoding) return s -MONTH_NAMES = {locale:[get_month_name(no_month, locale+'.UTF-8') - for no_month in xrange(1, 13)] for locale in ['fr_FR']} +MONTH_NAMES = {locale: [get_month_name(no_month, locale + '.UTF-8') + for no_month in xrange(1, 13)] for locale in ['fr_FR']} try: - UNI_MONTH_NAMES = {locale:[m.decode('utf-8') for m in MONTH_NAMES[locale]] - for locale in MONTH_NAMES} + UNI_MONTH_NAMES = {locale: [m.decode('utf-8') for m in MONTH_NAMES[locale]] + for locale in MONTH_NAMES} except UnicodeEncodeError: - UNI_MONTH_NAMES = {locale:[m for m in MONTH_NAMES[locale]] - for locale in MONTH_NAMES} - -DATE_PARSINGS = {'fr_FR':[ - re.compile(r'(?P<day1>\d{1,2}) '\ - r'(?P<month1>'+ '|'.join(UNI_MONTH_NAMES['fr_FR']) +') '\ - r'(?P<year1>\d{4})?[^\d]*'\ - r'(?P<day2>\d{1,2}) '\ - r'(?P<month2>'+ '|'.join(UNI_MONTH_NAMES['fr_FR']) +') *'\ - r'(?P<year2>\d{4})?.*'), - re.compile(r'(?P<day1>\d{1,2}) '\ - r'(?P<month1>'+ '|'.join(UNI_MONTH_NAMES['fr_FR']) +') *'\ - r'(?P<year1>\d{4})?') - ], - 'en':[ - re.compile(r'(?P<year1>\d{4})-'\ - r'(?P<month1>\d{2})-'\ - r'(?P<day1>\d{2})'\ - r'(?:T'\ - r'(?P<hour1>\d{2})?:'\ - r'(?P<minut1>\d{2})?:'\ - r'(?P<second1>\d{2})'\ - r')?.*'\ - r'(?P<year2>\d{4})-'\ - r'(?P<month2>\d{2})-'\ - r'(?P<day2>\d{2})'\ - r'(?:T'\ - r'(?P<hour2>\d{2})?:'\ - r'(?P<minut2>\d{2})?:'\ - r'(?P<second2>\d{2})'\ - r')?.*' - ), - re.compile(r'(?P<year1>\d{4})-'\ - r'(?P<month1>\d{2})-'\ - r'(?P<day1>\d{2})'\ - r'(?:T'\ - r'(?P<hour1>\d{2})?:'\ - r'(?P<minut1>\d{2})?:'\ - r'(?P<second1>\d{2})'\ - r')?' - ) - ], - } + UNI_MONTH_NAMES = {locale: [m for m in MONTH_NAMES[locale]] + for locale in MONTH_NAMES} + +DATE_PARSINGS = { + 'fr_FR': [ + re.compile(r'(?P<day1>\d{1,2}) ' + r'(?P<month1>' + '|'.join(UNI_MONTH_NAMES['fr_FR']) + ') ' + r'(?P<year1>\d{4})?[^\d]*' + r'(?P<day2>\d{1,2}) ' + r'(?P<month2>' + '|'.join(UNI_MONTH_NAMES['fr_FR']) + ') *' + r'(?P<year2>\d{4})?.*'), + re.compile(r'(?P<day1>\d{1,2}) ' + r'(?P<month1>' + '|'.join(UNI_MONTH_NAMES['fr_FR']) + ') * ' + r'(?P<year1>\d{4})?')], + 'en': [ + re.compile(r'(?P<year1>\d{4})-' + r'(?P<month1>\d{2})-' + r'(?P<day1>\d{2})' + r'(?:T' + r'(?P<hour1>\d{2})?:' + r'(?P<minut1>\d{2})?:' + r'(?P<second1>\d{2})' + r')?.*' + r'(?P<year2>\d{4})-' + r'(?P<month2>\d{2})-' + r'(?P<day2>\d{2})' + r'(?:T' + r'(?P<hour2>\d{2})?:' + r'(?P<minut2>\d{2})?:' + r'(?P<second2>\d{2})' + r')?.*'), + re.compile(r'(?P<year1>\d{4})-' + r'(?P<month1>\d{2})-' + r'(?P<day1>\d{2})' + r'(?:T' + r'(?P<hour1>\d{2})?:' + r'(?P<minut1>\d{2})?:' + r'(?P<second1>\d{2})' + r')?')], +} + def clean_field(value): return value.strip() + class HtmlXsltManager(ImportManager): PARSER = 'HTMLParser' + def get(self): u""" Get data from the source @@ -970,7 +1051,7 @@ class HtmlXsltManager(ImportManager): soup = BeautifulSoup(data) main_page = soup.prettify() # convert it to valid XHTML - #doc, errors = tidy_document(main_page) + # doc, errors = tidy_document(main_page) doc = main_page dom = etree.HTML(doc, getattr(etree, self.PARSER)()) try: @@ -994,8 +1075,8 @@ class HtmlXsltManager(ImportManager): base_url = u"/".join(self.importer_instance.source.split(u'/')[:-1]) base_url += u"/" for item in newdom.getroot(): - c_item = {child.tag:clean_field(child.text) - for child in item.getchildren() if child.text} + c_item = {child.tag: clean_field(child.text) + for child in item.getchildren() if child.text} # try to have more information on the linked page if transform_child and 'link' in c_item: # not an absolute address @@ -1016,8 +1097,8 @@ class HtmlXsltManager(ImportManager): child_dom = etree.HTML(child_page, etree.HTMLParser()) extra_keys = transform_child(child_dom).getroot() if len(extra_keys): - c_item.update({extra.tag:etree.tostring(extra) - for extra in extra_keys[0].getchildren()}) + c_item.update({extra.tag: etree.tostring(extra) + for extra in extra_keys[0].getchildren()}) items.append(c_item) # change relative link to full link, simplify, unescape HTML entities html_unescape = HTMLParser.HTMLParser().unescape @@ -1025,7 +1106,7 @@ class HtmlXsltManager(ImportManager): for k in item: val = item[k] for r, replaced in RE_CLEANS: - val = re.sub(r, replaced % {'base_url':base_url}, val) + val = re.sub(r, replaced % {'base_url': base_url}, val) item[k] = html_unescape(val) self.key_categories = self.importer_instance.get_key_category_dict() self.missing_cats = set() @@ -1034,9 +1115,10 @@ class HtmlXsltManager(ImportManager): self.add_dct_item(item) msg = '' if self.missing_cats: - msg = _(u"Names \"%s\" doesn't match existing categories. " - u"Modify the import to match theses names with categories.") % ( - u'", "'.join(self.missing_cats)) + msg = _( + u"Names \"%s\" doesn't match existing categories. " + u"Modify the import to match theses names with categories.") %\ + (u'", "'.join(self.missing_cats)) return (self.new_item, self.updated_item, msg) @classmethod @@ -1073,18 +1155,18 @@ class HtmlXsltManager(ImportManager): if not m: continue values = m.groupdict() - date = self._internal_parse_date(locale, - 'year1' in values and values['year1'], - values['month1'], values['day1']) + date = self._internal_parse_date( + locale, 'year1' in values and values['year1'], + values['month1'], values['day1']) if not date: continue dct['start_date'] = date has_dates = True if 'day2' not in values: break - date = self._internal_parse_date(locale, - 'year2' in values and values['year2'], - values['month2'], values['day2']) + date = self._internal_parse_date( + locale, 'year2' in values and values['year2'], + values['month2'], values['day2']) if date: dct['end_date'] = date break @@ -1092,13 +1174,14 @@ class HtmlXsltManager(ImportManager): def add_dct_item(self, item): if not self.importer_instance.default_localisation and \ - not "point" in item and not ("lat" in item and item['lat']): + "point" not in item and not ("lat" in item and item['lat']): return cls = None - dct = {'origin':"<a href='%s'>%s</a>" % (item['link'], - self.importer_instance.origin), - 'license':self.importer_instance.license, - 'name':item['name']} + dct = { + 'origin': "<a href='%s' target='_blank'>%s</a>" % ( + item.get('link') or '#', self.importer_instance.origin), + 'license': self.importer_instance.license, + 'name': item['name']} category = None if 'category' in item and item['category']: if item['category'] in self.key_categories: @@ -1114,7 +1197,7 @@ class HtmlXsltManager(ImportManager): item['lat']) else: dct['point'] = self.importer_instance.default_localisation - dct['description'] = item['description'] + dct['description'] = item.get('description', '') if 'date' in item: dct.update(self.parse_date(item['date'])) key = item['key'] @@ -1125,5 +1208,6 @@ class HtmlXsltManager(ImportManager): if created: self.new_item += 1 + class XMLXsltManager(HtmlXsltManager): PARSER = 'XMLParser' |