diff options
Diffstat (limited to 'chimere/utils.py')
| -rw-r--r-- | chimere/utils.py | 18 | 
1 files changed, 14 insertions, 4 deletions
diff --git a/chimere/utils.py b/chimere/utils.py
index 2d74095..f3ec751 100644
--- a/chimere/utils.py
+++ b/chimere/utils.py
@@ -85,6 +85,7 @@ class ImportManager(object):
             dct_import = {
                 'import_key__icontains':'%s:%s;' % (key, import_key),
                 'import_source':self.importer_instance.source}
+            ref_item = cls.objects.filter(**dct_import)
             try:
                 item = None
                 if pk:
@@ -878,8 +879,11 @@ DATE_PARSINGS = {'fr_FR':[
                          ]
                 }
 
+def clean_field(value):
+    return value.strip()
 
 class HtmlXsltManager(ImportManager):
+    PARSER = 'HTMLParser'
     def get(self):
         u"""
         Get data from the source
@@ -904,7 +908,7 @@ class HtmlXsltManager(ImportManager):
         # convert it to valid XHTML
         #doc, errors = tidy_document(main_page)
         doc = main_page
-        dom = etree.HTML(doc, etree.HTMLParser())
+        dom = etree.HTML(doc, getattr(etree, self.PARSER)())
         try:
             xslt = etree.parse(self.importer_instance.source_file)
             self.importer_instance.source_file.seek(0)
@@ -926,8 +930,8 @@ class HtmlXsltManager(ImportManager):
         base_url = u"/".join(self.importer_instance.source.split(u'/')[:-1])
         base_url += u"/"
         for item in newdom.getroot():
-            c_item = {child.tag:child.text for child in item.getchildren()
-                                                             if child.text}
+            c_item = {child.tag:clean_field(child.text)
+                        for child in item.getchildren() if child.text}
             # try to have more information on the linked page
             if transform_child and 'link' in c_item:
                 # not an absolute address
@@ -962,7 +966,7 @@ class HtmlXsltManager(ImportManager):
         updated_item, new_item = 0, 0
         for item in items:
             if not self.importer_instance.default_localisation and \
-               not "point" in item:
+               not "point" in item and not ("lat" in item and item['lat']):
                 continue
             cls = None
             dct = {'origin':"<a href='%s'>%s</a>" % (item['link'],
@@ -973,6 +977,9 @@ class HtmlXsltManager(ImportManager):
             if 'point' in item:
                 x, y = item['point'].split(",")
                 dct['point'] = 'SRID=4326;POINT(%s %s)' % (x, y)
+            elif 'lat' in item and item['lat']:
+                dct['point'] = 'SRID=4326;POINT(%s %s)' % (item['lon'],
+                                                           item['lat'])
             else:
                 dct['point'] = self.importer_instance.default_localisation
             dct['description'] = item['description']
@@ -1007,3 +1014,6 @@ class HtmlXsltManager(ImportManager):
             if created:
                 new_item += 1
         return (new_item, updated_item, '')
+
+class XMLXsltManager(HtmlXsltManager):
+    PARSER = 'XMLParser'
