From 5b41f143a3956fbd9d3a2288e607e6aa8d8c3451 Mon Sep 17 00:00:00 2001 From: Étienne Loks Date: Wed, 30 Oct 2013 03:08:06 +0100 Subject: Importer HTML-XSLT: retrieving and parsing of external website using an URL and XSLT files --- chimere/utils.py | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) (limited to 'chimere/utils.py') diff --git a/chimere/utils.py b/chimere/utils.py index efaf084..894c13c 100644 --- a/chimere/utils.py +++ b/chimere/utils.py @@ -835,6 +835,17 @@ class OSMManager(ImportManager): api.ChangesetClose() return idx+1, None +import urllib2 +from BeautifulSoup import BeautifulSoup +#from tidylib import tidy_document +from lxml import etree + +RE_CLEANS = ((re.compile('(\n)*|^( )*(\n)*( )*|( )*(\n)*( )*$'), ''), + (re.compile(' ( )*'), ' '), + (re.compile(r"""