Re-work (more robust) on importation - first work on file importation

author: Étienne Loks <etienne.loks@proxience.com> 2014-11-07 19:34:58 +0100
committer: Étienne Loks <etienne.loks@proxience.com> 2014-11-07 19:34:58 +0100
commit: c0030cf85878dcfbc24abb5462ea37775a8393ef (patch)
tree: f0c93d38a3fcc620a8af680591501310aefd0a9b /ishtar_common/unicode_csv.py
parent: ebbb899c825770a7e92686786531d5502a7b4b4f (diff)
download: Ishtar-c0030cf85878dcfbc24abb5462ea37775a8393ef.tar.bz2
Ishtar-c0030cf85878dcfbc24abb5462ea37775a8393ef.zip
1 files changed, 79 insertions, 0 deletions
diff --git a/ishtar_common/unicode_csv.py b/ishtar_common/unicode_csv.py
new file mode 100644
index 000000000..d0d39f7fb
--- /dev/null
+++ b/ishtar_common/unicode_csv.py
@@ -0,0 +1,79 @@
+import csv, codecs, cStringIO
+
+def utf_8_encoder(unicode_csv_data):  # generator over the items of unicode_csv_data
+    for line in unicode_csv_data:
+        yield line.encode('utf-8')  # each item is yielded re-encoded as UTF-8
+
+def unicode_csv_reader(unicode_csv_data, dialect=None, reference_header=[],
+                       **kwargs):  # NOTE: reference_header is never read in this body
+    if not dialect:
+        dialect = csv.Sniffer().sniff(unicode_csv_data[0])  # sniffed from item 0 only; input must be indexable
+        # csv.py doesn't like unicode: coerce delimiter and quotechar to str
+        dialect.delimiter = str(dialect.delimiter)
+        dialect.quotechar = str(dialect.quotechar)
+    # csv.py doesn't do Unicode; encode temporarily as UTF-8:
+    csv_reader = csv.reader(utf_8_encoder(unicode_csv_data),
+                            dialect=dialect, **kwargs)  # extra kwargs go straight to csv.reader
+    for row in csv_reader:
+        # decode UTF-8 back to Unicode, cell by cell:
+        yield [unicode(cell, 'utf-8') for cell in row]  # one list of unicode cells per CSV row
+
+class UTF8Recoder:
+    """
+    Iterator that reads an encoded stream and reencodes the input to UTF-8
+    """
+    def __init__(self, f, encoding):
+        self.reader = codecs.getreader(encoding)(f)  # stream reader for `encoding`, wrapped around f
+
+    def __iter__(self):
+        return self  # the object is its own iterator
+
+    def next(self):  # only next() is defined, no __next__
+        return self.reader.next().encode("utf-8")  # next item from the reader, re-encoded as UTF-8
+
+class UnicodeReader:
+    """
+    A CSV reader which will iterate over lines in the CSV file "f",
+    which is encoded in the given encoding.
+    """
+
+    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
+        f = UTF8Recoder(f, encoding)  # csv.reader is fed UTF-8 whatever `encoding` is
+        self.reader = csv.reader(f, dialect=dialect, **kwds)  # extra kwds go straight to csv.reader
+
+    def next(self):  # only next() is defined, no __next__
+        row = self.reader.next()
+        return [unicode(s, "utf-8") for s in row]  # decode each cell of the row from UTF-8
+
+    def __iter__(self):
+        return self  # the object is its own iterator
+
+class UnicodeWriter:
+    """
+    A CSV writer which will write rows to CSV file "f",
+    which is encoded in the given encoding.
+    """
+
+    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
+        # Redirect output to an in-memory queue; csv.writer never writes to f directly
+        self.queue = cStringIO.StringIO()
+        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
+        self.stream = f
+        self.encoder = codecs.getincrementalencoder(encoding)()  # encoder for the target encoding
+
+    def writerow(self, row):
+        self.writer.writerow([s.encode("utf-8") for s in row])  # .encode() is called on every cell
+        # Fetch UTF-8 output from the queue ...
+        data = self.queue.getvalue()
+        data = data.decode("utf-8")
+        # ... and reencode it into the target encoding
+        data = self.encoder.encode(data)
+        # write to the target stream
+        self.stream.write(data)
+        # empty the queue so it does not carry this row into the next call
+        self.queue.truncate(0)
+
+    def writerows(self, rows):
+        for row in rows:  # one writerow() call, hence one stream.write(), per row
+            self.writerow(row)
+
author	Étienne Loks <etienne.loks@proxience.com>	2014-11-07 19:34:58 +0100
committer	Étienne Loks <etienne.loks@proxience.com>	2014-11-07 19:34:58 +0100
commit	c0030cf85878dcfbc24abb5462ea37775a8393ef (patch)
tree	f0c93d38a3fcc620a8af680591501310aefd0a9b /ishtar_common/unicode_csv.py
parent	ebbb899c825770a7e92686786531d5502a7b4b4f (diff)
download	Ishtar-c0030cf85878dcfbc24abb5462ea37775a8393ef.tar.bz2 Ishtar-c0030cf85878dcfbc24abb5462ea37775a8393ef.zip