diff options
Diffstat (limited to 'ishtar_common/unicode_csv.py')
| -rw-r--r-- | ishtar_common/unicode_csv.py | 79 | 
1 files changed, 79 insertions, 0 deletions
"""Unicode-aware helpers around the standard ``csv`` module.

On Python 2 the ``csv`` module only handles byte strings, so text is
encoded to UTF-8 before it reaches ``csv`` and decoded again afterwards.
On Python 3 ``csv`` works on text natively, so the same public helpers
skip that round trip while keeping the same signatures:

* readers take byte streams (or, for ``unicode_csv_reader``, a sequence of
  text lines) and produce rows of text cells;
* ``UnicodeWriter`` takes rows of cells and writes encoded bytes to the
  target stream.
"""
import codecs
import csv
import sys

_PY2 = sys.version_info[0] == 2

if _PY2:
    import cStringIO
    _text_type = unicode  # noqa: F821 -- builtin that only exists on Python 2
else:
    import io
    _text_type = str


def utf_8_encoder(unicode_csv_data):
    """Yield each line of *unicode_csv_data* encoded as UTF-8 bytes."""
    for line in unicode_csv_data:
        yield line.encode('utf-8')


def unicode_csv_reader(unicode_csv_data, dialect=None, reference_header=None,
                       **kwargs):
    """Iterate over the rows of CSV text, yielding lists of text cells.

    :param unicode_csv_data: sequence of text lines. When no *dialect* is
        given it must support indexing, because its first line is used as
        the sample for dialect detection.
    :param dialect: ``csv`` dialect to use; sniffed from the first line
        when falsy.
    :param reference_header: accepted for backward compatibility; it is
        not used by this function.
    :param kwargs: extra formatting parameters passed to ``csv.reader``.
    :raises csv.Error: if the dialect cannot be sniffed or the data is
        malformed.
    """
    if not dialect:
        try:
            sample = unicode_csv_data[0]
        except IndexError:
            # Empty input: nothing to sniff, hence no rows to produce.
            return
        dialect = csv.Sniffer().sniff(sample)
        # On Python 2 sniffing text gives unicode attributes, which csv.py
        # rejects; str() is a no-op on Python 3.
        dialect.delimiter = str(dialect.delimiter)
        dialect.quotechar = str(dialect.quotechar)
    if not _PY2:
        # csv handles text natively on Python 3.
        for row in csv.reader(unicode_csv_data, dialect=dialect, **kwargs):
            yield row
        return
    # csv.py doesn't do Unicode on Python 2; encode temporarily as UTF-8 ...
    csv_reader = csv.reader(utf_8_encoder(unicode_csv_data),
                            dialect=dialect, **kwargs)
    for row in csv_reader:
        # ... and decode UTF-8 back to Unicode, cell by cell.
        yield [_text_type(cell, 'utf-8') for cell in row]


class UTF8Recoder(object):
    """
    Iterator that reads an encoded byte stream and reencodes each line
    to UTF-8 bytes.
    """

    def __init__(self, f, encoding):
        self.reader = codecs.getreader(encoding)(f)

    def __iter__(self):
        return self

    def next(self):
        return next(self.reader).encode("utf-8")

    # Python 3 spelling of the iterator protocol.
    __next__ = next


class UnicodeReader(object):
    """
    A CSV reader which will iterate over lines in the CSV file "f",
    which is encoded in the given encoding. "f" must yield bytes; every
    row is returned as a list of text cells.
    """

    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        if _PY2:
            # csv needs byte strings on Python 2: feed it UTF-8 lines.
            lines = UTF8Recoder(f, encoding)
        else:
            # csv needs text on Python 3: decode the stream directly.
            lines = codecs.getreader(encoding)(f)
        self.reader = csv.reader(lines, dialect=dialect, **kwds)

    def next(self):
        row = next(self.reader)
        if _PY2:
            return [_text_type(s, "utf-8") for s in row]
        return row

    # Python 3 spelling of the iterator protocol.
    __next__ = next

    def __iter__(self):
        return self


def _to_csv_cell(cell):
    """Convert one cell to the string type the local csv module expects.

    Non-string cells (numbers, ``None``, ...) are returned untouched so
    that ``csv.writer`` formats them itself.
    """
    if _PY2:
        # Byte strings are passed through and must already be UTF-8.
        if isinstance(cell, _text_type):
            return cell.encode("utf-8")
        return cell
    if isinstance(cell, bytes):
        return cell.decode("utf-8")
    return cell


class UnicodeWriter(object):
    """
    A CSV writer which will write rows to CSV file "f",
    which is encoded in the given encoding. "f" receives bytes.
    """

    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        # Redirect output to a queue
        if _PY2:
            self.queue = cStringIO.StringIO()
        else:
            # newline="" keeps the line terminator chosen by the dialect.
            self.queue = io.StringIO(newline="")
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        self.stream = f
        self.encoder = codecs.getincrementalencoder(encoding)()

    def writerow(self, row):
        """Format *row* as one CSV record and write it, encoded, to "f"."""
        self.writer.writerow([_to_csv_cell(cell) for cell in row])
        # Fetch the formatted record from the queue ...
        data = self.queue.getvalue()
        if _PY2:
            # (UTF-8 bytes on Python 2, already text on Python 3)
            data = data.decode("utf-8")
        # ... and reencode it into the target encoding
        data = self.encoder.encode(data)
        # write to the target stream
        self.stream.write(data)
        # empty queue; rewind explicitly because not every StringIO
        # implementation moves the position when truncating
        self.queue.seek(0)
        self.queue.truncate(0)

    def writerows(self, rows):
        """Write every row of *rows* with :meth:`writerow`."""
        for row in rows:
            self.writerow(row)
