# path: root/scripts/pre_import_sra_files.py
# blob: df00d3ef59b70e8401526dabc13e0e1dd4737cc3
import unicodecsv
import datetime

from django.conf import settings

from ishtar_common.data_importer import Importer


def get_year(value, formats=('%d/%m/%Y',)):
    """
    Extract the year from a date string.

    Each format of `formats` is tried in order and the year of the first
    successful parse is returned.

    :param value: date as a string (day/month/year by default)
    :param formats: iterable of `strptime` formats to try, in order
    :return: the year as an int, or None when no format matches
    """
    for fmt in formats:
        try:
            return datetime.datetime.strptime(value, fmt).year
        except (ValueError, TypeError):
            # ValueError: the value does not match this format
            # TypeError: the value is not a string (e.g. None)
            # in both cases give the next format a chance
            continue
    return None

# (year, external_id) couples already generated: used to detect duplicates
index_list = []


def treatment(data):
    """
    Compute the year, the numeric index and the external ID of a CSV row.

    The internal reference is read from column 37, the creation date from
    column 34 and the reception date from column 19 (0-based indexes).

    The internal reference is either "YYYY-NNNN", "YYYY.NNNN" or a plain
    integer. When the reference carries a four-digit year, this year
    overrides the one extracted from the dates.

    Every generated (year, external_id) couple is recorded in the
    module-level `index_list`.

    :param data: the row, as a list of strings
    :return: a (year, index, external_id) tuple
    :raises ValueError: when the reference cannot be parsed, when no year
        can be determined or when the external ID has already been generated
    """
    internal_ref = data[37].strip()
    creation = data[34].strip()
    reception = data[19].strip()
    # the creation date is preferred, the reception date is the fallback
    yr = get_year(creation) or get_year(reception)

    idx, year = None, None
    for separator in ('-', '.'):
        if separator not in internal_ref:
            continue
        parts = internal_ref.split(separator)
        if len(parts) == 2 and len(parts[0]) == 4:  # 2007-XXXX, 2011.XXXX
            try:
                year, idx = int(parts[0]), int(parts[1])
            except ValueError:
                year, idx = None, None
        # only the first separator found is considered
        break
    if idx is None:
        # no "year + index" pattern: the whole reference must be the index
        # (a parsed index of 0 is valid and must not end up here)
        idx = int(internal_ref)
    if year:
        # the year of the reference takes precedence over the dates
        yr = year
    if not yr:
        # we should absolutely have a year!
        raise ValueError(
            "no year found for reference '{}'".format(internal_ref))

    external_id = "{}{}-{}".format(settings.ISHTAR_LOCAL_PREFIX, yr, idx)
    if (yr, external_id) in index_list:
        raise ValueError("duplicate external ID '{}'".format(external_id))
    index_list.append((yr, external_id))
    return yr, idx, external_id


# Read 'plouf.csv', append the year, numeric index and external ID columns to
# each row and write the result to 'plouf2.csv'. Rows that cannot be treated
# are reported on the standard output and left out of the result.
new_datas = []
# unicodecsv decodes the bytes itself: the file must be opened in binary mode
with open('plouf.csv', 'rb') as csv_file:
    reader = unicodecsv.reader(csv_file, encoding='utf-8')
    for line_idx, data in enumerate(reader):
        if line_idx < 3:
            # headers
            data.append('annee')
            data.append('identifiant numerique')
            data.append('external_id')
            new_datas.append(data)
            continue
        try:
            year, num_idx, external_id = treatment(data)
        except Exception as e:
            # best effort: report the faulty line and carry on
            print("Line {}: {}".format(line_idx + 1, e))
            continue
        data.append(year)
        data.append(num_idx)
        data.append(external_id)
        new_datas.append(data)

csv = Importer()._get_csv(new_datas, empty=u'')
# the content is encoded before being written: binary mode is required
with open('plouf2.csv', 'wb') as fle:
    fle.write(csv.encode('utf-8'))