legacy.py 8.01 KB
Newer Older
1
from datetime import datetime
2
from fuzzywuzzy import fuzz, process
3
import tempfile
4

5
from models.database import Todo, OldTodo, Protocol, ProtocolType, TodoMail
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
from shared import db

import config

def lookup_todo_id(old_candidates, new_who, new_description):
    # Check for perfect matches
    for candidate in old_candidates:
        if candidate.who == new_who and candidate.description == new_description:
            return candidate.old_id
    # Accept if who has been changed
    for candidate in old_candidates:
        if candidate.description == new_description:
            return candidate.old_id
    # Do fuzzy matching on description
    content_to_number = {
        candidate.description: candidate.old_id
        for candidate in old_candidates
    }
    best_match, best_match_score = process.extractOne(
        new_description, content_to_number.keys())
    if best_match_score >= config.FUZZY_MIN_SCORE:
Administrator's avatar
Administrator committed
27
        print("Used fuzzy matching on '{}', got '{}' with score {}.".format(
28
29
30
            new_description, best_match, best_match_score))
        return content_to_number[best_match]
    else:
Administrator's avatar
Administrator committed
31
        print("Best match for '{}' is '{}' with score {}, rejecting.".format(
32
33
34
            new_description, best_match, best_match_score))
        return None

35
36
37
INSERT_PROTOCOLTYPE = "INSERT INTO `protocolManager_protocoltype`"
INSERT_PROTOCOL = "INSERT INTO `protocolManager_protocol`"
INSERT_TODO = "INSERT INTO `protocolManager_todo`"
38
INSERT_TODOMAIL = "INSERT INTO `protocolManager_todonamemailassignment`"
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60

def import_old_protocols(sql_text):
    protocoltype_lines = []
    protocol_lines = []
    for line in sql_text.splitlines():
        if line.startswith(INSERT_PROTOCOLTYPE):
            protocoltype_lines.append(line)
        elif line.startswith(INSERT_PROTOCOL):
            protocol_lines.append(line)
    if (len(protocoltype_lines) == 0
    or len(protocol_lines) == 0):
        raise ValueError("Necessary lines not found.")
    type_id_to_handle = {}
    for type_line in protocoltype_lines:
        for id, handle, name, mail, protocol_id in _split_insert_line(type_line):
            type_id_to_handle[int(id)] = handle.lower()
    protocols = []
    for protocol_line in protocol_lines:
        for (protocol_id, old_type_id, date, source, textsummary, htmlsummary,
            deleted, sent, document_id) in _split_insert_line(protocol_line):
            date = datetime.strptime(date, "%Y-%m-%d")
            handle = type_id_to_handle[int(old_type_id)]
61
62
            protocoltype = ProtocolType.query.filter(ProtocolType.short_name.ilike(handle)).first()
            if protocoltype is None:
63
                raise KeyError("No protocoltype for handle '{}'.".format(handle))
64
            protocol = Protocol(protocoltype_id=protocoltype.id, date=date, source=source)
65
66
67
68
69
70
71
            db.session.add(protocol)
            db.session.commit()
            import tasks
            protocols.append(protocol)
    for protocol in sorted(protocols, key=lambda p: p.date):
        print(protocol.date)
        tasks.parse_protocol(protocol)
72
73
74
75
76
77
78
79
80
81
82
83
84
85

def import_old_todomails(sql_text):
    todomail_lines = []
    for line in sql_text.splitlines():
        if line.startswith(INSERT_TODOMAIL):
            todomail_lines.append(line)
    if len(todomail_lines) == 0:
        raise ValueError("Necessary lines not found.")
    for line in todomail_lines:
        for assignment_id, name, mail in _split_insert_line(line):
            todomail = TodoMail(name, mail)
            print(todomail)
            db.session.add(todomail)
            db.session.commit()
86
    print("done importing")
87

88
89
90
91
92
93

def import_old_todos(sql_text):
    protocoltype_lines = []
    protocol_lines = []
    todo_lines = []
    for line in sql_text.splitlines():
94
        if line.startswith(INSERT_PROTOCOLTYPE):
95
            protocoltype_lines.append(line)
96
        elif line.startswith(INSERT_PROTOCOL):
97
            protocol_lines.append(line)
98
        elif line.startswith(INSERT_TODO):
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
            todo_lines.append(line)
    if (len(protocoltype_lines) == 0
    or len(protocol_lines) == 0
    or len(todo_lines) == 0):
        raise ValueError("Necessary lines not found.")
    type_id_to_handle = {}
    for type_line in protocoltype_lines:
        for id, handle, name, mail, protocol_id in _split_insert_line(type_line):
            type_id_to_handle[int(id)] = handle.lower()
    protocol_id_to_key = {}
    for protocol_line in protocol_lines:
        for (protocol_id, type_id, date, source, textsummary, htmlsummary,
            deleted, sent, document_id) in _split_insert_line(protocol_line):
            handle = type_id_to_handle[int(type_id)]
            date_string = date [2:]
            protocol_id_to_key[int(protocol_id)] = "{}-{}".format(handle, date_string)
    todos = []
    for todo_line in todo_lines:
        for old_id, protocol_id, who, what, start_time, end_time, done in _split_insert_line(todo_line):
            protocol_id = int(protocol_id)
            if protocol_id not in protocol_id_to_key:
Administrator's avatar
Administrator committed
120
                print("Missing protocol with ID {} for Todo {}".format(protocol_id, what))
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
                continue
            todo = OldTodo(old_id=old_id, who=who, description=what,
                protocol_key=protocol_id_to_key[protocol_id])
            todos.append(todo)
    OldTodo.query.delete()
    db.session.commit()
    for todo in todos:
        db.session.add(todo)
    db.session.commit()
        
def _split_insert_line(line):
    insert_part, values_part = line.split("VALUES", 1)
    return _split_base_level(values_part)

def _split_base_level(text, begin="(", end=")", separator=",", string_terminator="'", line_end=";", ignore=" ", escape="\\"):
    raw_parts = []
    current_part = None
    index = 0
    in_string = False
    escaped = False
    for char in text:
        if escaped:
            current_part += char
            escaped = False
        elif current_part is None:
            if char == ignore:
                continue
            elif char == begin:
                current_part = ""
            elif char == line_end:
                break
            elif char == separator:
                pass
            else:
                raise ValueError(
                    "Found invalid char '{}' at position {}".format(
                        char, index))
        else:
            if in_string:
                current_part += char
                if char == escape:
                    escaped = True
                elif char == string_terminator:
                    in_string = False
            else:
                if char == string_terminator:
                    current_part += char
                    in_string = True
                elif char == end:
                    raw_parts.append(current_part)
                    current_part = None
                else:
                    current_part += char
        index += 1
    parts = []
    for part in raw_parts:
        fields = []
        current_field = ""
        in_string = False
        escaped = False
        for char in part:
            if escaped:
183
184
185
186
187
188
189
190
191
192
                if char == "n":
                    current_field += "\n"
                elif char == "r":
                    current_field += "\r"
                elif char == "t":
                    current_field += "\t"
                else:
                    if char not in "\"'\\":
                        print("escaped char: '{}'".format(char))
                    current_field += char
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
                escaped = False
            elif in_string:
                if char == escape:
                    escaped = True
                elif char == string_terminator:
                    in_string = False
                else:
                    current_field += char
            else:
                if char == string_terminator:
                    in_string = True
                elif char == separator:
                    fields.append(current_field)
                    current_field = ""
                else:
                    current_field += char
        if len(current_field) > 0:
            fields.append(current_field)
        parts.append(fields)
    return parts