def fetch_co_course_events(i):
    """Scrape the events of one campus import source.

    Downloads the page at ``i['url']``, follows the per-period links found
    in its "Termine und Ort" table, collects recurring and one-off
    appointments, and maps each room to a name from the ``places`` table.

    Parameters:
        i: mapping describing the import source; the keys ``'url'`` and
           ``'type'`` are read.

    Returns:
        A list of dicts with the keys ``'time'`` (datetime), ``'duration'``
        (int, minutes), ``'place'`` (str, may be empty) and ``'title'``
        (``i['type']``). The list is empty when the source URL cannot be
        fetched; in that case a message is flashed.

    Raises:
        ImportError: lxml is not installed (the caller is expected to
            handle this).
        IndexError: the page lacks an expected table, row or cell.
        ValueError: a scraped date/time does not match ``%d.%m.%Y %H:%M``.
    """
    # Imported locally so a missing lxml raises ImportError at call time,
    # where the caller can report it, instead of at module import.
    from lxml import html
    import http.client
    import urllib.request

    events = []
    try:
        with urllib.request.urlopen(i['url']) as response:
            remote_html = response.read()
    except (ValueError, OSError, http.client.HTTPException):
        # ValueError: malformed URL; OSError: URLError/HTTPError/socket
        # errors; HTTPException: broken HTTP responses.
        flash("Ungültige URL: '"+i['url']+"'")
        # Without the page there is nothing to parse.
        return events

    tablexpath = "//td[text()='Termine und Ort']/following::table[1]"
    basetable = html.fromstring(remote_html).xpath(tablexpath)[0]

    # parse recurring events: the source page itself plus every linked
    # period page
    toparse = [i['url']]
    for period_row in basetable.xpath("//table[@cellpadding='5']//tr[@class='hierarchy4' and td[@name='togglePeriodApp']]"):
        toparse.append(str(period_row.xpath("td[@name='togglePeriodApp']/a/@href")[0]))

    events_raw = []
    for link in toparse:
        # links starting with 'event' are relative to the campus base URL
        if link.startswith('event'):
            url = 'https://www.campus.rwth-aachen.de/rwth/all/'+link
        else:
            url = link
        with urllib.request.urlopen(url) as response:
            text = response.read()
        dom = html.fromstring(text).xpath(tablexpath)[0]
        # we get the "heading" row, from it extract the room and time.
        # best way to get it is to match on the picture -.-
        baserow = dom.xpath("//table[@cellpadding='5']//tr[@class='hierarchy4' and td[@name='togglePeriodApp']/*/img[@src='../../server/img/minus.gif']]")
        if not baserow:
            continue
        baserow = baserow[0]
        rowdata = {'dates': []}

        # "kein raum vergeben" is a special case, else use campus id
        room_texts = baserow.xpath("td[6]/text()")
        room_links = baserow.xpath("td[6]/a")
        if room_texts[0] == 'Kein Raum vergeben':
            rowdata['place'] = ''
        elif room_links:
            rowdata['place'] = room_links[0].text_content()
        else:
            rowdata['place'] = room_texts[0].split(' ', 1)[0]

        rowdata['start'] = baserow.xpath("td[3]/text()")[0]
        rowdata['end'] = baserow.xpath("td[5]/text()")[0]
        rowdata['dates'] = baserow.getparent().xpath("tr[@class='hierarchy5']//td[@colspan='3']/text()")
        events_raw.append(rowdata)

    # parse single appointments
    single_headers = basetable.xpath("//table[@cellpadding='3']/tr/td[text()='Einmalige Termine:']")
    if single_headers:
        # td -> tr -> table
        singletable = single_headers[0].getparent().getparent()
        for row in singletable.xpath("tr/td[2]"):
            rowdata = {}
            room_text = row.xpath("text()[2]")[0]
            room_links = row.xpath("a")
            if room_text == 'Kein Raum vergeben':
                rowdata['place'] = ''
            elif room_links:
                rowdata['place'] = room_links[0].text_content()
            else:
                rowdata['place'] = room_text.split(' ', 1)[0]

            # date, start and end are cut out of the first text node by
            # fixed character positions
            when = row.xpath("text()[1]")[0]
            rowdata['dates'] = [when[4:14]]
            rowdata['start'] = when[17:22]
            rowdata['end'] = when[27:32]
            events_raw.append(rowdata)

    # now we have to filter our data and do some lookups
    fmt = "%d.%m.%Y %H:%M"
    for raw in events_raw:
        if not raw['dates']:
            continue
        # The room is the same for every date of this entry, so resolve it
        # once instead of once per date.
        place = str(raw['place'])
        if place != '':
            dbplace = query("SELECT name FROM places WHERE (campus_room = ?) OR (campus_name = ?) OR ((NOT campus_name) AND name = ?)", place, place, place)
            if dbplace:
                resolved = dbplace[0]['name']
            else:
                resolved = 'Unbekannter Ort ('+place+')'
        else:
            resolved = ''

        for day in raw['dates']:
            start = datetime.strptime("%s %s" % (day, raw['start']), fmt)
            end = datetime.strptime("%s %s" % (day, raw['end']), fmt)
            events.append({
                'time': start,
                # .seconds (not total_seconds()) is kept from the original
                'duration': int((end - start).seconds/60),
                'place': resolved,
                'title': i['type'],
            })
    # it is parsed.
    return events
best way to get it is to match on the picture -.- - baserow = dom.xpath("//table[@cellpadding='5']//tr[@class='hierarchy4' and td[@name='togglePeriodApp']/*/img[@src='../../server/img/minus.gif']]") - if not baserow: - continue - baserow = baserow[0] - rowdata = {'dates': []} - - # "kein raum vergeben" is a special case, else use campus id - if baserow.xpath("td[6]/text()")[0] == 'Kein Raum vergeben': - rowdata['place'] = '' - elif baserow.xpath("td[6]/a"): - rowdata['place'] = baserow.xpath("td[6]/a")[0].text_content() - else: - rowdata['place'] = baserow.xpath("td[6]/text()")[0].split(' ',1)[0] - - rowdata['start'] = baserow.xpath("td[3]/text()")[0] - rowdata['end'] = baserow.xpath("td[5]/text()")[0] - rowdata['dates'] = baserow.getparent().xpath("tr[@class='hierarchy5']//td[@colspan='3']/text()") - events_raw.append(rowdata) - - # parse single appointments - if basetable.xpath("//table[@cellpadding='3']/tr/td[text()='Einmalige Termine:']"): - singletable = basetable.xpath("//table[@cellpadding='3']/tr/td[text()='Einmalige Termine:']")[0].getparent().getparent() - for row in singletable.xpath("tr/td[2]"): - rowdata = {} - if row.xpath("text()[2]")[0] == 'Kein Raum vergeben': - rowdata['place'] = '' - elif row.xpath("a"): - rowdata['place'] = row.xpath("a")[0].text_content() - else: - rowdata['place'] = row.xpath("text()[2]")[0].split(' ',1)[0] - - rowdata['dates'] = [row.xpath("text()[1]")[0][4:14]] - rowdata['start'] = row.xpath("text()[1]")[0][17:22] - rowdata['end'] = row.xpath("text()[1]")[0][27:32] - events_raw.append(rowdata) - - #now we have to filter our data and do some lookups - for j in events_raw: - for k in j['dates']: - e = {} - fmt= "%d.%m.%Y %H:%M" - e['time'] = datetime.strptime("%s %s"%(k,j['start']) ,fmt) - e['duration'] = int((datetime.strptime("%s %s"%(k,j['end']) ,fmt) - e['time']).seconds/60) - j['place'] = str(j['place']) - if j['place'] != '': - dbplace = query("SELECT name FROM places WHERE (campus_room = ?) OR (campus_name = ?) 
OR ((NOT campus_name) AND name = ?)",j['place'],j['place'],j['place']) - if dbplace: - e['place'] = dbplace[0]['name'] - else: - e['place'] = 'Unbekannter Ort ('+j['place']+')' - else: - e['place'] = '' - e['title'] = i['type'] - events.append(e) - # it is parsed. - - - + events += fetch_co_course_events(i) except ImportError: flash('python-lxml not found, campus import will not work.')