Commit 44bb8aea authored by Robin Sonnabend

Added script to retrieve the RWTH Amtliche Bekanntmachungen

parents
#!/usr/bin/python3
import urllib.parse
import urllib.request
import sys
import os
import regex as re
from datetime import datetime
# Listing pages that main() downloads and scans with PATTERN (per their URLs:
# the current announcements list and the archive list).
index_urls = [
    "https://www.rwth-aachen.de/cms/root/Die-RWTH/Aktuell/~xhf/Amtliche-Bekanntmachungen/?page=1&showall=1",
    "https://www.rwth-aachen.de/cms/root/Die-RWTH/Aktuell/~bpfv/Liste-Archiv-Amtliche-Bekanntmachungen/?page=1&showall=1",
]

# Prefix combined with the href captured from each listing entry.
# NOTE(review): plain http here while the index pages use https -- confirm
# whether that is intentional.
baseurl = "http://www.rwth-aachen.de"

# Browser-like request headers sent when fetching the index pages.
hdr = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Charset': 'utf-8;q=0.7,*;q=0.3',
    'Accept-Encoding': 'none',
    'Accept-Language': 'en-US,en;q=0.8',
    'Connection': 'keep-alive',
}

# One <li> entry of a listing page, one piece per <div> column.  The named
# groups use the (?<name>...) spelling, which this file compiles with the
# `regex` package (imported as `re`).
PATTERN = (
    r'<li>\s*'
    r'<div[^>]*>(?<title>[^<]+)<\/div>\s*'
    r'<div[^>]*><a href="(?<link>[^"]+)"[^>]*><img[^>]*><\/a><\/div>\s*'
    r'<div[^>]*>(?<date>[^<]+)<\/div>\s*'
    r'<div[^>]*>(?<number>[^<]+)<\/div>\s*'
    r'<div[^>]*>(?<version>[^<]+)<\/div>\s*'
    r'<div[^>]*>[^<]*<\/div>\s*'
    r'<\/li>'
)

# Directory receiving the downloaded files, named after the announcement number.
RAW_DIR = "raw"
# Directory holding one sub-directory per title with a symlink per version.
TITLE_DIR = "by-title"
def _safe_component(text):
    """Return *text* made safe for use as a single file or directory name.

    Path separators are replaced with "-" so a scraped value cannot add
    directory levels or turn the joined path into an absolute one; the names
    "", "." and ".." are replaced with "_".
    """
    cleaned = text.replace("/", "-")
    if os.sep != "/":
        cleaned = cleaned.replace(os.sep, "-")
    if cleaned in ("", ".", ".."):
        return "_"
    return cleaned


def _fetch_index(indexurl):
    """Download the listing page at *indexurl* and return its body as text."""
    req = urllib.request.Request(indexurl, headers=hdr)
    with urllib.request.urlopen(req) as response:
        return response.read().decode("iso-8859-1")


def _replace_symlink(target, link_name):
    """Create a symlink *link_name* -> *target*, replacing an existing entry."""
    # Remove unconditionally instead of testing os.path.exists() first: that
    # test follows symlinks and reports False for a dangling link, which would
    # then make os.symlink() fail with FileExistsError.
    try:
        os.remove(link_name)
    except FileNotFoundError:
        pass
    os.symlink(target, link_name)


def main():
    """Download every document listed on the index pages.

    For each entry matched by PATTERN on each page in index_urls, the linked
    document is saved as RAW_DIR/<number>.pdf and a symlink
    TITLE_DIR/<title>/<version>.pdf pointing at it is created, replacing any
    previous entry at that path.  Title, number and version come from the
    scraped page and are sanitised before being used as path components.

    Raises:
        urllib.error.URLError: if a page or a document cannot be retrieved.
        OSError: if a directory, file or symlink cannot be created.
    """
    expr = re.compile(PATTERN)
    # The two output roots do not depend on the page being processed.
    os.makedirs(RAW_DIR, exist_ok=True)
    os.makedirs(TITLE_DIR, exist_ok=True)

    for indexurl in index_urls:
        index = _fetch_index(indexurl)
        for match in expr.finditer(index):
            title = match.group("title")
            link = match.group("link")
            number = match.group("number")
            version = match.group("version")
            print(f"Retrieving '{title}' ({version}) ...")

            local_name = "{}.pdf".format(_safe_component(number))
            raw_file_name = os.path.abspath(os.path.join(RAW_DIR, local_name))
            title_dir = os.path.join(TITLE_DIR, _safe_component(title))
            title_file_name = os.path.join(
                title_dir, "{}.pdf".format(_safe_component(version)))

            # urljoin yields the same URL as plain concatenation for
            # root-relative hrefs and also copes with absolute or
            # non-rooted ones.
            urllib.request.urlretrieve(
                urllib.parse.urljoin(baseurl, link), raw_file_name)
            os.makedirs(title_dir, exist_ok=True)
            _replace_symlink(raw_file_name, title_file_name)
if __name__ == "__main__":
    # sys.exit rather than the bare exit() helper: the latter is installed by
    # the site module for interactive use and is absent under `python -S`.
    sys.exit(main())
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment