#!/usr/bin/python3
import urllib.parse
import urllib.request
import sys
import os
import regex as re
from datetime import datetime

# Index pages that list the announcements; each one is fetched and scanned
# for document entries.
index_urls = [
    "https://www.rwth-aachen.de/cms/root/Die-RWTH/Aktuell/~xhf/Amtliche-Bekanntmachungen/?page=1&showall=1",
    "https://www.rwth-aachen.de/cms/root/Die-RWTH/Aktuell/~bpfv/Liste-Archiv-Amtliche-Bekanntmachungen/?page=1&showall=1",
]

# Base against which the document links found in the index pages are
# resolved. HTTPS is used so downloads go over the same scheme as the index
# pages above.
baseurl = "https://www.rwth-aachen.de"

# Request headers sent along when fetching the index pages.
hdr = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Charset': 'utf-8;q=0.7,*;q=0.3',
    'Accept-Encoding': 'none',
    'Accept-Language': 'en-US,en;q=0.8',
    'Connection': 'keep-alive',
}

# Matches one <li> entry of an index page and captures the named groups
# title, link, date, number and version.
# NOTE: the (?<name>...) group syntax is accepted by the third-party `regex`
# module this file imports as `re`; the standard-library `re` module only
# understands (?P<name>...).
PATTERN = r'<li>\s*<div[^>]*>(?<title>[^<]+)<\/div>\s*<div[^>]*><a href="(?<link>[^"]+)"[^>]*><img[^>]*><\/a><\/div>\s*<div[^>]*>(?<date>[^<]+)<\/div>\s*<div[^>]*>(?<number>[^<]+)<\/div>\s*<div[^>]*>(?<version>[^<]+)<\/div>\s*<div[^>]*>[^<]*<\/div>\s*<\/li>'

# Output directories (relative to the current working directory).
RAW_DIR = "raw"              # downloaded PDFs
TITLE_DIR = "by-title"       # symlinks grouped by document title
NUMBER_DIR = "by-number"     # symlinks named after the document number

# Character substitutions applied to a lower-cased title: path separators and
# spaces become dashes/underscores, parentheses are dropped, and the
# lower-case German umlauts are transliterated.
_TITLE_CHAR_MAP = str.maketrans({
    "/": "-",
    " ": "_",
    "(": "",
    ")": "",
    "ä": "ae",
    "ö": "oe",
    "ü": "ue",
})


def prepare_title(title):
    """Return *title* lower-cased and rewritten for use as a file name.

    Slashes turn into ``-``, spaces into ``_``, parentheses are removed and
    ``ä``/``ö``/``ü`` are replaced by ``ae``/``oe``/``ue``. All other
    characters are passed through unchanged.
    """
    lowered = title.lower()
    return lowered.translate(_TITLE_CHAR_MAP)
def _replace_symlink(target, link_name):
    """Create a symlink *link_name* pointing at *target*, replacing any old one.

    ``os.path.lexists`` is used on purpose: ``os.path.exists`` follows
    symlinks and reports ``False`` for a dangling link, in which case the
    stale link would not be removed and ``os.symlink`` would fail with
    ``FileExistsError``.
    """
    if os.path.lexists(link_name):
        os.remove(link_name)
    os.symlink(target, link_name)


def _download(url, destination):
    """Download *url* to *destination* without leaving a partial file there.

    The data is written to ``<destination>.part`` first and only renamed to
    the final name once the transfer has finished, so an interrupted run
    cannot leave a truncated file that later runs would mistake for a
    complete download.
    """
    partial = destination + ".part"
    urllib.request.urlretrieve(url, partial)
    os.replace(partial, destination)


def main():
    """Mirror all documents listed on the index pages and link them up.

    For every URL in ``index_urls`` the page is fetched and scanned with
    ``PATTERN``. Each matched entry is downloaded into ``RAW_DIR`` (unless a
    file with that name already exists) and symlinked from
    ``TITLE_DIR/<title>/<date>-<version>.pdf`` and from
    ``NUMBER_DIR/<number>.pdf``.

    Raises:
        urllib.error.URLError: if an index page or a document cannot be
            fetched.
        ValueError: if a matched date is not in ``DD.MM.YYYY`` format.
        OSError: if a directory, file or symlink cannot be created.
    """
    expr = re.compile(PATTERN)

    # The output directories do not depend on the index page; create them once.
    for directory in (RAW_DIR, TITLE_DIR, NUMBER_DIR):
        os.makedirs(directory, exist_ok=True)

    for indexurl in index_urls:
        req = urllib.request.Request(indexurl, headers=hdr)
        with urllib.request.urlopen(req) as response:
            index = response.read().decode("iso-8859-1")

        for match in expr.finditer(index):
            title = match.group("title")
            link = match.group("link")
            date = datetime.strptime(match.group("date"), "%d.%m.%Y")
            number = match.group("number")
            version = match.group("version")

            local_name = "{}.pdf".format(number.replace("/", "-"))
            # Absolute path so the symlinks below resolve regardless of the
            # directory they live in.
            raw_file_name = os.path.abspath(os.path.join(RAW_DIR, local_name))
            title_dir = os.path.join(TITLE_DIR, prepare_title(title))
            title_file_name = os.path.join(
                title_dir, f"{date.strftime('%Y-%m-%d')}-{version}.pdf")
            number_file_name = os.path.join(
                NUMBER_DIR, prepare_title(f"{number}.pdf"))

            if not os.path.exists(raw_file_name):
                print("Retrieving '{}' ({}) ...".format(title, version))
                # urljoin handles root-relative, relative and absolute hrefs;
                # plain string concatenation only works for the first kind.
                _download(urllib.parse.urljoin(baseurl, link), raw_file_name)

            os.makedirs(title_dir, exist_ok=True)
            _replace_symlink(raw_file_name, title_file_name)
            _replace_symlink(raw_file_name, number_file_name)

if __name__ == "__main__":
    # sys.exit is used instead of the bare exit() helper: the latter is
    # injected by the `site` module for interactive use and is not available
    # when Python runs without it (e.g. `python -S`).
    sys.exit(main())