Commit 44bb8aea authored by Robin Sonnabend
Browse files

Added script to retrieve the RWTH Amtliche Bekanntmachungen

import urllib.parse
import urllib.request
import sys
import os
import regex as re
from datetime import datetime
# Index pages to scan for announcements; main() fetches each entry in turn.
# NOTE(review): the URL entries are not present in this copy of the file and
# the list literal was left unterminated — TODO restore the real index URLs.
index_urls = []
# Prefix prepended to every link found on an index page before downloading.
# NOTE(review): empty here; confirm whether the links on the index pages are
# absolute or whether a host prefix belongs in this constant.
baseurl = ""
# HTTP request headers sent with every index-page request (see main()).
# A browser-like User-Agent is supplied explicitly instead of urllib's default.
hdr = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Charset': 'utf-8;q=0.7,*;q=0.3',
    'Accept-Encoding': 'none',
    'Accept-Language': 'en-US,en;q=0.8',
    'Connection': 'keep-alive',
}
# Regular expression matching one <li> entry of an index page. The entry is a
# sequence of <div> cells captured as named groups, in this order:
#   title   - text of the first cell
#   link    - href of the anchor (wrapping an <img>) in the second cell
#   date    - text of the third cell (parsed as DD.MM.YYYY in main())
#   number  - text of the fourth cell (used for the raw file name)
#   version - text of the fifth cell (used for the by-title file name)
# A sixth cell is matched but not captured.
# The groups use the (?<name>...) spelling, which is why the third-party
# `regex` module is imported under the name `re` instead of the stdlib module.
PATTERN = r'<li>\s*<div[^>]*>(?<title>[^<]+)<\/div>\s*<div[^>]*><a href="(?<link>[^"]+)"[^>]*><img[^>]*><\/a><\/div>\s*<div[^>]*>(?<date>[^<]+)<\/div>\s*<div[^>]*>(?<number>[^<]+)<\/div>\s*<div[^>]*>(?<version>[^<]+)<\/div>\s*<div[^>]*>[^<]*<\/div>\s*<\/li>'
# Directory receiving the downloaded PDFs, one file per announcement number.
RAW_DIR = "raw"
# Directory holding one sub-directory per title with symlinks into RAW_DIR.
TITLE_DIR = "by-title"
def main():
    """Download every announcement listed on the configured index pages.

    For each URL in ``index_urls`` the page is fetched with the ``hdr``
    request headers, decoded as ISO-8859-1 and scanned with ``PATTERN``.
    Every match is downloaded to ``RAW_DIR/<number>.pdf`` (slashes in the
    number replaced by dashes) and made reachable through a symlink at
    ``TITLE_DIR/<title>/<version>.pdf``.

    Raises:
        urllib.error.URLError: if an index page or a document cannot be
            fetched.
        ValueError: if an entry's date is not in ``DD.MM.YYYY`` form.
    """
    # NOTE(review): the `response.read().decode(...)` and `match.group(...)`
    # calls were reconstructed from truncated lines — confirm against the
    # original commit.
    expr = re.compile(PATTERN)

    # The output directories do not depend on the index page, so create them
    # once up front rather than once per page.
    os.makedirs(RAW_DIR, exist_ok=True)
    os.makedirs(TITLE_DIR, exist_ok=True)

    for indexurl in index_urls:
        req = urllib.request.Request(indexurl, headers=hdr)
        with urllib.request.urlopen(req) as response:
            index = response.read().decode("iso-8859-1")

        for match in expr.finditer(index):
            title = match.group("title")
            link = match.group("link")
            # The parsed date is not used further; parsing it still rejects
            # entries whose date cell is malformed.
            date = datetime.strptime(match.group("date"), "%d.%m.%Y")
            number = match.group("number")
            version = match.group("version")
            print(f"Retrieving '{title}' ({version}) ...")

            # Announcement numbers contain "/", which must not end up in a
            # file name.
            local_name = "{}.pdf".format(number.replace("/", "-"))
            # Absolute path so the symlink stays valid regardless of the
            # directory it is resolved from.
            raw_file_name = os.path.abspath(os.path.join(RAW_DIR, local_name))
            title_dir = os.path.join(TITLE_DIR, title)
            title_file_name = os.path.join(title_dir, f"{version}.pdf")

            urllib.request.urlretrieve(baseurl + link, raw_file_name)

            os.makedirs(title_dir, exist_ok=True)
            # Create the link only when nothing occupies the path yet:
            # os.symlink raises FileExistsError on an existing target.
            # lexists also detects a dangling symlink left by an earlier run.
            if not os.path.lexists(title_file_name):
                os.symlink(raw_file_name, title_file_name)
# Run the download only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment