Skip to content
Snippets Groups Projects
Commit 990ec4e1 authored by Robin Sonnabend
Browse files

Also categorize by number

parent b5b36e7e
No related branches found
No related tags found
No related merge requests found
raw/
by-*/
......@@ -23,6 +23,11 @@ PATTERN = r'<li>\s*<div[^>]*>(?<title>[^<]+)<\/div>\s*<div[^>]*><a href="(?<link
RAW_DIR = "raw"
TITLE_DIR = "by-title"
NUMBER_DIR = "by-number"
# Single-pass translation table used by prepare_title: path separators and
# spaces are made filename-safe, parentheses are dropped, and lowercase German
# umlauts are transliterated to ASCII digraphs. None of the replacement strings
# contains a character that is itself a key, so one pass gives the same result
# as applying the replacements one after another.
_TITLE_TRANSLATION = str.maketrans({
    "/": "-",
    " ": "_",
    "(": "",
    ")": "",
    "ä": "ae",
    "ö": "oe",
    "ü": "ue",
})


def prepare_title(title):
    """Return a lowercase, filesystem-friendly form of *title*.

    The title is lowercased, "/" becomes "-", spaces become "_",
    parentheses are removed, and "ä"/"ö"/"ü" are written as
    "ae"/"oe"/"ue". All other characters are kept unchanged.

    Args:
        title: The text to normalise (a document title or a file name).

    Returns:
        The normalised string.
    """
    # Lowercase first so that "Ä"/"Ö"/"Ü" are matched by the table as well.
    return title.lower().translate(_TITLE_TRANSLATION)
def main():
expr = re.compile(PATTERN)
......@@ -33,23 +38,28 @@ def main():
index = response.read().decode("iso-8859-1")
os.makedirs(RAW_DIR, exist_ok=True)
os.makedirs(TITLE_DIR, exist_ok=True)
os.makedirs(NUMBER_DIR, exist_ok=True)
for match in expr.finditer(index):
title = match.group("title")
link = match.group("link")
date = datetime.strptime(match.group("date"), "%d.%m.%Y")
number = match.group("number")
version = match.group("version")
print("Retrieving '{}' ({}) ...".format(title, version))
local_name = "{}.pdf".format(number.replace("/", "-"))
raw_file_name = os.path.abspath(os.path.join(RAW_DIR, local_name))
title_dir = os.path.join(TITLE_DIR, title.replace("/", "-"))
title_dir = os.path.join(TITLE_DIR, prepare_title(title))
title_file_name = os.path.join(title_dir, f"{date.strftime('%Y-%m-%d')}-{version}.pdf")
number_file_name = os.path.join(NUMBER_DIR, prepare_title(f"{number}.pdf"))
if not os.path.exists(raw_file_name):
print("Retrieving '{}' ({}) ...".format(title, version))
urllib.request.urlretrieve(baseurl + link, raw_file_name)
os.makedirs(title_dir, exist_ok=True)
if os.path.exists(title_file_name):
os.remove(title_file_name)
os.symlink(raw_file_name, title_file_name)
if os.path.exists(number_file_name):
os.remove(number_file_name)
os.symlink(raw_file_name, number_file_name)
if __name__ == "__main__":
exit(main())
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment