Commit 990ec4e1 by Robin Sonnabend

Also categorize by number

parent b5b36e7e
raw/
by-*/
......@@ -23,6 +23,11 @@ PATTERN = r'<li>\s*<div[^>]*>(?<title>[^<]+)<\/div>\s*<div[^>]*><a href="(?<link
# Directory the PDF files are downloaded into, one file per document number.
RAW_DIR = "raw"
# Directory of per-title subdirectories holding symlinks to the raw files,
# named by date and version.
TITLE_DIR = "by-title"
# Directory of symlinks to the raw files, named by document number.
NUMBER_DIR = "by-number"
# Character-level rewrite rules applied to lower-cased titles: path
# separators and spaces become safe punctuation, parentheses are dropped,
# and German umlauts are transliterated to their two-letter ASCII form.
_TITLE_TRANSLATION = str.maketrans({
    "/": "-",
    " ": "_",
    "(": None,
    ")": None,
    "ä": "ae",
    "ö": "oe",
    "ü": "ue",
})


def prepare_title(title):
    """Return a file-system-friendly variant of *title*.

    The title is lower-cased first, so upper-case umlauts are covered by
    the same rules as their lower-case forms. Then, in a single pass:

    * ``/`` becomes ``-`` (so the result never contains a path separator),
    * a space becomes ``_``,
    * ``(`` and ``)`` are removed,
    * ``ä``, ``ö``, ``ü`` become ``ae``, ``oe``, ``ue``.

    All other characters are kept unchanged.

    Args:
        title: The text to convert.

    Returns:
        The converted string; an empty input yields an empty string.
    """
    # None of the replacement outputs is itself a key of the table, so one
    # translate pass gives the same result as replacing the characters one
    # after another.
    return title.lower().translate(_TITLE_TRANSLATION)
def main():
expr = re.compile(PATTERN)
......@@ -33,23 +38,28 @@ def main():
index = response.read().decode("iso-8859-1")
os.makedirs(RAW_DIR, exist_ok=True)
os.makedirs(TITLE_DIR, exist_ok=True)
os.makedirs(NUMBER_DIR, exist_ok=True)
for match in expr.finditer(index):
title = match.group("title")
link = match.group("link")
date = datetime.strptime(match.group("date"), "%d.%m.%Y")
number = match.group("number")
version = match.group("version")
print("Retrieving '{}' ({}) ...".format(title, version))
local_name = "{}.pdf".format(number.replace("/", "-"))
raw_file_name = os.path.abspath(os.path.join(RAW_DIR, local_name))
title_dir = os.path.join(TITLE_DIR, title.replace("/", "-"))
title_dir = os.path.join(TITLE_DIR, prepare_title(title))
title_file_name = os.path.join(title_dir, f"{date.strftime('%Y-%m-%d')}-{version}.pdf")
number_file_name = os.path.join(NUMBER_DIR, prepare_title(f"{number}.pdf"))
if not os.path.exists(raw_file_name):
print("Retrieving '{}' ({}) ...".format(title, version))
urllib.request.urlretrieve(baseurl + link, raw_file_name)
os.makedirs(title_dir, exist_ok=True)
if os.path.exists(title_file_name):
os.remove(title_file_name)
os.symlink(raw_file_name, title_file_name)
if os.path.exists(number_file_name):
os.remove(number_file_name)
os.symlink(raw_file_name, number_file_name)
if __name__ == "__main__":
exit(main())
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment