From 990ec4e15daea10c10ce32a3a63b3f47f04fbc42 Mon Sep 17 00:00:00 2001 From: Robin Sonnabend <robin@fsmpi.rwth-aachen.de> Date: Wed, 22 Mar 2017 23:58:07 +0100 Subject: [PATCH] Also categorize by number --- .gitignore | 2 ++ create.py | 14 ++++++++++++-- 2 files changed, 14 insertions(+), 2 deletions(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..56a5adf --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +raw/ +by-*/ diff --git a/create.py b/create.py index c796e01..1343ec1 100755 --- a/create.py +++ b/create.py @@ -23,6 +23,11 @@ PATTERN = r'<li>\s*<div[^>]*>(?<title>[^<]+)<\/div>\s*<div[^>]*><a href="(?<link RAW_DIR = "raw" TITLE_DIR = "by-title" +NUMBER_DIR = "by-number" + +def prepare_title(title): + return (title.lower().replace("/", "-").replace(" ", "_").replace("(", "").replace(")", "") + .replace("ä", "ae").replace("ö", "oe").replace("ü", "ue")) def main(): expr = re.compile(PATTERN) @@ -33,23 +38,28 @@ def main(): index = response.read().decode("iso-8859-1") os.makedirs(RAW_DIR, exist_ok=True) os.makedirs(TITLE_DIR, exist_ok=True) + os.makedirs(NUMBER_DIR, exist_ok=True) for match in expr.finditer(index): title = match.group("title") link = match.group("link") date = datetime.strptime(match.group("date"), "%d.%m.%Y") number = match.group("number") version = match.group("version") - print("Retrieving '{}' ({}) ...".format(title, version)) local_name = "{}.pdf".format(number.replace("/", "-")) raw_file_name = os.path.abspath(os.path.join(RAW_DIR, local_name)) - title_dir = os.path.join(TITLE_DIR, title.replace("/", "-")) + title_dir = os.path.join(TITLE_DIR, prepare_title(title)) title_file_name = os.path.join(title_dir, f"{date.strftime('%Y-%m-%d')}-{version}.pdf") + number_file_name = os.path.join(NUMBER_DIR, prepare_title(f"{number}.pdf")) if not os.path.exists(raw_file_name): + print("Retrieving '{}' ({}) ...".format(title, version)) urllib.request.urlretrieve(baseurl + link, raw_file_name) os.makedirs(title_dir, exist_ok=True) if os.path.exists(title_file_name): os.remove(title_file_name) os.symlink(raw_file_name, title_file_name) + if os.path.exists(number_file_name): + os.remove(number_file_name) + os.symlink(raw_file_name, number_file_name) if __name__ == "__main__": exit(main()) -- GitLab