import urllib.parse
import urllib.request
import sys
import os
import regex as re
from datetime import datetime
# Index pages to scrape; main() fetches each one and downloads the documents
# it lists.
# NOTE(review): no entries are visible in this revision, so the list is empty
# and main() has nothing to do until the index URLs are filled in here.
index_urls = []

# Prefix prepended to every document link found on an index page before it is
# downloaded. Left empty here, i.e. links are used exactly as matched.
baseurl = ""

# HTTP headers sent with the index-page request.
hdr = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Charset': 'utf-8;q=0.7,*;q=0.3',
    'Accept-Encoding': 'none',
    'Accept-Language': 'en-US,en;q=0.8',
    'Connection': 'keep-alive',
}

# Matches one <li> entry of an index page and captures the named groups
# title, link, date, number and version. The ``(?<name>...)`` group syntax is
# the one accepted by the third-party ``regex`` module this file imports as
# ``re``; the standard-library ``re`` module spells it ``(?P<name>...)``.
PATTERN = r'<li>\s*<div[^>]*>(?<title>[^<]+)<\/div>\s*<div[^>]*><a href="(?<link>[^"]+)"[^>]*><img[^>]*><\/a><\/div>\s*<div[^>]*>(?<date>[^<]+)<\/div>\s*<div[^>]*>(?<number>[^<]+)<\/div>\s*<div[^>]*>(?<version>[^<]+)<\/div>\s*<div[^>]*>[^<]*<\/div>\s*<\/li>'

# Directory that receives the downloaded PDFs, named after the document number.
RAW_DIR = "raw"
# Directory holding one sub-directory per title with "<version>.pdf" symlinks.
TITLE_DIR = "by-title"
def main():
    """Download every document listed on the index pages and link it by title.

    For each URL in ``index_urls`` the page is fetched with the ``hdr``
    request headers and decoded as ISO-8859-1. Every ``PATTERN`` match then
    yields one document, which is

    * downloaded to ``RAW_DIR/<number>.pdf`` (``/`` in the number is replaced
      by ``-``), and
    * exposed as a symlink ``TITLE_DIR/<title>/<version>.pdf`` pointing at the
      absolute path of the raw file, unless that link path already exists.

    Raises:
        urllib.error.URLError: if an index page or a document cannot be
            retrieved.
        ValueError: if a matched date is not in ``DD.MM.YYYY`` form.
    """
    expr = re.compile(PATTERN)

    # The output directories do not depend on the page being processed, so
    # they are created once up front.
    os.makedirs(RAW_DIR, exist_ok=True)
    os.makedirs(TITLE_DIR, exist_ok=True)

    for indexurl in index_urls:
        req = urllib.request.Request(indexurl, headers=hdr)
        with urllib.request.urlopen(req) as response:
            index = response.read().decode("iso-8859-1")

        for match in expr.finditer(index):
            title = match.group("title")
            link = match.group("link")
            # The parsed date is not used further; parsing it still rejects
            # entries whose date field is malformed (strptime raises).
            date = datetime.strptime(match.group("date"), "%d.%m.%Y")
            number = match.group("number")
            version = match.group("version")

            print(f"Retrieving '{title}' ({version}) ...")

            # "/" would be read as a path separator in the file name.
            local_name = "{}.pdf".format(number.replace("/", "-"))
            # Absolute, so the symlink target stays valid regardless of the
            # directory the link itself lives in.
            raw_file_name = os.path.abspath(os.path.join(RAW_DIR, local_name))
            title_dir = os.path.join(TITLE_DIR, title)
            title_file_name = os.path.join(title_dir, f"{version}.pdf")

            urllib.request.urlretrieve(baseurl + link, raw_file_name)

            os.makedirs(title_dir, exist_ok=True)
            # os.symlink raises FileExistsError when the link path is already
            # taken, so only create the link when nothing is there yet.
            # lexists() also counts a dangling symlink as "taken".
            if not os.path.lexists(title_file_name):
                os.symlink(raw_file_name, title_file_name)
# Run the download only when executed as a script, not when imported.
if __name__ == "__main__":
    main()