def parseVideoFileName(splitFileName):
    """Extract date, time and keywords from the chunks of a video file name.

    Every occurrence of ``.mp4`` is removed from a chunk before it is
    inspected.  A chunk of exactly six characters is tried as a ``YYMMDD``
    date and a chunk of exactly four characters as an ``HHMM`` time.  All
    other chunks, and chunks of those lengths that do not parse, are kept
    as keywords.  Chunks that are empty after stripping are skipped: an
    empty keyword is a substring of every string and would match any
    lecture.

    Args:
        splitFileName: Iterable of file name chunks (strings).

    Returns:
        A dict that always contains a ``'keywords'`` list and, when a
        matching chunk was found, ``'date'`` (``datetime.date``) and/or
        ``'time'`` (``datetime.time``).  If several chunks parse as a date
        (or as a time), the last one wins.
    """
    data = {'keywords': []}
    for fileNameChunk in splitFileName:
        fileNameChunk = fileNameChunk.replace('.mp4', '')
        if not fileNameChunk:
            continue
        # -<YYMMDD> (date)
        # -<HHMM> (time)
        # -<keyword>
        try:
            if len(fileNameChunk) == 6:
                data['date'] = datetime.strptime(fileNameChunk, '%y%m%d').date()
            elif len(fileNameChunk) == 4:
                data['time'] = datetime.strptime(fileNameChunk, '%H%M').time()
            else:
                data['keywords'].append(fileNameChunk)
        except ValueError:
            # not a valid date or time, so handle it as a keyword
            data['keywords'].append(fileNameChunk)
    return data


def matchDatetimeOnLecture(lectures, date, time):
    """Return the lectures whose ``'time'`` agrees with the given date/time.

    Args:
        lectures: Iterable of mappings; entries without a ``'time'`` key or
            with a falsy ``'time'`` value are ignored.
        date: Date to compare with ``lecture['time'].date()``; a falsy
            value means "do not filter by date".
        time: Time to compare with ``lecture['time'].time()``; a falsy
            value means "do not filter by time".

    Returns:
        A list of the matching lectures in their original order.  The list
        is empty when neither ``date`` nor ``time`` is given.
    """
    if not (date or time):
        return []
    matches = []
    for lecture in lectures:
        if 'time' not in lecture or not lecture['time']:
            continue
        if date and lecture['time'].date() != date:
            continue
        if time and lecture['time'].time() != time:
            continue
        matches.append(lecture)
    return matches


def matchKeywordsOnLecture(lectures, keywords):
    """Return the first lecture that matches one of the keywords.

    The fields ``title``, ``speaker``, ``comment`` and ``internal`` are
    searched in that order; within a field the lectures are tried in order
    and, for each lecture, the keywords in order.

    Args:
        lectures: Iterable of mappings providing the four fields above.
        keywords: Iterable of keywords to look for.

    Returns:
        A one-element list holding the first matching lecture, or an empty
        list if nothing matched.
    """
    for field in ['title', 'speaker', 'comment', 'internal']:
        for lecture in lectures:
            for keyword in keywords:
                # first test for an exact match, else compare lower-cased
                # against the to_ascii() form of the field as a substring
                if (keyword == lecture[field]) or \
                        (str(keyword).lower() in str(to_ascii(lecture[field]).lower())):
                    return [lecture]
    return []


def matchFileNameOnFormat(splitFileName):
    """Return the id of the video format named by the last file name chunk.

    The last chunk, cut at its first ``.`` and lower-cased, is compared
    with the keywords of every row of the ``formats`` table (highest
    ``prio`` first); keywords are separated by commas and/or spaces.

    Args:
        splitFileName: List of file name chunks.

    Returns:
        The ``id`` of the first format whose keywords contain the chunk,
        or ``0`` (the "unknown" format) if there is none.
    """
    formats = query('SELECT * FROM formats ORDER BY prio DESC')
    if not splitFileName:
        return 0
    # the format string does not depend on the format row, compute it once
    formatstring = splitFileName[-1].split('.', 1)[0].lower()
    for videoformat in formats:
        if formatstring in videoformat['keywords'].replace(',', ' ').split(' '):
            return videoformat['id']
    # default format is "unknown", with id 0
    return 0


def sort_file(filename, course=None, lectures=None):
    """Match a video file name on the lectures of a course and on a format.

    File names look like ``<handle>-<sorter>-<format>.mp4``; ``_`` and
    spaces are handled like ``-``.  The "sorter" part is matched fuzzily
    and consists of one or more date (``YYMMDD``), time (``HHMM``) or
    keyword chunks.

    Args:
        filename: Name of the video file.
        course: Course row; if falsy it is looked up by handle.  The handle
            is the first chunk, or the first two chunks joined by ``-``
            when the first chunk ends with ``ws`` or ``ss``.
        lectures: Lectures to match against; if falsy, all lectures of the
            course are queried.

    Returns:
        A tuple ``(matches, fmt)``: the list of matching lectures (exactly
        one entry for an unambiguous match) and the id of the detected
        format.  ``([], 0)`` is returned if no course has the handle.
    """
    splitFileName = filename.replace('_', '-').replace(' ', '-').split('-')
    if not course:
        handle = splitFileName[0]
        if splitFileName[0].endswith(('ws', 'ss')):
            handle = '-'.join(splitFileName[:2])
        courses = query('SELECT * FROM courses WHERE handle = ?', handle)
        if not courses:
            return [], 0
        course = courses[0]
    if not lectures:
        lectures = query('SELECT * from lectures where course_id = ?', course['id'])
    data = parseVideoFileName(splitFileName)
    # try to match the file on a single lecture
    matches = matchDatetimeOnLecture(lectures, data.get('date'), data.get('time'))
    # if date and time do not give exactly one match, we have to match keywords
    if len(matches) != 1 and data['keywords']:
        # only test lectures with the correct date/time, if we have any;
        # else test all lectures of this course
        matches = matchKeywordsOnLecture(matches or lectures, data['keywords'])
    # now we should have found exactly one match
    fmt = matchFileNameOnFormat(splitFileName)
    return matches, fmt