refactor and split up of sort_file

edfacd0a · Andreas Valder · ff93e6bf · edfacd0a
Unverified Commit edfacd0a authored 6 years ago by Andreas Valder
--- a/sorter.py
+++ b/sorter.py
@@ -73,82 +73,88 @@ def insert_transcoded_video(jobid, jobtype, data, state, status):
 		return
 	insert_video(data['lecture_id'], data['output']['path'], data['format_id'], status['hash'], status['filesize'], status['duration'], data['source_id'] )

-def sort_file(filename, course=None, lectures=None):
-	# filenames: <handle>-<sorter>-<format>.mp4
-	# "sorter" musst be found with fuzzy matching. "sorter" musst be one or more of the following types: (inside the loop)
-	# '_' and ' ' are handled like '-'
-	splitfilename = filename.replace('_','-').replace(' ','-').split('-')
-	if not course:
-		handle = splitfilename[0]
-		if splitfilename[0].endswith('ws') or splitfilename[0].endswith('ss'):
-			handle = '-'.join(splitfilename[:2])
-		courses = query('SELECT * FROM courses WHERE handle = ?', handle)
-		if not courses:
-			return [], 0
-		course = courses[0]
-	if not lectures:
-		lectures = query('SELECT * from lectures where course_id = ?', course['id'])
-	# we save all extraced data in a dict
+def parseVideoFileName(splitFileName):
 	data = {'keywords': []}
-	# parse the file name and save all data in 'data'
-	for s in splitfilename:
-		s = s.replace('.mp4','')
+	for fileNameChunk in splitFileName:
+		fileNameChunk = fileNameChunk.replace('.mp4','')
 		#-<YYMMDD> (date)
 		#-<HHMM> (time)
 		#-<keyword>
 		#	Looking for keywords in: title,speaker,comment, comma seperated list in internal
 		try:
-			if len(s) == 6:
-				data['date'] = datetime.strptime(s,'%y%m%d').date()
-			elif  len(s) == 4:
-				data['time'] = datetime.strptime(s,'%H%M').time()
+			if len(fileNameChunk) == 6:
+				data['date'] = datetime.strptime(fileNameChunk,'%y%m%d').date()
+			elif  len(fileNameChunk) == 4:
+				data['time'] = datetime.strptime(fileNameChunk,'%H%M').time()
 			else:	
-				data['keywords'].append(s)
+				data['keywords'].append(fileNameChunk)
 		except ValueError:
-			# if its not a date or time, handle it as keyword
-			data['keywords'].append(s)
-	# try to match the file on a single lecture
+			# if it's not a valid date or time, handle it as a keyword
+			data['keywords'].append(fileNameChunk)
+	return data
+
+def matchDatetimeOnLecture(lectures, date, time):
 	matches = []
 	# first try date and time (if one of them is set)
-	if ('date' in data) or ('time' in data):
+	if date or time:
+		print(1)
 		for lecture in lectures:
-			if not ('time' in lecture) or not lecture['time']:
+			if (not 'time' in lecture) or (not lecture['time']):
 				continue
-			if ('date' in data) and (lecture['time'].date() != data['date']):
+			if date and (lecture['time'].date() != date):
 				continue
-			if ('time' in data) and (lecture['time'].time() != data['time']):
+			if time and (lecture['time'].time() != time):
 				continue
 			matches.append(lecture)
-	# if we can't match exactly  based on date and time, we have to match keywords
-	if ((len(matches) != 1) and (len(data['keywords']) > 0)):
-		#only test lectures with the correct date/time, if we have any. Else test for matches in all lectures of this course
-		if len(matches) == 0:
-			matches.extend(lectures)
-		found = False
+	return matches
+
+def matchKeywordsOnLecture(lectures, keywords):
 	for field in ['title','speaker','comment','internal']:
-			for lecture in matches:
-				for keyword in data['keywords']:
+		for lecture in lectures:
+			for keyword in keywords:
 				# first test for exact match, else make it asci and try substring test
 				if (keyword == lecture[field]) or \
 					 (str(keyword).lower() in str(to_ascii(lecture[field]).lower())):
-						found = True
-						matches = [lecture]
-					if found:
-						break
-				if found:
-					break
-			if found:
-				break
-	# now we should have found exactly one match
+					return [lecture]
+	return []
+
+def matchFileNameOnFormat(splitFileName):
 	# default format is "unknown", with id 0
-	fmt = 0
 	formats = query('SELECT * FROM formats ORDER BY prio DESC')
 	for videoformat in formats:
 		# we match the last part of the file name without the extension
-		formatstring = splitfilename[-1].split('.',1)[0].lower()
+		formatstring = splitFileName[-1].split('.',1)[0].lower()
 		if formatstring in videoformat['keywords'].replace(',',' ').split(' '):
-			fmt = videoformat['id']
-			break
+			return videoformat['id']
+	return 0
+
+def sort_file(filename, course=None, lectures=None):
+	# filenames: <handle>-<sorter>-<format>.mp4
+	# "sorter" must be found with fuzzy matching. "sorter" must be one or more of the following types: (listed in parseVideoFileName)
+	# '_' and ' ' are handled like '-'
+	splitFileName = filename.replace('_','-').replace(' ','-').split('-')
+	if not course:
+		handle = splitFileName[0]
+		if splitFileName[0].endswith('ws') or splitFileName[0].endswith('ss'):
+			handle = '-'.join(splitFileName[:2])
+		courses = query('SELECT * FROM courses WHERE handle = ?', handle)
+		if not courses:
+			return [], 0
+		course = courses[0]
+	if not lectures:
+		lectures = query('SELECT * from lectures where course_id = ?', course['id'])
+	data = parseVideoFileName(splitFileName)
+	# try to match the file on a single lecture
+	matches = matchDatetimeOnLecture(lectures, data.get('date'), data.get('time'))
+	# if we can't match exactly based on date and time, we have to match keywords
+	if ((len(matches) != 1) and (len(data['keywords']) > 0)):
+		#only test lectures with the correct date/time, if we have any. Else test for matches in all lectures of this course
+		if len(matches) == 0:
+			matches = matchKeywordsOnLecture(lectures, data['keywords'])
+		else:
+			matches = matchKeywordsOnLecture(matches, data['keywords'])
+	# now we should have found exactly one match
+	fmt = matchFileNameOnFormat(splitFileName)
 	return matches, fmt

 def log_sort_error(course_id, path, matches):