sorter.py

from server import *
from jobmanagement import schedule_job, job_handler
from jobtypes import schedule_thumbnail
import traceback
import os.path

@app.route('/internal/sort/log')
@register_navbar('Sortierlog', icon='sort-by-attributes-alt', group='weitere')
@mod_required
def sort_log():
	return render_template('sortlog.html',sortlog=query('''
			SELECT
				sortlog.*,
				lectures.id as lecture_id,
				lectures.title as lecture_title,
				lectures.course_id as course_id,
				courses.title as course_title
			FROM sortlog
			JOIN lectures ON lectures.id = sortlog.lecture_id
			JOIN courses ON courses.id = lectures.course_id
			ORDER BY sortlog.`when` DESC
			LIMIT 50
		'''),sorterrorlog=query('SELECT * FROM sorterrorlog ORDER BY sorterrorlog.`when` DESC'))

def to_ascii(inputstring):
	asciistring = inputstring
	for charset in [('ä', 'ae'), ('ö', 'oe'), ('ü', 'ue'), ('ß', 'ss')]:
		asciistring = asciistring.replace(charset[0],charset[1])
	return asciistring

@job_handler('probe', 'remux', 'transcode')
def update_video_metadata(jobid, jobtype, data, state, status):
	if 'video_id' not in data:
		return
	if jobtype not in ['remux', 'transcode']:
		video = query('SELECT * FROM videos WHERE id = ?', data['video_id'])[0]
		if video['hash'] and video['hash'] != status['hash']:
			raise Exception('Hash mismatch for video {}'.format(data['video_id']))
	modify('UPDATE videos_data SET hash = ?, file_size = ?, duration = ? WHERE id = ?',
			status['hash'], status['filesize'], status['duration'], data['video_id'])

@app.route('/internal/jobs/add/thumbnail', methods=['GET', 'POST'])
@mod_required
@csrf_protect
@handle_errors('jobs_overview', 'Zu dieser Veranstaltung existieren keine Videos!', 404, IndexError)
def add_thumbnail_job():
	schedule_thumbnail(request.values['lectureid'])
	return redirect(request.values.get('ref', url_for('jobs_overview')))

def insert_video(lectureid, dbfilepath, fileformatid, hash="", filesize=-1, duration=-1, sourceid=None):
	visible = query('SELECT courses.autovisible FROM courses JOIN lectures ON lectures.course_id = courses.id WHERE lectures.id = ?', lectureid)[0]['autovisible']
	video_id = modify('''INSERT INTO videos_data
		(lecture_id, visible, path, video_format, title, comment, internal, file_modified, time_created, time_updated, created_by, hash, file_size, duration, source)
		VALUES
		(?, ?, ?, ?, "", "", "", ?, ?, ?, ?, ?, ?, ?)''',
		lectureid, visible, dbfilepath, fileformatid, datetime.now(), datetime.now(), datetime.now(), -1, hash, filesize, duration, source)
	if sourceid:
		schedule_remux(lectureid, video_id)
	else:
		query('INSERT INTO sortlog (lecture_id,video_id,path,`when`) VALUES (?,?,?,?)', lectureid, video_id, dbfilepath, datetime.now())
		schedule_job('probe', {'path': dbfilepath, 'lecture_id': lectureid, 'video_id': video_id, 'import-chapters': True})
	schedule_thumbnail(lectureid)
	video = query('SELECT videos.*, "format" AS sep, formats.* FROM videos JOIN formats ON formats.id = videos.video_format WHERE videos.id = ?', video_id)[0]
	lecture = query('SELECT * FROM lectures WHERE id = ?', lectureid)[0]
	course = query('SELECT * FROM courses WHERE id = ?', lecture['course_id'])[0]
	notify_mods('new_video', course['id'], course=course, lecture=lecture, video=video)

@job_handler('transcode')
def insert_transcoded_video(jobid, jobtype, data, state, status):
	if 'lecture_id' not in data or 'source_id' not in data or 'format_id' not in data:
		return
	if 'video_id' in data:
		return
	insert_video(data['lecture_id'], data['output']['path'], data['format_id'], status['hash'], status['filesize'], status['duration'], data['source_id'] )

def split_filename(filename):
	# '_' and ' ' are handled like '-'
	return filename.replace('_','-').replace(' ','-').split('-')

def parse_filename(splitFileName):
	# filenames: <handle>-<sorter>-<format>.mp4
	data = {'keywords': []}
	for fileNameChunk in splitFileName:
		fileNameChunk = fileNameChunk.replace('.mp4','')
		#-<YYMMDD> (date)
		#-<HHMM> (time)
		#-<keyword>
		#	Looking for keywords in: title,speaker,comment, comma seperated list in internal
		try:
			if len(fileNameChunk) == 6:
				data['date'] = datetime.strptime(fileNameChunk,'%y%m%d').date()
			elif  len(fileNameChunk) == 4:
				data['time'] = datetime.strptime(fileNameChunk,'%H%M').time()
			else:
				data['keywords'].append(fileNameChunk)
		except ValueError:
			# if its not valid date or time, handle it as keyword
			data['keywords'].append(fileNameChunk)
	return data

def filter_lectures_by_datetime(lectures, date, time):
	matches = []
	if date or time:
		for lecture in lectures:
			if (not 'time' in lecture) or (not lecture['time']):
				continue
			if date and (lecture['time'].date() != date):
				continue
			if time and (lecture['time'].time() != time):
				continue
			matches.append(lecture)
	return matches

def filter_lectures_by_keywords(lectures, keywords):
	for field in ['title','speaker','comment','internal']:
		for lecture in lectures:
			for keyword in keywords:
				# first test for exact match, else make it asci and try substring test
				if (keyword == lecture[field]) or \
					 (str(keyword).lower() in str(to_ascii(lecture[field]).lower())):
					return [lecture]
	return []

def extract_format_keyword_from_filename(splitFileName):
	return splitFileName[-1].split('.',1)[0].lower()

def filter_formats_by_filename(splitFileName):
	formatstring = extract_format_keyword_from_filename(splitFileName)
	formats = query('SELECT * FROM formats ORDER BY prio DESC')
	for videoformat in formats:
		# we match the last part of the file name without the extension
		if formatstring in videoformat['keywords'].replace(',',' ').split(' '):
			return videoformat['id']
	# default format is "unknown", with id 0
	return 0

def sort_file(filename, course=None, lectures=None):
	splitFileName = split_filename(filename)
	if not course:
		handle = splitFileName[0]
		if splitFileName[0].endswith('ws') or splitFileName[0].endswith('ss'):
			handle = '-'.join(splitFileName[:2])
		courses = query('SELECT * FROM courses WHERE handle = ?', handle)
		if not courses:
			return [], 0
		course = courses[0]
	if not lectures:
		lectures = query('SELECT * from lectures where course_id = ?', course['id'])
	# parse all data from the file name
	data = parse_filename(splitFileName)
	# try to match the file on a single lecture
	matches = filter_lectures_by_datetime(lectures, data.get('date'), data.get('time'))
	# if we can't match exactly  based on date and time, we have to match keywords
	if ((len(matches) != 1) and (len(data['keywords']) > 0)):
		if not len(matches) == 0:
			# only test lectures with the correct date/time, if we have any
			matches = filter_lectures_by_keywords(matches, data['keywords'])
		else:
			# Else test for matches in all lectures of this course
			matches = filter_lectures_by_keywords(lectures, data['keywords'])
	# now we should have found exactly one match
	fmt = filter_formats_by_filename(splitFileName)
	return matches, fmt

def log_sort_error(course_id, path, matches):
	matches_id = []
	for match in matches:
		matches_id.append(str(match['id']))
	query('INSERT INTO sorterrorlog_data (course_id, path, matches, `when`, time_updated, time_created) VALUES (?, ?, ?, ?, ?, ?)',
			course_id, path, ','.join(matches_id), datetime.now(), datetime.now(), datetime.now())

def sort_api_token_required(func):
	@wraps(func)
	def decorator(*args, **kwargs):
		if 'apikey' in request.values:
			token = request.values['apikey']
		elif request.get_json() and ('apikey' in request.get_json()):
			token = request.get_json()['apikey']
		else:
			token = None
		if not token == config.get('SORTER_API_KEY', [None]):
			return 'Permission denied', 403
		else:
			return func(*args, **kwargs)
	return decorator

@app.route('/internal/sort/encoded/<filename>')
@sort_api_token_required
def sort_encoded(filename):
	matches, fmt = sort_file(filename)
	if len(matches) != 1:
		log_sort_error(-1, 'kodiert/'+filename, matches)
		return "Could not match filename", 400
	lecture = matches[0]
	course = query('SELECT * FROM courses WHERE id = ?', lecture['course_id'])[0]
	if course['autopublish']:
		schedule_job('publish_video', {'source': filename, 'path': 'pub/'+course['handle']+'/'+filename, 'lecture_id': lecture['id'], 'format_id': fmt})
	return 'OK', 200

@app.route('/internal/sort/autoencode')
@sort_api_token_required
def sort_autoencode():
	filename = request.values['path']
	path = 'autoencode/'+filename
	matches, fmt = sort_file(filename)
	if len(matches) != 1:
		log_sort_error(-1, 'raw/'+path, matches)
		return "Could not match filename", 400
	lecture = matches[0]
	schedule_job('probe-raw', {'path': path, 'lecture_id': lecture['id'], 'import-chapters': True})
	return 'OK', 200

@job_handler('publish_video')
def handle_published_video(jobid, jobtype, data, state, status):
	if 'lecture_id' not in data or 'format_id' not in data:
		return
	insert_video(data['lecture_id'], data['path'], data['format_id'], hash=status['hash'], filesize=status['filesize'], duration=status['duration'])

@app.route('/internal/sort/now')
@mod_required
@sched_func(600)
def sort_now():
	courses = query('SELECT * FROM courses')
	for course in courses:
		modify('BEGIN')
		for mountpoint in config['VIDEOMOUNT']:
			existingvideos = query('SELECT videos.path FROM videos JOIN lectures ON (videos.lecture_id = lectures.id) WHERE lectures.course_id = ?',course['id'])
			knownerrors = query('SELECT sorterrorlog.path FROM sorterrorlog WHERE sorterrorlog.course_id = ?',course['id'])
			ignorefiles = existingvideos + knownerrors
			lectures = query('SELECT * from lectures where course_id = ?',course['id'])
			coursepath = mountpoint['mountpoint']+course['handle']
			try:
				files = os.listdir(coursepath)
			except FileNotFoundError:
				files = []
			for filename in files:
				try:
					# if the video is in the table "videos" already (with the correct course), skip it
					ignore = False
					for file_to_ignore in ignorefiles:
						# path is something like
						# vpnonline/08ws-swt/08ws-swt-081118.mp4
						if os.path.basename(filename) == os.path.basename(file_to_ignore['path']):
							ignore = True
							break
					if ignore:
						continue
					if not os.path.splitext(filename)[1] == '.mp4':
						continue
					matches, fmt = sort_file(filename, course=course, lectures=lectures)
					dbfilepath = mountpoint['prefix']+course['handle']+'/'+filename
					if len(matches) == 1:
						insert_video(matches[0]['id'], dbfilepath, fmt)
					else:
						log_sort_error(course['id'], dbfilepath, matches)
				except Exception:
					traceback.print_exc()
		modify('COMMIT')
	if 'ref' in request.values:
		return redirect(request.values['ref'])
	else:
		return 'OK', 200