From b96ca28eb480ae17ee97b3dd2697b1509931e0a9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Simon=20K=C3=BCnzel?= <simonk@fsmpi.rwth-aachen.de>
Date: Thu, 24 Apr 2025 02:38:13 +0200
Subject: [PATCH] Add file_format to medium metadata

---
 api/api_specification.json                    |  9 ++++++
 api/api_specification.md                      | 14 +++++++--
 api/api_specification_template.md             |  7 ++++-
 api/config/db_test_data.sql                   | 22 +++++++-------
 api/migration.sql                             | 30 +++++++++++++++++++
 .../media_process/basic_targets.py            |  8 ++++-
 .../src/videoag_common/objects/medium.py      |  6 ++++
 .../src/videoag_common/test/object_data.py    | 10 +++++++
 .../jobs/media_process_scheduler/job.py       |  7 +++++
 9 files changed, 98 insertions(+), 15 deletions(-)

diff --git a/api/api_specification.json b/api/api_specification.json
index 5790d4b..90509bf 100644
--- a/api/api_specification.json
+++ b/api/api_specification.json
@@ -3740,6 +3740,15 @@
     "medium_metadata": {
       "fields": {
         "": {
+          "file_format": {
+            "config_directly_modifiable": false,
+            "id": "file_format",
+            "notes": "",
+            "object_variant": null,
+            "only_mod": false,
+            "optional": false,
+            "type": "string"
+          },
           "file_size": {
             "config_directly_modifiable": false,
             "id": "file_size",
diff --git a/api/api_specification.md b/api/api_specification.md
index 28e8aeb..eaa2b9f 100644
--- a/api/api_specification.md
+++ b/api/api_specification.md
@@ -1,4 +1,4 @@
-# Specification of the Web API for the Video-AG Website (v0.82).
+# Specification of the Web API for the Video-AG Website (v0.84).
 
 ## Introduction
 
@@ -2710,7 +2710,12 @@ Additionally, the following objects may appear as the type of some field:
 </thead>
 <tbody>
     <tr>
-        <td rowspan="4">any</td>
+        <td rowspan="5">any</td>
+        <td>file_format</td>
+        <td>string</td>
+        <td></td>
+    </tr>
+    <tr>
         <td>file_size</td>
         <td>int</td>
         <td></td>
@@ -3213,6 +3218,11 @@ Possible `error_code`:
 
 ## Changelog
 
+### v0.84
+
+* Updated `medium_metadata`
+  * Added field `file_format`
+
 ### v0.83
 
 * Updated `GET /lecture/{lecture_id}/media_process_overview`
diff --git a/api/api_specification_template.md b/api/api_specification_template.md
index 36b13ef..21874e0 100644
--- a/api/api_specification_template.md
+++ b/api/api_specification_template.md
@@ -1,4 +1,4 @@
-# Specification of the Web API for the Video-AG Website (v0.82).
+# Specification of the Web API for the Video-AG Website (v0.84).
 
 ## Introduction
 
@@ -139,6 +139,11 @@ Possible `error_code`:
 
 ## Changelog
 
+### v0.84
+
+* Updated `medium_metadata`
+  * Added field `file_format`
+
 ### v0.83
 
 * Updated `GET /lecture/{lecture_id}/media_process_overview`
diff --git a/api/config/db_test_data.sql b/api/config/db_test_data.sql
index c3c942c..1407fb2 100644
--- a/api/config/db_test_data.sql
+++ b/api/config/db_test_data.sql
@@ -90,17 +90,17 @@ SELECT setval('sorter_file_id_seq', 1000);
 
 
 -- TODO figure out correct/realistic file sizes
-INSERT INTO medium_metadata (id,file_id,"type",file_size,duration_sec,vertical_resolution,horizontal_resolution,video_frame_rate_numerator,video_frame_rate_denominator,audio_sample_rate,audio_channel_count) VALUES
-     (6,1,'thumbnail'::medium_metadata_type,42,NULL,640,640,NULL,NULL,NULL,NULL),
-     (8,2,'thumbnail'::medium_metadata_type,42,NULL,640,640,NULL,NULL,NULL,NULL),
-     (17,3,'thumbnail'::medium_metadata_type,42,NULL,640,640,NULL,NULL,NULL,NULL),
-     (20,4,'thumbnail'::medium_metadata_type,42,NULL,640,640,NULL,NULL,NULL,NULL),
-     (15,5,'plain_video'::medium_metadata_type,42,5431,1080,1920,25,1,44000,2),
-     (5,6,'plain_video'::medium_metadata_type,42,5243,720,1280,25,1,44000,2),
-     (7,7,'plain_video'::medium_metadata_type,42,5420,720,1280,25,1,44000,2),
-     (19,8,'plain_video'::medium_metadata_type,42,5001,720,1280,25,1,44000,2),
-     (16,9,'plain_video'::medium_metadata_type,42,5431,720,1280,25,1,44000,2),
-     (25,10,'plain_video'::medium_metadata_type,42,5431,480,854,25,1,44000,2);
+INSERT INTO medium_metadata (id,file_id,"type",file_size,file_format,duration_sec,vertical_resolution,horizontal_resolution,video_frame_rate_numerator,video_frame_rate_denominator,audio_sample_rate,audio_channel_count) VALUES
+     (6,1,'thumbnail'::medium_metadata_type,42,'jpg',NULL,640,640,NULL,NULL,NULL,NULL),
+     (8,2,'thumbnail'::medium_metadata_type,42,'jpg',NULL,640,640,NULL,NULL,NULL,NULL),
+     (17,3,'thumbnail'::medium_metadata_type,42,'jpg',NULL,640,640,NULL,NULL,NULL,NULL),
+     (20,4,'thumbnail'::medium_metadata_type,42,'jpg',NULL,640,640,NULL,NULL,NULL,NULL),
+     (15,5,'plain_video'::medium_metadata_type,42,'mp4',5431,1080,1920,25,1,44000,2),
+     (5,6,'plain_video'::medium_metadata_type,42,'mp4',5243,720,1280,25,1,44000,2),
+     (7,7,'plain_video'::medium_metadata_type,42,'mp4',5420,720,1280,25,1,44000,2),
+     (19,8,'plain_video'::medium_metadata_type,42,'mp4',5001,720,1280,25,1,44000,2),
+     (16,9,'plain_video'::medium_metadata_type,42,'mp4',5431,720,1280,25,1,44000,2),
+     (25,10,'plain_video'::medium_metadata_type,42,'mp4',5431,480,854,25,1,44000,2);
 SELECT setval('medium_metadata_id_seq', 1000);
 
 
diff --git a/api/migration.sql b/api/migration.sql
index 8eacea3..8427bd1 100644
--- a/api/migration.sql
+++ b/api/migration.sql
@@ -245,6 +245,30 @@ SELECT id, deleted, visible, lecture_id, "time", text
 FROM old_data.chapters
 ;
 
+-- Migration helper: extracts the extension (text after the last '.') of the file name at the end of path.
+-- Raises an exception if path is NULL or if the file name has no (or an empty) extension.
+CREATE OR REPLACE FUNCTION temp_get_file_extension(path text) RETURNS text AS $$
+    DECLARE
+        str text;
+    BEGIN
+        -- All checks below evaluate to NULL (i.e. not true) for a NULL path, so it must be rejected explicitly
+        IF (path IS NULL) THEN
+            RAISE EXCEPTION 'File has no extension: path is NULL';
+        END IF;
+        -- delete path in front (greedy match up to the last '/')
+        str := regexp_replace(path, '.*/', '', 'g');
+        IF (str NOT LIKE '%.%') THEN
+            RAISE EXCEPTION 'File has no extension: %', path;
+        END IF;
+        -- delete part before extension (greedy match up to the last '.'); empty if the name ends with '.'
+        str := regexp_replace(str, '.*\.', '', 'g');
+        IF (str = '') THEN
+            RAISE EXCEPTION 'File has no extension: %', path;
+        END IF;
+        RETURN str;
+    END;
+$$ LANGUAGE plpgsql;
+
 DO $$
     DECLARE
         lecture record;
@@ -343,6 +367,7 @@ DO $$
                     file_id,
                     type,
                     file_size,
+                    file_format,
                     duration_sec,
                     audio_sample_rate,
                     audio_channel_count,
@@ -354,6 +379,7 @@ DO $$
                     medium_file_id,
                     'plain_video',
                     video.file_size,
+                    temp_get_file_extension(video.path),
                     video.duration,
                     44000,
                     2,
@@ -444,6 +470,7 @@ DO $$
                         file_id,
                         type,
                         file_size,
+                        file_format,
                         duration_sec,
                         audio_sample_rate,
                         audio_channel_count,
@@ -457,6 +484,7 @@ DO $$
                         -- Some of these values are all probably wrong, but we can't get them here. Can be updated later by
                         -- a script inspecting the actual files
                         0,
+                        'mp4',
                         video.duration,
                         44000,
                         2,
@@ -536,12 +564,14 @@ DO $$
                 file_id,
                 type,
                 file_size,
+                file_format,
                 vertical_resolution,
                 horizontal_resolution
             ) VALUES (
                 thumbnail_medium_file_id,
                 'thumbnail',
                 0,
+                'jpg',
                 640,
                 640
             ) RETURNING id INTO thumbnail_medium_metadata_id;
diff --git a/common_py/src/videoag_common/media_process/basic_targets.py b/common_py/src/videoag_common/media_process/basic_targets.py
index 55252a8..fe103a5 100644
--- a/common_py/src/videoag_common/media_process/basic_targets.py
+++ b/common_py/src/videoag_common/media_process/basic_targets.py
@@ -1,9 +1,11 @@
 import re
 import math
+from pathlib import Path
 from typing import TYPE_CHECKING
 
 from videoag_common.database import *
 from videoag_common.miscellaneous import *
+from ..ffmpeg import get_file_extension
 
 if TYPE_CHECKING:
     # Can't actually import due to circular dependency
@@ -60,6 +62,10 @@ class SourceFileTargetProducer(SingleOutputTargetProducer["SourceMedium"]):
         
         assert isinstance(intermediate, SorterFile)
         sorter_file: SorterFile = intermediate
+
+        if get_file_extension(Path(sorter_file.file_path)) is None:
+            raise MediaProcessException(f"Sorter File {sorter_file.file_path} (#{sorter_file.id}) does not have a file"
+                                        f" extension")
         
         medium_file = output_files_by_id[self.output_id]
         medium_file.file_path = sorter_file.file_path
@@ -92,7 +98,7 @@ class RescaleVideoTargetProducer(SingleInputTargetProducer, SingleOutputTargetPr
             raise MediaProcessException(f"Input {self.input_id} is not of type {MediumMetadataType.PLAIN_VIDEO}")
         
         output_file = output_files_by_id[self.output_id]
-        output_file.file_path += ".mp4"
+        output_file.file_path += f".{input_medium.file_format}"
         
         return "rescale_video", {
             "input_file": input_medium.file.file_path,
diff --git a/common_py/src/videoag_common/objects/medium.py b/common_py/src/videoag_common/objects/medium.py
index c8de5dc..9fbabc0 100644
--- a/common_py/src/videoag_common/objects/medium.py
+++ b/common_py/src/videoag_common/objects/medium.py
@@ -313,6 +313,12 @@ class MediumMetadata(ApiObject, Base):
             include_in_data=True
         )
     )
+    file_format: Mapped[str] = api_mapped(
+        mapped_column(Text(collation=STRING_COLLATION), nullable=False),
+        ApiStringField(
+            include_in_data=True
+        )
+    )
     
     file: Mapped["MediumFile"] = relationship(
         primaryjoin=lambda: MediumMetadata.file_id == MediumFile.id,
diff --git a/common_py/src/videoag_common/test/object_data.py b/common_py/src/videoag_common/test/object_data.py
index 1d76807..75ac520 100644
--- a/common_py/src/videoag_common/test/object_data.py
+++ b/common_py/src/videoag_common/test/object_data.py
@@ -183,6 +183,7 @@ TEST_DATA_MEDIUM_METADATA_5 = \
         "id": 5,
         "type": "plain_video",
         "file_size": 42,
+        "file_format": "mp4",
         "vertical_resolution": 720,
         "horizontal_resolution": 1280,
         "video_frame_rate_numerator": 25,
@@ -196,6 +197,7 @@ TEST_DATA_MEDIUM_METADATA_6 = \
         "id": 6,
         "type": "thumbnail",
         "file_size": 42,
+        "file_format": "jpg",
         "vertical_resolution": 640,
         "horizontal_resolution": 640,
     }
@@ -206,6 +208,7 @@ TEST_DATA_MEDIUM_METADATA_7 = \
         "id": 7,
         "type": "plain_video",
         "file_size": 42,
+        "file_format": "mp4",
         "vertical_resolution": 720,
         "horizontal_resolution": 1280,
         "video_frame_rate_numerator": 25,
@@ -219,6 +222,7 @@ TEST_DATA_MEDIUM_METADATA_8 = \
         "id": 8,
         "type": "thumbnail",
         "file_size": 42,
+        "file_format": "jpg",
         "vertical_resolution": 640,
         "horizontal_resolution": 640,
     }
@@ -229,6 +233,7 @@ TEST_DATA_MEDIUM_METADATA_15 = \
         "id": 15,
         "type": "plain_video",
         "file_size": 42,
+        "file_format": "mp4",
         "vertical_resolution": 1080,
         "horizontal_resolution": 1920,
         "video_frame_rate_numerator": 25,
@@ -242,6 +247,7 @@ TEST_DATA_MEDIUM_METADATA_16 = \
         "id": 16,
         "type": "plain_video",
         "file_size": 42,
+        "file_format": "mp4",
         "vertical_resolution": 720,
         "horizontal_resolution": 1280,
         "video_frame_rate_numerator": 25,
@@ -255,6 +261,7 @@ TEST_DATA_MEDIUM_METADATA_25 = \
         "id": 25,
         "type": "plain_video",
         "file_size": 42,
+        "file_format": "mp4",
         "vertical_resolution": 480,
         "horizontal_resolution": 854,
         "video_frame_rate_numerator": 25,
@@ -268,6 +275,7 @@ TEST_DATA_MEDIUM_METADATA_17 = \
         "id": 17,
         "type": "thumbnail",
         "file_size": 42,
+        "file_format": "jpg",
         "vertical_resolution": 640,
         "horizontal_resolution": 640,
     }
@@ -278,6 +286,7 @@ TEST_DATA_MEDIUM_METADATA_19 = \
         "id": 19,
         "type": "plain_video",
         "file_size": 42,
+        "file_format": "mp4",
         "vertical_resolution": 720,
         "horizontal_resolution": 1280,
         "video_frame_rate_numerator": 25,
@@ -291,6 +300,7 @@ TEST_DATA_MEDIUM_METADATA_20 = \
         "id": 20,
         "type": "thumbnail",
         "file_size": 42,
+        "file_format": "jpg",
         "vertical_resolution": 640,
         "horizontal_resolution": 640,
     }
diff --git a/job_controller/jobs/media_process_scheduler/job.py b/job_controller/jobs/media_process_scheduler/job.py
index a7fa1f8..012fc46 100644
--- a/job_controller/jobs/media_process_scheduler/job.py
+++ b/job_controller/jobs/media_process_scheduler/job.py
@@ -218,6 +218,13 @@ class ProcessScheduler:
             file.medium_metadata = res
             assert isinstance(file.medium_metadata, MediumMetadata)
             file.medium_metadata.file_size = probe_context.file_size()
+            if file.medium_metadata.file_format is None:
+                extension = get_file_extension(Path(file.file_path))
+                if extension is None:
+                    raise ValueError(f"Got no file format from try_create_for_file for {medium_type} and file"
+                                     f" {file.file_path} does not have an extension. Target Producers need to make sure"
+                                     f" that their medium files have an extension.")
+                file.medium_metadata.file_format = extension
             self._session.flush()  # Ensure metadata id present
             logger.info(f"Assigned metadata {file.medium_metadata.id} of type {medium_type} to medium file {file.file_path}"
                         f" ({file.process_target_id}, {file.id})")
-- 
GitLab