From bda47816852aa068f0c2975a3563e85d90635c6d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Simon=20K=C3=BCnzel?= <simonk@fsmpi.rwth-aachen.de>
Date: Thu, 1 May 2025 02:01:17 +0200
Subject: [PATCH] Add view count to publish medium stats

---
 api/api_specification.json                    | 18 +++++++
 api/api_specification.md                      | 17 +++++-
 api/api_specification_template.md             |  7 ++-
 api/src/api/routes/stats.py                   | 19 +++++--
 .../src/videoag_common/objects/__init__.py    |  1 +
 common_py/src/videoag_common/objects/stats.py | 11 ++++
 .../jobs/view_stats_aggregator/job.py         | 54 +++++++++++++------
 7 files changed, 107 insertions(+), 20 deletions(-)

diff --git a/api/api_specification.json b/api/api_specification.json
index 90509bf..f84f786 100644
--- a/api/api_specification.json
+++ b/api/api_specification.json
@@ -2145,6 +2145,24 @@
                 "only_mod": false,
                 "optional": false,
                 "type": "segmented_stats"
+              },
+              "total_watched_seconds": {
+                "config_directly_modifiable": false,
+                "id": "total_watched_seconds",
+                "notes": "",
+                "object_variant": null,
+                "only_mod": false,
+                "optional": false,
+                "type": "int"
+              },
+              "view_count": {
+                "config_directly_modifiable": false,
+                "id": "view_count",
+                "notes": "",
+                "object_variant": null,
+                "only_mod": false,
+                "optional": false,
+                "type": "int"
               }
             }
           },
diff --git a/api/api_specification.md b/api/api_specification.md
index eaa2b9f..324a7f2 100644
--- a/api/api_specification.md
+++ b/api/api_specification.md
@@ -1,4 +1,4 @@
-# Specification of the Web API for the Video-AG Website (v0.84).
+# Specification of the Web API for the Video-AG Website (v0.85).
 
 ## Introduction
 
@@ -1634,6 +1634,16 @@ This may only be used by a moderator.
         <td>segmented_stats</td>
         <td>Contains view count statistics for segments of a medium. The medium is split into equal length segments, and each segment has a view count</td>
     </tr>
+    <tr>
+        <td>total_watched_seconds</td>
+        <td>int</td>
+        <td></td>
+    </tr>
+    <tr>
+        <td>view_count</td>
+        <td>int</td>
+        <td></td>
+    </tr>
 </tbody>
 </table>
 
@@ -3218,6 +3228,11 @@ Possible `error_code`:
 
 ## Changelog
 
+### v0.85
+
+* Updated `GET /stats/publish_medium/{publish_medium_id}`
+  * Added fields `total_watched_seconds` and `view_count`
+
 ### v0.84
 
 * Updated `medium_metadata`
diff --git a/api/api_specification_template.md b/api/api_specification_template.md
index 21874e0..b177b67 100644
--- a/api/api_specification_template.md
+++ b/api/api_specification_template.md
@@ -1,4 +1,4 @@
-# Specification of the Web API for the Video-AG Website (v0.84).
+# Specification of the Web API for the Video-AG Website (v0.85).
 
 ## Introduction
 
@@ -139,6 +139,11 @@ Possible `error_code`:
 
 ## Changelog
 
+### v0.85
+
+* Updated `GET /stats/publish_medium/{publish_medium_id}`
+  * Added fields `total_watched_seconds` and `view_count`
+
 ### v0.84
 
 * Updated `medium_metadata`
diff --git a/api/src/api/routes/stats.py b/api/src/api/routes/stats.py
index 96063d8..d469cb4 100755
--- a/api/src/api/routes/stats.py
+++ b/api/src/api/routes/stats.py
@@ -113,6 +113,8 @@ def api_route_watch_log_publish_medium(course_handle: str, publish_medium_id: in
                                 " a few hours)",
            response_objects={
                "": [
+                   ("view_count", "int"),
+                   ("total_watched_seconds", "int"),
                    ("segmented_stats", "segmented_stats", "Contains view count statistics for segments of a medium. The"
                                                           " medium is split into equal length segments, and each segment"
                                                           " has a view count"),
@@ -133,6 +135,7 @@ def api_route_get_publish_medium_stats(publish_medium_id: int):
     
     def _trans(session: SessionDb) -> tuple[
         PublishMedium,
+        PublishMediumWatchStats,
         Sequence[PublishMediumSegmentWatchStats],
         Sequence[PublishMediumWatchClientStats],
         LectureWatchStats,
@@ -146,6 +149,10 @@ def api_route_get_publish_medium_stats(publish_medium_id: int):
         )
         if medium is None:
             raise ApiClientException(ERROR_UNKNOWN_OBJECT)
+        generic_medium_stats = session.scalar(
+            sql.select(PublishMediumWatchStats)
+            .where(PublishMediumWatchStats.publish_medium_id == medium.id)
+        )
         segments = session.scalars(
             sql.select(PublishMediumSegmentWatchStats)
             .where(PublishMediumSegmentWatchStats.publish_medium_id == publish_medium_id)
@@ -161,9 +168,9 @@ def api_route_get_publish_medium_stats(publish_medium_id: int):
             .where(LectureWatchStats.lecture_id == medium.lecture_id)
         )
         session.expunge_all()
-        return medium, segments, client_stats, generic_lecture_stats
+        return medium, generic_medium_stats, segments, client_stats, generic_lecture_stats
     
-    medium, segments, client_stats, generic_lecture_stats = database.execute_read_transaction(_trans)
+    medium, generic_medium_stats, segments, client_stats, generic_lecture_stats = database.execute_read_transaction(_trans)
     metadata = medium.medium_metadata
     if not isinstance(metadata, DurationMetadata):
         raise ApiClientException(ERROR_BAD_REQUEST("Medium is not a medium with watch stats (doesn't have a duration)"))
@@ -171,7 +178,11 @@ def api_route_get_publish_medium_stats(publish_medium_id: int):
     
     segment_total_views_json = []
     segment_unique_views_json = []
-    segment_duration_sec = generic_lecture_stats.segment_duration_sec
+    segment_duration_sec = None
+    if len(segments) > 0:
+        if generic_lecture_stats is None:
+            raise Exception("Some segments exist but no generic lecture stats exist (which store segment duration)")
+        segment_duration_sec = generic_lecture_stats.segment_duration_sec
     
     segments_iter = more_itertools.peekable(segments)
     for i in range(math.ceil(duration_sec / segment_duration_sec)):
@@ -196,6 +207,8 @@ def api_route_get_publish_medium_stats(publish_medium_id: int):
         client_stats_json[stat.type][stat.value] = stat.client_count
     
     return {
+        "view_count": generic_medium_stats.view_count if generic_medium_stats else 0,
+        "total_watched_seconds": generic_medium_stats.total_watched_seconds if generic_medium_stats else 0,
         "segmented_stats": {
             "segment_duration_sec": segment_duration_sec,
             "segment_total_view_counts": segment_total_views_json,
diff --git a/common_py/src/videoag_common/objects/__init__.py b/common_py/src/videoag_common/objects/__init__.py
index 367d349..bf490df 100644
--- a/common_py/src/videoag_common/objects/__init__.py
+++ b/common_py/src/videoag_common/objects/__init__.py
@@ -50,6 +50,7 @@ from .stats import (
     PublishMediumSegmentWatchStats,
     ClientStatType,
     PublishMediumWatchClientStats,
+    PublishMediumWatchStats,
     LectureDailyWatchStats,
     LectureWatchStats,
 )
diff --git a/common_py/src/videoag_common/objects/stats.py b/common_py/src/videoag_common/objects/stats.py
index 7488074..b38d16c 100755
--- a/common_py/src/videoag_common/objects/stats.py
+++ b/common_py/src/videoag_common/objects/stats.py
@@ -62,6 +62,17 @@ class PublishMediumWatchClientStats(Base):
     client_count: Mapped[int] = mapped_column(nullable=False, default=0)
 
 
+# General stats for a publish medium
+# Note that these might not sum up for the lecture values. One can watch half the lecture with one medium and the other
+# half with another medium.
+class PublishMediumWatchStats(Base):
+    
+    publish_medium_id: Mapped[int] = mapped_column(ForeignKey("publish_medium.id"), nullable=False, primary_key=True)
+    
+    view_count: Mapped[int] = mapped_column(nullable=False, default=0)
+    total_watched_seconds: Mapped[int] = mapped_column(sql.BigInteger(), nullable=False, default=0)
+
+
 # Counts the views per day for a lecture
 class LectureDailyWatchStats(Base):
     lecture_id: Mapped[int] = mapped_column(ForeignKey("lecture.id"), nullable=False, primary_key=True)
diff --git a/job_controller/jobs/view_stats_aggregator/job.py b/job_controller/jobs/view_stats_aggregator/job.py
index 6519745..084b448 100644
--- a/job_controller/jobs/view_stats_aggregator/job.py
+++ b/job_controller/jobs/view_stats_aggregator/job.py
@@ -137,9 +137,6 @@ def _aggregate_entries(
                 # have music videos (as far as I know))
                 view_count_array[i] += 1
     
-    if total_watched_seconds < _MIN_WATCH_DURATION_SECONDS:
-        return None
-    
     return view_count_array, total_watched_seconds, total_realtime_seconds
 
 
@@ -199,7 +196,7 @@ def _add_seconds_array_to_database(
         segment.total_watch_count += segment_watch_count
 
 
-def _store_client_stats_to_database(session: SessionDb, publish_medium_id: int, type: ClientStatType, value: str):
+def _store_medium_client_stats_to_database(session: SessionDb, publish_medium_id: int, type: ClientStatType, value: str):
     stat = session.scalar(
         sql.select(PublishMediumWatchClientStats)
         .where(
@@ -220,7 +217,7 @@ def _store_client_stats_to_database(session: SessionDb, publish_medium_id: int,
     stat.client_count += 1
 
 
-def _add_user_agent_client_stats(session: SessionDb, publish_medium_id: int, user_agent: str):
+def _add_medium_user_agent_client_stats(session: SessionDb, publish_medium_id: int, user_agent: str):
     ua_result = ua_parser.parse(user_agent)
     
     # For anonymity reasons we only include values from a set of common values, and all other values are stored as 'Other'
@@ -229,7 +226,7 @@ def _add_user_agent_client_stats(session: SessionDb, publish_medium_id: int, use
         os = ua_result.os.family
         if os not in _OS_FAMILIES:
             os = "Other"
-        _store_client_stats_to_database(session, publish_medium_id, ClientStatType.OPERATING_SYSTEM, os)
+        _store_medium_client_stats_to_database(session, publish_medium_id, ClientStatType.OPERATING_SYSTEM, os)
     
     if ua_result.user_agent:
         # Not sure why PyCharm thinks .user_agent is a str
@@ -237,21 +234,39 @@ def _add_user_agent_client_stats(session: SessionDb, publish_medium_id: int, use
         browser = ua_result.user_agent.family
         if browser not in _BROWSER_FAMILIES:
             browser = "Other"
-        _store_client_stats_to_database(session, publish_medium_id, ClientStatType.BROWSER, browser)
+        _store_medium_client_stats_to_database(session, publish_medium_id, ClientStatType.BROWSER, browser)
     
     if ua_result.device:
         device_family = ua_result.device.family
         if device_family not in _DEVICE_FAMILIES:
             device_family = "Other"
-        _store_client_stats_to_database(session, publish_medium_id, ClientStatType.DEVICE_FAMILY, device_family)
+        _store_medium_client_stats_to_database(session, publish_medium_id, ClientStatType.DEVICE_FAMILY, device_family)
         
         device_brand = ua_result.device.brand
         if device_brand not in _DEVICE_BRANDS:
             device_brand = "Other"
-        _store_client_stats_to_database(session, publish_medium_id, ClientStatType.DEVICE_BRAND, device_brand)
+        _store_medium_client_stats_to_database(session, publish_medium_id, ClientStatType.DEVICE_BRAND, device_brand)
+
+
+def _update_medium_generic_stats(session: SessionDb, publish_medium_id: int, total_watched_seconds: int):
+    stats = session.scalar(
+        sql.select(PublishMediumWatchStats)
+        .where(PublishMediumWatchStats.publish_medium_id == publish_medium_id)
+    )
+    if stats is None:
+        stats = PublishMediumWatchStats(
+            publish_medium_id=publish_medium_id,
+        )
+        session.add(stats)
+        session.flush([stats])  # Ensure default values are present
+    
+    stats.view_count += 1
+    stats.total_watched_seconds += total_watched_seconds
+    
+    return stats
 
 
-def _add_daily_stats(session: SessionDb, lecture_id: int, watch_date: Date):
+def _add_lecture_daily_stats(session: SessionDb, lecture_id: int, watch_date: Date):
     stat = session.scalar(
         sql.select(LectureDailyWatchStats)
         .where(LectureDailyWatchStats.lecture_id == lecture_id,
@@ -268,7 +283,7 @@ def _add_daily_stats(session: SessionDb, lecture_id: int, watch_date: Date):
     stat.view_count += 1
 
 
-def _update_and_get_generic_stats(session: SessionDb, lecture_id: int, total_watched_seconds: int, average_watch_speed: float) -> LectureWatchStats:
+def _update_and_get_lecture_generic_stats(session: SessionDb, lecture_id: int, total_watched_seconds: int, average_watch_speed: float) -> LectureWatchStats:
     stats = session.scalar(
         sql.select(LectureWatchStats)
         .where(LectureWatchStats.lecture_id == lecture_id)
@@ -328,7 +343,8 @@ def _trans_process_watch(session: SessionDb, watch_id: str):
     }
     medium_duration_sec_by_id = {}
     view_count_arrays_by_medium_id = {}
-    total_watched_seconds = 0
+    total_watched_seconds_by_medium_id = {}
+    total_watched_seconds: float = 0
     total_realtime_seconds = 0
     
     entry_iter = more_itertools.peekable(entries)
@@ -355,13 +371,17 @@ def _trans_process_watch(session: SessionDb, watch_id: str):
             return  # Client should be ignored
         array, medium_watched_seconds, medium_realtime_seconds = res
         view_count_arrays_by_medium_id[current_medium_id] = array
+        total_watched_seconds_by_medium_id[current_medium_id] = medium_watched_seconds
         total_watched_seconds += medium_watched_seconds
         total_realtime_seconds += medium_realtime_seconds
     
+    if total_watched_seconds < _MIN_WATCH_DURATION_SECONDS:
+        return
+    
     # Client won't be ignored. Now we can add data to the database (A return does NOT cause a rollback! The watch entries
     # always need to be deleted)
     
-    generic_stats = _update_and_get_generic_stats(session, lecture.id, total_watched_seconds, total_watched_seconds / total_realtime_seconds)
+    generic_stats = _update_and_get_lecture_generic_stats(session, lecture.id, round(total_watched_seconds), total_watched_seconds / total_realtime_seconds)
     for medium_id, array in view_count_arrays_by_medium_id.items():
         _add_seconds_array_to_database(
             session,
@@ -370,8 +390,12 @@ def _trans_process_watch(session: SessionDb, watch_id: str):
             medium_duration_sec_by_id[medium_id],
             array
         )
-        _add_user_agent_client_stats(session, medium_id, watcher.user_agent)
-    _add_daily_stats(session, lecture.id, entries[0].timestamp.date())  # Use start date
+        _add_medium_user_agent_client_stats(session, medium_id, watcher.user_agent)
+        medium_watched_seconds = total_watched_seconds_by_medium_id[medium_id]
+        if medium_watched_seconds >= _MIN_WATCH_DURATION_SECONDS:
+            _update_medium_generic_stats(session, medium_id, round(medium_watched_seconds))
+    
+    _add_lecture_daily_stats(session, lecture.id, entries[0].timestamp.date())  # Use start date
 
 
 def execute(database, own_job_id, input_data: CJsonObject):
-- 
GitLab