Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
J
job_controller
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Iterations
Wiki
Requirements
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Locked files
Build
Pipelines
Jobs
Pipeline schedules
Test cases
Artifacts
Deploy
Releases
Package registry
Container registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Code review analytics
Issue analytics
Insights
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
This is an archived project. Repository and other project resources are read-only.
Show more breadcrumbs
videoag
job_controller
Commits
961e4b4c
Verified
Commit
961e4b4c
authored
8 months ago
by
Dorian Koch
Browse files
Options
Downloads
Patches
Plain Diff
Recover spawning jobs
parent
a655872c
No related branches found
No related tags found
No related merge requests found
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
src/job_controller.py
+5
-0
5 additions, 0 deletions
src/job_controller.py
src/job_database_api.py
+11
-4
11 additions, 4 deletions
src/job_database_api.py
src/main.py
+30
-12
30 additions, 12 deletions
src/main.py
with
46 additions
and
16 deletions
src/job_controller.py
+
5
−
0
View file @
961e4b4c
...
@@ -4,6 +4,7 @@ from job_database_api import DummyJobDatabaseApi, JobData, JobDatabaseApi
...
@@ -4,6 +4,7 @@ from job_database_api import DummyJobDatabaseApi, JobData, JobDatabaseApi
from
jobs.dummy_job
import
DummyJob
from
jobs.dummy_job
import
DummyJob
from
kubernetes_api
import
K8sApi
from
kubernetes_api
import
K8sApi
import
os
import
os
import
datetime
def
load_config
():
def
load_config
():
...
@@ -32,6 +33,10 @@ class ControllerState():
...
@@ -32,6 +33,10 @@ class ControllerState():
db_engine
=
self
.
config
.
get
(
"
DB_ENGINE
"
)
db_engine
=
self
.
config
.
get
(
"
DB_ENGINE
"
)
if
db_engine
==
"
dummy
"
:
if
db_engine
==
"
dummy
"
:
self
.
job_api
=
DummyJobDatabaseApi
()
self
.
job_api
=
DummyJobDatabaseApi
()
# make some dummy jobs
start_id
=
int
(
datetime
.
datetime
.
now
().
timestamp
())
for
i
in
range
(
start_id
,
start_id
+
4
):
self
.
job_api
.
create_job
(
JobData
(
"
job{}
"
.
format
(
i
),
"
dummy
"
))
else
:
else
:
raise
Exception
(
f
"
Unknown DB_ENGINE:
{
db_engine
}
"
)
raise
Exception
(
f
"
Unknown DB_ENGINE:
{
db_engine
}
"
)
self
.
event_queue
=
EventQueue
()
self
.
event_queue
=
EventQueue
()
...
...
This diff is collapsed.
Click to expand it.
src/job_database_api.py
+
11
−
4
View file @
961e4b4c
...
@@ -34,19 +34,23 @@ class JobDatabaseApi(metaclass=ABCMeta):
...
@@ -34,19 +34,23 @@ class JobDatabaseApi(metaclass=ABCMeta):
@abstractmethod
@abstractmethod
def
get_next_jobs_and_set_spawning
(
self
,
limit
:
int
)
->
list
[
JobData
]:
def
get_next_jobs_and_set_spawning
(
self
,
limit
:
int
)
->
list
[
JobData
]:
pass
pass
# atomically retrieve and set state to SPAWNING
@abstractmethod
def
get_all_spawning_jobs
(
self
)
->
list
[
JobData
]:
pass
# used for recovery
@abstractmethod
@abstractmethod
def
get_job_by_id
(
self
,
job_id
:
str
)
->
Optional
[
JobData
]:
def
get_job_by_id
(
self
,
job_id
:
str
)
->
Optional
[
JobData
]:
pass
pass
# refresh state from db
@abstractmethod
@abstractmethod
def
create_job
(
self
,
job
:
JobData
):
def
create_job
(
self
,
job
:
JobData
):
pass
pass
# insert into db
@abstractmethod
@abstractmethod
def
update_job_state
(
self
,
job_id
:
str
,
new_state
:
JobState
):
def
update_job_state
(
self
,
job_id
:
str
,
new_state
:
JobState
):
pass
pass
# update state in db
class
DummyJobDatabaseApi
(
JobDatabaseApi
):
class
DummyJobDatabaseApi
(
JobDatabaseApi
):
...
@@ -65,6 +69,9 @@ class DummyJobDatabaseApi(JobDatabaseApi):
...
@@ -65,6 +69,9 @@ class DummyJobDatabaseApi(JobDatabaseApi):
ret
.
append
(
next
)
ret
.
append
(
next
)
return
ret
return
ret
def
get_all_spawning_jobs
(
self
)
->
list
[
JobData
]:
return
[
job
for
job
in
self
.
db_state
.
values
()
if
job
.
job_state
==
JobState
.
SPAWNING
]
def
get_job_by_id
(
self
,
job_id
:
str
)
->
Optional
[
JobData
]:
def
get_job_by_id
(
self
,
job_id
:
str
)
->
Optional
[
JobData
]:
return
copy
.
deepcopy
(
self
.
db_state
.
get
(
job_id
,
None
))
return
copy
.
deepcopy
(
self
.
db_state
.
get
(
job_id
,
None
))
...
...
This diff is collapsed.
Click to expand it.
src/main.py
+
30
−
12
View file @
961e4b4c
from
actions.spawn_job
import
WatchJob
from
actions.spawn_job
import
SpawnJob
,
WatchJob
from
event_queue
import
EventResult
from
event_queue
import
EventResult
from
actions.find_ready_jobs
import
FindReadyJobs
from
actions.find_ready_jobs
import
FindReadyJobs
from
job_database_api
import
JobData
from
job_database_api
import
JobData
,
JobState
from
job_controller
import
ControllerState
from
job_controller
import
ControllerState
import
datetime
import
datetime
...
@@ -60,12 +60,23 @@ def main():
...
@@ -60,12 +60,23 @@ def main():
cstate
.
event_queue
.
put
(
WatchJob
(
watch
.
metadata
.
labels
[
"
job_id
"
]))
cstate
.
event_queue
.
put
(
WatchJob
(
watch
.
metadata
.
labels
[
"
job_id
"
]))
print
(
"
Done checking for existing jobs
"
)
print
(
"
Done checking for existing jobs
"
)
# TODO: check for existing jobs in spawning state in db that are not in k8s and requeue them (ready state will be picked up by FindReadyJobs)
# find spawning jobs that are not in k8s
spawning_jobs
=
cstate
.
job_api
.
get_all_spawning_jobs
()
# make some dummy jobs
num_resetted
=
0
start_id
=
int
(
datetime
.
datetime
.
now
().
timestamp
())
for
job
in
spawning_jobs
:
for
i
in
range
(
start_id
,
start_id
+
4
):
exists
=
False
cstate
.
job_api
.
create_job
(
JobData
(
"
job{}
"
.
format
(
i
),
"
dummy
"
))
for
k8sjob
in
existing_worker_jobs
.
items
:
if
k8sjob
.
metadata
.
labels
[
"
job_id
"
]
==
job
.
job_id
:
exists
=
True
break
if
not
exists
:
# reset to ready
# TODO: maybe reset them to an error state?
job
.
update_state
(
cstate
,
JobState
.
READY
)
num_resetted
+=
1
if
num_resetted
>
0
:
print
(
f
"
Resetted
{
num_resetted
}
spawning jobs to ready state
"
)
print
(
"
Reconcilation done
"
)
cstate
.
event_queue
.
put
(
FindReadyJobs
())
cstate
.
event_queue
.
put
(
FindReadyJobs
())
...
@@ -107,10 +118,17 @@ def main():
...
@@ -107,10 +118,17 @@ def main():
print
(
f
"
Error in event
{
evt
}
:
{
e
}
"
)
print
(
f
"
Error in event
{
evt
}
:
{
e
}
"
)
print
(
"
###
"
)
print
(
"
###
"
)
print
(
"
Event loop stopped
"
)
print
(
"
Event loop stopped
"
)
# print all remaining events
# set all jobs that were supposed to be spawned back to ready
print
(
"
Remaining events in queue:
"
)
num_readied
=
0
while
not
cstate
.
event_queue
.
empty
():
for
evt
in
cstate
.
event_queue
.
queue
.
queue
:
print
(
cstate
.
event_queue
.
get
())
if
isinstance
(
evt
,
SpawnJob
):
# get up to date job data (it may have been canceled)
job
=
cstate
.
job_api
.
get_job_by_id
(
evt
.
job
.
jobData
.
job_id
)
if
job
is
not
None
and
job
.
job_state
==
JobState
.
SPAWNING
:
job
.
update_state
(
cstate
,
JobState
.
READY
)
num_readied
+=
1
if
num_readied
>
0
:
print
(
f
"
Readied
{
num_readied
}
jobs that were supposed to be spawned
"
)
sys
.
exit
(
0
)
sys
.
exit
(
0
)
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment