Skip to content

Commit 08e514b

Browse files
xiki-tempula, lohedges
authored and committed
Hang killer (#8)
Implement a hang killer at BSS level
1 parent bba6729 commit 08e514b

File tree

2 files changed

+103
-5
lines changed

2 files changed

+103
-5
lines changed

python/BioSimSpace/Sandpit/Exscientia/Process/_process.py

Lines changed: 48 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
import traceback
3333

3434
import pandas as pd
35+
from loguru import logger
3536

3637
from .._Utils import _try_import
3738

@@ -53,6 +54,7 @@
5354
from ..Protocol._protocol import Protocol as _Protocol
5455
from .._SireWrappers import System as _System
5556
from ..Types._type import Type as _Type
57+
from ..Types import Time as _Time
5658
from .. import Units as _Units
5759
from .. import _Utils
5860
from ..FreeEnergy._restraint import Restraint as _Restraint
@@ -898,7 +900,7 @@ def setSeed(self, seed):
898900
else:
899901
self._seed = seed
900902

901-
def wait(self, max_time=None):
903+
def wait(self, max_time=None, inactivity_timeout: None | _Time = None):
902904
"""
903905
Wait for the process to finish.
904906
@@ -939,11 +941,52 @@ def wait(self, max_time=None):
939941
self._process.wait(max_time)
940942

941943
else:
942-
# Wait for the process to finish.
943-
self._process.wait()
944+
if inactivity_timeout is None:
945+
# Wait for the process to finish.
946+
self._process.wait()
944947

945-
# Store the final run time.
946-
self.runTime()
948+
# Store the final run time.
949+
self.runTime()
950+
else:
951+
inactivity_timeout = int(inactivity_timeout.milliseconds().value())
952+
last_time = self._getLastTime()
953+
if last_time is None:
954+
# Wait for the process to finish.
955+
self._process.wait()
956+
957+
# Store the final run time.
958+
self.runTime()
959+
else:
960+
while self.isRunning():
961+
self._process.wait(inactivity_timeout)
962+
if self.isRunning():
963+
current_time = self._getLastTime()
964+
if current_time > last_time:
965+
logger.info(
966+
f"Current simulation time ({current_time})."
967+
)
968+
last_time = current_time
969+
else:
970+
logger.warning(
971+
f"Current simulation time ({current_time}) has not advanced compared "
972+
f"to the last time ({last_time}). The process "
973+
f"might have hung and will be killed."
974+
)
975+
with open(
976+
f"{self.workDir()}/{self._name}.out", "a+"
977+
) as f:
978+
f.write("Process Hung. Killed.")
979+
self.kill()
980+
981+
def _getLastTime(self) -> float | None:
982+
"""This is the base method in the Process base class.
983+
Each subclass, such as AMBER or GROMACS, is expected to override this method
984+
to provide their own implementation for returning the current time.
985+
986+
If this method is not overridden, it will return None,
987+
and the `inactivity_timeout` feature will be skipped.
988+
"""
989+
return None
947990

948991
def isQueued(self):
949992
"""
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
from unittest.mock import MagicMock
2+
3+
import BioSimSpace.Sandpit.Exscientia as BSS
4+
from BioSimSpace.Sandpit.Exscientia.Process._process import Process
5+
6+
7+
def test_max_time():
    """Check that `max_time=1` reaches the wrapped process as 60000."""
    mock_process = MagicMock()

    Process.wait(mock_process, max_time=1)

    mock_process._process.wait.assert_called_once_with(60000)
11+
12+
13+
def test_None_inactivity_timeout():
    """
    With neither `max_time` nor `inactivity_timeout` set, the wrapped
    process is waited on exactly once and without any timeout argument.
    """
    process = MagicMock()

    Process.wait(process, max_time=None, inactivity_timeout=None)

    # Pin the arguments too: a bare assert_called_once() would also accept
    # a call that passed a timeout.
    process._process.wait.assert_called_once_with()
17+
18+
19+
def test_inactivity_timeout_no_getLastTime():
    """
    When `_getLastTime` returns None, an `inactivity_timeout` is ignored
    and the wrapped process is waited on once without a timeout.
    """
    process = MagicMock()
    process._getLastTime.return_value = None

    Process.wait(process, max_time=None, inactivity_timeout=BSS.Units.Time.nanosecond)

    # The polling branch calls wait(inactivity_timeout); requiring an
    # argument-free call proves the fallback branch was taken instead.
    process._process.wait.assert_called_once_with()
24+
25+
26+
def test_hang(tmp_path):
    """
    A process whose reported time stops advancing is killed by wait().

    The fake `_getLastTime` reports 1, 2, ..., 10 and then stays at 10.
    The first poll that sees no progress must call `kill()` and append a
    marker to the process output file.
    """
    process = MagicMock()
    process.workDir.return_value = str(tmp_path)
    process._name = "test"
    process.isRunning.return_value = True

    # Keep the poll counter in a closure variable instead of a module-level
    # global, so the test leaves no state behind in the module namespace.
    poll_count = 0

    def _getLastTime():
        nonlocal poll_count
        poll_count += 1
        # Mimic simulation hang after 10 calls
        return min(poll_count, 10)

    process._getLastTime = _getLastTime

    def mock_kill():
        # Mock kill to stop the simulation
        process.isRunning.return_value = False

    process.kill.side_effect = mock_kill

    Process.wait(process, max_time=None, inactivity_timeout=BSS.Units.Time.nanosecond)

    # Nine polls observe progress (2..10); the tenth sees no advance and
    # triggers the kill, which ends the loop.
    assert process._process.wait.call_count == 10
    process.kill.assert_called_once()

    with open(f"{tmp_path}/test.out", "r") as f:
        assert f.read() == "Process Hung. Killed."

0 commit comments

Comments
 (0)