Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions examples/server-functions/server_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
# See allowed_imports for what packages you can use in this class.


# Class must be named ServerFunctions
class ServerFunctions(ServerFunctionsBase):
# toy example to highlight functionality of ServerFunctions.
def __init__(self) -> None:
Expand Down
1 change: 1 addition & 0 deletions examples/server-functions/sf_incremental_aggregation.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
# Example of fedavg using memory secure running aggregation with server functions.


# Class must be named ServerFunctions
class ServerFunctions(ServerFunctionsBase):
def __init__(self) -> None:
self.global_model = None
Expand Down
58 changes: 58 additions & 0 deletions fedn/network/combiner/hooks/grpc_wrappers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import functools
import time

import grpc

from fedn.common.log_config import logger


def safe_unary(func_name, default_resp_factory):
    """Decorator for unary gRPC handlers that must never crash the channel.

    If the wrapped handler raises, the failing user-defined function is retired
    via ``self._retire_and_log`` and a valid default response is returned
    instead, so the gRPC channel stays healthy.

    :param func_name: name of the user-defined function (used for retiring/logging).
    :param default_resp_factory: zero-argument callable producing the fallback response.
    :return: the decorator.
    """

    def decorator(fn):
        @functools.wraps(fn)  # preserve the handler's name/docstring for introspection
        def wrapper(self, request, context):
            try:
                return fn(self, request, context)
            except Exception as e:  # noqa: BLE001 -- boundary: any user-code error is handled here
                self._retire_and_log(func_name, e)
                # Option A: return a valid default payload (keeps channel healthy)
                return default_resp_factory()

        return wrapper

    return decorator


def safe_streaming(func_name):
    """Decorator for streaming gRPC handlers that must never crash the channel.

    If the wrapped generator raises, the failing user-defined function is
    retired via ``self._retire_and_log`` and the RPC is terminated with
    ``FAILED_PRECONDITION`` so the client knows to use its local fallback.

    :param func_name: name of the user-defined function (used for retiring/logging).
    :return: the decorator.
    """

    def decorator(fn):
        @functools.wraps(fn)  # preserve the handler's name/docstring for introspection
        def wrapper(self, request, context):
            try:
                yield from fn(self, request, context)
            except Exception as e:  # noqa: BLE001 -- boundary: any user-code error is handled here
                self._retire_and_log(func_name, e)
                # Option B for streaming: signal an RPC error the client understands
                context.set_code(grpc.StatusCode.FAILED_PRECONDITION)
                context.set_details(f"{func_name} failed; sender should use local fallback.")
                return

        return wrapper

    return decorator


def call_with_fallback(name, fn, *, retries=2, base_sleep=0.25, fallback_fn=None):
    """Call ``fn`` with retries on transient gRPC errors, then fall back locally.

    Transient status codes (FAILED_PRECONDITION, UNAVAILABLE, DEADLINE_EXCEEDED)
    are retried with exponential back-off; any other error aborts immediately.

    :param name: human-readable operation name used in log messages.
    :param fn: zero-argument callable performing the RPC.
    :param retries: number of retry attempts after the initial call.
    :param base_sleep: initial back-off delay in seconds (doubled each attempt).
    :param fallback_fn: optional zero-argument callable used when all attempts fail.
    :return: the result of ``fn`` or, on failure, of ``fallback_fn``.
    :raises RuntimeError: when every attempt fails and no ``fallback_fn`` is given;
        chained to the last underlying error.
    """
    last_err = None
    for attempt in range(retries + 1):
        try:
            return fn()
        except grpc.RpcError as e:
            last_err = e
            code = e.code()
            if code in (grpc.StatusCode.FAILED_PRECONDITION, grpc.StatusCode.UNAVAILABLE, grpc.StatusCode.DEADLINE_EXCEEDED):
                logger.warning(f"{name} rpc failed with {code.name}: {e.details()}; attempt {attempt + 1}/{retries}")
                if attempt < retries:
                    time.sleep(base_sleep * (2**attempt))
                    continue
            else:
                # Non-transient status codes are not retried; log before giving up
                # (the original silently broke out of the loop here).
                logger.warning(f"{name} rpc failed with non-retryable {code.name}: {e.details()}")
            break
        except Exception as e:
            last_err = e
            logger.exception(f"{name} unexpected error: {e}")
            break
    if fallback_fn:
        logger.info(f"{name}: using local fallback")
        return fallback_fn()
    # Chain the underlying error so the root cause survives in the traceback.
    raise RuntimeError(f"{name} failed and no fallback provided") from last_err
189 changes: 114 additions & 75 deletions fedn/network/combiner/hooks/hooks.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
import ast
import json
import linecache
import linecache as _lc
import traceback
from concurrent import futures

import grpc
Expand All @@ -11,6 +14,7 @@
# imports for user defined code
from fedn.network.combiner.hooks.allowed_import import * # noqa: F403
from fedn.network.combiner.hooks.allowed_import import ServerFunctionsBase
from fedn.network.combiner.hooks.grpc_wrappers import safe_streaming, safe_unary
from fedn.network.combiner.modelservice import bytesIO_request_generator, model_as_bytesIO, unpack_model
from fedn.utils.helpers.plugins.numpyhelper import Helper

Expand All @@ -35,6 +39,7 @@ def __init__(self) -> None:
self.implemented_functions = {}
logger.info("Server Functions initialized.")

@safe_unary("client_settings", lambda: fedn.ClientConfigResponse(client_settings=json.dumps({})))
def HandleClientConfig(self, request_iterator: fedn.ClientConfigRequest, context):
"""Distribute client configs to clients from user defined code.

Expand All @@ -45,15 +50,13 @@ def HandleClientConfig(self, request_iterator: fedn.ClientConfigRequest, context
:return: the client config response
:rtype: :class:`fedn.network.grpc.fedn_pb2.ClientConfigResponse`
"""
try:
logger.info("Received client config request.")
model, _ = unpack_model(request_iterator, self.helper)
client_settings = self.server_functions.client_settings(global_model=model)
logger.info(f"Client config response: {client_settings}")
return fedn.ClientConfigResponse(client_settings=json.dumps(client_settings))
except Exception as e:
logger.error(f"Error handling client config request: {e}")
logger.info("Received client config request.")
model, _ = unpack_model(request_iterator, self.helper)
client_settings = self.server_functions.client_settings(global_model=model)
logger.info(f"Client config response: {client_settings}")
return fedn.ClientConfigResponse(client_settings=json.dumps(client_settings))

@safe_unary("client_selection", lambda: fedn.ClientSelectionResponse(client_ids=json.dumps([])))
def HandleClientSelection(self, request: fedn.ClientSelectionRequest, context):
"""Handle client selection from user defined code.

Expand All @@ -64,15 +67,13 @@ def HandleClientSelection(self, request: fedn.ClientSelectionRequest, context):
:return: the client selection response
:rtype: :class:`fedn.network.grpc.fedn_pb2.ClientSelectionResponse`
"""
try:
logger.info("Received client selection request.")
client_ids = json.loads(request.client_ids)
client_ids = self.server_functions.client_selection(client_ids)
logger.info(f"Clients selected: {client_ids}")
return fedn.ClientSelectionResponse(client_ids=json.dumps(client_ids))
except Exception as e:
logger.error(f"Error handling client selection request: {e}")
logger.info("Received client selection request.")
client_ids = json.loads(request.client_ids)
client_ids = self.server_functions.client_selection(client_ids)
logger.info(f"Clients selected: {client_ids}")
return fedn.ClientSelectionResponse(client_ids=json.dumps(client_ids))

@safe_unary("store_metadata", lambda: fedn.ClientMetaResponse(status="ERROR"))
def HandleMetadata(self, request: fedn.ClientMetaRequest, context):
"""Store client metadata from a request.

Expand All @@ -83,32 +84,27 @@ def HandleMetadata(self, request: fedn.ClientMetaRequest, context):
:return: the client meta response
:rtype: :class:`fedn.network.grpc.fedn_pb2.ClientMetaResponse`
"""
try:
logger.info("Received metadata")
client_id = request.client_id
metadata = json.loads(request.metadata)
# dictionary contains: [model, client_metadata] in that order for each key
self.client_updates[client_id] = self.client_updates.get(client_id, []) + [metadata]
self.check_incremental_aggregate(client_id)
return fedn.ClientMetaResponse(status="Metadata stored")
except Exception as e:
logger.error(f"Error handling store metadata request: {e}")
logger.info("Received metadata")
client_id = request.client_id
metadata = json.loads(request.metadata)
# dictionary contains: [model, client_metadata] in that order for each key
self.client_updates[client_id] = self.client_updates.get(client_id, []) + [metadata]
self.check_incremental_aggregate(client_id)
return fedn.ClientMetaResponse(status="Metadata stored")

@safe_unary("store_model", lambda: fedn.StoreModelResponse(status="ERROR"))
def HandleStoreModel(self, request_iterator, context):
try:
model, final_request = unpack_model(request_iterator, self.helper)
client_id = final_request.id
if client_id == "global_model":
logger.info("Received previous global model")
self.previous_global = model
else:
logger.info(f"Received client model from client {client_id}")
# dictionary contains: [model, client_metadata] in that order for each key
self.client_updates[client_id] = [model] + self.client_updates.get(client_id, [])
self.check_incremental_aggregate(client_id)
return fedn.StoreModelResponse(status=f"Received model originating from {client_id}")
except Exception as e:
logger.error(f"Error handling store model request: {e}")
model, final_request = unpack_model(request_iterator, self.helper)
client_id = final_request.id
if client_id == "global_model":
logger.info("Received previous global model")
self.previous_global = model
else:
logger.info(f"Received client model from client {client_id}")
# dictionary contains: [model, client_metadata] in that order for each key
self.client_updates[client_id] = [model] + self.client_updates.get(client_id, [])
self.check_incremental_aggregate(client_id)
return fedn.StoreModelResponse(status=f"Received model originating from {client_id}")

def check_incremental_aggregate(self, client_id):
# incremental aggregation (memory secure)
Expand All @@ -121,6 +117,7 @@ def check_incremental_aggregate(self, client_id):
self.server_functions.incremental_aggregate(client_id, client_model, client_metadata, self.previous_global)
del self.client_updates[client_id]

@safe_streaming("aggregate")
def HandleAggregation(self, request, context):
"""Receive and store models and aggregate based on user-defined code when specified in the request.

Expand All @@ -131,22 +128,19 @@ def HandleAggregation(self, request, context):
:return: the aggregation response (aggregated model or None)
:rtype: :class:`fedn.network.grpc.fedn_pb2.AggregationResponse`
"""
try:
logger.info(f"Receieved aggregation request: {request.aggregate}")
if self.implemented_functions["incremental_aggregate"]:
aggregated_model = self.server_functions.get_incremental_aggregate_model()
else:
aggregated_model = self.server_functions.aggregate(self.previous_global, self.client_updates)

model_bytesIO = model_as_bytesIO(aggregated_model, self.helper)
request_function = fedn.AggregationResponse
self.client_updates = {}
logger.info("Returning aggregate model.")
response_generator = bytesIO_request_generator(mdl=model_bytesIO, request_function=request_function, args={})
for response in response_generator:
yield response
except Exception as e:
logger.error(f"Error handling aggregation request: {e}")
logger.info(f"Receieved aggregation request: {request.aggregate}")
if self.implemented_functions["incremental_aggregate"]:
aggregated_model = self.server_functions.get_incremental_aggregate_model()
else:
aggregated_model = self.server_functions.aggregate(self.previous_global, self.client_updates)

model_bytesIO = model_as_bytesIO(aggregated_model, self.helper)
request_function = fedn.AggregationResponse
self.client_updates = {}
logger.info("Returning aggregate model.")
response_generator = bytesIO_request_generator(mdl=model_bytesIO, request_function=request_function, args={})
for response in response_generator:
yield response

def HandleProvidedFunctions(self, request: fedn.ProvidedFunctionsResponse, context):
"""Handles the 'provided_functions' request. Sends back which functions are available.
Expand All @@ -158,18 +152,19 @@ def HandleProvidedFunctions(self, request: fedn.ProvidedFunctionsResponse, conte
:return: dict with str -> bool for which functions are available
:rtype: :class:`fedn.network.grpc.fedn_pb2.ProvidedFunctionsResponse`
"""
try:
logger.info("Receieved provided functions request.")
server_functions_code = request.function_code
# if no new code return previous
if server_functions_code == self.server_functions_code:
logger.info("No new server function code provided.")
logger.info(f"Provided functions: {self.implemented_functions}")
return fedn.ProvidedFunctionsResponse(available_functions=self.implemented_functions)

self.server_functions_code = server_functions_code
self.implemented_functions = {}
self._instansiate_server_functions_code()
logger.info("Receieved provided functions request.")
server_functions_code = request.function_code
# if no new code return previous
if server_functions_code == self.server_functions_code:
logger.info("No new server function code provided.")
logger.info(f"Provided functions: {self.implemented_functions}")
return fedn.ProvidedFunctionsResponse(available_functions=self.implemented_functions)

self.server_functions_code = server_functions_code
self.implemented_functions = {}
self._instansiate_server_functions_code()

if self.implemented_functions == {}: # not defaultet due to error
functions = ["client_selection", "client_settings", "aggregate", "incremental_aggregate"]
# parse the entire code string into an AST
tree = ast.parse(server_functions_code)
Expand All @@ -185,20 +180,64 @@ def HandleProvidedFunctions(self, request: fedn.ProvidedFunctionsResponse, conte
else:
print(f"Function '{func}' not found.")
self.implemented_functions[func] = False
logger.info(f"Provided function: {self.implemented_functions}")
return fedn.ProvidedFunctionsResponse(available_functions=self.implemented_functions)
except Exception as e:
logger.error(f"Error handling provided functions request: {e}")

logger.info(f"Provided function: {self.implemented_functions}")
return fedn.ProvidedFunctionsResponse(available_functions=self.implemented_functions)

def _instansiate_server_functions_code(self):
# this will create a new user defined instance of the ServerFunctions class.
try:
namespace = {}
exec(self.server_functions_code, globals(), namespace) # noqa: S102
# create a stable synthetic filename to appear in tracebacks
self._server_code_filename = f"server_functions:{hash(self.server_functions_code)}"
code_obj = compile(self.server_functions_code, self._server_code_filename, "exec")

# prime linecache so traceback can show source lines
linecache.cache[self._server_code_filename] = (
len(self.server_functions_code),
None,
[ln if ln.endswith("\n") else ln + "\n" for ln in self.server_functions_code.splitlines()],
self._server_code_filename,
)

exec(code_obj, globals(), namespace) # noqa: S102
exec("server_functions = ServerFunctions()", globals(), namespace) # noqa: S102
self.server_functions = namespace.get("server_functions")
except Exception as e:
logger.error(f"Exec failed with error: {str(e)}")
logger.error(f"Exec failed: {e}")
self.server_functions = None
self.implemented_functions = dict.fromkeys(["client_selection", "client_settings", "aggregate", "incremental_aggregate"], False)

def _retire_and_log(self, func_name: str, err: Exception):
    """Disable a crashed user-defined function and log where it failed.

    The function is marked unavailable immediately, then the traceback is
    searched for frames belonging to the compiled user code so the offending
    source line can be reported; otherwise the full traceback is logged.
    """
    # Retire the function right away so it is not invoked again.
    if func_name in self.implemented_functions:
        self.implemented_functions[func_name] = False

    # Keep only the traceback frames that originate from the user's compiled code.
    source_name = getattr(self, "_server_code_filename", None)
    user_frames = [fr for fr in traceback.extract_tb(err.__traceback__) if source_name and fr.filename == source_name]

    if not user_frames:
        # No user frame matched: fall back to the full (server + user) traceback.
        logger.exception("%s failed, retiring until next code update: %s", func_name, err)
        return

    # The deepest user frame is where the failure actually occurred.
    crash_frame = user_frames[-1]
    # linecache was primed with the synthetic filename, so the source line is recoverable.
    offending_line = (_lc.getline(crash_frame.filename, crash_frame.lineno) or "").rstrip("\n")
    logger.error(
        "User function '%s' crashed at %s:%d in %s()\n> %s\nException: %s",
        func_name,
        crash_frame.filename,
        crash_frame.lineno,
        crash_frame.name,
        offending_line,
        repr(err),
    )


def serve():
Expand Down
Loading
Loading