
Commit 37a777e

Update inference_utils.py
1 parent 119400d commit 37a777e

File tree

1 file changed: +21 -7 lines

impl/services/inference_utils.py

Lines changed: 21 additions & 7 deletions
@@ -83,6 +83,9 @@ def get_embeddings(
     return [result["embedding"] for result in response.data]


+import asyncio
+from litellm.exceptions import RateLimitError
+
 async def get_async_chat_completion_response(
     messages: List[Dict[str, Any]],
     model: Optional[str] = None,
@@ -116,13 +119,24 @@ async def get_async_chat_completion_response(
             else:
                 litellm_kwargs[key] = type_hints[key](value)

-        completion = await acompletion(
-            model=model,
-            messages=messages,
-            deployment_id=deployment_id,
-            **litellm_kwargs
-        )
-        return completion
+        max_retries = 5
+        for attempt in range(max_retries):
+            try:
+                # Your existing logic to get the response
+                completion = await acompletion(
+                    model=model,
+                    messages=messages,
+                    deployment_id=deployment_id,
+                    **litellm_kwargs
+                )
+                return completion
+            except RateLimitError as e:
+                if attempt < max_retries - 1:
+                    backoff_time = 2 ** attempt  # Exponential backoff
+                    await asyncio.sleep(backoff_time)
+                else:
+                    raise HTTPException(status_code=429, detail=f"Rate limit exceeded: {e}")
+
     except Exception as e:
         if "LLM Provider NOT provided" in e.args[0]:
             logger.error(f"Error: error {model} is not currently supported")
