28
28
logger .info ("Evaluating the LLM Change Agent." )
29
29
30
30
31
-
32
31
def download_document (url , input_dir ):
33
32
"""Download the document from the URL."""
34
33
if not os .path .exists (input_dir ):
@@ -174,44 +173,39 @@ def generate_changes_via_llm(eval_dir, output_dir, provider, model):
174
173
print (f"Predicted changes saved to { output_sub_dir } " )
175
174
176
175
177
def compare_changes(expected_dir: Path, output_dir: Path):
    """Compare the actual (expected) changes with the LLM-predicted changes.

    For each predicted YAML file under ``output_dir`` — laid out as
    ``<provider>/<model>/<filename>.yaml`` — load the file with the same name
    from ``expected_dir`` and hand each non-empty list of predicted changes,
    keyed by PR id, to ``compare_output_vs_expected``.

    :param expected_dir: Directory containing the ground-truth change YAML files.
    :param output_dir: Directory containing per-provider/per-model predicted YAML files.
    """
    output_files = list(output_dir.rglob("*.yaml"))
    # Each entry maps "provider_model" -> {filename: file_path};
    # parts[-3] is the provider directory and parts[-2] the model directory.
    # (renamed loop variable: `file` shadowed the builtin)
    output_files_list_of_dicts = [
        {f"{path.parts[-3]}_{path.parts[-2]}": {path.name: path}} for path in output_files
    ]

    for model_output in output_files_list_of_dicts:
        for provider_model, file_info in model_output.items():
            for filename, filepath in file_info.items():
                expected_file = expected_dir / filename
                with open(expected_file, "r") as ex, open(filepath, "r") as out:
                    # safe_load returns None for an empty document; fall back to {}
                    # so the .items() iteration below cannot raise AttributeError.
                    expected_yaml = yaml.safe_load(ex) or {}
                    output_yaml = yaml.safe_load(out) or {}
                # Only compare PR ids that appear in both files.
                expected_yaml_subset = {k: v for k, v in expected_yaml.items() if k in output_yaml}
                for pr_id, output_changes in output_yaml.items():
                    expected_change = expected_yaml_subset.get(pr_id)
                    if output_changes:
                        compare_output_vs_expected(expected_change, output_changes)
            logger.info(f"Finished comparing changes for {provider_model}")
-
205
def compare_output_vs_expected(expected_changes, output_changes: List):
    """Compare the expected changes with the LLM-output changes.

    :param expected_changes: Ground-truth changes for a single PR. May be
        ``None`` when the caller found no expected entry for the PR id.
    :param output_changes: Changes predicted by the LLM for the same PR;
        normalized before comparison.
    :return: Fraction of expected changes matched, as a float. Currently
        always ``0.0`` because the matching step is not yet implemented.
    """
    output_changes = normalize_changes(output_changes)
    accuracy = 0.0
    # Guard against None: the caller's .get(pr_id) can miss, and len(None) raises.
    total = len(expected_changes) if expected_changes else 0
    correct = 0
    # TODO: match each normalized output change against expected_changes and
    # increment `correct`. (Removed a leftover `pdb.set_trace()` debugger stop
    # that halted every evaluation run here.)
    if total:
        accuracy = correct / total
    return accuracy
def run_evaluate (model : str , provider : str ):
@@ -230,4 +224,3 @@ def run_evaluate(model: str, provider: str):
230
224
generate_changes_via_llm (model = model , provider = provider , eval_dir = eval_dir , output_dir = output_dir )
231
225
232
226
compare_changes (expected_dir = expected_dir , output_dir = output_dir )
233
-
0 commit comments