aiq.eval.swe_bench_evaluator.evaluate#

Attributes#

logger

Classes#

SweBenchEvaluator

Module Contents#

logger#
class SweBenchEvaluator(
run_id: str,
max_workers: int,
output_dir: pathlib.Path,
)#
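
A minimal construction sketch; the run ID, worker count, and output path below are placeholder values, not defaults:

from pathlib import Path

from aiq.eval.swe_bench_evaluator.evaluate import SweBenchEvaluator

# Placeholder arguments: run_id tags this evaluation run, max_workers presumably
# bounds SWE-bench harness parallelism, and output_dir receives the final report.
evaluator = SweBenchEvaluator(
    run_id="example-run",
    max_workers=4,
    output_dir=Path("swe_bench_reports"),
)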
run_id#
max_workers#
output_dir#
_unsupported_repos = []#
_swe_bench_inputs = []#
_swe_bench_outputs = []#
_model_name_or_path = 'no_llm'#
get_model_name_from_output(workflow_output: list[dict]) → str | None#

Fetch the model_name_or_path from the first entry in the workflow output list.
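
A sketch of the lookup this docstring describes; returning None for an empty list or a missing key is an assumption consistent with the str | None annotation:

def get_model_name_from_output_sketch(workflow_output: list[dict]) -> str | None:
    # Assumption: absent entries or keys yield None rather than raising.
    if not workflow_output:
        return None
    return workflow_output[0].get("model_name_or_path")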

static empty_report_dir(report_dir: pathlib.Path)#

Remove the current contents of the report directory.
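
A standalone sketch of clearing a directory in place, assuming the directory itself is kept; the real method may instead delete and recreate it:

import shutil
from pathlib import Path

def empty_report_dir_sketch(report_dir: Path) -> None:
    # Delete files and subdirectories under report_dir, keeping report_dir itself.
    for entry in report_dir.iterdir():
        if entry.is_dir():
            shutil.rmtree(entry)
        else:
            entry.unlink()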

static move_report_and_logs(
swe_bench_report_file: str,
logs_dir: str,
report_dir: pathlib.Path,
)#

Temporary function to move the SWE-bench report and logs to the output directory.
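
A sketch of that move using shutil; the flat destination layout under report_dir is an assumption:

import shutil
from pathlib import Path

def move_report_and_logs_sketch(
    swe_bench_report_file: str,
    logs_dir: str,
    report_dir: Path,
) -> None:
    # Assumed layout: report file and logs directory land directly in report_dir.
    report_dir.mkdir(parents=True, exist_ok=True)
    shutil.move(swe_bench_report_file, report_dir / Path(swe_bench_report_file).name)
    shutil.move(logs_dir, report_dir / Path(logs_dir).name)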

is_repo_supported(repo: str, version: str) → bool#

Check whether the repo and version are supported by SWE-bench.
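
Usage sketch; the repo and version strings are hypothetical examples of SWE-bench instance fields:

# Instances whose repo/version pair the harness cannot build should be skipped.
if not evaluator.is_repo_supported("astropy/astropy", "5.0"):
    print("skipping unsupported repo")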

process_eval_input(
eval_input: aiq.eval.evaluator.evaluator_model.EvalInput,
) → tuple[pathlib.Path, pathlib.Path]#

Converts EvalInput into lists of SWEBenchInput and SWEBenchOutput models and applies filtering.
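
Usage sketch; the variable names below are hypothetical, since the docstring does not say what the two returned paths point to:

# eval_input: aiq.eval.evaluator.evaluator_model.EvalInput built by the eval harness.
# Assumption: the paths locate the SWE-bench input and output files written to disk.
inputs_path, outputs_path = evaluator.process_eval_input(eval_input)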

build_eval_output()#

Builds the EvalOutput object from the SWEBenchOutput models and the average score.
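
Usage sketch, assuming the assembled EvalOutput is returned (the signature above omits a return annotation):

# Call after evaluate() has populated the SWEBenchOutput models.
eval_output = evaluator.build_eval_output()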

static compute_score(success_cnt: int, total_cnt: int) → float#
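
The signature suggests a plain success ratio; a sketch, with the zero-total guard as an assumption:

def compute_score_sketch(success_cnt: int, total_cnt: int) -> float:
    # Fraction of resolved instances; 0.0 for an empty run is assumed behavior.
    if total_cnt == 0:
        return 0.0
    return success_cnt / total_cnt

assert compute_score_sketch(3, 4) == 0.75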
async evaluate(
eval_input: aiq.eval.evaluator.evaluator_model.EvalInput,
) → aiq.eval.evaluator.evaluator_model.EvalOutput#

Run the SWE-bench evaluation and store the report in the output directory.
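
evaluate() is a coroutine and must be awaited; a minimal driver sketch:

import asyncio

async def run_swe_bench(evaluator, eval_input):
    # Runs the SWE-bench evaluation; the report lands in evaluator.output_dir.
    return await evaluator.evaluate(eval_input)

# eval_output = asyncio.run(run_swe_bench(evaluator, eval_input))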