In [4]:
import datasets

# Load the benchmark dataset from the Hub; only its "train" split is used below.
eval_ds = datasets.load_dataset("m-ric/agents_medium_benchmark_2")["train"]

### Define utilities and tools
To run the search tool (`GoogleSearchTool`, used below) through SerpAPI, you will need a [SerpAPI](https://serpapi.com/dashboard) API key, which requires a paid account.

In [5]:
import time
import json
import os
import re
import string
import warnings
from tqdm import tqdm
from typing import List

from smolagents import (
    GoogleSearchTool,
    CodeAgent,
    ToolCallingAgent,
    HfApiModel,
    AgentError,
    VisitWebpageTool,
    PythonInterpreterTool,
)
from smolagents.agents import ActionStep
from dotenv import load_dotenv

# Read environment variables from a local .env file (presumably the API keys
# needed by the tools and models -- confirm against your setup).
load_dotenv()
# Every results file produced below is written under output/.
os.makedirs("output", exist_ok=True)


def serialize_agent_error(obj):
    """Convert an object that `json` cannot encode into something it can.

    An `AgentError` becomes a dict holding its class name and its message;
    anything else is replaced by its `str()` representation.
    """
    if not isinstance(obj, AgentError):
        return str(obj)
    return {"error_type": obj.__class__.__name__, "message": obj.message}


def answer_questions(eval_ds, file_name, agent, model_id, action_type):
    """Run `agent` on each question of `eval_ds` and append the results to a JSONL file.

    Questions whose text is already recorded in `file_name` are skipped, so an
    interrupted run can be resumed by calling this function again with the same
    file. A question answered during the current call is remembered as well, so
    a duplicate later in `eval_ds` is skipped exactly as it would be on resume.

    Args:
        eval_ds: Sized iterable of examples; each example must provide the keys
            "question", "source" and "true_answer".
        file_name: Path of the JSONL file to read earlier answers from and to
            append new records to.
        agent: Object exposing `run(question)`, `logs` and
            `monitor.get_total_token_counts()`.
        model_id: Value stored in each record's "model_id" field.
        action_type: Value stored in each record's "agent_action_type" field.

    Any exception raised while handling one example is printed and the loop
    moves on to the next example; nothing is written for that example.
    """
    # A set keeps the "already answered?" check O(1) per question.
    answered_questions = set()
    if os.path.exists(file_name):
        with open(file_name, "r", encoding="utf-8") as f:
            for line in f:
                if not line.strip():
                    # Tolerate blank lines instead of failing in json.loads.
                    continue
                answered_questions.add(json.loads(line)["question"])

    for example in tqdm(eval_ds, total=len(eval_ds)):
        try:
            question = example["question"]
            if example["source"] == "SimpleQA":
                question += " Answer with only the final number."
            if question in answered_questions:
                continue
            start_time = time.time()
            answer = agent.run(question)
            end_time = time.time()

            # Remove memory from logs to make them more compact.
            for step_log in agent.logs:
                if hasattr(step_log, "memory"):
                    step_log.memory = None
                if isinstance(step_log, ActionStep):
                    step_log.agent_memory = None

            annotated_example = {
                "model_id": model_id,
                "agent_action_type": action_type,
                "question": question,
                "answer": answer,
                "true_answer": example["true_answer"],
                "source": example["source"],
                "intermediate_steps": str(agent.logs),
                "start_time": start_time,
                "end_time": end_time,
                "token_counts": agent.monitor.get_total_token_counts(),
            }

            with open(file_name, "a", encoding="utf-8") as f:
                json.dump(annotated_example, f, default=serialize_agent_error)
                f.write("\n")  # add a newline for JSONL format

            # Only mark the question as done once its record is on disk.
            answered_questions.add(question)
        except Exception as e:
            print("Failed:", e)


def normalize_number_str(number_str: str) -> float:
    """Parse a numeric string after dropping every "$", "%" and "," character.

    Returns `float("inf")` when what remains cannot be converted to a float.
    """
    # Strip the common unit symbols and thousands separators in one pass.
    stripped = number_str.translate(str.maketrans("", "", "$%,"))
    try:
        return float(stripped)
    except ValueError:
        return float("inf")


def split_string(
    s: str,
    char_list: list[str] = [",", ";"],
) -> list[str]:
    """Split `s` on any of the separator characters in `char_list`.

    Args:
        s: The string to split.
        char_list: Separator characters. The default list is only read, never
            mutated. An empty list means "no separators".

    Returns:
        The pieces of `s` between separators, as returned by `re.split`
        (whitespace around the pieces is kept). With no separators the result
        is `[s]`.
    """
    if not char_list:
        # "[]" is not a valid character class; nothing to split on.
        return [s]
    # Escape every separator so that characters with a special meaning inside
    # a character class ("^", "]", "-", "\") are matched literally.
    pattern = "[" + "".join(re.escape(char) for char in char_list) + "]"
    return re.split(pattern, s)


def is_float(element: object) -> bool:
    """Return True when `float(element)` succeeds, False otherwise.

    Values that `float()` rejects with a `TypeError` (e.g. `None`, a list) or
    an `OverflowError` (an integer too large for a float) count as "not a
    float" instead of propagating the exception.
    """
    try:
        float(element)
    except (TypeError, ValueError, OverflowError):
        return False
    return True


def normalize_str(input_str, remove_punct=True) -> str:
    """
    Produce a canonical form of a string for comparison.

    Steps:
    - every whitespace character is dropped (so "sea gull" equals "seagull")
    - the text is lowercased
    - characters in `string.punctuation` are dropped when `remove_punct` is True

    Parameters:
    - input_str: str, the string to normalize
    - remove_punct: bool, whether to remove punctuation (default: True)
    Returns:
    - str, the normalized string
    """
    lowered = re.sub(r"\s", "", input_str).lower()
    if not remove_punct:
        return lowered
    return lowered.translate(str.maketrans("", "", string.punctuation))


def extract_numbers(text: str) -> List[str]:
    """Return every number found in `text`, in order, as strings.

    A number is an optional minus sign, then either digits grouped with comma
    thousands separators or a plain run of digits, then an optional decimal
    part. Commas are removed from the returned strings.
    """
    number_re = re.compile(r"-?(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?")
    return [found.group(0).replace(",", "") for found in number_re.finditer(text)]


def get_question_score_gaia(
    model_answer: str,
    ground_truth: str,
) -> bool:
    """Decide whether `model_answer` matches `ground_truth`.

    The comparison mode is chosen from the ground truth:
    - if it parses as a float, the answer is parsed with
      `normalize_number_str` and compared numerically;
    - if it contains "," or ";", both strings are split into lists that must
      have the same length and match element by element (numerically where
      the ground-truth element parses as a float, otherwise as normalized
      strings with punctuation kept);
    - otherwise both are compared as normalized strings with punctuation
      removed.

    Args:
        model_answer: The answer to score. It is converted with `str()` first,
            so a non-string answer such as `None` is scored as a mismatch
            instead of raising inside the list or string comparison.
        ground_truth: The reference answer.

    Returns:
        True when the answer matches, False otherwise. A `UserWarning` is
        emitted when the two lists have different lengths.
    """
    # Coerce once so that every branch below handles non-string answers the
    # same way the numeric branch does.
    model_answer = str(model_answer)

    if is_float(ground_truth):
        return normalize_number_str(model_answer) == float(ground_truth)

    if any(char in ground_truth for char in [",", ";"]):  # if gt is a list
        # question with the fish: normalization removes punct
        gt_elems = split_string(ground_truth)
        ma_elems = split_string(model_answer)

        if len(gt_elems) != len(ma_elems):  # check length is the same
            warnings.warn(
                "Answer lists have different lengths, returning False.", UserWarning
            )
            return False

        def _elements_match(ma_elem, gt_elem):
            # compare each element as float or str
            if is_float(gt_elem):
                return normalize_number_str(ma_elem) == float(gt_elem)
            # we do not remove punct since comparisons can include punct
            return normalize_str(ma_elem, remove_punct=False) == normalize_str(
                gt_elem, remove_punct=False
            )

        return all(
            _elements_match(ma_elem, gt_elem)
            for ma_elem, gt_elem in zip(ma_elems, gt_elems)
        )

    # if gt is a str
    return normalize_str(model_answer) == normalize_str(ground_truth)

## Run benchmark

### Open models

In [None]:
# Open-weight models to evaluate; commented-out entries are not run.
open_model_ids = [
    "meta-llama/Llama-3.3-70B-Instruct",
    # "Qwen/QwQ-32B-Preview",
    "Qwen/Qwen2.5-72B-Instruct",
    "Qwen/Qwen2.5-Coder-32B-Instruct",
    "meta-llama/Llama-3.2-3B-Instruct",
    # "HuggingFaceTB/SmolLM2-1.7B-Instruct",
    # "meta-llama/Llama-3.1-70B-Instruct",
]

for model_id in open_model_ids:
    print(f"Evaluating '{model_id}'...")

    # Each model is benchmarked twice: first as a tool-calling agent, then as
    # a code agent. Every (model, action type) pair gets its own results file.
    for action_type in ("tool_calling", "code"):
        if action_type == "tool_calling":
            agent = ToolCallingAgent(
                tools=[GoogleSearchTool(), VisitWebpageTool(), PythonInterpreterTool()],
                model=HfApiModel(model_id),
                max_steps=10,
            )
        else:
            agent = CodeAgent(
                tools=[GoogleSearchTool(), VisitWebpageTool()],
                model=HfApiModel(model_id),
                additional_authorized_imports=["numpy"],
                max_steps=10,
            )
        file_name = f"output/{model_id.replace('/', '_')}-{action_type}-26-dec-2024.jsonl"
        answer_questions(eval_ds, file_name, agent, model_id, action_type)

### Closed models

In [None]:
from smolagents import LiteLLMModel

# Models evaluated through LiteLLM.
litellm_model_ids = ["gpt-4o", "anthropic/claude-3-5-sonnet-latest"]

for model_id in litellm_model_ids:
    print(f"Evaluating '{model_id}'...")

    # Each model is benchmarked twice: first as a tool-calling agent, then as
    # a code agent. Every (model, action type) pair gets its own results file.
    for action_type in ("tool_calling", "code"):
        if action_type == "tool_calling":
            agent = ToolCallingAgent(
                tools=[GoogleSearchTool(), VisitWebpageTool(), PythonInterpreterTool()],
                model=LiteLLMModel(model_id),
                max_steps=10,
            )
        else:
            agent = CodeAgent(
                tools=[GoogleSearchTool(), VisitWebpageTool()],
                model=LiteLLMModel(model_id),
                additional_authorized_imports=["numpy"],
                max_steps=10,
            )
        file_name = f"output/{model_id.replace('/', '_')}-{action_type}-26-dec-2024.jsonl"
        answer_questions(eval_ds, file_name, agent, model_id, action_type)

In [3]:
# import glob
# import json
# jsonl_files = glob.glob(f"output/*.jsonl")

# for file_path in jsonl_files:
#     print(file_path)
#     # Read all lines and filter out SimpleQA sources
#     filtered_lines = []
#     removed = 0
#     with open(file_path, 'r', encoding='utf-8') as f:
#         for line in f:
#             try:
#                 data = json.loads(line.strip())
#                 if data["source"] == "SimpleQA" and "Answer with only the final number." not in data["question"]:
#                     removed +=1
#                 else:
#                     filtered_lines.append(line)
#             except json.JSONDecodeError:
#                 print("Invalid line:", line)
#                 continue  # Skip invalid JSON lines
#     print(f"Removed {removed} lines.")
#     # Write filtered content back to the same file
#     with open(file_path, 'w', encoding='utf-8') as f:
#         f.writelines(filtered_lines)

## Score answers

In [9]:
import pandas as pd
import glob

# Read every JSONL results file under output/ and stack them into one frame.
res = [pd.read_json(path, lines=True) for path in glob.glob("output/*.jsonl")]
result_df = pd.concat(res)


def get_correct(row):
    """Score one results row, returning True when its answer is correct.

    For rows whose "source" is "GSM8K", the last number found in the answer is
    compared numerically with "true_answer" (no number at all counts as
    wrong). Every other source is scored with `get_question_score_gaia`.
    """
    if row["source"] != "GSM8K":
        return get_question_score_gaia(str(row["answer"]), str(row["true_answer"]))

    numbers_answer = extract_numbers(str(row["answer"]))
    if not numbers_answer:
        return False
    return float(numbers_answer[-1]) == float(row["true_answer"])


# Mark every answer as correct or incorrect.
result_df["correct"] = result_df.apply(get_correct, axis=1)

# Keep only the code-agent runs, and leave out the models listed here.
excluded_model_ids = [
    "meta-llama/Llama-3.2-3B-Instruct",
    "meta-llama/Llama-3.1-70B-Instruct",
    "HuggingFaceTB/SmolLM2-1.7B-Instruct",
]
is_code_agent = result_df["agent_action_type"] == "code"
is_kept_model = ~result_df["model_id"].isin(excluded_model_ids)
result_df = result_df.loc[is_code_agent & is_kept_model]

# Accuracy in percent, rounded to one decimal, per (model, source) pair.
accuracy_pct = result_df.groupby(["model_id", "source"])[["correct"]].mean() * 100
result_df = accuracy_pct.round(1).reset_index()
result_df["type"] = "agent"



In [12]:
# Hard-coded "vanilla" scores as [model_id, source, correct (%)] rows.
# NOTE(review): presumably the same models without an agent; the provenance of
# these numbers is not recorded here -- confirm.
vanilla_data = [
    ["gpt-4o", "SimpleQA", 38.2],
    ["gpt-4o", "GAIA", 9.3],
    ["Qwen/Qwen2.5-72B-Instruct", "SimpleQA", 9.1],
    ["anthropic/claude-3-5-sonnet-latest", "SimpleQA", 28.4],
    ["gpt-4o", "GSM8K", 94.3],
    ["anthropic/claude-3-5-sonnet-latest", "GSM8K", 96.4],
    ["meta-llama/Llama-3.3-70B-Instruct", "GSM8K", 95.1],
]

df2 = pd.DataFrame(
    vanilla_data, columns=["model_id", "source", "correct"]
).assign(type="vanilla")

# Stack agent and vanilla scores, then spread "type" into one column each so
# every (model, source) row shows both numbers side by side.
combined_df = pd.concat([result_df, df2], ignore_index=True)
pivot_df = combined_df.pivot_table(
    values="correct",
    index=["model_id", "source"],
    columns=["type"],
    fill_value=float("nan"),
).reset_index()

### Display results

In [11]:
# Show the agent / vanilla pivot table in the notebook output.
display(pivot_df)

type,model_id,source,agent,vanilla
0,Qwen/Qwen2.5-72B-Instruct,GAIA,12.5,
1,Qwen/Qwen2.5-72B-Instruct,GSM8K,82.9,
2,Qwen/Qwen2.5-72B-Instruct,SimpleQA,42.5,9.1
3,Qwen/Qwen2.5-Coder-32B-Instruct,GAIA,28.1,
4,Qwen/Qwen2.5-Coder-32B-Instruct,GSM8K,92.9,
5,Qwen/Qwen2.5-Coder-32B-Instruct,SimpleQA,42.5,
6,anthropic/claude-3-5-sonnet-latest,GAIA,43.8,
7,anthropic/claude-3-5-sonnet-latest,GSM8K,91.4,96.4
8,anthropic/claude-3-5-sonnet-latest,SimpleQA,47.5,28.4
9,gpt-4o,GAIA,25.0,9.3


In [29]:
def create_mathjax_table(pivot_df, formatted_df):
    """Render `formatted_df` as a MathJax `array` and return it as a string.

    The table has four columns (Model, Task, Agent, Vanilla). Rows are sorted
    by "model_id" then "source"; a horizontal rule separates models, and the
    model name is printed only on the first row of its group. Model names
    containing "Qwen" or "anthropic" are set in italics.

    Args:
        pivot_df: Accepted for interface compatibility; not used.
        formatted_df: DataFrame with "model_id", "source", "agent" and
            "vanilla" columns; the last two are inserted into the table as-is.

    Returns:
        The MathJax source, one table row per line.
    """
    lines = [
        # l for left-aligned model and task, c for centered numbers
        "\\begin{array}{llcc}",
        "\\text{Model} & \\text{Task} & \\text{Agent} & \\text{Vanilla} \\\\",
        "\\hline",
    ]

    previous_model = None
    ordered_rows = formatted_df.sort_values(["model_id", "source"])
    for _, row in ordered_rows.iterrows():
        model = row["model_id"]
        same_as_previous = previous_model == model

        # Rule between two different models (never before the first one).
        if previous_model is not None and previous_model != model:
            lines.append("\\hline")

        label = model.replace("_", "\\_")
        if "Qwen" in model or "anthropic" in model:
            label = f"\\textit{{{label}}}"
        # Repeated model names are replaced by a blank spacer.
        if same_as_previous:
            label = "\\;"

        lines.append(
            f"{label} & {row['source']} & {row['agent']} & {row['vanilla']} \\\\"
        )
        previous_model = model

    lines.extend(["\\hline", "\\end{array}"])
    return "\n".join(lines)


# Usage (after running your previous data processing code):
#
# `formatted_df` is not defined by any earlier cell, so on a fresh kernel
# ("Restart & Run All") this cell would fail with a NameError. It is rebuilt
# here from `pivot_df`.
# NOTE(review): the formatting rules (three decimals, "-" for a missing score,
# bold for the strictly higher of two available scores) are reconstructed from
# the table printed under this cell -- confirm they match the original intent,
# in particular for ties, where nothing is bolded here.
def _format_score_pair(agent_score, vanilla_score):
    """Return the (agent, vanilla) cell texts for one table row."""
    agent_missing = pd.isna(agent_score)
    vanilla_missing = pd.isna(vanilla_score)
    agent_cell = "-" if agent_missing else f"{agent_score:.3f}"
    vanilla_cell = "-" if vanilla_missing else f"{vanilla_score:.3f}"
    if not agent_missing and not vanilla_missing:
        if agent_score > vanilla_score:
            agent_cell = f"\\textbf{{{agent_cell}}}"
        elif vanilla_score > agent_score:
            vanilla_cell = f"\\textbf{{{vanilla_cell}}}"
    return agent_cell, vanilla_cell


_score_cells = [
    _format_score_pair(agent_score, vanilla_score)
    for agent_score, vanilla_score in zip(pivot_df["agent"], pivot_df["vanilla"])
]
formatted_df = pivot_df.assign(
    agent=[agent_cell for agent_cell, _ in _score_cells],
    vanilla=[vanilla_cell for _, vanilla_cell in _score_cells],
)

mathjax_table = create_mathjax_table(pivot_df, formatted_df)
print(mathjax_table)

\begin{array}{llcc}
\text{Model} & \text{Task} & \text{Agent} & \text{Vanilla} \\
\hline
\textit{Qwen/Qwen2.5-72B-Instruct} & GAIA & 12.500 & - \\
\; & GSM8K & 82.900 & - \\
\; & SimpleQA & \textbf{42.500} & 9.100 \\
\hline
\textit{Qwen/Qwen2.5-Coder-32B-Instruct} & GAIA & 28.100 & - \\
\; & GSM8K & 92.900 & - \\
\; & SimpleQA & 42.500 & - \\
\hline
\textit{anthropic/claude-3-5-sonnet-latest} & GAIA & 43.800 & - \\
\; & GSM8K & 91.400 & \textbf{96.400} \\
\; & SimpleQA & \textbf{47.500} & 28.400 \\
\hline
gpt-4o & GAIA & \textbf{25.000} & 9.300 \\
\; & GSM8K & 91.400 & \textbf{94.300} \\
\; & SimpleQA & \textbf{60.000} & 38.200 \\
\hline
meta-llama/Llama-3.3-70B-Instruct & GAIA & 21.900 & - \\
\; & GSM8K & \textbf{95.700} & 95.100 \\
\; & SimpleQA & 30.000 & - \\
\hline
\end{array}
