API-Bank is a comprehensive benchmark for evaluating tool-augmented LLMs. It includes 73 API tools and 314 annotated tool-use dialogues with 753 API calls, and it tests LLMs' capabilities in planning, retrieving, and calling APIs.
from benchthing import Bench
# Build a Bench object for the "api-bank" benchmark.
bench = Bench("api-bank")

# Run the benchmark as task "1". `yourLanguageModels` is not defined in this
# snippet -- presumably a placeholder for the models to evaluate; supply your
# own value before running.
bench.run(benchmark="api-bank", task_id="1", models=yourLanguageModels)

# Fetch the result using the same task id that was passed to run().
result = bench.get_result("1")