📓 Answer Relevance Feedback Evaluation
In many ways, feedback functions can be thought of as LLM apps themselves: given text, they return some result. Thinking this way, we can use TruLens to evaluate and track the quality of our feedback functions. We can even do this for different models (e.g., gpt-3.5-turbo and gpt-4) or prompting schemes (such as chain-of-thought reasoning).
This notebook walks through an evaluation on a set of test cases. You are encouraged to run it yourself and to extend the test cases so they cover your own scenario or domain.
# Import the answer relevance golden set and TruLens components
from test_cases import answer_relevance_golden_set
from trulens.apps.basic import TruBasicApp
from trulens.core import Feedback
from trulens.core import Select
from trulens.core import TruSession
from trulens.feedback import GroundTruthAgreement
from trulens.providers.litellm import LiteLLM
from trulens.providers.openai import OpenAI
TruSession().reset_database()
import os

# Replace the placeholders with your own API keys for each provider used below.
os.environ["OPENAI_API_KEY"] = "..."
os.environ["COHERE_API_KEY"] = "..."
os.environ["HUGGINGFACE_API_KEY"] = "..."
os.environ["ANTHROPIC_API_KEY"] = "..."
os.environ["TOGETHERAI_API_KEY"] = "..."
# GPT 3.5
turbo = OpenAI(model_engine="gpt-3.5-turbo")

def wrapped_relevance_turbo(input, output):
    return turbo.relevance(input, output)

# GPT 4
gpt4 = OpenAI(model_engine="gpt-4")

def wrapped_relevance_gpt4(input, output):
    return gpt4.relevance(input, output)

# Cohere
command_nightly = LiteLLM(model_engine="cohere/command-nightly")

def wrapped_relevance_command_nightly(input, output):
    return command_nightly.relevance(input, output)

# Anthropic
claude_1 = LiteLLM(model_engine="claude-instant-1")

def wrapped_relevance_claude1(input, output):
    return claude_1.relevance(input, output)

claude_2 = LiteLLM(model_engine="claude-2")

def wrapped_relevance_claude2(input, output):
    return claude_2.relevance(input, output)
# Meta
llama_2_7b = LiteLLM(
    model_engine="together_ai/togethercomputer/Llama-2-7B-32K-Instruct"
)

def wrapped_relevance_llama2(input, output):
    return llama_2_7b.relevance(input, output)
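Before wiring these wrappers into TruLens, you can sanity-check one of them directly; each wrapper simply returns the provider's relevance score for a prompt/response pair. A quick check with hypothetical inputs:
# Optional sanity check (hypothetical inputs); should print a relevance score
# between 0 and 1.
print(
    wrapped_relevance_turbo(
        "What is the capital of France?",
        "Paris is the capital of France.",
    )
)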
Here we'll set up our golden set as a set of prompts, responses, and expected scores stored in test_cases.py. The absolute_error method will then look up the expected score for each prompt/response pair by exact match and take the L1 (absolute) difference between the score a provider actually returned and the expected score.
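Each golden-set entry is read by its "query" and "response" keys in the loop further below, along with the expected score that absolute_error looks up. A minimal sketch of what one entry might look like (illustrative values and an assumed expected_score key, not copied from test_cases.py):
# Illustrative golden-set entry (hypothetical values; the real data lives in
# test_cases.py):
example_entry = {
    "query": "Who was the first person to walk on the moon?",
    "response": "Neil Armstrong was the first person to walk on the moon.",
    "expected_score": 1.0,  # a fully relevant answer
}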
# Create a Feedback object using the absolute_error method of the
# ground_truth object
ground_truth = GroundTruthAgreement(
    answer_relevance_golden_set, provider=OpenAI()
)

# Call the absolute_error method on the wrapped feedback's input, output, and
# score, and aggregate to get the mean absolute error
f_mae = (
    Feedback(ground_truth.absolute_error, name="Mean Absolute Error")
    .on(Select.Record.calls[0].args.args[0])  # the prompt passed to the wrapper
    .on(Select.Record.calls[0].args.args[1])  # the response passed to the wrapper
    .on_output()  # the relevance score the wrapper returned
)
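To make the arithmetic concrete: for each recorded call, absolute_error compares the relevance score a provider returned against the expected score from the golden set, and the leaderboard averages these per-record errors into the Mean Absolute Error. A hypothetical example of the per-record computation:
# Hypothetical per-record error behind the MAE (illustrative numbers only):
expected_score = 0.9  # looked up from the golden set
actual_score = 0.7  # returned by a provider's relevance feedback
print(abs(actual_score - expected_score))  # 0.2 for this record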
tru_wrapped_relevance_turbo = TruBasicApp(
    wrapped_relevance_turbo,
    app_name="answer relevance",
    app_version="gpt-3.5-turbo",
    feedbacks=[f_mae],
)

tru_wrapped_relevance_gpt4 = TruBasicApp(
    wrapped_relevance_gpt4,
    app_name="answer relevance",
    app_version="gpt-4",
    feedbacks=[f_mae],
)

tru_wrapped_relevance_commandnightly = TruBasicApp(
    wrapped_relevance_command_nightly,
    app_name="answer relevance",
    app_version="Command-Nightly",
    feedbacks=[f_mae],
)

tru_wrapped_relevance_claude1 = TruBasicApp(
    wrapped_relevance_claude1,
    app_name="answer relevance",
    app_version="Claude 1",
    feedbacks=[f_mae],
)

tru_wrapped_relevance_claude2 = TruBasicApp(
    wrapped_relevance_claude2,
    app_name="answer relevance",
    app_version="Claude 2",
    feedbacks=[f_mae],
)
tru_wrapped_relevance_llama2 = TruBasicApp(
    wrapped_relevance_llama2,
    app_name="answer relevance",
    app_version="Llama-2-7B-32K-Instruct",
    feedbacks=[f_mae],
)
# Run every provider's wrapped relevance feedback on each golden-set pair;
# the f_mae feedback is computed on each recorded call.
for i in range(len(answer_relevance_golden_set)):
    prompt = answer_relevance_golden_set[i]["query"]
    response = answer_relevance_golden_set[i]["response"]

    with tru_wrapped_relevance_turbo as recording:
        tru_wrapped_relevance_turbo.app(prompt, response)

    with tru_wrapped_relevance_gpt4 as recording:
        tru_wrapped_relevance_gpt4.app(prompt, response)

    with tru_wrapped_relevance_commandnightly as recording:
        tru_wrapped_relevance_commandnightly.app(prompt, response)

    with tru_wrapped_relevance_claude1 as recording:
        tru_wrapped_relevance_claude1.app(prompt, response)

    with tru_wrapped_relevance_claude2 as recording:
        tru_wrapped_relevance_claude2.app(prompt, response)

    with tru_wrapped_relevance_llama2 as recording:
        tru_wrapped_relevance_llama2.app(prompt, response)
TruSession().get_leaderboard().sort_values(by="Mean Absolute Error")
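The leaderboard has one row per app_version; the provider whose scores deviate least from the golden set shows the lowest Mean Absolute Error. If the trulens-dashboard package is installed, you can also browse the individual records interactively (a sketch assuming the TruLens 1.x dashboard entry point):
# Optional: launch the TruLens dashboard to inspect individual records
# (assumes the trulens-dashboard package is installed).
from trulens.dashboard import run_dashboard

run_dashboard()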