📓 Groundedness Evaluations¶
In many ways, feedback functions can be thought of as LLM apps themselves: given some text, they return a result. Thinking of them this way, we can use TruLens to evaluate and track the quality of our feedback functions. We can even do this for different models (e.g., GPT-3.5 and GPT-4) or prompting schemes (such as chain-of-thought reasoning).
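To make that concrete, here is a minimal sketch of a feedback function acting as a tiny text-in, score-out app. The relevance feedback and the example strings are illustrative only, are not part of the benchmark below, and assume your OPENAI_API_KEY is already configured (as done a few cells down).
from trulens.providers.openai import OpenAI

sketch_provider = OpenAI()

# A feedback function maps text to a score in [0, 1], much like a small LLM app.
score = sketch_provider.relevance(
    prompt="Where is Germany located?",
    response="Germany is a country in central Europe.",
)
print(score)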
This notebook walks through an evaluation of a set of test cases generated from human-annotated datasets. In particular, we generate test cases from SummEval.
SummEval is a dataset dedicated to automated evaluation of summarization tasks, which is closely related to groundedness evaluation in RAG, with the retrieved context playing the role of the source and the response playing the role of the summary. It contains human annotations of numerical scores (1 to 5) from 3 expert annotators and 5 crowd-sourced annotators. In total, 16 models are used to generate summaries for the 100 paragraphs in the test set, yielding 1,600 machine-generated summaries. Each paragraph also has several human-written summaries for comparative analysis.
For evaluating groundedness feedback functions, we use the annotated "consistency" scores, which measure whether the summarized response is factually consistent with the source text and can therefore serve as a proxy for groundedness in our RAG triad. We normalize these scores to the range 0 to 1 to produce our expected_score and to match the output range of the feedback functions.
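As a rough sketch of that normalization (the exact parsing and scaling live in generate_summeval_groundedness_golden_set; the ratings below are made up), one natural way to map averaged 1-to-5 consistency ratings onto a 0-to-1 expected score is a linear rescaling:
# Illustrative only: made-up consistency ratings from a few annotators (1-5 scale).
consistency_ratings = [5, 4, 5]

mean_consistency = sum(consistency_ratings) / len(consistency_ratings)
expected_score = (mean_consistency - 1) / 4  # linear map from [1, 5] to [0, 1]

print(round(expected_score, 3))  # 0.917 for this example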
# Import groundedness feedback function
from test_cases import generate_summeval_groundedness_golden_set
from trulens.apps.basic import TruBasicApp
from trulens.core import Feedback
from trulens.core import Select
from trulens.core import TruSession
from trulens.feedback import GroundTruthAgreement
TruSession().reset_database()
# generator for groundedness golden set
test_cases_gen = generate_summeval_groundedness_golden_set(
    "./datasets/summeval/summeval_test_100.json"
)
# specify the number of test cases we want to run the smoke test on
groundedness_golden_set = []
for i in range(5):
    groundedness_golden_set.append(next(test_cases_gen))
groundedness_golden_set[:5]
import os
os.environ["OPENAI_API_KEY"] = "..."
os.environ["HUGGINGFACE_API_KEY"] = "..."
Benchmarking various Groundedness feedback function providers (OpenAI GPT-3.5-turbo vs GPT-4 vs Huggingface)¶
import numpy as np
from trulens.feedback.v2.feedback import Groundedness
from trulens.providers.huggingface import Huggingface
from trulens.providers.openai import OpenAI
openai_provider = OpenAI()
openai_gpt4_provider = OpenAI(model_engine="gpt-4")
huggingface_provider = Huggingface()
groundedness_hug = Groundedness(groundedness_provider=huggingface_provider)
groundedness_openai = Groundedness(groundedness_provider=openai_provider)
groundedness_openai_gpt4 = Groundedness(
    groundedness_provider=openai_gpt4_provider
)
f_groundedness_hug = (
    Feedback(
        huggingface_provider.groundedness_measure,
        name="Groundedness Huggingface",
    )
    .on_input()
    .on_output()
    .aggregate(groundedness_hug.grounded_statements_aggregator)
)
def wrapped_groundedness_hug(input, output):
    return np.mean(list(f_groundedness_hug(input, output)[0].values()))
f_groundedness_openai = (
    Feedback(
        OpenAI(model_engine="gpt-3.5-turbo").groundedness_measure,
        name="Groundedness OpenAI GPT-3.5",
    )
    .on_input()
    .on_output()
    .aggregate(groundedness_openai.grounded_statements_aggregator)
)
def wrapped_groundedness_openai(input, output):
    return f_groundedness_openai(input, output)[0]["full_doc_score"]
f_groundedness_openai_gpt4 = (
    Feedback(
        openai_gpt4_provider.groundedness_measure,
        name="Groundedness OpenAI GPT-4",
    )
    .on_input()
    .on_output()
    .aggregate(groundedness_openai_gpt4.grounded_statements_aggregator)
)
def wrapped_groundedness_openai_gpt4(input, output):
    return f_groundedness_openai_gpt4(input, output)[0]["full_doc_score"]
# Create a Feedback object using the absolute_error method of the GroundTruthAgreement object
ground_truth = GroundTruthAgreement(groundedness_golden_set, provider=OpenAI())
# Apply absolute_error to each wrapped app's input/output and aggregate to get the mean absolute error
f_absolute_error = (
    Feedback(ground_truth.absolute_error, name="Mean Absolute Error")
    .on(Select.Record.calls[0].args.args[0])
    .on(Select.Record.calls[0].args.args[1])
    .on_output()
)
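For intuition, the "Mean Absolute Error" aggregation is just the average absolute gap between each feedback score and its annotated expected score. A quick sketch with made-up numbers:
# Illustrative only: made-up feedback scores vs. expected (annotated) scores.
feedback_scores = [0.9, 0.4, 1.0]
expected_scores = [1.0, 0.5, 0.75]

mae = sum(abs(f - e) for f, e in zip(feedback_scores, expected_scores)) / len(
    feedback_scores
)
print(round(mae, 2))  # 0.15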
tru_wrapped_groundedness_hug = TruBasicApp(
    wrapped_groundedness_hug,
    app_name="groundedness",
    app_version="huggingface",
    feedbacks=[f_absolute_error],
)
tru_wrapped_groundedness_openai = TruBasicApp(
    wrapped_groundedness_openai,
    app_name="groundedness",
    app_version="openai gpt-3.5",
    feedbacks=[f_absolute_error],
)
tru_wrapped_groundedness_openai_gpt4 = TruBasicApp(
    wrapped_groundedness_openai_gpt4,
    app_name="groundedness",
    app_version="openai gpt-4",
    feedbacks=[f_absolute_error],
)
for i in range(len(groundedness_golden_set)):
    source = groundedness_golden_set[i]["query"]
    response = groundedness_golden_set[i]["response"]
    with tru_wrapped_groundedness_hug as recording:
        tru_wrapped_groundedness_hug.app(source, response)
    with tru_wrapped_groundedness_openai as recording:
        tru_wrapped_groundedness_openai.app(source, response)
    with tru_wrapped_groundedness_openai_gpt4 as recording:
        tru_wrapped_groundedness_openai_gpt4.app(source, response)
TruSession().get_leaderboard().sort_values(by="Mean Absolute Error")