Persist Ground Truth Datasets
In this notebook, we give a quick walkthrough of how to prepare your own ground truth dataset, as well as how to use our utility function to load preprocessed BEIR (Benchmarking Information Retrieval) datasets and take advantage of their unified format.
# !pip install trulens trulens-providers-openai openai
import os
os.environ["OPENAI_API_KEY"] = "sk-..."
from trulens.core import TruSession
session = TruSession()
session.reset_database()
Add a custom ground truth dataset to TruLens
Create a custom ground truth dataset. You can include queries, expected responses, and even expected chunks if evaluating retrieval.
import pandas as pd
data = {
    "query": ["hello world", "who is the president?", "what is AI?"],
    "query_id": ["1", "2", "3"],
    "expected_response": ["greeting", "Joe Biden", "Artificial Intelligence"],
    "expected_chunks": [
        [
            {
                "text": "All CS major students must know the term 'Hello World'",
                "title": "CS 101",
            }
        ],
        [
            {
                "text": "Barack Obama was the president of the US (POTUS) from 2009 to 2017.",
                "title": "US Presidents",
            }
        ],
        [
            {
                "text": "AI is the simulation of human intelligence processes by machines, especially computer systems.",
                "title": "AI is not a bubble :(",
            }
        ],
    ],
}
df = pd.DataFrame(data)
Idempotency in TruLens datasets
IDs for both datasets and ground truth data entries are based on their content and metadata, so add_ground_truth_to_dataset is idempotent and should not create duplicate rows in the DB, as the short check below demonstrates.
session.add_ground_truth_to_dataset(
    dataset_name="test_dataset_new",
    ground_truth_df=df,
    dataset_metadata={"domain": "Random QA"},
)
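As a quick sanity check (a minimal sketch using only the session and dataframe from above), re-adding the identical dataframe should leave the persisted row count unchanged:
# Re-adding the same dataframe should be a no-op thanks to content-based IDs.
session.add_ground_truth_to_dataset(
    dataset_name="test_dataset_new",
    ground_truth_df=df,
    dataset_metadata={"domain": "Random QA"},
)
assert len(session.get_ground_truth("test_dataset_new")) == len(df)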
Retrieving the ground truth dataset from the DB for ground truth evaluation (semantic similarity)
Below we show how to retrieve the ground truth dataset (or a subset of it) that we just persisted, and use it as the golden set in the GroundTruthAgreement feedback function to perform ground truth lookup and evaluation.
ground_truth_df = session.get_ground_truth("test_dataset_new")
ground_truth_df
from trulens.core import Feedback
from trulens.feedback import GroundTruthAgreement
from trulens.providers.openai import OpenAI as fOpenAI
f_groundtruth = Feedback(
    GroundTruthAgreement(ground_truth_df, provider=fOpenAI()).agreement_measure,
    name="Ground Truth (semantic similarity measurement)",
).on_input_output()
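To get a feel for what this feedback computes before wiring it into an app, you can also call the underlying measure directly. This is a minimal sketch, assuming agreement_measure accepts a prompt and a response; it looks up the expected response for the prompt in the golden set and scores semantic similarity:
# Standalone call (assumed (prompt, response) signature) against the golden set.
standalone_score = GroundTruthAgreement(
    ground_truth_df, provider=fOpenAI()
).agreement_measure(
    "what is AI?",
    "AI lets machines simulate human intelligence.",
)
standalone_score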
Create Simple LLM Application
from openai import OpenAI
from trulens.apps.custom import instrument
oai_client = OpenAI()
class APP:
    @instrument
    def completion(self, prompt):
        completion = (
            oai_client.chat.completions.create(
                model="gpt-4o-mini",
                temperature=0,
                messages=[
                    {
                        "role": "user",
                        "content": f"Please answer the question: {prompt}",
                    }
                ],
            )
            .choices[0]
            .message.content
        )
        return completion
llm_app = APP()
Instrument the app for logging with TruLens
# add trulens as a context manager for llm_app
from trulens.apps.custom import TruCustomApp
tru_app = TruCustomApp(
    llm_app, app_name="LLM App v1", feedbacks=[f_groundtruth]
)
# The instrumented app can operate as a context manager:
with tru_app as recording:
    llm_app.completion("what is AI?")
session.get_leaderboard(app_ids=[tru_app.app_id])
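For a record-level view of the logged call and its ground truth feedback score (a small sketch, assuming the records dataframe exposes one column per feedback name):
# Fetch logged records and their feedback results for this app.
records_df, feedback_names = session.get_records_and_feedback(
    app_ids=[tru_app.app_id]
)
records_df[feedback_names]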
session.reset_database()
Loading a dataset to a dataframe
This is helpful when we want to inspect the ground truth dataset after transformation. The example below loads a preprocessed dataset from the BEIR (Benchmarking Information Retrieval) collection.
from trulens.benchmark.benchmark_frameworks.dataset.beir_loader import (
    TruBEIRDataLoader,
)
beir_data_loader = TruBEIRDataLoader(data_folder="./", dataset_name="scifact")
gt_df = beir_data_loader.load_dataset_to_df(download=True)
gt_df.expected_chunks[0]
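Because the loader emits the same unified schema used for the custom dataset above, standard pandas inspection applies (a quick sketch):
# Inspect the size and unified schema of the loaded dataset.
print(gt_df.shape)
print(gt_df.columns.tolist())
gt_df.head()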
# then we can save the ground truth to the dataset
session.add_ground_truth_to_dataset(
    dataset_name="my_beir_scifact",
    ground_truth_df=gt_df,
    dataset_metadata={"domain": "Information Retrieval"},
)
Single method to save to the database
We also make it easy to persist directly to the DB. This is particularly useful for larger datasets such as MS MARCO, where the corpus contains over 8 million documents.
beir_data_loader.persist_dataset(
    session=session,
    dataset_name="my_beir_scifact",
    dataset_metadata={"domain": "Information Retrieval"},
)
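The persisted dataset can then be read back with the same get_ground_truth call used earlier (assuming persist_dataset stores the rows under the given dataset_name):
# Retrieve the persisted BEIR dataset for inspection or reuse.
beir_gt_df = session.get_ground_truth("my_beir_scifact")
beir_gt_df.head()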
Benchmarking feedback functions / evaluators as a special case of ground truth evaluation
When using feedback functions, it can often be useful to calibrate them against ground truth human evaluations. We can do so here for context relevance using popular information retrieval datasets like those from BEIR mentioned above.
This can be especially useful for choosing between models to power feedback functions. We'll do so here by comparing gpt-4o and gpt-4o-mini.
from typing import Tuple
from trulens.providers.openai import OpenAI
provider_4o = OpenAI(model_engine="gpt-4o")
provider_4o_mini = OpenAI(model_engine="gpt-4o-mini")
def context_relevance_4o(
    input, output, benchmark_params
) -> Tuple[float, float]:
    return provider_4o.context_relevance(
        question=input,
        context=output,
        temperature=benchmark_params["temperature"],
    )


def context_relevance_4o_mini(
    input, output, benchmark_params
) -> Tuple[float, float]:
    return provider_4o_mini.context_relevance(
        question=input,
        context=output,
        temperature=benchmark_params["temperature"],
    )
gt_df = gt_df.head(10)
gt_df
Define an aggregator to compute metrics over generated feedback scores
from trulens.feedback import GroundTruthAggregator
# Flatten the ground truth relevance score of every expected chunk, across all
# queries, into a single list of labels for the aggregator.
true_labels = []
for chunks in gt_df.expected_chunks:
    for chunk in chunks:
        true_labels.append(chunk["expected_score"])
recall_agg_func = GroundTruthAggregator(true_labels=true_labels).recall
from trulens.benchmark.benchmark_frameworks.tru_benchmark_experiment import (
    BenchmarkParams,
)
from trulens.benchmark.benchmark_frameworks.tru_benchmark_experiment import (
    TruBenchmarkExperiment,
)
from trulens.benchmark.benchmark_frameworks.tru_benchmark_experiment import (
    create_benchmark_experiment_app,
)
benchmark_experiment = TruBenchmarkExperiment(
    feedback_fn=context_relevance_4o,
    agg_funcs=[recall_agg_func],
    benchmark_params=BenchmarkParams(temperature=0.5),
)
benchmark_experiment_mini = TruBenchmarkExperiment(
    feedback_fn=context_relevance_4o_mini,
    agg_funcs=[recall_agg_func],
    benchmark_params=BenchmarkParams(temperature=0.5),
)
tru_benchmark = create_benchmark_experiment_app(
    app_name="Context Relevance",
    app_version="gpt-4o",
    benchmark_experiment=benchmark_experiment,
)
with tru_benchmark as recording:
    feedback_res = tru_benchmark.app(gt_df)
tru_benchmark_mini = create_benchmark_experiment_app(
    app_name="Context Relevance",
    app_version="gpt-4o-mini",
    benchmark_experiment=benchmark_experiment_mini,
)
with tru_benchmark_mini as recording:
    feedback_res_mini = tru_benchmark_mini.app(gt_df)
session.get_leaderboard()