Ground Truth Evaluations¶
In this quickstart you will create and evaluate a LangChain app using ground truth. Ground truth evaluation can be especially useful during early LLM experiments when you have a small set of example queries that are critical to get right.
Ground truth evaluation works by measuring the similarity between an LLM response and its matching verified response.
Import from LangChain and TruLens¶
In [ ]:
Copied!
# !pip install trulens trulens-apps-langchain trulens-providers-huggingface trulens-providers-openai langchain>=0.0.342 langchain_community
# !pip install trulens trulens-apps-langchain trulens-providers-huggingface trulens-providers-openai langchain>=0.0.342 langchain_community
In [ ]:
Copied!
from langchain.chains import LLMChain
from langchain.prompts import ChatPromptTemplate
from langchain.prompts import HumanMessagePromptTemplate
from langchain.prompts import PromptTemplate
from langchain_community.llms import OpenAI
from trulens.core import Feedback
from trulens.core import TruSession
from trulens.feedback import GroundTruthAgreement
from trulens.providers.huggingface import Huggingface
from trulens.providers.openai import OpenAI as fOpenAI
session = TruSession()
from langchain.chains import LLMChain
from langchain.prompts import ChatPromptTemplate
from langchain.prompts import HumanMessagePromptTemplate
from langchain.prompts import PromptTemplate
from langchain_community.llms import OpenAI
from trulens.core import Feedback
from trulens.core import TruSession
from trulens.feedback import GroundTruthAgreement
from trulens.providers.huggingface import Huggingface
from trulens.providers.openai import OpenAI as fOpenAI
session = TruSession()
Add API keys¶
For this quickstart, you will need OpenAI and HuggingFace API keys.
In [ ]:
Copied!
import os
os.environ["HUGGINGFACE_API_KEY"] = "hf_..."
os.environ["OPENAI_API_KEY"] = "sk-..."
import os
os.environ["HUGGINGFACE_API_KEY"] = "hf_..."
os.environ["OPENAI_API_KEY"] = "sk-..."
Create Simple LLM Application¶
This example uses Langchain with an OpenAI LLM.
In [ ]:
Copied!
full_prompt = HumanMessagePromptTemplate(
prompt=PromptTemplate(
template="Provide an answer to the following: {prompt}",
input_variables=["prompt"],
)
)
chat_prompt_template = ChatPromptTemplate.from_messages([full_prompt])
llm = OpenAI(temperature=0.9, max_tokens=128)
chain = LLMChain(llm=llm, prompt=chat_prompt_template, verbose=True)
full_prompt = HumanMessagePromptTemplate(
prompt=PromptTemplate(
template="Provide an answer to the following: {prompt}",
input_variables=["prompt"],
)
)
chat_prompt_template = ChatPromptTemplate.from_messages([full_prompt])
llm = OpenAI(temperature=0.9, max_tokens=128)
chain = LLMChain(llm=llm, prompt=chat_prompt_template, verbose=True)
Initialize Feedback Function(s)¶
In [ ]:
Copied!
golden_set = [
{"query": "who invented the lightbulb?", "response": "Thomas Edison"},
{"query": "¿quien invento la bombilla?", "response": "Thomas Edison"},
]
f_groundtruth = Feedback(
GroundTruthAgreement(golden_set, provider=fOpenAI()).agreement_measure, name="Ground Truth"
).on_input_output()
# Define a language match feedback function using HuggingFace.
hugs = Huggingface()
f_lang_match = Feedback(hugs.language_match).on_input_output()
golden_set = [
{"query": "who invented the lightbulb?", "response": "Thomas Edison"},
{"query": "¿quien invento la bombilla?", "response": "Thomas Edison"},
]
f_groundtruth = Feedback(
GroundTruthAgreement(golden_set, provider=fOpenAI()).agreement_measure, name="Ground Truth"
).on_input_output()
# Define a language match feedback function using HuggingFace.
hugs = Huggingface()
f_lang_match = Feedback(hugs.language_match).on_input_output()
Instrument chain for logging with TruLens¶
In [ ]:
Copied!
from trulens.apps.langchain import TruChain
tc = TruChain(chain, feedbacks=[f_groundtruth, f_lang_match])
from trulens.apps.langchain import TruChain
tc = TruChain(chain, feedbacks=[f_groundtruth, f_lang_match])
In [ ]:
Copied!
# Instrumented query engine can operate as a context manager:
with tc as recording:
chain("¿quien invento la bombilla?")
chain("who invented the lightbulb?")
# Instrumented query engine can operate as a context manager:
with tc as recording:
chain("¿quien invento la bombilla?")
chain("who invented the lightbulb?")
Explore in a Dashboard¶
In [ ]:
Copied!
from trulens.dashboard import run_dashboard
run_dashboard(session) # open a local streamlit app to explore
# stop_dashboard(session) # stop if needed
from trulens.dashboard import run_dashboard
run_dashboard(session) # open a local streamlit app to explore
# stop_dashboard(session) # stop if needed