Measuring Retrieval Quality¶
There are a variety of ways we can measure retrieval quality, from LLM-based evaluations to embedding similarity. In this example, we will explore the different methods available.
In [ ]:
Copied!
# NOTE: the version specifier must be quoted — unquoted, the shell would
# interpret `>=` as output redirection and create a file named `=2020.1.16`.
# !pip install trulens trulens-apps-llamaindex trulens-providers-openai llama_index==0.10.11 "html2text>=2020.1.16"
# !pip install trulens trulens-apps-llamaindex trulens-providers-openai llama_index==0.10.11 "html2text>=2020.1.16"
Add API keys¶
For this quickstart, you will need OpenAI and Hugging Face keys. The OpenAI key is used for embeddings and GPT, and the Hugging Face key is used for evaluation.
In [ ]:
Copied!
import os
# Replace the "..." placeholders with real keys before running; both the
# OpenAI and Hugging Face clients read these environment variables.
os.environ["OPENAI_API_KEY"] = "..."
os.environ["HUGGINGFACE_API_KEY"] = "..."
# (Duplicate of the cell above — rendering artifact of the notebook export.)
import os
os.environ["OPENAI_API_KEY"] = "..."
os.environ["HUGGINGFACE_API_KEY"] = "..."
Import from LlamaIndex and TruLens¶
In [ ]:
Copied!
# Feedback defines an evaluation function; TruSession manages the TruLens DB.
from trulens.core import Feedback
from trulens.core import TruSession
# Embedding-distance feedback implementation (used below for cosine distance).
from trulens.feedback.embeddings import Embeddings
# TruLlama wraps a LlamaIndex app for instrumentation and logging.
from trulens.apps.llamaindex import TruLlama
# OpenAI provider supplies the LLM-based feedback functions.
from trulens.providers.openai import OpenAI
session = TruSession()
# NOTE: reset_database() wipes all previously logged records and feedback.
session.reset_database()
# (Duplicate of the cell above — rendering artifact of the notebook export.)
from trulens.core import Feedback
from trulens.core import TruSession
from trulens.feedback.embeddings import Embeddings
from trulens.apps.llamaindex import TruLlama
from trulens.providers.openai import OpenAI
session = TruSession()
session.reset_database()
Create Simple LLM Application¶
This example uses LlamaIndex which internally uses an OpenAI LLM.
In [ ]:
Copied!
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index.core import VectorStoreIndex
from llama_index.legacy import ServiceContext
from llama_index.readers.web import SimpleWebPageReader

# Load Paul Graham's essay as a document, converting the HTML to plain text.
documents = SimpleWebPageReader(html_to_text=True).load_data(
    ["http://paulgraham.com/worked.html"]
)

# Multilingual sentence-transformer used both for indexing and, later, for
# the embedding-distance feedback function.
embed_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
)
service_context = ServiceContext.from_defaults(embed_model=embed_model)
index = VectorStoreIndex.from_documents(
    documents, service_context=service_context
)

# FIX: the retriever keyword is `similarity_top_k`, not `top_k` — the
# original `top_k=5` was silently ignored, leaving the default chunk count.
query_engine = index.as_query_engine(similarity_top_k=5)

# (Duplicate of the cell above — rendering artifact of the notebook export.)
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index.core import VectorStoreIndex
from llama_index.legacy import ServiceContext
from llama_index.readers.web import SimpleWebPageReader

documents = SimpleWebPageReader(html_to_text=True).load_data(
    ["http://paulgraham.com/worked.html"]
)
embed_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
)
service_context = ServiceContext.from_defaults(embed_model=embed_model)
index = VectorStoreIndex.from_documents(
    documents, service_context=service_context
)
query_engine = index.as_query_engine(similarity_top_k=5)
Send your first request¶
In [ ]:
Copied!
# Smoke-test the engine once before adding TruLens instrumentation.
response = query_engine.query("What did the author do growing up?")
print(response)
# (Duplicate of the cell above — rendering artifact of the notebook export.)
response = query_engine.query("What did the author do growing up?")
print(response)
Initialize Feedback Function(s)¶
In [ ]:
Copied!
import numpy as np
# Initialize provider class
openai = OpenAI()
# Question/statement relevance between question and each context chunk.
# .on_input() selects the user's query; the second selector pulls the text of
# every retrieved source node, so the LLM scores each chunk separately and the
# per-chunk scores are then averaged with np.mean.
f_context_relevance = (
Feedback(openai.context_relevance)
.on_input()
.on(TruLlama.select_source_nodes().node.text)
.aggregate(np.mean)
)
# (Duplicate of the cell above — rendering artifact of the notebook export.)
import numpy as np
# Initialize provider class
openai = OpenAI()
# Question/statement relevance between question and each context chunk.
f_context_relevance = (
Feedback(openai.context_relevance)
.on_input()
.on(TruLlama.select_source_nodes().node.text)
.aggregate(np.mean)
)
In [ ]:
Copied!
# Embedding-distance feedback, reusing the same embed_model as the index.
# NOTE: cosine_distance is a distance, not a similarity — lower values mean
# the retrieved chunks are closer to the query.
f_embed = Embeddings(embed_model=embed_model)
f_embed_dist = (
Feedback(f_embed.cosine_distance)
.on_input()
.on(TruLlama.select_source_nodes().node.text)
.aggregate(np.mean)
)
# (Duplicate of the cell above — rendering artifact of the notebook export.)
f_embed = Embeddings(embed_model=embed_model)
f_embed_dist = (
Feedback(f_embed.cosine_distance)
.on_input()
.on(TruLlama.select_source_nodes().node.text)
.aggregate(np.mean)
)
Instrument app for logging with TruLens¶
In [ ]:
Copied!
# Wrap the query engine so every call is logged, and both feedback functions
# are evaluated on each record.
tru_query_engine_recorder = TruLlama(
query_engine,
app_name="LlamaIndex_App",
app_version="1",
feedbacks=[f_context_relevance, f_embed_dist],
)
# (Duplicate of the cell above — rendering artifact of the notebook export.)
tru_query_engine_recorder = TruLlama(
query_engine,
app_name="LlamaIndex_App",
app_version="1",
feedbacks=[f_context_relevance, f_embed_dist],
)
In [ ]:
Copied!
# or as context manager
# FIX: the body of the `with` statement lost its indentation in the export,
# which made the snippet a SyntaxError as printed; restored here. Queries made
# inside the `with` block are captured as TruLens records.
with tru_query_engine_recorder as recording:
    query_engine.query("What did the author do growing up?")
# (Duplicate of the cell above — rendering artifact of the notebook export.)
# or as context manager
with tru_query_engine_recorder as recording:
    query_engine.query("What did the author do growing up?")
Explore in a Dashboard¶
In [ ]:
Copied!
from trulens.dashboard import run_dashboard
# Launches a local Streamlit dashboard bound to this session's database.
run_dashboard(session)  # open a local streamlit app to explore
# stop_dashboard(session) # stop if needed
# (Duplicate of the cell above — rendering artifact of the notebook export.)
from trulens.dashboard import run_dashboard
run_dashboard(session)  # open a local streamlit app to explore
# stop_dashboard(session) # stop if needed
Note: Feedback functions evaluated in the deferred manner can be seen in the "Progress" page of the TruLens dashboard.
Or view results directly in your notebook¶
In [ ]:
Copied!
# get_records_and_feedback() appears to return a pair; [0] takes the records
# table for inline display — NOTE(review): confirm against the TruSession API.
session.get_records_and_feedback()[0]
# (Duplicate of the cell above — rendering artifact of the notebook export.)
session.get_records_and_feedback()[0]