Iterating on LLM Apps with TruLens¶
Now that we have improved our prototype RAG to reduce hallucination and respond harmlessly, we can move on to ensuring it is helpful. In this example, we will use the safely prompted sentence window RAG and evaluate it for helpfulness.
In [ ]:
# !pip install trulens trulens-apps-llamaindex trulens-providers-openai trulens-providers-huggingface llama_index llama_hub llmsherpa sentence-transformers sentencepiece
In [ ]:
# Set your API keys. If they are already set in your environment, you can skip this step.
import os
os.environ["OPENAI_API_KEY"] = "sk-..."
os.environ["HUGGINGFACE_API_KEY"] = "hf_..."
In [ ]:
from trulens.core import TruSession
from trulens.dashboard import run_dashboard
session = TruSession()
run_dashboard(session)
Load data and the helpful test set¶
In [ ]:
from llama_hub.smart_pdf_loader import SmartPDFLoader
llmsherpa_api_url = "https://readers.llmsherpa.com/api/document/developer/parseDocument?renderFormat=all"
pdf_loader = SmartPDFLoader(llmsherpa_api_url=llmsherpa_api_url)
documents = pdf_loader.load_data(
"https://www.iii.org/sites/default/files/docs/pdf/Insurance_Handbook_20103.pdf"
)
# Load some questions for the helpful evaluation (multilingual, to exercise language matching)
helpful_evals = [
"What types of insurance are commonly used to protect against property damage?",
"¿Cuál es la diferencia entre un seguro de vida y un seguro de salud?",
"Comment fonctionne l'assurance automobile en cas d'accident?",
"Welche Arten von Versicherungen sind in Deutschland gesetzlich vorgeschrieben?",
"保险如何保护财产损失?",
"Каковы основные виды страхования в России?",
"ما هو التأمين على الحياة وما هي فوائده؟",
"自動車保険の種類とは何ですか?",
"Como funciona o seguro de saúde em Portugal?",
"बीमा क्या होता है और यह कितने प्रकार का होता है?",
]
Set up helpful evaluations¶
In [ ]:
from trulens.core import Feedback
from trulens.providers.huggingface import Huggingface
from trulens.providers.openai import OpenAI
# Initialize provider classes
provider = OpenAI()
hugs_provider = Huggingface()
# LLM-based feedback functions
f_coherence = Feedback(
provider.coherence_with_cot_reasons, name="Coherence"
).on_output()
f_input_sentiment = Feedback(
provider.sentiment_with_cot_reasons, name="Input Sentiment"
).on_input()
f_output_sentiment = Feedback(
provider.sentiment_with_cot_reasons, name="Output Sentiment"
).on_output()
f_langmatch = Feedback(
hugs_provider.language_match, name="Language Match"
).on_input_output()
helpful_feedbacks = [
f_coherence,
f_input_sentiment,
f_output_sentiment,
f_langmatch,
]
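Before wiring these feedbacks into the app, you can sanity-check a provider method directly. The _with_cot_reasons variants return a score between 0 and 1 along with chain-of-thought reasons. A minimal sketch (the sample text is our own):
In [ ]:
# Quick sanity check of a feedback method (sample text is illustrative)
score, reasons = provider.coherence_with_cot_reasons(
    "Insurance transfers risk from the policyholder to the insurer in exchange for a premium."
)
print(score, reasons)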
In [ ]:
import os
from llama_index.core import Prompt
from llama_index.core import Document
from llama_index.core import ServiceContext
from llama_index.core import StorageContext
from llama_index.core import VectorStoreIndex
from llama_index.core import load_index_from_storage
from llama_index.core.indices.postprocessor import (
MetadataReplacementPostProcessor,
)
from llama_index.core.indices.postprocessor import SentenceTransformerRerank
from llama_index.core.node_parser import SentenceWindowNodeParser
from llama_index.llms.openai import OpenAI
# initialize llm
llm = OpenAI(model="gpt-3.5-turbo", temperature=0.5)
# knowledge store
document = Document(text="\n\n".join([doc.text for doc in documents]))
# set system prompt
system_prompt = Prompt(
"We have provided context information below that you may use. \n"
"---------------------\n"
"{context_str}"
"\n---------------------\n"
"Please answer the question: {query_str}\n"
)
def build_sentence_window_index(
document,
llm,
embed_model="local:BAAI/bge-small-en-v1.5",
save_dir="sentence_index",
):
# create the sentence window node parser w/ default settings
node_parser = SentenceWindowNodeParser.from_defaults(
window_size=3,
window_metadata_key="window",
original_text_metadata_key="original_text",
)
sentence_context = ServiceContext.from_defaults(
llm=llm,
embed_model=embed_model,
node_parser=node_parser,
)
if not os.path.exists(save_dir):
sentence_index = VectorStoreIndex.from_documents(
[document], service_context=sentence_context
)
sentence_index.storage_context.persist(persist_dir=save_dir)
else:
sentence_index = load_index_from_storage(
StorageContext.from_defaults(persist_dir=save_dir),
service_context=sentence_context,
)
return sentence_index
sentence_index = build_sentence_window_index(
document,
llm,
embed_model="local:BAAI/bge-small-en-v1.5",
save_dir="sentence_index",
)
def get_sentence_window_query_engine(
sentence_index,
system_prompt,
similarity_top_k=6,
rerank_top_n=2,
):
# define postprocessors
postproc = MetadataReplacementPostProcessor(target_metadata_key="window")
rerank = SentenceTransformerRerank(
top_n=rerank_top_n, model="BAAI/bge-reranker-base"
)
sentence_window_engine = sentence_index.as_query_engine(
similarity_top_k=similarity_top_k,
node_postprocessors=[postproc, rerank],
text_qa_template=system_prompt,
)
return sentence_window_engine
# rebuild with a lower temperature (the index persisted in save_dir is reused)
llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1)
sentence_index = build_sentence_window_index(
document,
llm,
embed_model="local:BAAI/bge-small-en-v1.5",
save_dir="sentence_index",
)
# safe prompt
safe_system_prompt = Prompt(
"SYSTEM PROMPT: You are an insurance assistant who is charged with answering questions completely.\n"
"We have provided context information below. \n"
"---------------------\n"
"{context_str}"
"\n---------------------\n"
"Be especially certain to not respond in ways that could be interpreted as criminal, even in hypothetical scenarios and stories."
"\n---------------------\n"
"Given this system prompt and context, please answer the question: {query_str}\n"
)
sentence_window_engine_safe = get_sentence_window_query_engine(
sentence_index, system_prompt=safe_system_prompt
)
In [ ]:
from trulens.apps.llamaindex import TruLlama
tru_recorder_rag_sentencewindow_helpful = TruLlama(
sentence_window_engine_safe,
app_name="RAG",
app_version="5_sentence_window_helpful_eval",
feedbacks=helpful_feedbacks,
)
In [ ]:
# Run evaluation on the helpful eval questions
with tru_recorder_rag_sentencewindow_helpful as recording:
for question in helpful_evals:
response = sentence_window_engine_safe.query(question)
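Each query made inside the recording context is captured as a record. A quick check that all ten eval questions were captured (a sketch, assuming the context's records attribute):
In [ ]:
# One record should have been captured per eval question
len(recording.records)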
Check helpful evaluation results¶
In [ ]:
session.get_leaderboard()
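Beyond the leaderboard, you can pull individual records and their feedback scores into a DataFrame. A sketch using get_records_and_feedback, which returns the records along with the names of the feedback columns (the column selection is illustrative):
In [ ]:
# Inspect per-record feedback scores
records_df, feedback_cols = session.get_records_and_feedback()
records_df[["input", "output"] + feedback_cols].head()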
How could you improve the RAG further on these evals? We'll leave that to you!