Langchain model comparison
LLM Comparison¶
When building an LLM application we have hundreds of different models to choose from, all with different costs/latency and performance characteristics. Importantly, performance of LLMs can be heterogeneous across different use cases. Rather than relying on standard benchmarks or leaderboard performance, we want to evaluate an LLM for the use case we need.
Doing this sort of comparison is a core use case of TruLens. In this example, we'll walk through how to build a simple langchain app and evaluate it across 3 different models: small flan, large flan, and text-davinci-003.
Import libraries¶
In [ ]:
Copied!
# Install the TruLens packages plus a pinned langchain release this example was
# written against. (Line was duplicated in the original; deduplicated here.)
# !pip install trulens trulens-providers-huggingface trulens-providers-openai langchain==0.0.283 langchain_community
In [ ]:
Copied!
# Imports for building and evaluating the app. The whole cell was duplicated
# verbatim in the original; deduplicated here. You may need to install
# langchain first with the following:
# !pip install langchain>=0.0.170
import os

from langchain.prompts import PromptTemplate

# Imports main tools:
from trulens.core import Feedback
from trulens.core import TruSession
from trulens.apps.langchain import TruChain
from trulens.providers.huggingface import Huggingface
from trulens.providers.openai import OpenAI

# Session that records app runs and feedback results for the dashboard.
session = TruSession()
Set API Keys¶
For this example, we need API keys for the Huggingface, HuggingFaceHub, and OpenAI services.
In [ ]:
Copied!
os.environ["HUGGINGFACE_API_KEY"] = "..."
os.environ["HUGGINGFACEHUB_API_TOKEN"] = "..."
os.environ["OPENAI_API_KEY"] = "..."
os.environ["HUGGINGFACE_API_KEY"] = "..."
os.environ["HUGGINGFACEHUB_API_TOKEN"] = "..."
os.environ["OPENAI_API_KEY"] = "..."
Set up prompt template¶
In [ ]:
Copied!
# Minimal QA prompt: the user's question is interpolated into {question}.
# (Cell was duplicated in the original; deduplicated here.)
template = """Question: {question}
Answer: """

prompt = PromptTemplate(template=template, input_variables=["question"])
Set up feedback functions¶
In [ ]:
Copied!
# API endpoints for models used in feedback functions.
# (Cell was duplicated in the original; deduplicated here.)
hugs = Huggingface()
openai = OpenAI()

# Question/answer relevance between overall question and answer.
# By default this will evaluate feedback on main app input and main app output.
f_qa_relevance = Feedback(openai.relevance).on_input_output()

# Shared feedback list applied to every recorder below.
all_feedbacks = [f_qa_relevance]
Load a couple sizes of Flan and ask questions¶
In [ ]:
Copied!
from langchain import HuggingFaceHub
from langchain import LLMChain
# LangChain's OpenAI LLM wrapper, aliased so it does not collide with the
# trulens OpenAI feedback provider imported earlier. In the original, the bare
# `OpenAI(model_name=...)` call resolved to the trulens provider class (the
# most recent import), not a LangChain LLM — a latent bug fixed here. The cell
# was also duplicated verbatim; deduplicated.
from langchain.llms import OpenAI as LangchainOpenAI

# Initialize the models. Temperature is pinned near zero so completions are
# effectively deterministic across runs.
hub_llm_smallflan = HuggingFaceHub(
    repo_id="google/flan-t5-small", model_kwargs={"temperature": 1e-10}
)
hub_llm_largeflan = HuggingFaceHub(
    repo_id="google/flan-t5-large", model_kwargs={"temperature": 1e-10}
)
davinci = LangchainOpenAI(model_name="text-davinci-003")

# Create prompt template > LLM chain, one per model.
smallflan_chain = LLMChain(prompt=prompt, llm=hub_llm_smallflan)
largeflan_chain = LLMChain(prompt=prompt, llm=hub_llm_largeflan)
davinci_chain = LLMChain(prompt=prompt, llm=davinci)

# TruLens instrumentation: one recorder per chain, all sharing the same
# feedback functions so results are directly comparable.
smallflan_app_recorder = TruChain(
    app_name="small_flan", app_version="v1", app=smallflan_chain, feedbacks=all_feedbacks
)
largeflan_app_recorder = TruChain(
    app_name="large_flan", app_version="v1", app=largeflan_chain, feedbacks=all_feedbacks
)
davinci_app_recorder = TruChain(
    app_name="davinci", app_version="v1", app=davinci_chain, feedbacks=all_feedbacks
)
Run the application with all 3 models¶
In [ ]:
Copied!
prompts = [
    "Who won the superbowl in 2010?",
    "What is the capital of Thailand?",
    "Who developed the theory of evolution by natural selection?",
]

# Run every question through all three instrumented chains so the results are
# comparable in the TruLens dashboard. The loop variable is named `question`
# rather than `prompt` to avoid clobbering the PromptTemplate object defined
# earlier. (Cell was duplicated in the original; deduplicated here.)
for question in prompts:
    with smallflan_app_recorder as recording:
        smallflan_chain(question)
    with largeflan_app_recorder as recording:
        largeflan_chain(question)
    with davinci_app_recorder as recording:
        davinci_chain(question)
Run the TruLens dashboard¶
In [ ]:
Copied!
# Launch the TruLens dashboard to browse the recorded runs and feedback scores.
# (Cell was duplicated in the original; deduplicated here.)
from trulens.dashboard import run_dashboard

run_dashboard(session)