LangChain Stream¶
One of the biggest pain points developers run into when building LLM applications is latency: these applications often make multiple calls to LLM APIs, each taking a few seconds. Staring at a loading spinner for more than a couple of seconds makes for a frustrating user experience. Streaming reduces this perceived latency by returning the LLM's output token by token instead of all at once.
This notebook demonstrates how to monitor a LangChain streaming app with TruLens.
Import from LangChain and TruLens¶
In [ ]:
# !pip install trulens trulens-apps-langchain trulens-providers-huggingface 'langchain>=0.2.16' 'langchain-openai>=0.0.1rc0'
In [ ]:
from langchain.prompts import PromptTemplate
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_openai import ChatOpenAI, OpenAI
from trulens.core import Feedback, TruSession
from trulens.providers.huggingface import Huggingface
In [ ]:
# Load API keys from a .env file, or set them directly below.
import dotenv

dotenv.load_dotenv()

# import os
# os.environ["HUGGINGFACE_API_KEY"] = "hf_..."
# os.environ["OPENAI_API_KEY"] = "sk-..."
Create an async application¶
In [ ]:
Copied!
chatllm = ChatOpenAI(
    temperature=0.0,
    streaming=True,  # important: emit tokens as they are generated
)
llm = OpenAI(
    temperature=0.0,
)
memory = ChatMessageHistory()

# Set up a simple question/answer chain with the streaming ChatOpenAI.
prompt = PromptTemplate(
    input_variables=["human_input", "chat_history"],
    template="""
You are having a conversation with a person. Make small talk.
{chat_history}
Human: {human_input}
AI:""",
)

# RunnableWithMessageHistory calls the history factory with a session id, so
# the lambda must accept one argument; the input key must name the prompt
# variable that carries the new message.
chain = RunnableWithMessageHistory(
    prompt | chatllm,
    lambda session_id: memory,
    input_messages_key="human_input",
    history_messages_key="chat_history",
)
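It can be worth a quick sanity check before adding instrumentation. RunnableWithMessageHistory requires a session_id in the call config; the id below is an arbitrary placeholder that the lambda above simply ignores.

In [ ]:
# Optional sanity check of the chain on its own (assumes OPENAI_API_KEY is
# set). Note: this adds a turn to the shared memory.
response = chain.invoke(
    {"human_input": "Hi. How are you?"},
    config={"configurable": {"session_id": "sanity-check"}},
)
print(response.content)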
Set up a language match feedback function¶
In [ ]:
session = TruSession()
session.reset_database()

# Use a Hugging Face model to check that the app's answer is in the same
# language as the user's question.
hugs = Huggingface()
f_lang_match = Feedback(hugs.language_match).on_input_output()
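Feedback functions can also be called directly on example text, which is a quick way to confirm the provider is configured. A sketch, assuming HUGGINGFACE_API_KEY is set and that language_match scores same-language pairs near 1.0:

In [ ]:
# Direct call to the feedback implementation, outside any app recording.
hugs.language_match("Hi. How are you?", "I'm doing well, thank you!")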
Set up evaluation and tracking with TruLens¶
In [ ]:
# Example of how to also get filled-in prompt templates in the timeline:
from trulens.apps.langchain import TruChain
from trulens.core.instruments import instrument

instrument.method(PromptTemplate, "format")

tc = TruChain(chain, feedbacks=[f_lang_match], app_name="chat_with_memory")
In [ ]:
tc.print_instrumented()
Start the TruLens dashboard¶
In [ ]:
from trulens.dashboard import run_dashboard
run_dashboard(session)
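When you are finished, the dashboard can be shut down from the same session; a minimal sketch using the matching stop_dashboard helper from trulens.dashboard:

In [ ]:
# Shut the dashboard down when finished (counterpart to run_dashboard).
# from trulens.dashboard import stop_dashboard
# stop_dashboard(session)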
Use the application¶
In [ ]:
message = "Hi. How are you?"
async with tc as recording:
stream = chain.astream(
input=dict(human_input=message, chat_history=[]),
)
async for chunk in stream:
print(chunk.content, end="")
record = recording.get()
message = "Hi. How are you?"
async with tc as recording:
stream = chain.astream(
input=dict(human_input=message, chat_history=[]),
)
async for chunk in stream:
print(chunk.content, end="")
record = recording.get()
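Recording works the same way synchronously, if you prefer chain.stream over chain.astream; a sketch:

In [ ]:
# Synchronous equivalent (optional):
# with tc as recording:
#     for chunk in chain.stream(
#         input={"human_input": "Hi again!"},
#         config={"configurable": {"session_id": "demo"}},
#     ):
#         print(chunk.content, end="")
# record = recording.get()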
In [ ]:
# Main output is a concatenation of chunk contents:
record.main_output
In [ ]:
# Cost tracking may not populate every field for streaming calls, but it
# should include the number of chunks received.
record.cost
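To read just the chunk count, the Cost object exposes it as a field; the name below assumes the current trulens Cost schema, so adjust it if your version differs.

In [ ]:
# Number of streamed chunks (field name per the trulens Cost schema).
record.cost.n_stream_chunks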
In [ ]:
# Feedback is only evaluated once all chunks have been received.
record.feedback_results[0].result()
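Aggregate scores across records are also available from the session; a sketch using TruSession's get_leaderboard helper:

In [ ]:
# Mean feedback scores per app, restricted to this app's records.
session.get_leaderboard(app_ids=[tc.app_id])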