LangChain Stream¶
One of the biggest pain points developers run into when building LLM applications is latency: these applications often make multiple calls to LLM APIs, each taking a few seconds. Staring at a loading spinner for more than a couple of seconds makes for a frustrating user experience. Streaming reduces this perceived latency by returning the LLM's output token by token instead of all at once.
This notebook demonstrates how to monitor a LangChain streaming app with TruLens.
Import from LangChain and TruLens¶
In [ ]:
# !pip install trulens trulens-apps-langchain trulens-providers-huggingface 'langchain>=0.2.16' 'langchain-openai>=0.0.1rc0'
In [ ]:
from langchain.prompts import PromptTemplate
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_openai import ChatOpenAI, OpenAI
from trulens.core import Feedback, TruSession
from trulens.providers.huggingface import Huggingface
In [ ]:
# Load API keys from a .env file, or set them directly below.
import dotenv

dotenv.load_dotenv()

# import os
# os.environ["HUGGINGFACE_API_KEY"] = "hf_..."
# os.environ["OPENAI_API_KEY"] = "sk-..."
Create an async application¶
In [ ]:
Copied!
chatllm = ChatOpenAI(
    temperature=0.0,
    streaming=True,  # important: emit tokens as they are generated
)
llm = OpenAI(
    temperature=0.0,
)
memory = ChatMessageHistory()

# Set up a simple question/answer chain with the streaming ChatOpenAI.
prompt = PromptTemplate(
    input_variables=["human_input", "chat_history"],
    template="""
You are having a conversation with a person. Make small talk.
{chat_history}
Human: {human_input}
AI:""",
)

# RunnableWithMessageHistory calls the history factory with a session id, so
# the lambda must accept one argument; the input key must name the prompt
# variable that carries the new message.
chain = RunnableWithMessageHistory(
    prompt | chatllm,
    lambda session_id: memory,
    input_messages_key="human_input",
    history_messages_key="chat_history",
)
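It can be worth a quick sanity check before adding instrumentation. RunnableWithMessageHistory requires a session_id in the call config; the id below is an arbitrary placeholder that the lambda above simply ignores.

In [ ]:
# Optional sanity check of the chain on its own (assumes OPENAI_API_KEY is
# set). Note: this adds a turn to the shared memory.
response = chain.invoke(
    {"human_input": "Hi. How are you?"},
    config={"configurable": {"session_id": "sanity-check"}},
)
print(response.content)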
Set up a language match feedback function¶
In [ ]:
session = TruSession()
session.reset_database()

# Use a Hugging Face model to check that the app's answer is in the same
# language as the user's question.
hugs = Huggingface()
f_lang_match = Feedback(hugs.language_match).on_input_output()
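Feedback functions can also be called directly on example text, which is a quick way to confirm the provider is configured. A sketch, assuming HUGGINGFACE_API_KEY is set and that language_match scores same-language pairs near 1.0:

In [ ]:
# Direct call to the feedback implementation, outside any app recording.
hugs.language_match("Hi. How are you?", "I'm doing well, thank you!")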
Set up evaluation and tracking with TruLens¶
In [ ]:
# Example of how to also get filled-in prompt templates in the timeline:
from trulens.apps.langchain import TruChain
from trulens.core.instruments import instrument

instrument.method(PromptTemplate, "format")

tc = TruChain(chain, feedbacks=[f_lang_match], app_name="chat_with_memory")
In [ ]:
tc.print_instrumented()
Start the TruLens dashboard¶
In [ ]:
from trulens.dashboard import run_dashboard
run_dashboard(session)
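When you are finished, the dashboard can be shut down from the same session; a minimal sketch using the matching stop_dashboard helper from trulens.dashboard:

In [ ]:
# Shut the dashboard down when finished (counterpart to run_dashboard).
# from trulens.dashboard import stop_dashboard
# stop_dashboard(session)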
Use the application¶
In [ ]:
message = "Hi. How are you?"
async with tc as recording:
stream = chain.astream(
input=dict(human_input=message, chat_history=[]),
)
async for chunk in stream:
print(chunk.content, end="")
record = recording.get()
message = "Hi. How are you?"
async with tc as recording:
stream = chain.astream(
input=dict(human_input=message, chat_history=[]),
)
async for chunk in stream:
print(chunk.content, end="")
record = recording.get()
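Recording works the same way synchronously, if you prefer chain.stream over chain.astream; a sketch:

In [ ]:
# Synchronous equivalent (optional):
# with tc as recording:
#     for chunk in chain.stream(
#         input={"human_input": "Hi again!"},
#         config={"configurable": {"session_id": "demo"}},
#     ):
#         print(chunk.content, end="")
# record = recording.get()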
In [ ]:
# Main output is a concatenation of chunk contents:
record.main_output
In [ ]:
# Cost tracking may not populate every field for streaming calls, but it
# should include the number of chunks received.
record.cost
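To read just the chunk count, the Cost object exposes it as a field; the name below assumes the current trulens Cost schema, so adjust it if your version differs.

In [ ]:
# Number of streamed chunks (field name per the trulens Cost schema).
record.cost.n_stream_chunks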
In [ ]:
# Feedback is only evaluated once all chunks have been received.
record.feedback_results[0].result()
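Aggregate scores across records are also available from the session; a sketch using TruSession's get_leaderboard helper:

In [ ]:
# Mean feedback scores per app, restricted to this app's records.
session.get_leaderboard(app_ids=[tc.app_id])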