📓 Evaluate Streaming Apps¶
This notebook shows how to evaluate a custom streaming app.
It also demonstrates the dummy feedback function provider, which behaves like the Hugging Face provider except that it makes no network calls and simply returns constant results. Use it to prototype the feedback function wiring for your apps before invoking real feedback functions, which can be slow to load or run.
Import libraries¶
In [ ]:
Copied!
# !pip install trulens trulens-providers-huggingface
# !pip install trulens trulens-providers-huggingface
In [ ]:
Copied!
from trulens.core import Feedback
from trulens.core import TruSession
# Session object that the dashboard and leaderboard calls later in this
# notebook operate on.
session = TruSession()
# Reset the session's database before the rest of the notebook runs.
session.reset_database()
from trulens.core import Feedback
from trulens.core import TruSession
# Session object that the dashboard and leaderboard calls later in this
# notebook operate on.
session = TruSession()
# Reset the session's database before the rest of the notebook runs.
session.reset_database()
In [ ]:
Copied!
from trulens.dashboard import run_dashboard
run_dashboard(session)
from trulens.dashboard import run_dashboard
run_dashboard(session)
Set keys¶
In [ ]:
Copied!
# import os
# os.environ["OPENAI_API_KEY"] = "sk-..."
import dotenv
dotenv.load_dotenv()
# import os
# os.environ["OPENAI_API_KEY"] = "sk-..."
import dotenv
dotenv.load_dotenv()
Build the app¶
In [ ]:
Copied!
from openai import OpenAI
from trulens.apps.custom import instrument
oai_client = OpenAI()
class APP:
    """Minimal custom app that streams an OpenAI chat completion."""

    @instrument
    def stream_completion(self, prompt):
        """Yield the text fragments of a streamed answer to ``prompt``.

        Args:
            prompt: The question to send; it is embedded in a single
                user message.

        Yields:
            Each non-``None`` ``delta.content`` value, in the order the
            stream delivers it.
        """
        messages = [
            {
                "role": "user",
                "content": f"Please answer the question: {prompt}",
            }
        ]
        stream = oai_client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=messages,
            temperature=0,
            stream=True,
            # not yet tracked by trulens
            stream_options={"include_usage": True},
        )
        for chunk in stream:
            choices = chunk.choices
            # Some chunks carry no choices (presumably the usage-only chunk
            # requested via include_usage); there is no text to emit.
            if not choices:
                continue
            text = choices[0].delta.content
            if text is None:
                continue
            yield text
llm_app = APP()
from openai import OpenAI
from trulens.apps.custom import instrument
oai_client = OpenAI()
class APP:
    """Minimal custom app that streams an OpenAI chat completion."""

    @instrument
    def stream_completion(self, prompt):
        """Yield the text fragments of a streamed answer to ``prompt``.

        Args:
            prompt: The question to send; it is embedded in a single
                user message.

        Yields:
            Each non-``None`` ``delta.content`` value, in the order the
            stream delivers it.
        """
        messages = [
            {
                "role": "user",
                "content": f"Please answer the question: {prompt}",
            }
        ]
        stream = oai_client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=messages,
            temperature=0,
            stream=True,
            # not yet tracked by trulens
            stream_options={"include_usage": True},
        )
        for chunk in stream:
            choices = chunk.choices
            # Some chunks carry no choices (presumably the usage-only chunk
            # requested via include_usage); there is no text to emit.
            if not choices:
                continue
            text = choices[0].delta.content
            if text is None:
                continue
            yield text
llm_app = APP()
Create dummy feedback¶
By setting the provider to Dummy(), you can build your evaluation suite and then easily substitute a real model provider (e.g. OpenAI) later.
In [ ]:
Copied!
from trulens.providers.huggingface.provider import Dummy
# Dummy stands in for the Hugging Face provider: it makes no network calls
# and returns constant results, so the feedback wiring can be prototyped
# quickly.
hugs = Dummy()
# Feedback built from the provider's positive_sentiment function;
# on_output() selects what it is evaluated on (presumably the app's main
# output -- see the TruLens Feedback docs).
f_positive_sentiment = Feedback(hugs.positive_sentiment).on_output()
from trulens.providers.huggingface.provider import Dummy
# Dummy stands in for the Hugging Face provider: it makes no network calls
# and returns constant results, so the feedback wiring can be prototyped
# quickly.
hugs = Dummy()
# Feedback built from the provider's positive_sentiment function;
# on_output() selects what it is evaluated on (presumably the app's main
# output -- see the TruLens Feedback docs).
f_positive_sentiment = Feedback(hugs.positive_sentiment).on_output()
Create the app¶
In [ ]:
Copied!
# add trulens as a context manager for llm_app with dummy feedback
from trulens.apps.custom import TruCustomApp
# Wrap llm_app in a TruCustomApp; the result is used later as a context
# manager (`with tru_app as recording`) around calls to the app.
tru_app = TruCustomApp(
    llm_app,
    app_name="LLM App",
    app_version="v1",
    feedbacks=[f_positive_sentiment],  # the dummy sentiment feedback
)
# add trulens as a context manager for llm_app with dummy feedback
from trulens.apps.custom import TruCustomApp
# Wrap llm_app in a TruCustomApp; the result is used later as a context
# manager (`with tru_app as recording`) around calls to the app.
tru_app = TruCustomApp(
    llm_app,
    app_name="LLM App",
    app_version="v1",
    feedbacks=[f_positive_sentiment],  # the dummy sentiment feedback
)
Run the app¶
In [ ]:
Copied!
# Run the streaming app inside the tru_app context and print each text
# fragment as it arrives.
with tru_app as recording:
    # The generator is consumed inside the context; end="" keeps the streamed
    # fragments on one continuous line.
    for chunk in llm_app.stream_completion(
        "give me a good name for a colorful sock company and the story behind its founding"
    ):
        print(chunk, end="")

# Retrieve the record once the stream has been fully consumed.
record = recording.get()
# Run the streaming app inside the tru_app context and print each text
# fragment as it arrives.
with tru_app as recording:
    # The generator is consumed inside the context; end="" keeps the streamed
    # fragments on one continuous line.
    for chunk in llm_app.stream_completion(
        "give me a good name for a colorful sock company and the story behind its founding"
    ):
        print(chunk, end="")

# Retrieve the record once the stream has been fully consumed.
record = recording.get()
In [ ]:
Copied!
# Check full output:
record.main_output
# Check full output:
record.main_output
In [ ]:
Copied!
# Check costs; note that only the number of chunks is presently tracked for streaming apps.
record.cost
# Check costs; note that only the number of chunks is presently tracked for streaming apps.
record.cost
In [ ]:
Copied!
session.get_leaderboard(app_ids=[tru_app.app_id])
session.get_leaderboard(app_ids=[tru_app.app_id])