📓 Evaluate Streaming Apps¶

This notebook shows how to evaluate a custom streaming app.

It also demonstrates the dummy feedback function provider, which behaves like the Hugging Face provider except that it makes no network calls and simply returns constant results. Use it to prototype the feedback function wiring for your apps before invoking feedback functions that may be slow to run or load.

Import libraries¶

In [ ]:

Copied!

# !pip install trulens trulens-providers-huggingface
# !pip install trulens trulens-providers-huggingface

In [ ]:

Copied!

from trulens.core import Feedback
from trulens.core import TruSession

session = TruSession()
session.reset_database()
from trulens.core import Feedback
from trulens.core import TruSession

session = TruSession()
session.reset_database()

In [ ]:

Copied!

from trulens.dashboard import run_dashboard

run_dashboard(session)
from trulens.dashboard import run_dashboard

run_dashboard(session)

Set keys¶

In [ ]:

Copied!

# import os
# os.environ["OPENAI_API_KEY"] = "sk-..."
import dotenv

dotenv.load_dotenv()
# import os
# os.environ["OPENAI_API_KEY"] = "sk-..."
import dotenv

dotenv.load_dotenv()

Build the app¶

In [ ]:

Copied!





from openai import OpenAI
from trulens.apps.custom import instrument

oai_client = OpenAI()


class APP:
    """Toy question-answering app that streams its answer piece by piece."""

    @instrument
    def stream_completion(self, prompt):
        """Yield the text fragments of a streamed chat completion.

        The prompt is wrapped in a single user message and sent through the
        module-level ``oai_client`` with ``stream=True``. Chunks that carry no
        choices, or whose first choice has no content, yield nothing.
        """
        user_turn = {
            "role": "user",
            "content": f"Please answer the question: {prompt}",
        }
        # Usage reporting is requested, but it is not yet tracked by trulens.
        usage_opts = {"include_usage": True}
        stream = oai_client.chat.completions.create(
            model="gpt-3.5-turbo",
            stream=True,
            stream_options=usage_opts,
            temperature=0,
            messages=[user_turn],
        )
        for event in stream:
            candidates = event.choices
            if len(candidates) == 0:
                # Nothing to emit for a chunk without choices.
                continue
            text = candidates[0].delta.content
            if text is None:
                continue
            yield text


llm_app = APP()
from openai import OpenAI
from trulens.apps.custom import instrument

oai_client = OpenAI()


class APP:
    """Small demo app exposing one instrumented, streaming completion call."""

    @instrument
    def stream_completion(self, prompt):
        """Stream the model's answer to ``prompt`` as a generator of strings.

        Sends one user message via the module-level ``oai_client`` with
        streaming enabled and yields the content of the first choice of every
        chunk that has one; chunks without choices or content are skipped.
        """
        conversation = [
            {
                "role": "user",
                "content": f"Please answer the question: {prompt}",
            }
        ]
        response = oai_client.chat.completions.create(
            model="gpt-3.5-turbo",
            stream=True,
            # Usage is requested here but is not yet tracked by trulens.
            stream_options={"include_usage": True},
            temperature=0,
            messages=conversation,
        )
        for piece in response:
            picks = piece.choices
            if len(picks) > 0:
                fragment = picks[0].delta.content
                if fragment is not None:
                    yield fragment


llm_app = APP()

Create dummy feedback¶

By setting the provider to Dummy(), you can set up your evaluation suite first and then easily swap in a real model provider (e.g. OpenAI) later.

In [ ]:

Copied!

# Dummy is imported from the huggingface provider module; per the notebook
# introduction it makes no network calls and just produces constant results.
from trulens.providers.huggingface.provider import Dummy

hugs = Dummy()

# Feedback function built from the dummy provider's positive_sentiment.
# NOTE(review): .on_output() presumably points the feedback at the app's
# output -- confirm against the trulens Feedback documentation.
f_positive_sentiment = Feedback(hugs.positive_sentiment).on_output()
# Dummy is imported from the huggingface provider module; per the notebook
# introduction it makes no network calls and just produces constant results.
from trulens.providers.huggingface.provider import Dummy

hugs = Dummy()

# Feedback function built from the dummy provider's positive_sentiment.
# NOTE(review): .on_output() presumably points the feedback at the app's
# output -- confirm against the trulens Feedback documentation.
f_positive_sentiment = Feedback(hugs.positive_sentiment).on_output()

Create the app¶

In [ ]:

Copied!





# Wrap llm_app in a TruCustomApp named "LLM App" (version "v1") and attach the
# dummy feedback function. The resulting tru_app is what gets entered as a
# context manager when the app is run.
from trulens.apps.custom import TruCustomApp

tru_app = TruCustomApp(
    llm_app,
    app_name="LLM App",
    app_version="v1",
    feedbacks=[f_positive_sentiment],
)
# Wrap llm_app in a TruCustomApp named "LLM App" (version "v1") and attach the
# dummy feedback function. The resulting tru_app is what gets entered as a
# context manager when the app is run.
from trulens.apps.custom import TruCustomApp

tru_app = TruCustomApp(
    llm_app,
    app_name="LLM App",
    app_version="v1",
    feedbacks=[f_positive_sentiment],
)

Run the app¶

In [ ]:

Copied!





# Run the app inside the tru_app context so the call is associated with
# `recording` (presumably this is how trulens captures the record -- confirm).
with tru_app as recording:
    # stream_completion is a generator: it is iterated here, inside the
    # context, and each chunk is printed as soon as it is yielded.
    for chunk in llm_app.stream_completion(
        "give me a good name for a colorful sock company and the store behind its founding"
    ):
        print(chunk, end="")  # end="" keeps the streamed pieces on one line

# Fetch the record from the recording once the stream has been consumed.
record = recording.get()
# Run the app inside the tru_app context so the call is associated with
# `recording` (presumably this is how trulens captures the record -- confirm).
with tru_app as recording:
    # stream_completion is a generator: it is iterated here, inside the
    # context, and each chunk is printed as soon as it is yielded.
    for chunk in llm_app.stream_completion(
        "give me a good name for a colorful sock company and the store behind its founding"
    ):
        print(chunk, end="")  # end="" keeps the streamed pieces on one line

# Fetch the record from the recording once the stream has been consumed.
record = recording.get()

In [ ]:

Copied!

# Check full output:

record.main_output
# Check full output:

record.main_output

In [ ]:

Copied!

# Check costs; note that only the number of chunks is presently tracked for streaming apps.

record.cost
# Check costs; note that only the number of chunks is presently tracked for streaming apps.

record.cost

In [ ]:

Copied!

session.get_leaderboard(app_ids=[tru_app.app_id])
session.get_leaderboard(app_ids=[tru_app.app_id])