📓 Logging Human Feedback¶
In many situations, it can be useful to log human feedback from your users about your LLM app's performance. Combining human feedback with automated feedback can help you drill down on subsets of your app that underperform and uncover new failure modes. This notebook walks through a simple example of recording human feedback with TruLens.
In [ ]:
Copied!
# !pip install trulens openai
# !pip install trulens openai
In [ ]:
Copied!
import os
from trulens.apps.custom import TruCustomApp
from trulens.core import TruSession
# Create a TruLens session and launch the dashboard UI so logged records
# and feedback can be inspected as they arrive.
session = TruSession()
session.start_dashboard()
import os
from trulens.apps.custom import TruCustomApp
from trulens.core import TruSession
# Create a TruLens session and launch the dashboard UI so logged records
# and feedback can be inspected as they arrive.
session = TruSession()
session.start_dashboard()
Set Keys¶
For this example, you need an OpenAI key.
In [ ]:
Copied!
os.environ["OPENAI_API_KEY"] = "sk-..."
os.environ["OPENAI_API_KEY"] = "sk-..."
Set up your app¶
Here we set up a custom application using just an OpenAI chat completion. The process for logging human feedback is the same however you choose to set up your app.
In [ ]:
Copied!
from openai import OpenAI

from trulens.apps.custom import instrument

oai_client = OpenAI()


class APP:
    """Minimal custom LLM app: a single instrumented chat-completion call."""

    @instrument
    def completion(self, prompt):
        """Return the model's answer to `prompt` via an OpenAI chat completion.

        Args:
            prompt: The user question to send to the model.

        Returns:
            The text content of the first completion choice.
        """
        completion = (
            oai_client.chat.completions.create(
                model="gpt-3.5-turbo",
                temperature=0,  # deterministic output for reproducibility
                messages=[
                    {
                        "role": "user",
                        "content": f"Please answer the question: {prompt}",
                    }
                ],
            )
            .choices[0]
            .message.content
        )
        return completion


llm_app = APP()

# add trulens as a context manager for llm_app
tru_app = TruCustomApp(llm_app, app_name="LLM App", app_version="v1")
from openai import OpenAI

from trulens.apps.custom import instrument

oai_client = OpenAI()


class APP:
    """Minimal custom LLM app: a single instrumented chat-completion call."""

    @instrument
    def completion(self, prompt):
        """Return the model's answer to `prompt` via an OpenAI chat completion.

        Args:
            prompt: The user question to send to the model.

        Returns:
            The text content of the first completion choice.
        """
        completion = (
            oai_client.chat.completions.create(
                model="gpt-3.5-turbo",
                temperature=0,  # deterministic output for reproducibility
                messages=[
                    {
                        "role": "user",
                        "content": f"Please answer the question: {prompt}",
                    }
                ],
            )
            .choices[0]
            .message.content
        )
        return completion


llm_app = APP()

# add trulens as a context manager for llm_app
tru_app = TruCustomApp(llm_app, app_name="LLM App", app_version="v1")
Run the app¶
In [ ]:
Copied!
# Run the app inside the TruCustomApp context manager so the call is recorded.
with tru_app as recording:
    llm_app.completion("Give me 10 names for a colorful sock company")
# Run the app inside the TruCustomApp context manager so the call is recorded.
with tru_app as recording:
    llm_app.completion("Give me 10 names for a colorful sock company")
In [ ]:
Copied!
# Retrieve the record produced by the instrumented call above; the human
# feedback below will be attached to this record.
record = recording.get()
# Retrieve the record produced by the instrumented call above; the human
# feedback below will be attached to this record.
record = recording.get()
Create a mechanism for recording human feedback.¶
Be sure to click an emoji in the record to log human feedback.
In [ ]:
Copied!
from ipywidgets import Button
from ipywidgets import HBox
from ipywidgets import Label
from ipywidgets import Textarea
from ipywidgets import VBox

from trulens.core.schema.feedback import FeedbackCall

thumbs_up_button = Button(description="👍")
thumbs_down_button = Button(description="👎")


def update_feedback(human_feedback):
    """Log a single human feedback result (1 = 👍, 0 = 👎) for this record."""
    # add the human feedback to a particular app and record
    session.add_feedback(
        name="Human Feedback",
        record_id=record.record_id,
        app_id=tru_app.app_id,
        result=human_feedback,
    )


def on_thumbs_up_button_clicked(b):
    update_feedback(human_feedback=1)
    print("👍")


def on_thumbs_down_button_clicked(b):
    update_feedback(human_feedback=0)
    print("👎")


thumbs_up_button.on_click(on_thumbs_up_button_clicked)
thumbs_down_button.on_click(on_thumbs_down_button_clicked)

# Display the record's input/output alongside the feedback buttons.
VBox([
    Label(record.main_input),
    Label(record.main_output),
    HBox([thumbs_up_button, thumbs_down_button]),
])
from ipywidgets import Button
from ipywidgets import HBox
from ipywidgets import Label
from ipywidgets import Textarea
from ipywidgets import VBox

from trulens.core.schema.feedback import FeedbackCall

thumbs_up_button = Button(description="👍")
thumbs_down_button = Button(description="👎")


def update_feedback(human_feedback):
    """Log a single human feedback result (1 = 👍, 0 = 👎) for this record."""
    # add the human feedback to a particular app and record
    session.add_feedback(
        name="Human Feedback",
        record_id=record.record_id,
        app_id=tru_app.app_id,
        result=human_feedback,
    )


def on_thumbs_up_button_clicked(b):
    update_feedback(human_feedback=1)
    print("👍")


def on_thumbs_down_button_clicked(b):
    update_feedback(human_feedback=0)
    print("👎")


thumbs_up_button.on_click(on_thumbs_up_button_clicked)
thumbs_down_button.on_click(on_thumbs_down_button_clicked)

# Display the record's input/output alongside the feedback buttons.
VBox([
    Label(record.main_input),
    Label(record.main_output),
    HBox([thumbs_up_button, thumbs_down_button]),
])
In [ ]:
Copied!
# Use Feedback call to attach more than one human feedback and optionally
# metadata. Here we allow the user to press the feedback buttons multiple times
# and give a reason for their feedback. The aggregate feedback result is
# computed in the code below as the mean of the human feedback results.
# Use FeedbackCall to attach more than one human feedback and optionally
# metadata. Here we allow the user to press the feedback buttons multiple times
# and give a reason for their feedback. The aggregate feedback result is
# computed below as the mean of the human feedback results.
calls = []

thumbs_up_button = Button(description="👍")
thumbs_down_button = Button(description="👎")
reason_area = Textarea(description="Reason")


def add_human_feedback(human_feedback, reason):
    """Record one 👍/👎 press plus its free-text reason, then re-log the mean.

    Args:
        human_feedback: 1.0 for thumbs up, 0.0 for thumbs down.
        reason: User-provided explanation; a placeholder is used when empty.
    """
    if not reason:
        reason = "No reason provided"
    # Each button press becomes its own FeedbackCall, with the reason stored
    # as call metadata.
    calls.append(
        FeedbackCall(args={}, ret=human_feedback, meta={"reason": reason})
    )
    # The aggregate result is the mean over all presses so far.
    session.add_feedback(
        name="Human Feedback with Metadata",
        record_id=record.record_id,
        app_id=tru_app.app_id,
        result=sum(call.ret for call in calls) / len(calls),
        calls=calls,
    )
    if human_feedback == 1:
        print("👍", reason)
    else:
        print("👎", reason)


def on_thumbs_up_button_clicked(b):
    add_human_feedback(1.0, reason_area.value)
    reason_area.value = ""


def on_thumbs_down_button_clicked(b):
    add_human_feedback(0.0, reason_area.value)
    reason_area.value = ""


thumbs_up_button.on_click(on_thumbs_up_button_clicked)
thumbs_down_button.on_click(on_thumbs_down_button_clicked)

# Display the record's input/output alongside the buttons and reason box.
VBox([
    Label(record.main_input),
    Label(record.main_output),
    HBox([thumbs_up_button, thumbs_down_button, reason_area]),
])
# Use FeedbackCall to attach more than one human feedback and optionally
# metadata. Here we allow the user to press the feedback buttons multiple times
# and give a reason for their feedback. The aggregate feedback result is
# computed below as the mean of the human feedback results.
calls = []

thumbs_up_button = Button(description="👍")
thumbs_down_button = Button(description="👎")
reason_area = Textarea(description="Reason")


def add_human_feedback(human_feedback, reason):
    """Record one 👍/👎 press plus its free-text reason, then re-log the mean.

    Args:
        human_feedback: 1.0 for thumbs up, 0.0 for thumbs down.
        reason: User-provided explanation; a placeholder is used when empty.
    """
    if not reason:
        reason = "No reason provided"
    # Each button press becomes its own FeedbackCall, with the reason stored
    # as call metadata.
    calls.append(
        FeedbackCall(args={}, ret=human_feedback, meta={"reason": reason})
    )
    # The aggregate result is the mean over all presses so far.
    session.add_feedback(
        name="Human Feedback with Metadata",
        record_id=record.record_id,
        app_id=tru_app.app_id,
        result=sum(call.ret for call in calls) / len(calls),
        calls=calls,
    )
    if human_feedback == 1:
        print("👍", reason)
    else:
        print("👎", reason)


def on_thumbs_up_button_clicked(b):
    add_human_feedback(1.0, reason_area.value)
    reason_area.value = ""


def on_thumbs_down_button_clicked(b):
    add_human_feedback(0.0, reason_area.value)
    reason_area.value = ""


thumbs_up_button.on_click(on_thumbs_up_button_clicked)
thumbs_down_button.on_click(on_thumbs_down_button_clicked)

# Display the record's input/output alongside the buttons and reason box.
VBox([
    Label(record.main_input),
    Label(record.main_output),
    HBox([thumbs_up_button, thumbs_down_button, reason_area]),
])
See the result logged with your app.¶
In [ ]:
Copied!
# Note that individual FeedbackCalls and their metadata are not shown in the
# leaderboard — only the aggregate feedback result appears.
session.get_leaderboard(app_ids=[tru_app.app_id])
# Note that individual FeedbackCalls and their metadata are not shown in the
# leaderboard — only the aggregate feedback result appears.
session.get_leaderboard(app_ids=[tru_app.app_id])