Azure OpenAI Llama Index Quickstart¶
In this quickstart you will create a simple Llama Index App and learn how to log it and get feedback on an LLM response using both an embedding and chat completion model from Azure OpenAI.
In [ ]:
Copied!
# !pip install trulens trulens-apps-llamaindex trulens-providers-openai llama_index==0.9.13 llama-index-llms-azure-openai llama-index-embeddings-azure-openai langchain==0.0.346 html2text==2020.1.16
# !pip install trulens trulens-apps-llamaindex trulens-providers-openai llama_index==0.9.13 llama-index-llms-azure-openai llama-index-embeddings-azure-openai langchain==0.0.346 html2text==2020.1.16
Add API keys¶
For this quickstart, you will need a larger set of information from Azure OpenAI compared to typical OpenAI usage. These values can be retrieved from the Azure OpenAI dashboard at https://oai.azure.com/. The deployment name used below is also found on that page.
In [ ]:
Copied!
# Check your https://oai.azure.com dashboard to retrieve these values.
import os

# Configure Azure OpenAI credentials and API settings in one shot.
os.environ.update({
    "AZURE_OPENAI_API_KEY": "...",  # azure
    "AZURE_OPENAI_ENDPOINT": "https://<your endpoint here>.openai.azure.com/",  # azure
    "OPENAI_API_VERSION": "2023-07-01-preview",  # may need updating
    "OPENAI_API_TYPE": "azure",
})
# Check your https://oai.azure.com dashboard to retrieve params:
import os

os.environ["AZURE_OPENAI_API_KEY"] = "..."  # azure
# NOTE: the "<your endpoint here>" placeholder was swallowed as an HTML tag
# in this rendered copy; restored to match the source cell.
os.environ["AZURE_OPENAI_ENDPOINT"] = (
    "https://<your endpoint here>.openai.azure.com/"  # azure
)
os.environ["OPENAI_API_VERSION"] = "2023-07-01-preview"  # may need updating
os.environ["OPENAI_API_TYPE"] = "azure"
Import from TruLens¶
In [ ]:
Copied!
# Main TruLens tools: feedback definitions, the session store, and the
# LlamaIndex instrumentation wrapper.
from trulens.apps.llamaindex import TruLlama
from trulens.core import Feedback, TruSession

# Start from a clean database so this quickstart's records are the only ones.
session = TruSession()
session.reset_database()
# Import the TruLens primitives used throughout this notebook.
from trulens.core import TruSession
from trulens.core import Feedback
from trulens.apps.llamaindex import TruLlama

# Fresh session backed by an empty database.
session = TruSession()
session.reset_database()
Create Simple LLM Application¶
This example uses LlamaIndex, which internally uses an Azure OpenAI LLM and embedding model.
In [ ]:
Copied!
import os

from llama_index.core import VectorStoreIndex
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding
from llama_index.legacy import ServiceContext, set_global_service_context
from llama_index.legacy.readers import SimpleWebPageReader
from llama_index.llms.azure_openai import AzureOpenAI

# Shared Azure credentials, pulled from the environment configured earlier.
azure_kwargs = dict(
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    api_version=os.environ["OPENAI_API_VERSION"],
)

# Chat-completion model deployed in your Azure OpenAI resource.
llm = AzureOpenAI(
    model="gpt-35-turbo",
    deployment_name="<your deployment>",
    **azure_kwargs,
)

# A separate embedding deployment is required as well.
embed_model = AzureOpenAIEmbedding(
    model="text-embedding-ada-002",
    deployment_name="<your deployment>",
    **azure_kwargs,
)

# Load Paul Graham's essay as the corpus to index.
documents = SimpleWebPageReader(html_to_text=True).load_data(
    ["http://paulgraham.com/worked.html"]
)

# Register both models globally so the index builder picks them up.
service_context = ServiceContext.from_defaults(llm=llm, embed_model=embed_model)
set_global_service_context(service_context)

index = VectorStoreIndex.from_documents(documents)
query_engine = index.as_query_engine()
import os

from llama_index.core import VectorStoreIndex
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding
from llama_index.legacy import ServiceContext
from llama_index.legacy import set_global_service_context
from llama_index.legacy.readers import SimpleWebPageReader
from llama_index.llms.azure_openai import AzureOpenAI

# Get the chat-completion model from Azure.
# NOTE: the "<your deployment>" placeholders in this rendered copy were
# swallowed as HTML tags; restored here so the example is copy-pasteable.
llm = AzureOpenAI(
    model="gpt-35-turbo",
    deployment_name="<your deployment>",
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    api_version=os.environ["OPENAI_API_VERSION"],
)

# You need to deploy your own embedding model as well as your own chat
# completion model.
embed_model = AzureOpenAIEmbedding(
    model="text-embedding-ada-002",
    deployment_name="<your deployment>",
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    api_version=os.environ["OPENAI_API_VERSION"],
)

# Fetch and convert the essay used as the corpus.
documents = SimpleWebPageReader(html_to_text=True).load_data(
    ["http://paulgraham.com/worked.html"]
)

# Make both models the global defaults before building the index.
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model=embed_model,
)
set_global_service_context(service_context)

index = VectorStoreIndex.from_documents(documents)
query_engine = index.as_query_engine()
Send your first request¶
In [ ]:
Copied!
# Smoke-test the (uninstrumented) query engine with a single question.
query = "What is most interesting about this essay?"
answer = query_engine.query(query)

print(answer.get_formatted_sources())
print(f"query was: {query}")
print(f"answer was: {answer}")
# Rendered duplicate of the cell above: ask the same smoke-test question.
question = "What is most interesting about this essay?"
response = query_engine.query(question)
print(response.get_formatted_sources())
print("query was:", question)
print("answer was:", response)
Initialize Feedback Function(s)¶
In [ ]:
Copied!
import numpy as np
# NOTE(review): Groundedness is imported from the legacy v2 feedback module;
# newer TruLens releases expose groundedness directly on the provider —
# confirm this path exists in the installed version.
from trulens.feedback.v2.feedback import Groundedness
from trulens.providers.openai import AzureOpenAI

# Initialize AzureOpenAI-based feedback function collection class.
# deployment_name must match a chat-completion deployment in your resource.
azopenai = AzureOpenAI(deployment_name="truera-gpt-35-turbo")

# Question/answer relevance between overall question and answer.
f_qa_relevance = Feedback(
azopenai.relevance, name="Answer Relevance"
).on_input_output()

# Question/statement relevance between question and each context chunk,
# evaluated per retrieved source node and averaged with np.mean.
f_context_relevance = (
Feedback(
azopenai.context_relevance_with_cot_reasons, name="Context Relevance"
)
.on_input()
.on(TruLlama.select_source_nodes().node.text)
.aggregate(np.mean)
)

# groundedness of output on the context: collect() gathers all source-node
# texts into one list so the answer is checked against the full context.
groundedness = Groundedness(groundedness_provider=azopenai)
f_groundedness = (
Feedback(
groundedness.groundedness_measure_with_cot_reasons, name="Groundedness"
)
.on(TruLlama.select_source_nodes().node.text.collect())
.on_output()
.aggregate(groundedness.grounded_statements_aggregator)
)
import numpy as np
# Rendered duplicate of the feedback-definition cell above.
from trulens.feedback.v2.feedback import Groundedness
from trulens.providers.openai import AzureOpenAI

# Initialize AzureOpenAI-based feedback function collection class:
azopenai = AzureOpenAI(deployment_name="truera-gpt-35-turbo")

# Question/answer relevance between overall question and answer.
f_qa_relevance = Feedback(
azopenai.relevance, name="Answer Relevance"
).on_input_output()

# Question/statement relevance between question and each context chunk
# (one score per source node, aggregated by mean).
f_context_relevance = (
Feedback(
azopenai.context_relevance_with_cot_reasons, name="Context Relevance"
)
.on_input()
.on(TruLlama.select_source_nodes().node.text)
.aggregate(np.mean)
)

# groundedness of output on the context
groundedness = Groundedness(groundedness_provider=azopenai)
f_groundedness = (
Feedback(
groundedness.groundedness_measure_with_cot_reasons, name="Groundedness"
)
.on(TruLlama.select_source_nodes().node.text.collect())
.on_output()
.aggregate(groundedness.grounded_statements_aggregator)
)
Custom functions can also use the Azure provider¶
In [ ]:
Copied!
from typing import Dict, Tuple

from trulens.feedback import prompts


class Custom_AzureOpenAI(AzureOpenAI):
    """AzureOpenAI feedback provider extended with two custom functions.

    NOTE(review): indentation in the exported copy of this cell was stripped,
    leaving the class body unindented (a SyntaxError); restored here.
    """

    def style_check_professional(self, response: str) -> float:
        """Grade the professional style of the response.

        Args:
            response (str): text to be graded for professional style.

        Returns:
            float: A value between 0 and 1. 0 being "not professional" and
                1 being "professional".
        """
        professional_prompt = str.format(
            "Please rate the professionalism of the following text on a scale from 0 to 10, where 0 is not at all professional and 10 is extremely professional: \n\n{}",
            response,
        )
        return self.generate_score(system_prompt=professional_prompt)

    def context_relevance_with_cot_reasons_extreme(
        self, question: str, statement: str
    ) -> Tuple[float, Dict]:
        """Tweaked question/statement relevance pushed to extreme scores.

        Completes a template to check the relevance of the statement to the
        question. Scoring guidelines for scores 5-8 are removed to push the
        LLM to more extreme scores. Also uses chain-of-thought methodology
        and emits the reasons.

        Args:
            question (str): A question being asked.
            statement (str): A statement to the question.

        Returns:
            Tuple[float, Dict]: A value between 0 ("not relevant") and
                1 ("relevant"), plus the chain-of-thought reasons.
        """
        system_prompt = str.format(
            prompts.context_relevance, question=question, statement=statement
        )
        # Remove scoring guidelines around middle scores so the model commits
        # to clearly-relevant / clearly-irrelevant judgments.
        system_prompt = system_prompt.replace(
            "- STATEMENT that is RELEVANT to most of the QUESTION should get a score of 5, 6, 7 or 8. Higher score indicates more RELEVANCE.\n\n",
            "",
        )
        system_prompt = system_prompt.replace(
            "RELEVANCE:", prompts.COT_REASONS_TEMPLATE
        )
        return self.generate_score_and_reasons(system_prompt)


custom_azopenai = Custom_AzureOpenAI(deployment_name="truera-gpt-35-turbo")

# Question/statement relevance between question and each context chunk.
f_context_relevance_extreme = (
    Feedback(
        custom_azopenai.context_relevance_with_cot_reasons_extreme,
        name="Context Relevance - Extreme",
    )
    .on_input()
    .on(TruLlama.select_source_nodes().node.text)
    .aggregate(np.mean)
)

# Professional-style check applied to the app's output only.
f_style_check = Feedback(
    custom_azopenai.style_check_professional, name="Professional Style"
).on_output()
from typing import Dict, Tuple

from trulens.feedback import prompts


class Custom_AzureOpenAI(AzureOpenAI):
    """Custom feedback functions extending the AzureOpenAI provider.

    Rendered duplicate of the class cell above; the export stripped the
    class/method indentation (making it unparseable), restored here.
    """

    def style_check_professional(self, response: str) -> float:
        """Custom feedback function to grade the professional style of the
        response, extending AzureOpenAI provider.

        Args:
            response (str): text to be graded for professional style.

        Returns:
            float: A value between 0 and 1. 0 being "not professional" and
                1 being "professional".
        """
        professional_prompt = str.format(
            "Please rate the professionalism of the following text on a scale from 0 to 10, where 0 is not at all professional and 10 is extremely professional: \n\n{}",
            response,
        )
        return self.generate_score(system_prompt=professional_prompt)

    def context_relevance_with_cot_reasons_extreme(
        self, question: str, statement: str
    ) -> Tuple[float, Dict]:
        """Tweaked version of question statement relevance, extending
        AzureOpenAI provider.

        A function that completes a template to check the relevance of the
        statement to the question. Scoring guidelines for scores 5-8 are
        removed to push the LLM to more extreme scores. Also uses chain of
        thought methodology and emits the reasons.

        Args:
            question (str): A question being asked.
            statement (str): A statement to the question.

        Returns:
            Tuple[float, Dict]: relevance score in [0, 1] and the reasons.
        """
        system_prompt = str.format(
            prompts.context_relevance, question=question, statement=statement
        )
        # remove scoring guidelines around middle scores
        system_prompt = system_prompt.replace(
            "- STATEMENT that is RELEVANT to most of the QUESTION should get a score of 5, 6, 7 or 8. Higher score indicates more RELEVANCE.\n\n",
            "",
        )
        system_prompt = system_prompt.replace(
            "RELEVANCE:", prompts.COT_REASONS_TEMPLATE
        )
        return self.generate_score_and_reasons(system_prompt)


custom_azopenai = Custom_AzureOpenAI(deployment_name="truera-gpt-35-turbo")

# Question/statement relevance between question and each context chunk.
f_context_relevance_extreme = (
    Feedback(
        custom_azopenai.context_relevance_with_cot_reasons_extreme,
        name="Context Relevance - Extreme",
    )
    .on_input()
    .on(TruLlama.select_source_nodes().node.text)
    .aggregate(np.mean)
)

f_style_check = Feedback(
    custom_azopenai.style_check_professional, name="Professional Style"
).on_output()
Instrument chain for logging with TruLens¶
In [ ]:
Copied!
# Wrap the query engine so every call is traced and scored by the
# feedback functions defined above.
feedback_functions = [
    f_groundedness,
    f_qa_relevance,
    f_context_relevance,
    f_context_relevance_extreme,
    f_style_check,
]
tru_query_engine_recorder = TruLlama(
    query_engine,
    app_name="LlamaIndex_App1_AzureOpenAI",
    feedbacks=feedback_functions,
)
# Rendered duplicate: instrument the engine with TruLens for logging.
tru_query_engine_recorder = TruLlama(
    query_engine,
    app_name="LlamaIndex_App1_AzureOpenAI",
    feedbacks=[f_groundedness, f_qa_relevance, f_context_relevance,
               f_context_relevance_extreme, f_style_check],
)
In [ ]:
Copied!
# Run the query inside the recorder context so TruLens captures the trace
# and evaluates the feedback functions on it.
query = "What is most interesting about this essay?"
with tru_query_engine_recorder as recording:
    answer = query_engine.query(query)
print(answer.get_formatted_sources())
print(f"query was: {query}")
print(f"answer was: {answer}")
# Rendered duplicate: record one query and show the response with sources.
question = "What is most interesting about this essay?"
with tru_query_engine_recorder as recording:
    result = query_engine.query(question)
print(result.get_formatted_sources())
print("query was:", question)
print("answer was:", result)
Explore in a Dashboard¶
In [ ]:
Copied!
# Explore the recorded app in the TruLens dashboard (a local Streamlit app).
from trulens.dashboard import run_dashboard

run_dashboard(session)
# stop_dashboard(session) # stop if needed
from trulens.dashboard import run_dashboard

# Launch the local Streamlit dashboard for this session; stop it later with
# stop_dashboard(session) if needed.
run_dashboard(session)
Or view results directly in your notebook¶
In [ ]:
Copied!
# Pull the raw records and feedback results for this app.
app_ids = [tru_query_engine_recorder.app_id]
records, feedback = session.get_records_and_feedback(app_ids=app_ids)
records
# Rendered duplicate: fetch records and feedback for the instrumented app.
records, feedback = session.get_records_and_feedback(
    app_ids=[tru_query_engine_recorder.app_id],
)
records
In [ ]:
Copied!
# Show aggregate feedback scores for the instrumented app. The second line is
# the notebook's rendered copy of the same cell; the bare expression displays
# the leaderboard dataframe in a notebook.
session.get_leaderboard(app_ids=[tru_query_engine_recorder.app_id])
session.get_leaderboard(app_ids=[tru_query_engine_recorder.app_id])