Multi-modal LLMs and Multimodal RAG with Gemini¶
In the first example, we run a multimodal Gemini model and evaluate it with a custom multimodal feedback function.
In the second example, we run semantic evaluations on a multimodal RAG application, including the RAG triad (context relevance, groundedness, and answer relevance).
Note: google-generativeai is only available in certain countries and regions.
Original example attribution: LlamaIndex.
In [ ]:
Copied!
# !pip install trulens trulens-providers-litellm trulens-apps-llamaindex llama-index 'google-generativeai>=0.3.0' matplotlib qdrant_client
# !pip install trulens trulens-providers-litellm trulens-apps-llamaindex llama-index 'google-generativeai>=0.3.0' matplotlib qdrant_client
Use Gemini to understand Images from URLs¶
In [ ]:
Copied!
import os
os.environ["GOOGLE_API_KEY"] = "..."
import os
os.environ["GOOGLE_API_KEY"] = "..."
Initialize GeminiMultiModal
and Load Images from URLs¶
In [ ]:
Copied!
from llama_index.multi_modal_llms.gemini import GeminiMultiModal
from llama_index.multi_modal_llms.generic_utils import load_image_urls
image_urls = [
"https://storage.googleapis.com/generativeai-downloads/data/scene.jpg",
# Add yours here!
]
image_documents = load_image_urls(image_urls)
gemini_pro = GeminiMultiModal(model_name="models/gemini-pro-vision")
from llama_index.multi_modal_llms.gemini import GeminiMultiModal
from llama_index.multi_modal_llms.generic_utils import load_image_urls
image_urls = [
"https://storage.googleapis.com/generativeai-downloads/data/scene.jpg",
# Add yours here!
]
image_documents = load_image_urls(image_urls)
gemini_pro = GeminiMultiModal(model_name="models/gemini-pro-vision")
In [ ]:
Copied!
image_documents
image_documents
Setup TruLens Instrumentation¶
In [ ]:
Copied!
from trulens.core import Feedback
from trulens.core import Select
from trulens.core import TruSession
from trulens.apps.custom import TruCustomApp
from trulens.apps.custom import instrument
from trulens.core.feedback import Provider
session = TruSession()
session.reset_database()
# create a custom class to instrument
class Gemini:
    """Thin wrapper around the module-level ``gemini_pro`` model.

    It gives TruLens an ``@instrument``-decorated method to record.
    """

    @instrument
    def complete(self, prompt, image_documents):
        """Pass ``prompt`` and ``image_documents`` to ``gemini_pro.complete``.

        Returns whatever ``gemini_pro.complete`` returns, unmodified.
        """
        return gemini_pro.complete(
            prompt=prompt,
            image_documents=image_documents,
        )
gemini = Gemini()
from trulens.core import Feedback
from trulens.core import Select
from trulens.core import TruSession
from trulens.apps.custom import TruCustomApp
from trulens.apps.custom import instrument
from trulens.core.feedback import Provider
session = TruSession()
session.reset_database()
# create a custom class to instrument
class Gemini:
    """Thin wrapper around the module-level ``gemini_pro`` model.

    It gives TruLens an ``@instrument``-decorated method to record.
    """

    @instrument
    def complete(self, prompt, image_documents):
        """Pass ``prompt`` and ``image_documents`` to ``gemini_pro.complete``.

        Returns whatever ``gemini_pro.complete`` returns, unmodified.
        """
        return gemini_pro.complete(
            prompt=prompt,
            image_documents=image_documents,
        )
gemini = Gemini()
Setup custom provider with Gemini¶
In [ ]:
Copied!
# create a custom gemini feedback provider
class Gemini_Provider(Provider):
    """Custom feedback provider that scores images with ``gemini_pro``."""

    def city_rating(self, image_url) -> float:
        """Ask Gemini how likely the image at ``image_url`` shows a city.

        Args:
            image_url: URL of the image to load and rate.

        Returns:
            The likelihood, clamped to 0.0 (not city) .. 1.0 (city).

        Raises:
            ValueError: If the model's reply contains no number at all.
        """
        # Local import: this cell does not import ``re`` itself.
        import re

        image_documents = load_image_urls([image_url])
        reply = gemini_pro.complete(
            prompt="Is the image of a city? Respond with the float likelihood from 0.0 (not city) to 1.0 (city).",
            image_documents=image_documents,
        ).text

        try:
            city_score = float(reply)
        except ValueError:
            # The model may wrap the number in prose; fall back to the last
            # numeric token instead of failing on the surrounding words.
            numbers = re.findall(r"[-+]?(?:\d*\.\d+|\d+)", reply)
            if not numbers:
                raise ValueError(
                    f"No numeric score found in Gemini reply: {reply!r}"
                ) from None
            city_score = float(numbers[-1])

        # Keep the score inside the range the prompt asks for.
        return min(1.0, max(0.0, city_score))
gemini_provider = Gemini_Provider()
f_custom_function = Feedback(
gemini_provider.city_rating, name="City Likelihood"
).on(Select.Record.calls[0].args.image_documents[0].image_url)
# create a custom gemini feedback provider
class Gemini_Provider(Provider):
    """Custom feedback provider that scores images with ``gemini_pro``."""

    def city_rating(self, image_url) -> float:
        """Ask Gemini how likely the image at ``image_url`` shows a city.

        Args:
            image_url: URL of the image to load and rate.

        Returns:
            The likelihood, clamped to 0.0 (not city) .. 1.0 (city).

        Raises:
            ValueError: If the model's reply contains no number at all.
        """
        # Local import: this cell does not import ``re`` itself.
        import re

        image_documents = load_image_urls([image_url])
        reply = gemini_pro.complete(
            prompt="Is the image of a city? Respond with the float likelihood from 0.0 (not city) to 1.0 (city).",
            image_documents=image_documents,
        ).text

        try:
            city_score = float(reply)
        except ValueError:
            # The model may wrap the number in prose; fall back to the last
            # numeric token instead of failing on the surrounding words.
            numbers = re.findall(r"[-+]?(?:\d*\.\d+|\d+)", reply)
            if not numbers:
                raise ValueError(
                    f"No numeric score found in Gemini reply: {reply!r}"
                ) from None
            city_score = float(numbers[-1])

        # Keep the score inside the range the prompt asks for.
        return min(1.0, max(0.0, city_score))
gemini_provider = Gemini_Provider()
f_custom_function = Feedback(
gemini_provider.city_rating, name="City Likelihood"
).on(Select.Record.calls[0].args.image_documents[0].image_url)
Test custom feedback function¶
In [ ]:
Copied!
gemini_provider.city_rating(
image_url="https://storage.googleapis.com/generativeai-downloads/data/scene.jpg"
)
gemini_provider.city_rating(
image_url="https://storage.googleapis.com/generativeai-downloads/data/scene.jpg"
)
Instrument custom app with TruLens¶
In [ ]:
Copied!
tru_gemini = TruCustomApp(
gemini, app_name="gemini", feedbacks=[f_custom_function]
)
tru_gemini = TruCustomApp(
gemini, app_name="gemini", feedbacks=[f_custom_function]
)
Run the app¶
In [ ]:
Copied!
with tru_gemini as recording:
gemini.complete(
prompt="Identify the city where this photo was taken.",
image_documents=image_documents,
)
with tru_gemini as recording:
gemini.complete(
prompt="Identify the city where this photo was taken.",
image_documents=image_documents,
)
Build Multi-Modal RAG for Restaurant Recommendation¶
Our stack consists of TruLens + Gemini + LlamaIndex + Pydantic structured output capabilities.
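Pydantic structured output is a good fit here: it parses Gemini's description of each restaurant image into a typed record that the later cells turn into vector-store nodes.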
Download data to use¶
In [ ]:
Copied!
from pathlib import Path
# Directory that the restaurant screenshots below are downloaded into.
input_image_path = Path("google_restaurants")
# exist_ok makes re-running the cell a no-op and removes the
# check-then-create race of a separate exists() test.
input_image_path.mkdir(parents=True, exist_ok=True)
!wget "https://docs.google.com/uc?export=download&id=1Pg04p6ss0FlBgz00noHAOAJ1EYXiosKg" -O ./google_restaurants/miami.png
!wget "https://docs.google.com/uc?export=download&id=1dYZy17bD6pSsEyACXx9fRMNx93ok-kTJ" -O ./google_restaurants/orlando.png
!wget "https://docs.google.com/uc?export=download&id=1ShPnYVc1iL_TA1t7ErCFEAHT74-qvMrn" -O ./google_restaurants/sf.png
!wget "https://docs.google.com/uc?export=download&id=1WjISWnatHjwL4z5VD_9o09ORWhRJuYqm" -O ./google_restaurants/toronto.png
from pathlib import Path
# Directory that the restaurant screenshots below are downloaded into.
input_image_path = Path("google_restaurants")
# exist_ok makes re-running the cell a no-op and removes the
# check-then-create race of a separate exists() test.
input_image_path.mkdir(parents=True, exist_ok=True)
!wget "https://docs.google.com/uc?export=download&id=1Pg04p6ss0FlBgz00noHAOAJ1EYXiosKg" -O ./google_restaurants/miami.png
!wget "https://docs.google.com/uc?export=download&id=1dYZy17bD6pSsEyACXx9fRMNx93ok-kTJ" -O ./google_restaurants/orlando.png
!wget "https://docs.google.com/uc?export=download&id=1ShPnYVc1iL_TA1t7ErCFEAHT74-qvMrn" -O ./google_restaurants/sf.png
!wget "https://docs.google.com/uc?export=download&id=1WjISWnatHjwL4z5VD_9o09ORWhRJuYqm" -O ./google_restaurants/toronto.png
Define Pydantic Class for Structured Parser¶
In [ ]:
Copied!
import matplotlib.pyplot as plt
from PIL import Image
from pydantic import BaseModel
# Schema that the output parser fills in for each restaurant screenshot.
# Later cells iterate instances as (field_name, value) pairs: "description"
# becomes the TextNode text and every other field is stored as node metadata.
# NOTE(review): the class docstring is left verbatim; pydantic presumably
# exposes it as the schema description seen by the parser. Confirm before rewording.
class GoogleRestaurant(BaseModel):
"""Data model for a Google Restaurant."""
restaurant: str
food: str
location: str
category: str
hours: str
# price is the value the affordability feedback functions are pointed at.
price: str
# rating is the only non-string field.
rating: float
review: str
# description is used as the TextNode text rather than as metadata.
description: str
nearby_tourist_places: str
google_image_url = "./google_restaurants/miami.png"
image = Image.open(google_image_url).convert("RGB")
plt.figure(figsize=(16, 5))
plt.imshow(image)
import matplotlib.pyplot as plt
from PIL import Image
from pydantic import BaseModel
# Schema that the output parser fills in for each restaurant screenshot.
# Later cells iterate instances as (field_name, value) pairs: "description"
# becomes the TextNode text and every other field is stored as node metadata.
# NOTE(review): the class docstring is left verbatim; pydantic presumably
# exposes it as the schema description seen by the parser. Confirm before rewording.
class GoogleRestaurant(BaseModel):
"""Data model for a Google Restaurant."""
restaurant: str
food: str
location: str
category: str
hours: str
# price is the value the affordability feedback functions are pointed at.
price: str
# rating is the only non-string field.
rating: float
review: str
# description is used as the TextNode text rather than as metadata.
description: str
nearby_tourist_places: str
google_image_url = "./google_restaurants/miami.png"
image = Image.open(google_image_url).convert("RGB")
plt.figure(figsize=(16, 5))
plt.imshow(image)
In [ ]:
Copied!
from llama_index import SimpleDirectoryReader
from llama_index.multi_modal_llms import GeminiMultiModal
from llama_index.output_parsers import PydanticOutputParser
from llama_index.program import MultiModalLLMCompletionProgram
prompt_template_str = """\
can you summarize what is in the image\
and return the answer with json format \
"""
def pydantic_gemini(
    model_name, output_class, image_documents, prompt_template_str
):
    """Run one Gemini multi-modal completion and return the parsed result.

    Args:
        model_name: Gemini model identifier passed to ``GeminiMultiModal``.
        output_class: Class handed to ``PydanticOutputParser``.
        image_documents: Image documents given to the completion program.
        prompt_template_str: Prompt template given to the completion program.

    Returns:
        Whatever the ``MultiModalLLMCompletionProgram`` call returns.

    Raises:
        KeyError: If the ``GOOGLE_API_KEY`` environment variable is not set.
    """
    multi_modal_llm = GeminiMultiModal(
        api_key=os.environ["GOOGLE_API_KEY"], model_name=model_name
    )
    program = MultiModalLLMCompletionProgram.from_defaults(
        output_parser=PydanticOutputParser(output_class),
        image_documents=image_documents,
        prompt_template_str=prompt_template_str,
        multi_modal_llm=multi_modal_llm,
        verbose=True,
    )
    return program()
google_image_documents = SimpleDirectoryReader(
"./google_restaurants"
).load_data()
results = []
for img_doc in google_image_documents:
pydantic_response = pydantic_gemini(
"models/gemini-pro-vision",
GoogleRestaurant,
[img_doc],
prompt_template_str,
)
# only output the results for miami for example along with image
if "miami" in img_doc.image_path:
for r in pydantic_response:
print(r)
results.append(pydantic_response)
from llama_index import SimpleDirectoryReader
from llama_index.multi_modal_llms import GeminiMultiModal
from llama_index.output_parsers import PydanticOutputParser
from llama_index.program import MultiModalLLMCompletionProgram
prompt_template_str = """\
can you summarize what is in the image\
and return the answer with json format \
"""
def pydantic_gemini(
    model_name, output_class, image_documents, prompt_template_str
):
    """Run one Gemini multi-modal completion and return the parsed result.

    Args:
        model_name: Gemini model identifier passed to ``GeminiMultiModal``.
        output_class: Class handed to ``PydanticOutputParser``.
        image_documents: Image documents given to the completion program.
        prompt_template_str: Prompt template given to the completion program.

    Returns:
        Whatever the ``MultiModalLLMCompletionProgram`` call returns.

    Raises:
        KeyError: If the ``GOOGLE_API_KEY`` environment variable is not set.
    """
    multi_modal_llm = GeminiMultiModal(
        api_key=os.environ["GOOGLE_API_KEY"], model_name=model_name
    )
    program = MultiModalLLMCompletionProgram.from_defaults(
        output_parser=PydanticOutputParser(output_class),
        image_documents=image_documents,
        prompt_template_str=prompt_template_str,
        multi_modal_llm=multi_modal_llm,
        verbose=True,
    )
    return program()
google_image_documents = SimpleDirectoryReader(
"./google_restaurants"
).load_data()
results = []
for img_doc in google_image_documents:
pydantic_response = pydantic_gemini(
"models/gemini-pro-vision",
GoogleRestaurant,
[img_doc],
prompt_template_str,
)
# only output the results for miami for example along with image
if "miami" in img_doc.image_path:
for r in pydantic_response:
print(r)
results.append(pydantic_response)
Construct Text Nodes for Building Vector Store. Store metadata and description for each restaurant.¶
In [ ]:
Copied!
from llama_index.schema import TextNode
# Build one TextNode per parsed restaurant result.
nodes = []
for res in results:
    text_node = TextNode()
    metadata = {}
    # Each item of a result is a (field_name, value) pair.
    for field_name, value in res:
        if field_name != "description":
            metadata[field_name] = value
            continue
        # set description as text of TextNode
        text_node.text = value
    text_node.metadata = metadata
    nodes.append(text_node)
from llama_index.schema import TextNode
# Build one TextNode per parsed restaurant result.
nodes = []
for res in results:
    text_node = TextNode()
    metadata = {}
    # Each item of a result is a (field_name, value) pair.
    for field_name, value in res:
        if field_name != "description":
            metadata[field_name] = value
            continue
        # set description as text of TextNode
        text_node.text = value
    text_node.metadata = metadata
    nodes.append(text_node)
Use Gemini embeddings to build a vector store for dense retrieval, and index the restaurant nodes into it¶
In [ ]:
Copied!
from llama_index.core import ServiceContext
from llama_index.core import StorageContext
from llama_index.core import VectorStoreIndex
from llama_index.embeddings import GeminiEmbedding
from llama_index.llms import Gemini
from llama_index.vector_stores import QdrantVectorStore
import qdrant_client
# Create a local Qdrant vector store
client = qdrant_client.QdrantClient(path="qdrant_gemini_4")
vector_store = QdrantVectorStore(client=client, collection_name="collection")
# Use Gemini as the embedding model
embed_model = GeminiEmbedding(
model_name="models/embedding-001", api_key=os.environ["GOOGLE_API_KEY"]
)
service_context = ServiceContext.from_defaults(
llm=Gemini(), embed_model=embed_model
)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex(
nodes=nodes,
service_context=service_context,
storage_context=storage_context,
)
from llama_index.core import ServiceContext
from llama_index.core import StorageContext
from llama_index.core import VectorStoreIndex
from llama_index.embeddings import GeminiEmbedding
from llama_index.llms import Gemini
from llama_index.vector_stores import QdrantVectorStore
import qdrant_client
# Create a local Qdrant vector store
client = qdrant_client.QdrantClient(path="qdrant_gemini_4")
vector_store = QdrantVectorStore(client=client, collection_name="collection")
# Use Gemini as the embedding model
embed_model = GeminiEmbedding(
model_name="models/embedding-001", api_key=os.environ["GOOGLE_API_KEY"]
)
service_context = ServiceContext.from_defaults(
llm=Gemini(), embed_model=embed_model
)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex(
nodes=nodes,
service_context=service_context,
storage_context=storage_context,
)
Using Gemini to synthesize the results and recommend the restaurants to user¶
In [ ]:
Copied!
query_engine = index.as_query_engine(
similarity_top_k=1,
)
response = query_engine.query(
"recommend an inexpensive Orlando restaurant for me and its nearby tourist places"
)
print(response)
query_engine = index.as_query_engine(
similarity_top_k=1,
)
response = query_engine.query(
"recommend an inexpensive Orlando restaurant for me and its nearby tourist places"
)
print(response)
Instrument and Evaluate query_engine
with TruLens¶
In [ ]:
Copied!
import re
from google.cloud import aiplatform
from llama_index.llms import Gemini
import numpy as np
from trulens.core import Feedback
from trulens.core import Select
from trulens.core.feedback import Provider
from trulens.feedback.v2.feedback import Groundedness
from trulens.providers.litellm import LiteLLM
aiplatform.init(project="trulens-testing", location="us-central1")
gemini_provider = LiteLLM(model_engine="gemini-pro")
grounded = Groundedness(groundedness_provider=gemini_provider)
# Define a groundedness feedback function
f_groundedness = (
Feedback(
grounded.groundedness_measure_with_cot_reasons, name="Groundedness"
)
.on(
Select.RecordCalls._response_synthesizer.get_response.args.text_chunks[
0
].collect()
)
.on_output()
.aggregate(grounded.grounded_statements_aggregator)
)
# Question/answer relevance between overall question and answer.
f_qa_relevance = (
Feedback(gemini_provider.relevance, name="Answer Relevance")
.on_input()
.on_output()
)
# Question/statement relevance between question and each context chunk.
f_context_relevance = (
Feedback(gemini_provider.context_relevance, name="Context Relevance")
.on_input()
.on(
Select.RecordCalls._response_synthesizer.get_response.args.text_chunks[
0
]
)
.aggregate(np.mean)
)
gemini_text = Gemini()
# create a custom gemini feedback provider to rate affordability. Do it with len() and math and also with an LLM.
class Gemini_Provider(Provider):
    """Custom feedback provider that rates affordability from a price string.

    Both methods target the same quantity, ``1 - (len(text) - 1) / 3``:
    ``affordable_math`` computes it directly and ``affordable_llm`` asks the
    module-level ``gemini_text`` LLM to do the arithmetic.
    """

    def affordable_math(self, text: str) -> float:
        """Score affordability from the length of ``text``.

        Computes ``1 - (len(text) - 1) / 3``. Every character is counted, so
        the result only reflects the number of money signs when ``text``
        consists solely of them ("$" -> 1.0, "$$$$" -> 0.0).
        """
        return 1 - ((len(text) - 1) / 3)

    def affordable_llm(self, text: str) -> float:
        """Score affordability by asking the LLM to do the same calculation.

        The LLM is asked to count the characters in ``text``, subtract 1,
        divide by 3, and subtract the result from 1. The last number in its
        reply is returned.

        Raises:
            ValueError: If the reply contains no number.
        """
        prompt = f"Count the number of characters in the text: {text}. Then subtract 1 and divide the result by 3. Last subtract from 1. Final answer:"
        gemini_response = gemini_text.complete(prompt).text
        # gemini is a bit verbose, so do some regex to get the answer out.
        # The alternation is grouped so the optional sign applies to integers
        # as well as decimals; ungrouped, "-1" would be read as "1".
        float_pattern = r"[-+]?(?:\d*\.\d+|\d+)"
        float_numbers = re.findall(float_pattern, gemini_response)
        if not float_numbers:
            raise ValueError(
                f"No number found in Gemini response: {gemini_response!r}"
            )
        # The prompt ends with "Final answer:", so take the rightmost number.
        return float(float_numbers[-1])
gemini_provider_custom = Gemini_Provider()
f_affordable_math = Feedback(
gemini_provider_custom.affordable_math, name="Affordability - Math"
).on(
Select.RecordCalls.retriever._index.storage_context.vector_stores.default.query.rets.nodes[
0
].metadata.price
)
f_affordable_llm = Feedback(
gemini_provider_custom.affordable_llm, name="Affordability - LLM"
).on(
Select.RecordCalls.retriever._index.storage_context.vector_stores.default.query.rets.nodes[
0
].metadata.price
)
import re
from google.cloud import aiplatform
from llama_index.llms import Gemini
import numpy as np
from trulens.core import Feedback
from trulens.core import Select
from trulens.core.feedback import Provider
from trulens.feedback.v2.feedback import Groundedness
from trulens.providers.litellm import LiteLLM
aiplatform.init(project="trulens-testing", location="us-central1")
gemini_provider = LiteLLM(model_engine="gemini-pro")
grounded = Groundedness(groundedness_provider=gemini_provider)
# Define a groundedness feedback function
f_groundedness = (
Feedback(
grounded.groundedness_measure_with_cot_reasons, name="Groundedness"
)
.on(
Select.RecordCalls._response_synthesizer.get_response.args.text_chunks[
0
].collect()
)
.on_output()
.aggregate(grounded.grounded_statements_aggregator)
)
# Question/answer relevance between overall question and answer.
f_qa_relevance = (
Feedback(gemini_provider.relevance, name="Answer Relevance")
.on_input()
.on_output()
)
# Question/statement relevance between question and each context chunk.
f_context_relevance = (
Feedback(gemini_provider.context_relevance, name="Context Relevance")
.on_input()
.on(
Select.RecordCalls._response_synthesizer.get_response.args.text_chunks[
0
]
)
.aggregate(np.mean)
)
gemini_text = Gemini()
# create a custom gemini feedback provider to rate affordability. Do it with len() and math and also with an LLM.
class Gemini_Provider(Provider):
    """Custom feedback provider that rates affordability from a price string.

    Both methods target the same quantity, ``1 - (len(text) - 1) / 3``:
    ``affordable_math`` computes it directly and ``affordable_llm`` asks the
    module-level ``gemini_text`` LLM to do the arithmetic.
    """

    def affordable_math(self, text: str) -> float:
        """Score affordability from the length of ``text``.

        Computes ``1 - (len(text) - 1) / 3``. Every character is counted, so
        the result only reflects the number of money signs when ``text``
        consists solely of them ("$" -> 1.0, "$$$$" -> 0.0).
        """
        return 1 - ((len(text) - 1) / 3)

    def affordable_llm(self, text: str) -> float:
        """Score affordability by asking the LLM to do the same calculation.

        The LLM is asked to count the characters in ``text``, subtract 1,
        divide by 3, and subtract the result from 1. The last number in its
        reply is returned.

        Raises:
            ValueError: If the reply contains no number.
        """
        prompt = f"Count the number of characters in the text: {text}. Then subtract 1 and divide the result by 3. Last subtract from 1. Final answer:"
        gemini_response = gemini_text.complete(prompt).text
        # gemini is a bit verbose, so do some regex to get the answer out.
        # The alternation is grouped so the optional sign applies to integers
        # as well as decimals; ungrouped, "-1" would be read as "1".
        float_pattern = r"[-+]?(?:\d*\.\d+|\d+)"
        float_numbers = re.findall(float_pattern, gemini_response)
        if not float_numbers:
            raise ValueError(
                f"No number found in Gemini response: {gemini_response!r}"
            )
        # The prompt ends with "Final answer:", so take the rightmost number.
        return float(float_numbers[-1])
gemini_provider_custom = Gemini_Provider()
f_affordable_math = Feedback(
gemini_provider_custom.affordable_math, name="Affordability - Math"
).on(
Select.RecordCalls.retriever._index.storage_context.vector_stores.default.query.rets.nodes[
0
].metadata.price
)
f_affordable_llm = Feedback(
gemini_provider_custom.affordable_llm, name="Affordability - LLM"
).on(
Select.RecordCalls.retriever._index.storage_context.vector_stores.default.query.rets.nodes[
0
].metadata.price
)
Test the feedback function(s)¶
In [ ]:
Copied!
grounded.groundedness_measure_with_cot_reasons(
[
"""('restaurant', 'La Mar by Gaston Acurio')
('food', 'South American')
('location', '500 Brickell Key Dr, Miami, FL 33131')
('category', 'Restaurant')
('hours', 'Open ⋅ Closes 11 PM')
('price', 'Moderate')
('rating', 4.4)
('review', '4.4 (2,104)')
('description', 'Chic waterfront find offering Peruvian & fusion fare, plus bars for cocktails, ceviche & anticucho.')
('nearby_tourist_places', 'Brickell Key Park')"""
],
"La Mar by Gaston Acurio is a delicious peruvian restaurant by the water",
)
grounded.groundedness_measure_with_cot_reasons(
[
"""('restaurant', 'La Mar by Gaston Acurio')
('food', 'South American')
('location', '500 Brickell Key Dr, Miami, FL 33131')
('category', 'Restaurant')
('hours', 'Open ⋅ Closes 11 PM')
('price', 'Moderate')
('rating', 4.4)
('review', '4.4 (2,104)')
('description', 'Chic waterfront find offering Peruvian & fusion fare, plus bars for cocktails, ceviche & anticucho.')
('nearby_tourist_places', 'Brickell Key Park')"""
],
"La Mar by Gaston Acurio is a delicious peruvian restaurant by the water",
)
In [ ]:
Copied!
gemini_provider.context_relevance(
"I'm hungry for Peruvian, and would love to eat by the water. Can you recommend a dinner spot?",
"""('restaurant', 'La Mar by Gaston Acurio')
('food', 'South American')
('location', '500 Brickell Key Dr, Miami, FL 33131')
('category', 'Restaurant')
('hours', 'Open ⋅ Closes 11 PM')
('price', 'Moderate')
('rating', 4.4)
('review', '4.4 (2,104)')
('description', 'Chic waterfront find offering Peruvian & fusion fare, plus bars for cocktails, ceviche & anticucho.')
('nearby_tourist_places', 'Brickell Key Park')""",
)
gemini_provider.context_relevance(
"I'm hungry for Peruvian, and would love to eat by the water. Can you recommend a dinner spot?",
"""('restaurant', 'La Mar by Gaston Acurio')
('food', 'South American')
('location', '500 Brickell Key Dr, Miami, FL 33131')
('category', 'Restaurant')
('hours', 'Open ⋅ Closes 11 PM')
('price', 'Moderate')
('rating', 4.4)
('review', '4.4 (2,104)')
('description', 'Chic waterfront find offering Peruvian & fusion fare, plus bars for cocktails, ceviche & anticucho.')
('nearby_tourist_places', 'Brickell Key Park')""",
)
In [ ]:
Copied!
gemini_provider.relevance(
"I'm hungry for Peruvian, and would love to eat by the water. Can you recommend a dinner spot?",
"La Mar by Gaston Acurio is a delicious peruvian restaurant by the water",
)
gemini_provider.relevance(
"I'm hungry for Peruvian, and would love to eat by the water. Can you recommend a dinner spot?",
"La Mar by Gaston Acurio is a delicious peruvian restaurant by the water",
)
In [ ]:
Copied!
gemini_provider_custom.affordable_math("$$")
gemini_provider_custom.affordable_math("$$")
In [ ]:
Copied!
gemini_provider_custom.affordable_llm("$$")
gemini_provider_custom.affordable_llm("$$")
Set up instrumentation and eval¶
In [ ]:
Copied!
from trulens.apps.llamaindex import TruLlama
tru_query_engine_recorder = TruLlama(
query_engine,
app_name="LlamaIndex_App",
app_version="1",
feedbacks=[
f_affordable_math,
f_affordable_llm,
f_context_relevance,
f_groundedness,
f_qa_relevance,
],
)
from trulens.apps.llamaindex import TruLlama
tru_query_engine_recorder = TruLlama(
query_engine,
app_name="LlamaIndex_App",
app_version="1",
feedbacks=[
f_affordable_math,
f_affordable_llm,
f_context_relevance,
f_groundedness,
f_qa_relevance,
],
)
In [ ]:
Copied!
from trulens.dashboard import run_dashboard
from trulens.dashboard import stop_dashboard
stop_dashboard(session, force=True)
run_dashboard(session)
from trulens.dashboard import run_dashboard
from trulens.dashboard import stop_dashboard
stop_dashboard(session, force=True)
run_dashboard(session)
Run the app¶
In [ ]:
Copied!
with tru_query_engine_recorder as recording:
query_engine.query(
"recommend an american restaurant in Orlando for me and its nearby tourist places"
)
with tru_query_engine_recorder as recording:
query_engine.query(
"recommend an american restaurant in Orlando for me and its nearby tourist places"
)
In [ ]:
Copied!
run_dashboard(session)
run_dashboard(session)
In [ ]:
Copied!
session.get_leaderboard(app_ids=[tru_query_engine_recorder.app_id])
session.get_leaderboard(app_ids=[tru_query_engine_recorder.app_id])