201 changes: 105 additions & 96 deletions app.py
@@ -3,6 +3,7 @@
import pandas as pd
import uvicorn
import sys
import time
# import nest_asyncio

from utils import CloudObjectStorageReader, CustomWatsonX, create_sparse_vector_query_with_model, create_sparse_vector_query_with_model_and_filter
@@ -75,13 +76,18 @@
"cosEndpointURL": os.environ.get("COS_ENDPOINT_URL")
}

generate_params = {
GenParams.MAX_NEW_TOKENS: 250,
GenParams.DECODING_METHOD: "greedy",
GenParams.STOP_SEQUENCES: ['END',';',';END'],
GenParams.REPETITION_PENALTY: 1
}
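
The generate_params dictionary above sets module-level generation defaults. A minimal sketch of how such a GenParams dictionary is typically passed to a watsonx.ai foundation model, assuming the ibm-watson-machine-learning Model interface; the model id shown is illustrative and not taken from this diff:

from ibm_watson_machine_learning.foundation_models import Model

# Sketch only, not part of this diff: wire generate_params into a model call.
sketch_model = Model(
    model_id="ibm/granite-13b-chat-v2",  # illustrative model id
    params=generate_params,              # the defaults defined above
    credentials=wml_credentials,
    project_id=project_id,
)
sketch_text = sketch_model.generate_text(prompt="Summarize the ingested documents.")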

# Create a global client connection to elastic search
async_es_client = AsyncElasticsearch(
wxd_creds["wxdurl"],
basic_auth=(wxd_creds["username"], wxd_creds["password"]),
verify_certs=False,
request_timeout=3600,
)
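
Since the AsyncElasticsearch client is now created once at module scope and shared by every request, it can be released when the app stops. A minimal sketch, assuming FastAPI's shutdown event hook:

# Sketch only, not part of this diff: close the shared connection pool on shutdown.
@app.on_event("shutdown")
async def close_async_es_client():
    await async_es_client.close()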


# Create a watsonx client cache for faster calls.
custom_watsonx_cache = {}

@app.get("/")
def index():
@@ -115,13 +121,6 @@ async def ingestDocs(request: ingestRequest)->ingestResponse:
documents = await cos_reader.load_data()
print(f"Total documents: {len(documents)}\nExample document:\n{documents[0]}")

async_es_client = AsyncElasticsearch(
wxd_creds["wxdurl"],
basic_auth=(wxd_creds["username"], wxd_creds["password"]),
verify_certs=False,
request_timeout=3600,
)

await async_es_client.info()

# Pipeline must occur before index due to pipeline dependency
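
The comment above refers to creating the ELSER ingest pipeline before the index that declares it as its default pipeline. A hedged sketch of that ordering with the async client; the pipeline name, index name, and field names here are illustrative rather than taken from this repo:

# Sketch only: the pipeline must exist before an index that references it.
await async_es_client.ingest.put_pipeline(
    id="elser-ingest-pipeline",  # illustrative name
    processors=[{
        "inference": {
            "model_id": ".elser_model_1",
            "target_field": "ml",
            "field_map": {"body_content_field": "text_field"},
            "inference_config": {"text_expansion": {"results_field": "tokens"}},
        }
    }],
)
await async_es_client.indices.create(
    index="example-index",  # illustrative name
    settings={"index": {"default_pipeline": "elser-ingest-pipeline"}},
    mappings={"properties": {"ml.tokens": {"type": "rank_features"}}},
)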
@@ -213,15 +212,19 @@ def queryLLM(request: queryLLMRequest)->queryLLMResponse:
index_name = request.es_index_name
index_text_field = request.es_index_text_field
es_model_name = request.es_model_name
model_text_field = request.es_model_text_field
num_results = request.num_results
llm_params = request.llm_params
es_filters = request.filters
llm_instructions = request.llm_instructions

# Sets the llm instruction if the user provides it
if not request.llm_instructions:
llm_instructions = os.environ.get("LLM_INSTRUCTIONS")
else:
llm_instructions = request.llm_instructions
# Sanity check for instructions
if "{query_str}" not in llm_instructions or "{context_str}" not in llm_instructions:
data_response = {
"llm_response": "",
"references": [{"error":"Please add {query_str} and {context_str} placeholders to the instructions."}]
}
return queryLLMResponse(**data_response)
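
The check above rejects instruction strings that the PromptTemplate cannot fill in. An illustrative llm_instructions value that would pass it; the wording is an example, not the project default:

# Example only: both placeholders must appear somewhere in the template.
llm_instructions = (
    "[INST] Answer the question using only the context below.\n"
    "Context:\n{context_str}\n\n"
    "Question: {query_str} [/INST]"
)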

# Format payload for later query
payload = {
@@ -231,89 +234,95 @@ def queryLLM(request: queryLLMRequest)->queryLLMResponse:
}

# Attempt to connect to ElasticSearch and call Watsonx for a response
try:
# Setting up the structure of the payload for the query engine
user_query = payload["input_data"][0]["values"][0][0]

# Create the prompt template based on llm_instructions
prompt_template = PromptTemplate(llm_instructions)

# Create the watsonx LLM object that will be used for the RAG pattern
Settings.llm = CustomWatsonX(
credentials=wml_credentials,
project_id=project_id,
model_id=llm_params.model_id,
validate_model_id=False,
additional_kwargs=llm_params.parameters.dict(),
)
Settings.embed_model = None

# Create a client connection to elastic search
async_es_client = AsyncElasticsearch(
wxd_creds["wxdurl"],
basic_auth=(wxd_creds["username"], wxd_creds["password"]),
verify_certs=False,
request_timeout=3600,
)
# try:
# Setting up the structure of the payload for the query engine
user_query = payload["input_data"][0]["values"][0][0]

# Create a vector store using the elastic client
vector_store = ElasticsearchStore(
es_client=async_es_client,
index_name=index_name,
text_field=index_text_field
)
# Create the prompt template based on llm_instructions
prompt_template = PromptTemplate(llm_instructions)

# Retrieve an index of the ingested documents in the vector store
# for later retrieval and querying
index = VectorStoreIndex.from_vector_store(vector_store=vector_store)

# Create a retriever object using the index and setting params

if es_filters:
print(es_filters)
for k, v in es_filters.items():
print(k)
print(v)
filters = MetadataFilters(
filters=[
MetadataFilter(key=k,operator=FilterOperator.EQ, value=v) for k, v in es_filters.items()
]
)

query_engine = index.as_query_engine(
text_qa_template=prompt_template,
similarity_top_k=num_results,
vector_store_query_mode="sparse",
vector_store_kwargs={
"custom_query": create_sparse_vector_query_with_model_and_filter(es_model_name, filters=filters)
},
)
else:
query_engine = index.as_query_engine(
text_qa_template=prompt_template,
similarity_top_k=num_results,
vector_store_query_mode="sparse",
vector_store_kwargs={
"custom_query": create_sparse_vector_query_with_model(es_model_name)
},
)

# Finally query the engine with the user question
response = query_engine.query(user_query)

# Format the data
data_response = {
"llm_response": response.response,
"references": [node.to_dict() for node in response.source_nodes]
}
# Create the watsonx LLM object that will be used for the RAG pattern
Settings.llm = get_custom_watsonx(llm_params.model_id, llm_params.parameters.dict())
Settings.embed_model = None

return queryLLMResponse(**data_response)
# Create a vector store using the elastic client
vector_store = ElasticsearchStore(
es_client=async_es_client,
index_name=index_name,
text_field=index_text_field
)

except Exception as e:
return queryLLMResponse(
llm_response = "",
references=[{"error": repr(e)}]
# Retrieve an index of the ingested documents in the vector store
# for later retrieval and querying
index = VectorStoreIndex.from_vector_store(vector_store=vector_store)

if es_filters:
print(es_filters)
for k, v in es_filters.items():
print(k)
print(v)
filters = MetadataFilters(
filters=[
MetadataFilter(key=k,operator=FilterOperator.EQ, value=v) for k, v in es_filters.items()
]
)

query_engine = index.as_query_engine(
text_qa_template=prompt_template,
similarity_top_k=num_results,
vector_store_query_mode="sparse",
vector_store_kwargs={
"custom_query": create_sparse_vector_query_with_model_and_filter(es_model_name, model_text_field=model_text_field, filters=filters)
},
)
else:
query_engine = index.as_query_engine(
text_qa_template=prompt_template,
similarity_top_k=num_results,
vector_store_query_mode="sparse",
vector_store_kwargs={
"custom_query": create_sparse_vector_query_with_model(es_model_name, model_text_field=model_text_field)
},
)
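
Both branches route retrieval through a custom sparse query helper. For an .elser_model_1 style model this usually expands to an Elasticsearch text_expansion query over the tokens field; a hedged sketch of the rough query shape those helpers presumably build, using the request defaults for the field and model names:

# Sketch only: approximate shape of an ELSER sparse retrieval query.
def example_sparse_query(query_str: str) -> dict:
    return {
        "query": {
            "text_expansion": {
                "ml.tokens": {                     # es_model_text_field default
                    "model_id": ".elser_model_1",  # es_model_name default
                    "model_text": query_str,
                }
            }
        }
    }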
print(user_query)
# Finally query the engine with the user question
response = query_engine.query(user_query)
print(response)
data_response = {
"llm_response": response.response,
"references": [node.to_dict() for node in response.source_nodes]
}

return queryLLMResponse(**data_response)

def get_custom_watsonx(model_id, additional_kwargs):
# Serialize additional_kwargs to a JSON string, with sorted keys
additional_kwargs_str = json.dumps(additional_kwargs, sort_keys=True)
# Generate a hash of the serialized string
additional_kwargs_hash = hash(additional_kwargs_str)

cache_key = f"{model_id}_{additional_kwargs_hash}"

# Check if the object already exists in the cache
if cache_key in custom_watsonx_cache:
return custom_watsonx_cache[cache_key]

# If not in the cache, create a new CustomWatsonX object and store it
custom_watsonx = CustomWatsonX(
credentials=wml_credentials,
project_id=project_id,
model_id=model_id,
validate_model_id=False,
additional_kwargs=additional_kwargs,
)
custom_watsonx_cache[cache_key] = custom_watsonx
return custom_watsonx
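
One design note on the cache key: Python's built-in hash() of a string is randomized per process, so it works for this in-process dict but would not give stable keys across workers or restarts. A small sketch of a stable alternative using hashlib, should that ever matter:

import hashlib
import json

def make_stable_cache_key(model_id, additional_kwargs):
    # Stable across processes, unlike the built-in hash().
    kwargs_str = json.dumps(additional_kwargs, sort_keys=True)
    digest = hashlib.sha256(kwargs_str.encode("utf-8")).hexdigest()
    return f"{model_id}_{digest}"

Usage mirrors the current key construction: make_stable_cache_key(model_id, additional_kwargs) would replace the f-string built from hash().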

#except Exception as e:
# return queryLLMResponse(
# llm_response = "",
# references=[{"error": repr(e)}]
# )


if __name__ == '__main__':
3 changes: 2 additions & 1 deletion customTypes/queryLLMRequest.py
@@ -33,11 +33,12 @@ class Config:
protected_namespaces = ()

class queryLLMRequest(BaseModel):
llm_instructions: Optional[str] = Field(None, title="LLM Instructions", description="Instructions for LLM")
question: str
es_index_name: str
es_index_text_field: Optional[str] = Field(default="body_content_field")
es_model_name: Optional[str] = Field(default=".elser_model_1")
es_model_text_field: Optional[str] = Field(default="ml.tokens")
llm_instructions: Optional[str] = Field(default="[INST]<<SYS>>You are a helpful, respectful, and honest assistant. Always answer as helpfully as possible, while being safe. Be brief in your answers. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don\\'\''t know the answer to a question, please do not share false information. <</SYS>>\nGenerate the next agent response by answering the question. You are provided several documents with titles. If the answer comes from different documents please mention all possibilities and use the tiles of documents to separate between topics or domains. Answer with no more than 150 words. If you cannot base your answer on the given document, please state that you do not have an answer.\n{context_str}<</SYS>>\n\n{query_str} Answer with no more than 150 words. If you cannot base your answer on the given document, please state that you do not have an answer. [/INST]", title="LLM Instructions", description="Instructions for LLM")
num_results: Optional[str] = Field(default="5")
llm_params: Optional[LLMParams] = LLMParams()
filters: Optional[Dict[str, Any]] = Field(None,
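
With the new es_model_text_field and default llm_instructions, a request body for the query endpoint built from this model might look like the following; only the field names come from queryLLMRequest, the values are illustrative:

# Illustrative payload matching the queryLLMRequest fields.
example_request = {
    "question": "What does the ingest pipeline do?",
    "es_index_name": "example-index",            # illustrative
    "es_index_text_field": "body_content_field",
    "es_model_name": ".elser_model_1",
    "es_model_text_field": "ml.tokens",
    "num_results": "5",
    "filters": {"category": "docs"},             # illustrative metadata filter
}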
2 changes: 2 additions & 0 deletions env
@@ -8,6 +8,8 @@ WXD_USERNAME=""
WXD_PASSWORD=""
WXD_URL=""
WX_URL="https://us-south.ml.cloud.ibm.com"
MODEL_ID="ibm/granite-13b-chat-v2"
MODEL_PARAMETERS="{\"model_parameters\":{\"decoding_method\":\"greedy\",\"max_new_tokens\":500,\"min_new_tokens\":0,\"random_seed\":null,\"stop_sequences\":[],\"temperature\":0.7,\"top_k\":50,\"top_p\":1,\"repetition_penalty\":1}}"
INDEX_NAME=""
PIPELINE_NAME=""
LLM_INSTRUCTIONS="[INST]<<SYS>>You are a helpful, respectful, and honest assistant. Always answer as helpfully as possible, while being safe. Be brief in your answers. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don\\'\''t know the answer to a question, please do not share false information. <</SYS>>\nGenerate the next agent response by answering the question. You are provided several documents with titles. If the answer comes from different documents please mention all possibilities and use the tiles of documents to separate between topics or domains. Answer with no more than 150 words. If you cannot base your answer on the given document, please state that you do not have an answer.\n{context_str}<</SYS>>\n\n{query_str} Answer with no more than 150 words. If you cannot base your answer on the given document, please state that you do not have an answer. [/INST]"
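
MODEL_PARAMETERS is stored as an escaped JSON string, so it presumably gets parsed before use. A minimal sketch, with the local variable names being assumptions:

import json
import os

# Parse the escaped JSON held in the MODEL_PARAMETERS environment variable.
raw_params = os.environ.get("MODEL_PARAMETERS", "{}")
model_parameters = json.loads(raw_params).get("model_parameters", {})
max_new_tokens = model_parameters.get("max_new_tokens", 500)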
16 changes: 16 additions & 0 deletions model_paramters.json
@@ -0,0 +1,16 @@
{
"model_parameters": {
"model_id": "meta-llama/llama-2-70b-chat",
"inputs": [],
"parameters": {
"decoding_method": "greedy",
"max_new_tokens": 500,
"min_new_tokens": 1,
"moderations": {
"hap_input": "true",
"hap_output": "true",
"threshold": 0.75
}
}
}
}
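
Nothing in this diff shows where the new file is read, so the loading sketch below is an assumption about intended use; the filename is kept exactly as committed:

import json

# Load the checked-in defaults from model_paramters.json.
with open("model_paramters.json") as f:
    model_config = json.load(f)["model_parameters"]

model_id = model_config["model_id"]        # "meta-llama/llama-2-70b-chat"
parameters = model_config["parameters"]    # decoding_method, max_new_tokens, moderations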