diff --git a/app.py b/app.py
index 4f479b6..0a6c4db 100644
--- a/app.py
+++ b/app.py
@@ -3,6 +3,7 @@
import pandas as pd
import uvicorn
import sys
+import time
# import nest_asyncio
from utils import CloudObjectStorageReader, CustomWatsonX, create_sparse_vector_query_with_model, create_sparse_vector_query_with_model_and_filter
@@ -75,13 +76,18 @@
"cosEndpointURL": os.environ.get("COS_ENDPOINT_URL")
}
-generate_params = {
- GenParams.MAX_NEW_TOKENS: 250,
- GenParams.DECODING_METHOD: "greedy",
- GenParams.STOP_SEQUENCES: ['END',';',';END'],
- GenParams.REPETITION_PENALTY: 1
-}
+# Create a global client connection to Elasticsearch
+async_es_client = AsyncElasticsearch(
+ wxd_creds["wxdurl"],
+ basic_auth=(wxd_creds["username"], wxd_creds["password"]),
+ verify_certs=False,
+ request_timeout=3600,
+)
+
+
+# Cache CustomWatsonX clients so repeated requests with the same model and parameters reuse one object.
+custom_watsonx_cache = {}
@app.get("/")
def index():
@@ -115,13 +121,6 @@ async def ingestDocs(request: ingestRequest)->ingestResponse:
documents = await cos_reader.load_data()
print(f"Total documents: {len(documents)}\nExample document:\n{documents[0]}")
- async_es_client = AsyncElasticsearch(
- wxd_creds["wxdurl"],
- basic_auth=(wxd_creds["username"], wxd_creds["password"]),
- verify_certs=False,
- request_timeout=3600,
- )
-
await async_es_client.info()
# Pipeline must occur before index due to pipeline dependency
@@ -213,15 +212,19 @@ def queryLLM(request: queryLLMRequest)->queryLLMResponse:
index_name = request.es_index_name
index_text_field = request.es_index_text_field
es_model_name = request.es_model_name
+ model_text_field = request.es_model_text_field
num_results = request.num_results
llm_params = request.llm_params
es_filters = request.filters
+ llm_instructions = request.llm_instructions
- # Sets the llm instruction if the user provides it
- if not request.llm_instructions:
- llm_instructions = os.environ.get("LLM_INSTRUCTIONS")
- else:
- llm_instructions = request.llm_instructions
+ # Sanity check for instructions
+ if "{query_str}" not in llm_instructions or "{context_str}" not in llm_instructions:
+ data_response = {
+ "llm_response": "",
+ "references": [{"error":"Please add {query_str} and {context_str} placeholders to the instructions."}]
+ }
+ return queryLLMResponse(**data_response)
# Format payload for later query
payload = {
@@ -231,89 +234,95 @@ def queryLLM(request: queryLLMRequest)->queryLLMResponse:
}
# Attempt to connect to ElasticSearch and call Watsonx for a response
- try:
- # Setting up the structure of the payload for the query engine
- user_query = payload["input_data"][0]["values"][0][0]
-
- # Create the prompt template based on llm_instructions
- prompt_template = PromptTemplate(llm_instructions)
-
- # Create the watsonx LLM object that will be used for the RAG pattern
- Settings.llm = CustomWatsonX(
- credentials=wml_credentials,
- project_id=project_id,
- model_id=llm_params.model_id,
- validate_model_id=False,
- additional_kwargs=llm_params.parameters.dict(),
- )
- Settings.embed_model = None
-
- # Create a client connection to elastic search
- async_es_client = AsyncElasticsearch(
- wxd_creds["wxdurl"],
- basic_auth=(wxd_creds["username"], wxd_creds["password"]),
- verify_certs=False,
- request_timeout=3600,
- )
+ # try:
+ # Setting up the structure of the payload for the query engine
+ user_query = payload["input_data"][0]["values"][0][0]
- # Create a vector store using the elastic client
- vector_store = ElasticsearchStore(
- es_client=async_es_client,
- index_name=index_name,
- text_field=index_text_field
- )
+ # Create the prompt template based on llm_instructions
+ prompt_template = PromptTemplate(llm_instructions)
- # Retrieve an index of the ingested documents in the vector store
- # for later retrieval and querying
- index = VectorStoreIndex.from_vector_store(vector_store=vector_store)
-
- # Create a retriever object using the index and setting params
-
- if es_filters:
- print(es_filters)
- for k, v in es_filters.items():
- print(k)
- print(v)
- filters = MetadataFilters(
- filters=[
- MetadataFilter(key=k,operator=FilterOperator.EQ, value=v) for k, v in es_filters.items()
- ]
- )
-
- query_engine = index.as_query_engine(
- text_qa_template=prompt_template,
- similarity_top_k=num_results,
- vector_store_query_mode="sparse",
- vector_store_kwargs={
- "custom_query": create_sparse_vector_query_with_model_and_filter(es_model_name, filters=filters)
- },
- )
- else:
- query_engine = index.as_query_engine(
- text_qa_template=prompt_template,
- similarity_top_k=num_results,
- vector_store_query_mode="sparse",
- vector_store_kwargs={
- "custom_query": create_sparse_vector_query_with_model(es_model_name)
- },
- )
-
- # Finally query the engine with the user question
- response = query_engine.query(user_query)
-
- # Format the data
- data_response = {
- "llm_response": response.response,
- "references": [node.to_dict() for node in response.source_nodes]
- }
+ # Create the watsonx LLM object that will be used for the RAG pattern
+ Settings.llm = get_custom_watsonx(llm_params.model_id, llm_params.parameters.dict())
+ Settings.embed_model = None
- return queryLLMResponse(**data_response)
+ # Create a vector store using the elastic client
+ vector_store = ElasticsearchStore(
+ es_client=async_es_client,
+ index_name=index_name,
+ text_field=index_text_field
+ )
- except Exception as e:
- return queryLLMResponse(
- llm_response = "",
- references=[{"error": repr(e)}]
+ # Retrieve an index of the ingested documents in the vector store
+ # for later retrieval and querying
+ index = VectorStoreIndex.from_vector_store(vector_store=vector_store)
+
+ if es_filters:
+ print(es_filters)
+ for k, v in es_filters.items():
+ print(k)
+ print(v)
+ filters = MetadataFilters(
+ filters=[
+ MetadataFilter(key=k,operator=FilterOperator.EQ, value=v) for k, v in es_filters.items()
+ ]
)
+
+ query_engine = index.as_query_engine(
+ text_qa_template=prompt_template,
+ similarity_top_k=num_results,
+ vector_store_query_mode="sparse",
+ vector_store_kwargs={
+ "custom_query": create_sparse_vector_query_with_model_and_filter(es_model_name, model_text_field=model_text_field, filters=filters)
+ },
+ )
+ else:
+ query_engine = index.as_query_engine(
+ text_qa_template=prompt_template,
+ similarity_top_k=num_results,
+ vector_store_query_mode="sparse",
+ vector_store_kwargs={
+ "custom_query": create_sparse_vector_query_with_model(es_model_name, model_text_field=model_text_field)
+ },
+ )
+ print(user_query)
+ # Finally query the engine with the user question
+ response = query_engine.query(user_query)
+ print(response)
+ data_response = {
+ "llm_response": response.response,
+ "references": [node.to_dict() for node in response.source_nodes]
+ }
+
+ return queryLLMResponse(**data_response)
+
+def get_custom_watsonx(model_id, additional_kwargs):
+ # Serialize additional_kwargs to a JSON string, with sorted keys
+ additional_kwargs_str = json.dumps(additional_kwargs, sort_keys=True)
+ # Generate a hash of the serialized string
+ additional_kwargs_hash = hash(additional_kwargs_str)
+
+ cache_key = f"{model_id}_{additional_kwargs_hash}"
+
+ # Check if the object already exists in the cache
+ if cache_key in custom_watsonx_cache:
+ return custom_watsonx_cache[cache_key]
+
+ # If not in the cache, create a new CustomWatsonX object and store it
+ custom_watsonx = CustomWatsonX(
+ credentials=wml_credentials,
+ project_id=project_id,
+ model_id=model_id,
+ validate_model_id=False,
+ additional_kwargs=additional_kwargs,
+ )
+ custom_watsonx_cache[cache_key] = custom_watsonx
+ return custom_watsonx
+
+#except Exception as e:
+# return queryLLMResponse(
+# llm_response = "",
+# references=[{"error": repr(e)}]
+# )
if __name__ == '__main__':
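The new `get_custom_watsonx` helper keys its cache on `hash(json.dumps(additional_kwargs, sort_keys=True))`. Note that `json` has to be imported in app.py (the import is not visible in this hunk), and that Python's built-in `hash()` is salted per interpreter run, which is fine for this in-memory dict but not for anything logged or persisted. A minimal sketch of a run-stable alternative key, under those assumptions:

```python
# Sketch of a run-stable cache key for get_custom_watsonx(); hashlib replaces
# the built-in hash(), which is salted per interpreter run (PYTHONHASHSEED).
import hashlib
import json

def make_cache_key(model_id: str, additional_kwargs: dict) -> str:
    # Sort keys so logically equal parameter dicts map to the same key.
    serialized = json.dumps(additional_kwargs, sort_keys=True)
    digest = hashlib.sha256(serialized.encode("utf-8")).hexdigest()
    return f"{model_id}_{digest}"

print(make_cache_key("ibm/granite-13b-chat-v2", {"max_new_tokens": 500}))
```

Either key works for the per-process cache used here; the sketch only removes the dependence on hash randomization.
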
diff --git a/customTypes/queryLLMRequest.py b/customTypes/queryLLMRequest.py
index 1b00ac7..e12e442 100644
--- a/customTypes/queryLLMRequest.py
+++ b/customTypes/queryLLMRequest.py
@@ -33,11 +33,12 @@ class Config:
protected_namespaces = ()
class queryLLMRequest(BaseModel):
- llm_instructions: Optional[str] = Field(None, title="LLM Instructions", description="Instructions for LLM")
question: str
es_index_name: str
es_index_text_field: Optional[str] = Field(default="body_content_field")
es_model_name: Optional[str] = Field(default=".elser_model_1")
+ es_model_text_field: Optional[str] = Field(default="ml.tokens")
+    llm_instructions: Optional[str] = Field(default="[INST]<>You are a helpful, respectful, and honest assistant. Always answer as helpfully as possible, while being safe. Be brief in your answers. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please do not share false information. <>\nGenerate the next agent response by answering the question. You are provided several documents with titles. If the answer comes from different documents please mention all possibilities and use the titles of documents to separate between topics or domains. Answer with no more than 150 words. If you cannot base your answer on the given document, please state that you do not have an answer.\n{context_str}<>\n\n{query_str} Answer with no more than 150 words. If you cannot base your answer on the given document, please state that you do not have an answer. [/INST]", title="LLM Instructions", description="Instructions for LLM")
num_results: Optional[str] = Field(default="5")
llm_params: Optional[LLMParams] = LLMParams()
filters: Optional[Dict[str, Any]] = Field(None,
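The request model now carries the default prompt itself instead of falling back to the LLM_INSTRUCTIONS environment variable, and it adds `es_model_text_field` (default `ml.tokens`) for the sparse-vector tokens field. A sketch of building a request against the new model; the values are placeholders, and the route path that serves it is not shown in this diff:

```python
# Hypothetical request exercising the new fields; only the field names and
# defaults come from queryLLMRequest above, the values are made up.
from customTypes.queryLLMRequest import queryLLMRequest

request = queryLLMRequest(
    question="What does the ingest pipeline do?",
    es_index_name="my-index",
    es_model_name=".elser_model_1",
    es_model_text_field="ml.tokens",  # new field added by this change
    # The queryLLM handler now rejects instructions that are missing either
    # placeholder, so custom instructions must keep both of these.
    llm_instructions="[INST]{context_str}\n\n{query_str}[/INST]",
    num_results="3",
)
print(request.question, request.es_model_text_field)
```
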
diff --git a/env b/env
index fc52a20..694afd2 100644
--- a/env
+++ b/env
@@ -8,6 +8,8 @@ WXD_USERNAME=""
WXD_PASSWORD=""
WXD_URL=""
WX_URL="https://us-south.ml.cloud.ibm.com"
+MODEL_ID="ibm/granite-13b-chat-v2"
+MODEL_PARAMETERS="{\"model_parameters\":{\"decoding_method\":\"greedy\",\"max_new_tokens\":500,\"min_new_tokens\":0,\"random_seed\":null,\"stop_sequences\":[],\"temperature\":0.7,\"top_k\":50,\"top_p\":1,\"repetition_penalty\":1}}"
INDEX_NAME=""
PIPELINE_NAME=""
LLM_INSTRUCTIONS="[INST]<>You are a helpful, respectful, and honest assistant. Always answer as helpfully as possible, while being safe. Be brief in your answers. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please do not share false information. <>\nGenerate the next agent response by answering the question. You are provided several documents with titles. If the answer comes from different documents please mention all possibilities and use the titles of documents to separate between topics or domains. Answer with no more than 150 words. If you cannot base your answer on the given document, please state that you do not have an answer.\n{context_str}<>\n\n{query_str} Answer with no more than 150 words. If you cannot base your answer on the given document, please state that you do not have an answer. [/INST]"
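The two new variables move the model choice and generation parameters into configuration. `MODEL_PARAMETERS` is an escaped JSON object; the code that consumes it is not part of this diff, so the parse below is only a sketch of the expected shape:

```python
# Sketch of reading the new variables; the actual consumer is not in this diff.
import json
import os

model_id = os.environ.get("MODEL_ID", "ibm/granite-13b-chat-v2")
raw = os.environ.get("MODEL_PARAMETERS", "{}")
model_parameters = json.loads(raw).get("model_parameters", {})
print(model_id, model_parameters.get("decoding_method"), model_parameters.get("max_new_tokens"))
```
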
diff --git a/model_paramters.json b/model_paramters.json
new file mode 100644
index 0000000..3539fca
--- /dev/null
+++ b/model_paramters.json
@@ -0,0 +1,16 @@
+{
+ "model_parameters": {
+ "model_id": "meta-llama/llama-2-70b-chat",
+ "inputs": [],
+ "parameters": {
+ "decoding_method": "greedy",
+ "max_new_tokens": 500,
+ "min_new_tokens": 1,
+ "moderations": {
+ "hap_input": "true",
+ "hap_output": "true",
+ "threshold": 0.75
+ }
+ }
+ }
+}
\ No newline at end of file
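
The new file mirrors the shape of MODEL_PARAMETERS but names a model and adds HAP moderation settings. No code in this diff references it yet, so the loader below is only a sketch of the structure (note the filename as committed is model_paramters.json):

```python
# Sketch of loading the committed config file; nothing in this diff reads it.
import json

with open("model_paramters.json", encoding="utf-8") as f:
    cfg = json.load(f)["model_parameters"]

parameters = cfg["parameters"]
print(cfg["model_id"], parameters["max_new_tokens"], parameters["moderations"]["threshold"])
```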