From 208465f8c1a2b0a981f7de5d09e12f836a9d657f Mon Sep 17 00:00:00 2001
From: Yasser Sheriff
Date: Sat, 13 Apr 2024 22:03:43 -0400
Subject: [PATCH 1/4] add model parameters

---
 env | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/env b/env
index fc52a20..694afd2 100644
--- a/env
+++ b/env
@@ -8,6 +8,8 @@ WXD_USERNAME=""
 WXD_PASSWORD=""
 WXD_URL=""
 WX_URL="https://us-south.ml.cloud.ibm.com"
+MODEL_ID="ibm/granite-13b-chat-v2"
+MODEL_PARAMETERS="{\"model_parameters\":{\"decoding_method\":\"greedy\",\"max_new_tokens\":500,\"min_new_tokens\":0,\"random_seed\":null,\"stop_sequences\":[],\"temperature\":0.7,\"top_k\":50,\"top_p\":1,\"repetition_penalty\":1}}"
 INDEX_NAME=""
 PIPELINE_NAME=""
 LLM_INSTRUCTIONS="[INST]<<SYS>>You are a helpful, respectful, and honest assistant. Always answer as helpfully as possible, while being safe. Be brief in your answers. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don\\'\''t know the answer to a question, please do not share false information. <</SYS>>\nGenerate the next agent response by answering the question. You are provided several documents with titles. If the answer comes from different documents please mention all possibilities and use the titles of documents to separate between topics or domains. Answer with no more than 150 words. If you cannot base your answer on the given document, please state that you do not have an answer.\n{context_str}<</SYS>>\n\n{query_str} Answer with no more than 150 words. If you cannot base your answer on the given document, please state that you do not have an answer. [/INST]"
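MODEL_PARAMETERS above packs the whole generation config into one escaped JSON string so it can live in a single env var next to MODEL_ID. A minimal sketch of turning that value back into a dict (assuming python-dotenv, which app.py already uses; the variable names below are illustrative, and the file in this repo is literally named "env" rather than ".env"):

    import json
    import os

    from dotenv import load_dotenv

    load_dotenv("env")  # point python-dotenv at the repo's env file explicitly

    raw = os.environ.get("MODEL_PARAMETERS", "{}")
    params = json.loads(raw)  # dotenv has already unescaped the \" sequences

    print(params["model_parameters"]["decoding_method"])  # -> "greedy"
    print(params["model_parameters"]["max_new_tokens"])   # -> 500

Patch 2 below does the same json.loads inside app.py.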
[/INST]" From 493def6a8d01380fbd4014bdeb4696f9cab46177 Mon Sep 17 00:00:00 2001 From: Yasser Sheriff Date: Sat, 13 Apr 2024 22:12:33 -0400 Subject: [PATCH 2/4] initialize customwatsonx --- app.py | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/app.py b/app.py index 4f479b6..8b40f09 100644 --- a/app.py +++ b/app.py @@ -51,6 +51,10 @@ load_dotenv() +#Load Model paramters +llm_parameters = os.environ.get("MODEL_PARAMETERS") +model_paramters = json.loads(llm_parameters) + #Token to IBM Cloud ibm_cloud_api_key = os.environ.get("IBM_CLOUD_API_KEY") project_id = os.environ.get("WX_PROJECT_ID") @@ -82,6 +86,14 @@ GenParams.REPETITION_PENALTY: 1 } +Settings.llm = CustomWatsonX( + credentials=wml_credentials, + project_id=project_id, + model_id=os.environ.get("MODEL_ID"), + validate_model_id=False, + additional_kwargs=model_paramters["model_parameters"], +) +Settings.embed_model = None @app.get("/") def index(): @@ -239,14 +251,14 @@ def queryLLM(request: queryLLMRequest)->queryLLMResponse: prompt_template = PromptTemplate(llm_instructions) # Create the watsonx LLM object that will be used for the RAG pattern - Settings.llm = CustomWatsonX( - credentials=wml_credentials, - project_id=project_id, - model_id=llm_params.model_id, - validate_model_id=False, - additional_kwargs=llm_params.parameters.dict(), - ) - Settings.embed_model = None + #Settings.llm = CustomWatsonX( + # credentials=wml_credentials, + # project_id=project_id, + # model_id=llm_params.model_id, + # validate_model_id=False, + # additional_kwargs=llm_params.parameters.dict(), + #) + #Settings.embed_model = None # Create a client connection to elastic search async_es_client = AsyncElasticsearch( @@ -318,4 +330,4 @@ def queryLLM(request: queryLLMRequest)->queryLLMResponse: if __name__ == '__main__': if 'uvicorn' not in sys.argv[0]: - uvicorn.run("app:app", host='0.0.0.0', port=4050, reload=True) \ No newline at end of file + uvicorn.run("app:app", host='0.0.0.0', port=4050, reload=True) From 43b28ec5687855de871a83a0f3a3d5d76757d2a4 Mon Sep 17 00:00:00 2001 From: Yasser Sheriff Date: Wed, 17 Apr 2024 13:33:48 -0400 Subject: [PATCH 3/4] add model parameters json --- app.py | 168 ++++++++++++++++++++++--------------------- model_paramters.json | 16 +++++ 2 files changed, 102 insertions(+), 82 deletions(-) create mode 100644 model_paramters.json diff --git a/app.py b/app.py index 8b40f09..431c577 100644 --- a/app.py +++ b/app.py @@ -3,6 +3,7 @@ import pandas as pd import uvicorn import sys +import time # import nest_asyncio from utils import CloudObjectStorageReader, CustomWatsonX, create_sparse_vector_query_with_model, create_sparse_vector_query_with_model_and_filter @@ -51,9 +52,8 @@ load_dotenv() -#Load Model paramters -llm_parameters = os.environ.get("MODEL_PARAMETERS") -model_paramters = json.loads(llm_parameters) +modelParamters = open('model_paramters.json') +model_paramters = json.load(modelParamters) #Token to IBM Cloud ibm_cloud_api_key = os.environ.get("IBM_CLOUD_API_KEY") @@ -86,6 +86,7 @@ GenParams.REPETITION_PENALTY: 1 } + Settings.llm = CustomWatsonX( credentials=wml_credentials, project_id=project_id, @@ -243,91 +244,94 @@ def queryLLM(request: queryLLMRequest)->queryLLMResponse: } # Attempt to connect to ElasticSearch and call Watsonx for a response - try: - # Setting up the structure of the payload for the query engine - user_query = payload["input_data"][0]["values"][0][0] - - # Create the prompt template based on llm_instructions - prompt_template = 
From 43b28ec5687855de871a83a0f3a3d5d76757d2a4 Mon Sep 17 00:00:00 2001
From: Yasser Sheriff
Date: Wed, 17 Apr 2024 13:33:48 -0400
Subject: [PATCH 3/4] add model parameters json

---
 app.py               | 168 ++++++++++++++++++++++---------
 model_paramters.json |  16 +++++
 2 files changed, 102 insertions(+), 82 deletions(-)
 create mode 100644 model_paramters.json

diff --git a/app.py b/app.py
index 8b40f09..431c577 100644
--- a/app.py
+++ b/app.py
@@ -3,6 +3,7 @@
 import pandas as pd
 import uvicorn
 import sys
+import time
 
 # import nest_asyncio
 from utils import CloudObjectStorageReader, CustomWatsonX, create_sparse_vector_query_with_model, create_sparse_vector_query_with_model_and_filter
@@ -51,9 +52,8 @@
 load_dotenv()
 
-#Load model parameters
-llm_parameters = os.environ.get("MODEL_PARAMETERS")
-model_paramters = json.loads(llm_parameters)
+modelParamters = open('model_paramters.json')
+model_paramters = json.load(modelParamters)
 
 #Token to IBM Cloud
 ibm_cloud_api_key = os.environ.get("IBM_CLOUD_API_KEY")
 project_id = os.environ.get("WX_PROJECT_ID")
@@ -86,6 +86,7 @@
     GenParams.REPETITION_PENALTY: 1
 }
 
+
 Settings.llm = CustomWatsonX(
     credentials=wml_credentials,
     project_id=project_id,
@@ -243,91 +244,94 @@ def queryLLM(request: queryLLMRequest)->queryLLMResponse:
     }
 
     # Attempt to connect to ElasticSearch and call Watsonx for a response
-    try:
-        # Setting up the structure of the payload for the query engine
-        user_query = payload["input_data"][0]["values"][0][0]
-
-        # Create the prompt template based on llm_instructions
-        prompt_template = PromptTemplate(llm_instructions)
-
-        # Create the watsonx LLM object that will be used for the RAG pattern
-        #Settings.llm = CustomWatsonX(
-        #    credentials=wml_credentials,
-        #    project_id=project_id,
-        #    model_id=llm_params.model_id,
-        #    validate_model_id=False,
-        #    additional_kwargs=llm_params.parameters.dict(),
-        #)
-        #Settings.embed_model = None
-
-        # Create a client connection to elastic search
-        async_es_client = AsyncElasticsearch(
-            wxd_creds["wxdurl"],
-            basic_auth=(wxd_creds["username"], wxd_creds["password"]),
-            verify_certs=False,
-            request_timeout=3600,
-        )
-
-        # Create a vector store using the elastic client
-        vector_store = ElasticsearchStore(
-            es_client=async_es_client,
-            index_name=index_name,
-            text_field=index_text_field
-        )
-
-        # Retrieve an index of the ingested documents in the vector store
-        # for later retrieval and querying
-        index = VectorStoreIndex.from_vector_store(vector_store=vector_store)
-
-        # Create a retriever object using the index and setting params
-
-        if es_filters:
-            print(es_filters)
-            for k, v in es_filters.items():
-                print(k)
-                print(v)
-            filters = MetadataFilters(
-                filters=[
-                    MetadataFilter(key=k,operator=FilterOperator.EQ, value=v) for k, v in es_filters.items()
-                ]
-            )
-
-            query_engine = index.as_query_engine(
-                text_qa_template=prompt_template,
-                similarity_top_k=num_results,
-                vector_store_query_mode="sparse",
-                vector_store_kwargs={
-                    "custom_query": create_sparse_vector_query_with_model_and_filter(es_model_name, filters=filters)
-                },
-            )
-        else:
-            query_engine = index.as_query_engine(
-                text_qa_template=prompt_template,
-                similarity_top_k=num_results,
-                vector_store_query_mode="sparse",
-                vector_store_kwargs={
-                    "custom_query": create_sparse_vector_query_with_model(es_model_name)
-                },
-            )
-
-        # Finally query the engine with the user question
-        response = query_engine.query(user_query)
-
-        # Format the data
-        data_response = {
-            "llm_response": response.response,
-            "references": [node.to_dict() for node in response.source_nodes]
-        }
-
-        return queryLLMResponse(**data_response)
-
-    except Exception as e:
-        return queryLLMResponse(
-            llm_response = "",
-            references=[{"error": repr(e)}]
-        )
+    #try:
+    # Setting up the structure of the payload for the query engine
+    user_query = payload["input_data"][0]["values"][0][0]
+
+    # Create the prompt template based on llm_instructions
+    prompt_template = PromptTemplate(llm_instructions)
+
+    # Create the watsonx LLM object that will be used for the RAG pattern
+
+    # Settings.llm = CustomWatsonX(
+    #     credentials=wml_credentials,
+    #     project_id=project_id,
+    #     model_id=llm_params.model_id,
+    #     validate_model_id=True,
+    #     additional_kwargs=llm_params.parameters.dict(),
+    # )
+    # Settings.embed_model = None
+
+    # Create a client connection to elastic search
+    async_es_client = AsyncElasticsearch(
+        wxd_creds["wxdurl"],
+        basic_auth=(wxd_creds["username"], wxd_creds["password"]),
+        verify_certs=False,
+        request_timeout=3600,
+    )
+
+    # Create a vector store using the elastic client
+    vector_store = ElasticsearchStore(
+        es_client=async_es_client,
+        index_name=index_name,
+        text_field=index_text_field
+    )
+
+    # Retrieve an index of the ingested documents in the vector store
+    # for later retrieval and querying
+    index = VectorStoreIndex.from_vector_store(vector_store=vector_store)
+
+    # Create a retriever object using the index and setting params
+
+    print("Elastic Search Start" + str(time.perf_counter()))
+    if es_filters:
+        print(es_filters)
+        for k, v in es_filters.items():
+            print(k)
+            print(v)
+        filters = MetadataFilters(
+            filters=[
+                MetadataFilter(key=k,operator=FilterOperator.EQ, value=v) for k, v in es_filters.items()
+            ]
+        )
+
+        query_engine = index.as_query_engine(
+            text_qa_template=prompt_template,
+            similarity_top_k=num_results,
+            vector_store_query_mode="sparse",
+            vector_store_kwargs={
+                "custom_query": create_sparse_vector_query_with_model_and_filter(es_model_name, filters=filters)
+            },
+        )
+    else:
+        query_engine = index.as_query_engine(
+            text_qa_template=prompt_template,
+            similarity_top_k=num_results,
+            vector_store_query_mode="sparse",
+            vector_store_kwargs={
+                "custom_query": create_sparse_vector_query_with_model(es_model_name)
+            },
+        )
+
+    # Finally query the engine with the user question
+    response = query_engine.query(user_query)
+
+
+    # Format the data
+    data_response = {
+        "llm_response": response.response,
+        "references": [node.to_dict() for node in response.source_nodes]
+    }
+
+    return queryLLMResponse(**data_response)
+
+#except Exception as e:
+#    return queryLLMResponse(
+#        llm_response = "",
+#        references=[{"error": repr(e)}]
+#    )
 
 if __name__ == '__main__':
     if 'uvicorn' not in sys.argv[0]:
-        uvicorn.run("app:app", host='0.0.0.0', port=4050, reload=True)
+        uvicorn.run("app:app", host='0.0.0.0', port=4050, reload=True)
\ No newline at end of file
diff --git a/model_paramters.json b/model_paramters.json
new file mode 100644
index 0000000..3539fca
--- /dev/null
+++ b/model_paramters.json
@@ -0,0 +1,16 @@
+{
+    "model_parameters": {
+        "model_id": "meta-llama/llama-2-70b-chat",
+        "inputs": [],
+        "parameters": {
+            "decoding_method": "greedy",
+            "max_new_tokens": 500,
+            "min_new_tokens": 1,
+            "moderations": {
+                "hap_input": "true",
+                "hap_output": "true",
+                "threshold": 0.75
+            }
+        }
+    }
+}
\ No newline at end of file
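Two details of this patch are worth noting. First, commenting out the try/except means any Elasticsearch or watsonx failure in /queryLLM now surfaces as an unhandled exception rather than the structured error response the except branch used to return. Second, open('model_paramters.json') is never paired with a close(), so the file handle leaks; a context manager releases it deterministically. A minimal equivalent of the same load (a sketch, not the patch's code):

    import json

    # the with-block closes the file even if json.load raises
    with open("model_paramters.json") as f:
        model_paramters = json.load(f)

    # same lookup shape the app relies on, e.g.:
    # model_paramters["model_parameters"]["parameters"]["max_new_tokens"] -> 500

Patch 4 below removes this file-based loading again in favor of per-request llm_params.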
From 6010f11f784853046763b86d278e1ceef91d0b7e Mon Sep 17 00:00:00 2001
From: Yasser Sheriff
Date: Fri, 19 Apr 2024 10:41:33 -0400
Subject: [PATCH 4/4] add cache key code

---
 app.py                         | 105 +++++++++++++++------------
 customTypes/queryLLMRequest.py |   3 +-
 2 files changed, 51 insertions(+), 57 deletions(-)

diff --git a/app.py b/app.py
index 431c577..0a6c4db 100644
--- a/app.py
+++ b/app.py
@@ -52,9 +52,6 @@
 load_dotenv()
 
-modelParamters = open('model_paramters.json')
-model_paramters = json.load(modelParamters)
-
 #Token to IBM Cloud
 ibm_cloud_api_key = os.environ.get("IBM_CLOUD_API_KEY")
 project_id = os.environ.get("WX_PROJECT_ID")
@@ -79,22 +76,18 @@
     "cosEndpointURL": os.environ.get("COS_ENDPOINT_URL")
 }
 
-generate_params = {
-    GenParams.MAX_NEW_TOKENS: 250,
-    GenParams.DECODING_METHOD: "greedy",
-    GenParams.STOP_SEQUENCES: ['END',';',';END'],
-    GenParams.REPETITION_PENALTY: 1
-}
-
-Settings.llm = CustomWatsonX(
-    credentials=wml_credentials,
-    project_id=project_id,
-    model_id=os.environ.get("MODEL_ID"),
-    validate_model_id=False,
-    additional_kwargs=model_paramters["model_parameters"],
+# Create a global client connection to elastic search
+async_es_client = AsyncElasticsearch(
+    wxd_creds["wxdurl"],
+    basic_auth=(wxd_creds["username"], wxd_creds["password"]),
+    verify_certs=False,
+    request_timeout=3600,
 )
-
-Settings.embed_model = None
+
+
+# Create a watsonx client cache for faster calls.
+custom_watsonx_cache = {}
 
 @app.get("/")
 def index():
@@ -128,13 +121,6 @@
     documents = await cos_reader.load_data()
     print(f"Total documents: {len(documents)}\nExample document:\n{documents[0]}")
 
-    async_es_client = AsyncElasticsearch(
-        wxd_creds["wxdurl"],
-        basic_auth=(wxd_creds["username"], wxd_creds["password"]),
-        verify_certs=False,
-        request_timeout=3600,
-    )
-
     await async_es_client.info()
 
     # Pipeline must occur before index due to pipeline dependency
@@ -226,15 +212,19 @@ def queryLLM(request: queryLLMRequest)->queryLLMResponse:
     index_name = request.es_index_name
     index_text_field = request.es_index_text_field
     es_model_name = request.es_model_name
+    model_text_field = request.es_model_text_field
     num_results = request.num_results
     llm_params = request.llm_params
    es_filters = request.filters
+    llm_instructions = request.llm_instructions
 
-    # Sets the llm instruction if the user provides it
-    if not request.llm_instructions:
-        llm_instructions = os.environ.get("LLM_INSTRUCTIONS")
-    else:
-        llm_instructions = request.llm_instructions
+    # Sanity check for instructions
+    if "{query_str}" not in llm_instructions or "{context_str}" not in llm_instructions:
+        data_response = {
+            "llm_response": "",
+            "references": [{"error":"Please add {query_str} and {context_str} placeholders to the instructions."}]
+        }
+        return queryLLMResponse(**data_response)
 
     # Format payload for later query
     payload = {
@@ -244,7 +234,7 @@
     }
 
     # Attempt to connect to ElasticSearch and call Watsonx for a response
-    #try:
+    # try:
     # Setting up the structure of the payload for the query engine
     user_query = payload["input_data"][0]["values"][0][0]
@@ -252,23 +242,8 @@
     prompt_template = PromptTemplate(llm_instructions)
 
     # Create the watsonx LLM object that will be used for the RAG pattern
-
-    # Settings.llm = CustomWatsonX(
-    #     credentials=wml_credentials,
-    #     project_id=project_id,
-    #     model_id=llm_params.model_id,
-    #     validate_model_id=True,
-    #     additional_kwargs=llm_params.parameters.dict(),
-    # )
-    # Settings.embed_model = None
-
-    # Create a client connection to elastic search
-    async_es_client = AsyncElasticsearch(
-        wxd_creds["wxdurl"],
-        basic_auth=(wxd_creds["username"], wxd_creds["password"]),
-        verify_certs=False,
-        request_timeout=3600,
-    )
+    Settings.llm = get_custom_watsonx(llm_params.model_id, llm_params.parameters.dict())
+    Settings.embed_model = None
 
     # Create a vector store using the elastic client
     vector_store = ElasticsearchStore(
@@ -281,9 +256,6 @@
     # for later retrieval and querying
     index = VectorStoreIndex.from_vector_store(vector_store=vector_store)
 
-    # Create a retriever object using the index and setting params
-
-    print("Elastic Search Start" + str(time.perf_counter()))
     if es_filters:
         print(es_filters)
         for k, v in es_filters.items():
@@ -300,7 +272,7 @@
             similarity_top_k=num_results,
             vector_store_query_mode="sparse",
             vector_store_kwargs={
-                "custom_query": create_sparse_vector_query_with_model_and_filter(es_model_name, filters=filters)
+                "custom_query": create_sparse_vector_query_with_model_and_filter(es_model_name, model_text_field=model_text_field, filters=filters)
             },
         )
     else:
@@ -307,17 +279,15 @@
         query_engine = index.as_query_engine(
             text_qa_template=prompt_template,
             similarity_top_k=num_results,
             vector_store_query_mode="sparse",
             vector_store_kwargs={
-                "custom_query": create_sparse_vector_query_with_model(es_model_name)
+                "custom_query": create_sparse_vector_query_with_model(es_model_name, model_text_field=model_text_field)
             },
         )
-
+    print(user_query)
     # Finally query the engine with the user question
     response = query_engine.query(user_query)
-
-
-    # Format the data
+    print(response)
     data_response = {
         "llm_response": response.response,
         "references": [node.to_dict() for node in response.source_nodes]
@@ -325,6 +295,29 @@
 
     return queryLLMResponse(**data_response)
 
+def get_custom_watsonx(model_id, additional_kwargs):
+    # Serialize additional_kwargs to a JSON string, with sorted keys
+    additional_kwargs_str = json.dumps(additional_kwargs, sort_keys=True)
+    # Generate a hash of the serialized string
+    additional_kwargs_hash = hash(additional_kwargs_str)
+
+    cache_key = f"{model_id}_{additional_kwargs_hash}"
+
+    # Check if the object already exists in the cache
+    if cache_key in custom_watsonx_cache:
+        return custom_watsonx_cache[cache_key]
+
+    # If not in the cache, create a new CustomWatsonX object and store it
+    custom_watsonx = CustomWatsonX(
+        credentials=wml_credentials,
+        project_id=project_id,
+        model_id=model_id,
+        validate_model_id=False,
+        additional_kwargs=additional_kwargs,
+    )
+    custom_watsonx_cache[cache_key] = custom_watsonx
+    return custom_watsonx
+
 #except Exception as e:
 #    return queryLLMResponse(
 #        llm_response = "",
 #        references=[{"error": repr(e)}]
 #    )
diff --git a/customTypes/queryLLMRequest.py b/customTypes/queryLLMRequest.py
index 1b00ac7..e12e442 100644
--- a/customTypes/queryLLMRequest.py
+++ b/customTypes/queryLLMRequest.py
@@ -33,11 +33,12 @@ class Config:
         protected_namespaces = ()
 
 class queryLLMRequest(BaseModel):
-    llm_instructions: Optional[str] = Field(None, title="LLM Instructions", description="Instructions for LLM")
     question: str
     es_index_name: str
     es_index_text_field: Optional[str] = Field(default="body_content_field")
     es_model_name: Optional[str] = Field(default=".elser_model_1")
+    es_model_text_field: Optional[str] = Field(default="ml.tokens")
+    llm_instructions: Optional[str] = Field(default="[INST]<<SYS>>You are a helpful, respectful, and honest assistant. Always answer as helpfully as possible, while being safe. Be brief in your answers. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don\\'\''t know the answer to a question, please do not share false information. <</SYS>>\nGenerate the next agent response by answering the question. You are provided several documents with titles. If the answer comes from different documents please mention all possibilities and use the titles of documents to separate between topics or domains. Answer with no more than 150 words. If you cannot base your answer on the given document, please state that you do not have an answer.\n{context_str}<</SYS>>\n\n{query_str} Answer with no more than 150 words. If you cannot base your answer on the given document, please state that you do not have an answer. [/INST]", title="LLM Instructions", description="Instructions for LLM")
     num_results: Optional[str] = Field(default="5")
     llm_params: Optional[LLMParams] = LLMParams()
     filters: Optional[Dict[str, Any]] = Field(None,
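A closing note on the cache key in get_custom_watsonx(): it feeds the sorted-keys JSON dump through Python's built-in hash(). That works for this in-process dict, but str hashes are salted per interpreter run (PYTHONHASHSEED), so identical parameters yield different keys across restarts and across uvicorn workers; nothing breaks, each process just warms its own cache. If a stable key were ever needed, for example to share a cache between workers, a deterministic digest would do; an illustrative sketch, not part of the patch:

    import hashlib
    import json

    def make_cache_key(model_id: str, additional_kwargs: dict) -> str:
        # sort_keys gives a canonical serialization, sha256 a stable digest
        canonical = json.dumps(additional_kwargs, sort_keys=True)
        digest = hashlib.sha256(canonical.encode("utf-8")).hexdigest()
        return f"{model_id}_{digest}"

    # identical parameters now always map to the identical key
    key = make_cache_key("meta-llama/llama-2-70b-chat",
                         {"temperature": 0.7, "top_k": 50})

The queryLLMRequest change in the same patch pairs with the new sanity check in /queryLLM: the default llm_instructions now lives in the schema, and a request that supplies custom instructions without the {query_str} and {context_str} placeholders fails fast with a structured error instead of producing a malformed prompt.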