201 changes: 105 additions & 96 deletions app.py
@@ -3,6 +3,7 @@
import pandas as pd
import uvicorn
import sys
import time
# import nest_asyncio

from utils import CloudObjectStorageReader, CustomWatsonX, create_sparse_vector_query_with_model, create_sparse_vector_query_with_model_and_filter
@@ -75,13 +76,18 @@
"cosEndpointURL": os.environ.get("COS_ENDPOINT_URL")
}

generate_params = {
GenParams.MAX_NEW_TOKENS: 250,
GenParams.DECODING_METHOD: "greedy",
GenParams.STOP_SEQUENCES: ['END',';',';END'],
GenParams.REPETITION_PENALTY: 1
}
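
The generate_params dictionary above sets module-level generation defaults. A minimal sketch of how such a GenParams dictionary is typically passed to a watsonx.ai foundation model, assuming the ibm-watson-machine-learning Model interface; the model id shown is illustrative and not taken from this diff:

from ibm_watson_machine_learning.foundation_models import Model

# Sketch only, not part of this diff: wire generate_params into a model call.
sketch_model = Model(
    model_id="ibm/granite-13b-chat-v2",  # illustrative model id
    params=generate_params,              # the defaults defined above
    credentials=wml_credentials,
    project_id=project_id,
)
sketch_text = sketch_model.generate_text(prompt="Summarize the ingested documents.")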

# Create a global client connection to elastic search
async_es_client = AsyncElasticsearch(
wxd_creds["wxdurl"],
basic_auth=(wxd_creds["username"], wxd_creds["password"]),
verify_certs=False,
request_timeout=3600,
)
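
Since the AsyncElasticsearch client is now created once at module scope and shared by every request, it can be released when the app stops. A minimal sketch, assuming FastAPI's shutdown event hook:

# Sketch only, not part of this diff: close the shared connection pool on shutdown.
@app.on_event("shutdown")
async def close_async_es_client():
    await async_es_client.close()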


# Create a watsonx client cache for faster calls.
custom_watsonx_cache = {}

@app.get("/")
def index():
@@ -115,13 +121,6 @@ async def ingestDocs(request: ingestRequest)->ingestResponse:
documents = await cos_reader.load_data()
print(f"Total documents: {len(documents)}\nExample document:\n{documents[0]}")

async_es_client = AsyncElasticsearch(
wxd_creds["wxdurl"],
basic_auth=(wxd_creds["username"], wxd_creds["password"]),
verify_certs=False,
request_timeout=3600,
)

await async_es_client.info()

# Pipeline must occur before index due to pipeline dependency
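
The comment above refers to creating the ELSER ingest pipeline before the index that declares it as its default pipeline. A hedged sketch of that ordering with the async client; the pipeline name, index name, and field names here are illustrative rather than taken from this repo:

# Sketch only: the pipeline must exist before an index that references it.
await async_es_client.ingest.put_pipeline(
    id="elser-ingest-pipeline",  # illustrative name
    processors=[{
        "inference": {
            "model_id": ".elser_model_1",
            "target_field": "ml",
            "field_map": {"body_content_field": "text_field"},
            "inference_config": {"text_expansion": {"results_field": "tokens"}},
        }
    }],
)
await async_es_client.indices.create(
    index="example-index",  # illustrative name
    settings={"index": {"default_pipeline": "elser-ingest-pipeline"}},
    mappings={"properties": {"ml.tokens": {"type": "rank_features"}}},
)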
@@ -213,15 +212,19 @@ def queryLLM(request: queryLLMRequest)->queryLLMResponse:
index_name = request.es_index_name
index_text_field = request.es_index_text_field
es_model_name = request.es_model_name
model_text_field = request.es_model_text_field
num_results = request.num_results
llm_params = request.llm_params
es_filters = request.filters
llm_instructions = request.llm_instructions

# Sets the llm instruction if the user provides it
if not request.llm_instructions:
llm_instructions = os.environ.get("LLM_INSTRUCTIONS")
else:
llm_instructions = request.llm_instructions
# Sanity check for instructions
if "{query_str}" not in llm_instructions or "{context_str}" not in llm_instructions:
data_response = {
"llm_response": "",
"references": [{"error":"Please add {query_str} and {context_str} placeholders to the instructions."}]
}
return queryLLMResponse(**data_response)
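
The check above rejects instruction strings that the PromptTemplate cannot fill in. An illustrative llm_instructions value that would pass it; the wording is an example, not the project default:

# Example only: both placeholders must appear somewhere in the template.
llm_instructions = (
    "[INST] Answer the question using only the context below.\n"
    "Context:\n{context_str}\n\n"
    "Question: {query_str} [/INST]"
)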

# Format payload for later query
payload = {
@@ -231,89 +234,95 @@ def queryLLM(request: queryLLMRequest)->queryLLMResponse:
}

# Attempt to connect to ElasticSearch and call Watsonx for a response
try:
# Setting up the structure of the payload for the query engine
user_query = payload["input_data"][0]["values"][0][0]

# Create the prompt template based on llm_instructions
prompt_template = PromptTemplate(llm_instructions)

# Create the watsonx LLM object that will be used for the RAG pattern
Settings.llm = CustomWatsonX(
credentials=wml_credentials,
project_id=project_id,
model_id=llm_params.model_id,
validate_model_id=False,
additional_kwargs=llm_params.parameters.dict(),
)
Settings.embed_model = None

# Create a client connection to elastic search
async_es_client = AsyncElasticsearch(
wxd_creds["wxdurl"],
basic_auth=(wxd_creds["username"], wxd_creds["password"]),
verify_certs=False,
request_timeout=3600,
)
# try:
# Setting up the structure of the payload for the query engine
user_query = payload["input_data"][0]["values"][0][0]

# Create a vector store using the elastic client
vector_store = ElasticsearchStore(
es_client=async_es_client,
index_name=index_name,
text_field=index_text_field
)
# Create the prompt template based on llm_instructions
prompt_template = PromptTemplate(llm_instructions)

# Retrieve an index of the ingested documents in the vector store
# for later retrieval and querying
index = VectorStoreIndex.from_vector_store(vector_store=vector_store)

# Create a retriever object using the index and setting params

if es_filters:
print(es_filters)
for k, v in es_filters.items():
print(k)
print(v)
filters = MetadataFilters(
filters=[
MetadataFilter(key=k,operator=FilterOperator.EQ, value=v) for k, v in es_filters.items()
]
)

query_engine = index.as_query_engine(
text_qa_template=prompt_template,
similarity_top_k=num_results,
vector_store_query_mode="sparse",
vector_store_kwargs={
"custom_query": create_sparse_vector_query_with_model_and_filter(es_model_name, filters=filters)
},
)
else:
query_engine = index.as_query_engine(
text_qa_template=prompt_template,
similarity_top_k=num_results,
vector_store_query_mode="sparse",
vector_store_kwargs={
"custom_query": create_sparse_vector_query_with_model(es_model_name)
},
)

# Finally query the engine with the user question
response = query_engine.query(user_query)

# Format the data
data_response = {
"llm_response": response.response,
"references": [node.to_dict() for node in response.source_nodes]
}
# Create the watsonx LLM object that will be used for the RAG pattern
Settings.llm = get_custom_watsonx(llm_params.model_id, llm_params.parameters.dict())
Settings.embed_model = None

return queryLLMResponse(**data_response)
# Create a vector store using the elastic client
vector_store = ElasticsearchStore(
es_client=async_es_client,
index_name=index_name,
text_field=index_text_field
)

except Exception as e:
return queryLLMResponse(
llm_response = "",
references=[{"error": repr(e)}]
# Retrieve an index of the ingested documents in the vector store
# for later retrieval and querying
index = VectorStoreIndex.from_vector_store(vector_store=vector_store)

if es_filters:
print(es_filters)
for k, v in es_filters.items():
print(k)
print(v)
filters = MetadataFilters(
filters=[
MetadataFilter(key=k,operator=FilterOperator.EQ, value=v) for k, v in es_filters.items()
]
)

query_engine = index.as_query_engine(
text_qa_template=prompt_template,
similarity_top_k=num_results,
vector_store_query_mode="sparse",
vector_store_kwargs={
"custom_query": create_sparse_vector_query_with_model_and_filter(es_model_name, model_text_field=model_text_field, filters=filters)
},
)
else:
query_engine = index.as_query_engine(
text_qa_template=prompt_template,
similarity_top_k=num_results,
vector_store_query_mode="sparse",
vector_store_kwargs={
"custom_query": create_sparse_vector_query_with_model(es_model_name, model_text_field=model_text_field)
},
)
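
Both branches route retrieval through a custom sparse query helper. For an .elser_model_1 style model this usually expands to an Elasticsearch text_expansion query over the tokens field; a hedged sketch of the rough query shape those helpers presumably build, using the request defaults for the field and model names:

# Sketch only: approximate shape of an ELSER sparse retrieval query.
def example_sparse_query(query_str: str) -> dict:
    return {
        "query": {
            "text_expansion": {
                "ml.tokens": {                     # es_model_text_field default
                    "model_id": ".elser_model_1",  # es_model_name default
                    "model_text": query_str,
                }
            }
        }
    }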
print(user_query)
# Finally query the engine with the user question
response = query_engine.query(user_query)
print(response)
data_response = {
"llm_response": response.response,
"references": [node.to_dict() for node in response.source_nodes]
}

return queryLLMResponse(**data_response)

def get_custom_watsonx(model_id, additional_kwargs):
# Serialize additional_kwargs to a JSON string, with sorted keys
additional_kwargs_str = json.dumps(additional_kwargs, sort_keys=True)
# Generate a hash of the serialized string
additional_kwargs_hash = hash(additional_kwargs_str)

cache_key = f"{model_id}_{additional_kwargs_hash}"

# Check if the object already exists in the cache
if cache_key in custom_watsonx_cache:
return custom_watsonx_cache[cache_key]

# If not in the cache, create a new CustomWatsonX object and store it
custom_watsonx = CustomWatsonX(
credentials=wml_credentials,
project_id=project_id,
model_id=model_id,
validate_model_id=False,
additional_kwargs=additional_kwargs,
)
custom_watsonx_cache[cache_key] = custom_watsonx
return custom_watsonx
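
One design note on the cache key: Python's built-in hash() of a string is randomized per process, so it works for this in-process dict but would not give stable keys across workers or restarts. A small sketch of a stable alternative using hashlib, should that ever matter:

import hashlib
import json

def make_stable_cache_key(model_id, additional_kwargs):
    # Stable across processes, unlike the built-in hash().
    kwargs_str = json.dumps(additional_kwargs, sort_keys=True)
    digest = hashlib.sha256(kwargs_str.encode("utf-8")).hexdigest()
    return f"{model_id}_{digest}"

Usage mirrors the current key construction: make_stable_cache_key(model_id, additional_kwargs) would replace the f-string built from hash().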

#except Exception as e:
# return queryLLMResponse(
# llm_response = "",
# references=[{"error": repr(e)}]
# )


if __name__ == '__main__':
3 changes: 2 additions & 1 deletion customTypes/queryLLMRequest.py
@@ -33,11 +33,12 @@ class Config:
protected_namespaces = ()

class queryLLMRequest(BaseModel):
llm_instructions: Optional[str] = Field(None, title="LLM Instructions", description="Instructions for LLM")
question: str
es_index_name: str
es_index_text_field: Optional[str] = Field(default="body_content_field")
es_model_name: Optional[str] = Field(default=".elser_model_1")
es_model_text_field: Optional[str] = Field(default="ml.tokens")
llm_instructions: Optional[str] = Field(default="[INST]<<SYS>>You are a helpful, respectful, and honest assistant. Always answer as helpfully as possible, while being safe. Be brief in your answers. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don\\'\''t know the answer to a question, please do not share false information. <</SYS>>\nGenerate the next agent response by answering the question. You are provided several documents with titles. If the answer comes from different documents please mention all possibilities and use the tiles of documents to separate between topics or domains. Answer with no more than 150 words. If you cannot base your answer on the given document, please state that you do not have an answer.\n{context_str}<</SYS>>\n\n{query_str} Answer with no more than 150 words. If you cannot base your answer on the given document, please state that you do not have an answer. [/INST]", title="LLM Instructions", description="Instructions for LLM")
num_results: Optional[str] = Field(default="5")
llm_params: Optional[LLMParams] = LLMParams()
filters: Optional[Dict[str, Any]] = Field(None,
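
With the new es_model_text_field and default llm_instructions, a request body for the query endpoint built from this model might look like the following; only the field names come from queryLLMRequest, the values are illustrative:

# Illustrative payload matching the queryLLMRequest fields.
example_request = {
    "question": "What does the ingest pipeline do?",
    "es_index_name": "example-index",            # illustrative
    "es_index_text_field": "body_content_field",
    "es_model_name": ".elser_model_1",
    "es_model_text_field": "ml.tokens",
    "num_results": "5",
    "filters": {"category": "docs"},             # illustrative metadata filter
}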
2 changes: 2 additions & 0 deletions env
@@ -8,6 +8,8 @@ WXD_USERNAME=""
WXD_PASSWORD=""
WXD_URL=""
WX_URL="https://us-south.ml.cloud.ibm.com"
MODEL_ID="ibm/granite-13b-chat-v2"
MODEL_PARAMETERS="{\"model_parameters\":{\"decoding_method\":\"greedy\",\"max_new_tokens\":500,\"min_new_tokens\":0,\"random_seed\":null,\"stop_sequences\":[],\"temperature\":0.7,\"top_k\":50,\"top_p\":1,\"repetition_penalty\":1}}"
INDEX_NAME=""
PIPELINE_NAME=""
LLM_INSTRUCTIONS="[INST]<<SYS>>You are a helpful, respectful, and honest assistant. Always answer as helpfully as possible, while being safe. Be brief in your answers. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don\\'\''t know the answer to a question, please do not share false information. <</SYS>>\nGenerate the next agent response by answering the question. You are provided several documents with titles. If the answer comes from different documents please mention all possibilities and use the tiles of documents to separate between topics or domains. Answer with no more than 150 words. If you cannot base your answer on the given document, please state that you do not have an answer.\n{context_str}<</SYS>>\n\n{query_str} Answer with no more than 150 words. If you cannot base your answer on the given document, please state that you do not have an answer. [/INST]"
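
MODEL_PARAMETERS is stored as an escaped JSON string, so it presumably gets parsed before use. A minimal sketch, with the local variable names being assumptions:

import json
import os

# Parse the escaped JSON held in the MODEL_PARAMETERS environment variable.
raw_params = os.environ.get("MODEL_PARAMETERS", "{}")
model_parameters = json.loads(raw_params).get("model_parameters", {})
max_new_tokens = model_parameters.get("max_new_tokens", 500)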
16 changes: 16 additions & 0 deletions model_paramters.json
@@ -0,0 +1,16 @@
{
"model_parameters": {
"model_id": "meta-llama/llama-2-70b-chat",
"inputs": [],
"parameters": {
"decoding_method": "greedy",
"max_new_tokens": 500,
"min_new_tokens": 1,
"moderations": {
"hap_input": "true",
"hap_output": "true",
"threshold": 0.75
}
}
}
}
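
Nothing in this diff shows where the new file is read, so the loading sketch below is an assumption about intended use; the filename is kept exactly as committed:

import json

# Load the checked-in defaults from model_paramters.json.
with open("model_paramters.json") as f:
    model_config = json.load(f)["model_parameters"]

model_id = model_config["model_id"]        # "meta-llama/llama-2-70b-chat"
parameters = model_config["parameters"]    # decoding_method, max_new_tokens, moderations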