diff --git a/examples/sqlite_example.ipynb b/examples/sqlite_example.ipynb new file mode 100644 index 0000000..e7219da --- /dev/null +++ b/examples/sqlite_example.ipynb @@ -0,0 +1,316 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# Using Defog.ai SQL-Eval with a SQLite database" + ], + "metadata": { + "id": "HCvehWlRJHeB" + } + }, + { + "cell_type": "markdown", + "source": [ + "This notebook walks through using the sql-eval repository with a SQLite database. It uses sample data from https://github.com/defog-ai/defog-data" + ], + "metadata": { + "id": "OCfYLRx6Je0u" + } + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "5-mrSRmDM332" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### Clone the repositories and install requirements" + ], + "metadata": { + "id": "cY0PrwrcM7hj" + } + }, + { + "cell_type": "code", + "source": [ + "!git clone https://github.com/defog-ai/defog-data.git\n", + "!pip install -r /content/defog-data/requirements.txt" + ], + "metadata": { + "id": "Q_dRdIPcC5zU" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "!git clone https://github.com/defog-ai/sql-eval.git\n", + "!mv sql-eval/* .\n", + "!pip install -r requirements.txt" + ], + "metadata": { + "id": "yBXaZkMHOjw3" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "!mv defog-data/defog_data ." 
+ ], + "metadata": { + "id": "_rfF5_i_C51r" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "import os\n", + "from getpass import getpass\n", + "import pandas\n", + "os.environ[\"OPENAI_API_KEY\"] = os.environ.get(\"OPENAI_API_KEY\") or getpass(\"OpenAI API key: \")  # reuse an exported key, otherwise prompt; never save the secret in the notebook" + ], + "metadata": { + "id": "d3T5YTF0C54V" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# NOTE" + ], + "metadata": { + "id": "bVnZczKRT5gs" + } + }, + { + "cell_type": "markdown", + "source": [ + "For this example we use only 3 databases: academic, advising and atis. Before running the next step, edit the `db_names` and `test_queries` variables in ***defog-data/translate_ddl_dialect.py*** so that they contain only these 3 databases, as shown in the cell below." + ], + "metadata": { + "id": "N3vc2K0ENEcv" + } + }, + { + "cell_type": "code", + "source": [ + "# modify translate_ddl_dialect.py to have only first 3 databases\n", + "# db_names = [\n", + "# \"academic\",\n", + "# \"advising\",\n", + "# \"atis\",\n", + "# ]\n", + "# test_queries = [\n", + "# (\"academic\", \"writes\"),\n", + "# (\"advising\", \"student_record\"),\n", + "# (\"atis\", \"time_zone\"),\n", + "# ]" + ], + "metadata": { + "id": "pF0CdzUN92NS" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### Create and populate SQLite databases" + ], + "metadata": { + "id": "RD8iC9uOOWe-" + } + }, + { + "cell_type": "code", + "source": [ + "!python /content/defog-data/translate_ddl_dialect.py --dialects sqlite\n" + ], + "metadata": { + "id": "ozUvxXbCvmb5" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "NOTE: In utils/creds.py, set `\"path_to_folder\": f\"/content/sqlite_dbs/\"` for sqlite (or whichever folder contains your SQLite database files)\n" + ], + "metadata": { + "id": "RaiV1RQtO_HL" + } + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "U1PJ0keiRhMR" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + 
"source": [ + "### Trim the question set to the questions for the selected databases" + ], + "metadata": { + "id": "ANA1cXAZRiAC" + } + }, + { + "cell_type": "code", + "source": [ + "import pandas as pd\n", + "questions_gen_sqlite = pd.read_csv(\"data/questions_gen_sqlite.csv\")" + ], + "metadata": { + "id": "fMOFkE1EAMDf" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "questions_gen_sqlite_trimmed = questions_gen_sqlite[questions_gen_sqlite['db_name'].isin([\"academic\",\"advising\",\"atis\"])]" + ], + "metadata": { + "id": "niN4kV6zAY4P" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "questions_gen_sqlite_trimmed.to_csv(\"questions_gen_sqlite_trimmed.csv\", index=False)" + ], + "metadata": { + "id": "j8AARBihA6I6" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "Pb-k0jrxSPFa" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### Run evaluation code" + ], + "metadata": { + "id": "iM0gyDNeSPYG" + } + }, + { + "cell_type": "code", + "source": [ + "!python main.py \\\n", + " -db sqlite \\\n", + " -q questions_gen_sqlite_trimmed.csv \\\n", + " -o results/openai_classic.csv \\\n", + " -g oa \\\n", + " -f prompts/prompt_openai.json \\\n", + " -m gpt-3.5-turbo\n" + ], + "metadata": { + "id": "7HHm3oirxRtP" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "qBipGtrE2bRn" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "The evaluation results will be written to a CSV file in the `results` folder (`results/openai_classic.csv` for the command above)." + ], + "metadata": { + "id": "fTUCqso_SmFF" + } + }, + { + "cell_type": "markdown", + "source": [ + "### Check data in SQLite databases" + ], + "metadata": { + "id": "z1_f_ooPSeoL" + } + }, + { + "cell_type": "code", + "source": [ + "import sqlite3\n", 
+ "conn = sqlite3.connect('sqlite_dbs/academic.db')\n", + "\n", + "sql_query=\"\"\"\n", + "SELECT publication.year, AVG(publication.citation_num) AS average_citations FROM publication GROUP BY publication.year ORDER BY publication.year NULLS LAST;\n", + "\"\"\"\n", + "\n", + "try:  # conn.execute() creates the cursor implicitly\n", + "    print(conn.execute(sql_query).fetchall())\n", + "finally:  # close the connection even if the query fails\n", + "    conn.close()" + ], + "metadata": { + "id": "W3XNBatD5Lk7" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "_mjX0K4O7cC6" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file