Merge pull request openai#66 from openai/ted/update-embedding-examples

updates embedding examples based on ada-002
Ardandxb · Jan 10, 2023 · 3302d1b · 3302d1b
2 parents 502429c + 8b547fd
commit 3302d1b
Show file tree

Hide file tree

Showing 13 changed files with 19,605 additions and 12,247 deletions.
diff --git a/examples/Classification_using_embeddings.ipynb b/examples/Classification_using_embeddings.ipynb
diff --git a/examples/Clustering.ipynb b/examples/Clustering.ipynb
diff --git a/examples/Clustering_for_transaction_classification.ipynb b/examples/Clustering_for_transaction_classification.ipynb
diff --git a/examples/Get_embeddings.ipynb b/examples/Get_embeddings.ipynb
@@ -29,8 +29,7 @@
     "import openai\n",
     "\n",
     "embedding = openai.Embedding.create(\n",
-    "    input=\"Your text goes here\",\n",
-    "    engine=\"text-embedding-ada-002\"\n",
+    "    input=\"Your text goes here\", model=\"text-embedding-ada-002\"\n",
     ")[\"data\"][0][\"embedding\"]\n",
     "len(embedding)\n"
    ]
@@ -54,15 +53,11 @@
     "\n",
     "\n",
     "@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))\n",
-    "def get_embedding(text: str, engine=\"text-embedding-ada-002\") -> list[float]:\n",
+    "def get_embedding(text: str, model=\"text-embedding-ada-002\") -> list[float]:\n",
+    "    return openai.Embedding.create(input=[text], model=model)[\"data\"][0][\"embedding\"]\n",
     "\n",
-    "    # replace newlines, which can negatively affect performance.\n",
-    "    text = text.replace(\"\\n\", \" \")\n",
     "\n",
-    "    return openai.Embedding.create(input=[text], engine=engine)[\"data\"][0][\"embedding\"]\n",
-    "\n",
-    "\n",
-    "embedding = get_embedding(\"Your text goes here\", engine=\"text-embedding-ada-002\")\n",
+    "embedding = get_embedding(\"Your text goes here\", model=\"text-embedding-ada-002\")\n",
     "print(len(embedding))\n"
    ]
   }

diff --git a/examples/Obtain_dataset.ipynb b/examples/Obtain_dataset.ipynb
@@ -21,7 +21,32 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# imports\n",
+    "import pandas as pd\n",
+    "import tiktoken\n",
+    "\n",
+    "from openai.embeddings_utils import get_embedding\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# embedding model parameters\n",
+    "embedding_model = \"text-embedding-ada-002\"\n",
+    "embedding_encoding = \"cl100k_base\"  # this is the encoding for text-embedding-ada-002\n",
+    "max_tokens = 8000  # the maximum for text-embedding-ada-002 is 8191\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
    "metadata": {},
    "outputs": [
     {
@@ -97,25 +122,26 @@
        "1  Title: Arrived in pieces; Content: Not pleased...  "
       ]
      },
-     "execution_count": 1,
+     "execution_count": 8,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "import pandas as pd\n",
-    "\n",
-    "input_datapath = 'data/fine_food_reviews_1k.csv'  # to save space, we provide a pre-filtered dataset\n",
+    "# load & inspect dataset\n",
+    "input_datapath = \"data/fine_food_reviews_1k.csv\"  # to save space, we provide a pre-filtered dataset\n",
     "df = pd.read_csv(input_datapath, index_col=0)\n",
-    "df = df[['Time', 'ProductId', 'UserId', 'Score', 'Summary', 'Text']]\n",
+    "df = df[[\"Time\", \"ProductId\", \"UserId\", \"Score\", \"Summary\", \"Text\"]]\n",
     "df = df.dropna()\n",
-    "df['combined'] = \"Title: \" + df.Summary.str.strip() + \"; Content: \" + df.Text.str.strip()\n",
-    "df.head(2)"
+    "df[\"combined\"] = (\n",
+    "    \"Title: \" + df.Summary.str.strip() + \"; Content: \" + df.Text.str.strip()\n",
+    ")\n",
+    "df.head(2)\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [
     {
@@ -124,54 +150,52 @@
        "1000"
       ]
      },
-     "execution_count": 2,
+     "execution_count": 9,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
     "# subsample to 1k most recent reviews and remove samples that are too long\n",
-    "df = df.sort_values('Time').tail(1_100)\n",
-    "df.drop('Time', axis=1, inplace=True)\n",
+    "top_n = 1000\n",
+    "df = df.sort_values(\"Time\").tail(top_n * 2)  # first cut to the 2k most recent entries, assuming less than half will be filtered out\n",
+    "df.drop(\"Time\", axis=1, inplace=True)\n",
     "\n",
-    "from transformers import GPT2TokenizerFast\n",
-    "tokenizer = GPT2TokenizerFast.from_pretrained(\"gpt2\")\n",
+    "encoding = tiktoken.get_encoding(embedding_encoding)\n",
     "\n",
-    "# remove reviews that are too long\n",
-    "df['n_tokens'] = df.combined.apply(lambda x: len(tokenizer.encode(x)))\n",
-    "df = df[df.n_tokens<8000].tail(1_000)\n",
-    "len(df)"
+    "# omit reviews that are too long to embed\n",
+    "df[\"n_tokens\"] = df.combined.apply(lambda x: len(encoding.encode(x)))\n",
+    "df = df[df.n_tokens <= max_tokens].tail(top_n)\n",
+    "len(df)\n"
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### 2. Get embeddings and save them for future reuse"
+    "## 2. Get embeddings and save them for future reuse"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [],
    "source": [
-    "import openai\n",
-    "from openai.embeddings_utils import get_embedding\n",
     "# Ensure you have your API key set in your environment per the README: https://github.com/openai/openai-python#usage\n",
     "\n",
-    "# This will take just between 5 and 10 minutes\n",
-    "df['ada_similarity'] = df.combined.apply(lambda x: get_embedding(x, engine='text-embedding-ada-002'))\n",
-    "df['ada_search'] = df['ada_similarity']\n",
-    "df.to_csv('data/fine_food_reviews_with_embeddings_1k.csv')"
+    "# This may take a few minutes\n",
+    "df[\"embedding\"] = df.combined.apply(lambda x: get_embedding(x, engine=embedding_model))\n",
+    "df.to_csv(\"data/fine_food_reviews_with_embeddings_1k.csv\")\n"
    ]
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "openai-cookbook",
+   "display_name": "openai",
    "language": "python",
-   "name": "openai-cookbook"
+   "name": "python3"
   },
   "language_info": {
    "codemirror_mode": {
@@ -183,12 +207,12 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.6"
+   "version": "3.9.9 (main, Dec  7 2021, 18:04:56) \n[Clang 13.0.0 (clang-1300.0.29.3)]"
   },
   "orig_nbformat": 4,
   "vscode": {
    "interpreter": {
-    "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
+    "hash": "365536dcbde60510dc9073d6b991cd35db2d9bac356a11f5b64279a5e6708b97"
    }
   }
  },