Skip to content

Commit

Permalink
added notebooks
Browse files Browse the repository at this point in the history
  • Loading branch information
Alireza Dirafzoon committed Mar 15, 2023
1 parent 53adc17 commit bf182d5
Show file tree
Hide file tree
Showing 8 changed files with 3,510 additions and 0 deletions.
325 changes: 325 additions & 0 deletions src/MLC/notebooks/k_means_1.ipynb

Large diffs are not rendered by default.

226 changes: 226 additions & 0 deletions src/MLC/notebooks/k_means_2.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,226 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "functional-corrections",
"metadata": {},
"source": [
"## K-means with multi-dimensional data\n",
" \n",
"$X_{n \\times d}$"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "formal-antique",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import time"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "durable-horse",
"metadata": {},
"outputs": [],
"source": [
"n, d, k=1000, 20, 4\n",
"max_itr=100"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "egyptian-omaha",
"metadata": {},
"outputs": [],
"source": [
"X=np.random.random((n,d))"
]
},
{
"cell_type": "markdown",
"id": "employed-helen",
"metadata": {},
"source": [
"$$ argmin_j ||x_i - c_j||_2 $$"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "center-timer",
"metadata": {},
"outputs": [],
"source": [
"def k_means(X, k):\n",
" #Randomly Initialize Centroids\n",
" np.random.seed(0)\n",
" C= X[np.random.randint(n,size=k),:]\n",
" E=np.float('inf')\n",
" for itr in range(max_itr):\n",
" \n",
" # Find the distance of each point from the centroids \n",
" E_prev=E\n",
" E=0\n",
" center_idx=np.zeros(n)\n",
" for i in range(n):\n",
" min_d=np.float('inf')\n",
" c=0\n",
" for j in range(k):\n",
" d=np.linalg.norm(X[i,:]-C[j,:],2)\n",
" if d<min_d:\n",
" min_d=d\n",
" c=j\n",
" \n",
" E+=min_d\n",
" center_idx[i]=c\n",
" \n",
" #Find the new centers\n",
" for j in range(k):\n",
" C[j,:]=np.mean( X[center_idx==j,:] ,0)\n",
" \n",
" if itr%10==0:\n",
" print(E)\n",
" if E_prev==E:\n",
" break\n",
" \n",
" return C, E, center_idx"
]
},
{
"cell_type": "markdown",
"id": "material-hayes",
"metadata": {},
"source": [
"$$ argmin_j ||x_i - c_j||_2 $$\n",
"\n",
"$$||x_i - c_j||_2 = \\sqrt{(x_i - c_j)^T (x_i-c_j)} = \\sqrt{x_i^T x_i -2 x_i^T c_j + c_j^T c_j} $$\n",
"\n",
"- $ diag(X~X^T)$, can be used to get $x_i^T x_i$\n",
"\n",
"- $X~C^T $, can be used to get $x_i^T c_j$\n",
"\n",
"- $diag(C~C^T)$, can be used to get $c_j^T c_j$"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "colored-linux",
"metadata": {},
"outputs": [],
"source": [
"def k_means_vectorized(X, k):\n",
" \n",
" #Randomly Initialize Centroids\n",
" np.random.seed(0)\n",
" C= X[np.random.randint(n,size=k),:]\n",
" E=np.float('inf')\n",
" for itr in range(max_itr):\n",
" # Find the distance of each point from the centroids \n",
" XX= np.tile(np.diag(np.matmul(X, X.T)), (k,1) ).T\n",
" XC=np.matmul(X, C.T)\n",
" CC= np.tile(np.diag(np.matmul(C, C.T)), (n,1)) \n",
"\n",
" D= np.sqrt(XX-2*XC+CC)\n",
"\n",
" # Assign the elements to the centroids:\n",
" center_idx=np.argmin(D, axis=1)\n",
"\n",
" #Find the new centers\n",
" for j in range(k):\n",
" C[j,:]=np.mean( X[center_idx==j,:] ,0)\n",
"\n",
" #Find the error\n",
" E_prev=E\n",
" E=np.sum(D[np.arange(n),center_idx])\n",
" if itr%10==0:\n",
" print(E)\n",
" if E_prev==E:\n",
" break\n",
" \n",
" return C, E, center_idx"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "equivalent-platinum",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1517.502248752696\n",
"1218.91004301866\n",
"1217.362137659097\n",
"0.8816308975219727 seconds\n"
]
}
],
"source": [
"start=time.time()\n",
"C, E, center_idx = k_means(X, k)\n",
"print(time.time()-start,'seconds')"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "environmental-steam",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1517.502248752696\n",
"1218.9100430186547\n",
"1217.3621376590977\n",
"0.09020209312438965 seconds\n"
]
}
],
"source": [
"start=time.time()\n",
"C, E, center_idx = k_means_vectorized(X, k)\n",
"print(time.time()-start,'seconds')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "north-picking",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Loading

0 comments on commit bf182d5

Please sign in to comment.