watched lectures and completed first half of homework

Qing-zhan · Dec 11, 2015 · 111f024 · 111f024
1 parent 8495617
commit 111f024
Showing 1 changed file with 354 additions and 0 deletions.
diff --git a/ex2/ex2.ipynb b/ex2/ex2.ipynb
@@ -0,0 +1,354 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Programming Exercise 2: Logistic Regression"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "%matplotlib inline\n",
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt\n",
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 1 Logistic Regression"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "datafile = 'data/ex2data1.txt'\n",
+    "#!head $datafile\n",
+    "#Parse the comma separated file; unpack=True makes each row of cols one file column\n",
+    "cols = np.loadtxt(datafile, delimiter=',', usecols=(0, 1, 2), unpack=True)\n",
+    "#The last file column is the \"y\" vector, kept 2-D with a single column\n",
+    "y = np.array(cols[-1:]).T\n",
+    "m = y.size # number of training examples\n",
+    "#The remaining columns form the \"X\" matrix, with the usual column of 1's put in front\n",
+    "X = np.insert(np.array(cols[:-1]).T, 0, 1, axis=1)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 1.1 Visualizing the data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "#Divide the sample into two: ones with positive classification, one with null classification\n",
+    "#Boolean masks on the label column replace the Python-level loop over row indices.\n",
+    "#A mask also keeps the result 2-D when a class has no rows, so pos[:,1] style indexing still works.\n",
+    "pos = X[y[:,0] == 1] #rows labelled 1\n",
+    "neg = X[y[:,0] == 0] #rows labelled 0\n",
+    "#Check to make sure I included all entries\n",
+    "print \"Included everything? \",(len(pos)+len(neg) == X.shape[0])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "def plotData():\n",
+    "    \"\"\"\n",
+    "    Draw the two exam scores of every sample on a new figure,\n",
+    "    with a separate marker for each class.\n",
+    "    Reads the global pos and neg arrays; columns 1 and 2 are plotted.\n",
+    "    \"\"\"\n",
+    "    plt.figure(figsize=(10,6))\n",
+    "    #(samples, marker format, legend label) for each class, drawn in this order\n",
+    "    groups = [(pos,'k+','Admitted'), (neg,'yo','Not admitted')]\n",
+    "    for samples, fmt, name in groups:\n",
+    "        plt.plot(samples[:,1], samples[:,2], fmt, label=name)\n",
+    "    plt.xlabel('Exam 1 score')\n",
+    "    plt.ylabel('Exam 2 score')\n",
+    "    plt.legend()\n",
+    "    plt.grid(True)\n",
+    "    \n",
+    "plotData()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 1.2 Implementation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "from scipy.special import expit #Vectorized sigmoid function"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "#Quick check that expit is what I think it is:\n",
+    "#evaluate it on [-10,10) in steps of 0.1 and look at the shape of the curve\n",
+    "myx = np.arange(-10,10,.1)\n",
+    "plt.plot(myx,expit(myx))\n",
+    "plt.title(\"Woohoo this looks like a sigmoid function to me.\")\n",
+    "plt.grid(True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "#Hypothesis function and cost function for logistic regression\n",
+    "def h(theta,X): #Logistic hypothesis function\n",
+    "    \"\"\"\n",
+    "    Return the sigmoid of X.theta, i.e. the predicted probability that y = 1.\n",
+    "    X is either a matrix with one sample per row or a single sample vector;\n",
+    "    theta has one entry per column of X.\n",
+    "    \"\"\"\n",
+    "    return expit(np.dot(X,theta))\n",
+    "\n",
+    "def computeCost(mytheta,myX,myy): #Cost function\n",
+    "    \"\"\"\n",
+    "    Unregularized logistic regression cost\n",
+    "        J = -(1/m) * sum( y*log(h) + (1-y)*log(1-h) )\n",
+    "    mytheta is an n- dimensional vector of parameters (flat, or n- rows and 1 column)\n",
+    "    myX is a matrix with n- columns and m- rows\n",
+    "    myy is a matrix with m- rows and 1 column\n",
+    "    Everything is computed from the arguments; no notebook globals are read,\n",
+    "    so the result stays correct after X, y and m are reassigned further down.\n",
+    "    \"\"\"\n",
+    "    #note to self: *.shape is (rows, columns)\n",
+    "    labels = np.array(myy)\n",
+    "    hyp = h(mytheta,myX)\n",
+    "    term1 = np.dot(-labels.T,np.log(hyp)) #contribution of the y=1 samples\n",
+    "    term0 = np.dot((1-labels).T,np.log(1-hyp)) #contribution of the y=0 samples\n",
+    "    #m is taken from the labels passed in rather than from the global of the same name\n",
+    "    return float((1./labels.size) * np.sum(term1 - term0))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "#Check that with theta as zeros, cost returns about 0.693:\n",
+    "#(with theta all zeros h is 0.5 for every sample, so the cost is log(2))\n",
+    "initial_theta = np.zeros((X.shape[1],1)) #one row per column of X\n",
+    "computeCost(initial_theta,X,y)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "#As an alternative to Octave's 'fminunc', we'll use the scipy.optimize function \"fmin\" (downhill simplex search)\n",
+    "from scipy import optimize\n",
+    "\n",
+    "def optimizeTheta(mytheta,myX,myy):\n",
+    "    \"\"\"\n",
+    "    Minimize computeCost over theta with scipy's fmin, starting from mytheta.\n",
+    "    myX and myy are the data the cost is evaluated on (forwarded through args).\n",
+    "    Stops after at most 400 iterations.\n",
+    "    Returns (best theta found, cost at that theta).\n",
+    "    \"\"\"\n",
+    "    #Forward the arguments themselves, not the globals X and y\n",
+    "    result = optimize.fmin(computeCost, x0=mytheta, args=(myX, myy), maxiter=400, full_output=True)\n",
+    "    #With full_output=True fmin returns (xopt, fopt, ...); only the first two are passed on\n",
+    "    return result[0], result[1]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "#Fit theta on the training data, starting from the all-zeros initial guess\n",
+    "theta, mincost = optimizeTheta(initial_theta,X,y)\n",
+    "#That's pretty cool. Black boxes ftw"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "#\"Call your costFunction function using the optimal parameters of θ. \n",
+    "#You should see that the cost is about 0.203.\"\n",
+    "#(the exercise's costFunction presumably corresponds to computeCost in this notebook)\n",
+    "print computeCost(theta,X,y)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "#Plotting the decision boundary: it is a straight line, so two points are enough\n",
+    "#The boundary is where h = 0.5, i.e. where\n",
+    "#theta0 + theta1*x1 + theta2*x2 = 0\n",
+    "#Solved for x2 this is the y=mx+b form: x2 = (-1/theta2)(theta0 + theta1*x1)\n",
+    "\n",
+    "#End points of the line: smallest and largest value in column 1 of X\n",
+    "boundary_xs = np.array([X[:,1].min(), X[:,1].max()])\n",
+    "boundary_ys = (-1./theta[2])*(theta[0] + theta[1]*boundary_xs)\n",
+    "#Redraw the data, then put the line on the same figure\n",
+    "plotData()\n",
+    "plt.plot(boundary_xs,boundary_ys,'b-',label='Decision Boundary')\n",
+    "plt.legend()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "#For a student with an Exam 1 score of 45 and an Exam 2 score of 85, \n",
+    "#you should expect to see an admission probability of 0.776.\n",
+    "#The leading 1 lines up with the column of 1's that was inserted into X\n",
+    "print h(theta,np.array([1, 45.,85.]))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "def makePrediction(mytheta, myx):\n",
+    "    \"\"\"\n",
+    "    Classify with a 0.5 threshold on the hypothesis:\n",
+    "    True where h(mytheta,myx) is at least 0.5, False otherwise.\n",
+    "    \"\"\"\n",
+    "    probability = h(mytheta,myx)\n",
+    "    return probability >= 0.5\n",
+    "\n",
+    "#Compute the fraction of samples I got correct:\n",
+    "#a positive sample counts when predicted True, a negative one when predicted False\n",
+    "pos_correct = float(np.count_nonzero(makePrediction(theta,pos)))\n",
+    "neg_correct = float(np.count_nonzero(~makePrediction(theta,neg)))\n",
+    "tot = len(pos)+len(neg)\n",
+    "prcnt_correct = (pos_correct+neg_correct)/tot\n",
+    "print \"Fraction of training samples correctly predicted: %f.\" % prcnt_correct"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 2 Regularized Logistic Regression"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 2.1 Visualizing the data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "datafile = 'data/ex2data2.txt'\n",
+    "#!head $datafile\n",
+    "#Parse the comma separated file; unpack=True makes each row of cols one file column\n",
+    "cols = np.loadtxt(datafile, delimiter=',', usecols=(0, 1, 2), unpack=True)\n",
+    "#The last file column is the \"y\" vector, kept 2-D with a single column\n",
+    "y = np.array(cols[-1:]).T\n",
+    "m = y.size # number of training examples\n",
+    "#The remaining columns form the \"X\" matrix, with the usual column of 1's put in front\n",
+    "X = np.insert(np.array(cols[:-1]).T, 0, 1, axis=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "#Divide the sample into two: ones with positive classification, one with null classification\n",
+    "#Boolean masks on the label column replace the Python-level loop over row indices.\n",
+    "#A mask also keeps the result 2-D when a class has no rows, so pos[:,1] style indexing still works.\n",
+    "pos = X[y[:,0] == 1] #rows labelled 1\n",
+    "neg = X[y[:,0] == 0] #rows labelled 0\n",
+    "#Check to make sure I included all entries\n",
+    "print \"Included everything? \",(len(pos)+len(neg) == X.shape[0])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "def plotData():\n",
+    "    \"\"\"\n",
+    "    Draw the two microchip test results of every sample on a new figure,\n",
+    "    with a separate marker for each class.\n",
+    "    Reads the global pos and neg arrays; columns 1 and 2 are plotted.\n",
+    "    Note: this replaces the plotData defined for the exam data earlier in the notebook.\n",
+    "    \"\"\"\n",
+    "    plt.figure(figsize=(6,6)) #Draw it square to emphasize circular features\n",
+    "    #(samples, marker format, legend label) for each class, drawn in this order\n",
+    "    groups = [(pos,'k+','y=1'), (neg,'yo','y=0')]\n",
+    "    for samples, fmt, name in groups:\n",
+    "        plt.plot(samples[:,1], samples[:,2], fmt, label=name)\n",
+    "    plt.xlabel('Microchip Test 1')\n",
+    "    plt.ylabel('Microchip Test 2')\n",
+    "    plt.legend()\n",
+    "    plt.grid(True)\n",
+    "    \n",
+    "plotData()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 2.2 Feature mapping"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 2",
+   "language": "python",
+   "name": "python2"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}