Commit

Update ex6

jtlowery committed Mar 9, 2017
1 parent d58af54 commit 5de26f3
Showing 13 changed files with 100 additions and 102 deletions.
4 changes: 1 addition & 3 deletions ex6/dataset3Params.py
@@ -8,7 +8,7 @@ def dataset3Params(X, y, Xval, yval):
cross-validation set.
"""

# You need to return the following variables correctly.
# You need to return the following variables correctly.
C = 1
sigma = 0.3

@@ -23,7 +23,5 @@ def dataset3Params(X, y, Xval, yval):
# Note: You can compute the prediction error using
# mean(double(predictions ~= yval))
#


# =========================================================================
return C, sigma
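
A minimal sketch of the grid search this stub is meant to hold, assuming scikit-learn's svm.SVC is used for the cross-validation search (as it is elsewhere in this exercise); the candidate grid follows the values suggested in the course handout, and the name dataset3Params_sketch is illustrative only:

import numpy as np
from sklearn import svm

def dataset3Params_sketch(X, y, Xval, yval):
    # Try each (C, sigma) pair and keep the one with the lowest validation error.
    candidates = [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30]
    best_error = np.inf
    best_C, best_sigma = 1, 0.3
    for C in candidates:
        for sigma in candidates:
            gamma = 1.0 / (2.0 * sigma ** 2)          # express the kernel width as sklearn's gamma
            model = svm.SVC(C=C, kernel='rbf', gamma=gamma).fit(X, y)
            predictions = model.predict(Xval)
            error = np.mean(predictions != yval)      # Python form of mean(double(predictions ~= yval))
            if error < best_error:
                best_error, best_C, best_sigma = error, C, sigma
    return best_C, best_sigma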
8 changes: 3 additions & 5 deletions ex6/emailFeatures.py
@@ -6,10 +6,10 @@ def emailFeatures(word_indices):
produces a feature vector from the word indices.
"""

# Total number of words in the dictionary
# Total number of words in the dictionary
n = 1899

# You need to return the following variables correctly.
# You need to return the following variables correctly.
x = np.zeros(n)
# ====================== YOUR CODE HERE ======================
# Instructions: Fill in this function to return a feature vector for the
@@ -48,8 +48,6 @@ def emailFeatures(word_indices):
# x = [ 0 0 0 0 1 0 0 0 ... 0 0 0 0 1 ... 0 0 0 1 0 ..]
#
#


# =========================================================================

return x
return x
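
A minimal sketch of the feature mapping described in the instructions above, assuming word_indices holds 1-based indices into the 1899-word vocabulary; the helper name is illustrative only:

import numpy as np

def emailFeatures_sketch(word_indices):
    n = 1899                                 # total number of words in the dictionary
    x = np.zeros(n)
    x[np.asarray(word_indices) - 1] = 1      # shift the 1-based vocabulary indices to 0-based and mark them
    return x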
26 changes: 13 additions & 13 deletions ex6/ex6.py
@@ -31,7 +31,7 @@
# the data.
#

print 'Loading and Visualizing Data ...'
print('Loading and Visualizing Data ...')

# Load from ex6data1:
# You will have X, y in your environment
@@ -42,7 +42,7 @@
# Plot training data
plotData(X, y)

raw_input("Program paused. Press Enter to continue...")
input('Program paused. Press Enter to continue...')

## ==================== Part 2: Training Linear SVM ====================
# The following code will train a linear SVM on the dataset and plot the
@@ -55,7 +55,7 @@
X = data['X']
y = data['y'].flatten()

print 'Training Linear SVM ...'
print('Training Linear SVM ...')

# You should try to change the C value below and see how the decision
# boundary varies (e.g., try C = 1000)
@@ -65,13 +65,13 @@
model = clf.fit(X, y)
visualizeBoundaryLinear(X, y, model)

raw_input("Program paused. Press Enter to continue...")
input('Program paused. Press Enter to continue...')

## =============== Part 3: Implementing Gaussian Kernel ===============
# You will now implement the Gaussian kernel to use
# with the SVM. You should complete the code in gaussianKernel.py
#
print 'Evaluating the Gaussian Kernel ...'
print('Evaluating the Gaussian Kernel ...')

x1 = np.array([1, 2, 1])
x2 = np.array([0, 4, -1])
@@ -81,14 +81,14 @@
# print 'Gaussian Kernel between x1 = [1 2 1], x2 = [0 4 -1], sigma = %0.5f : ' \
# '\t%f\n(this value should be about 0.324652)\n' % (sigma, sim)

raw_input("Program paused. Press Enter to continue...")
input('Program paused. Press Enter to continue...')

## =============== Part 4: Visualizing Dataset 2 ================
# The following code will load the next dataset into your environment and
# plot the data.
#

print 'Loading and Visualizing Data ...'
print('Loading and Visualizing Data ...')

# Load from ex6data2:
# You will have X, y in your environment
@@ -99,13 +99,13 @@
# Plot training data
plotData(X, y)

raw_input("Program paused. Press Enter to continue...")
input('Program paused. Press Enter to continue...')

## ========== Part 5: Training SVM with RBF Kernel (Dataset 2) ==========
# After you have implemented the kernel, we can now use it to train the
# SVM classifier.
#
print 'Training SVM with RBF Kernel (this may take 1 to 2 minutes) ...'
print('Training SVM with RBF Kernel (this may take 1 to 2 minutes) ...')

# Load from ex6data2:
# You will have X, y in your environment
@@ -126,14 +126,14 @@
model = clf.fit(X, y)
visualizeBoundary(X, y, model)

raw_input("Program paused. Press Enter to continue...")
input('Program paused. Press Enter to continue...')

## =============== Part 6: Visualizing Dataset 3 ================
# The following code will load the next dataset into your environment and
# plot the data.
#

print 'Loading and Visualizing Data ...'
print('Loading and Visualizing Data ...')

# Load from ex6data3:
# You will have X, y in your environment
@@ -144,7 +144,7 @@
# Plot training data
plotData(X, y)

raw_input("Program paused. Press Enter to continue...")
input('Program paused. Press Enter to continue...')

## ========== Part 7: Training SVM with RBF Kernel (Dataset 3) ==========

@@ -167,5 +167,5 @@
model = clf.fit(X, y)
visualizeBoundary(X, y, model)

raw_input("Program paused. Press Enter to continue...")
input('Program paused. Press Enter to continue...')
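
Parts 5 and 7 above train the RBF-kernel SVM with scikit-learn rather than a hand-written kernel, so the exercise's sigma has to be translated into sklearn's gamma. A hedged sketch of that translation, assuming the C = 1, sigma = 0.1 values the original exercise uses for dataset 2:

import scipy.io
from sklearn import svm

data = scipy.io.loadmat('ex6data2.mat')     # same file as in Part 5 above
X = data['X']
y = data['y'].flatten()

sigma = 0.1                                  # Gaussian kernel width from the exercise text
gamma = 1.0 / (2.0 * sigma ** 2)             # SVC's rbf kernel is exp(-gamma * ||x1 - x2||^2)
clf = svm.SVC(C=1, kernel='rbf', gamma=gamma)
model = clf.fit(X, y)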

38 changes: 19 additions & 19 deletions ex6/ex6_spam.py
@@ -31,25 +31,25 @@
# complete the code in processEmail.py to produce a word indices vector
# for a given email.

print 'Preprocessing sample email (emailSample1.txt)'
print('Preprocessing sample email (emailSample1.txt)')

# Extract Features
file = open('emailSample1.txt', 'r')
file_contents = file.readlines()
word_indices = processEmail(''.join(file_contents))

# Print Stats
print 'Word Indices: '
print word_indices
print('Word Indices: ')
print(word_indices)

raw_input("Program paused. Press Enter to continue...")
input('Program paused. Press Enter to continue...')

## ==================== Part 2: Feature Extraction ====================
# Now, you will convert each email into a vector of features in R^n.
# You should complete the code in emailFeatures.py to produce a feature
# vector for a given email.

print 'Extracting features from sample email (emailSample1.txt)'
print('Extracting features from sample email (emailSample1.txt)')

# Extract Features
file = open('emailSample1.txt')
@@ -58,10 +58,10 @@
features = emailFeatures(word_indices)

# Print Stats
print 'Length of feature vector: %d'% features.size
print 'Number of non-zero entries: %d'% sum(features > 0)
print('Length of feature vector: %d' % features.size)
print('Number of non-zero entries: %d' % sum(features > 0))

raw_input("Program paused. Press Enter to continue...")
input('Program paused. Press Enter to continue...')

## =========== Part 3: Train Linear SVM for Spam Classification ========
# In this section, you will train a linear classifier to determine if an
@@ -73,16 +73,16 @@
X = data['X']
y = data['y'].flatten()

print 'Training Linear SVM (Spam Classification)'
print '(this may take 1 to 2 minutes) ...'
print('Training Linear SVM (Spam Classification)')
print('(this may take 1 to 2 minutes) ...')

C = 0.1
clf = svm.SVC(C=C, kernel='linear', tol=1e-3, max_iter=200)
model = clf.fit(X, y)

p = model.predict(X)

print 'Training Accuracy: %f', np.mean(np.double(p == y)) * 100
print('Training Accuracy: %f' % (np.mean(np.double(p == y)) * 100))

## =================== Part 4: Test Spam Classification ================
# After training the classifier, we can evaluate it on a test set. We have
@@ -94,11 +94,11 @@
Xtest = data['Xtest']
ytest = data['ytest']

print 'Evaluating the trained Linear SVM on a test set ...'
print('Evaluating the trained Linear SVM on a test set ...')

p = model.predict(Xtest)

print 'Test Accuracy: %f', np. mean(np.double(p == ytest)) * 100
print('Test Accuracy: %f' % (np.mean(np.double(p == ytest)) * 100))


## ================= Part 5: Top Predictors of Spam ====================
@@ -111,17 +111,17 @@

# Sort the weights and obtain the vocabulary list

t = sorted(list(enumerate(model.coef_[0])),key=lambda e: e[1], reverse=True)
t = sorted(list(enumerate(model.coef_[0])), key=lambda e: e[1], reverse=True)
d = OrderedDict(t)
idx = list(d.keys())
weight = list(d.values())
vocabList = getVocabList()

print 'Top predictors of spam: '
print('Top predictors of spam: ')
for i in range(15):
print ' %-15s (%f)' %(vocabList[idx[i]], weight[i])
print(' %-15s (%f)' %(vocabList[idx[i]], weight[i]))

print 'Program paused. Press enter to continue.'
print('Program paused. Press enter to continue.')

## =================== Part 6: Try Your Own Emails =====================
# Now that you've trained the spam classifier, you can use it on your own
@@ -144,6 +144,6 @@
x = emailFeatures(word_indices)
p = model.predict(x)

print 'Processed %s\n\nSpam Classification: %d' % (filename, p)
print '(1 indicates spam, 0 indicates not spam)'
print('Processed %s\n\nSpam Classification: %d' % (filename, p))
print('(1 indicates spam, 0 indicates not spam)')
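
One caveat for Part 6: scikit-learn's predict expects a 2-D array of shape (n_samples, n_features), so the single feature vector from emailFeatures typically needs reshaping first. A small sketch, assuming filename, word_indices, and model are already defined as in the script above:

x = emailFeatures(word_indices).reshape(1, -1)   # one sample, n features
p = int(model.predict(x)[0])                     # 1 indicates spam, 0 indicates not spam
print('Processed %s\n\nSpam Classification: %d' % (filename, p))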

12 changes: 5 additions & 7 deletions ex6/gaussianKernel.py
@@ -6,11 +6,11 @@ def gaussianKernel(x1, x2, sigma):
and returns the value in sim
"""

# Ensure that x1 and x2 are column vectors
# x1 = x1.ravel()
# x2 = x2.ravel()
# Ensure that x1 and x2 are column vectors
# x1 = x1.ravel()
# x2 = x2.ravel()

# You need to return the following variables correctly.
# You need to return the following variables correctly.
sim = 0

# ====================== YOUR CODE HERE ======================
@@ -19,7 +19,5 @@ def gaussianKernel(x1, x2, sigma):
# sigma
#
#


# =============================================================
return sim
return sim
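
A minimal sketch of the Gaussian (RBF) kernel this stub should compute; with the ex6.py test values x1 = [1, 2, 1], x2 = [0, 4, -1] and the exercise's sigma = 2 it evaluates to about 0.324652, matching the check quoted in ex6.py. The helper name is illustrative only:

import numpy as np

def gaussianKernel_sketch(x1, x2, sigma):
    x1, x2 = x1.ravel(), x2.ravel()                              # make sure both inputs are flat vectors
    return np.exp(-np.sum((x1 - x2) ** 2) / (2.0 * sigma ** 2))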
8 changes: 4 additions & 4 deletions ex6/getVocabList.py
@@ -7,13 +7,13 @@ def getVocabList():
and returns a list of the words in vocabList.
"""

## Read the fixed vocabulary list
## Read the fixed vocabulary list
with open('vocab.txt') as f:

# Store all dictionary words in cell array vocab{}
# Store all dictionary words in cell array vocab{}

# For ease of implementation, we use a struct to map the strings => integers
# In practice, you'll want to use some form of hashmap
# For ease of implementation, we use a struct to map the strings => integers
# In practice, you'll want to use some form of hashmap
vocabList = []
for line in f:
idx, w = line.split()
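
The rest of getVocabList is truncated in this view; a hedged sketch of how the vocabulary is typically read from vocab.txt (each line holding an index and a word), with the helper name being illustrative:

def getVocabList_sketch():
    vocabList = []
    with open('vocab.txt') as f:
        for line in f:
            idx, w = line.split()    # each line: "<index> <word>"
            vocabList.append(w)      # keep only the word; list position encodes the index
    return vocabList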
7 changes: 5 additions & 2 deletions ex6/linearKernel.py
@@ -1,13 +1,16 @@
import numpy as np


def linearKernel(x1, x2):
"""returns a linear kernel between x1 and x2
and returns the value in sim
"""

# Ensure that x1 and x2 are column vectors
# Ensure that x1 and x2 are column vectors
x1 = x1.ravel()
x2 = x2.ravel()

# Compute the kernel
# Compute the kernel
sim = x1.T.dot(x2) # dot product

return sim
10 changes: 6 additions & 4 deletions ex6/plotData.py
@@ -1,6 +1,8 @@
import matplotlib.pyplot as plt
import numpy as np
from show import show


def plotData(X, y):
"""plots the data points with + for the positive examples
and o for the negative examples. X is assumed to be a Mx2 matrix.
@@ -9,11 +9,11 @@ def plotData(X, y):
"""
plt.figure()

# Find Indices of Positive and Negative Examples
pos = np.where(y==1, True, False).flatten()
neg = np.where(y==0, True, False).flatten()
# Find Indices of Positive and Negative Examples
pos = np.where(y == 1, True, False).flatten()
neg = np.where(y == 0, True, False).flatten()

# Plot Examples
# Plot Examples
plt.plot(X[pos,0], X[pos, 1], 'k+', linewidth=1, markersize=7)
plt.plot(X[neg,0], X[neg, 1], 'ko', color='y', markersize=7)
show()
1 change: 1 addition & 0 deletions ex6/porterStemmer.py
@@ -29,6 +29,7 @@

import sys


class PorterStemmer:

def __init__(self):