Commit: nb setup
jimthompson5802 committed Apr 5, 2017
1 parent d4b9b1c commit 1bd7d37
Showing 1 changed file with 225 additions and 0 deletions.
@@ -0,0 +1,225 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Address Matching with k-Nearest Neighbors\n",
"\n",
"\n",
"This function illustrates a way to perform address matching between two data sets.\n",
"\n",
"For each test address, we will return the closest reference address to it.\n",
"\n",
"We will consider two distance functions:\n",
"1. Edit distance for street number/name and\n",
"2. Euclidian distance (L2) for the zip codes"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import random\n",
"import string\n",
"import numpy as np\n",
"import tensorflow as tf\n",
"from tensorflow.python.framework import ops\n",
"ops.reset_default_graph()"
]
},
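{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick illustration of the first distance function: the sketch below (a minimal example, assuming the TensorFlow 1.x API imported above) feeds one hypothesis/truth pair of strings to `tf.edit_distance` as character-level sparse tensors. With `normalize=True` the raw edit distance is divided by the length of the truth string."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Minimal sketch: normalized edit distance between 'clm' (hypothesis) and 'elm' (truth).\n",
"# One substitution over a truth of length 3 yields 1/3 when normalize=True.\n",
"hyp = tf.SparseTensor(indices=[[0, 0, 0], [0, 0, 1], [0, 0, 2]],\n",
"                      values=['c', 'l', 'm'], dense_shape=[1, 1, 3])\n",
"tru = tf.SparseTensor(indices=[[0, 0, 0], [0, 0, 1], [0, 0, 2]],\n",
"                      values=['e', 'l', 'm'], dense_shape=[1, 1, 3])\n",
"with tf.Session() as demo_sess:\n",
"    print(demo_sess.run(tf.edit_distance(hyp, tru, normalize=True)))"
]
},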
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### First we generate the data sets we will need"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# n = Size of created data sets\n",
"n = 10\n",
"street_names = ['abbey', 'baker', 'canal', 'donner', 'elm']\n",
"street_types = ['rd', 'st', 'ln', 'pass', 'ave']\n",
"rand_zips = [random.randint(65000,65999) for i in range(5)]\n",
"\n",
"# Function to randomly create one typo in a string w/ a probability\n",
"def create_typo(s, prob=0.75):\n",
" if random.uniform(0,1) < prob:\n",
" rand_ind = random.choice(range(len(s)))\n",
" s_list = list(s)\n",
" s_list[rand_ind]=random.choice(string.ascii_lowercase)\n",
" s = ''.join(s_list)\n",
" return(s)\n",
"\n",
"# Generate the reference dataset\n",
"numbers = [random.randint(1, 9999) for i in range(n)]\n",
"streets = [random.choice(street_names) for i in range(n)]\n",
"street_suffs = [random.choice(street_types) for i in range(n)]\n",
"zips = [random.choice(rand_zips) for i in range(n)]\n",
"full_streets = [str(x) + ' ' + y + ' ' + z for x,y,z in zip(numbers, streets, street_suffs)]\n",
"reference_data = [list(x) for x in zip(full_streets,zips)]\n",
"\n",
"# Generate test dataset with some typos\n",
"typo_streets = [create_typo(x) for x in streets]\n",
"typo_full_streets = [str(x) + ' ' + y + ' ' + z for x,y,z in zip(numbers, typo_streets, street_suffs)]\n",
"test_data = [list(x) for x in zip(typo_full_streets,zips)]"
]
},
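{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick illustrative check (the exact values differ on every run, since the generator above is random), we can print a few reference/test pairs to see the typos that were introduced."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Illustrative peek at the generated data; actual values vary from run to run.\n",
"for ref, test in list(zip(reference_data, test_data))[:3]:\n",
"    print('reference:', ref, '  test:', test)"
]
},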
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Now we can perform address matching"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Create graph\n",
"sess = tf.Session()\n",
"\n",
"# Placeholders\n",
"test_address = tf.sparse_placeholder( dtype=tf.string)\n",
"test_zip = tf.placeholder(shape=[None, 1], dtype=tf.float32)\n",
"ref_address = tf.sparse_placeholder(dtype=tf.string)\n",
"ref_zip = tf.placeholder(shape=[None, n], dtype=tf.float32)\n",
"\n",
"# Declare Zip code distance for a test zip and reference set\n",
"zip_dist = tf.square(tf.subtract(ref_zip, test_zip))\n",
"\n",
"# Declare Edit distance for address\n",
"address_dist = tf.edit_distance(test_address, ref_address, normalize=True)\n",
"\n",
"# Create similarity scores\n",
"zip_max = tf.gather(tf.squeeze(zip_dist), tf.argmax(zip_dist, 1))\n",
"zip_min = tf.gather(tf.squeeze(zip_dist), tf.argmin(zip_dist, 1))\n",
"zip_sim = tf.div(tf.subtract(zip_max, zip_dist), tf.subtract(zip_max, zip_min))\n",
"address_sim = tf.subtract(1., address_dist)\n",
"\n",
"# Combine distance functions\n",
"address_weight = 0.5\n",
"zip_weight = 1. - address_weight\n",
"weighted_sim = tf.add(tf.transpose(tf.multiply(address_weight, address_sim)), tf.multiply(zip_weight, zip_sim))\n",
"\n",
"# Predict: Get max similarity entry\n",
"top_match_index = tf.argmax(weighted_sim, 1)\n",
"\n",
"# Function to Create a character-sparse tensor from strings\n",
"def sparse_from_word_vec(word_vec):\n",
" num_words = len(word_vec)\n",
" indices = [[xi, 0, yi] for xi,x in enumerate(word_vec) for yi,y in enumerate(x)]\n",
" chars = list(''.join(word_vec))\n",
" return(tf.SparseTensorValue(indices, chars, [num_words,1,1]))"
]
},
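{
"cell_type": "markdown",
"metadata": {},
"source": [
"To make the character-level sparse encoding concrete, the sketch below (an illustration on a made-up two-string input) shows what `sparse_from_word_vec` returns: every string becomes one row, each character gets an index of the form `[word_index, 0, char_index]`, and the values are the individual characters."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Illustrative example of the character-level sparse encoding defined above.\n",
"demo = sparse_from_word_vec(['elm', 'oak'])\n",
"print(demo.indices)      # [[0, 0, 0], [0, 0, 1], [0, 0, 2], [1, 0, 0], [1, 0, 1], [1, 0, 2]]\n",
"print(demo.values)       # ['e', 'l', 'm', 'o', 'a', 'k']\n",
"print(demo.dense_shape)  # [2, 1, 1]"
]
},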
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Address: 4108 clm st, 65797\n",
"Match : 4108 elm st, 65797\n",
"Address: 5824 abbwy ln, 65492\n",
"Match : 5824 abbey ln, 65492\n",
"Address: 9376 ejm ln, 65546\n",
"Match : 9376 elm ln, 65546\n",
"Address: 245 abbey pass, 65546\n",
"Match : 245 abbey pass, 65546\n",
"Address: 3124 donoer ave, 65797\n",
"Match : 3124 donner ave, 65797\n",
"Address: 8434 nlm pass, 65941\n",
"Match : 8434 elm pass, 65941\n",
"Address: 5584 denner rd, 65797\n",
"Match : 5584 donner rd, 65797\n",
"Address: 2524 baeer st, 65546\n",
"Match : 2524 baker st, 65546\n",
"Address: 2346 nbbey rd, 65492\n",
"Match : 2346 abbey rd, 65492\n",
"Address: 8762 bayer pass, 65797\n",
"Match : 8762 baker pass, 65797\n"
]
}
],
"source": [
"# Loop through test indices\n",
"reference_addresses = [x[0] for x in reference_data]\n",
"reference_zips = np.array([[x[1] for x in reference_data]])\n",
"\n",
"# Create sparse address reference set\n",
"sparse_ref_set = sparse_from_word_vec(reference_addresses)\n",
"\n",
"for i in range(n):\n",
" test_address_entry = test_data[i][0]\n",
" test_zip_entry = [[test_data[i][1]]]\n",
" \n",
" # Create sparse address vectors\n",
" test_address_repeated = [test_address_entry] * n\n",
" sparse_test_set = sparse_from_word_vec(test_address_repeated)\n",
" \n",
" feeddict={test_address: sparse_test_set,\n",
" test_zip: test_zip_entry,\n",
" ref_address: sparse_ref_set,\n",
" ref_zip: reference_zips}\n",
" best_match = sess.run(top_match_index, feed_dict=feeddict)\n",
" best_street = reference_addresses[best_match[0]]\n",
" [best_zip] = reference_zips[0][best_match]\n",
" [[test_zip_]] = test_zip_entry\n",
" print('Address: ' + str(test_address_entry) + ', ' + str(test_zip_))\n",
" print('Match : ' + str(best_street) + ', ' + str(best_zip))"
]
},
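{
"cell_type": "markdown",
"metadata": {},
"source": [
"As an optional sanity check (a sketch along the same lines as the loop above), we can count how many test addresses are matched back to the exact reference address they were generated from."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Optional sanity check: test_data[i] was generated from reference_data[i],\n",
"# so a correct match should recover reference_data[i][0] exactly.\n",
"correct = 0\n",
"for i in range(n):\n",
"    sparse_test_set = sparse_from_word_vec([test_data[i][0]] * n)\n",
"    feeddict = {test_address: sparse_test_set,\n",
"                test_zip: [[test_data[i][1]]],\n",
"                ref_address: sparse_ref_set,\n",
"                ref_zip: reference_zips}\n",
"    best_match = sess.run(top_match_index, feed_dict=feeddict)\n",
"    correct += int(reference_addresses[best_match[0]] == reference_data[i][0])\n",
"print('Recovered %d of %d reference addresses exactly' % (correct, n))"
]
},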
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python [conda env:tf-cpu]",
"language": "python",
"name": "conda-env-tf-cpu-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
