forked from nfmcclure/tensorflow_cookbook
Commit 1bd7d37 (1 parent: d4b9b1c)
Showing 1 changed file with 225 additions and 0 deletions.
05_Nearest_Neighbor_Methods/05_An_Address_Matching_Example/05_address_matching.ipynb
@@ -0,0 +1,225 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Address Matching with k-Nearest Neighbors\n",
    "\n",
    "\n",
    "This notebook illustrates a way to perform address matching between two data sets.\n",
    "\n",
    "For each test address, we will return the closest reference address to it.\n",
    "\n",
    "We will consider two distance functions:\n",
    "1. Edit distance for the street number and name, and\n",
    "2. Euclidean distance (L2) for the zip codes\n",
    "\n",
    "A small worked example of the normalized edit distance follows this cell."
   ]
  },
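  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The cell below is an added, illustration-only sketch (not part of the original recipe): a plain-Python Levenshtein distance showing what the edit distance counts, and how `tf.edit_distance(..., normalize=True)` divides that count by the length of the reference (truth) string. The `levenshtein` helper is defined here only for this example."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Illustration only: a plain-Python Levenshtein (edit) distance via dynamic programming.\n",
    "def levenshtein(a, b):\n",
    "    # dist[i][j] = edit distance between a[:i] and b[:j]\n",
    "    dist = [[0] * (len(b) + 1) for _ in range(len(a) + 1)]\n",
    "    for i in range(len(a) + 1):\n",
    "        dist[i][0] = i\n",
    "    for j in range(len(b) + 1):\n",
    "        dist[0][j] = j\n",
    "    for i in range(1, len(a) + 1):\n",
    "        for j in range(1, len(b) + 1):\n",
    "            cost = 0 if a[i - 1] == b[j - 1] else 1\n",
    "            dist[i][j] = min(dist[i - 1][j] + 1,          # deletion\n",
    "                             dist[i][j - 1] + 1,          # insertion\n",
    "                             dist[i - 1][j - 1] + cost)   # substitution\n",
    "    return dist[len(a)][len(b)]\n",
    "\n",
    "# One typo ('c' for 'e') gives a raw distance of 1; normalizing by the\n",
    "# reference string length (6 characters) gives 1/6.\n",
    "print(levenshtein('clm st', 'elm st'))\n",
    "print(levenshtein('clm st', 'elm st') / len('elm st'))"
   ]
  },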
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import random\n",
    "import string\n",
    "import numpy as np\n",
    "import tensorflow as tf\n",
    "from tensorflow.python.framework import ops\n",
    "ops.reset_default_graph()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### First we generate the data sets we will need"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# n = size of the created data sets\n",
    "n = 10\n",
    "street_names = ['abbey', 'baker', 'canal', 'donner', 'elm']\n",
    "street_types = ['rd', 'st', 'ln', 'pass', 'ave']\n",
    "rand_zips = [random.randint(65000,65999) for i in range(5)]\n",
    "\n",
    "# Function that introduces one random typo into a string with the given probability\n",
    "def create_typo(s, prob=0.75):\n",
    "    if random.uniform(0,1) < prob:\n",
    "        rand_ind = random.choice(range(len(s)))\n",
    "        s_list = list(s)\n",
    "        s_list[rand_ind] = random.choice(string.ascii_lowercase)\n",
    "        s = ''.join(s_list)\n",
    "    return(s)\n",
    "\n",
    "# Generate the reference dataset\n",
    "numbers = [random.randint(1, 9999) for i in range(n)]\n",
    "streets = [random.choice(street_names) for i in range(n)]\n",
    "street_suffs = [random.choice(street_types) for i in range(n)]\n",
    "zips = [random.choice(rand_zips) for i in range(n)]\n",
    "full_streets = [str(x) + ' ' + y + ' ' + z for x,y,z in zip(numbers, streets, street_suffs)]\n",
    "reference_data = [list(x) for x in zip(full_streets,zips)]\n",
    "\n",
    "# Generate the test dataset with some typos\n",
    "typo_streets = [create_typo(x) for x in streets]\n",
    "typo_full_streets = [str(x) + ' ' + y + ' ' + z for x,y,z in zip(numbers, typo_streets, street_suffs)]\n",
    "test_data = [list(x) for x in zip(typo_full_streets,zips)]"
   ]
  },
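  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a quick sanity check (an added cell, not part of the original recipe), we can print a few reference/test pairs to see the typos that `create_typo` introduced."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Added illustration: compare a few reference addresses with their\n",
    "# typo-corrupted test counterparts (zip codes are left unchanged).\n",
    "for ref, test in list(zip(reference_data, test_data))[:3]:\n",
    "    print('reference:', ref[0], '| zip:', ref[1])\n",
    "    print('test     :', test[0], '| zip:', test[1])"
   ]
  },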
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Now we can perform address matching"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Create a graph session\n",
    "sess = tf.Session()\n",
    "\n",
    "# Placeholders\n",
    "test_address = tf.sparse_placeholder(dtype=tf.string)\n",
    "test_zip = tf.placeholder(shape=[None, 1], dtype=tf.float32)\n",
    "ref_address = tf.sparse_placeholder(dtype=tf.string)\n",
    "ref_zip = tf.placeholder(shape=[None, n], dtype=tf.float32)\n",
    "\n",
    "# Declare the zip code (squared) distance between a test zip and the reference set\n",
    "zip_dist = tf.square(tf.subtract(ref_zip, test_zip))\n",
    "\n",
    "# Declare the edit distance for addresses\n",
    "address_dist = tf.edit_distance(test_address, ref_address, normalize=True)\n",
    "\n",
    "# Create similarity scores\n",
    "zip_max = tf.gather(tf.squeeze(zip_dist), tf.argmax(zip_dist, 1))\n",
    "zip_min = tf.gather(tf.squeeze(zip_dist), tf.argmin(zip_dist, 1))\n",
    "zip_sim = tf.div(tf.subtract(zip_max, zip_dist), tf.subtract(zip_max, zip_min))\n",
    "address_sim = tf.subtract(1., address_dist)\n",
    "\n",
    "# Combine the two distance functions with equal weights\n",
    "address_weight = 0.5\n",
    "zip_weight = 1. - address_weight\n",
    "weighted_sim = tf.add(tf.transpose(tf.multiply(address_weight, address_sim)), tf.multiply(zip_weight, zip_sim))\n",
    "\n",
    "# Predict: get the entry with the maximum similarity\n",
    "top_match_index = tf.argmax(weighted_sim, 1)\n",
    "\n",
    "# Function to create a character-level sparse tensor from a list of strings\n",
    "def sparse_from_word_vec(word_vec):\n",
    "    num_words = len(word_vec)\n",
    "    indices = [[xi, 0, yi] for xi,x in enumerate(word_vec) for yi,y in enumerate(x)]\n",
    "    chars = list(''.join(word_vec))\n",
    "    return(tf.SparseTensorValue(indices, chars, [num_words,1,1]))"
   ]
  },
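  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To see how `sparse_from_word_vec` and the edit-distance op fit together, the added cell below (not part of the original recipe) feeds a single corrupted/clean pair through `address_dist`. With `normalize=True`, the result should be the raw edit distance divided by the length of the reference address."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Added illustration: run the edit-distance op on one toy pair.\n",
    "# The placeholders expect batches, so each string is wrapped in a one-element list.\n",
    "toy_test = sparse_from_word_vec(['4108 clm st'])\n",
    "toy_ref = sparse_from_word_vec(['4108 elm st'])\n",
    "print(sess.run(address_dist, feed_dict={test_address: toy_test, ref_address: toy_ref}))"
   ]
  },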
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Address: 4108 clm st, 65797\n",
      "Match : 4108 elm st, 65797\n",
      "Address: 5824 abbwy ln, 65492\n",
      "Match : 5824 abbey ln, 65492\n",
      "Address: 9376 ejm ln, 65546\n",
      "Match : 9376 elm ln, 65546\n",
      "Address: 245 abbey pass, 65546\n",
      "Match : 245 abbey pass, 65546\n",
      "Address: 3124 donoer ave, 65797\n",
      "Match : 3124 donner ave, 65797\n",
      "Address: 8434 nlm pass, 65941\n",
      "Match : 8434 elm pass, 65941\n",
      "Address: 5584 denner rd, 65797\n",
      "Match : 5584 donner rd, 65797\n",
      "Address: 2524 baeer st, 65546\n",
      "Match : 2524 baker st, 65546\n",
      "Address: 2346 nbbey rd, 65492\n",
      "Match : 2346 abbey rd, 65492\n",
      "Address: 8762 bayer pass, 65797\n",
      "Match : 8762 baker pass, 65797\n"
     ]
    }
   ],
   "source": [
    "# Loop through test indices\n",
    "reference_addresses = [x[0] for x in reference_data]\n",
    "reference_zips = np.array([[x[1] for x in reference_data]])\n",
    "\n",
    "# Create sparse address reference set\n",
    "sparse_ref_set = sparse_from_word_vec(reference_addresses)\n",
    "\n",
    "for i in range(n):\n",
    "    test_address_entry = test_data[i][0]\n",
    "    test_zip_entry = [[test_data[i][1]]]\n",
    "    \n",
    "    # Create sparse address vectors\n",
    "    test_address_repeated = [test_address_entry] * n\n",
    "    sparse_test_set = sparse_from_word_vec(test_address_repeated)\n",
    "    \n",
    "    feeddict = {test_address: sparse_test_set,\n",
    "                test_zip: test_zip_entry,\n",
    "                ref_address: sparse_ref_set,\n",
    "                ref_zip: reference_zips}\n",
    "    best_match = sess.run(top_match_index, feed_dict=feeddict)\n",
    "    best_street = reference_addresses[best_match[0]]\n",
    "    [best_zip] = reference_zips[0][best_match]\n",
    "    [[test_zip_]] = test_zip_entry\n",
    "    print('Address: ' + str(test_address_entry) + ', ' + str(test_zip_))\n",
    "    print('Match : ' + str(best_street) + ', ' + str(best_zip))"
   ]
  },
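  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The zip similarity used above is just a min-max rescaling of the squared zip distance. The added cell below (not part of the original recipe) mirrors that computation in NumPy for the first test address: the closest reference zip gets a score of 1.0 and the farthest gets 0.0."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Added illustration: the min-max scaling behind zip_sim, computed in NumPy.\n",
    "d = np.square(reference_zips.astype(np.float64) - float(test_data[0][1]))\n",
    "zip_sim_np = (d.max() - d) / (d.max() - d.min())\n",
    "print(zip_sim_np)"
   ]
  },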
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [conda env:tf-cpu]",
   "language": "python",
   "name": "conda-env-tf-cpu-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}