Commit

add movie recommendation example.
perhapszzy committed May 17, 2017
1 parent 345cf7d commit 75b9b3a
Showing 5 changed files with 215 additions and 2 deletions.
4 changes: 2 additions & 2 deletions Dockerfile
@@ -1,4 +1,4 @@
-FROM tensorflow/tensorflow:0.12.0
+FROM tensorflow/tensorflow:1.0.0

ENV LANG C.UTF-8
RUN apt-get update && apt-get install -y bc
@@ -9,7 +9,7 @@ RUN rm -rf /notebooks/*

COPY caicloud.tensorflow /caicloud.tensorflow
COPY Deep_Learning_with_TensorFlow/datasets /notebooks/Deep_Learning_with_TensorFlow/datasets
-COPY Deep_Learning_with_TensorFlow/0.12.0 /notebooks/Deep_Learning_with_TensorFlow/0.12.0
+COPY Deep_Learning_with_TensorFlow/1.0.0 /notebooks/Deep_Learning_with_TensorFlow/1.0.0
COPY run_tf.sh /run_tf.sh

CMD ["/run_tf.sh"]
41 changes: 41 additions & 0 deletions caicloud.tensorflow/caicloud/clever/examples/recommandation/README.md
@@ -0,0 +1,41 @@
# Solving a Recommendation Problem with TensorFlow

## 数据集
This example uses the [MovieLens rating dataset](http://grouplens.org/datasets/movielens/) to simulate a recommendation problem. Records in the dataset have the following format:
```
1::1193::5::978300760
1::661::3::978302109
1::914::3::978301968
1::3408::4::978300275
1::2355::5::978824291
```
Each line contains one user's rating of one movie; for example, the first line says that user 1 gave movie 1193 a rating of 5. The last column is a timestamp, which this example does not use. The goal is, for a given (user, movie) pair, to predict the rating that the user would give the movie.

Run the following command to download the data:
```
./download_data.sh
```
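
After downloading, the ratings file can be loaded with pandas the same way `train.py` does. A minimal sketch (assuming the dataset was unpacked to `/tmp/movielens/ml-1m`, the download script's default):
```python
import pandas as pd

# ratings.dat is "::"-separated: user id, movie id, rating, timestamp.
col_names = ["user", "item", "rate", "st"]
df = pd.read_csv("/tmp/movielens/ml-1m/ratings.dat", sep="::",
                 header=None, names=col_names, engine="python")

# Raw IDs are 1-based; shift to 0-based to use them as embedding indices.
df["user"] -= 1
df["item"] -= 1
print(df.head())
```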


## Training the Model
The model can be trained locally with the following script:
```
./train_model.sh
```
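
Under the hood (see `train.py` below), the model learns a 50-dimensional embedding for every user and every item, concatenates the two, and maps the result to a rating with a single linear layer, trained with Adam on the squared error. A toy NumPy sketch of the scoring step (names here are illustrative, not from the source):
```python
import numpy as np

def predict_rating(user_emb, item_emb, w, b):
    # Concatenate the user and item embeddings and apply one linear
    # layer, mirroring the `inference` graph in train.py.
    features = np.concatenate([user_emb, item_emb])
    return float(features.dot(w) + b)

# Toy example with 2-dimensional embeddings (train.py uses 50).
print(predict_rating(np.array([0.1, 0.2]), np.array([0.3, 0.4]),
                     w=np.full(4, 0.5), b=1.0))
```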

Running the script produces output similar to the following:
```
Training begins @ 2017-05-18 00:24:33.373159
Eval RMSE at round 0 is: 2.81291127205
Eval RMSE at round 2000 is: 0.945966959
Eval RMSE at round 4000 is: 0.933194696903
Eval RMSE at round 6000 is: 0.927836835384
Eval RMSE at round 8000 is: 0.923974812031
Eval RMSE at round 10000 is: 0.92291110754
Eval RMSE at round 12000 is: 0.919465661049
Eval RMSE at round 14000 is: 0.918680250645
Eval RMSE at round 16000 is: 0.917023718357
Eval RMSE at round 18000 is: 0.915674805641
Eval RMSE at round 20000 is: 0.91452050209
Eval RMSE at round 22000 is: 0.915164649487
```
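
The reported metric is the root-mean-square error of the predicted ratings on the held-out 10% test split (`train.py` shuffles the data and trains on the other 90%). A NumPy sketch of the same computation:
```python
import numpy as np

def rmse(predictions, ratings):
    # Square root of the mean squared difference between
    # predicted and true ratings.
    return np.sqrt(np.mean(np.square(predictions - ratings)))

print(rmse(np.array([4.2, 3.1, 4.8]), np.array([5.0, 3.0, 4.0])))
```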
7 changes: 7 additions & 0 deletions caicloud.tensorflow/caicloud/clever/examples/recommandation/download_data.sh
@@ -0,0 +1,7 @@
#!/usr/bin/env bash

DATA_DIR=/tmp/movielens
SIZE=1m
mkdir -p ${DATA_DIR}
wget http://files.grouplens.org/datasets/movielens/ml-${SIZE}.zip -O ${DATA_DIR}/ml-${SIZE}.zip
unzip ${DATA_DIR}/ml-${SIZE}.zip -d ${DATA_DIR}
141 changes: 141 additions & 0 deletions caicloud.tensorflow/caicloud/clever/examples/recommandation/train.py
@@ -0,0 +1,141 @@
# coding=utf-8

import time

import numpy as np
import tensorflow as tf
import pandas as pd

from caicloud.clever.tensorflow import dist_base
from caicloud.clever.tensorflow import model_exporter

tf.app.flags.DEFINE_string("export_dir",
"/tmp/saved_model/movie",
"model export directory path.")

tf.app.flags.DEFINE_string("batch_size", 128, "training batch size.")
tf.app.flags.DEFINE_string("embedding_dim", 50, "embedding dimension.")

FLAGS = tf.app.flags.FLAGS
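# The ml-1m dataset contains 6040 users and 3952 movies (raw IDs are 1-based).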
USER_NUM = 6040
ITEM_NUM = 3952

def get_data():
    col_names = ["user", "item", "rate", "st"]
    df = pd.read_csv("/tmp/movielens/ml-1m/ratings.dat", sep="::", header=None, names=col_names, engine='python')

    # Shift 1-based IDs to 0-based embedding indices.
    df["user"] -= 1
    df["item"] -= 1
    for col in ("user", "item"):
        df[col] = df[col].astype(np.int32)
    df["rate"] = df["rate"].astype(np.float32)

    rows = len(df)
    print("Total number of instances: {}".format(rows))
    # Shuffle, then split 90% / 10% into train and test sets.
    df = df.iloc[np.random.permutation(rows)].reset_index(drop=True)
    split_index = int(rows * 0.9)
    return df[0:split_index], df[split_index:]

class ShuffleIterator(object):
    """Yields random batches, sampled with replacement, from fixed columns."""

    def __init__(self, inputs, batch_size=10):
        self.inputs = inputs
        self.batch_size = batch_size
        self.num_cols = len(self.inputs)
        self.len = len(self.inputs[0])
        self.inputs = np.transpose(np.vstack([np.array(self.inputs[i]) for i in range(self.num_cols)]))

    def __len__(self):
        return self.len

    def __iter__(self):
        return self

    def __next__(self):
        return self.next()

    def next(self):
        ids = np.random.randint(0, self.len, (self.batch_size,))
        out = self.inputs[ids, :]
        return [out[:, i] for i in range(self.num_cols)]

_train, _test = get_data()
_iter_train = ShuffleIterator([_train["user"], _train["item"], _train["rate"]], batch_size=FLAGS.batch_size)
_train_op = None
_infer = None
_global_step = None
_user_batch = None
_item_batch = None
_rate_batch = None
_cost = None
_rmse = None
_local_step = 0
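
# These module-level tensors are built in model_fn and reused in train_fn,
# which the dist_base runner invokes separately.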

def inference(user_batch, item_batch, dim):
    # Embed users and items, concatenate, and apply a single linear layer.
    w_user = tf.get_variable("embd_user", shape=[USER_NUM, dim],
                             initializer=tf.truncated_normal_initializer(stddev=0.02))
    w_item = tf.get_variable("embd_item", shape=[ITEM_NUM, dim],
                             initializer=tf.truncated_normal_initializer(stddev=0.02))

    user_embedding = tf.nn.embedding_lookup(w_user, user_batch)
    item_embedding = tf.nn.embedding_lookup(w_item, item_batch)
    features = tf.concat([user_embedding, item_embedding], 1)

    w = tf.get_variable("w", shape=[2*dim, 1], initializer=tf.truncated_normal_initializer(stddev=0.02))
    b = tf.get_variable("b", shape=[1], initializer=tf.constant_initializer(1))
    infer = tf.transpose(tf.matmul(features, w) + b, name="infer")
    return infer

def model_fn(sync, num_replicas):
    global _train_op, _infer, _user_batch, _item_batch, _rate_batch, _rmse, _cost, _global_step

    _user_batch = tf.placeholder(tf.int32, shape=[None], name="user")
    _item_batch = tf.placeholder(tf.int32, shape=[None], name="item")
    _rate_batch = tf.placeholder(tf.float32, shape=[None], name="rate")

    _infer = inference(_user_batch, _item_batch, FLAGS.embedding_dim)
    _global_step = tf.contrib.framework.get_or_create_global_step()

    _cost = tf.square(_infer - _rate_batch)
    optimizer = tf.train.AdamOptimizer(0.001)
    _train_op = optimizer.minimize(_cost, global_step=_global_step)

    _rmse = tf.sqrt(tf.reduce_mean(_cost))

    def rmse_evaluate_fn(session):
        return session.run(_rmse, feed_dict={
            _user_batch: _test["user"], _item_batch: _test["item"], _rate_batch: _test["rate"]})

    # Define the model export configuration.
    model_export_spec = model_exporter.ModelExportSpec(
        export_dir=FLAGS.export_dir,
        input_tensors={"user": _user_batch, "item": _item_batch},
        output_tensors={"infer": _infer})

    # Define how the evaluation metric (RMSE) is computed.
    model_metric_ops = {
        "rmse": rmse_evaluate_fn
    }

    return dist_base.ModelFnHandler(
        global_step=_global_step,
        optimizer=optimizer,
        model_metric_ops=model_metric_ops,
        model_export_spec=model_export_spec,
        summary_op=None)

def train_fn(session, num_global_step):
    global _train_op, _infer, _user_batch, _item_batch, _rate_batch, _rmse, _local_step, _cost

    users, items, rates = next(_iter_train)
    session.run(_train_op, feed_dict={_user_batch: users, _item_batch: items, _rate_batch: rates})

    # Evaluate on the held-out test set every 2000 local steps.
    if _local_step % 2000 == 0:
        rmse, infer, cost = session.run([_rmse, _infer, _cost], feed_dict={_user_batch: _test["user"], _item_batch: _test["item"], _rate_batch: _test["rate"]})
        print("Eval RMSE at round {} is: {}".format(num_global_step, rmse))

    _local_step += 1
    return False

if __name__ == '__main__':
    distTfRunner = dist_base.DistTensorflowRunner(model_fn=model_fn, gen_init_fn=None)
    distTfRunner.run(train_fn)
24 changes: 24 additions & 0 deletions caicloud.tensorflow/caicloud/clever/examples/recommandation/train_model.sh
@@ -0,0 +1,24 @@
#!/bin/bash
#
# Copyright 2017 Caicloud authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

rm -rf /tmp/caicloud-dist-tf
rm -rf /tmp/saved_model/movie

# Assumption: these TF_* environment variables configure the caicloud
# dist_base runner (max training steps, checkpoint interval in seconds,
# summary interval in steps).
export TF_MAX_STEPS=30000
export TF_SAVE_CHECKPOINTS_SECS=60
export TF_SAVE_SUMMARIES_STEPS=1000
python train.py
