Skip to content

Commit

Permalink
AliTianChi Data Mining , a naive solution
Browse files Browse the repository at this point in the history
  • Loading branch information
wepe committed Apr 13, 2015
0 parents commit b9ad8b5
Show file tree
Hide file tree
Showing 8 changed files with 248 additions and 0 deletions.
1 change: 1 addition & 0 deletions data/readme
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
将数据tianchi_mobile_recommend_train_item.csv、tianchi_mobile_recommend_train_user.csv放到本文件夹下。
1 change: 1 addition & 0 deletions model/readme
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
暂不分享
34 changes: 34 additions & 0 deletions preprocess/data_preprocess.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#-*-coding:utf-8-*-
"""
1.运行split_by_date.py,在data目录下生成date文件夹以及文件
2.运行split_by_user.py,在data目录下生成user文件夹以及文件
3.运行gen_iid_geohash_category.py,在data目录下生成dictionary文件夹以及item.pkl文件
4.运行gen_uid_iid.py,在data/dictionary目录下生成date文件夹以及*.pkl文件
"""

import time
from split_by_date import splitByDate
from split_by_user import splitByUser
from gen_iid_geohash_category import genIid
from gen_uid_iid import genUidIid



if __name__ == "__main__":
print "====================================="
t0 = time.time()
splitByDate()
t1 = time.time()
print "It takes %f s to split by date,generate 'data/date/*.csv'" %(t1-t0)
splitByUser()
t2 = time.time()
print "It takes %f s to split by user,generate 'data/user/*.csv'" %(t2-t1)
genIid()
t3 = time.time()
print "It takes %f s to make dictionary{iid:[geohash,category]},generate 'data/dictionary/item.pkl'" %(t3-t2)
genUidIid()
t4 = time.time()
print "It takes %f s to make dictionary{(uid,iid):[[b1,b2,b3,b4],[g1,g2..],[c1,c2..],[h1,h2..]]},generate 'data/dictionary/date/*.pkl'" %(t4-t3)
print "====================================="
30 changes: 30 additions & 0 deletions preprocess/gen_iid_geohash_category.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#-*- coding:utf8 -*-#
"""
---------------------------------------
*功能:将tianchi_mobile_recommend_train_item.csv保存为字典(哈希表),方便查找。
*格式:以商品iid为key,对应商品位置和类别表[geohash,category]为value
---------------------------------------
"""

import os
import csv
import cPickle

def genIid():
    """Build a lookup dict {item_id: [item_geohash, item_category]} from the
    item-subset csv and pickle it to ../data/dictionary/item.pkl.

    Creates the ../data/dictionary folder (fails if it already exists,
    same as the original behaviour of os.mkdir).
    """
    os.mkdir("../data/dictionary")
    file_path = "../data/tianchi_mobile_recommend_train_item.csv"

    dictionary = {}
    # 'rb' mode: the Python 2 csv module wants binary-mode files.
    with open(file_path, 'rb') as f:
        rows = csv.reader(f)
        next(rows)  # skip the header row
        for row in rows:
            # row = [item_id, item_geohash, item_category]
            dictionary[row[0]] = [row[1], row[2]]

    # Fix: the original rebound `f` to the output file without closing the
    # input csv first, leaking the read handle. `with` closes both.
    with open("../data/dictionary/item.pkl", 'wb') as out:
        cPickle.dump(dictionary, out, -1)  # -1 = highest pickle protocol



44 changes: 44 additions & 0 deletions preprocess/gen_uid_iid.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#-*- coding:utf8 -*-#
"""
---------------------------------------
*功能:遍历/data/date里的文件,然后读取并提样本和特征。如"2014-12-18.csv",文件里面出现的所有用户,以及用户有过行为的商品,都会分别生成样本(uid,iid)。
*举例:用户u1点击过i1、i2两件商品,买过i3这件商品,则该用户可以构建三个样本:(u1,i1)、(u1,i2)、(u1,i3)。
*样本-特征:样本名(uid,iid)以元组格式作为字典key,样本的特征向量以list格式[feat1,feat2...] 作为字典value。PS:字典的查找会快一点,以空间换时间。
*保存:生成的 样本-特征 保存为 "文件名.pkl" 文件,利用cPickle保存。
---------------------------------------
"""

import os
import csv
import cPickle

def genUidIid():
    """For every daily csv in ../data/date/, build and pickle a dict of
    per-(user, item) features to ../data/dictionary/date/<date>.pkl.

    Value layout per sample:
        [[b1,b2,b3,b4],   # counts of behavior_type 1..4
         [g1,g2,..],      # distinct geohashes seen
         [c1,c2,..],      # distinct categories seen
         [h1,h2,..]]      # hours of activity, kept sorted
    """
    os.mkdir("../data/dictionary/date")
    direction = "../data/date/"
    for file_name in os.listdir(direction):
        # {(uid,iid):[[b1,b2,b3,b4],[g1,g2..],[c1,c2..],[h1,h2..]]}
        dictionary = {}
        with open(direction + file_name, 'rb') as f:
            rows = csv.reader(f)
            next(rows)  # skip header
            for row in rows:
                # Key is a tuple: tuples are hashable, lists are not.
                sample = (row[0], row[1])
                feats = dictionary.get(sample)  # single lookup per row
                if feats is not None:
                    feats[0][int(row[2]) - 1] += 1  # bump behavior counter
                    # Record geohash/category only if not seen yet
                    # (`not in` replaces the deprecated .count(x)==0 idiom).
                    if row[3] not in feats[1]:
                        feats[1].append(row[3])
                    if row[4] not in feats[2]:
                        feats[2].append(row[4])
                    feats[3].append(int(row[5]))  # hour of this action
                else:
                    feats = [[0, 0, 0, 0], [row[3]], [row[4]], [int(row[5])]]
                    feats[0][int(row[2]) - 1] = 1
                    dictionary[sample] = feats
                feats[3].sort()  # keep hours sorted after every insert

        with open("../data/dictionary/date/" + file_name.split('.')[0] + ".pkl", 'wb') as out:
            cPickle.dump(dictionary, out, -1)

48 changes: 48 additions & 0 deletions preprocess/split_by_date.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
#-*-coding:utf-8-*-
"""
将tianchi_mobile_recommend_train_user.csv按照日期分割为31份**.csv文件,放在'/data/date/'目录下。
生成的**.csv文件内容格式如下:
user_id, item_id, behavior_type,user_geohash,item_category, hour
99512554,37320317, 3, 94gn6nd, 9232, 20
"""

import csv
import os


# Tracks which <date>.csv files already exist (and thus have a header row).
date_dictionary = {}

def writeByDate(date, words):
    """Append one record to ../data/date/<date>.csv; 'a' (append) mode adds
    to the end of the file. The header row is written first the first time
    a given date is seen (tracked in module-level date_dictionary).
    """
    # Fix: build the target path directly instead of the original
    # os.chdir() round-trip, which left the process in the wrong cwd if
    # open() raised, and forced a second chdir back every call.
    file_path = "../data/date/" + date + ".csv"
    is_new_file = date not in date_dictionary
    date_dictionary[date] = True
    with open(file_path, 'a') as f:
        write = csv.writer(f)
        if is_new_file:
            write.writerow(['user_id','item_id','behavior_type','user_geohash','item_category','hour'])
        write.writerow(words)


def splitByDate():
    """Split tianchi_mobile_recommend_train_user.csv into one csv per day
    under ../data/date/; the trailing time column is split into a date
    (used as the file name) and an hour column appended to the record.
    """
    os.mkdir('../data/date')
    # Fix: the original never closed the input file handle.
    with open("../data/tianchi_mobile_recommend_train_user.csv") as f:
        rows = csv.reader(f)
        next(rows)  # skip header
        for row in rows:
            # Last column is "YYYY-MM-DD HH"; split it exactly once.
            date, hour = row[-1].split(" ", 1)
            words = row[0:-1]
            words.append(hour)
            writeByDate(date, words)
52 changes: 52 additions & 0 deletions preprocess/split_by_user.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#-*-coding:utf-8-*-

"""
遍历'data/date/'目录下的所有csv文件,按照用户分割,生成'data/user'目录以及用户文件.
用户文件内容格式:
'date','item_id','behavior_type','user_geohash','item_category','hour'
"""

import os
import csv
import time

# Tracks which <user_id>.csv files already exist (and thus have a header row).
user_dictionary = {}



def writeByUser(user_id, words):
    """Append one record to ../data/user/<user_id>.csv; 'a' (append) mode
    adds to the end of the file. The header row is written first the first
    time a given user is seen (tracked in module-level user_dictionary).
    """
    # Fix: build the target path directly instead of the original
    # os.chdir() round-trip, which left the process in the wrong cwd if
    # open() raised, and forced a second chdir back every call.
    file_path = "../data/user/" + user_id + ".csv"
    is_new_file = user_id not in user_dictionary
    user_dictionary[user_id] = True
    with open(file_path, 'a') as f:
        write = csv.writer(f)
        if is_new_file:
            write.writerow(['date','item_id','behavior_type','user_geohash','item_category','hour'])
        write.writerow(words)


def splitByUser():
    """Walk every daily csv under ../data/date/ (in sorted order) and
    regroup the rows per user into ../data/user/<user_id>.csv, replacing
    the user_id column with the date taken from the source file name.
    """
    os.mkdir("../data/user/")
    directory = "../data/date/"
    csvlist = sorted(os.listdir(directory))
    for eachcsv in csvlist:
        # Date string comes from the file name ("2014-12-18.csv");
        # hoisted out of the row loop (it is constant per file).
        date = eachcsv.split('.')[0]
        # Fix: close each date file (the original leaked one handle per file).
        with open(directory + eachcsv) as f:
            rows = csv.reader(f)
            next(rows)  # skip header
            for row in rows:
                # row[0] is user_id; the rest of the record is kept as-is.
                writeByUser(row[0], [date] + row[1:])
38 changes: 38 additions & 0 deletions rule/gen_submission_by_rule.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#-*-coding:utf-8-*-#
"""
提交12月18号加购物车且当天没买的,F1可达到7.6%
"""

import cPickle
import csv


#存储 (uid,iid)
result = {}


item = cPickle.load(open("../data/dictionary/item.pkl","rb"))
day = cPickle.load(open("../data/dictionary/date/2014-12-18.pkl","rb"))

for key in day:
uid,iid = key
#对于商品子集里的商品,18号加购物车且没买的,生成提交文件
if item.has_key(iid) and day[key][0][2]>0 and day[key][0][3]==0:
rows = csv.reader(open("../data/user/"+key[0]+".csv","rb"))
rows.next()
for row in rows:
if row[0] == "2014-12-18" and row[1] == key[1] and row[2] == "3":
result[key] = 1


#写入文件
f = open("tianchi_mobile_recommendation_predict.csv","wb")
write = csv.writer(f)
write.writerow(["user_id","item_id"])
total = 0
for key in result:
write.writerow(key)
total += 1
print "generate submission file,total %d (uid,iid)" %total
f.close()

0 comments on commit b9ad8b5

Please sign in to comment.