forked from wepe/AliTianChi
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
AliTianChi Data Mining , a naive solution
- Loading branch information
0 parents
commit b9ad8b5
Showing
8 changed files
with
248 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
将数据tianchi_mobile_recommend_train_item.csv、tianchi_mobile_recommend_train_user.csv放到本文件夹下。 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
暂不分享 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
#-*-coding:utf-8-*- | ||
""" | ||
1.运行split_by_date.py,在data目录下生成date文件夹以及文件 | ||
2.运行split_by_user.py,在data目录下生成user文件夹以及文件 | ||
3.运行gen_iid_geohash_category.py,在data目录下生成dictionary文件夹以及item.pkl文件 | ||
4.运行gen_uid_iid.py,在data/dictionary目录下生成date文件夹以及*.pkl文件 | ||
""" | ||
|
||
import time | ||
from split_by_date import splitByDate | ||
from split_by_user import splitByUser | ||
from gen_iid_geohash_category import genIid | ||
from gen_uid_iid import genUidIid | ||
|
||
|
||
|
||
# Entry point: run the four preprocessing steps in order and report how
# long each one took.
if __name__ == "__main__":
    print("=====================================")
    # (step function, description used in the timing message)
    pipeline = [
        (splitByDate, "split by date,generate 'data/date/*.csv'"),
        (splitByUser, "split by user,generate 'data/user/*.csv'"),
        (genIid, "make dictionary{iid:[geohash,category]},generate 'data/dictionary/item.pkl'"),
        (genUidIid, "make dictionary{(uid,iid):[[b1,b2,b3,b4],[g1,g2..],[c1,c2..],[h1,h2..]]},generate 'data/dictionary/date/*.pkl'"),
    ]
    last = time.time()
    for step, description in pipeline:
        step()
        now = time.time()
        print("It takes %f s to %s" % (now - last, description))
        last = now
    print("=====================================")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
#-*- coding:utf8 -*-# | ||
""" | ||
--------------------------------------- | ||
*功能:将tianchi_mobile_recommend_train_item.csv保存为字典(哈希表),方便查找。 | ||
*格式:以商品iid为key,对应商品位置和类别表[geohash,category]为value | ||
--------------------------------------- | ||
""" | ||
|
||
import os | ||
import csv | ||
import cPickle | ||
|
||
def genIid():
    """Build a lookup dict {item_id: [item_geohash, item_category]} from
    tianchi_mobile_recommend_train_item.csv and pickle it (highest
    protocol) to ../data/dictionary/item.pkl.
    """
    os.mkdir("../data/dictionary")
    file_path = "../data/tianchi_mobile_recommend_train_item.csv"

    dictionary = {}
    # 'rb' + csv.reader is the Python 2 convention for CSV input.
    with open(file_path, 'rb') as f:
        rows = csv.reader(f)
        next(rows)  # skip the header row
        for row in rows:
            # row = [item_id, item_geohash, item_category]
            dictionary[row[0]] = [row[1], row[2]]

    # Fix: the original rebound f to the pickle file without closing the
    # CSV handle; separate context managers close both deterministically.
    with open("../data/dictionary/item.pkl", 'wb') as f:
        cPickle.dump(dictionary, f, -1)
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
#-*- coding:utf8 -*-# | ||
""" | ||
--------------------------------------- | ||
*功能:遍历/data/date里的文件,然后读取并提样本和特征。如"2014-12-18.csv",文件里面出现的所有用户,以及用户有过行为的商品,都会分别生成样本(uid,iid)。 | ||
*举例:用户u1点击过i1、i2两件商品,买过i3这件商品,则该用户可以构建三个样本:(u1,i1)、(u1,i2)、(u1,i3)。 | ||
*样本-特征:样本名(uid,iid)以元组格式作为字典key,样本的特征向量以list格式[feat1,feat2...] 作为字典value。PS:字典的查找会快一点,以空间换时间。 | ||
*保存:生成的 样本-特征 保存为 "文件名.pkl" 文件,利用cPickle保存。 | ||
--------------------------------------- | ||
""" | ||
|
||
import os | ||
import csv | ||
import cPickle | ||
|
||
def genUidIid():
    """For every data/date/*.csv file, build
    {(uid, iid): [[b1,b2,b3,b4], [geohash...], [category...], [hour...]]}
    and pickle it to ../data/dictionary/date/<date>.pkl.

    b1..b4 count behaviour types 1-4 for the (user, item) pair; geohash and
    category lists keep unique values in first-seen order; the hour list is
    sorted ascending.
    """
    os.mkdir("../data/dictionary/date")
    direction = "../data/date/"
    for file_name in os.listdir(direction):
        dictionary = {}
        # Fix: close each input file deterministically (original leaked it
        # only on exceptions, and the 'with' makes intent explicit).
        with open(direction + file_name, 'rb') as f:
            rows = csv.reader(f)
            next(rows)  # skip header
            for row in rows:
                # A tuple is hashable and so usable as a dict key; a list
                # would not be.
                sample = (row[0], row[1])
                if sample in dictionary:
                    feats = dictionary[sample]
                    feats[0][int(row[2]) - 1] += 1   # bump behaviour counter
                    if row[3] not in feats[1]:       # new geohash for this pair
                        feats[1].append(row[3])
                    if row[4] not in feats[2]:       # new category for this pair
                        feats[2].append(row[4])
                    feats[3].append(int(row[5]))     # record the hour
                else:
                    feats = [[0, 0, 0, 0], [row[3]], [row[4]], [int(row[5])]]
                    feats[0][int(row[2]) - 1] = 1
                    dictionary[sample] = feats

        # Perf fix: the original re-sorted the hour list after every single
        # append; sorting once per file yields the same final contents with
        # far less work.
        for feats in dictionary.values():
            feats[3].sort()

        with open("../data/dictionary/date/" + file_name.split('.')[0] + ".pkl", 'wb') as f:
            cPickle.dump(dictionary, f, -1)
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
#-*-coding:utf-8-*- | ||
""" | ||
将tianchi_mobile_recommend_train_user.csv按照日期分割为31份**.csv文件,放在'/data/date/'目录下。 | ||
生成的**.csv文件内容格式如下: | ||
user_id, item_id, behavior_type,user_geohash,item_category, hour | ||
99512554,37320317, 3, 94gn6nd, 9232, 20 | ||
""" | ||
|
||
import csv | ||
import os | ||
|
||
|
||
# Tracks which <date>.csv files have already been created (and therefore
# already contain a header row).
date_dictionary = {}


def writeByDate(date, words):
    """Append one record to ../data/date/<date>.csv, writing the header row
    first when this is the first record seen for that date.

    Fix: build the target path directly instead of os.chdir()-ing into the
    data directory and back — the chdir mutated global process state and
    assumed the script was always launched from the preprocess/ directory.
    The duplicated open/write branches are also merged.
    """
    file_path = "../data/date/" + date + ".csv"
    is_new = date not in date_dictionary
    if is_new:
        date_dictionary[date] = True
    # 'a' mode appends to an existing file and creates it when missing.
    with open(file_path, 'a') as f:
        write = csv.writer(f)
        if is_new:
            write.writerow(['user_id','item_id','behavior_type','user_geohash','item_category','hour'])
        write.writerow(words)
|
||
|
||
# Main entry of this module.
def splitByDate():
    """Split tianchi_mobile_recommend_train_user.csv into one
    ../data/date/<date>.csv per day, moving the hour out of the timestamp
    column into its own trailing 'hour' column.
    """
    os.mkdir('../data/date')
    # Fix: the original never closed the input file; 'with' guarantees it.
    with open("../data/tianchi_mobile_recommend_train_user.csv") as f:
        rows = csv.reader(f)
        next(rows)  # skip header
        for row in rows:
            # Last column looks like "YYYY-MM-DD HH".
            parts = row[-1].split(" ")
            date = parts[0]
            hour = parts[1]
            words = row[0:-1]
            words.append(hour)
            writeByDate(date, words)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
#-*-coding:utf-8-*- | ||
|
||
""" | ||
遍历'data/date/'目录下的所有csv文件,按照用户分割,生成'data/user'目录以及用户文件. | ||
用户文件内容格式: | ||
'date','item_id','behavior_type','user_geohash','item_category','hour' | ||
""" | ||
|
||
import os | ||
import csv | ||
import time | ||
|
||
# Tracks which <user_id>.csv files have already been created (and therefore
# already contain a header row).
user_dictionary = {}


def writeByUser(user_id, words):
    """Append one record to ../data/user/<user_id>.csv, writing the header
    row first when this is the first record seen for that user.

    Fix: build the target path directly instead of os.chdir()-ing into the
    data directory and back — the chdir mutated global process state and
    assumed the script was always launched from the preprocess/ directory.
    The duplicated open/write branches are also merged.
    """
    file_path = "../data/user/" + user_id + ".csv"
    is_new = user_id not in user_dictionary
    if is_new:
        user_dictionary[user_id] = True
    # 'a' mode appends to an existing file and creates it when missing.
    with open(file_path, 'a') as f:
        write = csv.writer(f)
        if is_new:
            write.writerow(['date','item_id','behavior_type','user_geohash','item_category','hour'])
        write.writerow(words)
|
||
|
||
def splitByUser():
    """Re-split the per-date CSVs under ../data/date/ into one
    ../data/user/<user_id>.csv per user, replacing the user_id column with
    the date (taken from the source file name).
    """
    os.mkdir("../data/user/")
    directory = "../data/date/"
    csvlist = os.listdir(directory)
    csvlist.sort()  # process the days in chronological order
    for eachcsv in csvlist:
        # Fix: the original leaked every per-date file handle; 'with'
        # closes each one before the next is opened.
        with open(directory + eachcsv) as f:
            rows = csv.reader(f)
            next(rows)  # skip header
            for row in rows:
                user_id = row[0]
                words = [eachcsv.split('.')[0]]  # date from "YYYY-MM-DD.csv"
                words.extend(row[1:])
                writeByUser(user_id, words)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
#-*-coding:utf-8-*-# | ||
""" | ||
提交12月18号加购物车且当天没买的,F1可达到7.6% | ||
""" | ||
|
||
import cPickle | ||
import csv | ||
|
||
|
||
# Samples (uid, iid) selected for submission.
result = {}


# item: {iid: [geohash, category]} for the competition item subset;
# day: the (uid, iid) behaviour dictionary for 2014-12-18.
# Fix: close the pickle files (the original leaked both inline handles).
with open("../data/dictionary/item.pkl", "rb") as f:
    item = cPickle.load(f)
with open("../data/dictionary/date/2014-12-18.pkl", "rb") as f:
    day = cPickle.load(f)

for key in day:
    uid, iid = key
    # Keep items from the item subset that were added to cart on the 18th
    # (behaviour counter b3 > 0) but not bought that day (b4 == 0).
    if iid in item and day[key][0][2] > 0 and day[key][0][3] == 0:
        with open("../data/user/" + uid + ".csv", "rb") as uf:
            rows = csv.reader(uf)
            next(rows)  # skip header
            for row in rows:
                if row[0] == "2014-12-18" and row[1] == iid and row[2] == "3":
                    result[key] = 1
                    # Perf fix: one matching row is enough — the original
                    # kept scanning the rest of the user file.
                    break


# Write the submission file.
with open("tianchi_mobile_recommendation_predict.csv", "wb") as f:
    write = csv.writer(f)
    write.writerow(["user_id", "item_id"])
    for key in result:
        write.writerow(key)
print("generate submission file,total %d (uid,iid)" % len(result))