Skip to content

Commit

Permalink
AliTianChi Data Mining , a naive solution
Browse files Browse the repository at this point in the history
  • Loading branch information
wepe committed Apr 13, 2015
0 parents commit b9ad8b5
Show file tree
Hide file tree
Showing 8 changed files with 248 additions and 0 deletions.
1 change: 1 addition & 0 deletions data/readme
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
将数据tianchi_mobile_recommend_train_item.csv、tianchi_mobile_recommend_train_user.csv放到本文件夹下。
1 change: 1 addition & 0 deletions model/readme
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
暂不分享
34 changes: 34 additions & 0 deletions preprocess/data_preprocess.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#-*-coding:utf-8-*-
"""
1.运行split_by_date.py,在data目录下生成date文件夹以及文件
2.运行split_by_user.py,在data目录下生成user文件夹以及文件
3.运行gen_iid_geohash_category.py,在data目录下生成dictionary文件夹以及item.pkl文件
4.运行gen_uid_iid.py,在data/dictionary目录下生成date文件夹以及*.pkl文件
"""

import time
from split_by_date import splitByDate
from split_by_user import splitByUser
from gen_iid_geohash_category import genIid
from gen_uid_iid import genUidIid



if __name__ == "__main__":
print "====================================="
t0 = time.time()
splitByDate()
t1 = time.time()
print "It takes %f s to split by date,generate 'data/date/*.csv'" %(t1-t0)
splitByUser()
t2 = time.time()
print "It takes %f s to split by user,generate 'data/user/*.csv'" %(t2-t1)
genIid()
t3 = time.time()
print "It takes %f s to make dictionary{iid:[geohash,category]},generate 'data/dictionary/item.pkl'" %(t3-t2)
genUidIid()
t4 = time.time()
print "It takes %f s to make dictionary{(uid,iid):[[b1,b2,b3,b4],[g1,g2..],[c1,c2..],[h1,h2..]]},generate 'data/dictionary/date/*.pkl'" %(t4-t3)
print "====================================="
30 changes: 30 additions & 0 deletions preprocess/gen_iid_geohash_category.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#-*- coding:utf8 -*-#
"""
---------------------------------------
*功能:将tianchi_mobile_recommend_train_item.csv保存为字典(哈希表),方便查找。
*格式:以商品iid为key,对应商品位置和类别表[geohash,category]为value
---------------------------------------
"""

import os
import csv
import cPickle

def genIid():
    """Build a lookup dict {item_id: [item_geohash, item_category]} from the
    item-subset csv and pickle it to ../data/dictionary/item.pkl.

    Creates the ../data/dictionary folder (fails if it already exists,
    same as the original behaviour of os.mkdir).
    """
    os.mkdir("../data/dictionary")
    file_path = "../data/tianchi_mobile_recommend_train_item.csv"

    dictionary = {}
    # 'rb' mode: the Python 2 csv module wants binary-mode files.
    with open(file_path, 'rb') as f:
        rows = csv.reader(f)
        next(rows)  # skip the header row
        for row in rows:
            # row = [item_id, item_geohash, item_category]
            dictionary[row[0]] = [row[1], row[2]]

    # Fix: the original rebound `f` to the output file without closing the
    # input csv first, leaking the read handle. `with` closes both.
    with open("../data/dictionary/item.pkl", 'wb') as out:
        cPickle.dump(dictionary, out, -1)  # -1 = highest pickle protocol



44 changes: 44 additions & 0 deletions preprocess/gen_uid_iid.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#-*- coding:utf8 -*-#
"""
---------------------------------------
*功能:遍历/data/date里的文件,然后读取并提样本和特征。如"2014-12-18.csv",文件里面出现的所有用户,以及用户有过行为的商品,都会分别生成样本(uid,iid)。
*举例:用户u1点击过i1、i2两件商品,买过i3这件商品,则该用户可以构建三个样本:(u1,i1)、(u1,i2)、(u1,i3)。
*样本-特征:样本名(uid,iid)以元组格式作为字典key,样本的特征向量以list格式[feat1,feat2...] 作为字典value。PS:字典的查找会快一点,以空间换时间。
*保存:生成的 样本-特征 保存为 "文件名.pkl" 文件,利用cPickle保存。
---------------------------------------
"""

import os
import csv
import cPickle

def genUidIid():
    """For every daily csv in ../data/date/, build and pickle a dict of
    per-(user, item) features to ../data/dictionary/date/<date>.pkl.

    Value layout per sample:
        [[b1,b2,b3,b4],   # counts of behavior_type 1..4
         [g1,g2,..],      # distinct geohashes seen
         [c1,c2,..],      # distinct categories seen
         [h1,h2,..]]      # hours of activity, kept sorted
    """
    os.mkdir("../data/dictionary/date")
    direction = "../data/date/"
    for file_name in os.listdir(direction):
        # {(uid,iid):[[b1,b2,b3,b4],[g1,g2..],[c1,c2..],[h1,h2..]]}
        dictionary = {}
        with open(direction + file_name, 'rb') as f:
            rows = csv.reader(f)
            next(rows)  # skip header
            for row in rows:
                # Key is a tuple: tuples are hashable, lists are not.
                sample = (row[0], row[1])
                feats = dictionary.get(sample)  # single lookup per row
                if feats is not None:
                    feats[0][int(row[2]) - 1] += 1  # bump behavior counter
                    # Record geohash/category only if not seen yet
                    # (`not in` replaces the deprecated .count(x)==0 idiom).
                    if row[3] not in feats[1]:
                        feats[1].append(row[3])
                    if row[4] not in feats[2]:
                        feats[2].append(row[4])
                    feats[3].append(int(row[5]))  # hour of this action
                else:
                    feats = [[0, 0, 0, 0], [row[3]], [row[4]], [int(row[5])]]
                    feats[0][int(row[2]) - 1] = 1
                    dictionary[sample] = feats
                feats[3].sort()  # keep hours sorted after every insert

        with open("../data/dictionary/date/" + file_name.split('.')[0] + ".pkl", 'wb') as out:
            cPickle.dump(dictionary, out, -1)

48 changes: 48 additions & 0 deletions preprocess/split_by_date.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
#-*-coding:utf-8-*-
"""
将tianchi_mobile_recommend_train_user.csv按照日期分割为31份**.csv文件,放在'/data/date/'目录下。
生成的**.csv文件内容格式如下:
user_id, item_id, behavior_type,user_geohash,item_category, hour
99512554,37320317, 3, 94gn6nd, 9232, 20
"""

import csv
import os


# Tracks which <date>.csv files already exist (and thus have a header row).
date_dictionary = {}

def writeByDate(date, words):
    """Append one record to ../data/date/<date>.csv; 'a' (append) mode adds
    to the end of the file. The header row is written first the first time
    a given date is seen (tracked in module-level date_dictionary).
    """
    # Fix: build the target path directly instead of the original
    # os.chdir() round-trip, which left the process in the wrong cwd if
    # open() raised, and forced a second chdir back every call.
    file_path = "../data/date/" + date + ".csv"
    is_new_file = date not in date_dictionary
    date_dictionary[date] = True
    with open(file_path, 'a') as f:
        write = csv.writer(f)
        if is_new_file:
            write.writerow(['user_id','item_id','behavior_type','user_geohash','item_category','hour'])
        write.writerow(words)


def splitByDate():
    """Split tianchi_mobile_recommend_train_user.csv into one csv per day
    under ../data/date/; the trailing time column is split into a date
    (used as the file name) and an hour column appended to the record.
    """
    os.mkdir('../data/date')
    # Fix: the original never closed the input file handle.
    with open("../data/tianchi_mobile_recommend_train_user.csv") as f:
        rows = csv.reader(f)
        next(rows)  # skip header
        for row in rows:
            # Last column is "YYYY-MM-DD HH"; split it exactly once.
            date, hour = row[-1].split(" ", 1)
            words = row[0:-1]
            words.append(hour)
            writeByDate(date, words)
52 changes: 52 additions & 0 deletions preprocess/split_by_user.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#-*-coding:utf-8-*-

"""
遍历'data/date/'目录下的所有csv文件,按照用户分割,生成'data/user'目录以及用户文件.
用户文件内容格式:
'date','item_id','behavior_type','user_geohash','item_category','hour'
"""

import os
import csv
import time

# Tracks which <user_id>.csv files already exist (and thus have a header row).
user_dictionary = {}



def writeByUser(user_id, words):
    """Append one record to ../data/user/<user_id>.csv; 'a' (append) mode
    adds to the end of the file. The header row is written first the first
    time a given user is seen (tracked in module-level user_dictionary).
    """
    # Fix: build the target path directly instead of the original
    # os.chdir() round-trip, which left the process in the wrong cwd if
    # open() raised, and forced a second chdir back every call.
    file_path = "../data/user/" + user_id + ".csv"
    is_new_file = user_id not in user_dictionary
    user_dictionary[user_id] = True
    with open(file_path, 'a') as f:
        write = csv.writer(f)
        if is_new_file:
            write.writerow(['date','item_id','behavior_type','user_geohash','item_category','hour'])
        write.writerow(words)


def splitByUser():
    """Walk every daily csv under ../data/date/ (in sorted order) and
    regroup the rows per user into ../data/user/<user_id>.csv, replacing
    the user_id column with the date taken from the source file name.
    """
    os.mkdir("../data/user/")
    directory = "../data/date/"
    csvlist = sorted(os.listdir(directory))
    for eachcsv in csvlist:
        # Date string comes from the file name ("2014-12-18.csv");
        # hoisted out of the row loop (it is constant per file).
        date = eachcsv.split('.')[0]
        # Fix: close each date file (the original leaked one handle per file).
        with open(directory + eachcsv) as f:
            rows = csv.reader(f)
            next(rows)  # skip header
            for row in rows:
                # row[0] is user_id; the rest of the record is kept as-is.
                writeByUser(row[0], [date] + row[1:])
38 changes: 38 additions & 0 deletions rule/gen_submission_by_rule.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#-*-coding:utf-8-*-#
"""
提交12月18号加购物车且当天没买的,F1可达到7.6%
"""

import cPickle
import csv


#存储 (uid,iid)
result = {}


item = cPickle.load(open("../data/dictionary/item.pkl","rb"))
day = cPickle.load(open("../data/dictionary/date/2014-12-18.pkl","rb"))

for key in day:
uid,iid = key
#对于商品子集里的商品,18号加购物车且没买的,生成提交文件
if item.has_key(iid) and day[key][0][2]>0 and day[key][0][3]==0:
rows = csv.reader(open("../data/user/"+key[0]+".csv","rb"))
rows.next()
for row in rows:
if row[0] == "2014-12-18" and row[1] == key[1] and row[2] == "3":
result[key] = 1


#写入文件
f = open("tianchi_mobile_recommendation_predict.csv","wb")
write = csv.writer(f)
write.writerow(["user_id","item_id"])
total = 0
for key in result:
write.writerow(key)
total += 1
print "generate submission file,total %d (uid,iid)" %total
f.close()

0 comments on commit b9ad8b5

Please sign in to comment.