Skip to content

Commit

Permalink
Add files via upload
Browse files Browse the repository at this point in the history
  • Loading branch information
xiaoming-qxm authored Mar 28, 2017
1 parent 3d90742 commit bebea4a
Showing 2 changed files with 51 additions and 39 deletions.
72 changes: 33 additions & 39 deletions create_user_table.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#-*- coding: utf-8 -*-

import pandas as pd
import numpy as np
from collections import Counter

ACTION_201602_FILE = "data/JData_Action_201602.csv"
@@ -11,12 +12,7 @@
# CSV paths, all relative to the working directory.
PRODUCT_FILE = "data/JData_Product.csv"
USER_FILE = "data/JData_User.csv"
NEW_USER_FILE = "data/JData_User_New.csv"
# Path used for the merged per-user table.
USER_TABLE_FILE = "data/user_table.csv"


# Empty frame that only fixes the column names and their order.
df = pd.DataFrame(
    columns=[
        "user_id",
        "age",
        "sex",
        "user_lv_cd",
        "browse_num",
        "buy_num",
        "buy_browse_ratio",
        "add_cart_num",
        "del_cart_num",
    ]
)


def get_from_jdata_user():
@@ -25,35 +21,10 @@ def get_from_jdata_user():
return df_usr


def merge_behavior_count(group):
    """Collapse one user's partial behavior counts into totals and ratios.

    Each ``*_num`` column of ``group`` is overwritten with that column's
    total, so every row ends up carrying the same values. Three ratio
    columns are then set to the ``buy_num`` total divided by the matching
    total, or ``0.`` when that total is zero.

    Args:
        group: DataFrame with the rows belonging to a single user. It must
            contain the columns ``browse_num``, ``addcart_num``,
            ``delcart_num``, ``buy_num``, ``favor_num`` and ``click_num``.
            The frame is modified in place.

    Returns:
        The same ``group`` object with the count columns replaced by their
        totals and ``buy_browse_ratio``, ``buy_click_ratio`` and
        ``buy_addcart_ratio`` added.
    """
    count_columns = ('browse_num', 'addcart_num', 'delcart_num',
                     'buy_num', 'favor_num', 'click_num')

    # Reduce every column to a scalar before branching: testing the
    # broadcast column itself (a Series) inside ``if`` raises
    # "The truth value of a Series is ambiguous".
    totals = {name: group[name].sum() for name in count_columns}
    for name in count_columns:
        group[name] = totals[name]

    bought = totals['buy_num']
    ratio_sources = (('buy_browse_ratio', 'browse_num'),
                     ('buy_click_ratio', 'click_num'),
                     ('buy_addcart_ratio', 'addcart_num'))
    for ratio_name, base_name in ratio_sources:
        base = totals[base_name]
        # Same zero guard for all three ratios; the click ratio previously
        # had this test inverted.
        if base == 0:
            group[ratio_name] = 0.
        else:
            group[ratio_name] = bought / float(base)

    return group


def add_type_count(group):
behavior_type = group.type.astype(int)
type_cnt = Counter(behavior_type)

group['browse_num'] = type_cnt[1]
group['addcart_num'] = type_cnt[2]
group['delcart_num'] = type_cnt[3]
@@ -67,20 +38,22 @@ def add_type_count(group):


def get_from_action_data(fname, chunk_size=100000):
    """Load one action CSV and reduce it to a single row per user.

    The file is read in pieces of ``chunk_size`` rows and only the
    ``user_id`` and ``type`` columns are kept. The rows are then grouped by
    ``user_id``, passed through ``add_type_count``, and de-duplicated so
    that one row per user remains.

    Args:
        fname: Path of the action CSV file; its first line is the header.
        chunk_size: Number of rows to read per chunk.

    Returns:
        DataFrame with one row per ``user_id``, carrying whatever columns
        ``add_type_count`` adds to each group.
    """
    # Number of Record: 18117303 (original author's note)
    chunks = []
    # Each chunk is fetched exactly once; calling ``get_chunk`` twice per
    # iteration would silently drop every other chunk.
    for chunk in pd.read_csv(fname, header=0, chunksize=chunk_size):
        chunks.append(chunk[["user_id", "type"]])
    print("Iteration is stopped")

    df_ac = pd.concat(chunks, ignore_index=True)
    # A single grouping pass is enough.
    df_ac = df_ac.groupby(['user_id'], as_index=False).apply(add_type_count)
    # Select unique row: add_type_count assigns the same counts to every row
    # of a group, so the first row per user is kept.
    df_ac = df_ac.drop_duplicates('user_id')

    return df_ac

@@ -93,9 +66,30 @@ def merge_action_data():
df_ac.append(get_from_action_data(fname=ACTION_201604_FILE))

df_ac = pd.concat(df_ac, ignore_index=True)
df_ac = df_ac.groupby(['user_id']).apply(merge_behavior_count)
df_ac = df_ac.groupby(['user_id'], as_index=False).sum()

df_ac['buy_addcart_ratio'] = df_ac['buy_num'] / df_ac['addcart_num']
df_ac['buy_addcart_ratio'] = df_ac['buy_addcart_ratio'].fillna(0)

df_ac['buy_browse_ratio'] = df_ac['buy_num'] / df_ac['browse_num']
df_ac['buy_browse_ratio'] = df_ac['buy_browse_ratio'].fillna(0)

df_ac['buy_click_ratio'] = df_ac['buy_num'] / df_ac['click_num']
df_ac['buy_click_ratio'] = df_ac['buy_click_ratio'].fillna(0)

df_ac['buy_favor_ratio'] = df_ac['buy_num'] / df_ac['favor_num']
df_ac['buy_favor_ratio'] = df_ac['buy_favor_ratio'].fillna(0)

return df_ac


if __name__ == "__main__":
    # Build each input exactly once: merge_action_data() re-reads the action
    # files on every call, so repeating it just to discard the result is
    # wasted work.
    user_base = get_from_jdata_user()
    user_behavior = merge_action_data()

    # SQL: left join -- every row of user_base is kept, including users that
    # have no row in user_behavior.
    user_behavior = pd.merge(
        user_base, user_behavior, on=['user_id'], how='left')

    user_behavior.to_csv(USER_TABLE_FILE, index=False)
18 changes: 18 additions & 0 deletions sample_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#-*- coding: utf-8 -*-

# File names of the CSVs to sample.
ACTION_201602_FILE = "JData_Action_201602.csv"
ACTION_201603_FILE = "JData_Action_201603.csv"
ACTION_201603_EXTRA_FILE = "JData_Action_201603_extra.csv"
ACTION_201604_FILE = "JData_Action_201604.csv"
NEW_USER_FILE = "JData_User_New.csv"
COMMENT_FILE = "JData_Comment.csv"

file_list = [
    ACTION_201602_FILE,
    ACTION_201603_FILE,
    ACTION_201603_EXTRA_FILE,
    ACTION_201604_FILE,
    NEW_USER_FILE,
    COMMENT_FILE,
]

# Copy the first 1000 lines of every listed file from data_ori/ into data/.
# Files are handled as bytes, so no decoding takes place.
for fname in file_list:
    with open("data_ori/" + fname, 'rb') as src, \
            open('data/' + fname, 'wb') as dst:
        dst.writelines(src.readline() for _ in range(1000))

0 comments on commit bebea4a

Please sign in to comment.