From e133c4f8b212be90efcc02645c31a22aacf335f3 Mon Sep 17 00:00:00 2001
From: Qin Xiaoming
Date: Thu, 30 Mar 2017 22:44:25 +0800
Subject: [PATCH] Add files via upload

---
 data_analysis.ipynb          | 114 +++++++++++++++-
 data_analysis.py             |  60 +++++++-
 explore_potential_user.ipynb | 257 +++++++++++++++++++++++++++++++++++
 potential_user.py            | 122 +++++++++++++++++
 4 files changed, 543 insertions(+), 10 deletions(-)
 create mode 100644 explore_potential_user.ipynb
 create mode 100644 potential_user.py

diff --git a/data_analysis.ipynb b/data_analysis.ipynb
index 4c69049..145e667 100644
--- a/data_analysis.ipynb
+++ b/data_analysis.ipynb
@@ -9,7 +9,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 5,
    "metadata": {
     "collapsed": true
    },
@@ -27,7 +27,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 6,
    "metadata": {
     "collapsed": true
    },
@@ -237,6 +237,13 @@
     "plt.legend(prop={'size':9})"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Analysis**: purchase counts are fairly evenly distributed across the week, with somewhat fewer purchases on Saturday and Sunday; people are presumably out enjoying the weekend then, while on weekdays browsing JD serves as a pastime."
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -348,11 +355,112 @@
     "plt.legend(prop={'size':9})"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "collapsed": true
+   },
+   "source": [
+    "**Analysis**: as the plots above show, February 6 to February 10 falls within the Chinese New Year holiday, when couriers do not work, so purchase volume is comparatively low. For the actual analysis this period can provisionally be treated as anomalous data and left out of the training samples."
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "More analysis awaits your exploration."
+    "#### Examining a specific user's activity trail on a specific item"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "def spec_ui_action_data(fname, user_id, item_id, chunk_size=100000):\n",
+    "    reader = pd.read_csv(fname, header=0, iterator=True)\n",
+    "    chunks = []\n",
+    "    loop = True\n",
+    "    while loop:\n",
+    "        try:\n",
+    "            chunk = reader.get_chunk(chunk_size)[\n",
+    "                [\"user_id\", \"sku_id\", \"type\", \"time\"]]\n",
+    "            chunks.append(chunk)\n",
+    "        except StopIteration:\n",
+    "            loop = False\n",
+    "            print(\"Iteration is stopped\")\n",
+    "\n",
+    "    df_ac = pd.concat(chunks, ignore_index=True)\n",
+    "    df_ac = df_ac[(df_ac['user_id'] == user_id) & (df_ac['sku_id'] == item_id)]\n",
+    "\n",
+    "    return df_ac"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "def explore_user_item_via_time():\n",
+    "    user_id = 62969\n",
+    "    item_id = 62655\n",
+    "    df_ac = []\n",
+    "    df_ac.append(spec_ui_action_data(ACTION_201602_FILE, user_id, item_id))\n",
+    "    df_ac.append(spec_ui_action_data(ACTION_201603_FILE, user_id, item_id))\n",
+    "    df_ac.append(spec_ui_action_data(\n",
+    "        ACTION_201603_EXTRA_FILE, user_id, item_id))\n",
+    "    df_ac.append(spec_ui_action_data(ACTION_201604_FILE, user_id, item_id))\n",
+    "\n",
+    "    df_ac = pd.concat(df_ac, ignore_index=False)\n",
+    "    print(df_ac.sort_values(by='time'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Iteration is stopped\n",
+      "Iteration is stopped\n",
+      "Iteration is stopped\n",
+      "Iteration is stopped\n",
+      "       user_id  sku_id  type                 time\n",
+      "12296    62969   62655     1  2016-02-01 11:00:05\n",
+      "12307    62969   62655     1  2016-02-01 11:00:05\n",
+      "12300    62969   62655     2  2016-02-01 11:00:18\n",
+      "12285    62969   62655     1  2016-02-01 11:00:49\n",
+      "12305    62969   62655     1  2016-02-01 11:00:49\n",
+      "12302    62969   62655     2  2016-02-01 11:02:16\n",
+      "12294    62969   62655     1  2016-02-01 11:02:17\n",
+      "12304    62969   62655     1  2016-02-01 11:02:17\n",
+      "12297    62969   62655     1  2016-02-01 11:03:47\n",
+      "12306    62969   62655     1  2016-02-01 11:03:47\n",
+      "12303    62969   62655     4  2016-02-01 11:04:00\n"
+     ]
+    }
+   ],
+   "source": [
+    "explore_user_item_via_time()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "> Prediction part: whether a user places an order for an item from P between 2016-04-16 and 2016-04-20; each user will order only one item.\n",
+    "\n",
+    "  Since we need to predict user purchases over the five days from the 16th to the 20th, it is worth analyzing purchases in 5-day windows (a period of 5)."
+   ]
+  },
   {
diff --git a/data_analysis.py b/data_analysis.py
index dca1197..a4e5ae5 100644
--- a/data_analysis.py
+++ b/data_analysis.py
@@ -3,10 +3,10 @@
 import pandas as pd
 import numpy as np
 
-ACTION_201602_FILE = "data/JData_Action_201602.csv"
-ACTION_201603_FILE = "data/JData_Action_201603.csv"
-ACTION_201603_EXTRA_FILE = "data/JData_Action_201603_extra.csv"
-ACTION_201604_FILE = "data/JData_Action_201604.csv"
+ACTION_201602_FILE = "data_ori/JData_Action_201602.csv"
+ACTION_201603_FILE = "data_ori/JData_Action_201603.csv"
+ACTION_201603_EXTRA_FILE = "data_ori/JData_Action_201603_extra.csv"
+ACTION_201604_FILE = "data_ori/JData_Action_201604.csv"
 COMMENT_FILE = "data/JData_Comment.csv"
 PRODUCT_FILE = "data/JData_Product.csv"
 USER_FILE = "data/JData_User.csv"
@@ -64,9 +64,55 @@ def merge_weekday_action_data():
     df_ui.columns = ['weekday', 'user_item_num']
     print(df_ui)
 
-merge_weekday_action_data()
 
+def month_action_data_statistic():
+    # Feb.
+    df_ac = get_from_action_data(fname=ACTION_201602_FILE)
+    df_ac['time'] = pd.to_datetime(df_ac['time']).apply(lambda x: x.day)
+
+    # March
+    df_ac = []
+    df_ac.append(get_from_action_data(fname=ACTION_201603_FILE))
+    df_ac.append(get_from_action_data(fname=ACTION_201603_EXTRA_FILE))
+    df_ac = pd.concat(df_ac, ignore_index=True)
+    df_ac['time'] = pd.to_datetime(df_ac['time']).apply(lambda x: x.day)
+
+    # April.
+    df_ac = get_from_action_data(fname=ACTION_201604_FILE)
+    df_ac['time'] = pd.to_datetime(df_ac['time']).apply(lambda x: x.day)
+
+
+def spec_ui_action_data(fname, user_id, item_id, chunk_size=100000):
+    reader = pd.read_csv(fname, header=0, iterator=True)
+    chunks = []
+    loop = True
+    while loop:
+        try:
+            chunk = reader.get_chunk(chunk_size)[
+                ["user_id", "sku_id", "type", "time"]]
+            chunks.append(chunk)
+        except StopIteration:
+            loop = False
+            print("Iteration is stopped")
+
+    df_ac = pd.concat(chunks, ignore_index=True)
+    df_ac = df_ac[(df_ac['user_id'] == user_id) & (df_ac['sku_id'] == item_id)]
+
+    return df_ac
+
+
+def explore_user_item_via_time():
+    user_id = 10396
+    item_id = 65823
+    df_ac = []
+    df_ac.append(spec_ui_action_data(ACTION_201602_FILE, user_id, item_id))
+    df_ac.append(spec_ui_action_data(ACTION_201603_FILE, user_id, item_id))
+    df_ac.append(spec_ui_action_data(
+        ACTION_201603_EXTRA_FILE, user_id, item_id))
+    df_ac.append(spec_ui_action_data(ACTION_201604_FILE, user_id, item_id))
+
+    df_ac = pd.concat(df_ac, ignore_index=False)
+    print(df_ac.sort_values(by='time'))
 
-df_ac = get_from_action_data(fname=ACTION_201602_FILE)
-df_ac['time'] = pd.to_datetime(df_ac['time']).apply(lambda x: x.day)
+explore_user_item_via_time()
diff --git a/explore_potential_user.ipynb b/explore_potential_user.ipynb
new file mode 100644
index 0000000..4f17530
--- /dev/null
+++ b/explore_potential_user.ipynb
@@ -0,0 +1,257 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# JD JData Algorithm Competition (3): Exploring High-Potential Users' Behavior"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The competition task is predicting the purchase intent of high-potential users, so a clear understanding of **what a high-potential user is** is crucial for the data analysis, the feature extraction, and the modelling that follows.  \n",
+    "In short, a high-potential user in the training set should have the following traits:\n",
+    "- there is at least one purchase\n",
+    "- the gap between the purchase of an item and the other interactions with it (browsing, clicks, favourites, etc.) should be **more than one day** \n",
+    "  According to the problem statement we need to predict purchases over the next 5 days, so if a user completes all interactions with an item (including the purchase) on the same day, \n",
+    "  such a transaction tells us nothing that can guide future predictions."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Next, let us first try to identify these high-potential users and then do some analysis of their behaviour."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Import the relevant packages\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "\n",
+    "# Define the file names\n",
+    "ACTION_201602_FILE = \"data_ori/JData_Action_201602.csv\"\n",
+    "ACTION_201603_FILE = \"data_ori/JData_Action_201603.csv\"\n",
+    "ACTION_201603_EXTRA_FILE = \"data_ori/JData_Action_201603_extra.csv\"\n",
+    "ACTION_201604_FILE = \"data_ori/JData_Action_201604.csv\"\n",
+    "COMMENT_FILE = \"data/JData_Comment.csv\"\n",
+    "PRODUCT_FILE = \"data/JData_Product.csv\"\n",
+    "USER_FILE = \"data/JData_User.csv\"\n",
+    "NEW_USER_FILE = \"data/JData_User_New.csv\"\n",
+    "USER_TABLE_FILE = \"data/user_table.csv\"\n",
+    "BUY_USER_LIST_FILE = \"data/buy_user_list.csv\"\n",
+    "PROTENTIAL_USER_RECORD = \"data/protential_user_record.csv\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Finding users with purchase records"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Find the user-item pairs with a purchase record in one file\n",
+    "def buy_user_in_batch_data(fname, chunk_size=100000):\n",
+    "    reader = pd.read_csv(fname, header=0, iterator=True)\n",
+    "    chunks = []\n",
+    "    loop = True\n",
+    "    while loop:\n",
+    "        try:\n",
+    "            chunk = reader.get_chunk(chunk_size)[\n",
+    "                [\"user_id\", \"sku_id\", \"type\"]]\n",
+    "            chunks.append(chunk)\n",
+    "        except StopIteration:\n",
+    "            loop = False\n",
+    "            print(\"Iteration is stopped\")\n",
+    "\n",
+    "    df_ac = pd.concat(chunks, ignore_index=True)\n",
+    "\n",
+    "    # type = 4 means purchase\n",
+    "    df_ac = df_ac[df_ac['type'] == 4][[\"user_id\", \"sku_id\"]]\n",
+    "\n",
+    "    return df_ac"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Find the users with purchase records and write them to a csv file\n",
+    "def find_buy_user():\n",
+    "    df_ac = []\n",
+    "    df_ac.append(buy_user_in_batch_data(fname=ACTION_201602_FILE))\n",
+    "    df_ac.append(buy_user_in_batch_data(fname=ACTION_201603_FILE))\n",
+    "    df_ac.append(buy_user_in_batch_data(fname=ACTION_201603_EXTRA_FILE))\n",
+    "    df_ac.append(buy_user_in_batch_data(fname=ACTION_201604_FILE))\n",
+    "    \n",
+    "    # Merge the per-file records into one dataframe\n",
+    "    df_ac = pd.concat(df_ac, ignore_index=True)\n",
+    "    # Drop duplicate user-item pairs\n",
+    "    df_ac = df_ac.drop_duplicates()\n",
+    "    # Write to file\n",
+    "    df_ac.to_csv(BUY_USER_LIST_FILE, index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Run it\n",
+    "find_buy_user()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Find, in one file, all records related to the given user-item pairs\n",
+    "def ui_record_in_batch_data(fname, ui_pair, chunk_size=100000):\n",
+    "    reader = pd.read_csv(fname, header=0, iterator=True)\n",
+    "    chunks = []\n",
+    "    loop = True\n",
+    "    while loop:\n",
+    "        try:\n",
+    "            chunk = reader.get_chunk(chunk_size)[\n",
+    "                [\"user_id\", \"sku_id\", \"time\", \"type\"]]\n",
+    "            chunks.append(chunk)\n",
+    "        except StopIteration:\n",
+    "            loop = False\n",
+    "            print(\"Iteration is stopped\")\n",
+    "\n",
+    "    df_ac = pd.concat(chunks, ignore_index=True)\n",
+    "    \n",
+    "    df = []\n",
+    "    for index, row in ui_pair.iterrows():\n",
+    "        usr_id = row[\"user_id\"]\n",
+    "        sku_id = row[\"sku_id\"]\n",
+    "\n",
+    "        # Collect all records for this user-item pair\n",
+    "        df.append(df_ac[(df_ac[\"user_id\"] == usr_id) &\n",
+    "                        (df_ac[\"sku_id\"] == sku_id)])\n",
+    "\n",
+    "    df = pd.concat(df, ignore_index=True)\n",
+    "    return df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# apply helper: given all records of one user-item pair, decide whether it is a high-potential case\n",
+    "def more_than_a_day(group):\n",
+    "    # date of the last purchase of this item\n",
+    "    last_buy_day = max(group[group[\"type\"] == 4][\"date\"])\n",
+    "    # date of the earliest interaction with this item\n",
+    "    earliest_behave_day = min(group[\"date\"])\n",
+    "    \n",
+    "    # if the purchase came at least one day later, treat it as high-potential\n",
+    "    if (last_buy_day - earliest_behave_day).days > 0:\n",
+    "        # the potential_flag field marks whether this is a high-potential case\n",
+    "        group[\"potential_flag\"] = 1\n",
+    "    else:\n",
+    "        group[\"potential_flag\"] = 0\n",
+    "\n",
+    "    return group"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Find the high-potential users and write the related action records to a file\n",
+    "def find_potential_user():\n",
+    "    # user-item pairs that have a purchase\n",
+    "    ui_pair = pd.read_csv(BUY_USER_LIST_FILE, header=0)\n",
+    "\n",
+    "    df_ac = []\n",
+    "    df_ac.append(ui_record_in_batch_data(ACTION_201602_FILE, ui_pair))\n",
+    "    df_ac.append(ui_record_in_batch_data(ACTION_201603_FILE, ui_pair))\n",
+    "    df_ac.append(ui_record_in_batch_data(ACTION_201603_EXTRA_FILE, ui_pair))\n",
+    "    df_ac.append(ui_record_in_batch_data(ACTION_201604_FILE, ui_pair))\n",
+    "\n",
+    "    df_ac = pd.concat(df_ac, ignore_index=True)\n",
+    "    # drop duplicates\n",
+    "    df_ac = df_ac.drop_duplicates()\n",
+    "    \n",
+    "    # add a date column\n",
+    "    df_ac['date'] = pd.to_datetime(df_ac['time']).dt.date\n",
+    "    df_ac = df_ac.groupby([\"user_id\", \"sku_id\"]).apply(more_than_a_day)\n",
+    "    \n",
+    "    # keep only the high-potential records\n",
+    "    df_ac = df_ac[df_ac[\"potential_flag\"] == 1]\n",
+    "    # write to file\n",
+    "    df_ac.to_csv(PROTENTIAL_USER_RECORD, index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Run it\n",
+    "find_potential_user()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 2",
+   "language": "python",
+   "name": "python2"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}
diff --git a/potential_user.py b/potential_user.py
new file mode 100644
index 0000000..580e965
--- /dev/null
+++ b/potential_user.py
@@ -0,0 +1,122 @@
+#-*- coding: utf-8 -*-
+
+import pandas as pd
+import numpy as np
+
+ACTION_201602_FILE = "data_ori/JData_Action_201602.csv"
+ACTION_201603_FILE = "data_ori/JData_Action_201603.csv"
+ACTION_201603_EXTRA_FILE = "data_ori/JData_Action_201603_extra.csv"
+ACTION_201604_FILE = "data_ori/JData_Action_201604.csv"
+COMMENT_FILE = "data/JData_Comment.csv"
+PRODUCT_FILE = "data/JData_Product.csv"
+USER_FILE = "data/JData_User.csv"
+NEW_USER_FILE = "data/JData_User_New.csv"
+USER_TABLE_FILE = "data/user_table.csv"
+BUY_USER_LIST_FILE = "data/buy_user_list.csv"
+PROTENTIAL_USER_RECORD = "data/protential_user_record.csv"
+
+
+def ui_record_in_batch_data(fname, ui_pair, chunk_size=100000):
+    reader = pd.read_csv(fname, header=0, iterator=True)
+    chunks = []
+    loop = True
+    while loop:
+        try:
+            chunk = reader.get_chunk(chunk_size)[
+                ["user_id", "sku_id", "time", "type"]]
+            chunks.append(chunk)
+        except StopIteration:
+            loop = False
+            print("Iteration is stopped")
+
+    df_ac = pd.concat(chunks, ignore_index=True)
+
+    df = []
+    for index, row in ui_pair.iterrows():
+        usr_id = row["user_id"]
+        sku_id = row["sku_id"]
+
+        # find U-I related record
+        df.append(df_ac[(df_ac["user_id"] == usr_id) &
+                        (df_ac["sku_id"] == sku_id)])
+
+    df = pd.concat(df, ignore_index=True)
+
+    return df
+
+
+def more_than_a_day(group):
+
+    last_buy_day = max(group[group["type"] == 4]["date"])
+    earliest_behave_day = min(group["date"])
+
+    if (last_buy_day - earliest_behave_day).days > 0:
+        group["potential_flag"] = 1
+    else:
+        group["potential_flag"] = 0
+
+    return group
+
+
+def find_potential_user():
+
+    ui_pair = pd.read_csv(BUY_USER_LIST_FILE, header=0)
+
+    # ui_pair = ui_pair.head(5)
+
+    df_ac = []
+    df_ac.append(ui_record_in_batch_data(ACTION_201602_FILE, ui_pair))
+    df_ac.append(ui_record_in_batch_data(ACTION_201603_FILE, ui_pair))
+    df_ac.append(ui_record_in_batch_data(ACTION_201603_EXTRA_FILE, ui_pair))
+    df_ac.append(ui_record_in_batch_data(ACTION_201604_FILE, ui_pair))
+
+    df_ac = pd.concat(df_ac, ignore_index=True)
+    df_ac = df_ac.drop_duplicates()
+
+    # df_ac = potential_user_in_batch_data(ACTION_201602_FILE, ui_pair)
+
+    df_ac['date'] = pd.to_datetime(df_ac['time']).dt.date
+
+    df_ac = df_ac.groupby(["user_id", "sku_id"]).apply(more_than_a_day)
+
+    df_ac = df_ac[df_ac["potential_flag"] == 1]
+
+    df_ac.to_csv(PROTENTIAL_USER_RECORD, index=False)
+
+
+def buy_user_in_batch_data(fname, chunk_size=100000):
+    reader = pd.read_csv(fname, header=0, iterator=True)
+    chunks = []
+    loop = True
+    while loop:
+        try:
+            chunk = reader.get_chunk(chunk_size)[
+                ["user_id", "sku_id", "type"]]
+            chunks.append(chunk)
+        except StopIteration:
+            loop = False
+            print("Iteration is stopped")
+
+    df_ac = pd.concat(chunks, ignore_index=True)
+
+    # find buy record
+    df_ac = df_ac[df_ac['type'] == 4][["user_id", "sku_id"]]
+
+    return df_ac
+
+
+def find_buy_user():
+    df_ac = []
+    df_ac.append(buy_user_in_batch_data(fname=ACTION_201602_FILE))
+    df_ac.append(buy_user_in_batch_data(fname=ACTION_201603_FILE))
+    df_ac.append(buy_user_in_batch_data(fname=ACTION_201603_EXTRA_FILE))
+    df_ac.append(buy_user_in_batch_data(fname=ACTION_201604_FILE))
+
+    df_ac = pd.concat(df_ac, ignore_index=True)
+    df_ac = df_ac.drop_duplicates()
+
+    df_ac.to_csv(BUY_USER_LIST_FILE, index=False)
+
+
+find_buy_user()
+find_potential_user()
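
A note on the pair filtering above, outside the patch itself: ui_record_in_batch_data re-filters the concatenated action frame once per row of ui_pair, so the work grows with the number of purchased user-item pairs. Under the same column assumptions, a single inner merge on the two key columns selects the same rows in one pass; the helper name ui_records_via_merge below is hypothetical, a sketch rather than part of the committed code.

import pandas as pd

def ui_records_via_merge(df_ac, ui_pair):
    # df_ac: action records with user_id, sku_id, time, type columns
    # ui_pair: (user_id, sku_id) pairs read from BUY_USER_LIST_FILE (assumed columns)
    pairs = ui_pair[["user_id", "sku_id"]].drop_duplicates()
    # the inner merge keeps exactly the rows whose (user_id, sku_id) appears in pairs
    return df_ac.merge(pairs, on=["user_id", "sku_id"], how="inner")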
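
Similarly, groupby(["user_id", "sku_id"]).apply(more_than_a_day) calls a Python function once per user-item group, which is slow on millions of action rows. Below is a minimal sketch, not part of the patch, of the same "purchase at least one day after the first interaction" rule computed with plain groupby aggregations; flag_potential_pairs is a hypothetical name, and normalized timestamps stand in for the patch's dt.date column.

import pandas as pd

def flag_potential_pairs(df_ac):
    # df_ac: deduplicated action records with user_id, sku_id, type, time columns (assumed)
    df_ac = df_ac.copy()
    df_ac["date"] = pd.to_datetime(df_ac["time"]).dt.normalize()

    # earliest interaction per pair, and latest purchase (type == 4) per pair
    first_act = df_ac.groupby(["user_id", "sku_id"])["date"].min()
    last_buy = df_ac[df_ac["type"] == 4].groupby(["user_id", "sku_id"])["date"].max()

    # align the two series on the pair index; pairs without a purchase are dropped
    stats = pd.DataFrame({"first_act": first_act, "last_buy": last_buy}).dropna()
    # high-potential: the purchase happened at least one day after the first interaction
    keep = stats[(stats["last_buy"] - stats["first_act"]).dt.days > 0].reset_index()

    # return every action record of the flagged user-item pairs
    return df_ac.merge(keep[["user_id", "sku_id"]], on=["user_id", "sku_id"], how="inner")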