
Commit

first update
iphysresearch committed May 26, 2018
1 parent 859086c commit f49bdb4
Showing 213 changed files with 641,008 additions and 0 deletions.
20,780 changes: 20,780 additions & 0 deletions case_data/movie_comment100.json
20,780 changes: 20,780 additions & 0 deletions case_data/movie_comment120.json
20,780 changes: 20,780 additions & 0 deletions case_data/movie_comment140.json
20,780 changes: 20,780 additions & 0 deletions case_data/movie_comment160.json
20,780 changes: 20,780 additions & 0 deletions case_data/movie_comment180.json
20,780 changes: 20,780 additions & 0 deletions case_data/movie_comment20.json
20,780 changes: 20,780 additions & 0 deletions case_data/movie_comment200.json
20,980 changes: 20,980 additions & 0 deletions case_data/movie_comment225.json
20,780 changes: 20,780 additions & 0 deletions case_data/movie_comment250.json
20,780 changes: 20,780 additions & 0 deletions case_data/movie_comment40.json
20,780 changes: 20,780 additions & 0 deletions case_data/movie_comment60.json
20,780 changes: 20,780 additions & 0 deletions case_data/movie_comment80.json
246 changes: 246 additions & 0 deletions case_data/movie_item.json
5,000 changes: 5,000 additions & 0 deletions case_data/movie_people10000.json
4,587 changes: 4,587 additions & 0 deletions case_data/movie_people15000.json
5,000 changes: 5,000 additions & 0 deletions case_data/movie_people20000.json
4,152 changes: 4,152 additions & 0 deletions case_data/movie_people25000.json
4,197 changes: 4,197 additions & 0 deletions case_data/movie_people30000.json
4,072 changes: 4,072 additions & 0 deletions case_data/movie_people35000.json
2,866 changes: 2,866 additions & 0 deletions case_data/movie_people40000.json
5,000 changes: 5,000 additions & 0 deletions case_data/movie_people5000.json
7,492 changes: 7,492 additions & 0 deletions data_cleaning&feature_engineering/.ipynb_checkpoints/Filting-checkpoint.ipynb
7,492 changes: 7,492 additions & 0 deletions data_cleaning&feature_engineering/Filting.ipynb

Binary files not shown (14 files).
Binary file added data_cleaning&feature_engineering/cover.jpg
274 changes: 274 additions & 0 deletions data_cleaning&feature_engineering/model.vec
1,968 changes: 1,968 additions & 0 deletions data_cleaning&feature_engineering/train_data_supervised_fasttext.txt
1,968 changes: 1,968 additions & 0 deletions data_cleaning&feature_engineering/train_data_unsupervised_fasttext.txt

Binary files not shown (2 files).
@@ -0,0 +1 @@
GEOGCS["GCS_WGS_1984",DATUM["D_WGS_1984",SPHEROID["WGS_1984",6378137,298.257223563]],PRIMEM["Greenwich",0],UNIT["Degree",0.017453292519943295]]
Binary files not shown (3 files).
@@ -0,0 +1,3 @@
<?xml version="1.0"?>
<!--<!DOCTYPE metadata SYSTEM "http://www.esri.com/metadata/esriprof80.dtd">-->
<metadata xml:lang="en"><Esri><MetaID>{7BF31F4C-0ACE-4D07-A747-9F6A8485CB3A}</MetaID><CreaDate>20110324</CreaDate><CreaTime>14020800</CreaTime><SyncOnce>TRUE</SyncOnce><DataProperties><lineage><Process ToolSource="C:\Program Files\ArcGIS\ArcToolbox\Toolboxes\Data Management Tools.tbx\RepairGeometry" Date="20110324" Time="140208">RepairGeometry World_countries_shp DELETE_NULL World_countries_shp</Process></lineage></DataProperties></Esri></metadata>
Binary file not shown.
1 change: 1 addition & 0 deletions douban_movie/.floydexpt
@@ -0,0 +1 @@
{"family_id": "DFFupXVj53JYsMV72VpUnh", "name": "douban_movie_coment"}
15 changes: 15 additions & 0 deletions douban_movie/.floydignore
@@ -0,0 +1,15 @@

# Directories and files to ignore when uploading code to floyd

.git
.eggs
eggs
lib
lib64
parts
sdist
var
*.pyc
*.swp
.DS_Store
data
99 changes: 99 additions & 0 deletions douban_movie/README
@@ -0,0 +1,99 @@

README file of Scrapy project for douban_movie
------------------------------------------------------------------

Before you crawl anything, you need to make sure some packages are installed.
You can install them by typing the following in your terminal:

>> pip install scrapy faker selenium


If there is no ‘data’ directory, please create it; it will store the
JSON files you crawl from the internet:

>> mkdir data


Then, change into the ‘bin’ directory, from which the Scrapy project is run:

>> cd bin

Next, download and unzip the PhantomJS package in the ‘bin’ directory:

>> wget https://bitbucket.org/ariya/phantomjs/downloads/phantomjs-2.1.1-linux-x86_64.tar.bz2

>> tar -jxvf phantomjs-2.1.1-linux-x86_64.tar.bz2

Finally, we can crawl!
You can list all the spiders by typing the command:

>> scrapy list

==============================================================

STEP 1: Crawl for movie_item:

Just run:

>> scrapy crawl douban-movie

# This spider contains my Douban account and password; I was too lazy to change them, so please keep them confidential...
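
For reference, since the project installs selenium and downloads PhantomJS,
the login step presumably drives a headless browser roughly like the minimal
sketch below. This is an assumption, not the project's actual code; the URL,
element ids, and credentials are placeholders:

from selenium import webdriver

# placeholder path: adjust to wherever phantomjs was unzipped under ./bin
driver = webdriver.PhantomJS(
    executable_path='./phantomjs-2.1.1-linux-x86_64/bin/phantomjs')
driver.get('https://accounts.douban.com/login')                 # placeholder URL
driver.find_element_by_id('email').send_keys('my_account')      # placeholder id
driver.find_element_by_id('password').send_keys('my_password')  # placeholder id
driver.find_element_by_class_name('btn-submit').click()         # placeholder class
cookies = driver.get_cookies()  # cookies can then be attached to Scrapy requests
driver.quit()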

==============================================================

STEP 2: Crawl for movie_comment

Just run the following command one by one:

>> scrapy crawl douban-comment20 -a pages=1000
>> scrapy crawl douban-comment40 -a pages=1000
>> scrapy crawl douban-comment60 -a pages=1000
>> scrapy crawl douban-comment80 -a pages=1000
>> scrapy crawl douban-comment100 -a pages=1000
>> scrapy crawl douban-comment120 -a pages=1000
>> scrapy crawl douban-comment140 -a pages=1000
>> scrapy crawl douban-comment160 -a pages=1000
>> scrapy crawl douban-comment180 -a pages=1000
>> scrapy crawl douban-comment200 -a pages=1000
>> scrapy crawl douban-comment220 -a pages=1000
>> scrapy crawl douban-comment225 -a pages=1000
>> scrapy crawl douban-comment250 -a pages=1000

Here the 250 movies have been split into 13 parts to crawl, and the number
of pages is specified as a parameter (1000 by default).
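
For reference, Scrapy hands the '-a pages=1000' argument to the spider's
__init__ as a keyword argument. A minimal sketch of how such a spider can
accept it (the class name, subject id, and URL below are placeholders, not
the project's actual code):

import scrapy

class DoubanComment20Spider(scrapy.Spider):
    name = 'douban-comment20'

    def __init__(self, pages=1000, *args, **kwargs):
        super(DoubanComment20Spider, self).__init__(*args, **kwargs)
        self.pages = int(pages)  # '-a pages=...' arrives as a string

    def start_requests(self):
        # Douban shows 20 comments per page; walk up to self.pages pages
        for start in range(0, self.pages * 20, 20):
            yield scrapy.Request(
                'https://movie.douban.com/subject/1292052/comments?start=%d'
                % start)

    def parse(self, response):
        # extract comment fields here and yield items
        pass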

(HIGH LEVEL!)
Actually, you can crawl with all the douban-comment spiders at once,
but then you would also be banned at once! Instead, crawl the douban-comment
spiders two at a time by running:

>> scrapy crawlallcomment

and editing the spider list in ./douban_movie/commands/crawlallcomment.py,
as sketched below.
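
For reference, a custom command like crawlallcomment works by pointing the
COMMANDS_MODULE setting at the commands package. The file can be sketched
roughly as follows; the exact batching in the project's version may differ:

from scrapy.commands import ScrapyCommand

class Command(ScrapyCommand):
    requires_project = True

    def short_desc(self):
        return 'Crawl a batch of douban-comment spiders'

    def run(self, args, opts):
        # edit this list to choose which two spiders run in this batch
        for name in ['douban-comment20', 'douban-comment40']:
            self.crawler_process.crawl(name, pages='1000')
        self.crawler_process.start()

This assumes settings.py contains COMMANDS_MODULE = 'douban_movie.commands';
Scrapy derives the command name from the module's file name.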
==============================================================

STEP 3: Crawl for movie_people

Just run the following command one by one:

>> scrapy crawl douban-people5000
>> scrapy crawl douban-people10000
>> scrapy crawl douban-people15000
>> scrapy crawl douban-people20000
>> scrapy crawl douban-people25000
>> scrapy crawl douban-people30000
>> scrapy crawl douban-people35000
>> scrapy crawl douban-people40000

Here the 35,776 people have been split into 8 parts to crawl.

(HIGH LEVEL!)
Likewise, you can crawl with all the douban-people spiders at once by typing:

>> scrapy crawlallpeople

However, you would without doubt be banned! You can modify the spider list in
./douban_movie/commands/crawlallpeople.py in the same way as for
crawlallcomment.


