Skip to content

Commit

Permalink
Tutorial scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
dmpetrov committed May 3, 2017
1 parent df33769 commit cd99d48
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 23 deletions.
1 change: 1 addition & 0 deletions functest/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
dvc
pandas
sklearn
scipy
51 changes: 28 additions & 23 deletions functest/tutorial_end_to_end.sh
Original file line number Diff line number Diff line change
@@ -1,48 +1,53 @@

# 1. First ML model
# NOTE(review): this section reads like a rendered diff in which removed and
# added lines are interleaved without +/- markers (for example both the
# metrics.py and the evaluate.py steps are present). Confirm against the real
# tutorial_end_to_end.sh before running any of it.
# Initialise DVC, import the 25K posts archive into data/ and unpack it there
# via `dvc run`.
dvc init
dvc import https://s3-us-west-2.amazonaws.com/dvc-share/so/25K/Posts.xml.tgz data/
dvc run tar zxf data/Posts.xml.tgz -C data/

# Create a working directory named myrepo with a code/ subdirectory inside it.
mkdir myrepo
cd myrepo
mkdir code
# Download the tutorial's Python sources (and requirements.txt) into code/
# (-P sets the target directory, -nv reduces wget's output).
# NOTE(review): as written, the second `wget -nv -P code/` follows a line
# continuation and is therefore passed as arguments to the first wget, and the
# xml_to_tsv.py line without a trailing backslash ends the command, so the
# following URL lines would be executed as a separate command. Presumably these
# are artifacts of the interleaved diff - verify.
wget -nv -P code/ https://s3-us-west-2.amazonaws.com/dvc-share/so/code/df_to_matrix.py \
https://s3-us-west-2.amazonaws.com/dvc-share/so/code/metrics.py \
wget -nv -P code/ https://s3-us-west-2.amazonaws.com/dvc-share/so/code/featurization.py \
https://s3-us-west-2.amazonaws.com/dvc-share/so/code/evaluate.py \
https://s3-us-west-2.amazonaws.com/dvc-share/so/code/train_model.py \
https://s3-us-west-2.amazonaws.com/dvc-share/so/code/train_test_split.py \
https://s3-us-west-2.amazonaws.com/dvc-share/so/code/xml_to_tsv.py
https://s3-us-west-2.amazonaws.com/dvc-share/so/code/split_train_test.py \
https://s3-us-west-2.amazonaws.com/dvc-share/so/code/xml_to_tsv.py \
https://s3-us-west-2.amazonaws.com/dvc-share/so/code/requirements.txt
# Install the Python packages listed in the downloaded requirements file.
pip install -r code/requirements.txt

# Create a git repository and commit the downloaded code/ directory.
git init
git add code/
git commit -m 'Download code'


# NOTE(review): a second init/import/unpack sequence, this time with the 10K
# archive; presumably the other side of the diff for the 25K sequence at the
# top of this section - confirm which one is current.
dvc init
dvc import https://s3-us-west-2.amazonaws.com/dvc-share/so/10K/Posts.xml.tgz data/
dvc run tar zxf data/Posts.xml.tgz -C data/

# Convert data/Posts.xml to data/Posts.tsv; the trailing `python` is passed as
# an argument to xml_to_tsv.py (presumably a tag filter - confirm).
dvc run python code/xml_to_tsv.py data/Posts.xml data/Posts.tsv python
# Split the TSV into train and test files. 0.33 and 20170426 are positional
# arguments (presumably test ratio and random seed - confirm). The two script
# names look like the old/new pair of a rename.
dvc run python code/train_test_split.py data/Posts.tsv 0.33 20170426 data/Posts-train.tsv data/Posts-test.tsv
# Build the train/test matrix files from the split TSVs (df_to_matrix.py and
# featurization.py take the same arguments here).
dvc run python code/df_to_matrix.py data/Posts-train.tsv data/Posts-test.tsv data/matrix-train.p data/matrix-test.p
dvc run python code/split_train_test.py data/Posts.tsv 0.33 20170426 data/Posts-train.tsv data/Posts-test.tsv
dvc run python code/featurization.py data/Posts-train.tsv data/Posts-test.tsv data/matrix-train.p data/matrix-test.p

# Train the model from the training matrix and write data/model.p; the second
# form passes 20170426 as an extra argument (presumably a seed - confirm).
dvc run python code/train_model.py data/matrix-train.p data/model.p
dvc run python code/train_model.py data/matrix-train.p 20170426 data/model.p

# Evaluate the model on the test matrix; metrics.py writes data/summary.txt,
# evaluate.py writes data/evaluation.txt.
dvc run python code/metrics.py data/model.p data/matrix-test.p data/summary.txt
dvc run python code/evaluate.py data/model.p data/matrix-test.p data/evaluation.txt

# Print the evaluation output; the AUC line below is a recorded sample value.
cat data/summary.txt
cat data/evaluation.txt
# AUC: 0.552980

# Exit successfully here; when this text is run as a script, nothing after
# this line is executed.
exit 0

# 2. Reproduce: change input dataset
# NOTE(review): an `exit 0` appears before this section, so these commands are
# not reached when the file is run as a script. summary.txt and evaluation.txt
# both appear below, presumably as the old and new names of the same output
# (interleaved diff lines) - confirm.

# Remove the current input archive with `dvc remove`, import the 25K archive
# into data/ (the 100K import is commented out), then reproduce the final
# output with `dvc repro` and print it.
dvc remove data/Posts.xml.tgz
#dvc import https://s3-us-west-2.amazonaws.com/dvc-share/so/100K/Posts.xml.tgz data/
dvc import https://s3-us-west-2.amazonaws.com/dvc-share/so/25K/Posts.xml.tgz data/
dvc repro data/summary.txt
cat data/summary.txt
dvc repro data/evaluation.txt
cat data/evaluation.txt
# AUC: 0.639861

# 3. Share your research
# NOTE(review): df_to_matrix.py/featurization.py and summary.txt/evaluation.txt
# both appear below, presumably as old/new sides of an interleaved diff -
# confirm which lines belong to the current script.

# Improve features
# Append a single space (plus newline) to the feature script so that its
# content changes, stage it, and commit with the message 'Include bigram'.
# The echo does not itself add any bigram logic.
echo " " >> code/df_to_matrix.py
git add code/df_to_matrix.py
echo " " >> code/featurization.py
git add code/featurization.py
git commit -m 'Include bigram'
# Reproduce the final output after the code change and print it.
dvc repro data/summary.txt
cat data/summary.txt
#

# Commented-out alternative: force-reproduce the training split first (-f),
# then reproduce and print the summary.
#dvc repro data/Posts-train.tsv -f
#dvc repro data/summary.txt
#cat data/summary.txt
dvc repro data/evaluation.txt
cat data/evaluation.txt

0 comments on commit cd99d48

Please sign in to comment.