Skip to content

Commit

Permalink
Add S3 remote and cleanup local data
Browse files Browse the repository at this point in the history
  • Loading branch information
dcereijodo committed Dec 29, 2020
1 parent e99305f commit 1bdc2e0
Show file tree
Hide file tree
Showing 9 changed files with 127 additions and 20,672 deletions.
6 changes: 3 additions & 3 deletions .dvc/config
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
[core]
remote = myremote
['remote "myremote"']
url = /tmp/dvc-storage
remote = s3
['remote "s3"']
url = s3://player-scores/dvc
1 change: 1 addition & 0 deletions data/.gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
/appearances.json
/appearances.csv
4 changes: 4 additions & 0 deletions data/appearances.csv.dvc
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
outs:
- md5: 5d0db76ab79867fa4b683b230b12547d
size: 585924
path: appearances.csv
4,024 changes: 0 additions & 4,024 deletions data/tfmkt__2019-08-03__GB1_ES1.json

This file was deleted.

16,556 changes: 0 additions & 16,556 deletions data/tfmkt__2019-08-03__GB1_ES1__prep.csv

This file was deleted.

1 change: 0 additions & 1 deletion infra/.gitignore

This file was deleted.

17 changes: 0 additions & 17 deletions infra/main.tf

This file was deleted.

182 changes: 115 additions & 67 deletions prep/prep.ipynb

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions prep/prep_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def add_new_columns(df: pandas.DataFrame) -> pandas.DataFrame:
return df_new

# - improved columns:
# - yellow_cards / red_cards (no need for second_yellows)
# - yellow_cards / red_cards (no need for second_yellow_cards)
# - club name formatting: fc-watford -> FC Watford
# - player name formatting: adam-masina -> Adam Masina
# - position: use longer names instead of the cryptic 'LB', etc. (use 'filter by position' in https://www.transfermarkt.co.uk/diogo-jota/leistungsdatendetails/spieler/340950/saison/2020/verein/0/liga/0/wettbewerb/GB1/pos/0/trainer_id/0/plus/1)
Expand All @@ -79,16 +79,16 @@ def improve_columns(df: pandas.DataFrame) -> pandas.DataFrame:
df['date'] = pandas.to_datetime(df['date'])

# reshape cards columns
df['yellow_cards'] = (df['yellow_cards'] != '0').astype('int32') + (df['second_yellows'] != '0').astype('int32')
del df['second_yellows']
df['yellow_cards'] = (df['yellow_cards'] != '0').astype('int32') + (df['second_yellow_cards'] != '0').astype('int32')
del df['second_yellow_cards']

df['red_cards'] = (df['red_cards'] != '0').astype('int32')

return df

def filter_appearances(df: pandas.DataFrame) -> pandas.DataFrame:
# keep a single season only; this drops other seasons such as 2017, for which we only have partial data
df = df[df['season'] == 2018]
df = df[df['season'] == 2020]

domestic_competitions = [
'ES1', 'GB1', 'DFB'
Expand Down

0 comments on commit 1bdc2e0

Please sign in to comment.