-
Notifications
You must be signed in to change notification settings - Fork 0
/
glue.py
53 lines (42 loc) · 2.17 KB
/
glue.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
#!/usr/bin/python
import pandas as pd
import os
from joblib import load
# get list of csv files that exist
csv_list = os.listdir('data/pricefetch')
# import csv files to list of Pandas dataframes
df_list = []
for i in range(len(csv_list)):
df_list.append(pd.read_csv(f'data/pricefetch/{csv_list[i]}'))
# turn list into one DataFrame
df_discogs = pd.DataFrame()
df_discogs = pd.concat(df_list, sort=False)
# make "date" column into datetime
df_discogs['date'] = pd.to_datetime(df_discogs['date'])
# drop extra columns -> erroneous index column, currency columns (it's all canadian)
df_discogs = df_discogs.drop(columns=['Unnamed: 0'])
cols = [c for c in df_discogs.columns if c.endswith('currency') != True]
df_discogs = df_discogs[cols]
# rename columns and reset index
df_discogs = df_discogs.rename(index=str, columns={
'Fair (F).value': 'fair', 'Good (G).value': 'good', 'Good Plus (G+).value': 'good_plus',
'Mint (M).value': 'mint', 'Near Mint (NM or M-).value': 'near_mint', 'Poor (P).value': 'poor',
'Very Good (VG).value': 'vg', 'Very Good Plus (VG+).value': 'vg_plus', 'date': 'date',
'release_id': 'release_id'})
df_discogs = df_discogs.reset_index()
# reorder columns
df_discogs = df_discogs[['date', 'release_id', 'poor','fair', 'good', 'good_plus', 'vg', 'vg_plus', 'near_mint', 'mint']]
# import popsike-scraped joblib file, put it into a df
list_popsike = load('data/popsike/release_list_may_24.joblib')
df_popsike = pd.DataFrame(list_popsike, columns=['release_id', 'year', 'title', 'artist', 'avg', 'min', 'max'])
# reorder columns on df_popsike, make 'year' datetime
df_popsike = df_popsike[['release_id', 'year', 'title', 'artist', 'min', 'avg', 'max']]
df_popsike['year'] = pd.to_datetime(df_popsike['year'], format="%Y")
# and get rid of the leading "min", "avg", "max"
df_popsike['min'] = df_popsike['min'].apply(lambda x: x.lstrip('min '))
df_popsike['avg'] = df_popsike['avg'].apply(lambda x: x.lstrip('avg '))
df_popsike['max'] = df_popsike['max'].apply(lambda x: x.lstrip('max '))
# make "min", "avg", "max" into ints
df_popsike['min'] = df_popsike['min'].astype(int)
df_popsike['avg'] = df_popsike['avg'].astype(int)
df_popsike['max'] = df_popsike['max'].astype(int)