Skip to content

Commit

Permalink
Completed web scrapers and Spotify album ID requester to create embed…
Browse files Browse the repository at this point in the history
…ded player HTML code
  • Loading branch information
dnaeye committed Dec 30, 2020
1 parent 0ac5cbe commit cd492a0
Show file tree
Hide file tree
Showing 5 changed files with 152 additions and 11 deletions.
6 changes: 6 additions & 0 deletions .idea/vcs.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

22 changes: 18 additions & 4 deletions boomkat.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
# Web scraper for Boomkat End-of-Year Top Releases
# Started 2020-12-28
# Updated 2020-12-28

# Import libraries
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

url_2019 = "https://boomkat.com/charts/boomkat-end-of-year-charts-2019/940"
url_2020 = "https://boomkat.com/charts/boomkat-end-of-year-charts-2020/1234"
Expand All @@ -23,10 +25,12 @@ def get_page(year):

results = page.find_all(class_='chart-item')

df = pd.DataFrame(columns=['rank', 'artist', 'album', 'genre', 'review'])
i = 1

for result in results:
try:
rank = i
release = result.find('div', class_='chart-item-content-mobile show-for-small-only')
artist = str(release.find(class_='release__artist').text.strip()).title()
#if len(re.search(r'^.{2}\s', artist).group()) == 3:
Expand All @@ -35,23 +39,33 @@ def get_page(year):
# artist = prefix + name
#else:
# pass
title = release.find(class_='release__title').text.strip()
album = release.find(class_='release__title').text.strip()
try:
genre = release.find(class_='release__genre').text.strip()
except:
genre = "Unknown"
try:
review = result.find('div', class_='chart-item-review').text.strip()[0:500].replace("\r","").replace("\n\n"," ")
review = result.find('div', class_='chart-item-review').text.strip()[0:500] \
.replace("\r", "").replace("\n\n", " ")
period = review.rfind('.')
question = review.rfind('?')
exclamation = review.rfind('!')
ends = [period, question, exclamation]
max_end = max(ends) + 1
if max(ends) < 1:
space = review.rfind(' ')
max_end = space
else:
max_end = max(ends) + 1
short_review = review[0:max_end]
except:
short_review = "No review available"
print(str(i) + " : " + artist, "-", title, ";", genre, "; Review: ", short_review)

df = df.append({'rank': rank, 'artist': artist, 'album': album, 'genre': genre, 'review': short_review}, ignore_index=True)
print('Rank ' + str(i) + " : " + artist, "-", album, ";", genre, "; Review: ", short_review)
i += 1

except:
print("Error getting chart item")

filename = "boomkat_" + user_year + ".csv"
df.to_csv(filename, index=False, encoding='utf-8-sig')
39 changes: 32 additions & 7 deletions pitchfork.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
# Web scraper for Pitchfork End-of-Year Top Releases
# Started 2020-12-28
# Updated 2020-12-28

# Import libraries
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

base_url = "https://pitchfork.com/features/lists-and-guides/best-albums-"

Expand All @@ -18,28 +19,52 @@ def get_page(year):

page = get_page(user_year)

df = pd.DataFrame(columns=['rank', 'artist', 'album'])
i = 50
titles = page.find_all('h2')

for title in titles:
rank = i
item = title.text.split(":")
artist = item[0].strip()
album = item[1].strip()

print(artist)
print(album + "\n")
print('Rank ' + str(i), ":", artist, "-", album)
df = df.append({'rank': i, 'artist': artist, 'album': album}, ignore_index=True)
i -= 1

tl = []

paragraphs = page.find_all('p')

for paragraph in paragraphs:
text = paragraph.text
text = paragraph.text[0:500]
words = text.split(" ")
paragraph_len = len(words)

period = text.rfind('.')
question = text.rfind('?')
exclamation = text.rfind('!')
ends = [period, question, exclamation]
max_end = max(ends)
max_end = max(ends) + 1

if paragraph_len > 10 and max_end > 1:
short_review = text[0:max_end]
print(short_review + "\n")
tl.append(short_review)
else:
pass

tf = pd.Series(tl)

# Manually exclude the page's intro and closing paragraphs, plus the extra paragraphs of reviews that run longer than one paragraph
if user_year == '2020':
rf = tf[3:-1]
rf = rf.drop([8, 27, 50], axis=0)
rf = rf.reset_index(drop=True)

df['review'] = rf
df = df.sort_values('rank', ascending=True)

if paragraph_len > 10 and max_end > 0:
print(text)
filename = "pitchfork_" + user_year + ".csv"
df.to_csv(filename, index=False, encoding='utf-8-sig')
96 changes: 96 additions & 0 deletions spotify.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
# Search for Spotify ID for given album and create HTML code for embedded player
# Started 2020-12-29
# Updated 2020-12-29

# Import libraries
import pandas as pd
import requests
import json
from spotify_secrets import * # Simple .py file declaring client_id and client_secret variables

# Authorization
# Client-credentials grant: exchange the app's client id/secret for a bearer token.
token_endpoint = "https://accounts.spotify.com/api/token"

auth_response = requests.post(
    token_endpoint,
    data={
        'grant_type': 'client_credentials',
        'client_id': client_id,
        'client_secret': client_secret,
    },
)

# Stop here with a clear HTTP error (e.g. rejected credentials) rather than
# failing later with a KeyError on 'access_token'.
auth_response.raise_for_status()

auth_response_data = auth_response.json()

token = auth_response_data['access_token']

# Search
endpoint = "https://api.spotify.com/v1/search"

# Import data
class import_data:
    """Prompt for and hold the site code and year typed by the user.

    Attributes:
        site: Site code as entered, trimmed and lower-cased
            (the prompt offers 'p' for Pitchfork and 'b' for Boomkat).
        year: Year as entered, trimmed of surrounding whitespace.
    """

    def __init__(self):
        # Normalise the answers so "P" or "p " is not rejected by the
        # exact-match site-code check that consumes this object.
        self.site = input("Enter site code (Pitchfork=p, Boomkat=b): ").strip().lower()
        self.year = input("Enter year: ").strip()

def get_input():
    """Prompt the user and return the resulting import_data instance."""
    answers = import_data()
    return answers

user_input = get_input()

# Translate the one-letter site code into the prefix used in the CSV filenames.
site_names = {'p': 'pitchfork', 'b': 'boomkat'}
site = site_names.get(user_input.site)

if site is None:
    # Without a valid site there is no file to open; exit cleanly instead of
    # falling through to a NameError when the filename is built.
    print('Enter appropriate code.')
    raise SystemExit(1)

year = user_input.year

# Input file is expected to be named "<site>_<year>.csv".
filename = site + "_" + year + ".csv"

df = pd.read_csv(filename)

# One entry per row of df, in row order: either the embeddable player HTML
# or a "Not available" placeholder.
html = []

# The bearer token is the same for every request, so build the header once.
headers = {'Authorization': 'Bearer {token}'.format(token=token)}

# Embeddable player HTML code
# See https://developer.spotify.com/documentation/widgets/generate/embed/
embed_template = (
    '<iframe src="https://open.spotify.com/embed/album/{album_id}"'
    ' width="300" height="80" frameborder="0"'
    ' allowtransparency="true" allow="encrypted-media"></iframe>'
)

for artist, album in zip(df['artist'], df['album']):
    artist = artist.replace(" / ", ", ")

    # Rows whose artist contains "various" are searched by album title only.
    if 'various' in artist.lower():
        artist_param = ''
    else:
        artist_param = ' artist:' + artist
    query = 'album:' + album + artist_param

    params = {'q': query, 'type': 'album', 'market': 'US'}

    data = requests.get(
        endpoint,
        headers=headers,
        params=params
    )

    print('artist:', artist, 'album', album, data.status_code)

    code = 'Not available'
    if data.status_code == 200:
        # Parse the body once and use the first search hit, if any.
        items = data.json()['albums']['items']
        if items:
            code = embed_template.format(album_id=items[0]['id'])

    html.append(code)

# html has exactly one entry per row, so it lines up with df positionally.
df['html'] = html

df.to_csv(site + "_" + year + "_html.csv", index=False)
Empty file added spotify_secrets.py
Empty file.

0 comments on commit cd492a0

Please sign in to comment.