From cd492a02451aff8f8ff20471244b2b259d4f0531 Mon Sep 17 00:00:00 2001 From: dnaeye Date: Tue, 29 Dec 2020 22:47:33 -0800 Subject: [PATCH] Completed web scrapers and Spotify album ID requester to create embedded player HTML code --- .idea/vcs.xml | 6 +++ boomkat.py | 22 +++++++++-- pitchfork.py | 39 +++++++++++++++---- spotify.py | 96 ++++++++++++++++++++++++++++++++++++++++++++++ spotify_secrets.py | 0 5 files changed, 152 insertions(+), 11 deletions(-) create mode 100644 .idea/vcs.xml create mode 100644 spotify.py create mode 100644 spotify_secrets.py diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/boomkat.py b/boomkat.py index a03f32f..b337fb4 100644 --- a/boomkat.py +++ b/boomkat.py @@ -1,10 +1,12 @@ # Web scraper for Boomkat End-of-Year Top Releases # Started 2020-12-28 +# Updated 2020-12-28 # Import libraries import requests from bs4 import BeautifulSoup import re +import pandas as pd url_2019 = "https://boomkat.com/charts/boomkat-end-of-year-charts-2019/940" url_2020 = "https://boomkat.com/charts/boomkat-end-of-year-charts-2020/1234" @@ -23,10 +25,12 @@ def get_page(year): results = page.find_all(class_='chart-item') +df = pd.DataFrame(columns=['rank', 'artist', 'album', 'genre', 'review']) i = 1 for result in results: try: + rank = i release = result.find('div', class_='chart-item-content-mobile show-for-small-only') artist = str(release.find(class_='release__artist').text.strip()).title() #if len(re.search(r'^.{2}\s', artist).group()) == 3: @@ -35,23 +39,33 @@ def get_page(year): # artist = prefix + name #else: # pass - title = release.find(class_='release__title').text.strip() + album = release.find(class_='release__title').text.strip() try: genre = release.find(class_='release__genre').text.strip() except: genre = "Unknown" try: - review = result.find('div', class_='chart-item-review').text.strip()[0:500].replace("\r","").replace("\n\n"," ") + review = result.find('div', class_='chart-item-review').text.strip()[0:500] \ + .replace("\r", "").replace("\n\n", " ") period = review.rfind('.') question = review.rfind('?') exclamation = review.rfind('!') ends = [period, question, exclamation] - max_end = max(ends) + 1 + if max(ends) < 1: + space = review.rfind(' ') + max_end = space + else: + max_end = max(ends) + 1 short_review = review[0:max_end] except: short_review = "No review available" - print(str(i) + " : " + artist, "-", title, ";", genre, "; Review: ", short_review) + + df = df.append({'rank': rank, 'artist': artist, 'album': album, 'genre': genre, 'review': short_review}, ignore_index=True) + print('Rank ' + str(i) + " : " + artist, "-", album, ";", genre, "; Review: ", short_review) i += 1 + except: print("Error getting chart item") +filename = "boomkat_" + user_year + ".csv" +df.to_csv(filename, index=False, encoding='utf-8-sig') \ No newline at end of file diff --git a/pitchfork.py b/pitchfork.py index 947259c..dfb4449 100644 --- a/pitchfork.py +++ b/pitchfork.py @@ -1,10 +1,11 @@ # Web scraper for Pitchfork End-of-Year Top Releases # Started 2020-12-28 +# Updated 2020-12-28 # Import libraries import requests from bs4 import BeautifulSoup -import re +import pandas as pd base_url = "https://pitchfork.com/features/lists-and-guides/best-albums-" @@ -18,20 +19,26 @@ def get_page(year): page = get_page(user_year) +df = pd.DataFrame(columns=['rank', 'artist', 'album']) +i = 50 titles = page.find_all('h2') for title in titles: + rank = i item = title.text.split(":") artist = item[0].strip() album = item[1].strip() - print(artist) - print(album + "\n") + print('Rank ' + str(i), ":", artist, "-", album) + df = df.append({'rank': i, 'artist': artist, 'album': album}, ignore_index=True) + i -= 1 + +tl = [] paragraphs = page.find_all('p') for paragraph in paragraphs: - text = paragraph.text + text = paragraph.text[0:500] words = text.split(" ") paragraph_len = len(words) @@ -39,7 +46,25 @@ def get_page(year): question = text.rfind('?') exclamation = text.rfind('!') ends = [period, question, exclamation] - max_end = max(ends) + max_end = max(ends) + 1 + + if paragraph_len > 10 and max_end > 1: + short_review = text[0:max_end] + print(short_review + "\n") + tl.append(short_review) + else: + pass + +tf = pd.Series(tl) + +# Need to manually exclude webpage intro and end paragraphs and additional para from reviews longer than 1 para +if user_year == '2020': + rf = tf[3:-1] + rf = rf.drop([8, 27, 50], axis=0) + rf = rf.reset_index(drop=True) + +df['review'] = rf +df = df.sort_values('rank', ascending=True) - if paragraph_len > 10 and max_end > 0: - print(text) \ No newline at end of file +filename = "pitchfork_" + user_year + ".csv" +df.to_csv(filename, index=False, encoding='utf-8-sig') \ No newline at end of file diff --git a/spotify.py b/spotify.py new file mode 100644 index 0000000..739f349 --- /dev/null +++ b/spotify.py @@ -0,0 +1,96 @@ +# Search for Spotify ID for given album and create HTML code for embedded player +# Started 2020-12-29 +# Updated 2020-12-29 + +# Import libraries +import pandas as pd +import requests +import json +from spotify_secrets import * # Simple .py file declaring client_id and client_secret variables + +# Authorization +token_endpoint = "https://accounts.spotify.com/api/token" + +auth_response = requests.post(token_endpoint, { + 'grant_type': 'client_credentials', + 'client_id': client_id, + 'client_secret': client_secret, +}) + +auth_response_data = auth_response.json() + +token = auth_response_data['access_token'] + +# Search +endpoint = "https://api.spotify.com/v1/search" + +# Import data +class import_data: + def __init__(self): + self.site = input("Enter site code (Pitchfork=p, Boomkat=b): ") + self.year = input("Enter year: ") + +def get_input(): + return import_data() + +user_input = get_input() + +if user_input.site == 'p': + site = 'pitchfork' +elif user_input.site == 'b': + site = 'boomkat' +else: + print('Enter appropriate code.') +year = user_input.year + +filename = site + "_" + year + ".csv" + +df = pd.read_csv(filename) + +html = [] + +for i in range(len(df)): + artist = df.iloc[i]['artist'].replace(" / ", ", ") + if 'various' in artist.lower(): + artist_param = '' + else: + artist_param = ' artist:' + artist + album = df.iloc[i]['album'] + query = 'album:' + album + artist_param + + headers = {'Authorization': 'Bearer {token}'.format(token=token)} + params = {'q': query, 'type': 'album', 'market': 'US'} + + data = requests.get( + endpoint, + headers=headers, + params=params + ) + + print('artist:', artist, 'album', album, data.status_code) + + if data.status_code == 200: + if len(data.json()['albums']['items']) > 0: + album_data = data.json() + album_id = album_data['albums']['items'][0]['id'] + + # Create embeddable player HTML code + # See https://developer.spotify.com/documentation/widgets/generate/embed/ + width = '"300"' + height = '"80"' + base = '' + else: + code = "Not available" + else: + code = 'Not available' + + html.append(code) + +df['html'] = pd.Series(html) + +df.to_csv(site + "_" + year + "_html.csv", index=False) \ No newline at end of file diff --git a/spotify_secrets.py b/spotify_secrets.py new file mode 100644 index 0000000..e69de29