Completed web scrapers and Spotify album ID requester to create embedded player HTML code
dnaeye · Dec 30, 2020 · cd492a0
1 parent 0ac5cbe
commit cd492a0
Show file tree

Hide file tree

Showing 5 changed files with 152 additions and 11 deletions.
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
diff --git a/boomkat.py b/boomkat.py
@@ -1,10 +1,12 @@
 # Web scraper for Boomkat End-of-Year Top Releases
 # Started 2020-12-28
+# Updated 2020-12-28
 
 # Import libraries
 import requests
 from bs4 import BeautifulSoup
 import re
+import pandas as pd
 
 url_2019 = "https://boomkat.com/charts/boomkat-end-of-year-charts-2019/940"
 url_2020 = "https://boomkat.com/charts/boomkat-end-of-year-charts-2020/1234"
@@ -23,10 +25,12 @@ def get_page(year):
 
 results = page.find_all(class_='chart-item')
 
+df = pd.DataFrame(columns=['rank', 'artist', 'album', 'genre', 'review'])
 i = 1
 
 for result in results:
     try:
+        rank = i
         release = result.find('div', class_='chart-item-content-mobile show-for-small-only')
         artist = str(release.find(class_='release__artist').text.strip()).title()
         #if len(re.search(r'^.{2}\s', artist).group()) == 3:
@@ -35,23 +39,33 @@ def get_page(year):
         #    artist = prefix + name
         #else:
         #    pass
-        title = release.find(class_='release__title').text.strip()
+        album = release.find(class_='release__title').text.strip()
         try:
             genre = release.find(class_='release__genre').text.strip()
         except:
             genre = "Unknown"
         try:
-            review = result.find('div', class_='chart-item-review').text.strip()[0:500].replace("\r","").replace("\n\n"," ")
+            review = result.find('div', class_='chart-item-review').text.strip()[0:500] \
+                .replace("\r", "").replace("\n\n", " ")
             period = review.rfind('.')
             question = review.rfind('?')
             exclamation = review.rfind('!')
             ends = [period, question, exclamation]
-            max_end = max(ends) + 1
+            if max(ends) < 1:
+                space = review.rfind(' ')
+                max_end = space
+            else:
+                max_end = max(ends) + 1
             short_review = review[0:max_end]
         except:
             short_review = "No review available"
-        print(str(i) + " : " + artist, "-", title, ";", genre, "; Review: ", short_review)
+
+        df = df.append({'rank': rank, 'artist': artist, 'album': album, 'genre': genre, 'review': short_review}, ignore_index=True)
+        print('Rank ' + str(i) + " : " + artist, "-", album, ";", genre, "; Review: ", short_review)
         i += 1
+
     except:
         print("Error getting chart item")
 
+filename = "boomkat_" + user_year + ".csv"
+df.to_csv(filename, index=False, encoding='utf-8-sig')
diff --git a/pitchfork.py b/pitchfork.py
@@ -1,10 +1,11 @@
 # Web scraper for Pitchfork End-of-Year Top Releases
 # Started 2020-12-28
+# Updated 2020-12-28
 
 # Import libraries
 import requests
 from bs4 import BeautifulSoup
-import re
+import pandas as pd
 
 base_url = "https://pitchfork.com/features/lists-and-guides/best-albums-"
 
@@ -18,28 +19,52 @@ def get_page(year):
 
 page = get_page(user_year)
 
+df = pd.DataFrame(columns=['rank', 'artist', 'album'])
+i = 50
 titles = page.find_all('h2')
 
 for title in titles:
+    rank = i
     item = title.text.split(":")
     artist = item[0].strip()
     album = item[1].strip()
 
-    print(artist)
-    print(album + "\n")
+    print('Rank ' + str(i), ":", artist, "-", album)
+    df = df.append({'rank': i, 'artist': artist, 'album': album}, ignore_index=True)
+    i -= 1
+
+tl = []
 
 paragraphs = page.find_all('p')
 
 for paragraph in paragraphs:
-    text = paragraph.text
+    text = paragraph.text[0:500]
     words = text.split(" ")
     paragraph_len = len(words)
 
     period = text.rfind('.')
     question = text.rfind('?')
     exclamation = text.rfind('!')
     ends = [period, question, exclamation]
-    max_end = max(ends)
+    max_end = max(ends) + 1
+
+    if paragraph_len > 10 and max_end > 1:
+        short_review = text[0:max_end]
+        print(short_review + "\n")
+        tl.append(short_review)
+    else:
+        pass
+
+tf = pd.Series(tl)
+
+# Need to manually exclude webpage intro and end paragraphs and additional para from reviews longer than 1 para
+if user_year == '2020':
+    rf = tf[3:-1]
+    rf = rf.drop([8, 27, 50], axis=0)
+    rf = rf.reset_index(drop=True)
+
+df['review'] = rf
+df = df.sort_values('rank', ascending=True)
 
-    if paragraph_len > 10 and max_end > 0:
-        print(text)
+filename = "pitchfork_" + user_year + ".csv"
+df.to_csv(filename, index=False, encoding='utf-8-sig')
diff --git a/spotify.py b/spotify.py
@@ -0,0 +1,96 @@
+# Search for Spotify ID for given album and create HTML code for embedded player
+# Started 2020-12-29
+# Updated 2020-12-29
+
+# Import libraries
+import pandas as pd
+import requests
+import json
+from spotify_secrets import *  # Simple .py file declaring client_id and client_secret variables
+
+# Authorization
+token_endpoint = "https://accounts.spotify.com/api/token"
+
+auth_response = requests.post(token_endpoint, {
+    'grant_type': 'client_credentials',
+    'client_id': client_id,
+    'client_secret': client_secret,
+})
+
+auth_response_data = auth_response.json()
+
+token = auth_response_data['access_token']
+
+# Search
+endpoint = "https://api.spotify.com/v1/search"
+
+# Import data
+class import_data:
+    def __init__(self):
+        self.site = input("Enter site code (Pitchfork=p, Boomkat=b): ")
+        self.year = input("Enter year: ")
+
+def get_input():
+    return import_data()
+
+user_input = get_input()
+
+if user_input.site == 'p':
+    site = 'pitchfork'
+elif user_input.site  == 'b':
+    site = 'boomkat'
+else:
+    print('Enter appropriate code.')
+year = user_input.year
+
+filename = site + "_" + year + ".csv"
+
+df = pd.read_csv(filename)
+
+html = []
+
+for i in range(len(df)):
+    artist = df.iloc[i]['artist'].replace(" / ", ", ")
+    if 'various' in artist.lower():
+        artist_param = ''
+    else:
+        artist_param = ' artist:' + artist
+    album = df.iloc[i]['album']
+    query = 'album:' + album + artist_param
+
+    headers = {'Authorization': 'Bearer {token}'.format(token=token)}
+    params = {'q': query, 'type': 'album', 'market': 'US'}
+
+    data = requests.get(
+        endpoint,
+        headers=headers,
+        params=params
+    )
+
+    print('artist:', artist, 'album', album, data.status_code)
+
+    if data.status_code == 200:
+        if len(data.json()['albums']['items']) > 0:
+            album_data = data.json()
+            album_id = album_data['albums']['items'][0]['id']
+
+            # Create embeddable player HTML code
+            # See https://developer.spotify.com/documentation/widgets/generate/embed/
+            width = '"300"'
+            height = '"80"'
+            base = '<iframe src="https://open.spotify.com/embed/album/'
+            frame_border = '"0"'
+            allow_transparency = '"true"'
+            allow = '"encrypted-media'
+            code = base + album_id + '"' + ' width=' + width + ' height=' + height + ' frameborder=' + frame_border \
+                + ' allowtransparency=' + allow_transparency + ' allow=' + allow + '"></iframe>'
+        else:
+            code = "Not available"
+    else:
+        code = 'Not available'
+
+    html.append(code)
+
+df['html'] = pd.Series(html)
+
+df.to_csv(site + "_" + year + "_html.csv", index=False)
diff --git a/spotify_secrets.py b/spotify_secrets.py