From 8d0a84964b68d3ec9ea09398347a97eb3b873189 Mon Sep 17 00:00:00 2001 From: vvseva Date: Tue, 23 Nov 2021 17:37:44 +0300 Subject: [PATCH] semi-auto links --- hw2/grab-data-trip.ipynb | 217 ++++++++++++++++++--------------------- 1 file changed, 100 insertions(+), 117 deletions(-) diff --git a/hw2/grab-data-trip.ipynb b/hw2/grab-data-trip.ipynb index 72f83c7..0fd8407 100644 --- a/hw2/grab-data-trip.ipynb +++ b/hw2/grab-data-trip.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, + "execution_count": 18, "metadata": { "collapsed": true }, @@ -13,6 +13,10 @@ "import re\n", "from selenium import webdriver\n", "from selenium.webdriver.common.keys import Keys\n", + "from selenium.webdriver.support.ui import WebDriverWait\n", + "from selenium.webdriver.support import expected_conditions as EC\n", + "from selenium.webdriver.common.by import By\n", + "from selenium.common.exceptions import TimeoutException\n", "\n", "import pandas as pd\n", "import time" @@ -20,22 +24,13 @@ }, { "cell_type": "code", - "execution_count": 3, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "C:\\Users\\wirze\\AppData\\Local\\Temp/ipykernel_26152/1593989845.py:2: DeprecationWarning: executable_path has been deprecated, please pass in a Service object\n", - " driver = webdriver.Chrome(\"C:\\\\Users\\\\wirze\\\\DataspellProjects\\\\chromedriver_win32\\\\chromedriver.exe\")\n" - ] - } - ], + "execution_count": 81, + "outputs": [], "source": [ - "links = {}\n", - "driver = webdriver.Chrome(\"C:\\\\Users\\\\wirze\\\\DataspellProjects\\\\chromedriver_win32\\\\chromedriver.exe\")\n", - "driver.get(\"https://www.tripadvisor.com/Search?q=parks&searchSessionId=CF4A26D0C451E0783853C96C46AC8B131637532036921ssid&sid=72ED914F7F2B4263B4C7FB9FF61A5A671637532046511&geo=186338&blockRedirect=true&ssrc=a\")\n", - "javaScript = \"PagerNextClick();\"" + "city_name = \"Paris\"\n", + "delay = 5 # seconds\n", + "\n", + "links = {}" ], "metadata": { "collapsed": false, @@ -46,20 +41,43 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 82, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "C:\\Users\\wirze\\AppData\\Local\\Temp/ipykernel_26152/1130507203.py:1: DeprecationWarning: find_element_by_* commands are deprecated. Please use find_element() instead\n", - " inputElement = driver.find_element_by_id(\"GEO_SCOPED_SEARCH_INPUT\")\n" + "C:\\Users\\wirze\\AppData\\Local\\Temp/ipykernel_17200/463546191.py:1: DeprecationWarning: executable_path has been deprecated, please pass in a Service object\n", + " driver = webdriver.Chrome(\"C:\\\\Users\\\\wirze\\\\DataspellProjects\\\\chromedriver_win32\\\\chromedriver.exe\")\n", + "C:\\Users\\wirze\\AppData\\Local\\Temp/ipykernel_17200/463546191.py:5: DeprecationWarning: find_element_by_* commands are deprecated. Please use find_element() instead\n", + " inputElement = driver.find_element_by_id(\"GEO_SCOPED_SEARCH_INPUT\")\n", + "C:\\Users\\wirze\\AppData\\Local\\Temp/ipykernel_17200/463546191.py:8: DeprecationWarning: find_element_by_* commands are deprecated. Please use find_element() instead\n", + " clearElement = driver.find_element_by_id(\"CLEAR_WHERE\")\n", + "C:\\Users\\wirze\\AppData\\Local\\Temp/ipykernel_17200/463546191.py:11: DeprecationWarning: find_element_by_* commands are deprecated. Please use find_element() instead\n", + " inputElement = driver.find_element_by_id(\"GEO_SCOPED_SEARCH_INPUT\")\n", + "C:\\Users\\wirze\\AppData\\Local\\Temp/ipykernel_17200/463546191.py:16: DeprecationWarning: find_elements_by_* commands are deprecated. Please use find_elements() instead\n", + " parksElement = driver.find_elements_by_xpath(\"//*[contains(text(), 'Parks in')]\")\n" ] } ], "source": [ + "driver = webdriver.Chrome(\"C:\\\\Users\\\\wirze\\\\DataspellProjects\\\\chromedriver_win32\\\\chromedriver.exe\")\n", + "driver.get(\"https://www.tripadvisor.com/Search?q=parks&searchSessionId=CF4A26D0C451E0783853C96C46AC8B131637532036921ssid&sid=72ED914F7F2B4263B4C7FB9FF61A5A671637532046511&geo=186338&blockRedirect=true&ssrc=a\")\n", + "javaScript = \"PagerNextClick();\"\n", + "\n", + "inputElement = driver.find_element_by_id(\"GEO_SCOPED_SEARCH_INPUT\")\n", + "inputElement.send_keys('Moscow')\n", + "time.sleep(2)\n", + "clearElement = driver.find_element_by_id(\"CLEAR_WHERE\")\n", + "clearElement.click()\n", + "time.sleep(2)\n", "inputElement = driver.find_element_by_id(\"GEO_SCOPED_SEARCH_INPUT\")\n", - "inputElement.send_keys('Moscow')" + "inputElement.send_keys(city_name)\n", + "time.sleep(2)\n", + "inputElement.send_keys(Keys.ENTER)\n", + "time.sleep(3)\n", + "parksElement = driver.find_elements_by_xpath(\"//*[contains(text(), 'Parks in')]\")\n", + "parksElement[1].click()" ], "metadata": { "collapsed": false, @@ -70,58 +88,53 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 83, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "C:\\Users\\wirze\\AppData\\Local\\Temp/ipykernel_26152/3341757118.py:1: DeprecationWarning: find_element_by_* commands are deprecated. Please use find_element() instead\n", - " clearElement = driver.find_element_by_id(\"CLEAR_WHERE\")\n" + "C:\\Users\\wirze\\AppData\\Local\\Temp/ipykernel_17200/2944505539.py:14: DeprecationWarning: find_elements_by_* commands are deprecated. Please use find_elements() instead\n", + " NextPageElement = driver.find_elements_by_css_selector('[aria-label=\"Next page\"]')\n" ] - } - ], - "source": [ - "clearElement = driver.find_element_by_id(\"CLEAR_WHERE\")\n", - "clearElement.click()" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } - }, - { - "cell_type": "code", - "execution_count": 6, - "outputs": [ + }, { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "C:\\Users\\wirze\\AppData\\Local\\Temp/ipykernel_26152/301636850.py:1: DeprecationWarning: find_element_by_* commands are deprecated. Please use find_element() instead\n", - " inputElement = driver.find_element_by_id(\"GEO_SCOPED_SEARCH_INPUT\")\n" + "Page is ready!\n", + "Page is ready!\n", + "Page is ready!\n", + "Page is ready!\n", + "Page is ready!\n", + "Page is ready!\n" ] } ], "source": [ - "inputElement = driver.find_element_by_id(\"GEO_SCOPED_SEARCH_INPUT\")\n", - "inputElement.send_keys('London')" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } - }, - { - "cell_type": "code", - "execution_count": 7, - "outputs": [], - "source": [ - "inputElement.send_keys(Keys.ENTER)" + "for i in range(42):\n", + " html = driver.page_source\n", + " soup = BeautifulSoup(html)\n", + "\n", + " results_names = soup.find_all(\"div\", class_=\"csemS\")\n", + " results_links = soup.find_all(\"div\", class_=\"fVbwn cdAAV cagLQ eZTON dofsx\")\n", + " del results_links[1::2] # delete every second element (counting from the first)\n", + "\n", + " for result_link, result_name in zip(results_links, results_names):\n", + " if result_link.find(\"a\") is not None:\n", + " links[result_name.text] = result_link.find(\"a\")['href']\n", + " try:\n", + " myElem = WebDriverWait(driver, delay).until(EC.presence_of_element_located((By.CLASS_NAME, 'csemS')))\n", + " NextPageElement = driver.find_elements_by_css_selector('[aria-label=\"Next page\"]')\n", + " print (\"Page is ready!\")\n", + " except TimeoutException:\n", + " print (\"Loading took too much time!\")\n", + " time.sleep(5)\n", + "\n", + " if len(NextPageElement) == 0:\n", + " break\n", + " NextPageElement[0].click()\n", + " time.sleep(5)" ], "metadata": { "collapsed": false, @@ -132,20 +145,19 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 84, "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "C:\\Users\\wirze\\AppData\\Local\\Temp/ipykernel_26152/1667884103.py:1: DeprecationWarning: find_elements_by_* commands are deprecated. Please use find_elements() instead\n", - " parksElement = driver.find_elements_by_xpath(\"//*[contains(text(), 'Parks in')]\")\n" - ] + "data": { + "text/plain": "'171. Jardin de la place Louise Losserand'" + }, + "execution_count": 84, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "parksElement = driver.find_elements_by_xpath(\"//*[contains(text(), 'Parks in')]\")\n", - "parksElement[1].click()" + "list(links)[-1]" ], "metadata": { "collapsed": false, @@ -156,47 +168,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 85, "outputs": [], - "source": [], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } - }, - { - "cell_type": "code", - "execution_count": 10, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "C:\\Users\\wirze\\AppData\\Local\\Temp/ipykernel_26152/2444517317.py:13: DeprecationWarning: find_elements_by_* commands are deprecated. Please use find_elements() instead\n", - " NextPageElement = driver.find_elements_by_css_selector('[aria-label=\"Next page\"]')\n" - ] - } - ], "source": [ - "for i in range(42):\n", - " html = driver.page_source\n", - " soup = BeautifulSoup(html)\n", + "df_links_all = pd.read_csv('all_parks_links.csv')\n", "\n", - " results_names = soup.find_all(\"div\", class_=\"csemS\")\n", - " results_links = soup.find_all(\"div\", class_=\"fVbwn cdAAV cagLQ eZTON dofsx\")\n", - " del results_links[1::2] # delete every second element (counting from the first)\n", - "\n", - " for result_link, result_name in zip(results_links, results_names):\n", - " if result_link.find(\"a\") is not None:\n", - " links[result_name.text] = result_link.find(\"a\")['href']\n", + "df_links = pd.DataFrame.from_dict(links, orient='index', columns=[\"link\"]).reset_index()\n", + "df_links['city'] = city_name\n", + "df_links = df_links[['city', 'index', 'link']]\n", "\n", - " NextPageElement = driver.find_elements_by_css_selector('[aria-label=\"Next page\"]')\n", - " if len(NextPageElement) == 0:\n", - " break\n", - " NextPageElement[0].click()\n", - " time.sleep(5)" + "df_links_all2 = pd.concat([df_links_all, df_links], ignore_index=True)\n", + "df_links_all2 = df_links_all2[['city', 'index', 'link']]\n", + "df_links_all2.to_csv(\"all_parks_links.csv\", index=False)" ], "metadata": { "collapsed": false, @@ -207,19 +190,20 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 86, "outputs": [ { "data": { - "text/plain": "'166. Arlington Square Gardens'" + "text/plain": " city index \\\n0 test_city test_park \n1 Saint-Petersburg 1. Catherine Palace and Park \n2 Saint-Petersburg 31. Inchkhona Skver \n3 Saint-Petersburg 32. Udelny Park of Recreation \n4 Saint-Petersburg 33. Ekateringof Recreation Park \n.. ... ... \n520 Paris 167. Square Cyprian Norwid \n521 Paris 168. Square Jules Durand \n522 Paris 169. Square Alésia Ridder \n523 Paris 170. Square Frédéric Bazille \n524 Paris 171. Jardin de la place Louise Losserand \n\n link \n0 test_link \n1 /Attraction_Review-g811323-d301025-Reviews-Cat... \n2 /Attraction_Review-g2394587-d18344144-Reviews-... \n3 /Attraction_Review-g298507-d6502804-Reviews-Ud... \n4 /Attraction_Review-g298507-d6502916-Reviews-Ek... \n.. ... \n520 /Attraction_Review-g187147-d23007212-Reviews-S... \n521 /Attraction_Review-g187147-d23035203-Reviews-S... \n522 /Attraction_Review-g187147-d23035214-Reviews-S... \n523 /Attraction_Review-g187147-d21025430-Reviews-S... \n524 /Attraction_Review-g187147-d23035218-Reviews-J... \n\n[525 rows x 3 columns]", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
cityindexlink
0test_citytest_parktest_link
1Saint-Petersburg1. Catherine Palace and Park/Attraction_Review-g811323-d301025-Reviews-Cat...
2Saint-Petersburg31. Inchkhona Skver/Attraction_Review-g2394587-d18344144-Reviews-...
3Saint-Petersburg32. Udelny Park of Recreation/Attraction_Review-g298507-d6502804-Reviews-Ud...
4Saint-Petersburg33. Ekateringof Recreation Park/Attraction_Review-g298507-d6502916-Reviews-Ek...
............
520Paris167. Square Cyprian Norwid/Attraction_Review-g187147-d23007212-Reviews-S...
521Paris168. Square Jules Durand/Attraction_Review-g187147-d23035203-Reviews-S...
522Paris169. Square Alésia Ridder/Attraction_Review-g187147-d23035214-Reviews-S...
523Paris170. Square Frédéric Bazille/Attraction_Review-g187147-d21025430-Reviews-S...
524Paris171. Jardin de la place Louise Losserand/Attraction_Review-g187147-d23035218-Reviews-J...
\n

525 rows × 3 columns

\n
" }, - "execution_count": 18, + "execution_count": 86, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "list(links)[-1]" + "df_links_all2" ], "metadata": { "collapsed": false, @@ -230,22 +214,21 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 87, "outputs": [ { "data": { - "text/plain": " index \\\n0 1. Hyde Park \n1 2. St. James's Park \n2 3. Regent's Park \n3 4. Kensington Gardens \n4 5. Hampstead Heath \n.. ... \n146 162. Martin Luther King Adventure Playground \n147 163. Timbuktu Adventure Playground \n148 164. Cannizaro Park \n149 165. Gladstone Park \n150 166. Arlington Square Gardens \n\n link \n0 /Attraction_Review-g186338-d189030-Reviews-Hyd... \n1 /Attraction_Review-g186338-d188126-Reviews-St_... \n2 /Attraction_Review-g186338-d187675-Reviews-Reg... \n3 /Attraction_Review-g186338-d188893-Reviews-Ken... \n4 /Attraction_Review-g186338-d211768-Reviews-Ham... \n.. ... \n146 /Attraction_Review-g186338-d12097677-Reviews-M... \n147 /Attraction_Review-g186338-d12097680-Reviews-T... \n148 /Attraction_Review-g186338-d13544032-Reviews-C... \n149 /Attraction_Review-g186338-d16729335-Reviews-G... \n150 /Attraction_Review-g186338-d17456738-Reviews-A... \n\n[151 rows x 2 columns]", - "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
indexlink
01. Hyde Park/Attraction_Review-g186338-d189030-Reviews-Hyd...
12. St. James's Park/Attraction_Review-g186338-d188126-Reviews-St_...
23. Regent's Park/Attraction_Review-g186338-d187675-Reviews-Reg...
34. Kensington Gardens/Attraction_Review-g186338-d188893-Reviews-Ken...
45. Hampstead Heath/Attraction_Review-g186338-d211768-Reviews-Ham...
.........
146162. Martin Luther King Adventure Playground/Attraction_Review-g186338-d12097677-Reviews-M...
147163. Timbuktu Adventure Playground/Attraction_Review-g186338-d12097680-Reviews-T...
148164. Cannizaro Park/Attraction_Review-g186338-d13544032-Reviews-C...
149165. Gladstone Park/Attraction_Review-g186338-d16729335-Reviews-G...
150166. Arlington Square Gardens/Attraction_Review-g186338-d17456738-Reviews-A...
\n

151 rows × 2 columns

\n
" + "text/plain": " city index \\\n0 test_city test_park \n1 Saint-Petersburg 1. Catherine Palace and Park \n2 Saint-Petersburg 31. Inchkhona Skver \n3 Saint-Petersburg 32. Udelny Park of Recreation \n4 Saint-Petersburg 33. Ekateringof Recreation Park \n.. ... ... \n520 Paris 167. Square Cyprian Norwid \n521 Paris 168. Square Jules Durand \n522 Paris 169. Square Alésia Ridder \n523 Paris 170. Square Frédéric Bazille \n524 Paris 171. Jardin de la place Louise Losserand \n\n link \n0 test_link \n1 /Attraction_Review-g811323-d301025-Reviews-Cat... \n2 /Attraction_Review-g2394587-d18344144-Reviews-... \n3 /Attraction_Review-g298507-d6502804-Reviews-Ud... \n4 /Attraction_Review-g298507-d6502916-Reviews-Ek... \n.. ... \n520 /Attraction_Review-g187147-d23007212-Reviews-S... \n521 /Attraction_Review-g187147-d23035203-Reviews-S... \n522 /Attraction_Review-g187147-d23035214-Reviews-S... \n523 /Attraction_Review-g187147-d21025430-Reviews-S... \n524 /Attraction_Review-g187147-d23035218-Reviews-J... \n\n[525 rows x 3 columns]", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
cityindexlink
0test_citytest_parktest_link
1Saint-Petersburg1. Catherine Palace and Park/Attraction_Review-g811323-d301025-Reviews-Cat...
2Saint-Petersburg31. Inchkhona Skver/Attraction_Review-g2394587-d18344144-Reviews-...
3Saint-Petersburg32. Udelny Park of Recreation/Attraction_Review-g298507-d6502804-Reviews-Ud...
4Saint-Petersburg33. Ekateringof Recreation Park/Attraction_Review-g298507-d6502916-Reviews-Ek...
............
520Paris167. Square Cyprian Norwid/Attraction_Review-g187147-d23007212-Reviews-S...
521Paris168. Square Jules Durand/Attraction_Review-g187147-d23035203-Reviews-S...
522Paris169. Square Alésia Ridder/Attraction_Review-g187147-d23035214-Reviews-S...
523Paris170. Square Frédéric Bazille/Attraction_Review-g187147-d21025430-Reviews-S...
524Paris171. Jardin de la place Louise Losserand/Attraction_Review-g187147-d23035218-Reviews-J...
\n

525 rows × 3 columns

\n
" }, - "execution_count": 12, + "execution_count": 87, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df_links = pd.DataFrame.from_dict(links, orient='index', columns=[\"link\"]).reset_index()\n", - "df_links.to_csv(\"london_parks_links.csv\")\n", - "df_links" + "df_links_all = pd.read_csv('all_parks_links.csv')\n", + "df_links_all" ], "metadata": { "collapsed": false, @@ -256,7 +239,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 61, "outputs": [], "source": [ "driver.quit()"