DeepLinks.py
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.common.exceptions import TimeoutException, NoSuchElementException

# Web scraping project to collect deeplink URLs from the Spanish public
# procurement website using Selenium and BeautifulSoup

# 1. SET UP THE WEB DRIVER
# Initialize a Chrome web driver to interact with the browser.
# Other drivers such as Firefox or Edge can be used by replacing
# "webdriver.Chrome()" with the corresponding driver class.
driver = webdriver.Chrome()
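# Optional: run Chrome headless instead (a sketch, assuming Selenium 4's
# ChromeOptions API; swap this in for the line above if no visible browser
# window is needed):
#   options = webdriver.ChromeOptions()
#   options.add_argument('--headless=new')
#   driver = webdriver.Chrome(options=options)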
url = "https://contrataciondelestado.es/wps/portal/licitaciones"
driver.get(url)
start_time = time.time()

# 2. ACCESS THE SEARCH FORM
# Find the button that opens the search form by its ID and click it
button = driver.find_element(By.ID, 'viewns_Z7_AVEQAI930OBRD02JPMTPG21004_:form1:linkFormularioBusqueda')
button.click()

# Set an implicit wait: subsequent find_element calls will poll for up to
# 20 seconds, giving the page's JavaScript time to render the form elements
driver.implicitly_wait(20)
current_url = driver.current_url
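# Note: an explicit wait on a known element is usually more robust than an
# implicit wait (a sketch, reusing the dropdown ID from step 3.1 below):
#   WebDriverWait(driver, 20).until(EC.presence_of_element_located(
#       (By.ID, 'viewns_Z7_AVEQAI930OBRD02JPMTPG21004_:form1:combo1MAQ')))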

# 3. FILL IN THE SEARCH FORM
# 3.1. Select "Obras" (works contracts) from the contract-type dropdown menu
select_element = Select(driver.find_element(By.ID, 'viewns_Z7_AVEQAI930OBRD02JPMTPG21004_:form1:combo1MAQ'))
select_element.select_by_value('3')

# 3.2. Enter the code "41000000" in the CPV code field
# To collect deeplinks for all CPV codes instead, comment out these two lines
input_element = driver.find_element(By.ID, 'viewns_Z7_AVEQAI930OBRD02JPMTPG21004_:form1:cpvMultiple:codigoCpv')
input_element.send_keys('41000000')

# 3.3. Click the "Añadir" button to add the CPV code to the form
add_button = driver.find_element(By.ID, 'viewns_Z7_AVEQAI930OBRD02JPMTPG21004_:form1:cpvMultiplebuttonAnyadirMultiple')
add_button.click()

# Click the search button to run the search with the selected criteria
button = driver.find_element(By.ID, 'viewns_Z7_AVEQAI930OBRD02JPMTPG21004_:form1:button1')
button.click()
driver.implicitly_wait(25)

# 4. EXTRACT URLs FROM THE SEARCH RESULTS
# 4.1. Create a list to store extracted URLs
all_deeplink_urls = []
i = 1

# Loop through each page of the search results to collect all URLs
while True:
    # Start timer for this iteration
    loop_start_time = time.time()
    try:
        # Wait until at least one row of search results is present on the page
        element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, "//tr[@class='rowClass1']"))
        )
        print('Page: ', i)
        i += 1

        # Parse the page source with BeautifulSoup to find the results table
        page_source = driver.page_source
        soup = BeautifulSoup(page_source, 'html.parser')
        table = soup.find('table', {'id': 'myTablaBusquedaCustom'})
        deeplink_urls = []

        # Extract the URLs from table rows whose links contain "deeplink"
        for row in table.select('tbody tr'):
            deeplink_link = row.find('a', href=lambda x: x and 'deeplink' in x)
            if deeplink_link:
                deeplink_url = deeplink_link['href']
                deeplink_urls.append(deeplink_url)
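
        # If the hrefs turn out to be relative, they could be resolved against
        # the current page URL here (a sketch, assuming urllib.parse.urljoin
        # semantics; the links are stored as-is above):
        #   from urllib.parse import urljoin
        #   deeplink_urls = [urljoin(driver.current_url, u) for u in deeplink_urls]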

        # Append this page's URLs to the overall list
        all_deeplink_urls.extend(deeplink_urls)

        # Check for a "Next" button and click it to proceed to the next page
        try:
            next_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.ID, 'viewns_Z7_AVEQAI930OBRD02JPMTPG21004_:form1:footerSiguiente'))
            )
            next_button.click()
        except TimeoutException:
            # If the "Next" button is not found, we have reached the last page
            print("No more pages or 'Next' button not found.")
            break
    except (TimeoutException, NoSuchElementException):
        # No result rows appeared within the wait, so stop the loop.
        # (WebDriverWait raises TimeoutException rather than
        # NoSuchElementException, so both are caught here.)
        break

    # End timer for this iteration
    loop_end_time = time.time()
    loop_elapsed_time = loop_end_time - loop_start_time
    print(f"Time taken for this iteration: {loop_elapsed_time:.2f} seconds")
# Close the driver after all data is collected
driver.quit()
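
# Optional: duplicates could accumulate across pages; removing them while
# preserving order is a one-liner (dict.fromkeys keeps first occurrences):
#   all_deeplink_urls = list(dict.fromkeys(all_deeplink_urls))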

# 4.2. Write all collected URLs to a text file
with open('all_deeplink_urls.txt', 'w') as file:
    for url in all_deeplink_urls:
        file.write(url + '\n')
print("All URLs have been saved to all_deeplink_urls.txt")

# 4.3. End timer and print the total elapsed time for the script
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Total time taken: {elapsed_time:.2f} seconds")