forked from giddyyupp/ganilla
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmiyazaki_downloader.py
109 lines (89 loc) · 3.96 KB
/
miyazaki_downloader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
import os
import json
import urllib2
import sys
import time
import random
# Append the current working directory to PATH so that a geckodriver
# binary placed next to this script can be located.
os.environ["PATH"] = os.pathsep.join([os.environ["PATH"], os.getcwd()])
# Root directory under which downloaded images are stored.
download_path = "./down/"
def randdelay(a, b):
    """Pause for a random duration between ``a`` and ``b`` seconds.

    The duration is drawn with ``random.uniform(a, b)`` and passed to
    ``time.sleep``.

    Args:
        a: One bound of the delay, in seconds.
        b: The other bound of the delay, in seconds.

    Returns:
        None.
    """
    time.sleep(random.uniform(a, b))
def main(searchtext="miyazaki wallpaper", num_requested=5000):
    """Screenshot Google Images results for ``searchtext`` into ``download_path``.

    Opens the image-search results page in Firefox, scrolls and clicks the
    "show more" button to load results, then opens each result's source URL
    in a new window and saves a screenshot of its first ``<img>`` element as
    ``<download_path>/<searchtext with "_" for spaces>/<n>.png``.

    Args:
        searchtext: Search query; spaces become ``_`` in the directory name.
        num_requested: Maximum number of images to save.

    Returns:
        None. Progress and the final tally are printed to stdout.
    """
    num_requested = int(num_requested)
    # Per the original author's note, each "show more" round exposes roughly
    # 400 results, so this many rounds should cover the requested amount.
    number_of_scrolls = num_requested // 400 + 1

    target_dir = os.path.join(download_path, searchtext.replace(" ", "_"))
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    # Spaces are sent as "+" so the query string is a well-formed URL.
    url = ("https://www.google.co.in/search?q=" + searchtext.replace(" ", "+")
           + "&source=lnms&tbm=isch")

    img_count = 0
    downloaded_img_count = 0
    driver = webdriver.Firefox()
    try:
        driver.get(url)
        _load_all_results(driver, number_of_scrolls)

        metas = driver.find_elements(
            By.XPATH, '//div[contains(@class,"rg_meta")]')
        print("Total images: %d\n" % len(metas))

        # The results window never changes, so remember it once.
        main_window = driver.current_window_handle
        for meta in metas:
            if downloaded_img_count >= num_requested:
                break
            img_count += 1
            try:
                # "ou" holds the original image URL in the result metadata.
                img_url = json.loads(meta.get_attribute('innerHTML'))["ou"]
                print("Downloading image %d : %s" % (img_count, img_url))
                target_path = os.path.join(
                    target_dir, "%d.png" % downloaded_img_count)
                if _screenshot_image(driver, img_url, target_path,
                                     main_window):
                    downloaded_img_count += 1
            except Exception as e:
                # Best effort: one bad result must not abort the whole run.
                print("Download failed: %s" % e)
            finally:
                # Close only windows we opened; never the results window.
                _close_extra_windows(driver, main_window)
                print("")
    finally:
        # Always release the browser, even if something above raised.
        driver.quit()
    print("Total downloaded: %d / %d" % (downloaded_img_count, img_count))


def _load_all_results(driver, number_of_scrolls):
    """Scroll the results page and click "show more" to load further results."""
    for _ in range(number_of_scrolls):
        for _ in range(10):
            # Multiple scrolls are needed to reveal the whole batch.
            driver.execute_script("window.scrollBy(0, 1000000)")
            randdelay(1, 3)
        randdelay(1, 3)
        try:
            # Button that loads the next batch of results.
            driver.find_element(By.XPATH, "//input[@id='smb']").click()
        except Exception as e:
            # No (clickable) button: assume there is nothing more to load.
            print("Less images found: %s" % e)
            break


def _screenshot_image(driver, img_url, target_path, main_window, delay=10):
    """Open ``img_url`` in a new window and screenshot its first ``<img>``.

    Returns True when the screenshot was saved and False when no image
    element appeared within ``delay`` seconds. The opened window is left for
    the caller to close.
    """
    # Pass the URL as a script argument so quotes in it cannot break the JS.
    driver.execute_script("window.open(arguments[0])", img_url)
    popups = [h for h in driver.window_handles if h != main_window]
    if not popups:
        raise RuntimeError("image window did not open")
    driver.switch_to.window(popups[-1])
    try:
        element = WebDriverWait(driver, delay).until(
            EC.presence_of_element_located((By.TAG_NAME, 'img')))
    except TimeoutException:
        print("Loading took too much time!")
        return False
    print("Page is ready!")
    # Give the image time to render before capturing; increase this delay
    # if too many screenshots come out black.
    randdelay(2, 4)
    if not element.screenshot(target_path):
        raise IOError("could not write screenshot to %s" % target_path)
    # Second pause from the original script, kept to pace the requests.
    randdelay(3, 5)
    return True


def _close_extra_windows(driver, main_window):
    """Close every window except ``main_window`` and switch back to it."""
    for handle in driver.window_handles:
        if handle != main_window:
            driver.switch_to.window(handle)
            driver.close()
    driver.switch_to.window(main_window)
if __name__ == "__main__":
    # Run the downloader only when executed as a script, not on import.
    main()