-
Notifications
You must be signed in to change notification settings - Fork 17
/
Copy pathwith_profile.py
executable file
·147 lines (130 loc) · 6.31 KB
/
with_profile.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
#!/usr/bin/env python3
#
from __future__ import print_function
import os,sys,time,re
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import JavascriptException
from selenium.webdriver.chrome.options import Options
import getopt
# Command-line handling.
# Supported options:
#   -n, --name <name>   name of the Chrome profile directory to use
#   -a, --headless      run Chrome in headless mode
#   -d, --debug         set the debug flag
#   -h, --help          print the usage line and exit
usage = 'usage: with_profile.py [-n|--name <name>] [-a|--headless] [-d|--debug] [-h]'
try:
    opts, args = getopt.getopt(sys.argv[1:], 'hdan:', ['help', 'debug', 'headless', 'name='])
except getopt.GetoptError as err:
    # there are no parsed options to iterate over after a parse failure:
    # report the problem and stop with the conventional "usage error" status
    print(err, file = sys.stderr)
    print(usage)
    sys.exit(2)
profile_dir_name = None
headless = False
debug = False
for option, argument in opts:
    if option in ('-d', '--debug'):
        debug = True
    elif option in ('-h', '--help'):
        print(usage)
        sys.exit()
    elif option in ('-a', '--headless'):
        headless = True
    elif option in ('-n', '--name'):
        profile_dir_name = argument
# Platform detection: the OS environment variable is matched against '.*NT'
# (Windows sets it to a value such as "Windows_NT"); unset or non-matching means non-Windows
is_windows = bool(re.match('.*NT', os.getenv('OS') or ''))
# fall back to the platform notion of the home directory when the variable is not set
homedir = os.getenv('USERPROFILE' if is_windows else 'HOME') or os.path.expanduser('~')
print('user home directory path: {}'.format(homedir), file = sys.stderr)
# chromedriver location:
#   Windows:   the driver is expected to be downloaded manually into the user Downloads folder
#   elsewhere: the driver is expected to be installed system-wide as /usr/bin/chromedriver, e.g.
#     on Alpine: apk add chromium chromium-chromedriver
#     on Ubuntu: apt-get install chromium-browser chromium-chromedriver
if is_windows:
    chromedriver_path = os.path.join(homedir, 'Downloads', 'chromedriver.exe')
else:
    chromedriver_path = '/usr/bin/chromedriver'
# NOTE: https://www.browserstack.com/guide/get-current-url-in-selenium-and-python
# describes how to switch to webdriver_manager, Service etc.
print('chromedriver path: {}'.format(chromedriver_path), file = sys.stderr)
# Chrome options: run with a dedicated user data directory and a named profile in it
options = Options()
if profile_dir_name is None:
    profile_dir_name = 'CustomProfile'
# reuse the home directory resolved earlier instead of reading the environment again
if is_windows:
    user_data_dir = '{0}\\AppData\\Local\\Google\\Chrome\\User Data\\{1}'.format(homedir, profile_dir_name)
else:
    user_data_dir = '{0}/.config/{1}'.format(homedir, profile_dir_name)
# NOTE: the actual profile dir will be created with profile_dir_name twice, e.g. on Windows
# '%USERPROFILE%\AppData\Local\Google\Chrome\User Data\{profile_dir_name}\{profile_dir_name}'
# NOTE: will silently fail on Windows 10 x64 Chrome 109 x86
if os.path.isdir(user_data_dir):
    print('Custom profile will be used: "{}"'.format(user_data_dir), file = sys.stderr)
else:
    print('Custom profile will be created: "{}"'.format(user_data_dir), file = sys.stderr)
# see also:
# profile switches helping with custom chrome profile
# for scraping sites that require authentication
# https://habr.com/ru/post/587708
# TODO: do 2fa on the target site with headless chrome via
# --remote-debugging-port=9222
options.add_argument('--allow-profiles-outside-user-dir')
options.add_argument('--enable-profile-shortcut-manager')
# the next argument makes the actual profile dir become "~/.config/CustomProfile/CustomProfile"
options.add_argument('--profile-directory={}'.format(profile_dir_name))
flush_seconds = 30
options.add_argument('--profiling-flush={}'.format(flush_seconds))
options.add_argument('--enable-aggressive-domstorage-flushing')
options.add_argument('--disable-blink-features=AutomationControlled')
# to clear the profile simply remove the contents of your profile folder
# '~/.config/CustomProfile/Profile 1/'
# for full list of command line switches see
# https://peter.sh/experiments/chromium-command-line-switches
options.add_argument('user-data-dir={}'.format(user_data_dir))
user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0)'
# NOTE(review): every switch below, including the user agent override, is treated as
# headless-only - confirm that this grouping matches the intended behavior
if headless:
    for headless_switch in (
        '--window-size=1920,1080',
        '--headless',
        '--disable-gpu',
        '--enable-javascript',
        '--disable-dev-shm-usage',
        '--no-sandbox',
        '--ignore-certificate-errors',
        '--allow-insecure-localhost',
        '--allow-running-insecure-content',
        '--disable-browser-side-navigation',
    ):
        options.add_argument(headless_switch)
    options.add_argument('user-agent={}'.format(user_agent))
def _start_chrome(driver_path, chrome_options):
    """Start Chrome through the chromedriver binary at driver_path.

    Selenium 4 receives the driver location through a Service object (the
    executable_path keyword was removed in 4.10), while Selenium 3 only
    understands executable_path: try the former, fall back to the latter.

    Raises WebDriverException when chromedriver or Chrome fails to start.
    """
    try:
        from selenium.webdriver.chrome.service import Service
        return webdriver.Chrome(service = Service(driver_path), options = chrome_options)
    except (ImportError, TypeError):
        # older Selenium: no 'service' keyword
        return webdriver.Chrome(executable_path = driver_path, options = chrome_options)


# Launch Chrome with the custom profile, open chrome://version/ and report what was loaded
try:
    driver = _start_chrome(chromedriver_path, options)
    # TODO: Message: unknown error: Chrome failed to start: exited normally.
    # unknown error: DevToolsActivePort file doesn't exist
    # The process started from chrome location
    # C:\Program Files\Google\Chrome\Application\chrome.exe
    # is no longer running, so ChromeDriver is assuming that Chrome has crashed.
    # from chromedriver / chrome version mismatch
except WebDriverException as e:
    driver = None
    print(e, file = sys.stderr)
# TODO: catch unknown error: Could not remove old devtools port file.
# Perhaps the given user-data-dir at ... is still
# attached to a running Chrome or Chromium process
if driver is not None:
    try:
        # chrome://version/ will show both Profile Path and Command Line
        driver.get('chrome://version/')
        time.sleep(10)
        if headless:
            page_source = driver.page_source
            page_text = 'undefined'
            try:
                # locator strategy given as a string: accepted by Selenium 3 and 4 alike
                page_text = driver.find_element('css selector', 'table#inner').text
            except NoSuchElementException:
                # NOTE: no such element: Unable to locate element: {"method":"css selector","selector":"table#inner"}
                # fall back to reading the text through JavaScript
                try:
                    # 'return' is required, otherwise execute_script yields None
                    page_text = driver.execute_script('return document.querySelector("#inner").textContent')
                except JavascriptException:
                    # NOTE: javascript error: Cannot read property 'textContent' of null
                    pass
            page_url = driver.current_url
            print('{0}: {1} {2}'.format(page_url, page_source, page_text), file = sys.stderr)
            # NOTE: showing chrome://version/: <html><head></head><body></body></html>
    finally:
        # always end the session, also when the page inspection above fails
        driver.quit()
    # --profile-directory places the profile in a subdirectory named after profile_dir_name;
    # 'Default' is still accepted for runs that end up with the default profile
    profile_dirs = [os.path.join(user_data_dir, name) for name in (profile_dir_name, 'Default')]
    if not any(os.path.isdir(profile_dir) for profile_dir in profile_dirs):
        print('The profile was not created: "{}"'.format(user_data_dir), file = sys.stderr)
# on a vanilla Windows node
# PATH=%PATH%;c:\Python381;%USERPROFILE%\Downloads