-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtestig.py
170 lines (124 loc) · 5.44 KB
/
testig.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
# -*- coding: utf-8 -*-
"""
Created on Sun Feb 21 22:04:33 2021
@author: she84
"""
from igramscraper.instagram import Instagram
import csv
# Fetch one Instagram account and dump its profile fields to a
# semicolon-delimited CSV file ('Account info.csv').
# NOTE(review): this literal is a set, not a mapping — confirm the format
# that set_proxies() expects.
proxies = {'192.168.43.249'}

instagram = Instagram()
instagram.set_proxies(proxies)
account = instagram.get_account('jjr_yaya')

# Fields read from an account object in this script: identifier, username,
# full_name, biography, get_profile_picture_url(), external_url, media_count,
# followed_by_count, follows_count, is_private, is_verified.
# Printing the object itself is the shortcut for inspecting it.
print(account)

# Column titles for the first CSV row, in the same order as the values below.
csv_header = (
    'identifier', 'username', 'full_name',
    'profile_picture_url',
    'external_url', 'media_count',
    'followed_by_count', 'follows_count',
    'account.is_private', 'account.is_verified',
)

with open('Account info.csv', mode='w', newline='', encoding='utf-8') as out_file:
    csv_out = csv.writer(out_file, delimiter=';')
    csv_out.writerow(csv_header)
    # The values are read only after the header row has been written.
    profile_row = (
        account.identifier,
        account.username,
        account.full_name,
        account.get_profile_picture_url(),
        account.external_url,
        account.media_count,
        account.followed_by_count,
        account.follows_count,
        account.is_private,
        account.is_verified,
    )
    csv_out.writerow(profile_row)
#==========================================================#
from igramscraper.instagram import Instagram

# Fetch one Instagram account and print each profile field on its own
# labelled line, followed by the account object itself.
# NOTE(review): this literal is a set, not a mapping — confirm the format
# that set_proxies() expects.
proxies = {'192.168.43.249'}

instagram = Instagram()
instagram.set_proxies(proxies)
account = instagram.get_account('itsgeenatime')

print('Account info:')

# (label, getter) pairs; each getter runs only when its line is printed, so
# the fields are read one at a time, in display order.
labelled_fields = (
    ('Id: ', lambda: account.identifier),
    ('Username: ', lambda: account.username),
    ('Full name: ', lambda: account.full_name),
    ('Biography: ', lambda: account.biography),
    ('Profile pic url: ', lambda: account.get_profile_picture_url()),
    ('External Url: ', lambda: account.external_url),
    ('Number of published posts: ', lambda: account.media_count),
    ('Number of followers: ', lambda: account.followed_by_count),
    ('Number of follows: ', lambda: account.follows_count),
    ('Is private: ', lambda: account.is_private),
    ('Is verified: ', lambda: account.is_verified),
)
for label, read_value in labelled_fields:
    print(label, read_value())

# The whole object can also be printed directly.
print(account)
#==========================================================#
# Collect the short links of the posts shown on an Instagram profile page.
# https://medium.com/marketingdatascience/%E8%B7%9F%E8%91%97ig%E6%BD%AE%E6%B5%81%E4%BE%86%E7%88%AC%E8%9F%B2-%E5%A6%82%E4%BD%95%E7%88%AC%E5%8F%96ig%E8%B2%BC%E6%96%87%E7%9F%AD%E9%80%A3%E7%B5%90-%E7%B3%BB%E5%88%972-%E9%99%84python%E7%A8%8B%E5%BC%8F%E7%A2%BC-465b7f00eeee
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup as Soup
import time

options = Options()
options.binary_location = "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe"
# NOTE(review): `chrome_options` / `executable_path` are the keyword names this
# script was written against; confirm they match the installed Selenium version.
browser = webdriver.Chrome(chrome_options = options, executable_path=r'C:\\Users\\she84\\anaconda3\\chromedriver.exe')
url = 'https://www.instagram.com/itsgeenatime/'

n_scroll = 5     # number of scroll-to-bottom passes
post_url = []    # unique post links, in the order they were first seen
scroll = 'window.scrollTo(0, document.body.scrollHeight);'

try:
    browser.get(url)  # open the profile page

    # Scroll down repeatedly and pick up the post links found after each pass.
    for _ in range(n_scroll):
        browser.execute_script(scroll)
        # Wait BEFORE reading the page so that whatever this scroll triggered
        # has had time to load; otherwise the content loaded by the last
        # scroll would never be parsed.
        time.sleep(2)

        html = browser.page_source
        soup = Soup(html, 'lxml')

        # Look at every anchor matched by the post-link selector.
        for elem in soup.select('article div div div div a'):
            # Skip anchors without an href instead of raising KeyError.
            href = elem.get('href')
            # Keep only links that have not been collected yet.
            if href and href not in post_url:
                post_url.append(href)
finally:
    # Always release the browser/driver process, even if scraping failed.
    browser.quit()

# Total number of post links collected.
print("總共取得 " + str(len(post_url)) + " 篇貼文連結")
#==========================================================#
# Scrape the like count and comment count of one post by hovering over it.
# https://aitmr1234567890.medium.com/%E8%B7%9F%E8%91%97ig%E6%BD%AE%E6%B5%81%E4%BE%86%E7%88%AC%E8%9F%B2-%E5%A6%82%E4%BD%95%E7%88%AC%E5%8F%96ig%E8%B2%BC%E6%96%87%E8%AE%9A%E6%95%B8-%E7%95%99%E8%A8%80%E6%95%B8-%E7%B3%BB%E5%88%973-%E9%99%84python%E7%A8%8B%E5%BC%8F%E7%A2%BC-4ac918b8fef4
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains

# Open the profile page.
options = Options()
options.binary_location = "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe"
# NOTE(review): `chrome_options` / `executable_path` are the keyword names this
# script was written against; confirm they match the installed Selenium version.
browser = webdriver.Chrome(chrome_options = options, executable_path=r'C:\\Users\\she84\\anaconda3\\chromedriver.exe')
url = 'https://www.instagram.com/itsgeenatime/'

# NOTE(review): the XPath below matches the href attribute exactly; confirm
# the page's anchors use this form (e.g. with or without a leading '/'),
# otherwise the loop keeps scrolling without ever finding the post.
post_url = 'p/CLPHcMqjv8j/'
find = False

try:
    browser.get(url)

    # While the post is not among the current page elements, scroll down to
    # load more posts and try again.
    while not find:
        try:
            # Locate the post's anchor and move the mouse over it.
            post_elem = browser.find_element_by_xpath('//a[@href="'+str(post_url)+'"]')
            action = ActionChains(browser)
            action.move_to_element(post_elem).perform()
            # Elements with class '-V_eO': index 0 is read as the like count,
            # index 1 as the comment count.
            n_like_elem = browser.find_elements_by_class_name('-V_eO')
            n_like = n_like_elem[0].text
            n_comment = n_like_elem[1].text
            # Both values were read; stop searching.
            find = True
        except (WebDriverException, IndexError):
            # Post (or its hover counters) not available yet: scroll further.
            # Only Selenium errors and a short counter list trigger a retry,
            # so Ctrl-C / SystemExit can still stop the loop.
            scroll = 'window.scrollBy(0,250)'
            browser.execute_script(scroll)
finally:
    # Always release the browser/driver process, even on error or interrupt.
    browser.quit()