forked from gippy/instagram-scraper
-
Notifications
You must be signed in to change notification settings - Fork 0
/
INPUT_SCHEMA.json
204 lines (204 loc) · 11 KB
/
INPUT_SCHEMA.json
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
{
"title": "Input schema for Instagram scraper",
"description": "Below you can provide search query which will be used to search Instagram for profiles, hashtags or places. Alternatively you can provide direct page URLs. <br />Then on each page you can scrape page details, posts or comments. <br/> <strong>Important: Proxy is required to use this solution</strong>",
"type": "object",
"schemaVersion": 1,
"properties": {
"search": {
"title": "Search",
"type": "string",
"description": "Here you can provide a search query which will be used to search Instagram for profiles, hashtags or places. <br /><br /><strong>This field is optional, but this field or 'Direct Instagram page URLs' must be provided</strong>",
"editor": "textfield",
"prefill": "Nature"
},
"searchType": {
"title": "Search type",
"type": "string",
"description": "What type of pages to search (you can search hashtags, profiles and places)",
"editor": "select",
"enum": ["user", "hashtag", "place", "stories"],
"enumTitles": ["Search users", "Search hashtags", "Search places (scrolls only with login)", "Stories (Only with login)"],
"default": "hashtag"
},
"searchLimit": {
"title": "Search results limit",
"type": "integer",
"description": "How many search results (eq. pages) should be processed.",
"editor": "number",
"default": 10,
"minimum": 1,
"maximum": 100
},
"directUrls": {
"title": "Direct Instagram page URLs",
"type": "array",
"description": "If you already have URL(s) of page(s) you wish to scrape, you can set them here. <br /><br /><strong>This field is optional, but this field or 'Search' must be provided</strong>. Location URLs scroll only with login.",
"editor": "stringList",
"placeholderValue": "URL",
"patternValue": "https:\\/\\/www\\.instagram\\.com\\/.+",
"uniqueItems": true
},
"resultsType": {
"title": "What to scrape from each page.",
"type": "string",
"description": "What to scrape from each Instagram page URL or search result. You need to provide correct parent pages, e.g. comments can be scraped only from posts, not from profiles.",
"editor": "select",
"enum": ["posts", "comments", "details", "stories", "cookies"],
"enumTitles": [
"Posts (from profile pages, hashtag or place search)",
"Comments (from post pages)",
"Details (of profiles, posts or search pages - from a profile, post or search pages)",
"Stories",
"Cookies (will export cookies gained by login using IG credentials - loginUsername, loginPassword, these cookies can be used in later runs for authorization)"],
"default": "posts"
},
"resultsLimit": {
"title": "Max items",
"type": "integer",
"description": "How many posts or comments to scrape from each Instagram URL or found page, ignored when \"Details\" type is specified. Notice that if you use search via hashtags or place, each found page will have this limit separately. You can limit searchLimit to reduce the number of total results.",
"editor": "number",
"unit": "per page",
"default": 200
},
"maxRequestRetries": {
"title": "Max request retries",
"type": "integer",
"description": "How many times a page will be retried in case of being blocked or other errors.",
"default": 5
},
"scrapePostsUntilDate": {
"title": "Scrape posts until date",
"type": "string",
"description": "Scrapes post from the current date until it reaches a post older than a provided date. The date should be in format YYYY-MM-DD. By default scrapes all posts.",
"editor": "textfield"
},
"scrollWaitSecs": {
"title": "Scroll wait seconds",
"type": "integer",
"description": "How many seconds it will wait every 100 posts/comments when scrolling. If you scrape more than 1000 items, it is important to slow down to not get blocked. The seconds are randomized a bit. If you get blocked anyway, try to increase this number.",
"default": 15
},
"proxy": {
"title": "Proxy configuration",
"type": "object",
"description": "<strong>Proxy is required to run this actor!</strong> Either use Apify RESIDENTIAL proxy, or provide your own proxy servers. Non-RESIDENTIAL proxy groups usually don't work.",
"prefill": { "useApifyProxy": true, "apifyProxyGroups": ["RESIDENTIAL"] },
"editor": "proxy"
},
"loginCookies": {
"title": "Cookies",
"type": "array",
"description": "If you want to sign in to Instagram to have access to more data than on anonymous profile, but do not wish to have your credentials in input, then you can use your cookies.",
"editor": "json",
"sectionCaption": "Login using cookies",
"sectionDescription": "Instagram limits access to certain data (likes, post user data, followers and others) if a user is anonymous. You can provide your cookies here (copied from chrome plugin like 'EditThisCookie' to use a normal account. Do not use your own cookies here and rather create a new temporary account. When cookies are used, the run is limited to concurrency 1 and one session, so that the account does not get banned too soon."
},
"maxErrorCount": {
"title": "Max error count cookies",
"type": "integer",
"description": "Setting this number too high might get your account blocked faster",
"editor": "number",
"default": 15
},
"cookiesPerConcurrency": {
"title": "Concurrency",
"type": "integer",
"description": "Set number of cookies that are used per one concurrent browser. (e.g. if set to 3 and set 6 sets of loginCookies, then it will run with concurrency 2 so each browser can choose randomly from three sets of cookies.)",
"editor": "number",
"default": 0
},
"loginUsername": {
"title": "Login Username",
"type": "string",
"description": "Login for IG account. You will have to fill in code, that you receive to email connected to IG account.",
"editor": "textfield"
},
"loginPassword": {
"title": "Login Password",
"type": "string",
"description": "Password for IG account. You will have to fill in code, that you receive to email connected to IG account.",
"editor": "textfield"
},
"likedByLimit": {
"title": "Get post likes",
"type": "integer",
"description": "If this limit is set above 0, then the solution will attempt to load additional details about users who liked the post.",
"editor": "number",
"unit": "likes",
"default": 0,
"sectionCaption": "Followers, following, likes",
"sectionDescription": "Features that require login. If limits in this section are set above 0, then when the solution enters a page, it attempts to load the specified items until the limit is reached. "
},
"followingLimit": {
"title": "Get profile following",
"type": "integer",
"description": "If this limit is set above 0, then the solution will attempt to find users who the profile owner is following.",
"editor": "number",
"unit": "users",
"default": 0
},
"followedByLimit": {
"title": "Get profile followed by",
"type": "integer",
"description": "If this limit is set above 0, then the solution will attempt to find users who the profile owner is followed by.",
"editor": "number",
"unit": "users",
"default": 0
},
"extendOutputFunction": {
"title": "Extend Output Function",
"description": "Add or remove properties on the output object or omit the output returning null",
"type": "string",
"default": "",
"prefill": "async ({ data, item, itemSpec, page, request, customData }) => {\n return item;\n}",
"editor": "javascript",
"sectionCaption": "Extend scraper functionality",
"sectionDescription": "You can change the output of the items for your dataset here, or add additional behavior on the scraper."
},
"extendScraperFunction": {
"title": "Extend Scraper Function",
"description": "Advanced function that allows you to extend the default scraper functionality, allowing you to manually perform actions on the page",
"type": "string",
"default": "",
"prefill": "async ({ page, request, itemSpec, customData, Apify }) => {\n \n}",
"editor": "javascript"
},
"customData": {
"title": "Custom data",
"description": "Any data that you want to have available inside the Extend Output/Scraper Function",
"default": {},
"prefill": {},
"type": "object",
"editor": "json"
},
"expandOwners": {
"title": "Query post owner",
"type": "boolean",
"description": "If this feature is used, then for each post the solution will attempt to query additional details about it's owner. This will drastically slow down the crawling speed and increase the chance of an account ban if credentials are provided.",
"editor": "checkbox",
"sectionCaption": "Experimental features",
"sectionDescription": "Features in this section are highly experimental, can result in a ban of the account if logged in and might cause the solution to fail."
},
"maxConcurrency": {
"title": "Max Concurrency",
"type": "integer",
"description": "Max number of allowed instances running in parallel. More instances need more memory!",
"editor": "number",
"default": 1000
},
"useStealth": {
"title": "Use stealth",
"type": "boolean",
"description": "Enable stealth in case of increased blocking",
"editor": "checkbox",
"default": false
},
"includeHasStories": {
"title": "Include Has Stories",
"type": "boolean",
"description": "If this feature is used, then for each user detail property has_public_story from xhr request is parsed.",
"editor": "checkbox"
}
},
"required": ["proxy"]
}