-
Notifications
You must be signed in to change notification settings - Fork 1
/
scraping.js
109 lines (97 loc) · 4.04 KB
/
scraping.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
let cheerio = require("cheerio");
let request = require("request");
let async = require("async");
let MongoClient = require('mongodb').MongoClient;
// Connection string handed to MongoClient.connect when saving scraped records.
const mongourl = "mongodb://localhost:27017/StartUpSchool";
// Shared work list: presentation page URLs collected from the listing page
// and later fetched one by one.
let Urls = [];
// Label stored with every scraped record to identify where the data came from.
const SOURCE = "STARTUP SCHOOL";
/**
 * Scrapes the Startup School "agriculture / agtech" presentation listing and
 * stores one document per presentation in MongoDB.
 *
 * Pipeline (run strictly in order by async.series):
 *   1. fetch the listing page and collect presentation URLs into `Urls`;
 *   2. open a single MongoDB connection that is reused for every insert;
 *   3. fetch each presentation page, parse it and insert the record.
 * The final handler reports any error and always closes the connection.
 */

// Base used to resolve the (possibly relative) hrefs found on the listing page.
const SITE_ROOT = 'https://www.startupschool.org/';
const LISTING_URL = 'https://www.startupschool.org/presentations/vertical/agriculture-agtech?course=1';
// Upper bound on presentation pages being downloaded at the same time.
const MAX_PARALLEL_PAGES = 5;

// Connection returned by MongoClient.connect; null until step 2 succeeds.
let mongoConnection = null;

/**
 * Resolves an href from the listing page against SITE_ROOT.
 * @param {string|undefined} href - raw `href` attribute value.
 * @returns {string|null} absolute URL, or null when href is missing/malformed.
 */
function toAbsoluteUrl(href) {
  if (!href) return null;
  try {
    // URL resolution avoids the "//" that plain concatenation produced for
    // root-relative hrefs and leaves already-absolute hrefs untouched.
    return new URL(href, SITE_ROOT).href;
  } catch (err) {
    return null;
  }
}

/**
 * Downloads a page and hands its body to `done`.
 * @param {string} url - page to fetch.
 * @param {function(Error|null, string=)} done - node-style callback.
 */
function fetchHtml(url, done) {
  request(url, function (error, response, html) {
    if (error) return done(error);
    const status = response && response.statusCode;
    if (!status || status < 200 || status >= 300) {
      return done(new Error(`Unexpected HTTP status ${status} for ${url}`));
    }
    done(null, html);
  });
}

/**
 * Returns the trimmed text that belongs directly to a node, ignoring the text
 * of its child elements (the node is cloned so the document is not modified).
 */
function ownText($node) {
  return $node.clone().children().remove().end().text().trim();
}

/**
 * Finds the video URL of a presentation segment.
 * NOTE(review): the iframe is probably injected client-side by Semantic UI's
 * embed module, so the server-rendered HTML may only carry `data-url` or
 * `data-source`/`data-id` on the `.ui.embed` element - confirm against the
 * real page markup.
 * @returns {string} the video URL, or "" when none can be determined.
 */
function findVideoSrc($segment) {
  const iframeSrc = $segment.find('div.ui.embed.active>iframe').attr('src');
  if (iframeSrc) return iframeSrc;

  const $embed = $segment.find('div.ui.embed').first();
  const dataUrl = $embed.attr('data-url');
  if (dataUrl) return dataUrl;

  const videoId = $embed.attr('data-id');
  if (videoId && $embed.attr('data-source') === 'youtube') {
    return `https://www.youtube.com/embed/${videoId}`;
  }
  return "";
}

/**
 * Extracts one presentation record from a presentation page.
 * @param {string} html - body of the presentation page.
 * @returns {object} record with Source, Presenter, Topic, Email, website,
 *   Description, Videosrc, Team and Positions (all strings).
 */
function parsePresentation(html) {
  const $ = cheerio.load(html);
  const record = {
    Source: SOURCE,
    Presenter: "",
    Topic: "",
    Email: "",
    website: "",
    Description: "",
    Videosrc: "",
    Team: "",
    Positions: "",
  };

  // If several segments match, the last one wins (same as the original loop).
  $('div.ui.basic.wide.segment').each(function () {
    const $segment = $(this);
    record.Presenter = ownText($segment.find('h1.ui.header.center.aligned'));
    record.Topic = ownText($segment.find('h1>div.sub.header'));
    record.Email = $segment.find('h1>div>a:nth-child(2)').text().trim();
    record.website = $segment.find('h1 a').first().attr('href') || "";
    record.Description = $segment.find('.ui.basic.segment>p').text().trim();
    record.Videosrc = findVideoSrc($segment);
  });

  // Collect every founder; the previous code overwrote Team/Positions on each
  // iteration and therefore kept only a single member.
  const members = [];
  const titles = [];
  $('div.column.center.aligned.company-founder-bio').each(function () {
    const $bio = $(this);
    members.push(ownText($bio.find('h4.ui.header')));
    titles.push($bio.find('h4>div.sub.header').text().trim());
  });
  record.Team = members.join(', ');
  record.Positions = titles.join(', ');

  return record;
}

/** Maps a parsed record onto the field names used in the Mongo collection. */
function toDocument(record) {
  return {
    websource: record.Source,
    presenter: record.Presenter,
    topic: record.Topic,
    emailaddress: record.Email,
    website: record.website,
    description: record.Description,
    video: record.Videosrc,
    itemMembers: record.Team,
    memberspositions: record.Positions,
  };
}

async.series([
  // Step 1: collect the presentation URLs from the listing page.
  function collectPresentationUrls(callback) {
    fetchHtml(LISTING_URL, function (error, html) {
      if (error) return callback(error);
      const $ = cheerio.load(html);
      $('.presentation-card').each(function () {
        const link = toAbsoluteUrl($(this).attr('href'));
        if (link) Urls.push(link);
      });
      callback();
    });
  },
  // Step 2: connect once instead of once per scraped page.
  function connectToMongo(callback) {
    MongoClient.connect(mongourl, function (err, db) {
      if (err) return callback(err);
      mongoConnection = db;
      callback();
    });
  },
  // Step 3: scrape every presentation page and store the result.
  function scrapeAndStore(callback) {
    const collection = mongoConnection.db("StartUpSchool").collection("AgricultureCollection");
    async.eachLimit(Urls, MAX_PARALLEL_PAGES, function (url, next) {
      fetchHtml(url, function (error, html) {
        if (error) {
          // Best effort: one unreachable page must not abort the whole run.
          console.error(`Skipping ${url}: ${error.message}`);
          return next();
        }
        let record;
        try {
          record = parsePresentation(html);
        } catch (parseError) {
          console.error(`Skipping ${url}: ${parseError.message}`);
          return next();
        }
        collection.insertOne(toDocument(record), function (err) {
          // A database failure is not recoverable per page, so stop the run.
          if (err) return next(err);
          console.log("successfully installed");
          next();
        });
      });
    }, callback);
  },
], function (err) {
  if (err) {
    console.error('Scraping failed:', err);
    process.exitCode = 1;
  }
  // Close the shared connection on success and failure alike.
  if (mongoConnection) {
    mongoConnection.close();
  }
});