Skip to content

Commit

Permalink
small improvement
Browse files: browse the repository at this point in the history
  • Loading branch information
cigolpl committed Jun 9, 2016
1 parent 8eeb1a3 commit 5f1215c
Show file tree
Hide file tree
Showing 4 changed files with 26 additions and 19 deletions.
1 change: 0 additions & 1 deletion helper.js
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,3 @@ exports.processScript = function(options, cb) {
return cb()
})
}

16 changes: 11 additions & 5 deletions index.js
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,11 @@ var XmlStream = require('xml-stream')
var wikipedia = require('wtf_wikipedia')
var MongoClient = require('mongodb').MongoClient
var bz2 = require('unbzip2-stream');
var queue = require('./config/queue');
var helper = require('./helper')

var program = require('commander');

program
//.version(packageData.version)
.usage('node index.js afwiki-latest-pages-articles.xml.bz2 [options]')
.option('-w, --worker [worker]', 'Use worker (redis required)')
.parse(process.argv);
Expand All @@ -25,6 +23,14 @@ if (!file) {
var lang = file.match(/([a-z][a-z])wiki-/) || []
lang = lang[1] || '-'


var queue
// Load the queue (and its Redis dependency) only when the --worker option is set
if (program.worker) {
queue = require('./config/queue');
}


// Connect to mongo
var url = 'mongodb://localhost:27017/' + lang + '_wikipedia';
MongoClient.connect(url, function(err, db) {
Expand All @@ -38,11 +44,12 @@ MongoClient.connect(url, function(err, db) {
var xml = new XmlStream(stream);
xml._preserveAll = true //keep newlines

var i = 0;
var i = 1;
xml.on('endElement: page', function(page) {
if (page.ns === '0') {
var script = page.revision.text['$text'] || ''
console.log(i);

console.log(page.title + ' ' + i);
++i;

var data = {
Expand Down Expand Up @@ -76,5 +83,4 @@ MongoClient.connect(url, function(err, db) {
db.close();
}, 20000)
});

});
1 change: 0 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
"commander": "^2.9.0",
"kue": "^0.11.0",
"mongodb": "^2.1.18",
"mongoskin": "^2.1.0",
"unbzip2-stream": "^1.0.9",
"wtf_wikipedia": "^0.3.2",
"xml-stream": "^0.4.5"
Expand Down
27 changes: 15 additions & 12 deletions worker.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@
var util = require('util');
var cluster = require('cluster')
var clusterWorkerSize = require('os').cpus().length;
var concurrency = 3;
var concurrency = 1;
var helper = require('./helper')
var queue = require('./config/queue');
var mongo = require('mongoskin');
var MongoClient = require('mongodb').MongoClient

if (cluster.isMaster) {
for (var i = 0; i < clusterWorkerSize; i++) {
Expand All @@ -16,15 +16,18 @@ if (cluster.isMaster) {
} else {
// TODO: make the MongoDB URL configurable via a config file or CLI arguments
var url = 'mongodb://localhost:27017/wikipedia_queue';
var db = mongo.db(url, {native_parser:true});
var collection = db.collection('wikipedia');
queue.process('article', concurrency, function(job, done){
var url = job.data.url;
var data = job.data
data.collection = collection;
helper.processScript(data, function(err, res) {
//console.log('processed');
done(err, res)

MongoClient.connect(url, function(err, db) {
var collection = db.collection('wikipedia');
queue.process('article', concurrency, function(job, done){
var url = job.data.url;
var data = job.data
data.collection = collection;
helper.processScript(data, function(err, res) {
//console.log('processed');
done(err, res)
})
})
});

})
}

0 comments on commit 5f1215c

Please sign in to comment.