Skip to content

Commit

Permalink
Create urlLinks.java
Browse files Browse the repository at this point in the history
  • Loading branch information
azeemshaikh38 committed Dec 7, 2014
1 parent 733eba6 commit 6362bc8
Showing 1 changed file with 78 additions and 0 deletions.
78 changes: 78 additions & 0 deletions src/urlLinks.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
package org.myorg;

import java.io.IOException;
import java.util.*;
import java.util.regex.*;
import javax.xml.xpath.*;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;

public class urlLinks {

public static class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, Text> {
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();

public void map(LongWritable key, Text value, OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
String line = value.toString().replaceAll("^\\s+", "");
//The 2nd element of below split will be the Title
String[] titleSplitOnce = line.split("</?title>");

//For obtaining text, the split is in two parts
String[] textSplitOnce = line.split("<text[^>]*>");
String[] textSplitTwice = textSplitOnce[1].split("</text>");

//From the obatined text, search for all [[nodeName]] pattern.
//Below pattern ignores all nested brackets inside [[ ]].
Matcher matchedURLs = Pattern.compile("\\[\\[[^\\[\\[]+\\]\\]").matcher(textSplitTwice[0]);
int matchCount = 0;
while (matchedURLs.find()) {
String matchedString = matchedURLs.group();
//Remove whitespaces and convert to lowercase to maximise string matching during PageRank
String[] urlList = matchedString.replaceAll("\\[\\[|\\]\\]", "").split("\\|");
for (int i=0; i<urlList.length; i++) {
output.collect(new Text(titleSplitOnce[1].replaceAll("\\s+","").toLowerCase()+" --->"), new Text("[["+urlList[i].replaceAll("\\s+","").toLowerCase()+"]]"));
}
matchCount++;
}
if (matchCount == 0) //Avoid dangling nodes
output.collect(new Text(titleSplitOnce[1].replaceAll("\\s+","").toLowerCase()+" --->"), new Text(""));
}
}

public static class Reduce extends MapReduceBase implements Reducer<Text, Text, Text, Text> {
public void reduce(Text key, Iterator<Text> values, OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
//Reduce function simply combines all URLs and assigns initial PageRank
String sum = "";
while (values.hasNext()) {
String temp = values.next().toString();
sum += temp+", ";
}
output.collect(key, new Text("<pr>0.15</pr><urlLinks>"+sum+"</urlLinks>"));
}
}

public static void main(String[] args) throws Exception {
JobConf conf = new JobConf(urlLinks.class);
conf.setJobName("wordcount");

conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(Text.class);

conf.setMapperClass(Map.class);
//conf.setCombinerClass(Reduce.class);
conf.setReducerClass(Reduce.class);

conf.setInputFormat(TextInputFormat.class);
conf.setOutputFormat(TextOutputFormat.class);

FileInputFormat.setInputPaths(conf, new Path(args[0]));
FileOutputFormat.setOutputPath(conf, new Path(args[1]));

JobClient.runJob(conf);
}
}

0 comments on commit 6362bc8

Please sign in to comment.