-
Notifications
You must be signed in to change notification settings - Fork 0
/
WikiXmlMap.java
120 lines (91 loc) · 2.87 KB
/
WikiXmlMap.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
/**
* Name: Sampath Sree Kumar K
* Email-id: skolluru@uncc.edu
* Studentid: 800887568
*/
package wiki.org;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
import java.nio.charset.CharacterCodingException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
*
* @author sam
*
*/
public class WikiXmlMap extends Mapper<LongWritable, Text, Text, Text> {
private static final Pattern outLinksPattern = Pattern.compile("\\[.+?\\]");
/*
* Sample Input <title>q0</title> <text>[[q2]]</text> <title>q1</title>
* <text>[[q1]][[q2]]</text>
*/
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
if(value.getLength()==0)return;
String[] titleAndText = getTitleAndText(value);
String title = titleAndText[0];
Text page = new Text(title.replace(' ', '_'));
Matcher matcher = outLinksPattern.matcher(titleAndText[1]);
boolean Flag = true;
// Loop through the matched links in <text></text>
while (matcher.find()) {
String outgoinglinks = matcher.group();
// Filter only wiki pages.
// - some have [[realPage|linkName]], some single [realPage]
// - some link to files or external pages.
// - some link to paragraphs into other pages.
outgoinglinks = getWikiPageFromLink(outgoinglinks);
if (outgoinglinks == null || outgoinglinks.isEmpty())
continue;
Flag = false;
// add valid outlinks to the map.
context.write(page, new Text(outgoinglinks));
System.out.println(page + ", " + outgoinglinks);
}
if (!matcher.find() && Flag == true && !(title=="")) {
String outgoing = "";
context.write(page, new Text(outgoing));
System.out.println(page + ", " + outgoing);
}
}
/*
* Sample Output <q0 q2> <q1 q1> <q1 q2> <q2 >
*/
private String getWikiPageFromLink(String outLink) {
//int start = 2;
int start = outLink.startsWith("[[") ? 2 : 1;
int endLink = outLink.indexOf("]");
int part = outLink.indexOf("#");
if (part > 0) {
endLink = part;
}
if(start==2){
outLink = outLink.substring(start, endLink);
outLink = outLink.replaceAll(" ", "_");
return outLink;
}else{
return "";
}
}
private String[] getTitleAndText(Text value)
throws CharacterCodingException {
String[] titleAndText = new String[2];
int start = value.find("<title>");
int end = value.find("</title>", start);
start += 7; // add <title> length.
titleAndText[0] = Text.decode(value.getBytes(), start, end - start);
start = value.find("<text");
start = value.find(">", start);
end = value.find("</text>", start);
start += 1;
if (start == -1 || end == -1) {
return new String[] { "", "" };
}
titleAndText[1] = Text.decode(value.getBytes(), start, end - start);
return titleAndText;
}
}