-
Notifications
You must be signed in to change notification settings - Fork 0
/
MyPageCallbackHandler.java
253 lines (210 loc) · 8.09 KB
/
MyPageCallbackHandler.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
import edu.jhu.nlp.wikipedia.*;
import org.apache.tools.bzip2.CBZip2InputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Calendar;
/**
 * Page callback that scans the wiki text of every page it is given for
 * "citation needed" templates and appends one CSV row per occurrence to an
 * output file created by the constructor.
 *
 * <p>Each row has the columns {@code pageName,tagContents,precedingSentence}.
 * Newlines and commas are stripped from every column so a value can never
 * break the row layout. Call {@link #finishWriting()} once parsing is done.
 */
class MyPageCallbackHandler implements PageCallbackHandler
{
    /** Number of processed pages that contained at least one tag. */
    public int pagesWith = 0;
    /** Number of pages handed to {@link #process(WikiPage)}. */
    public int pagesTotal = 0;
    /** Number of tags found across all processed pages. */
    public int numTotal = 0;

    /** Writer for the CSV output; stays {@code null} if the file could not be opened. */
    FileWriter fileWriter;
    /** Name of the CSV file this handler writes to. */
    String fileName;

    private static final String COMMA_DELIMITER = ",";
    private static final String NEW_LINE_SEPARATOR = "\n";
    private static final String FILE_HEADER = "pageName,tagContents,precedingSentence";
    /** Number of characters before a tag that are saved as its preceding text. */
    private static final int PRECEDING_SENTENCE_LENGTH = 30;

    // The two spellings of the tag opening that are searched for.
    private static final String TAG_LOWER = "{{citation needed";
    private static final String TAG_UPPER = "{{Citation needed";
    private static final String TAG_CLOSE = "}}";
    // Both spellings have the same length, so the advance amount is the same
    // regardless of which one matched.
    private static final int ADVANCE_AMOUNT = TAG_LOWER.length();

    /**
     * Opens the CSV output file for this run and writes the header row.
     *
     * <p>The file name is the current time formatted as
     * {@code yyyy.MM.dd G 'at' HH:mm:ss z}, followed by {@code "." + fileName + ".csv"}.
     * If the file cannot be opened the stack trace is printed and the handler
     * keeps counting tags but writes no records.
     *
     * @param fileName label embedded in the generated file name
     */
    public MyPageCallbackHandler(String fileName)
    {
        Calendar cal = Calendar.getInstance();
        // NOTE(review): the generated name contains ':' and spaces, which some
        // file systems reject; the format is kept unchanged for compatibility.
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy.MM.dd G 'at' HH:mm:ss z");
        // Assign to the field, not the parameter, so the generated name is retained.
        this.fileName = sdf.format(cal.getTime()) + "." + fileName + ".csv";
        try
        {
            fileWriter = new FileWriter(this.fileName, true);
            fileWriter.append(FILE_HEADER);
            fileWriter.append(NEW_LINE_SEPARATOR);
        }
        catch (IOException e)
        {
            e.printStackTrace();
        }
    }

    /**
     * Flushes and closes the output file. To be called by the creating code
     * after all parsing is complete. Does nothing if the file was never opened.
     */
    public void finishWriting()
    {
        if (fileWriter == null)
        {
            return;
        }
        try
        {
            // close() flushes before closing, so the writer is released even
            // when the final flush is what fails.
            fileWriter.close();
        }
        catch (IOException e)
        {
            e.printStackTrace();
        }
    }

    /**
     * Removes newlines and commas from a value and trims it so it can be
     * written as a single CSV column. A {@code null} value becomes "".
     */
    private static String sanitize(String value)
    {
        if (value == null)
        {
            return "";
        }
        return value.replace("\n", "").replace(",", "").trim();
    }

    /**
     * Writes one row, {@code pageName,tagContents,precedingSentence}, for the
     * tag that starts at {@code cnIndex}.
     *
     * @param wikiPage the whole page the tag was found on
     * @param cnIndex  index of the tag's opening "{{" within the whole page text
     */
    private void writeRecordToFile(WikiPage wikiPage, int cnIndex)
    {
        if (fileWriter == null)
        {
            System.out.println("Error! Output file is not open, record skipped. cnIndex was " + cnIndex +
                " on page " + wikiPage.getTitle() + ".");
            return;
        }

        String pageText = wikiPage.getWikiText();
        // NOTE: titles seem to include a new line delimiter, so it is stripped.
        String pageName = sanitize(wikiPage.getTitle());

        // The tag runs from its opening "{{" through the next "}}".
        int closingIndex = pageText.indexOf(TAG_CLOSE, cnIndex);
        if (closingIndex < 0)
        {
            System.out.println("Error! No closing }} found for the tag. cnIndex was " + cnIndex +
                " on page " + wikiPage.getTitle() + ".");
            return;
        }
        String tagContents = sanitize(pageText.substring(cnIndex, closingIndex + TAG_CLOSE.length()));

        int precedingStartingIndex = cnIndex - PRECEDING_SENTENCE_LENGTH;
        if (precedingStartingIndex < 0)
        {
            System.out.println("cnIndex was " + cnIndex + " so I saved precedingSentence from 0 to cnIndex." + " The page was " + pageName + ".");
            precedingStartingIndex = 0;
        }
        // Sanitized on both paths: a tag near the start of the page must not
        // leak raw newlines or commas into the row.
        String precedingSentence = sanitize(pageText.substring(precedingStartingIndex, cnIndex));

        // Build the row first and write it with one call so a failure cannot
        // leave a half-written row behind.
        String row = pageName + COMMA_DELIMITER + tagContents + COMMA_DELIMITER
            + precedingSentence + NEW_LINE_SEPARATOR;
        try
        {
            fileWriter.append(row);
        }
        catch (IOException e)
        {
            System.out.println("Error! We caught an exception. cnIndex was " + cnIndex +
                " on page " + wikiPage.getTitle() + ".");
        }
    }

    /**
     * Finds the next tag opening at or after {@code cIndex}. Only the case of
     * the first letter varies: "{{citation needed" and "{{Citation needed" are
     * the two spellings searched for.
     *
     * @param cIndex       index in the whole page text to start searching from
     * @param wikiPageText the whole page text
     * @return index of the nearest match relative to the whole page text, or -1 if none
     */
    private int getNextCaseInsensitiveIndex(int cIndex, String wikiPageText)
    {
        // Searching from an offset keeps the results relative to the whole
        // text and avoids copying the remainder of the page on every call.
        int indexL = wikiPageText.indexOf(TAG_LOWER, cIndex);
        int indexU = wikiPageText.indexOf(TAG_UPPER, cIndex);

        if (indexL < 0)
        {
            // Not L: take U, which is -1 when neither is present.
            return indexU;
        }
        if (indexU < 0)
        {
            // L present but not U, take L.
            return indexL;
        }
        // Both present, take the min index.
        return Math.min(indexL, indexU);
    }

    /**
     * Counts the page, then records every tag found in its wiki text.
     *
     * @param page the page to scan
     */
    public void process(WikiPage page)
    {
        // Increment total number of pages.
        pagesTotal++;

        String text = page.getWikiText();
        if (text == null)
        {
            // Nothing to scan on a page without text.
            return;
        }

        // Get the first index. If it's there, the page counts as one with tags.
        int currentIndex = getNextCaseInsensitiveIndex(0, text);
        if (currentIndex != -1)
        {
            pagesWith++;
        }

        // Either start on the first index or skip if there aren't any.
        while (currentIndex != -1)
        {
            numTotal++;
            writeRecordToFile(page, currentIndex);
            // Resume just past the opening of the tag that was recorded.
            currentIndex = getNextCaseInsensitiveIndex(currentIndex + ADVANCE_AMOUNT, text);
        }
    }
}
// // For processing animation
// // String anim= "|/-\\";
//
// int index = -1;
//
//
// // Duplicated code: find case insensitive next tag.
//
// int indexL = text.indexOf("{{citation needed");
// int indexU = text.indexOf("{{Citation needed");
//
// if (indexL > -1)
// {
// if (indexU > -1)
// // Both present, take the min index
// index = Math.min(indexL, indexU);
// // L present but not U, take L
// else index = indexL;
// }
// // Not L, but U, then take U
// else if (indexU > -1)
// index = indexU;
// else index = -1;
//
// if (index != -1) pagesWith++;
//
// while (index != -1)
// {
// numTotal++;
//
// // Advance forward past the current found tag.
// text = text.substring(index + "{{citation needed".length());
//
// // Write the current tag's data to file.
// // The whole page's text must be passed.
// writeRecordToFile(wikiPageText, index);
//
// // Find the next tag.
// indexL = text.indexOf("{{citation needed");
// indexU = text.indexOf("{{Citation needed");
//
// if (indexL > -1)
// {
// if (indexU > -1)
// // Both present, take the min index
// index = Math.min(indexL, indexU);
// // L present but not U, take L
// else index = indexL;
// }
// // Not L, but U, then take U
// else if (indexU > -1)
// index = indexU;
// else index = -1;
// }
//
// pagesTotal++;
// For Animation
// x++;
//
// x = x % 5;
//
// String data = " " + anim.charAt(x % anim.length()) + "\r";
//
// try
// {
// System.out.write(data.getBytes());
// }
// catch(Exception e)
// {
// e.printStackTrace();
// }