/**
 * Replaces every wiki section header ({@code ==Title==}, {@code ===Title===}, ...) in the
 * markup with the filler returned by {@code getSpaceString} for the header's length, and
 * records the header titles as context.
 *
 * <p>Titles equal (ignoring case) to "see also", "external links", "references" or
 * "further reading" are blanked like any other header but are not added to the context.
 *
 * @param markup  the markup to scan for section headers
 * @param context buffer that receives a newline followed by the trimmed title of each
 *                header that is not one of the excluded titles
 * @return the markup with each matched header replaced by the filler string
 */
private String blankSectionHeaders(String markup, StringBuffer context) {
    // The back-reference \1 requires the closing run of '=' to equal the opening run.
    Matcher headers = Pattern.compile("(={2,})([^=]+)\\1").matcher(markup);
    StringBuilder blanked = new StringBuilder();
    int copiedUpTo = 0;

    while (headers.find()) {
        // Copy the untouched text that precedes this header.
        blanked.append(markup, copiedUpTo, headers.start());

        // Substitute the header with filler sized to the matched text
        // (presumably spaces, so character offsets stay aligned -- see getSpaceString).
        int headerLength = headers.end() - headers.start();
        blanked.append(getSpaceString(headerLength));

        String heading = headers.group(2).trim();
        boolean boilerplate = heading.equalsIgnoreCase("see also")
                || heading.equalsIgnoreCase("external links")
                || heading.equalsIgnoreCase("references")
                || heading.equalsIgnoreCase("further reading");
        if (!boilerplate) {
            context.append("\n").append(heading);
        }

        copiedUpTo = headers.end();
    }

    // Copy whatever follows the last header (or the whole markup if none matched).
    blanked.append(markup, copiedUpTo, markup.length());
    return blanked.toString();
}
@Override public PreprocessedDocument preprocess(String content) { StringBuffer context = new StringBuffer(); ArrayList<RegionTag> regionTags = getRegionTags(content); HashSet<Integer> bannedTopics = new HashSet<Integer>(); String temp = blankTemplates(content); temp = blankTables(temp); temp = blankLinks(temp, context, bannedTopics); temp = blankSectionHeaders(temp, context); temp = clearAllMentions("(?s)\\<\\!\\-\\-(.*?)\\-\\-\\>", temp); // strip comments temp = clearAllMentions("<ref\\\\>", temp); // remove simple ref tags temp = clearAllMentions( "(?s)<ref>(.*?)</ref>", temp); // remove ref tags and all content between them. temp = clearAllMentions( "(?s)<ref\\s(.*?)>(.*?)</ref>", temp); // remove ref tags and all content between them (with attributes). temp = clearAllMentions("<(.*?)>", temp); // remove remaining html tags ; temp = clearAllMentions("\\[(http|www)(.*?)\\]", temp); // remove external links ; temp = clearAllMentions("'{2,}", temp); // remove all bold and italic markup ; temp = clearAllMentionsRetainFirstCharacter("\n:+", temp); // remove indents. temp = clearAllMentionsRetainFirstCharacter("\n([\\*\\#]+)", temp); // remove list markers. temp = clearAllMentions("&\\w{2,6};", temp); // remove entities return new PreprocessedDocument(content, temp, context.toString(), regionTags, bannedTopics); }
private void processLink(String markup, StringBuffer context, HashSet<Integer> bannedTopics) { // ignore everything that is not in main namespace if (markup.indexOf(":") > 0) return; String anchor = markup; String dest = markup; int pos = markup.lastIndexOf("|"); if (pos > 0) { anchor = markup.substring(pos + 1); dest = markup.substring(0, pos); } context.append("\n").append(anchor); Article art = wikipedia.getArticleByTitle(dest); if (art != null) { bannedTopics.add(art.getId()); } }