/**
 * Strips a string of all markup; tries to turn it into plain text.
 *
 * @param markup the text to be stripped
 * @return the stripped text
 */
public static String stripEverything(String markup) {
    String text = stripTemplates(markup);

    // Remove boilerplate sections that carry no article prose.
    String[] boilerplateSections = {"see also", "references", "further reading", "external links"};
    for (String section : boilerplateSections) {
        text = MarkupStripper.stripSection(text, section);
    }

    text = stripTables(text);
    text = stripIsolatedLinks(text);
    text = stripLinks(text);
    text = stripHTML(text);
    text = stripExternalLinks(text);
    text = stripFormatting(text);
    return stripExcessNewlines(text);
}
/**
 * Build a map of label occurrences for one dump-file record.
 *
 * <p>On the first call the label vocabulary is loaded lazily from {@code labelFiles}.
 * The record is parsed as a page, its markup stripped to plain text, and every
 * n-gram (up to {@code maxLabelLength} boundary-delimited tokens) that appears
 * in the vocabulary is counted locally for the document, then emitted once per
 * distinct label.
 *
 * @param key The map key
 * @param value The <tt>Text</tt> value.
 * @param output The <tt>OutputCollector</tt>.
 * @param reporter The <tt>Reporter</tt>.
 * @throws IOException
 */
@Override
public void map(
        LongWritable key, Text value, OutputCollector<Text, ExLabel> output, Reporter reporter)
        throws IOException {
    try {
        // Lazy one-time initialization; the vocabulary is reused for every
        // subsequent record handled by this mapper instance.
        if (labelVocabulary == null) {
            labelVocabulary = new THashSet<String>();
            for (Path p : labelFiles) {
                labelVocabulary = gatherLabelVocabulary(p, labelVocabulary, reporter);
            }
        }
        DumpPage page = pageParser.parsePage(value.toString());
        if (page != null) {
            // build up all the labels locally for this document before
            // emitting, to maintain docCounts and occCounts
            HashMap<String, ExLabel> labels = new HashMap<String, ExLabel>();
            String markup = page.getMarkup();
            markup = stripper.stripToPlainText(markup, null);
            // Sentinel tokens guarantee a boundary match at both ends of the text.
            String s = DOLLAR_TOKEN_OPEN + markup + DOLLAR_TOKEN_CLOSE;
            // would just match all non-word chars, but we don't want to
            // match utf chars
            // NOTE(review): compiled once per record; if NON_WORD_PATTERN is
            // constant this could be hoisted to a static final Pattern — confirm.
            Pattern p = Pattern.compile(NON_WORD_PATTERN);
            Matcher m = p.matcher(s);
            // Record the start index of every token-boundary match.
            List<Integer> matchIndexes = new ArrayList<Integer>();
            while (m.find()) {
                matchIndexes.add(m.start());
            }
            for (int i = 0; i < matchIndexes.size(); i++) {
                // First character of the candidate n-gram, just past boundary i.
                int startIndex = matchIndexes.get(i) + 1;
                if (Character.isWhitespace(s.charAt(startIndex))) {
                    continue;
                }
                // Try the longest candidate first, shrinking one boundary at a
                // time; each (i, j) pair yields the text between the two boundaries.
                for (int j = Math.min(i + maxLabelLength, matchIndexes.size() - 1); j > i; j--) {
                    int currIndex = matchIndexes.get(j);
                    String ngram = s.substring(startIndex, currIndex);
                    // Skip single characters directly preceded by an apostrophe
                    // (presumably the "s" of a possessive) and blank n-grams.
                    if (!(ngram.length() == 1 && "'".equals(s.substring(startIndex - 1, startIndex)))
                            && !"".equals(ngram.trim())) {
                        if (labelVocabulary.contains(ngram)) {
                            ExLabel label = labels.get(ngram);
                            if (label == null) {
                                // First sighting in this document. The constructor
                                // args appear to seed text occ/doc counts to 1 —
                                // TODO confirm against ExLabel's parameter order.
                                label = new ExLabel(0, 0, 1, 1, new TreeMap<Integer, ExSenseForLabel>());
                            } else {
                                label.setTextOccCount(label.getTextOccCount() + 1);
                            }
                            labels.put(ngram, label);
                        }
                    }
                }
            }
            // now emit all of the labels we have gathered
            for (Map.Entry<String, ExLabel> entry : labels.entrySet()) {
                output.collect(new Text(entry.getKey()), entry.getValue());
            }
        }
    } catch (Exception e) {
        // Boundary catch: a single bad record is logged and skipped rather
        // than failing the whole map task.
        LOG.error(UNDETERMINED_EXCEPTION, e);
    }
}