/**
   * Strips a string of all markup; tries to turn it into plain text
   *
   * @param markup the text to be stripped
   * @return the stripped text
   */
  public static String stripEverything(String markup) {

    String text = stripTemplates(markup);

    // Remove the standard boilerplate sections that carry no usable body text.
    String[] boilerplateSections = {"see also", "references", "further reading", "external links"};
    for (String section : boilerplateSections) {
      text = MarkupStripper.stripSection(text, section);
    }

    // Strip the remaining structural and inline markup, from coarsest to finest.
    text = stripTables(text);
    text = stripIsolatedLinks(text);
    text = stripLinks(text);
    text = stripHTML(text);
    text = stripExternalLinks(text);
    text = stripFormatting(text);

    return stripExcessNewlines(text);
  }
  /**
   * Build a map of label occurrences.
   *
   * <p>Parses one dump page from {@code value}, strips its markup to plain text, then scans every
   * ngram bounded by non-word characters (up to {@code maxLabelLength} boundaries ahead) against
   * the shared label vocabulary. Occurrence counts are accumulated locally per page so that each
   * distinct label is emitted exactly once per document.
   *
   * @param key The map key
   * @param value The <tt>Text</tt> value.
   * @param output The <tt>OutputCollector</tt>.
   * @param reporter The <tt>Reporter</tt>.
   * @throws IOException
   */
  @Override
  public void map(
      LongWritable key, Text value, OutputCollector<Text, ExLabel> output, Reporter reporter)
      throws IOException {

    try {

      // Lazily build the vocabulary on the first call to map(); later calls reuse it.
      if (labelVocabulary == null) {
        labelVocabulary = new THashSet<String>();

        for (Path p : labelFiles) {
          labelVocabulary = gatherLabelVocabulary(p, labelVocabulary, reporter);
        }
      }

      DumpPage page = pageParser.parsePage(value.toString());

      if (page != null) {

        // build up all the labels locally for this document before
        // emitting,
        // to maintain docCounts and occCounts
        //
        HashMap<String, ExLabel> labels = new HashMap<String, ExLabel>();

        String markup = page.getMarkup();
        markup = stripper.stripToPlainText(markup, null);

        // Wrap the text in sentinel tokens so the boundary matcher fires at both ends,
        // letting ngrams that start or end the document be extracted uniformly.
        String s = DOLLAR_TOKEN_OPEN + markup + DOLLAR_TOKEN_CLOSE;

        // would just match all non-word chars, but we don't want to
        // match utf chars
        //
        // NOTE(review): this pattern is recompiled on every map() call; it could be cached
        // as a static final field.
        Pattern p = Pattern.compile(NON_WORD_PATTERN);

        Matcher m = p.matcher(s);

        // Collect the start offsets of all word-boundary characters; ngram candidates are
        // the substrings between consecutive boundaries.
        List<Integer> matchIndexes = new ArrayList<Integer>();

        while (m.find()) {
          matchIndexes.add(m.start());
        }

        for (int i = 0; i < matchIndexes.size(); i++) {

          // Candidate ngrams start one character past the boundary match.
          int startIndex = matchIndexes.get(i) + 1;

          if (Character.isWhitespace(s.charAt(startIndex))) {
            continue;
          }

          // Try every ngram length from the longest allowed down to a single token;
          // all lengths are counted (no break), not just the longest match.
          for (int j = Math.min(i + maxLabelLength, matchIndexes.size() - 1); j > i; j--) {
            int currIndex = matchIndexes.get(j);
            String ngram = s.substring(startIndex, currIndex);

            // Skip single characters that directly follow an apostrophe (possessives and
            // contractions such as the "s" in "world's"), and skip blank ngrams.
            if (!(ngram.length() == 1 && "'".equals(s.substring(startIndex - 1, startIndex)))
                && !"".equals(ngram.trim())) {
              if (labelVocabulary.contains(ngram)) {

                ExLabel label = labels.get(ngram);

                if (label == null) {
                  // First occurrence in this document — presumably the two 1s seed the
                  // text doc count and text occurrence count; verify against ExLabel's ctor.
                  label = new ExLabel(0, 0, 1, 1, new TreeMap<Integer, ExSenseForLabel>());
                } else {
                  label.setTextOccCount(label.getTextOccCount() + 1);
                }

                labels.put(ngram, label);
              }
            }
          }
        }

        // now emit all of the labels we have gathered
        //
        for (Map.Entry<String, ExLabel> entry : labels.entrySet()) {
          output.collect(new Text(entry.getKey()), entry.getValue());
        }
      }

    } catch (Exception e) {
      // Broad catch: any failure on a single page is logged and the page is skipped,
      // keeping the job alive across malformed input records.
      LOG.error(UNDETERMINED_EXCEPTION, e);
    }
  }