예제 #1
0
  /**
   * Matches a wiki section header: a run of two or more '=' characters, a title made of
   * non-'=' characters, then the same run of '=' characters again. Compiled once and shared.
   */
  private static final Pattern SECTION_HEADER_PATTERN = Pattern.compile("(={2,})([^=]+)\\1");

  /** Section titles (compared case-insensitively) that are never added to the context. */
  private static final String[] IGNORED_SECTION_TITLES = {
    "see also", "external links", "references", "further reading"
  };

  /**
   * Replaces every section header in {@code markup} with the result of
   * {@code getSpaceString(headerLength)} and collects the header titles into {@code context}.
   *
   * <p>Each trimmed title is appended to {@code context} on its own line (preceded by
   * {@code "\n"}), unless it is one of {@link #IGNORED_SECTION_TITLES}.
   *
   * @param markup the wiki markup to scan
   * @param context buffer that receives the retained section titles
   * @return the markup with every matched header substituted
   */
  private String blankSectionHeaders(String markup, StringBuffer context) {
    // NOTE(review): the title group is "any run of non-'=' characters", which also matches line
    // breaks, so a match can span several lines. Confirm this is intended.
    Matcher m = SECTION_HEADER_PATTERN.matcher(markup);

    int lastPos = 0;
    StringBuilder sb = new StringBuilder(markup.length());

    while (m.find()) {
      // Copy the text between the previous header and this one unchanged.
      sb.append(markup, lastPos, m.start());
      // getSpaceString is defined elsewhere; presumably it yields that many spaces so character
      // offsets into the markup stay valid - TODO confirm.
      sb.append(getSpaceString(m.group().length()));

      String title = m.group(2).trim();
      if (!isIgnoredSectionTitle(title)) {
        context.append("\n").append(title);
      }

      lastPos = m.end();
    }

    // Tail after the last header (or the whole markup when nothing matched).
    sb.append(markup, lastPos, markup.length());
    return sb.toString();
  }

  /** Returns true when {@code title} equals, ignoring case, one of the ignored section titles. */
  private static boolean isIgnoredSectionTitle(String title) {
    for (String ignored : IGNORED_SECTION_TITLES) {
      if (title.equalsIgnoreCase(ignored)) {
        return true;
      }
    }
    return false;
  }
예제 #2
0
  /**
   * Runs the wiki-markup cleaning pipeline over {@code content}.
   *
   * <p>The steps run in a fixed order: templates, tables, links and section headers are handled
   * by the {@code blank*} helpers (links and headers also feed {@code context}; links also feed
   * the banned-topic set), then a sequence of regex passes deals with comments, ref tags,
   * remaining HTML tags, external links, bold/italic quotes, indents, list markers and entities.
   * The helpers are defined elsewhere; presumably they overwrite matched text rather than delete
   * it so that offsets are preserved - TODO confirm.
   *
   * @param content the raw wiki markup
   * @return a {@link PreprocessedDocument} built from the original content, the cleaned text, the
   *     collected context, the region tags of the original content and the banned topic ids
   */
  @Override
  public PreprocessedDocument preprocess(String content) {

    StringBuffer context = new StringBuffer();
    ArrayList<RegionTag> regionTags = getRegionTags(content);
    HashSet<Integer> bannedTopics = new HashSet<Integer>();

    String temp = blankTemplates(content);
    temp = blankTables(temp);
    temp = blankLinks(temp, context, bannedTopics);
    temp = blankSectionHeaders(temp, context);

    temp = clearAllMentions("(?s)\\<\\!\\-\\-(.*?)\\-\\-\\>", temp); // strip comments

    // Remove self-closing ref tags (<ref/>, <ref name="x" />). This has to happen before the
    // paired-tag patterns below: otherwise "<ref name=\"x\"/>" is taken as an opening tag by the
    // attribute pattern, which then runs on to the next "</ref>" and swallows the text between.
    // The previous pattern here matched the literal text "<ref\>" and so never fired on real
    // markup.
    temp = clearAllMentions("<ref(\\s[^>]*)?/>", temp);
    temp =
        clearAllMentions(
            "(?s)<ref>(.*?)</ref>", temp); // remove ref tags and all content between them.
    temp =
        clearAllMentions(
            "(?s)<ref\\s(.*?)>(.*?)</ref>",
            temp); // remove ref tags and all content between them (with attributes).

    temp = clearAllMentions("<(.*?)>", temp); // remove remaining html tags ;

    temp = clearAllMentions("\\[(http|www)(.*?)\\]", temp); // remove external links ;

    temp = clearAllMentions("'{2,}", temp); // remove all bold and italic markup ;

    temp = clearAllMentionsRetainFirstCharacter("\n:+", temp); // remove indents.

    temp = clearAllMentionsRetainFirstCharacter("\n([\\*\\#]+)", temp); // remove list markers.

    temp = clearAllMentions("&\\w{2,6};", temp); // remove entities

    return new PreprocessedDocument(content, temp, context.toString(), regionTags, bannedTopics);
  }
예제 #3
0
  /**
   * Handles the inner text of a single wiki link.
   *
   * <p>Links containing a colon anywhere after the first character are skipped entirely. For the
   * rest, the text is split on its last pipe into destination and anchor; the anchor is appended
   * to {@code context} on a new line, and if the destination resolves to an article its id is
   * added to {@code bannedTopics}.
   *
   * @param markup the link text, i.e. {@code dest} or {@code dest|anchor}
   * @param context buffer that receives the link anchor
   * @param bannedTopics set that receives the id of the linked article, when one is found
   */
  private void processLink(String markup, StringBuffer context, HashSet<Integer> bannedTopics) {

    // A colon past index 0 is treated as a namespace prefix: not a main-namespace link.
    boolean hasNamespacePrefix = markup.indexOf(':') > 0;
    if (hasNamespacePrefix) {
      return;
    }

    // Split on the last pipe. Without a pipe (or with one at index 0) the whole text serves as
    // both destination and anchor.
    int pipeIndex = markup.lastIndexOf('|');
    boolean piped = pipeIndex > 0;
    String dest = piped ? markup.substring(0, pipeIndex) : markup;
    String anchor = piped ? markup.substring(pipeIndex + 1) : markup;

    context.append("\n").append(anchor);

    Article art = wikipedia.getArticleByTitle(dest);
    if (art == null) {
      return;
    }
    bannedTopics.add(art.getId());
  }