Exemplo n.º 1
    /**
     * Parses the text of a revision and walks the sections of the parsed page, looking at each
     * section's part-of-speech templates for nouns.
     *
     * <p>NOTE(review): this excerpt is truncated. Closing braces and at least one statement opener
     * are missing, so the exact nesting of the checks below cannot be confirmed from here.
     *
     * @param rev the revision whose {@code Text} field is handed to the parser
     * @throws IOException declared by the signature; the throwing call is not visible in this excerpt
     */
    public void writeRevision(final Revision rev) throws IOException {
      final ParsedPage pp = parser.parse(rev.Text);
      if (pp == null) {
        // The parser produced no page for this text; only a warning is logged.
        LOGGER.warn("Could not parse page with title {}", pageTitle);
      } else if (pp.getSections() != null) {

        // Declinations are collected from the page-level templates.
        final Set<String> declinations = getDeclinations(pp.getTemplates());
        // NOTE(review): the body and closing brace of this `if` appear to be missing from the
        // excerpt; the loop below is indented as its sibling, not as its child.
        if (!declinations.isEmpty()) {

        for (final Section section : pp.getSections()) {

          final List<Template> partOfSpeechTemplates = getPartOfSpeechTemplates(section);
          if (!partOfSpeechTemplates.isEmpty()) {
            for (final Template template : partOfSpeechTemplates) {
              // Only templates whose first parameter is classified as a noun are considered.
              if (isNoun.f(getFirstParameter.f(template))) {
                if (declinations.isEmpty() && LOGGER.isDebugEnabled()) {
                  LOGGER.debug("Found no declinations for page {}", pageTitle);
        // NOTE(review): the opening of the logging call that takes the arguments on the next line
        // is missing from the excerpt (presumably LOGGER.debug( — confirm against the original).
        if (LOGGER.isDebugEnabled() && rev.Text.contains("Substantiv")) {
              "No part-of-speech found for {} (which indeed contains 'Substantiv')", pageTitle);
  /**
   * Example entry point: opens a Wikipedia database, fetches the page titled "Dog" and parses its
   * text with a parser created from a {@code MediaWikiParserFactory}.
   *
   * <p>NOTE(review): the excerpt is truncated. The method's closing brace is missing, and so is the
   * statement that would follow the "filtering Image-Elements" comment.
   *
   * @param args command-line arguments; not read in the visible code
   * @throws WikiApiException declared by the signature
   */
  public static void main(String[] args) throws WikiApiException {

    // db connection settings
    // NOTE(review): no settings are applied to dbConfig in the visible code.
    DatabaseConfiguration dbConfig = new DatabaseConfiguration();

    // initialize a wiki
    Wikipedia wiki = new Wikipedia(dbConfig);

    // get the page 'Dog'
    Page p = wiki.getPage("Dog");

    // get a ParsedPage object
    MediaWikiParserFactory pf = new MediaWikiParserFactory();
    pf.setTemplateParserClass(FlushTemplates.class); // Filtering TEMPLATE-Elements

    String IMAGE =
        "Image"; // Replace it with the image template name in your Wiki language edition,
    // e.g. "Image" in English

    // filtering Image-Elements
    // NOTE(review): IMAGE is never used in the visible code; the call that registers it with the
    // factory appears to have been lost in extraction.

    // parse page text
    MediaWikiParser parser = pf.createParser();
    ParsedPage pp = parser.parse(p.getText());

  /**
   * Returns the text of a page as produced by the parser.
   *
   * <p>NOTE(review): the closing braces of this method are missing from the excerpt.
   *
   * @param page the page whose text is parsed
   * @return the parsed page's text, or the empty string when the parser returns {@code null}
   */
  protected String getPlainDocumentText(Page page) {
    ParsedPage pp = parser.parse(page.getText());

    // A null result from the parser is mapped to an empty string rather than propagated.
    if (pp != null) {
      return pp.getText();
    } else {
      return "";
Exemplo n.º 4
  /**
   * Example entry point: parses a sample document and, for every link in it, builds a string made
   * of the link's left context, the upper-cased link text in angle brackets, and its right context.
   *
   * <p>NOTE(review): the excerpt is truncated. The opener of the call that consumes the
   * concatenated string (presumably a print statement — confirm) and the closing braces are
   * missing.
   *
   * @param args command-line arguments; not read in the visible code
   */
  public static void main(String[] args) {

    // load a sample document (the contents are equal to "DarmstadtWikipediaArticle.txt")
    String documentText = TestFile.getFileText();

    // get a ParsedPage object
    MediaWikiParserFactory pf = new MediaWikiParserFactory();
    MediaWikiParser parser = pf.createParser();
    ParsedPage pp = parser.parse(documentText);

    // Link context (returns 1 token to the left and 2 tokens to the right of the link)
    // NOTE(review): pp is dereferenced without a null check.
    for (Link link : pp.getLinks()) {
          // getContext(1, 0) supplies the left side, getContext(0, 2) the right side.
          link.getContext(1, 0)
              + "<"
              + link.getText().toString().toUpperCase()
              + ">"
              + link.getContext(0, 2));
  /**
   * Returns the text of a revision, converted to cleaned plain text when {@code outputPlainText}
   * is set.
   *
   * <p>NOTE(review): closing braces are missing from the excerpt, so the extent of the
   * {@code outputPlainText} branch is inferred from indentation only.
   *
   * @param rev the revision to read the text from
   * @return the revision text; in the plain-text path, the empty string when parsing yields
   *     {@code null}
   */
  private String getText(Revision rev) {
    String text = rev.getRevisionText();

    if (outputPlainText) {
      // HTML entities are unescaped before the markup is parsed.
      text = StringEscapeUtils.unescapeHtml(text);

      ParsedPage pp = parser.parse(text);

      // An unparsable revision yields an empty string instead of the raw markup.
      if (pp == null) {
        return "";

      text = pp.getText();

      // text = WikiUtils.mediaWikiMarkup2PlainText(text);

      // replace multiple white space with single white space
      text = WikiUtils.cleanText(text);

    return text;
Exemplo n.º 6
 /**
  * Counts the tokens in the text of a parsed page, splitting on the space character only.
  *
  * <p>NOTE(review): the excerpt is truncated. The closing brace after the {@code throw} and the
  * opener of the call that formats the timing message are missing.
  *
  * @param parsedPage the page whose text is tokenized; must not be {@code null}
  * @return the number of space-delimited tokens in the page text
  * @throws IllegalStateException if {@code parsedPage} is {@code null}
  */
 public int countWords(final ParsedPage parsedPage) {
   // Start time, used for the runtime figure in the message formatted below.
   long start = System.currentTimeMillis();
   if (null == parsedPage) {
     throw new IllegalStateException("parsedPage must not be null");
   String text = parsedPage.getText();
   // The delimiter set is just " ", so tabs and line breaks do not separate tokens here.
   int wordCount = new StringTokenizer(text, " ").countTokens();
           // Format arguments: thread id, token count, elapsed milliseconds.
           "%scountWords: count=%s runtime=%sms",
           getThreadId(), wordCount, System.currentTimeMillis() - start));
   return wordCount;
Exemplo n.º 7
  /**
   * Returns the information of a ParsedPage that is selected by the current configuration.
   *
   * <p>When {@code pageHandling} is {@code null}, the first paragraph (if
   * {@code firstParagraphHandling} is set) and then every section are handled individually.
   * Otherwise the page-level flags in {@code pageHandling} decide which parts are appended.
   *
   * <p>NOTE(review): the excerpt is truncated. Several closing braces are missing, and the
   * {@code CIT.TEXT} branch has no visible body.
   *
   * @param pp the parsed page to extract from
   * @return the collected text with leading and trailing whitespace trimmed, or {@code null} when
   *     {@code pp} is {@code null}
   */
  public String getSelectedText(ParsedPage pp) {
    if (pp == null) return null;

    StringBuilder sb = new StringBuilder();

    // levelModifier is derived from the level of the page's first section.
    // NOTE(review): assumes the page has at least one section — confirm getSection(0) is safe.
    levelModifier = pp.getSection(0).getLevel() - 1;

    if (pageHandling == null) {
      if (firstParagraphHandling != null) {
        // The first paragraph is handled on its own and then passed to deleteParagraph
        // together with the section list.
        handleContent(pp.getFirstParagraph(), firstParagraphHandling, sb);
        deleteParagraph(pp.getFirstParagraphNr(), pp.getSections());
      for (Section s : pp.getSections()) handleSection(s, sb);
    } else {
      // NOTE(review): the body of the TEXT branch is empty in this excerpt; a statement was
      // probably lost in extraction — confirm against the original.
      if (pageHandling.get(CIT.TEXT)) {
      } else {
        if (pageHandling.get(CIT.BOLD)) {
          handleSpans(pp.getFormatSpans(FormatType.BOLD), pp.getText(), sb);
        if (pageHandling.get(CIT.ITALIC)) {
          handleSpans(pp.getFormatSpans(FormatType.ITALIC), pp.getText(), sb);

      // The second argument to handleLinks is true exactly when TEXT handling is off.
      if (pageHandling.get(CIT.LINK)) handleLinks(pp.getLinks(), !pageHandling.get(CIT.TEXT), sb);

    return sb.toString().trim();
Exemplo n.º 8
  /**
   * Produces a TokenStream instance for tokenizing the input text. First, the language is
   * determined, because Chinese needs special treatment. Then the individual filters
   * (length, stemming, stopword removal) are hooked up and the corresponding TokenStream instance
   * is returned.
   *
   * @param fieldName name of the field being analyzed
   * @param reader reader supplying the text to tokenize
   * @return the configured TokenStream
   */
  public TokenStream tokenStream(String fieldName, Reader reader) {
    // NOTE(review): the excerpt is truncated — closing braces are missing throughout this method.
    // NOTE(review): fieldName is not read anywhere in the visible body.
    // NOTE(review): snowballStemmer is dereferenced here but null-checked further down; if it can
    // be null this line throws a NullPointerException — confirm.
    if (snowballStemmer.equals(
        "*porter")) { // if you want to use porter stemmer instead of snowball (orig. wikiprep-esa)
      Tokenizer tokenizer = new WikipediaTokenizer(reader);
      TokenStream stream = new StandardFilter(Version.LUCENE_30, tokenizer);
      stream = new LowerCaseFilter(Version.LUCENE_30, stream);

      if (stopWordSet != null) {
        stream = new StopFilter(Version.LUCENE_30, stream, stopWordSet);

      // NOTE(review): the Porter stemmer is applied three times in a row; presumably deliberate,
      // to mirror the original wikiprep-esa pipeline — confirm before changing.
      stream = new PorterStemFilter(stream);
      stream = new PorterStemFilter(stream);
      stream = new PorterStemFilter(stream);

      return stream;

    } else if (lang == null || !lang.equals("zh")) {
      // Every language other than Chinese, including an unset language.
      Tokenizer tokenizer = new WikipediaTokenizer(reader);

      TokenStream stream = new StandardFilter(Version.LUCENE_30, tokenizer);
      // cstream = new LengthFilter(true, stream, 3, 100);
      stream = new LowerCaseFilter(Version.LUCENE_30, stream);
      // stopword filter
      if (stopWordSet != null) {
        stream = new StopFilter(Version.LUCENE_30, stream, stopWordSet);
      // if stemmer is defined, add stemming filter
      if (snowballStemmer != null) {
        try {
          // The stemmer class is loaded by name and instantiated reflectively.
          Class<SnowballProgram> stemmer = (Class<SnowballProgram>) Class.forName(snowballStemmer);
          stream = new SnowballFilter(stream, stemmer.newInstance());
        // Reflection failures are logged; in the visible code no stemming filter is added then.
        } catch (InstantiationException ex) {
          Logger.getLogger(LUCENEWikipediaAnalyzer.class.getName()).log(Level.SEVERE, null, ex);
        } catch (IllegalAccessException ex) {
          Logger.getLogger(LUCENEWikipediaAnalyzer.class.getName()).log(Level.SEVERE, null, ex);
        } catch (ClassNotFoundException ex) {
          Logger.getLogger(LUCENEWikipediaAnalyzer.class.getName()).log(Level.SEVERE, null, ex);
      return stream;
    } else if (lang.equals("zh")) {
      try {
        // For Chinese, the input needs to be cleaned, because
        // the SentenceTokenizer does not accept a token stream
        // as in the case of English/other languages.
        MediaWikiParserFactory pf = new MediaWikiParserFactory();
        MediaWikiParser parser = pf.createParser();

        // The whole reader is drained into a string so it can be parsed as wiki markup.
        StringWriter sw = new StringWriter();
        IOUtils.copy(reader, sw);

        // NOTE(review): p is dereferenced without a null check — confirm parse() cannot return
        // null for this input.
        ParsedPage p = parser.parse(sw.toString());
        reader = new StringReader(p.getText());
      } catch (IOException ex) {
        // On a read failure the error is logged and an empty document is tokenized instead.
        Logger.getLogger(LUCENEWikipediaAnalyzer.class.getName()).log(Level.SEVERE, null, ex);
        reader = new StringReader("");

      Tokenizer tokenizer = new SentenceTokenizer(reader);
      TokenStream stream = new WordTokenFilter(tokenizer);
      stream = new PorterStemFilter(stream);
      // NOTE(review): unlike the other branches, stopWordSet is passed on without a null check.
      stream = new StopFilter(Version.LUCENE_30, stream, stopWordSet);

      return stream;

    } else {
      // if it gets here, something's wrong with the language selection IFs
      // (the two preceding conditions are complementary, so this branch is not expected to run)
      return null;