public static void main(String[] args) throws WikiApiException {

    // Name of the image template. Replace it with the template name used by your
    // Wiki language edition, e.g. "Image" in English.
    String imageTemplateName = "Image";

    // Describe how to reach the Wikipedia database.
    DatabaseConfiguration connection = new DatabaseConfiguration();
    connection.setDatabase("DATABASE");
    connection.setHost("HOST");
    connection.setUser("USER");
    connection.setPassword("PASSWORD");
    connection.setLanguage(Language.english);

    // Open the wiki and fetch the page titled 'Dog'.
    Page dogPage = new Wikipedia(connection).getPage("Dog");

    // Set up the parser factory: FlushTemplates filters TEMPLATE elements, and the
    // registered image identifier filters Image elements.
    MediaWikiParserFactory factory = new MediaWikiParserFactory();
    factory.setTemplateParserClass(FlushTemplates.class);
    factory.getImageIdentifers().add(imageTemplateName);

    // Parse the raw page text and print the resulting plain text.
    MediaWikiParser wikiParser = factory.createParser();
    ParsedPage parsed = wikiParser.parse(dogPage.getText());
    System.out.println(parsed.getText());
  }
Пример #2
0
    /**
     * Parses the text of a revision and records noun titles found in it.
     *
     * <p>All declinations extracted from the page templates are added to {@code nounTitles}.
     * Then the sections are scanned in order; for the first section that has part-of-speech
     * templates, {@code pageTitle} is added to {@code nounTitles} for every template whose
     * first parameter satisfies {@code isNoun}, and processing stops there.
     *
     * @param rev the revision whose {@code Text} is parsed
     * @throws IOException declared by the overridden method
     */
    @Override
    public void writeRevision(final Revision rev) throws IOException {
      final ParsedPage parsed = parser.parse(rev.Text);
      if (parsed == null) {
        LOGGER.warn("Could not parse page with title {}", pageTitle);
        return;
      }
      if (parsed.getSections() == null) {
        return;
      }

      final Set<String> declinations = getDeclinations(parsed.getTemplates());
      final boolean noDeclinations = declinations.isEmpty();
      if (!noDeclinations) {
        nounTitles.addAll(declinations);
      }

      for (final Section section : parsed.getSections()) {
        final List<Template> posTemplates = getPartOfSpeechTemplates(section);
        if (posTemplates.isEmpty()) {
          continue;
        }
        for (final Template posTemplate : posTemplates) {
          if (!isNoun.f(getFirstParameter.f(posTemplate))) {
            continue;
          }
          nounTitles.add(pageTitle);
          if (noDeclinations && LOGGER.isDebugEnabled()) {
            LOGGER.debug("Found no declinations for page {}", pageTitle);
          }
        }
        // Only the first section that carries part-of-speech templates is considered.
        return;
      }

      // No section had part-of-speech templates; note pages that still mention 'Substantiv'.
      if (LOGGER.isDebugEnabled() && rev.Text.contains("Substantiv")) {
        LOGGER.debug(
            "No part-of-speech found for {} (which indeed contains 'Substantiv')", pageTitle);
      }
    }
Пример #3
0
  public static void main(String[] args) {

    // Sample document; its contents are equal to "DarmstadtWikipediaArticle.txt".
    String sampleText = TestFile.getFileText();

    // Parse the sample with a default parser.
    MediaWikiParser wikiParser = new MediaWikiParserFactory().createParser();
    ParsedPage parsed = wikiParser.parse(sampleText);

    // Print every link in upper case between angle brackets, preceded by one token of
    // left context and followed by two tokens of right context.
    for (Link link : parsed.getLinks()) {
      StringBuilder line = new StringBuilder();
      line.append(link.getContext(1, 0));
      line.append("<");
      line.append(link.getText().toString().toUpperCase());
      line.append(">");
      line.append(link.getContext(0, 2));
      System.out.println(line.toString());
    }
  }
Пример #4
0
  /**
   * Builds the {@link TokenStream} used to tokenize the text read from {@code reader}.
   *
   * <p>One of three pipelines is selected:
   *
   * <ul>
   *   <li>{@code snowballStemmer} equals {@code "*porter"}: WikipediaTokenizer, StandardFilter,
   *       LowerCaseFilter, an optional StopFilter, then PorterStemFilter applied three times.
   *   <li>{@code lang} is unset or not {@code "zh"}: WikipediaTokenizer, StandardFilter,
   *       LowerCaseFilter, an optional StopFilter, and, if {@code snowballStemmer} is set, a
   *       SnowballFilter using the stemmer class of that name.
   *   <li>{@code lang} is {@code "zh"}: the input is first run through the MediaWiki parser to
   *       obtain plain text, then SentenceTokenizer, WordTokenFilter, PorterStemFilter and an
   *       optional StopFilter are applied.
   * </ul>
   *
   * <p>The stop filter is only added when {@code stopWordSet} is not {@code null}.
   *
   * @param fieldName name of the field being analyzed; not used by this implementation
   * @param reader source of the raw text to tokenize
   * @return the configured token stream; never {@code null}
   */
  @Override
  public TokenStream tokenStream(String fieldName, Reader reader) {
    Logger log = Logger.getLogger(LUCENEWikipediaAnalyzer.class.getName());

    // Constant-first comparison: snowballStemmer may legitimately be null (it is
    // null-checked further down), so calling equals() on it directly could throw.
    if ("*porter".equals(snowballStemmer)) {
      // Use the Porter stemmer instead of Snowball (orig. wikiprep-esa).
      Tokenizer tokenizer = new WikipediaTokenizer(reader);
      TokenStream stream = new StandardFilter(Version.LUCENE_30, tokenizer);
      stream = new LowerCaseFilter(Version.LUCENE_30, stream);

      if (stopWordSet != null) {
        stream = new StopFilter(Version.LUCENE_30, stream, stopWordSet);
      }

      // NOTE(review): the stemmer is chained three times; this looks deliberate (kept from
      // the original wikiprep-esa setup) -- confirm before reducing it to a single pass.
      stream = new PorterStemFilter(stream);
      stream = new PorterStemFilter(stream);
      stream = new PorterStemFilter(stream);

      return stream;
    }

    if (lang == null || !lang.equals("zh")) {
      Tokenizer tokenizer = new WikipediaTokenizer(reader);

      TokenStream stream = new StandardFilter(Version.LUCENE_30, tokenizer);
      // A LengthFilter(3..100) step used to sit here; it is disabled.
      stream = new LowerCaseFilter(Version.LUCENE_30, stream);

      // stopword filter
      if (stopWordSet != null) {
        stream = new StopFilter(Version.LUCENE_30, stream, stopWordSet);
      }

      // If a stemmer class is configured, add the stemming filter. Failure to load or
      // instantiate the stemmer is logged and the unstemmed stream is returned.
      if (snowballStemmer != null) {
        try {
          Class<? extends SnowballProgram> stemmer =
              Class.forName(snowballStemmer).asSubclass(SnowballProgram.class);
          stream = new SnowballFilter(stream, stemmer.newInstance());
        } catch (InstantiationException ex) {
          log.log(Level.SEVERE, null, ex);
        } catch (IllegalAccessException ex) {
          log.log(Level.SEVERE, null, ex);
        } catch (ClassNotFoundException ex) {
          log.log(Level.SEVERE, null, ex);
        }
      }
      return stream;
    }

    // From here on lang is "zh". The input needs to be cleaned first, because the
    // SentenceTokenizer does not accept a token stream as in the case of other languages.
    String cleanedText = "";
    try {
      MediaWikiParserFactory pf = new MediaWikiParserFactory();
      MediaWikiParser parser = pf.createParser();

      StringWriter sw = new StringWriter();
      IOUtils.copy(reader, sw);

      // Guard against a null parse result or null text so that an unparsable page
      // yields an empty stream instead of a NullPointerException.
      ParsedPage p = parser.parse(sw.toString());
      String plainText = (p == null) ? null : p.getText();
      if (plainText != null) {
        cleanedText = plainText;
      } else {
        log.log(Level.WARNING, "MediaWiki parser returned no text; tokenizing empty input");
      }
    } catch (IOException ex) {
      // Reading the input failed: log it and fall back to empty input.
      log.log(Level.SEVERE, null, ex);
    }

    Tokenizer tokenizer = new SentenceTokenizer(new StringReader(cleanedText));
    TokenStream stream = new WordTokenFilter(tokenizer);
    stream = new PorterStemFilter(stream);
    // Same null guard as in the other pipelines: no stop filter without a stop word set.
    if (stopWordSet != null) {
      stream = new StopFilter(Version.LUCENE_30, stream, stopWordSet);
    }

    return stream;
  }