public static void main(String[] args) throws WikiApiException {

    // Configure the database connection for the JWPL Wikipedia backend.
    DatabaseConfiguration databaseConfig = new DatabaseConfiguration();
    databaseConfig.setDatabase("DATABASE");
    databaseConfig.setHost("HOST");
    databaseConfig.setUser("USER");
    databaseConfig.setPassword("PASSWORD");
    databaseConfig.setLanguage(Language.english);

    // Connect to the configured Wikipedia instance.
    Wikipedia wiki = new Wikipedia(databaseConfig);

    // Fetch the article titled 'Dog'.
    Page dogPage = wiki.getPage("Dog");

    // Build a parser factory that drops TEMPLATE elements from the output.
    MediaWikiParserFactory parserFactory = new MediaWikiParserFactory();
    parserFactory.setTemplateParserClass(FlushTemplates.class);

    // Template name used for images in this Wiki language edition
    // (e.g. "Image" for English); adjust for other editions.
    String imageTemplateName = "Image";

    // Register the image identifier so Image elements are filtered as well.
    parserFactory.getImageIdentifers().add(imageTemplateName);

    // Parse the raw markup of the page and print the plain text.
    MediaWikiParser mediaWikiParser = parserFactory.createParser();
    ParsedPage parsedPage = mediaWikiParser.parse(dogPage.getText());

    System.out.println(parsedPage.getText());
  }
Example #2
  public static void main(String[] args) {

    // load a sample document (the contents are equal to "DarmstadtWikipediaArticle.txt")
    String documentText = TestFile.getFileText();

    // get a ParsedPage object
    MediaWikiParserFactory pf = new MediaWikiParserFactory();
    MediaWikiParser parser = pf.createParser();
    ParsedPage pp = parser.parse(documentText);

    // Link Context (return 1 token left, 2 token right of the link)
    for (Link link : pp.getLinks()) {
      System.out.println(
          link.getContext(1, 0)
              + "<"
              + link.getText().toString().toUpperCase()
              + ">"
              + link.getContext(0, 2));
    }
  }
Example #3
  /**
   * Create a new instance with a handle to the (unpacked) xml pages/articles dump.
   *
   * <p>The dump is read eagerly in the constructor; the collected noun page titles are stored in
   * {@code nounTitles}.
   *
   * @param wiktionaryDump the unpacked xml dump.
   * @throws FileNotFoundException if the provided wiktionaryDump does not exist.
   * @throws RuntimeException if an I/O error occurs while reading the dump.
   */
  public WiktionaryLoader(final File wiktionaryDump) throws FileNotFoundException {
    // get a ParsedPage object
    final MediaWikiParserFactory pf = new MediaWikiParserFactory();
    final MediaWikiParser parser = pf.createParser();

    final PageTitleNounCollector pageTitleNounCollector = new PageTitleNounCollector(parser);
    // try-with-resources closes the stream on every path (replaces Closeables.closeQuietly,
    // which silently discarded close() failures).
    try (FileInputStream fis = new FileInputStream(wiktionaryDump)) {
      final XmlDumpReader dumpReader =
          new XmlDumpReader(
              fis, new NamespaceFilter(new LatestFilter(pageTitleNounCollector), "NS_MAIN"));
      dumpReader.readDump();
      nounTitles = pageTitleNounCollector.getNounTitles();
      LOGGER.info("Loaded {} nouns.", nounTitles.size());
    } catch (final FileNotFoundException e) {
      // Preserve the declared contract: a missing dump propagates as FileNotFoundException
      // rather than being wrapped like other I/O failures.
      throw e;
    } catch (final IOException e) {
      // Keep the cause attached so callers can diagnose the underlying failure.
      LOGGER.error("An error occurred when trying to read dump.", e);
      throw new RuntimeException(e);
    }
  }
  /**
   * Produces a TokenStream instance for tokenizing input text. First, a language is determined,
   * because a special treatment needs to be taken for Chinese. Then, the individual filters
   * (length, stemming, stopword removal) are hooked up and the corresponding TokenStream instance
   * is returned.
   *
   * @param fieldName the field being analyzed (unused by this analyzer)
   * @param reader source of the raw text to tokenize
   * @return the configured filter chain for the detected language / stemmer setting
   */
  @Override
  public TokenStream tokenStream(String fieldName, Reader reader) {
    // Yoda condition: snowballStemmer may be null (see the guard in the branch below);
    // the original `snowballStemmer.equals(...)` threw NPE in that case.
    if ("*porter".equals(snowballStemmer)) {
      // Porter stemmer requested instead of Snowball (orig. wikiprep-esa behaviour).
      Tokenizer tokenizer = new WikipediaTokenizer(reader);
      TokenStream stream = new StandardFilter(Version.LUCENE_30, tokenizer);
      stream = new LowerCaseFilter(Version.LUCENE_30, stream);

      if (stopWordSet != null) {
        stream = new StopFilter(Version.LUCENE_30, stream, stopWordSet);
      }

      // Apply the Porter stemmer exactly once; the original triple application was
      // accidental line duplication and could over-stem tokens.
      stream = new PorterStemFilter(stream);
      return stream;

    } else if (lang == null || !lang.equals("zh")) {
      // English / other non-Chinese languages.
      Tokenizer tokenizer = new WikipediaTokenizer(reader);

      TokenStream stream = new StandardFilter(Version.LUCENE_30, tokenizer);
      stream = new LowerCaseFilter(Version.LUCENE_30, stream);
      // stopword filter
      if (stopWordSet != null) {
        stream = new StopFilter(Version.LUCENE_30, stream, stopWordSet);
      }
      // if stemmer is defined, add stemming filter
      if (snowballStemmer != null) {
        try {
          @SuppressWarnings("unchecked")
          Class<SnowballProgram> stemmer =
              (Class<SnowballProgram>) Class.forName(snowballStemmer);
          stream = new SnowballFilter(stream, stemmer.newInstance());
        } catch (InstantiationException | IllegalAccessException | ClassNotFoundException ex) {
          // A bad stemmer class name degrades gracefully to an unstemmed stream.
          Logger.getLogger(LUCENEWikipediaAnalyzer.class.getName()).log(Level.SEVERE, null, ex);
        }
      }
      return stream;

    } else {
      // lang is "zh" here: the three conditions above are exhaustive, so the original
      // trailing "something's wrong" branch was unreachable and has been removed.
      try {
        // For Chinese, the input needs to be cleaned, because the SentenceTokenizer
        // does not accept a token stream as in the English/other-language case.
        MediaWikiParserFactory pf = new MediaWikiParserFactory();
        MediaWikiParser parser = pf.createParser();

        StringWriter sw = new StringWriter();
        IOUtils.copy(reader, sw);

        ParsedPage p = parser.parse(sw.toString());
        reader = new StringReader(p.getText());
      } catch (IOException ex) {
        Logger.getLogger(LUCENEWikipediaAnalyzer.class.getName()).log(Level.SEVERE, null, ex);
        reader = new StringReader("");
      }

      Tokenizer tokenizer = new SentenceTokenizer(reader);
      TokenStream stream = new WordTokenFilter(tokenizer);
      stream = new PorterStemFilter(stream);
      // Guard mirrors the other branches; the original passed a possibly-null set
      // straight into StopFilter.
      if (stopWordSet != null) {
        stream = new StopFilter(Version.LUCENE_30, stream, stopWordSet);
      }

      return stream;
    }
  }