public static void main(String[] args) throws WikiApiException {
    // db connection settings
    DatabaseConfiguration dbConfig = new DatabaseConfiguration();
    dbConfig.setDatabase("DATABASE");
    dbConfig.setHost("HOST");
    dbConfig.setUser("USER");
    dbConfig.setPassword("PASSWORD");
    dbConfig.setLanguage(Language.english);

    // initialize a wiki
    Wikipedia wiki = new Wikipedia(dbConfig);

    // get the page 'Dog'
    Page p = wiki.getPage("Dog");

    // get a ParsedPage object
    MediaWikiParserFactory pf = new MediaWikiParserFactory();
    pf.setTemplateParserClass(FlushTemplates.class); // filtering TEMPLATE-Elements

    // Replace it with the image template name in your Wiki language edition,
    // e.g. "Image" in English
    String IMAGE = "Image";

    // filtering Image-Elements
    pf.getImageIdentifers().add(IMAGE);

    // parse page text
    MediaWikiParser parser = pf.createParser();
    ParsedPage pp = parser.parse(p.getText());

    System.out.println(pp.getText());
}
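The example above only prints the plain text of the parsed page. As a rough follow-up sketch, the parsed structure could also be inspected section by section; this reuses the ParsedPage built above and assumes the getSections(), getTitle(), and getText() accessors of the JWPL parser API:

for (Section section : pp.getSections()) {
    // the title may be null for the leading section before the first heading
    System.out.println("Section: " + section.getTitle());
    System.out.println("  characters of text: " + section.getText().length());
}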
public static void main(String[] args) {
    // load a sample document (the contents are equal to "DarmstadtWikipediaArticle.txt")
    String documentText = TestFile.getFileText();

    // get a ParsedPage object
    MediaWikiParserFactory pf = new MediaWikiParserFactory();
    MediaWikiParser parser = pf.createParser();
    ParsedPage pp = parser.parse(documentText);

    // link context (1 token to the left, 2 tokens to the right of the link)
    for (Link link : pp.getLinks()) {
        System.out.println(link.getContext(1, 0)
                + "<" + link.getText().toUpperCase() + ">"
                + link.getContext(0, 2));
    }
}
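getContext(left, right) returns the requested number of tokens on each side of the link, so the loop above prints one token of left context and two tokens of right context. A small variant using the same Link methods as above (the window size of three is chosen arbitrarily) prints a symmetric context window around each link:

for (Link link : pp.getLinks()) {
    String left = link.getContext(3, 0);  // three tokens before the link
    String right = link.getContext(0, 3); // three tokens after the link
    System.out.println(left + " [" + link.getText() + "] " + right);
}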
/**
 * Create a new instance with a handle to the (unpacked) XML pages/articles dump.
 *
 * @param wiktionaryDump the unpacked XML dump.
 * @throws FileNotFoundException if the provided wiktionaryDump does not exist.
 */
public WiktionaryLoader(final File wiktionaryDump) throws FileNotFoundException {
    // create a MediaWiki parser; the collector uses it to parse page content
    final MediaWikiParserFactory pf = new MediaWikiParserFactory();
    final MediaWikiParser parser = pf.createParser();

    final FileInputStream fis = new FileInputStream(wiktionaryDump);
    final PageTitleNounCollector pageTitleNounCollector = new PageTitleNounCollector(parser);
    final XmlDumpReader dumpReader = new XmlDumpReader(
            fis, new NamespaceFilter(new LatestFilter(pageTitleNounCollector), "NS_MAIN"));
    try {
        dumpReader.readDump();
        nounTitles = pageTitleNounCollector.getNounTitles();
        LOGGER.info("Loaded {} nouns.", nounTitles.size());
    } catch (final IOException e) {
        LOGGER.error("An error occurred while trying to read the dump.", e);
        throw new RuntimeException(e);
    } finally {
        Closeables.closeQuietly(fis);
    }
}
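A hedged usage sketch of this constructor: the dump file name below is purely illustrative, and nothing beyond the constructor signature and its FileNotFoundException contract shown above is assumed.

try {
    // reading the dump happens eagerly in the constructor; afterwards the
    // collected noun titles are available in the nounTitles field
    WiktionaryLoader loader = new WiktionaryLoader(
            new File("enwiktionary-pages-articles.xml")); // illustrative path
} catch (FileNotFoundException e) {
    // the given dump path does not exist
    e.printStackTrace();
}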
/**
 * Produces a TokenStream instance for tokenizing input text. First, the language is determined,
 * because Chinese needs special treatment. Then the individual filters (length, stemming,
 * stopword removal) are hooked up and the corresponding TokenStream instance is returned.
 *
 * @param fieldName the name of the field the tokens are produced for
 * @param reader the reader providing the input text
 * @return the configured TokenStream
 */
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
    if (snowballStemmer.equals("*porter")) {
        // use the Porter stemmer instead of Snowball (as in the original wikiprep-esa)
        Tokenizer tokenizer = new WikipediaTokenizer(reader);
        TokenStream stream = new StandardFilter(Version.LUCENE_30, tokenizer);
        stream = new LowerCaseFilter(Version.LUCENE_30, stream);
        if (stopWordSet != null) {
            stream = new StopFilter(Version.LUCENE_30, stream, stopWordSet);
        }
        stream = new PorterStemFilter(stream);
        return stream;
    } else if (lang == null || !lang.equals("zh")) {
        Tokenizer tokenizer = new WikipediaTokenizer(reader);
        TokenStream stream = new StandardFilter(Version.LUCENE_30, tokenizer);
        // stream = new LengthFilter(true, stream, 3, 100);
        stream = new LowerCaseFilter(Version.LUCENE_30, stream);

        // stopword filter
        if (stopWordSet != null) {
            stream = new StopFilter(Version.LUCENE_30, stream, stopWordSet);
        }

        // if a stemmer is defined, add a stemming filter
        if (snowballStemmer != null) {
            try {
                Class<SnowballProgram> stemmer =
                        (Class<SnowballProgram>) Class.forName(snowballStemmer);
                stream = new SnowballFilter(stream, stemmer.newInstance());
            } catch (InstantiationException ex) {
                Logger.getLogger(LUCENEWikipediaAnalyzer.class.getName()).log(Level.SEVERE, null, ex);
            } catch (IllegalAccessException ex) {
                Logger.getLogger(LUCENEWikipediaAnalyzer.class.getName()).log(Level.SEVERE, null, ex);
            } catch (ClassNotFoundException ex) {
                Logger.getLogger(LUCENEWikipediaAnalyzer.class.getName()).log(Level.SEVERE, null, ex);
            }
        }
        return stream;
    } else if (lang.equals("zh")) {
        try {
            // For Chinese, the input needs to be cleaned first, because the
            // SentenceTokenizer does not accept a token stream as in the case of
            // English and other languages.
            MediaWikiParserFactory pf = new MediaWikiParserFactory();
            MediaWikiParser parser = pf.createParser();
            StringWriter sw = new StringWriter();
            IOUtils.copy(reader, sw);
            ParsedPage p = parser.parse(sw.toString());
            reader = new StringReader(p.getText());
        } catch (IOException ex) {
            Logger.getLogger(LUCENEWikipediaAnalyzer.class.getName()).log(Level.SEVERE, null, ex);
            reader = new StringReader("");
        }
        Tokenizer tokenizer = new SentenceTokenizer(reader);
        TokenStream stream = new WordTokenFilter(tokenizer);
        stream = new PorterStemFilter(stream);
        stream = new StopFilter(Version.LUCENE_30, stream, stopWordSet);
        return stream;
    } else {
        // should never be reached: the branches above cover all language cases
        return null;
    }
}
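A hedged sketch of how the returned stream could be consumed with the Lucene 3.0 attribute API (TermAttribute was still current in that version). How the analyzer itself is constructed is not shown in this class, so it is passed in as a parameter, and the field name "contents" is only illustrative:

static void printTokens(Analyzer analyzer, String text) throws IOException {
    // obtain the analysis chain built in tokenStream() above
    TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
    TermAttribute term = stream.addAttribute(TermAttribute.class);
    while (stream.incrementToken()) {
        System.out.println(term.term()); // one stemmed, lowercased token per line
    }
    stream.close();
}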