public static void main(String[] args) throws WikiApiException { // db connection settings DatabaseConfiguration dbConfig = new DatabaseConfiguration(); dbConfig.setDatabase("DATABASE"); dbConfig.setHost("HOST"); dbConfig.setUser("USER"); dbConfig.setPassword("PASSWORD"); dbConfig.setLanguage(Language.english); // initialize a wiki Wikipedia wiki = new Wikipedia(dbConfig); // get the page 'Dog' Page p = wiki.getPage("Dog"); // get a ParsedPage object MediaWikiParserFactory pf = new MediaWikiParserFactory(); pf.setTemplateParserClass(FlushTemplates.class); // Filtering TEMPLATE-Elements String IMAGE = "Image"; // Replace it with the image template name in your Wiki language edition, // e.g. "Image" in English // filtering Image-Elements pf.getImageIdentifers().add(IMAGE); // parse page text MediaWikiParser parser = pf.createParser(); ParsedPage pp = parser.parse(p.getText()); System.out.println(pp.getText()); }
@Override
public void writeRevision(final Revision rev) throws IOException {
  // Parse the revision's wiki markup; a null result means the markup was unparseable.
  final ParsedPage pp = parser.parse(rev.Text);
  if (pp == null) {
    LOGGER.warn("Could not parse page with title {}", pageTitle);
  } else if (pp.getSections() != null) {
    // Declination titles extracted from the page-level templates; they are treated
    // as nouns regardless of the per-section part-of-speech check below.
    final Set<String> declinations = getDeclinations(pp.getTemplates());
    if (!declinations.isEmpty()) {
      nounTitles.addAll(declinations);
    }
    for (final Section section : pp.getSections()) {
      final List<Template> partOfSpeechTemplates = getPartOfSpeechTemplates(section);
      if (!partOfSpeechTemplates.isEmpty()) {
        for (final Template template : partOfSpeechTemplates) {
          // The first template parameter carries the part-of-speech value.
          if (isNoun.f(getFirstParameter.f(template))) {
            nounTitles.add(pageTitle);
            if (declinations.isEmpty() && LOGGER.isDebugEnabled()) {
              LOGGER.debug("Found no declinations for page {}", pageTitle);
            }
          }
        }
        // Deliberate early exit: only the first section that carries any
        // part-of-speech templates is evaluated; later sections are ignored.
        return;
      }
    }
    // Reached only when no section had part-of-speech templates. "Substantiv" is the
    // German part-of-speech label for nouns, so its presence hints at a missed page.
    if (LOGGER.isDebugEnabled() && rev.Text.contains("Substantiv")) {
      LOGGER.debug(
          "No part-of-speech found for {} (which indeed contains 'Substantiv')", pageTitle);
    }
  }
}
public static void main(String[] args) { // load a sample document (the contents are equal to "DarmstadtWikipediaArticle.txt") String documentText = TestFile.getFileText(); // get a ParsedPage object MediaWikiParserFactory pf = new MediaWikiParserFactory(); MediaWikiParser parser = pf.createParser(); ParsedPage pp = parser.parse(documentText); // Link Context (return 1 token left, 2 token right of the link) for (Link link : pp.getLinks()) { System.out.println( link.getContext(1, 0) + "<" + link.getText().toString().toUpperCase() + ">" + link.getContext(0, 2)); } }
/** * Produces TokenStream instance for tokenizing input text. First, a language is determined, * because a special treatment needs to be taken for Chinese. Then, the individual filters * (length, stemming, stopword removal) are hooked up and the corresponding TokenStream instance * is returned. * * @param fieldName * @param reader * @return */ @Override public TokenStream tokenStream(String fieldName, Reader reader) { if (snowballStemmer.equals( "*porter")) { // if you want to use porter stemmer instead of snowball (orig. wikiprep-esa) Tokenizer tokenizer = new WikipediaTokenizer(reader); TokenStream stream = new StandardFilter(Version.LUCENE_30, tokenizer); stream = new LowerCaseFilter(Version.LUCENE_30, stream); if (stopWordSet != null) { stream = new StopFilter(Version.LUCENE_30, stream, stopWordSet); } stream = new PorterStemFilter(stream); stream = new PorterStemFilter(stream); stream = new PorterStemFilter(stream); return stream; } else if (lang == null || !lang.equals("zh")) { Tokenizer tokenizer = new WikipediaTokenizer(reader); TokenStream stream = new StandardFilter(Version.LUCENE_30, tokenizer); // cstream = new LengthFilter(true, stream, 3, 100); stream = new LowerCaseFilter(Version.LUCENE_30, stream); // stopword filter if (stopWordSet != null) { stream = new StopFilter(Version.LUCENE_30, stream, stopWordSet); } // if stemmer is defined, add stemming filter if (snowballStemmer != null) { try { Class<SnowballProgram> stemmer = (Class<SnowballProgram>) Class.forName(snowballStemmer); stream = new SnowballFilter(stream, stemmer.newInstance()); } catch (InstantiationException ex) { Logger.getLogger(LUCENEWikipediaAnalyzer.class.getName()).log(Level.SEVERE, null, ex); } catch (IllegalAccessException ex) { Logger.getLogger(LUCENEWikipediaAnalyzer.class.getName()).log(Level.SEVERE, null, ex); } catch (ClassNotFoundException ex) { Logger.getLogger(LUCENEWikipediaAnalyzer.class.getName()).log(Level.SEVERE, null, ex); } } return stream; } else if 
(lang.equals("zh")) { try { // For chinese, the input needs to be cleaned, because // the SentenceTokenizer does not accept token stream // as in case of English/other languages. MediaWikiParserFactory pf = new MediaWikiParserFactory(); MediaWikiParser parser = pf.createParser(); StringWriter sw = new StringWriter(); IOUtils.copy(reader, sw); ParsedPage p = parser.parse(sw.toString()); reader = new StringReader(p.getText()); } catch (IOException ex) { Logger.getLogger(LUCENEWikipediaAnalyzer.class.getName()).log(Level.SEVERE, null, ex); reader = new StringReader(""); } Tokenizer tokenizer = new SentenceTokenizer(reader); TokenStream stream = new WordTokenFilter(tokenizer); stream = new PorterStemFilter(stream); stream = new StopFilter(Version.LUCENE_30, stream, stopWordSet); return stream; } else { // if it gets here, something's wrong with the language selection IFs return null; } }