@Override
public void writeRevision(final Revision rev) throws IOException {
    final ParsedPage pp = parser.parse(rev.Text);
    if (pp == null) {
        LOGGER.warn("Could not parse page with title {}", pageTitle);
    }
    else if (pp.getSections() != null) {
        final Set<String> declinations = getDeclinations(pp.getTemplates());
        if (!declinations.isEmpty()) {
            nounTitles.addAll(declinations);
        }
        for (final Section section : pp.getSections()) {
            final List<Template> partOfSpeechTemplates = getPartOfSpeechTemplates(section);
            if (!partOfSpeechTemplates.isEmpty()) {
                for (final Template template : partOfSpeechTemplates) {
                    if (isNoun.f(getFirstParameter.f(template))) {
                        nounTitles.add(pageTitle);
                        if (declinations.isEmpty() && LOGGER.isDebugEnabled()) {
                            LOGGER.debug("Found no declinations for page {}", pageTitle);
                        }
                    }
                }
                return;
            }
        }
        if (LOGGER.isDebugEnabled() && rev.Text.contains("Substantiv")) {
            LOGGER.debug(
                    "No part-of-speech found for {} (which indeed contains 'Substantiv')",
                    pageTitle);
        }
    }
}
public static void main(String[] args) throws WikiApiException {
    // db connection settings
    DatabaseConfiguration dbConfig = new DatabaseConfiguration();
    dbConfig.setDatabase("DATABASE");
    dbConfig.setHost("HOST");
    dbConfig.setUser("USER");
    dbConfig.setPassword("PASSWORD");
    dbConfig.setLanguage(Language.english);

    // initialize a wiki
    Wikipedia wiki = new Wikipedia(dbConfig);

    // get the page 'Dog'
    Page p = wiki.getPage("Dog");

    // get a ParsedPage object
    MediaWikiParserFactory pf = new MediaWikiParserFactory();

    // filtering TEMPLATE elements
    pf.setTemplateParserClass(FlushTemplates.class);

    // filtering image elements; replace "Image" with the image template name
    // used in your Wiki language edition (e.g. "Image" in English)
    String IMAGE = "Image";
    pf.getImageIdentifers().add(IMAGE);

    // parse page text
    MediaWikiParser parser = pf.createParser();
    ParsedPage pp = parser.parse(p.getText());

    System.out.println(pp.getText());
}
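// A minimal standalone sketch of the same parser setup without a database connection:
// it parses a raw markup string directly instead of fetching a Page from a Wikipedia
// instance. Only classes already used above are assumed; the import paths follow the
// de.tudarmstadt.ukp.wikipedia packages of JWPL and may differ in other versions, and
// the markup string is made up for illustration.
import de.tudarmstadt.ukp.wikipedia.parser.ParsedPage;
import de.tudarmstadt.ukp.wikipedia.parser.mediawiki.FlushTemplates;
import de.tudarmstadt.ukp.wikipedia.parser.mediawiki.MediaWikiParser;
import de.tudarmstadt.ukp.wikipedia.parser.mediawiki.MediaWikiParserFactory;

public class ParseMarkupSketch {

    public static void main(String[] args) {
        String markup = "{{Infobox dog}} The '''dog''' is a domesticated descendant of the [[wolf]].";

        MediaWikiParserFactory pf = new MediaWikiParserFactory();
        pf.setTemplateParserClass(FlushTemplates.class); // drop template content
        pf.getImageIdentifers().add("Image");            // treat "Image" links as images

        MediaWikiParser parser = pf.createParser();
        ParsedPage pp = parser.parse(markup);

        // plain text with templates flushed
        System.out.println(pp.getText());
    }
}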
// TODO Use SWEBLE
@Override
protected String getPlainDocumentText(Page page) {
    ParsedPage pp = parser.parse(page.getText());
    if (pp != null) {
        return pp.getText();
    }
    else {
        return "";
    }
}
public static void main(String[] args) {
    // load a sample document (the contents are equal to "DarmstadtWikipediaArticle.txt")
    String documentText = TestFile.getFileText();

    // get a ParsedPage object
    MediaWikiParserFactory pf = new MediaWikiParserFactory();
    MediaWikiParser parser = pf.createParser();
    ParsedPage pp = parser.parse(documentText);

    // link context (print 1 token left and 2 tokens right of each link)
    for (Link link : pp.getLinks()) {
        System.out.println(
                link.getContext(1, 0) + "<" + link.getText().toString().toUpperCase() + ">"
                        + link.getContext(0, 2));
    }
}
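// A small follow-up sketch: besides the surrounding context, each Link also knows the
// page it points to. getTarget() and getType() are assumed to exist on the JWPL Link
// class here; verify them against the parser version in use. Import paths follow the
// de.tudarmstadt.ukp.wikipedia packages and may differ.
import de.tudarmstadt.ukp.wikipedia.parser.Link;
import de.tudarmstadt.ukp.wikipedia.parser.ParsedPage;

class LinkTargetSketch {

    // print target page, link type, and anchor text for every link in the parsed page
    static void printLinkTargets(ParsedPage pp) {
        for (Link link : pp.getLinks()) {
            System.out.println(link.getTarget() + " (" + link.getType() + "): " + link.getText());
        }
    }
}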
// TODO Use SWEBLE
private String getText(Revision rev) {
    String text = rev.getRevisionText();
    if (outputPlainText) {
        text = StringEscapeUtils.unescapeHtml(text);

        ParsedPage pp = parser.parse(text);
        if (pp == null) {
            return "";
        }
        text = pp.getText();
        // text = WikiUtils.mediaWikiMarkup2PlainText(text);

        // replace multiple white space with single white space
        text = WikiUtils.cleanText(text);
    }
    return text;
}
public int countWords(final ParsedPage parsedPage) {
    long start = System.currentTimeMillis();
    if (null == parsedPage) {
        throw new IllegalStateException("parsedPage must not be null");
    }
    String text = parsedPage.getText();
    fixedDelay(30);
    int wordCount = new StringTokenizer(text, " ").countTokens();
    System.out.println(String.format("%scountWords: count=%s runtime=%sms",
            getThreadId(), wordCount, System.currentTimeMillis() - start));
    return wordCount;
}
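// The thread id and runtime logging above suggest that countWords() is meant to be
// called concurrently. A hypothetical driver could look like the sketch below; the
// enclosing class name "WordCounter" and its no-arg constructor are assumptions, and
// the parser setup and import paths follow the earlier examples.
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

import de.tudarmstadt.ukp.wikipedia.parser.ParsedPage;
import de.tudarmstadt.ukp.wikipedia.parser.mediawiki.MediaWikiParser;
import de.tudarmstadt.ukp.wikipedia.parser.mediawiki.MediaWikiParserFactory;

public class CountWordsDemo {

    public static void main(String[] args) throws InterruptedException {
        MediaWikiParser parser = new MediaWikiParserFactory().createParser();
        ParsedPage pp = parser.parse("Darmstadt is a city in the state of Hesse, Germany.");

        WordCounter counter = new WordCounter(); // hypothetical enclosing class of countWords()
        ExecutorService pool = Executors.newFixedThreadPool(4);
        for (int i = 0; i < 4; i++) {
            pool.submit(() -> counter.countWords(pp));
        }
        pool.shutdown();
        pool.awaitTermination(1, TimeUnit.MINUTES);
    }
}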
/**
 * Returns the information of a ParsedPage that is selected by the current configuration.
 */
public String getSelectedText(ParsedPage pp) {
    if (pp == null) {
        return null;
    }

    StringBuilder sb = new StringBuilder();
    levelModifier = pp.getSection(0).getLevel() - 1;

    if (pageHandling == null) {
        if (firstParagraphHandling != null) {
            handleContent(pp.getFirstParagraph(), firstParagraphHandling, sb);
            deleteParagraph(pp.getFirstParagraphNr(), pp.getSections());
        }
        for (Section s : pp.getSections()) {
            handleSection(s, sb);
        }
    }
    else {
        if (pageHandling.get(CIT.TEXT)) {
            sb.append(pp.getText());
        }
        else {
            if (pageHandling.get(CIT.BOLD)) {
                handleSpans(pp.getFormatSpans(FormatType.BOLD), pp.getText(), sb);
            }
            if (pageHandling.get(CIT.ITALIC)) {
                handleSpans(pp.getFormatSpans(FormatType.ITALIC), pp.getText(), sb);
            }
        }
        if (pageHandling.get(CIT.LINK)) {
            handleLinks(pp.getLinks(), !pageHandling.get(CIT.TEXT), sb);
        }
    }

    return sb.toString().trim();
}
/**
 * Produces a TokenStream instance for tokenizing the input text. First, the language is
 * checked, because Chinese needs special treatment. Then the individual filters
 * (lower-casing, stopword removal, stemming) are hooked up and the resulting TokenStream
 * instance is returned.
 *
 * @param fieldName the field the text belongs to
 * @param reader the reader providing the input text
 * @return the configured TokenStream
 */
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
    if (snowballStemmer.equals("*porter")) {
        // use the Porter stemmer instead of Snowball (as in the original wikiprep-esa)
        Tokenizer tokenizer = new WikipediaTokenizer(reader);
        TokenStream stream = new StandardFilter(Version.LUCENE_30, tokenizer);
        stream = new LowerCaseFilter(Version.LUCENE_30, stream);
        if (stopWordSet != null) {
            stream = new StopFilter(Version.LUCENE_30, stream, stopWordSet);
        }
        stream = new PorterStemFilter(stream);
        return stream;
    }
    else if (lang == null || !lang.equals("zh")) {
        Tokenizer tokenizer = new WikipediaTokenizer(reader);
        TokenStream stream = new StandardFilter(Version.LUCENE_30, tokenizer);
        // stream = new LengthFilter(true, stream, 3, 100);
        stream = new LowerCaseFilter(Version.LUCENE_30, stream);

        // stopword filter
        if (stopWordSet != null) {
            stream = new StopFilter(Version.LUCENE_30, stream, stopWordSet);
        }

        // if a stemmer is defined, add a stemming filter
        if (snowballStemmer != null) {
            try {
                Class<SnowballProgram> stemmer =
                        (Class<SnowballProgram>) Class.forName(snowballStemmer);
                stream = new SnowballFilter(stream, stemmer.newInstance());
            }
            catch (InstantiationException ex) {
                Logger.getLogger(LUCENEWikipediaAnalyzer.class.getName()).log(Level.SEVERE, null, ex);
            }
            catch (IllegalAccessException ex) {
                Logger.getLogger(LUCENEWikipediaAnalyzer.class.getName()).log(Level.SEVERE, null, ex);
            }
            catch (ClassNotFoundException ex) {
                Logger.getLogger(LUCENEWikipediaAnalyzer.class.getName()).log(Level.SEVERE, null, ex);
            }
        }
        return stream;
    }
    else if (lang.equals("zh")) {
        try {
            // For Chinese, the input needs to be cleaned first, because the
            // SentenceTokenizer does not accept a token stream as in the case of
            // English and other languages.
            MediaWikiParserFactory pf = new MediaWikiParserFactory();
            MediaWikiParser parser = pf.createParser();
            StringWriter sw = new StringWriter();
            IOUtils.copy(reader, sw);
            ParsedPage p = parser.parse(sw.toString());
            reader = new StringReader(p.getText());
        }
        catch (IOException ex) {
            Logger.getLogger(LUCENEWikipediaAnalyzer.class.getName()).log(Level.SEVERE, null, ex);
            reader = new StringReader("");
        }
        Tokenizer tokenizer = new SentenceTokenizer(reader);
        TokenStream stream = new WordTokenFilter(tokenizer);
        stream = new PorterStemFilter(stream);
        stream = new StopFilter(Version.LUCENE_30, stream, stopWordSet);
        return stream;
    }
    else {
        // if it gets here, something is wrong with the language selection above
        return null;
    }
}
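// A minimal consumption sketch for the analyzer above, assuming an already constructed
// Analyzer instance (its constructor is not shown here, so one is passed in rather than
// instantiated). The TermAttribute API matches the Lucene 3.0 version used by the
// filters in tokenStream(); the field name "contents" is arbitrary.
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

class AnalyzerUsageSketch {

    // print one token per line as produced by the analyzer's filter chain
    static void printTokens(Analyzer analyzer, String text) throws IOException {
        TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
        TermAttribute term = stream.addAttribute(TermAttribute.class);
        while (stream.incrementToken()) {
            System.out.println(term.term());
        }
        stream.close();
    }
}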