/**
 * Removes any token that is not made up of alphabetic characters, as well as the tokens with a
 * length below a minimum size. A minimum size of 0 implies no length restriction.
 *
 * <p>The accepted character class is {@code \p{Alpha}}, extended with the Arabic, Greek or
 * Cyrillic script when the language code of the current locale is {@code ar}, {@code el} or
 * {@code bg} respectively. The resulting pattern is handed to {@code removePattern}, which
 * carries out the actual filtering.
 *
 * @param minimumSize Minimum size of accepted tokens; must not be negative. A value of 0 disables
 *     the length restriction.
 */
public void removeNonAlphabetic(int minimumSize) {
  // A primitive int can never be null, so only the range needs to be validated.
  CHK.CHECK(minimumSize >= 0, "No negative values are accepted");

  // Compare the bare language code: Locale.toString() also carries country and variant
  // (e.g. "ar_EG"), which would never be equal to "ar".
  String code = language.getLanguage();
  String script;
  if ("ar".equalsIgnoreCase(code)) {
    script = "\\p{IsArabic}";
  } else if ("el".equalsIgnoreCase(code)) {
    script = "\\p{IsGreek}";
  } else if ("bg".equalsIgnoreCase(code)) {
    script = "\\p{IsCyrillic}";
  } else {
    script = "";
  }

  // Plain concatenation keeps the quantifier in ASCII digits; String.format("%d") localises
  // digits according to the default locale, which can yield an invalid regular expression.
  String pattern = "[" + script + "\\p{Alpha}]{" + minimumSize + ",}";
  removePattern(Pattern.compile(pattern));
}
/**
 * Creates the vocabulary related to the given category. The vocabulary is fed with the text of
 * every article of the category, so it is composed of the terms that appear in those articles
 * (and, as far as {@code DomainVocabulary} tracks them, their frequencies).
 *
 * @param category The category; must not be {@code null}.
 * @return The vocabulary related to the category.
 * @throws WikiApiException If the underlying wiki access fails.
 */
public DomainVocabulary createCategoryVocabulary(Category category) throws WikiApiException {
  CHK.CHECK_NOT_NULL(category);

  // NOTE(review): the locale is built from wiki.getLanguage().name(); confirm that this name is
  // a language code that Locale understands.
  Locale language = new Locale(wiki.getLanguage().name());
  DomainVocabulary vocabulary = new DomainVocabulary(language);

  // Iterate over whatever collection the category returns; downcasting it to a concrete HashSet
  // would fail with a ClassCastException for any other implementation.
  for (Page page : category.getArticles()) {
    String text = wiki.getParsedArticle(page.getPageId()).getText();
    vocabulary.addTerms(text);
  }
  return vocabulary;
}
/**
 * Tells whether a category belongs to the domain described by a vocabulary. The category title is
 * preprocessed by the vocabulary itself, and the category is considered part of the domain as
 * soon as one of the resulting terms is contained in the vocabulary.
 *
 * @param category The category to check; must not be {@code null}.
 * @param vocabulary The vocabulary of the desired domain.
 * @return {@code true} if at least one term of the title is in the vocabulary. Otherwise,
 *     {@code false}.
 * @throws WikiTitleParsingException If the title of the category cannot be obtained.
 */
public boolean isDomain(Category category, DomainVocabulary vocabulary)
    throws WikiTitleParsingException {
  CHK.CHECK_NOT_NULL(category);

  String plainTitle = category.getTitle().getPlainTitle();
  Iterator<String> terms = vocabulary.preprocess(plainTitle).iterator();
  while (terms.hasNext()) {
    // The first known term settles the answer; the remaining ones are not inspected.
    if (vocabulary.contains(terms.next())) {
      return true;
    }
  }
  return false;
}
private void setLocale(Locale lan) { CHK.CHECK_NOT_NULL(lan); language = lan; // String langS = language.getLanguage(); // sent_detector = new SentencesOpennlp(Locale.ENGLISH); tokenizer = new WordDecompositionICU4J(language); stop = new Stopwords(language); stopEng = new Stopwords(Locale.ENGLISH); if (StemmerFactory.loadStemmer(language) != null) { isSnowball = true; stemmer = StemmerFactory.loadStemmer(language); } else { // if (langS.equalsIgnoreCase("ar") || // langS.equalsIgnoreCase("el")) { isSnowball = false; analyzer = AnalyzerFactoryLucene.loadAnalyzer(language); // } else{ } }
/**
 * Sets the text to work with. The text is first passed through
 * {@code normalizeAndDeAposText} (which, going by its name, normalises it and takes care of the
 * apostrophes before tokenisation); the outcome is stored and then split into tokens by the
 * current tokenizer.
 *
 * @param str The text to store and tokenize; must not be {@code null}.
 */
public void setStringTokens(String str) {
  CHK.CHECK_NOT_NULL(str);

  String normalized = normalizeAndDeAposText(str);
  this.str = normalized;
  this.tokens = tokenizer.getStrings(normalized);
}