Ejemplo n.º 1
0
 /**
  * Remove any token which is not in the [:alpha:] character class. It also removes the tokens
  * with a length less than the minimum size. A minimum size of 0 implies no length restriction.
  *
  * <p>The accepted tokens are described by a regular expression of the form
  * {@code [<script>\p{Alpha}]{minimumSize,}}, which is compiled and handed to
  * {@code removePattern}. The script class is {@code \p{IsArabic}} when the language is "ar",
  * {@code \p{IsGreek}} for "el", {@code \p{IsCyrillic}} for "bg", and empty for any other
  * language. The pattern is compiled without flags, so {@code \p{Alpha}} only covers ASCII
  * letters.
  *
  * @param minimumSize Minimum size of accepted tokens; must not be negative. If it equals 0, all
  *     the tokens will be accepted regardless of their length
  */
 public void removeNonAlphabetic(int minimumSize) {
   // minimumSize is a primitive and can never be null, so only its range is validated.
   CHK.CHECK(minimumSize >= 0, "No negative values are accepted");

   String languageCode = language.toString();
   String scriptClass;
   if (languageCode.equalsIgnoreCase("ar")) {
     scriptClass = "\\p{IsArabic}";
   } else if (languageCode.equalsIgnoreCase("el")) {
     scriptClass = "\\p{IsGreek}";
   } else if (languageCode.equalsIgnoreCase("bg")) {
     scriptClass = "\\p{IsCyrillic}";
   } else {
     scriptClass = "";
   }

   // Plain concatenation instead of String.format("%d"): %d is localised with the default
   // locale and may emit non-ASCII digits, which are not valid inside a {n,} quantifier.
   String pattern = "[" + scriptClass + "\\p{Alpha}]{" + minimumSize + ",}";
   removePattern(Pattern.compile(pattern));
 }
Ejemplo n.º 2
0
  /**
   * Creates the vocabulary related to the given category. A new {@code DomainVocabulary} is
   * created for the language of the wiki, and the parsed text of every article of the category is
   * added to it through {@code DomainVocabulary.addTerms}.
   *
   * @param category The category whose articles are read. Must not be {@code null}.
   * @return The vocabulary built from the articles of the category.
   * @throws WikiApiException Propagated from the underlying Wikipedia API calls.
   */
  public DomainVocabulary createCategoryVocabulary(Category category) throws WikiApiException {
    CHK.CHECK_NOT_NULL(category);
    Locale language = new Locale(wiki.getLanguage().name());
    DomainVocabulary vocabulary = new DomainVocabulary(language);

    // Iterate over the returned collection as-is: downcasting it to HashSet would fail with a
    // ClassCastException as soon as the API hands back a different collection type.
    for (Page page : category.getArticles()) {
      String text = wiki.getParsedArticle(page.getPageId()).getText();
      vocabulary.addTerms(text);
    }

    return vocabulary;
  }
Ejemplo n.º 3
0
  /**
   * Tells whether a category belongs to the domain defined by the given vocabulary. The plain
   * title of the category is preprocessed by the vocabulary, and the category is accepted as soon
   * as one of the resulting terms is contained in the vocabulary.
   *
   * @param category The category to check. Must not be {@code null}.
   * @param vocabulary The vocabulary of the desired domain.
   * @return {@code true} if at least one term of the title is in the vocabulary. Otherwise,
   *     {@code false}.
   * @throws WikiTitleParsingException Propagated from reading the title of the category.
   */
  public boolean isDomain(Category category, DomainVocabulary vocabulary)
      throws WikiTitleParsingException {
    CHK.CHECK_NOT_NULL(category);
    String plainTitle = category.getTitle().getPlainTitle();
    Iterator<String> terms = vocabulary.preprocess(plainTitle).iterator();
    while (terms.hasNext()) {
      // Stop at the first hit; the remaining terms cannot change the answer.
      if (vocabulary.contains(terms.next())) {
        return true;
      }
    }

    return false;
  }
Ejemplo n.º 4
0
  /**
   * Sets the working language and rebuilds the language-dependent resources: the tokenizer, the
   * stopword list of the language, the English stopword list, and either the stemmer or the
   * analyzer.
   *
   * <p>The stemmer is requested from {@code StemmerFactory} exactly once. If one is available,
   * {@code isSnowball} is set to {@code true}. Otherwise {@code isSnowball} is {@code false},
   * {@code stemmer} is left {@code null} and the analyzer is loaded from
   * {@code AnalyzerFactoryLucene} instead.
   *
   * @param lan The language to work with. Must not be {@code null}.
   */
  private void setLocale(Locale lan) {
    CHK.CHECK_NOT_NULL(lan);
    language = lan;

    tokenizer = new WordDecompositionICU4J(language);
    stop = new Stopwords(language);
    stopEng = new Stopwords(Locale.ENGLISH);

    // Load the stemmer a single time and derive the flag from the result, instead of asking the
    // factory twice. A null result means the factory has no stemmer for this language.
    stemmer = StemmerFactory.loadStemmer(language);
    isSnowball = stemmer != null;
    if (!isSnowball) {
      analyzer = AnalyzerFactoryLucene.loadAnalyzer(language);
    }
  }
Ejemplo n.º 5
0
 /**
  * Sets the text to work with and tokenises it. The input is first passed through
  * {@code normalizeAndDeAposText} (presumably normalising it and replacing apostrophes with
  * spaces, as a previous step for tokenisation with only BreakIterators). The result is stored
  * in {@code this.str}, and its tokens, as produced by the tokenizer, are stored in
  * {@code tokens}.
  *
  * @param str The text to store and tokenise. Must not be {@code null}.
  */
 public void setStringTokens(String str) {
   CHK.CHECK_NOT_NULL(str);
   String normalized = normalizeAndDeAposText(str);
   this.str = normalized;
   // The tokens are always derived from the normalised text, never from the raw input.
   tokens = tokenizer.getStrings(normalized);
 }