Пример #1
0
  /**
   * Computes the fraction of tokens in {@code text} that are out of vocabulary (OOV).
   *
   * <p>The text is tokenized with {@code StandardTokenizer}, passed through
   * {@code UMDStandardFilter} and {@code LowerCaseFilter}, and post-normalized. When stopword
   * removal is enabled, tokens rejected by {@code isDiscard(false, token)} are skipped. When
   * stemming is enabled, the surviving tokens are re-tokenized and run through
   * {@code ArabicNormalizationFilter} and {@code ArabicStemFilter} before the vocabulary lookup;
   * otherwise the surface tokens are looked up directly.
   *
   * <p>A token is counted as OOV when {@code vocab} is non-null and {@code vocab.get(token)}
   * returns a value {@code <= 0}.
   *
   * @param text the raw text to analyze
   * @param vocab the vocabulary to check tokens against; if {@code null}, no token is counted as
   *     OOV and the rate is 0
   * @return the number of OOV tokens divided by the number of counted tokens, or {@code 0} when
   *     no token was counted (instead of NaN from a 0/0 division)
   */
  @Override
  public float getOOVRate(String text, VocabularyWritable vocab) {
    int countOOV = 0, countAll = 0;
    tokenizer = new StandardTokenizer(Version.LUCENE_29, new StringReader(text));
    TokenStream tokenStream = new UMDStandardFilter(tokenizer);
    tokenStream = new LowerCaseFilter(tokenStream);
    String tokenized = postNormalize(streamToString(tokenStream));

    StringBuilder finalTokenized = new StringBuilder();
    for (String token : tokenized.split(" ")) {
      // split(" ") yields empty strings for empty input or consecutive spaces. They are not
      // words; the stemming branch drops them when it re-tokenizes, so drop them here as well
      // to keep both branches consistent.
      if (token.length() == 0) {
        continue;
      }
      if (isStopwordRemoval() && isDiscard(false, token)) {
        continue;
      }
      if (!isStemming()) {
        if (vocab != null && vocab.get(token) <= 0) {
          countOOV++;
        }
        countAll++;
      } else {
        // Collect the surviving tokens; they are stemmed and counted in a second pass below.
        finalTokenized.append(token).append(' ');
      }
    }

    if (isStemming()) {
      tokenizer =
          new StandardTokenizer(
              Version.LUCENE_29, new StringReader(finalTokenized.toString().trim()));
      tokenStream = new ArabicStemFilter(new ArabicNormalizationFilter(tokenizer));
      // Read the term text through TermAttribute.term(). The attribute's toString() is a debug
      // representation in Lucene 2.9/3.0 (prefixed with "term="), which would never match a
      // vocabulary entry.
      TermAttribute termAtt = (TermAttribute) tokenStream.getAttribute(TermAttribute.class);
      tokenStream.clearAttributes();
      try {
        while (tokenStream.incrementToken()) {
          String curToken = termAtt.term();
          if (vocab != null && vocab.get(curToken) <= 0) {
            countOOV++;
          }
          countAll++;
        }
      } catch (IOException e) {
        // Best effort: report the failure and return the rate over the tokens read so far.
        e.printStackTrace();
      }
    }

    // Guard against 0 / 0.0f, which would otherwise yield NaN when no token was counted.
    if (countAll == 0) {
      return 0.0f;
    }
    return countOOV / (float) countAll;
  }
Пример #2
0
 /**
  * Stems the given text and returns the stems joined by single spaces.
  *
  * <p>The input is tokenized with {@code StandardTokenizer} and passed through
  * {@code ArabicNormalizationFilter} and {@code ArabicStemFilter}. When the {@code vocab} field
  * is non-null, stems for which {@code vocab.get(stem)} returns a value {@code <= 0} are dropped.
  *
  * @param token the text to stem
  * @return the retained stems separated by single spaces, or an empty string if none remain
  */
 @Override
 public String stem(String token) {
   tokenizer = new StandardTokenizer(Version.LUCENE_29, new StringReader(token));
   TokenStream tokenStream = new ArabicStemFilter(new ArabicNormalizationFilter(tokenizer));
   // Read the term text through TermAttribute.term(). The attribute's toString() is a debug
   // representation in Lucene 2.9/3.0 (prefixed with "term="), which would never match a
   // vocabulary entry and would also leak the prefix into the returned string.
   TermAttribute termAtt = (TermAttribute) tokenStream.getAttribute(TermAttribute.class);
   tokenStream.clearAttributes();
   StringBuilder stemmed = new StringBuilder();
   try {
     while (tokenStream.incrementToken()) {
       String curToken = termAtt.term();
       if (vocab != null && vocab.get(curToken) <= 0) {
         // Stem is not in the vocabulary: leave it out of the result.
         continue;
       }
       stemmed.append(curToken).append(' ');
     }
   } catch (IOException e) {
     // Best effort: report the failure and return the stems collected so far.
     e.printStackTrace();
   }
   // Remove the trailing separator added after the last stem.
   return stemmed.toString().trim();
 }