// Requires (Lucene 2.9-era APIs): org.apache.lucene.analysis.TokenStream,
// org.apache.lucene.analysis.LowerCaseFilter,
// org.apache.lucene.analysis.standard.StandardTokenizer,
// org.apache.lucene.analysis.tokenattributes.TermAttribute,
// org.apache.lucene.analysis.ar.ArabicNormalizationFilter,
// org.apache.lucene.analysis.ar.ArabicStemFilter, org.apache.lucene.util.Version,
// java.io.IOException, java.io.StringReader.
// UMDStandardFilter and VocabularyWritable are project-specific classes.
@Override
public float getOOVRate(String text, VocabularyWritable vocab) {
  int countOOV = 0, countAll = 0;

  // Tokenize, apply the project-specific UMDStandardFilter, and lowercase.
  tokenizer = new StandardTokenizer(Version.LUCENE_29, new StringReader(text));
  TokenStream tokenStream = new UMDStandardFilter(tokenizer);
  tokenStream = new LowerCaseFilter(tokenStream);
  String tokenized = postNormalize(streamToString(tokenStream));

  StringBuilder finalTokenized = new StringBuilder();
  for (String token : tokenized.split(" ")) {
    if (isStopwordRemoval() && isDiscard(false, token)) {
      continue;
    }
    if (!isStemming()) {
      // No stemming: check each surface token against the vocabulary directly.
      if (vocab != null && vocab.get(token) <= 0) {
        countOOV++;
      }
      countAll++;
    } else {
      // Stemming enabled: buffer surviving tokens for a second analysis pass.
      finalTokenized.append(token).append(' ');
    }
  }

  if (isStemming()) {
    // Second pass: Arabic-normalize and stem, then count OOV stems.
    tokenizer = new StandardTokenizer(
        Version.LUCENE_29, new StringReader(finalTokenized.toString().trim()));
    tokenStream = new ArabicStemFilter(new ArabicNormalizationFilter(tokenizer));
    // TermAttribute.term() returns the token text; the generic Attribute's
    // toString() would not.
    TermAttribute termAtt = tokenStream.getAttribute(TermAttribute.class);
    tokenStream.clearAttributes();
    try {
      while (tokenStream.incrementToken()) {
        String curToken = termAtt.term();
        if (vocab != null && vocab.get(curToken) <= 0) {
          countOOV++;
        }
        countAll++;
      }
    } catch (IOException e) {
      e.printStackTrace();
    }
  }

  // Guard against division by zero when the text yields no countable tokens.
  return countAll == 0 ? 0f : countOOV / (float) countAll;
}
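// streamToString() and postNormalize() are helpers defined elsewhere in this
// class. A minimal sketch of what streamToString() plausibly does, assuming
// the same Lucene 2.9 TermAttribute API used above (an illustration, not the
// project's actual code):
//
//   private static String streamToString(TokenStream stream) {
//     TermAttribute termAtt = stream.getAttribute(TermAttribute.class);
//     StringBuilder sb = new StringBuilder();
//     try {
//       while (stream.incrementToken()) {
//         sb.append(termAtt.term()).append(' ');
//       }
//     } catch (IOException e) {
//       e.printStackTrace();
//     }
//     return sb.toString().trim();
//   }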
@Override
public String stem(String token) {
  // Run the input through Lucene's Arabic normalization and stemming chain.
  tokenizer = new StandardTokenizer(Version.LUCENE_29, new StringReader(token));
  TokenStream tokenStream = new ArabicStemFilter(new ArabicNormalizationFilter(tokenizer));
  TermAttribute termAtt = tokenStream.getAttribute(TermAttribute.class);
  tokenStream.clearAttributes();

  StringBuilder stemmed = new StringBuilder();
  try {
    while (tokenStream.incrementToken()) {
      String curToken = termAtt.term();
      // Drop stems that are out of vocabulary (vocab is a field of this class).
      if (vocab != null && vocab.get(curToken) <= 0) {
        continue;
      }
      stemmed.append(curToken).append(' ');
    }
  } catch (IOException e) {
    e.printStackTrace();
  }
  return stemmed.toString().trim();
}
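// Hypothetical usage sketch: the enclosing class name, its construction, and
// its configuration calls are assumptions, since the fragment above only shows
// two methods of a larger tokenizer class. In practice the instance would be
// configured (stemming, stopword removal, vocabulary) before these calls:
//
//   ArabicTokenizer analyzer = ...;                    // this class, configured elsewhere
//   String stems = analyzer.stem(arabicText);          // space-delimited in-vocab stems
//   float oovRate = analyzer.getOOVRate(text, vocab);  // fraction of OOV tokens in [0, 1]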