/**
 * Adds a single token to the vocabulary and updates its frequency count, skipping
 * tokens that exceed MAX_STRING_LENGTH, or that match START_WITH_NONALPHA_CHAR
 * while being all-ASCII or at most one character long.
 */
private static void filteredUpdate(Set<String> vocabulary, String word) {
    if (word.trim().length() > MAX_STRING_LENGTH
            || ((LuceneNLPUtil.isAllASCII(word) || word.trim().length() <= 1)
                    && word.matches(START_WITH_NONALPHA_CHAR))) {
        return;
    }
    vocabulary.add(word);
    vocabDist.put(word, vocabDist.getOrDefault(word, 0) + 1);
}
/**
 * Reads the given template file line by line, tokenizes each line, removes English
 * stop words, stems the remaining tokens, and adds the survivors to the vocabulary
 * via {@link #filteredUpdate(Set, String)}.
 */
private static void updateVocabulary(String fileNames, Set<String> vocabulary) throws IOException {
    try (BufferedReader rdr = new BufferedReader(new FileReader(fileNames))) {
        String line;
        while ((line = rdr.readLine()) != null) {
            LuceneNLPUtil.getRemovedStopAndStem(
                            String.join(
                                    ConfigConstant.SPACE,
                                    TopicModelTemplateUtil.getWordsFromTemplate(line)),
                            LuceneNLPUtil.getDefaultEnglishStopWordList())
                    .forEach(e -> filteredUpdate(vocabulary, e));
        }
    }
    System.out.println("completed vocabulary extractions::" + fileNames);
}
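// A minimal usage sketch, not part of the original class: it shows how the two
// helpers above might be driven over a collection of template files. The method
// name buildVocabularyFromTemplates and the List<String> of file paths are
// assumptions for illustration; it presumes java.util.HashSet, java.util.List and
// java.util.Set are imported alongside the I/O classes already used above.
private static Set<String> buildVocabularyFromTemplates(List<String> templateFiles)
        throws IOException {
    Set<String> vocabulary = new HashSet<>();
    for (String templateFile : templateFiles) {
        // Each file contributes its stemmed, stop-word-filtered tokens.
        updateVocabulary(templateFile, vocabulary);
    }
    return vocabulary;
}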