/** * @param args * @throws IOException * @throws FileNotFoundException */ public static void main(String[] args) throws IOException { FileUtils.makeDirectory(ConfigConstant.TEMP_VOCABULARY_OUTPUT_DIR); // read one by one line from the review tips location and extract the // words Set<String> vocubalary = new HashSet<>(); for (String fileNames : FileUtils.getAllFiles(ConfigConstant.TEMP_REVIEW_TIPS_ML_TOPIC_DATA_OUT_LOCATIONS)) { updateVocabulary( FileUtils.getFullPath( ConfigConstant.TEMP_REVIEW_TIPS_ML_TOPIC_DATA_OUT_LOCATIONS, fileNames), vocubalary); } // remove all words that appeared very less often vocabDist.forEach( (word, value) -> { if (value <= DEFAULT_WORD_COUNT_ALLOWED) { vocubalary.remove(word); } }); vocabDist.clear(); // write the remaining files writeToFile( new TreeSet<String>(vocubalary), ConfigConstant.TEMP_RAW_ORIGINAL_VOCABULRAY_TEXT_LOCATIONS); }