/** * not used any more remove stop words * * @param words * @return * @throws IOException */ public static List<String> stopRemover(List<String> words) throws IOException { List<String> stopwords = Arrays.asList(StopWords.EnglishStopWords()); List<String> filterwords = new ArrayList<String>(); for (String w : words) { String w1 = w.substring(0, w.lastIndexOf("/")); String w2 = w1.replaceAll("[^a-zA-Z]", " ").trim(); if (!w2.equals("") && !stopwords.contains(w1)) { // check if a word is in stop list // or a white space filterwords.add(w); } } return filterwords; }
/** * replacing noun phrases as unique tokens removing stopwords, non-letter tokens from the stemmed * corpus * * @param termFile * @param stemmedFile * @param term_corpusFile * @throws IOException */ public static void replacePhrase(String termFile, String stemmedFile, String term_corpusFile) throws IOException { // HashMap<String,String> terms = new HashMap<String,String>(); List<String> terms1 = new ArrayList<>(); List<String> terms2 = new ArrayList<>(); // read term file to get the list of phrases logger.log(Level.INFO, "*********Reading term file...."); String combline = ""; String line = ""; BufferedReader br1 = new BufferedReader(new FileReader(termFile)); while ((line = br1.readLine()) != null) { String phr = line.replaceAll("[^A-Za-z]", " ").trim(); terms1.add(phr); } /** * List<String> words = Lists.newArrayList(phr.split(" ")); String mainWord =""; if * (words.contains("of")) { int pos = words.indexOf("of"); mainWord = words.get(pos-1); } else * mainWord = words.get(words.size()-1); String newMainWord=mainWord+"_"+i; * phr=phr.replace(mainWord, newMainWord); terms2.add(phr); */ // read the corpus file logger.log(Level.INFO, "*********Reading corpus file...."); combline = ""; line = ""; BufferedReader br2 = new BufferedReader(new FileReader(stemmedFile)); while ((line = br2.readLine()) != null) { combline = combline + " " + line; combline = combline.replaceAll("\\s+", " ").trim(); } /** * //sort the list by term length logger.log(Level.INFO, "*********Sorting term list by * length...."); Set<Entry<String, Integer>> set = terms.entrySet(); List<Entry<String, * Integer>> sortedList = new ArrayList<Entry<String, Integer>>( set); * Collections.sort(sortedList, new Comparator<Map.Entry<String, Integer>>() { public int * compare(Map.Entry<String, Integer> o1, Map.Entry<String, Integer> o2) { return * o2.getValue().compareTo(o1.getValue()); } }); * * <p>//replacing a noun phrases with a unique token in corpus logger.log(Level.INFO, * "*********Replacing terms with tokens...."); int size = sortedList.size(); int i =0; for * (Entry<String, Integer> entry : sortedList) { String phr = " "+entry.getKey()+" "; String * linkedphr = " "+ entry.getKey().replaceAll("\\s", "-")+" "; combline=combline.replaceAll(phr, * linkedphr); * * <p>i++; System.out.println(i + " out of " + size +" phrases are replaced."); } */ // replacing a noun phrases with a unique token in corpus logger.log(Level.INFO, "*********Replacing terms with tokens...."); int size = terms1.size(); int i = 0; for (String t : terms1) { String phr1 = " " + t + " "; String phr2 = " " + t + " term:" + t.replaceAll("\\s", "-") + " "; combline = combline.replaceAll(phr1, phr2); i++; System.out.println(i + " out of " + size + " phrases are replaced."); } // removing stop words, number, non-letter tokens from the corpus // writing corpus on disk logger.log(Level.INFO, "*********Rewritting the modified corpus..."); List<String> stopwords = Arrays.asList(StopWords.EnglishStopWords()); List<String> words = Lists.newArrayList(combline.split(" ")); PrintWriter pw = new PrintWriter(new FileOutputStream(term_corpusFile)); for (String w : words) { if (w.equals(".")) pw.println(); String w2 = w.replaceAll("[^a-zA-Z]", " ").trim(); // to remove non-letter tokens if (!w2.equals("") && !stopwords.contains(w)) { // to remove stop words pw.print(w + " "); } } pw.close(); logger.log(Level.INFO, "DONE!"); }