/** * extract technical terms using C-value method * * @param phraseFile * @throws IOException */ public static void extractTerm(String phraseFile) throws IOException { logger.log(Level.INFO, "*********Collecting and cleaning candidates. Please wait..."); HashMap<Candidate, Integer> map = new HashMap<Candidate, Integer>(); // map candiates and their frequency String line = ""; int percentComplete1 = 0; int i = 0; try { BufferedReader br = new BufferedReader(new FileReader(phraseFile)); long size = br.lines().count(); br = new BufferedReader(new FileReader(phraseFile)); while ((line = br.readLine()) != null) { if (!line.equals("")) { // check empty line Candidate cand = new Candidate(line, line.split("\\s").length); if (map.containsKey(cand)) { map.put(cand, map.get(cand) + 1); } else map.put(cand, 1); } // reporting the progress i++; if (i * 100 / size > percentComplete1) { percentComplete1 = percentComplete1 + 1; logger.log(Level.INFO, percentComplete1 + " percent of temp candidates processed."); } } } catch (FileNotFoundException e) { // TODO Auto-generated catch block e.printStackTrace(); } logger.log(Level.INFO, "*********Removing unfrequent noun phrases. Please wait..."); // List<Candidate> cleanedCand = new ArrayList<>(); Candidate[] cleanedCand = map.keySet().toArray(new Candidate[map.size()]); List<Candidate> candidates = new ArrayList<>(); for (Candidate c : cleanedCand) { if (map.get(c) >= 50) { // ignore phrases occurring less than 50 times in the corpus c.incrementFreq(map.get(c)); candidates.add(c); } } Document doc = new Document("C:\\", "terms.txt"); // doc.List(tokenizedSentenceList); CValueAlgortithm cvalue = new CValueAlgortithm(); cvalue.init(doc); // initializes the algorithm for processing the desired document. ILinguisticFilter pFilter = new AdjPrepNounFilter(); // filter cvalue.addNewProcessingFilter(pFilter); ; // for example the AdjNounFilter logger.log(Level.INFO, "*********Cvalue algorithm is running..."); cvalue.setCandidates(candidates); // set candidates to the algorithm cvalue.runAlgorithm(); // process the CValue algorithm with the provided filters doc.getTermList(); // get the results List<Term> termList = doc.getTermList(); logger.log(Level.INFO, "*********Terms being written..."); PrintWriter pw2 = new PrintWriter(new FileOutputStream(termFile)); int k = 0; for (Term t : termList) { k++; pw2.println(t.toString()); } pw2.close(); logger.log(Level.INFO, "Terms are saved."); System.out.println("Top 20 technical terms:"); for (int l = 0; l < 21; l++) System.out.println(termList.get(l).toString()); }