/**
 * Extracts technical terms using the C-value method.
 *
 * @param phraseFile path to the file of candidate noun phrases, one per line
 * @throws IOException if the phrase file cannot be read or the term file cannot be written
 */
public static void extractTerm(String phraseFile) throws IOException {
    logger.log(Level.INFO, "*********Collecting and cleaning candidates. Please wait...");
    // map each candidate to its frequency
    HashMap<Candidate, Integer> map = new HashMap<Candidate, Integer>();
    String line = "";
    int percentComplete = 0;
    int i = 0;
    try {
        // first pass: count the lines so progress can be reported
        BufferedReader br = new BufferedReader(new FileReader(phraseFile));
        long size = br.lines().count();
        br.close();
        // second pass: tally candidate frequencies
        br = new BufferedReader(new FileReader(phraseFile));
        while ((line = br.readLine()) != null) {
            if (!line.equals("")) { // skip empty lines
                Candidate cand = new Candidate(line, line.split("\\s").length);
                if (map.containsKey(cand)) {
                    map.put(cand, map.get(cand) + 1);
                } else {
                    map.put(cand, 1);
                }
            }
            // report progress
            i++;
            if (i * 100 / size > percentComplete) {
                percentComplete++;
                logger.log(Level.INFO, percentComplete + " percent of temp candidates processed.");
            }
        }
        br.close();
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    }

    logger.log(Level.INFO, "*********Removing infrequent noun phrases. Please wait...");
    Candidate[] cleanedCand = map.keySet().toArray(new Candidate[map.size()]);
    List<Candidate> candidates = new ArrayList<>();
    for (Candidate c : cleanedCand) {
        if (map.get(c) >= 50) { // ignore phrases occurring fewer than 50 times in the corpus
            c.incrementFreq(map.get(c));
            candidates.add(c);
        }
    }

    Document doc = new Document("C:\\", "terms.txt");
    CValueAlgortithm cvalue = new CValueAlgortithm();
    cvalue.init(doc); // initialize the algorithm for the target document
    ILinguisticFilter pFilter = new AdjPrepNounFilter(); // linguistic filter, e.g. AdjNounFilter
    cvalue.addNewProcessingFilter(pFilter);

    logger.log(Level.INFO, "*********Cvalue algorithm is running...");
    cvalue.setCandidates(candidates); // hand the candidates to the algorithm
    cvalue.runAlgorithm(); // run the C-value algorithm with the configured filters
    List<Term> termList = doc.getTermList(); // collect the ranked terms

    logger.log(Level.INFO, "*********Terms being written...");
    PrintWriter pw2 = new PrintWriter(new FileOutputStream(termFile));
    for (Term t : termList) {
        pw2.println(t.toString());
    }
    pw2.close();
    logger.log(Level.INFO, "Terms are saved.");

    System.out.println("Top 20 technical terms:");
    for (int l = 0; l < Math.min(20, termList.size()); l++) {
        System.out.println(termList.get(l).toString());
    }
}
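/**
 * Illustrative sketch of the C-value score that CValueAlgortithm computes
 * (after Frantzi, Ananiadou and Mima): for a candidate of {@code len} words
 * with corpus frequency {@code freq}, C-value = log2(len) * freq when the
 * candidate is never nested inside a longer candidate, and
 * log2(len) * (freq - containerFreqSum / containerCount) when it is.
 * This helper only documents the ranking criterion; it is NOT the library's
 * implementation, which may use a smoothed variant such as log2(len + 1).
 */
static double cValueSketch(int len, int freq, int containerCount, long containerFreqSum) {
    double base = Math.log(len) / Math.log(2); // log2 of the phrase length in words
    if (containerCount == 0) {
        return base * freq; // non-nested candidate
    }
    // nested candidate: subtract the mean frequency of its longer containers
    return base * (freq - (double) containerFreqSum / containerCount);
}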
/**
 * Extracts a list of noun phrases from the tagged corpus and saves it to disk.
 * Noun phrases longer than 10 words are meant to be ignored (see the disabled
 * stemming/length-filter block below).
 *
 * @throws IOException if a corpus file cannot be read or the phrase file cannot be written
 */
public static void extractNounPhrases() throws IOException {
    /* initial declarations */
    List<String> nounPhrases = new ArrayList<>();
    File folder = new File(tagged_stemmedSingleDir);
    File[] listOfFiles = folder.listFiles();
    int filecount = listOfFiles.length;
    int i = 0;

    /*
     * Scan the input folder containing the tokenized and tagged corpus and
     * extract the noun phrases that match the configured pattern.
     */
    logger.log(Level.INFO, "Data files are being read. Please wait...");
    for (File file : listOfFiles) {
        if (file.isFile()) {
            LinkedList<Token> tokenList = new LinkedList<>();
            List<LinkedList<Token>> tokenizedSentenceList = new ArrayList<LinkedList<Token>>();
            i++;
            String filepath = tagged_stemmedSingleDir + "\\" + file.getName();
            String combline = "";
            String line = "";
            try {
                BufferedReader br = new BufferedReader(new FileReader(filepath));
                while ((line = br.readLine()) != null) {
                    combline = combline + " " + line;
                }
                br.close();
            } catch (FileNotFoundException e) {
                e.printStackTrace();
            }

            logger.log(Level.INFO, "The array of tagged tokens is being generated. Please wait...");
            // collapse duplicate whitespace
            combline = combline.replaceAll("\\s+", " ").trim();
            // split the tagged text into token/tag strings
            List<String> tagged_tokens = Lists.newArrayList(combline.split(" "));

            logger.log(Level.INFO, "Tokenlist is being generated. Please wait...");
            String preToken = "";
            for (String t : tagged_tokens) {
                // ignore empty, whitespace-only, or immediately repeated tokens
                if (!t.equals("") && !t.equals(" ") && !t.equals(preToken)) {
                    String token = t.substring(0, t.lastIndexOf("/"));
                    String tag = t.substring(t.lastIndexOf("/") + 1);
                    tokenList.add(new Token(token, tag));
                }
                preToken = t; // remember the current token to skip meaningless repeats
            }

            /* extract noun phrases */
            tokenizedSentenceList.add(tokenList);
            Document doc = new Document("C:\\", "terms.txt");
            doc.List(tokenizedSentenceList);
            CValueAlgortithm cvalue = new CValueAlgortithm();
            cvalue.init(doc); // initialize the algorithm for the target document
            ILinguisticFilter pFilter = new AdjPrepNounFilter();
            cvalue.addNewProcessingFilter(pFilter); // set the noun-phrase pattern, e.g. AdjNounFilter
            List<String> cList = cvalue.extractCandidate(); // extract NP candidates
            nounPhrases.addAll(cList);
            logger.log(Level.INFO, "---------" + i + " out of " + filecount + " files processed------");
        }
    }

    /* write the noun phrases to disk */
    logger.log(Level.INFO, "*********Writing noun phrases. Please wait...");
    PrintWriter pw1 = new PrintWriter(new FileOutputStream(phraseFile));
    PlingStemmer stemmer = new PlingStemmer(); // used only by the disabled stemming block below
    for (String c : nounPhrases) {
        /*
         * Stemming and length filtering, currently disabled:
         *
         * List<String> words = Lists.newArrayList(c.split("\\s"));
         * String stemmedPhrase = "";
         * if (words.size() <= 10) { // ignore noun phrases longer than 10 words
         *     for (String w : words) {
         *         w = w.toLowerCase(); // lowercase the word
         *         w = stemmer.stem(w); // stem the word
         *         stemmedPhrase = stemmedPhrase + " " + w;
         *     }
         * }
         * pw1.println(stemmedPhrase);
         */
        pw1.println(c);
    }
    pw1.close();
    logger.log(Level.INFO, "DONE!");
}
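/**
 * Usage sketch: the two methods above form a two-stage pipeline. This main
 * method is illustrative only; it assumes the class fields phraseFile,
 * termFile and tagged_stemmedSingleDir are configured elsewhere and is not
 * part of the original code.
 */
public static void main(String[] args) throws IOException {
    extractNounPhrases();    // stage 1: write candidate noun phrases to phraseFile
    extractTerm(phraseFile); // stage 2: rank the candidates by C-value and write termFile
}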