Example #1
0
  /**
   * Extracts technical terms from a file of candidate noun phrases using the C-value method.
   *
   * <p>Reads one candidate phrase per line from {@code phraseFile}, counts the frequency of each
   * distinct phrase, drops phrases occurring fewer than 50 times, runs the C-value algorithm over
   * the survivors, writes the ranked terms to {@code termFile}, and prints up to the top 20 terms
   * to stdout.
   *
   * @param phraseFile path to a text file containing one candidate phrase per line
   * @throws IOException if the phrase file cannot be read or the term file cannot be written
   */
  public static void extractTerm(String phraseFile) throws IOException {

    logger.log(Level.INFO, "*********Collecting and cleaning candidates. Please wait...");

    // First pass: count total lines so progress can be reported as a percentage.
    long size;
    try (BufferedReader counter = new BufferedReader(new FileReader(phraseFile))) {
      size = counter.lines().count();
    }

    // Second pass: map each candidate phrase to its corpus frequency.
    // NOTE(review): merge() relies on Candidate implementing equals/hashCode — the original
    // containsKey/get/put idiom did too, so this is behavior-preserving.
    HashMap<Candidate, Integer> map = new HashMap<>();
    int percentComplete = 0;
    long processed = 0;
    try (BufferedReader br = new BufferedReader(new FileReader(phraseFile))) {
      String line;
      while ((line = br.readLine()) != null) {
        if (!line.isEmpty()) { // skip empty lines
          Candidate cand = new Candidate(line, line.split("\\s").length);
          map.merge(cand, 1, Integer::sum);
        }

        // Report progress in whole-percent increments; guard against an empty file
        // (size == 0 would previously divide by zero).
        processed++;
        if (size > 0 && processed * 100 / size > percentComplete) {
          percentComplete++;
          logger.log(Level.INFO, percentComplete + " percent of temp candidates processed.");
        }
      }
    }
    // File errors (including FileNotFoundException) now propagate as IOException instead of
    // being swallowed — the original printed a stack trace and continued with partial data.

    logger.log(Level.INFO, "*********Removing unfrequent noun phrases. Please wait...");

    // Keep only phrases occurring at least 50 times in the corpus, transferring the
    // observed frequency onto each surviving Candidate.
    final int minFrequency = 50;
    List<Candidate> candidates = new ArrayList<>();
    map.forEach(
        (cand, freq) -> {
          if (freq >= minFrequency) {
            cand.incrementFreq(freq);
            candidates.add(cand);
          }
        });

    Document doc = new Document("C:\\", "terms.txt");

    // Initialize the C-value algorithm for the target document and attach the
    // adjective/preposition/noun linguistic filter.
    CValueAlgortithm cvalue = new CValueAlgortithm();
    cvalue.init(doc);
    ILinguisticFilter pFilter = new AdjPrepNounFilter();
    cvalue.addNewProcessingFilter(pFilter);
    logger.log(Level.INFO, "*********Cvalue algorithm is running...");
    cvalue.setCandidates(candidates);
    cvalue.runAlgorithm(); // process the C-value algorithm with the provided filters

    List<Term> termList = doc.getTermList(); // ranked results

    logger.log(Level.INFO, "*********Terms being written...");

    // try-with-resources guarantees the writer is flushed and closed even on failure.
    try (PrintWriter pw = new PrintWriter(new FileOutputStream(termFile))) {
      for (Term t : termList) {
        pw.println(t.toString());
      }
    }

    logger.log(Level.INFO, "Terms are saved.");

    System.out.println("Top 20 technical terms:");

    // Print at most 20 terms; the original iterated to index 20 inclusive (21 terms)
    // and threw IndexOutOfBoundsException when fewer terms were found.
    int top = Math.min(20, termList.size());
    for (int l = 0; l < top; l++) {
      System.out.println(termList.get(l).toString());
    }
  }