Example #1
  /**
   * Extracts technical terms from a list of candidate phrases using the C-value method.
   *
   * @param phraseFile path to the file containing one candidate phrase per line
   * @throws IOException if the phrase file cannot be read or the term list cannot be written
   */
  public static void extractTerm(String phraseFile) throws IOException {

    logger.log(Level.INFO, "*********Collecting and cleaning candidates. Please wait...");

    HashMap<Candidate, Integer> map = new HashMap<>(); // maps candidates to their frequency
    String line = "";
    int percentComplete1 = 0;
    int i = 0;
    // count the lines first so progress can be reported as a percentage
    long size;
    try (BufferedReader counter = new BufferedReader(new FileReader(phraseFile))) {
      size = counter.lines().count();
    }

    try (BufferedReader br = new BufferedReader(new FileReader(phraseFile))) {
      while ((line = br.readLine()) != null) {
        if (!line.isEmpty()) { // skip empty lines
          Candidate cand = new Candidate(line, line.split("\\s").length);
          map.merge(cand, 1, Integer::sum); // increment the candidate's frequency
        }

        // report progress
        i++;
        if (i * 100 / size > percentComplete1) {
          percentComplete1++;
          logger.log(Level.INFO, percentComplete1 + " percent of temp candidates processed.");
        }
      }
    }

    logger.log(Level.INFO, "*********Removing unfrequent noun phrases. Please wait...");

    Candidate[] cleanedCand = map.keySet().toArray(new Candidate[map.size()]);

    List<Candidate> candidates = new ArrayList<>();

    for (Candidate c : cleanedCand) {
      if (map.get(c) >= 50) { // ignore phrases occurring less than 50 times in the corpus
        c.incrementFreq(map.get(c));
        candidates.add(c);
      }
    }

    Document doc = new Document("C:\\", "terms.txt");

    CValueAlgortithm cvalue = new CValueAlgortithm();
    cvalue.init(doc); // initializes the algorithm for processing the desired document
    ILinguisticFilter pFilter = new AdjPrepNounFilter(); // linguistic filter, e.g. AdjNounFilter
    cvalue.addNewProcessingFilter(pFilter);
    logger.log(Level.INFO, "*********Cvalue algorithm is running...");
    cvalue.setCandidates(candidates); // set candidates to the algorithm
    cvalue.runAlgorithm(); // process the CValue algorithm with the provided filters

    List<Term> termList = doc.getTermList(); // get the results

    logger.log(Level.INFO, "*********Terms being written...");

    try (PrintWriter pw2 = new PrintWriter(new FileOutputStream(termFile))) {
      for (Term t : termList) {
        pw2.println(t.toString());
      }
    }

    logger.log(Level.INFO, "Terms are saved.");

    System.out.println("Top 20 technical terms:");

    for (int l = 0; l < Math.min(20, termList.size()); l++) {
      System.out.println(termList.get(l).toString());
    }
  }
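
The frequency counting above only merges duplicate phrases if Candidate defines value-based equals and hashCode; otherwise every input line would map to a distinct HashMap key. The library's actual Candidate class is not shown in this example, so the following is a minimal sketch under that assumption, matching the constructor and incrementFreq calls used above:

import java.util.Objects;

public class Candidate {
  private final String text; // the candidate phrase
  private final int length;  // number of words in the phrase
  private int freq;          // corpus frequency, set later via incrementFreq

  public Candidate(String text, int length) {
    this.text = text;
    this.length = length;
  }

  public void incrementFreq(int delta) {
    this.freq += delta;
  }

  // value equality on the phrase text (which also determines the length),
  // so the HashMap can merge duplicate lines into one frequency count
  @Override
  public boolean equals(Object o) {
    if (this == o) return true;
    if (!(o instanceof Candidate)) return false;
    return text.equals(((Candidate) o).text);
  }

  @Override
  public int hashCode() {
    return Objects.hash(text);
  }
}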
Example #2
  /**
   * Extracts a list of noun phrases from the tagged corpus and saves it to disk. Noun phrases
   * longer than 10 words are ignored.
   *
   * @throws IOException if the corpus files cannot be read or the phrase file cannot be written
   */
  public static void extractNounPhrases() throws IOException {

    // initial declarations
    List<String> nounPhrases = new ArrayList<>();
    File folder = new File(tagged_stemmedSingleDir);
    File[] listOfFiles = folder.listFiles();
    if (listOfFiles == null) { // guard against a missing or unreadable directory
      throw new IOException("Cannot read directory: " + tagged_stemmedSingleDir);
    }
    int filecount = listOfFiles.length;
    int i = 0;

    /*
     * Scan the input folder that contains the tokenized and tagged corpus, and extract noun
     * phrases matching a pattern.
     */
    logger.log(Level.INFO, "Data files are being read. Please wait...");
    for (File file : listOfFiles) {
      if (file.isFile()) {
        LinkedList<Token> tokenList = new LinkedList<>();

        List<LinkedList<Token>> tokenizedSentenceList = new ArrayList<LinkedList<Token>>();
        i++;
        String filepath = tagged_stemmedSingleDir + "\\" + file.getName();
        StringBuilder comb = new StringBuilder(); // collect the file contents into one string
        String line = "";
        try (BufferedReader br = new BufferedReader(new FileReader(filepath))) {
          while ((line = br.readLine()) != null) comb.append(" ").append(line);
        }
        String combline = comb.toString();

        logger.log(Level.INFO, "The array of tagged tokens are being generated. Please wait...");

        // remove duplicate spaces
        combline = combline.replaceAll("\\s+", " ").trim();

        // save string to array list
        List<String> tagged_tokens = Lists.newArrayList(combline.split(" "));

        logger.log(Level.INFO, "Tokenlist is being generated. Please wait...");
        String preToken = "";
        for (String t : tagged_tokens) {

          // ignore empty, blank, or immediately repeated tokens; require a "/" tag separator
          if (!t.isEmpty() && !t.equals(" ") && !t.equals(preToken) && t.lastIndexOf("/") > 0) {
            String token = t.substring(0, t.lastIndexOf("/"));
            String tag = t.substring(t.lastIndexOf("/") + 1);
            tokenList.add(new Token(token, tag));
            preToken = t; // remember the last accepted token to skip consecutive duplicates
          }
        }

        // extract noun phrases
        tokenizedSentenceList.add(tokenList);

        Document doc = new Document("C:\\", "terms.txt");
        doc.List(tokenizedSentenceList);

        CValueAlgortithm cvalue = new CValueAlgortithm();
        cvalue.init(doc); // initializes the algorithm for processing the desired document.

        ILinguisticFilter pFilter = new AdjPrepNounFilter(); // noun phrase pattern, e.g. AdjNounFilter
        cvalue.addNewProcessingFilter(pFilter);

        List<String> cList = cvalue.extractCandidate(); // extract NP candidate

        nounPhrases.addAll(cList);

        logger.log(Level.INFO, "---------" + i + " out of " + filecount + " files processed------");
      }
    }

    // write noun phrases to disk
    logger.log(Level.INFO, "*********Writing noun phrases. Please wait...");
    PrintWriter pw1 = new PrintWriter(new FileOutputStream(phraseFile));
    List<String> stemmedwords = new ArrayList<String>();
    PlingStemmer stemmer = new PlingStemmer();

    for (String c : nounPhrases) {

      /*
       * Stemming and the length filter are currently disabled:
       * List<String> words = Lists.newArrayList(c.split("\\s"));
       * String stemmedPhrase = "";
       * if (words.size() <= 10) // ignore noun phrases with length > 10
       *   for (String w : words) {
       *     w = w.toLowerCase(); // lowercase the phrase
       *     w = stemmer.stem(w); // stem the phrase
       *     stemmedPhrase = stemmedPhrase + " " + w;
       *   }
       * pw1.println(stemmedPhrase);
       */
      pw1.println(c);
    }
    pw1.close();

    logger.log(Level.INFO, "DONE!");
  }
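
The javadoc says noun phrases longer than 10 words are ignored, but that filter (together with stemming) is commented out in the writing loop above. The following is a minimal sketch of that step as the commented block describes it; the helper name writeStemmedPhrases is hypothetical, and it assumes javatools' PlingStemmer, whose static stem(String) singularizes an English word:

import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.List;

import javatools.parsers.PlingStemmer; // assumption: javatools' English plural stemmer

public class PhraseWriter {

  /** Hypothetical helper: lowercase, stem, and write phrases of at most 10 words. */
  public static void writeStemmedPhrases(List<String> nounPhrases, String phraseFile)
      throws IOException {
    try (PrintWriter pw = new PrintWriter(new FileOutputStream(phraseFile))) {
      for (String phrase : nounPhrases) {
        String[] words = phrase.split("\\s+");
        if (words.length > 10) continue; // ignore noun phrases longer than 10 words

        StringBuilder stemmed = new StringBuilder();
        for (String w : words) {
          if (stemmed.length() > 0) stemmed.append(' ');
          stemmed.append(PlingStemmer.stem(w.toLowerCase())); // lowercase, then singularize
        }
        pw.println(stemmed);
      }
    }
  }
}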