Java CValueAlgortithm Examples

Programming Language: Java

Namespace/Package Name: edu.ehu.galan.cvalue

Class/Type: CValueAlgortithm

Examples at hotexamples.com: 2

Java CValueAlgortithm - 2 examples found. These are the top-rated real-world Java examples of edu.ehu.galan.cvalue.CValueAlgortithm extracted from open-source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

addNewProcessingFilter(2)

init(2)

extractCandidate(1)

runAlgorithm(1)

setCandidates(1)

Example #1

Show file

File: mdvGenerator.java Project: tuyenbk/mdvgenerator

  /**
   * Extracts technical terms from a phrase file using the C-value method.
   *
   * <p>Every non-empty line of {@code phraseFile} is counted as one occurrence of a candidate
   * phrase. Candidates seen fewer than 50 times are dropped; the rest are passed to
   * {@link CValueAlgortithm}. The resulting terms are written, one per line, to {@code termFile},
   * and up to the first 20 of them are printed to standard output.
   *
   * <p>If the phrase file cannot be opened, the failure is logged and processing continues with
   * an empty candidate set.
   *
   * @param phraseFile path of the text file holding one candidate phrase per line
   * @throws IOException if reading the phrase file or opening the term file fails
   */
  public static void extractTerm(String phraseFile) throws IOException {
    final int minFrequency = 50; // phrases occurring fewer times than this are ignored
    final int topTermCount = 20; // number of terms echoed to standard output

    logger.log(Level.INFO, "*********Collecting and cleaning candidates. Please wait...");

    // Occurrence count for each distinct candidate phrase.
    HashMap<Candidate, Integer> map = new HashMap<>();

    try {
      // First pass: count the lines so progress can be reported as a percentage.
      long size;
      try (BufferedReader counter = new BufferedReader(new FileReader(phraseFile))) {
        size = counter.lines().count();
      }

      // Second pass: tally every non-empty line as one candidate occurrence.
      try (BufferedReader br = new BufferedReader(new FileReader(phraseFile))) {
        int percentComplete = 0;
        long linesRead = 0; // long, so that linesRead * 100 cannot overflow on big corpora
        String line;
        while ((line = br.readLine()) != null) {
          if (!line.isEmpty()) {
            Candidate cand = new Candidate(line, line.split("\\s").length);
            map.merge(cand, 1, Integer::sum);
          }

          // Report progress whenever the whole-number percentage advances.
          linesRead++;
          if (size > 0) {
            int percent = (int) (linesRead * 100 / size);
            if (percent > percentComplete) {
              percentComplete = percent;
              logger.log(Level.INFO, percentComplete + " percent of temp candidates processed.");
            }
          }
        }
      }
    } catch (FileNotFoundException e) {
      // Best effort, as before: carry on with whatever candidates were collected (none).
      // NOTE(review): assumes logger is a java.util.logging.Logger.
      logger.log(Level.SEVERE, "Phrase file could not be opened: " + phraseFile, e);
    }

    logger.log(Level.INFO, "*********Removing unfrequent noun phrases. Please wait...");

    // Keep only sufficiently frequent candidates and record their frequency on the candidate.
    List<Candidate> candidates = new ArrayList<>();
    map.forEach(
        (cand, freq) -> {
          if (freq >= minFrequency) {
            cand.incrementFreq(freq);
            candidates.add(cand);
          }
        });

    Document doc = new Document("C:\\", "terms.txt");

    CValueAlgortithm cvalue = new CValueAlgortithm();
    cvalue.init(doc); // initializes the algorithm for processing the desired document
    ILinguisticFilter pFilter = new AdjPrepNounFilter();
    cvalue.addNewProcessingFilter(pFilter);
    logger.log(Level.INFO, "*********Cvalue algorithm is running...");
    cvalue.setCandidates(candidates); // hand the pre-counted candidates to the algorithm
    cvalue.runAlgorithm(); // process the C-value algorithm with the provided filters

    List<Term> termList = doc.getTermList(); // results are read back from the document

    logger.log(Level.INFO, "*********Terms being written...");

    // try-with-resources guarantees the output file is flushed and closed even on failure.
    try (PrintWriter termWriter = new PrintWriter(new FileOutputStream(termFile))) {
      for (Term t : termList) {
        termWriter.println(t.toString());
      }
    }

    logger.log(Level.INFO, "Terms are saved.");

    System.out.println("Top 20 technical terms:");

    // Print at most topTermCount entries; the list may be shorter than that.
    int shown = Math.min(topTermCount, termList.size());
    for (int l = 0; l < shown; l++) {
      System.out.println(termList.get(l).toString());
    }
  }

Example #2

Show file

File: mdvGenerator.java Project: tuyenbk/mdvgenerator

  /**
   * Extracts candidate noun phrases from every regular file in {@code tagged_stemmedSingleDir}
   * and writes them, one per line, to {@code phraseFile}.
   *
   * <p>Each input file is read as a whitespace-separated sequence of {@code token/tag} items
   * (split at the last {@code '/'}). Items identical to the one immediately before them are
   * dropped, and items without a {@code '/'} are skipped with a warning. The tokens of a file
   * form a single sentence that is passed to {@link CValueAlgortithm#extractCandidate()} with an
   * {@link AdjPrepNounFilter}.
   *
   * <p>Phrases are written exactly as returned by the algorithm; no stemming or length filtering
   * is applied here.
   *
   * @throws IOException if the input directory cannot be listed, or if reading an input file or
   *     opening the output file fails
   */
  public static void extractNounPhrases() throws IOException {

    List<String> nounPhrases = new ArrayList<>();
    File folder = new File(tagged_stemmedSingleDir);
    File[] listOfFiles = folder.listFiles();
    if (listOfFiles == null) {
      // listFiles() returns null when the path is not a directory or cannot be read.
      throw new IOException("Cannot list files in input directory: " + folder);
    }
    int filecount = listOfFiles.length;
    int i = 0;

    // Scan the input folder holding the tokenized and tagged corpus, one file at a time.
    logger.log(Level.INFO, "Data files are being read. Please wait...");
    for (File file : listOfFiles) {
      if (!file.isFile()) {
        continue;
      }
      i++;

      // Read the whole file into one space-joined string.
      StringBuilder content = new StringBuilder();
      try (BufferedReader br = new BufferedReader(new FileReader(file))) {
        String line;
        while ((line = br.readLine()) != null) {
          content.append(' ').append(line);
        }
      } catch (FileNotFoundException e) {
        // Best effort, as before: the file is processed as if it were empty.
        // NOTE(review): assumes logger is a java.util.logging.Logger.
        logger.log(Level.SEVERE, "Input file could not be opened: " + file, e);
      }

      logger.log(Level.INFO, "The array of tagged tokens are being generated. Please wait...");

      // Collapse runs of whitespace so that splitting on a single space yields clean items.
      String combline = content.toString().replaceAll("\\s+", " ").trim();
      String[] taggedTokens = combline.split(" ");

      logger.log(Level.INFO, "Tokenlist is being generated. Please wait...");
      LinkedList<Token> tokenList = new LinkedList<>();
      String preToken = "";
      for (String t : taggedTokens) {
        // Ignore empty items and immediate repetitions of the previous item.
        if (t.isEmpty() || t.equals(preToken)) {
          continue;
        }
        preToken = t;

        int slash = t.lastIndexOf('/');
        if (slash < 0) {
          // Without a separator there is no tag to split off.
          logger.log(Level.WARNING, "Skipping token without a tag: " + t);
          continue;
        }
        tokenList.add(new Token(t.substring(0, slash), t.substring(slash + 1)));
      }

      // The tokens of one file are handed to the algorithm as a single sentence.
      List<LinkedList<Token>> tokenizedSentenceList = new ArrayList<LinkedList<Token>>();
      tokenizedSentenceList.add(tokenList);

      Document doc = new Document("C:\\", "terms.txt");
      doc.List(tokenizedSentenceList);

      CValueAlgortithm cvalue = new CValueAlgortithm();
      cvalue.init(doc); // initializes the algorithm for processing the desired document

      ILinguisticFilter pFilter = new AdjPrepNounFilter(); // noun phrase pattern
      cvalue.addNewProcessingFilter(pFilter);

      List<String> cList = cvalue.extractCandidate(); // extract noun phrase candidates
      nounPhrases.addAll(cList);

      logger.log(Level.INFO, "---------" + i + " out of " + filecount + " files processed------");
    }

    logger.log(Level.INFO, "*********Writting noun phrases. Please wait...");
    // try-with-resources guarantees the output file is flushed and closed even on failure.
    try (PrintWriter phraseWriter = new PrintWriter(new FileOutputStream(phraseFile))) {
      for (String phrase : nounPhrases) {
        phraseWriter.println(phrase);
      }
    }

    logger.log(Level.INFO, "DONE!");
  }