Example #1
  /**
   * Removes stop words from a list of tagged words (no longer used).
   *
   * @param words POS-tagged tokens of the form "word/TAG"
   * @return the tokens that are neither stop words nor empty after stripping non-letter characters
   * @throws IOException if the stop-word list cannot be loaded
   */
  public static List<String> stopRemover(List<String> words) throws IOException {

    List<String> stopwords = Arrays.asList(StopWords.EnglishStopWords());
    List<String> filterwords = new ArrayList<String>();

    for (String w : words) {
      // strip the POS tag (everything from the last "/"); keep the token unchanged if it has no tag
      int tagPos = w.lastIndexOf("/");
      String w1 = tagPos >= 0 ? w.substring(0, tagPos) : w;
      // blank out non-letter characters to detect tokens that are only punctuation or digits
      String w2 = w1.replaceAll("[^a-zA-Z]", " ").trim();
      if (!w2.equals("") && !stopwords.contains(w1)) { // skip empty tokens and stop words
        filterwords.add(w);
      }
    }
    }

    return filterwords;
  }
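
For reference, a minimal usage sketch (not part of the original example): it assumes the tokens carry "word/TAG" suffixes, as the lastIndexOf("/") call implies, that StopWords.EnglishStopWords() returns an array of English stop words containing "the", and that the method above lives in a class named Preprocessor (a placeholder name).

import java.io.IOException;
import java.util.Arrays;
import java.util.List;

public class StopRemoverDemo {
  public static void main(String[] args) throws IOException {
    // hypothetical POS-tagged tokens; the part after "/" is the tag that stopRemover strips off
    List<String> tagged = Arrays.asList("the/DT", "quick/JJ", "brown/JJ", "fox/NN");

    // Preprocessor is a placeholder for the class that declares stopRemover above
    List<String> kept = Preprocessor.stopRemover(tagged);

    // prints [quick/JJ, brown/JJ, fox/NN] if "the" is in the stop-word list
    System.out.println(kept);
  }
}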
Example #2
  /**
   * Replaces noun phrases in the stemmed corpus with unique tokens, removes stop words and
   * non-letter tokens, and writes the result to disk.
   *
   * @param termFile path to the file with one noun phrase per line
   * @param stemmedFile path to the stemmed corpus file
   * @param term_corpusFile path of the output file for the rewritten corpus
   * @throws IOException if any of the files cannot be read or written
   */
  public static void replacePhrase(String termFile, String stemmedFile, String term_corpusFile)
      throws IOException {

    List<String> terms1 = new ArrayList<>();

    // read the term file to get the list of noun phrases
    logger.log(Level.INFO, "*********Reading term file....");
    String line = "";
    BufferedReader br1 = new BufferedReader(new FileReader(termFile));
    while ((line = br1.readLine()) != null) {
      // keep only letters and collapse whitespace so the phrase matches the normalized corpus
      String phr = line.replaceAll("[^A-Za-z]", " ").replaceAll("\\s+", " ").trim();
      if (!phr.isEmpty()) { // skip lines that contain no letters
        terms1.add(phr);
      }
    }
    br1.close();

    // read the corpus file into a single whitespace-normalized string
    // (a StringBuilder avoids quadratic string concatenation on large corpora)
    logger.log(Level.INFO, "*********Reading corpus file....");
    StringBuilder corpus = new StringBuilder();
    BufferedReader br2 = new BufferedReader(new FileReader(stemmedFile));

    while ((line = br2.readLine()) != null) {
      corpus.append(" ").append(line);
    }
    br2.close();
    String combline = corpus.toString().replaceAll("\\s+", " ").trim();

    // replace each noun phrase in the corpus with the phrase followed by a unique
    // "term:..." token (spaces inside the phrase joined by hyphens)
    logger.log(Level.INFO, "*********Replacing terms with tokens....");
    int size = terms1.size();
    int i = 0;

    for (String t : terms1) {
      String phr1 = " " + t + " ";
      String phr2 = " " + t + " term:" + t.replaceAll("\\s", "-") + " ";
      combline = combline.replace(phr1, phr2); // literal replacement; the phrase is plain text, not a regex
      i++;
      System.out.println(i + " out of " + size + " phrases are replaced.");
    }

    // remove stop words, numbers, and non-letter tokens from the corpus and write it to disk
    logger.log(Level.INFO, "*********Rewriting the modified corpus...");
    List<String> stopwords = Arrays.asList(StopWords.EnglishStopWords());
    List<String> words = Lists.newArrayList(combline.split(" "));
    PrintWriter pw = new PrintWriter(new FileOutputStream(term_corpusFile));
    for (String w : words) {

      // a bare period ends the current output line (sentence boundary in the corpus)
      if (w.equals(".")) {
        pw.println();
      }

      String w2 = w.replaceAll("[^a-zA-Z]", " ").trim(); // empty if the token has no letters
      if (!w2.equals("") && !stopwords.contains(w)) { // skip non-letter tokens and stop words
        pw.print(w + " ");
      }
    }
    pw.close();

    logger.log(Level.INFO, "DONE!");
  }
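
A usage sketch for this method as well, with placeholder file paths and class name: the term file is expected to hold one noun phrase per line and the stemmed file the stemmed corpus, as the reads above imply. For a phrase such as "information retrieval", the corpus fragment "an information retrieval system" becomes "an information retrieval term:information-retrieval system" before stop words are filtered out.

import java.io.IOException;

public class ReplacePhraseDemo {
  public static void main(String[] args) throws IOException {
    // hypothetical paths; Preprocessor stands in for the class that declares replacePhrase above
    String termFile = "terms.txt";               // one noun phrase per line
    String stemmedFile = "corpus_stemmed.txt";   // stemmed corpus
    String termCorpusFile = "corpus_terms.txt";  // output: corpus with term: tokens, stop words removed

    Preprocessor.replacePhrase(termFile, stemmedFile, termCorpusFile);
  }
}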