示例#1
0
  /**
   * Tokenizes the input, part-of-speech tags the tokens with the OpenNLP maxent POS model, and
   * returns one {@code stem/TAG} entry per token.
   *
   * <p>Each token is lower-cased and run through {@code PlingStemmer} before being joined to its
   * tag with {@code "/"}. The tag {@code NNS} is reported as {@code NN} (presumably because the
   * stemmer has already reduced the word to its singular form); every other tag is kept as
   * produced by the tagger.
   *
   * @param inputFile value passed unchanged to {@code tokenizer(String)}; presumably the path of
   *     the text to process (confirm against {@code tokenizer})
   * @return the {@code stem/TAG} strings, in token order
   * @throws IOException if the POS model file cannot be opened or read, or if tokenizing fails
   */
  public static List<String> taggerAndStemmer(String inputFile) throws IOException {
    String[] tokens = tokenizer(inputFile);
    PlingStemmer stemmer = new PlingStemmer();

    String[] tags;
    // Forward slashes resolve on Windows as well as on Unix-like systems, whereas the previous
    // backslash separators only worked on Windows. Only the POS model is opened: the chunker
    // model was never used here, so requiring it to exist was an unnecessary failure mode.
    try (InputStream posModelStream = new FileInputStream("openNLPmodels/en-pos-maxent.bin")) {
      POSTaggerME tagger = new POSTaggerME(new POSModel(posModelStream));
      tags = tagger.tag(tokens);
    }
    // An IOException from loading the model now propagates to the caller (as the signature
    // declares) instead of being swallowed and turned into a silently empty result.

    List<String> tokenTags = new ArrayList<>(tags.length);
    for (int i = 0; i < tags.length; i++) {
      String stem = stemmer.stem(tokens[i].toLowerCase());
      String tag = tags[i].equals("NNS") ? "NN" : tags[i];
      tokenTags.add(stem + "/" + tag);
    }
    return tokenTags;
  }
示例#2
0
  /**
   * Reads a text file and returns its whitespace-separated words, lower-cased and stemmed.
   *
   * <p>All lines of the file are joined with single spaces, runs of whitespace are collapsed to
   * one space, and the result is split on spaces. Each word is lower-cased and passed through
   * {@code PlingStemmer}, except the token {@code 's}, which is kept verbatim.
   *
   * <p>Note: despite the method name, no URL, punctuation, digit, or stop-word removal is
   * performed here; those steps existed only as disabled (commented-out) code.
   *
   * @param rawdatapath path of the text file to read
   * @return the stemmed words in file order; an empty list if the file contains no
   *     non-whitespace text
   * @throws IOException if the file does not exist or cannot be read
   */
  public static List<String> cleanAndstemmer(String rawdatapath) throws IOException {
    StringBuilder text = new StringBuilder();

    // try-with-resources closes the reader on every path. A missing file now surfaces as the
    // declared IOException instead of being printed and then processed as empty input.
    // NOTE(review): FileReader decodes with the platform default charset; confirm the expected
    // encoding of the input files.
    try (BufferedReader br = new BufferedReader(new FileReader(rawdatapath))) {
      String line;
      while ((line = br.readLine()) != null) {
        text.append(' ').append(line);
      }
    }

    // Collapse duplicate whitespace so that splitting on a single space yields no empty tokens.
    String normalized = text.toString().replaceAll("\\s+", " ").trim();

    List<String> stemmedwords = new ArrayList<>();
    if (normalized.isEmpty()) {
      // "".split(" ") would yield one empty token; an empty file has no words to return.
      return stemmedwords;
    }

    PlingStemmer stemmer = new PlingStemmer();
    for (String word : normalized.split(" ")) {
      String w = word.toLowerCase();
      // The possessive marker is kept as-is rather than handed to the stemmer.
      if (!w.equals("'s")) {
        w = stemmer.stem(w);
      }
      stemmedwords.add(w);
    }

    return stemmedwords;
  }