/** * opennlp part of speech tagging * * @param tokens * @return * @return * @throws IOException */ public static List<String> taggerAndStemmer(String inputFile) throws IOException { String tags[] = null; String[] tokens = tokenizer(inputFile); List<String> token_tag = new ArrayList<String>(); List<String> stemmedwords = new ArrayList<String>(); PlingStemmer stemmer = new PlingStemmer(); try (InputStream posModelStream = new FileInputStream("openNLPmodels\\en-pos-maxent.bin"); InputStream chunkerStream = new FileInputStream("openNLPmodels\\en-chunker.bin"); ) { POSModel modelTagger = new POSModel(posModelStream); POSTaggerME tagger = new POSTaggerME(modelTagger); tags = tagger.tag(tokens); for (int i = 0; i < tags.length; i++) { String w = tokens[i].toLowerCase(); // lowercase phrase w = stemmer.stem(w); // stemming phrase if (tags[i].equals("NNS")) token_tag.add(w + "/" + "NN"); else token_tag.add(w + "/" + tags[i]); } } catch (IOException ex) { // Handle exceptions } return token_tag; }
/** * stemmer (remove stop words) remove special characters, duplicate spaces, numbers, stop words * return a list of words * * @param rawdatapath * @return * @throws IOException */ public static List<String> cleanAndstemmer(String rawdatapath) throws IOException { String line = ""; String combline = ""; // read input data try { BufferedReader br = new BufferedReader(new FileReader(rawdatapath)); while ((line = br.readLine()) != null) combline = combline + " " + line; } catch (FileNotFoundException e) { // TODO Auto-generated catch block e.printStackTrace(); } /** */ // remove url // combline = combline.replaceAll("https?://\\S+\\s?", ""); // replace "- " with a white space // combline = combline.replaceAll("-", " "); // remove non letter // combline = combline.replaceAll("[^a-zA-Z\\s]", ""); // one other way: combline=combline.replaceAll("[-+^,\\.\\;:?()'><*&*#%{}=$\\d]", ""); // remove duplicate spaces combline = combline.replaceAll("\\s+", " ").trim(); List<String> words = Lists.newArrayList(combline.split(" ")); // stemming List<String> stemmedwords = new ArrayList<String>(); PlingStemmer stemmer = new PlingStemmer(); for (String w : words) { w = w.toLowerCase(); if (!w.equals("'s")) w = stemmer.stem(w); stemmedwords.add(w); } return stemmedwords; }