/** * opennlp part of speech tagging * * @param tokens * @return * @return * @throws IOException */ public static List<String> taggerAndStemmer(String inputFile) throws IOException { String tags[] = null; String[] tokens = tokenizer(inputFile); List<String> token_tag = new ArrayList<String>(); List<String> stemmedwords = new ArrayList<String>(); PlingStemmer stemmer = new PlingStemmer(); try (InputStream posModelStream = new FileInputStream("openNLPmodels\\en-pos-maxent.bin"); InputStream chunkerStream = new FileInputStream("openNLPmodels\\en-chunker.bin"); ) { POSModel modelTagger = new POSModel(posModelStream); POSTaggerME tagger = new POSTaggerME(modelTagger); tags = tagger.tag(tokens); for (int i = 0; i < tags.length; i++) { String w = tokens[i].toLowerCase(); // lowercase phrase w = stemmer.stem(w); // stemming phrase if (tags[i].equals("NNS")) token_tag.add(w + "/" + "NN"); else token_tag.add(w + "/" + tags[i]); } } catch (IOException ex) { // Handle exceptions } return token_tag; }
/**
 * Splits the sentence on whitespace, POS-tags every token, and prints one
 * "word => tag @ probability" line per token to stdout.
 *
 * @param sentence whitespace-separated text to tag
 */
public void run(String sentence) {
  String[] tokens = sentence.split("\\s+");
  POSTaggerME posTagger = new POSTaggerME(getModel());
  String[] predictedTags = posTagger.tag(tokens);
  // probs() reports the probabilities for the most recent tag() call.
  double[] tagProbabilities = posTagger.probs();
  int count = predictedTags.length;
  for (int idx = 0; idx < count; idx++) {
    System.out.println(tokens[idx] + " => " + predictedTags[idx] + " @ " + tagProbabilities[idx]);
  }
}
/**
 * Annotates each token of the sentence with its part of speech, using a POS
 * tagger model loaded for the sentence's language.
 *
 * @param sentence sentence annotation whose tokens receive POS tags
 */
@Override
public void annotate(Annotation sentence) {
  String[] tokenStrings =
      sentence.tokens().stream().map(Object::toString).toArray(String[]::new);
  POSTaggerME tagger = new POSTaggerME(loadPOSTagger(sentence.getLanguage()));
  String[] predicted = tagger.tag(tokenStrings);
  for (int idx = 0; idx < tokenStrings.length; idx++) {
    sentence.tokenAt(idx).put(Types.PART_OF_SPEECH, POS.fromString(predicted[idx]));
  }
}
@Override public Matrix parseContent( LinkedList docRow, int idCol, int targetCol, LinkedList<Integer> dateColsNum, LinkedList<Integer> otherColsNum) { Matrix returnMat = new Matrix(); String inputStr = docRow.get(targetCol - 1).toString().toLowerCase(); String[] inputStrArray = tokenizer.tokenize(inputStr); String[] tagArray = tagger.tag(inputStrArray); for (int i = 0; i < tagArray.length; i++) { String keywordRole = tagArray[i]; String orgKeyword = inputStrArray[i]; // System.out.println(stemmer.stem("agreed")); // check keyword_role is usable boolean roleFlag = AnalysisThread.roleFilter(keywordRole); if (roleFlag) { AnalysisThread.addToMatrix(keywordRole, orgKeyword, docRow, returnMat); } } return returnMat; }
public void run(String file, POSModel model) throws Exception { @SuppressWarnings("resource") BufferedReader br = new BufferedReader(new FileReader(new File(file))); String l = ""; POSTaggerME tagger = new POSTaggerME(model); int correct = 0; int wrong = 0; while ((l = br.readLine()) != null) { String strip = l.replaceAll("_[A-Z]+", ""); String[] tags1 = l.replaceAll("[^_\\s]+?_", "").split(" "); // System.out.println(strip); // System.out.println(tags); String[] tags2 = tagger.tag(strip.split(" ")); String[] strips = strip.split(" "); if (tags2.length != tags1.length) { // something went wrong throw new Exception("Tag lists unequal size"); } for (int i = 0; i < tags2.length; i++) { if (tags1[i].equals(tags2[i])) { correct++; } else { System.err.println(strip); System.err.print(strips[i] + " "); System.err.println(tags1[i] + " tagged as " + tags2[i]); wrong++; } } } br.close(); System.out.println("Total: " + (correct + wrong)); System.out.println("Correct: " + correct); System.out.println("Wrong: " + wrong); System.out.println("Precision: " + (((double) correct) / (correct + wrong))); }
public static String getPredicates(String s) { InputStream modelPOSTagger = null; InputStream modelTokenizer = null; POSModel modelPOS = null; TokenizerModel modelToken = null; StringBuffer sent = new StringBuffer(); // loading the POSTagger Model try { modelPOSTagger = new FileInputStream( "/home/opnchaudhary/androidapps/openNlpWeb/WebContent/mod/en-pos-maxent.bin"); modelPOS = new POSModel(modelPOSTagger); } catch (Exception e) { e.printStackTrace(); } finally { if (modelPOSTagger != null) { try { modelPOSTagger.close(); } catch (Exception e1) { e1.printStackTrace(); } } } // loading the Tokenizer Model try { modelTokenizer = new FileInputStream( "/home/opnchaudhary/androidapps/openNlpWeb/WebContent/mod/en-token.bin"); modelToken = new TokenizerModel(modelTokenizer); } catch (Exception e) { e.printStackTrace(); } finally { if (modelTokenizer != null) { try { modelTokenizer.close(); } catch (Exception e1) { e1.printStackTrace(); } } } // POS Tagging // Tokenization Tokenizer tokenizer = new TokenizerME(modelToken); String[] tokens = tokenizer.tokenize(s); POSTaggerME posTagger = new POSTaggerME(modelPOS); /* * This works to some extent. 
The logic behind this is to take out verb * out of the sentence as attribute of the subject of the sentence to * form a predicate with object of the sentece as the second parameter * in the predicate */ String consts = null; String attr = null; String consts1 = null; int count = 0; for (int tempVar = 0; tempVar < tokens.length; tempVar++) { String temp = posTagger.tag(tokens[tempVar]); String[] temps = temp.split(" "); for (int i = 0; i < temps.length; i++) { String[] t = temps[i].split("/"); if (t[1].equals("NN") || t[1].equals("NNS") || t[1].equals("NNP") || t[1].equals("NNPS") || t[1].equals("FW")) { count++; if (count == 1) { consts = t[0]; break; } } } for (int i = 0; i < temps.length; i++) { String[] t = temps[i].split("/"); if (t[1].equals("VB") || t[1].equals("VBD") || t[1].equals("VBN") || t[1].equals("VBZ")) { attr = t[0]; } } for (int i = 0; i < temps.length; i++) { String[] t = temps[i].split("/"); if (t[1].equals("NN") || t[1].equals("NNS") || t[1].equals("NNP") || t[1].equals("NNPS") || t[1].equals("FW") || t[1].equals("JJ")) { consts1 = t[0]; } } } sent.append(attr + "(" + consts + "," + consts1 + ")"); return sent.toString(); }
/**
 * CLI entry point: trains a POS tagger model from the configured sample
 * stream and writes it to the output file named in {@code params}.
 *
 * <p>Relies on fields set up elsewhere in this tool class: {@code params},
 * {@code mlParams}, and {@code sampleStream} — TODO confirm their
 * initialization order against the superclass.
 *
 * @param format the training data format name
 * @param args remaining command line arguments
 */
public void run(String format, String[] args) {
  super.run(format, args);

  // Load ML training parameters from the params file, if one was given;
  // fall back to defaults with the requested algorithm otherwise.
  mlParams = CmdLineUtil.loadTrainingParameters(params.getParams(), true);
  if (mlParams != null && !TrainerFactory.isValid(mlParams.getSettings())) {
    throw new TerminateToolException(
        1, "Training parameters file '" + params.getParams() + "' is invalid!");
  }
  if (mlParams == null) {
    mlParams = ModelUtil.createDefaultTrainingParameters();
    mlParams.put(TrainingParameters.ALGORITHM_PARAM, getModelType(params.getType()).toString());
  }

  File modelOutFile = params.getModel();
  CmdLineUtil.checkOutputFile("pos tagger model", modelOutFile);

  // Optionally build an ngram dictionary over the samples; the stream is
  // reset afterwards so training sees the data from the beginning.
  Dictionary ngramDict = null;
  Integer ngramCutoff = params.getNgram();
  if (ngramCutoff != null) {
    System.err.print("Building ngram dictionary ... ");
    try {
      ngramDict = POSTaggerME.buildNGramDictionary(sampleStream, ngramCutoff);
      sampleStream.reset();
    } catch (IOException e) {
      throw new TerminateToolException(
          -1, "IO error while building NGram Dictionary: " + e.getMessage(), e);
    }
    System.err.println("done");
  }

  POSTaggerFactory postaggerFactory = null;
  try {
    postaggerFactory = POSTaggerFactory.create(params.getFactory(), ngramDict, null);
  } catch (InvalidFormatException e) {
    throw new TerminateToolException(-1, e.getMessage(), e);
  }

  // Optional explicit tag dictionary loaded from a file.
  if (params.getDict() != null) {
    try {
      postaggerFactory.setTagDictionary(postaggerFactory.createTagDictionary(params.getDict()));
    } catch (IOException e) {
      throw new TerminateToolException(
          -1, "IO error while loading POS Dictionary: " + e.getMessage(), e);
    }
  }

  // Optionally populate (creating if absent) a mutable tag dictionary from
  // the sample stream, then reset the stream again for training.
  if (params.getTagDictCutoff() != null) {
    try {
      TagDictionary dict = postaggerFactory.getTagDictionary();
      if (dict == null) {
        dict = postaggerFactory.createEmptyTagDictionary();
        postaggerFactory.setTagDictionary(dict);
      }
      if (dict instanceof MutableTagDictionary) {
        POSTaggerME.populatePOSDictionary(
            sampleStream, (MutableTagDictionary) dict, params.getTagDictCutoff());
      } else {
        throw new IllegalArgumentException(
            "Can't extend a POSDictionary that does not implement MutableTagDictionary.");
      }
      sampleStream.reset();
    } catch (IOException e) {
      throw new TerminateToolException(
          -1, "IO error while creating/extending POS Dictionary: " + e.getMessage(), e);
    }
  }

  // Train the model; the sample stream is always closed, even on failure.
  POSModel model;
  try {
    model = opennlp.tools.postag.POSTaggerME.train(
        params.getLang(), sampleStream, mlParams, postaggerFactory);
  } catch (IOException e) {
    throw new TerminateToolException(
        -1, "IO error while reading training data or indexing data: " + e.getMessage(), e);
  } finally {
    try {
      sampleStream.close();
    } catch (IOException e) {
      // sorry that this can fail
    }
  }

  CmdLineUtil.writeModel("pos tagger", modelOutFile, model);
}