public static void main(String[] args) throws Exception {
  if (args.length != 2) {
    System.err.println("usage: java TaggerDemo2 modelFile fileToTag");
    return;
  }
  MaxentTagger tagger = new MaxentTagger(args[0]);
  TokenizerFactory<CoreLabel> ptbTokenizerFactory =
      PTBTokenizer.factory(new CoreLabelTokenFactory(), "untokenizable=noneKeep");
  BufferedReader r =
      new BufferedReader(new InputStreamReader(new FileInputStream(args[1]), "utf-8"));
  PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, "utf-8"));
  DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(r);
  documentPreprocessor.setTokenizerFactory(ptbTokenizerFactory);
  for (List<HasWord> sentence : documentPreprocessor) {
    List<TaggedWord> tSentence = tagger.tagSentence(sentence);
    pw.println(Sentence.listToString(tSentence, false));
  }

  // Print the adjectives in one more sentence. This shows how to get at words and tags in a
  // tagged sentence.
  List<HasWord> sent =
      Sentence.toWordList(
          "The", "slimy", "slug", "crawled", "over", "the", "long", ",", "green", "grass", ".");
  List<TaggedWord> taggedSent = tagger.tagSentence(sent);
  for (TaggedWord tw : taggedSent) {
    if (tw.tag().startsWith("JJ")) {
      pw.println(tw.word());
    }
  }
  pw.close();
}
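// A hypothetical invocation of the demo above (model and input paths are placeholders,
// not from the source):
//
//   java TaggerDemo2 english-left3words-distsim.tagger input.txt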
public TreeMap<String, Integer> getOccurrencesOfTagsAfterTurnLength(int min) throws IOException {
  TreeMap<String, Integer> occurrences = new TreeMap<String, Integer>();
  File[] files = inputDir.listFiles();
  int totalTurns = 0;
  for (File curFile : files) {
    if (!curFile.getName().endsWith(datafileExtension)) continue;
    System.out.print("Processing file: " + curFile + " ...");
    BufferedReader in = new BufferedReader(new FileReader(curFile));
    String line = in.readLine(); // discard the header line
    while ((line = in.readLine()) != null) {
      String[] values = line.split("\\|", -1);
      if (values[0].equalsIgnoreCase("server")
          || values[1].equalsIgnoreCase("server")
          || values[2].equalsIgnoreCase("server")
          || values[9].isEmpty()) continue;
      String curTurn = values[8];
      String spellingCorrected = fixSpelling(curTurn);
      if (spellingCorrected.trim().isEmpty()) {
        continue;
      }
      List<ArrayList<? extends HasWord>> sentences =
          MaxentTagger.tokenizeText(new StringReader(spellingCorrected));
      if (sentences.isEmpty()) {
        continue;
      }
      totalTurns++;
      float distance = 0.0f; // token position within the turn
      for (ArrayList<? extends HasWord> sent : sentences) {
        ArrayList<TaggedWord> taggedSentence = tagger.tagSentence(sent);
        boolean lastSentence = (sent == sentences.get(sentences.size() - 1));
        if (lastSentence) {
          taggedSentence.add(new TaggedWord("", "EOT")); // end-of-turn marker
        }
        for (int i = 0; i < taggedSentence.size(); i++) {
          TaggedWord cur = taggedSentence.get(i);
          distance++;
          if (distance >= min) {
            if (occurrences.containsKey(cur.tag())) {
              occurrences.put(cur.tag(), occurrences.get(cur.tag()) + 1);
            } else {
              occurrences.put(cur.tag(), 1);
            }
          }
        }
      }
    }
    in.close();
  }
  System.out.println("there were " + totalTurns + " turns in total.");
  return occurrences;
}
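// A hypothetical driver for the method above (the method name and the 5-token threshold
// are illustrative only, not from the source):
public void printTagCountsAfterFifthToken() throws IOException {
  // Count tags that occur at or after the fifth token of a turn.
  TreeMap<String, Integer> occ = getOccurrencesOfTagsAfterTurnLength(5);
  for (Map.Entry<String, Integer> e : occ.entrySet()) {
    System.out.println(e.getKey() + "\t" + e.getValue());
  }
}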
public static void generate(String model, String fileToTag, String outfile) throws Exception {
  MaxentTagger tagger = new MaxentTagger(model);
  PrintWriter pw =
      new PrintWriter(new OutputStreamWriter(new FileOutputStream(outfile), "utf-8"));
  BufferedReader br = new BufferedReader(new FileReader(fileToTag));
  String line;
  List<String> toks = new ArrayList<>();
  while ((line = br.readLine()) != null) {
    if (line.isEmpty()) {
      // A blank line ends the current sentence: tag it and write one "word tag" pair per line.
      List<HasWord> sent = Sentence.toWordList(toks.toArray(new String[0]));
      List<TaggedWord> taggedSent = tagger.tagSentence(sent);
      for (TaggedWord tw : taggedSent) {
        pw.println(tw.word() + " " + tw.tag());
      }
      pw.println();
      toks.clear();
    } else {
      toks.add(line);
    }
  }
  // Flush the final sentence if the file does not end with a blank line.
  if (!toks.isEmpty()) {
    List<HasWord> sent = Sentence.toWordList(toks.toArray(new String[0]));
    for (TaggedWord tw : tagger.tagSentence(sent)) {
      pw.println(tw.word() + " " + tw.tag());
    }
    pw.println();
  }
  br.close();
  pw.close();
}
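// A hypothetical invocation (the model path is a placeholder). The input file is expected to
// hold one token per line with sentences separated by blank lines; the output is written in
// CoNLL-style "word tag" pairs:
//
//   generate("english-left3words-distsim.tagger", "input.tok", "output.tag");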
protected List<IntTaggedWord> listToEvents(List<TaggedWord> taggedWords) {
  List<IntTaggedWord> itwList = new ArrayList<IntTaggedWord>();
  for (TaggedWord tw : taggedWords) {
    IntTaggedWord iTW = new IntTaggedWord(tw.word(), tw.tag(), wordIndex, tagIndex);
    itwList.add(iTW);
  }
  return itwList;
}
private static List<TaggedWord> cleanTags(List<TaggedWord> twList, TreebankLanguagePack tlp) {
  int sz = twList.size();
  List<TaggedWord> l = new ArrayList<TaggedWord>(sz);
  for (int i = 0; i < sz; i++) {
    TaggedWord tw = twList.get(i);
    TaggedWord tw2 = new TaggedWord(tw.word(), tlp.basicCategory(tw.tag()));
    l.add(tw2);
  }
  return l;
}
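// A minimal sketch of what cleanTags strips (hypothetical, same class as cleanTags; assumes a
// PennTreebankLanguagePack, whose basicCategory reduces an annotated tag such as "NP-SBJ" to "NP"):
private static void cleanTagsDemo() {
  List<TaggedWord> raw = Arrays.asList(new TaggedWord("Dogs", "NP-SBJ"));
  List<TaggedWord> clean = cleanTags(raw, new PennTreebankLanguagePack());
  System.out.println(clean.get(0).tag()); // prints "NP"
}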
public ArrayList<String> getNounsFromSentence(String sentence) {
  ArrayList<TaggedWord> tw = parseSentenceTD(sentence);
  ArrayList<String> nouns = new ArrayList<String>();
  for (TaggedWord t : tw) {
    if (t.tag().startsWith("N")) {
      nouns.add(t.value());
    }
  }
  return nouns;
}
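// A hypothetical call (parseSentenceTD is assumed to produce Penn Treebank tags, so the
// "N" prefix test catches NN, NNS, NNP, and NNPS):
//
//   getNounsFromSentence("The slug crawled over the grass.")  ->  [slug, grass]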
@Override
public void train(List<TaggedWord> sentence) {
  lex.train(sentence, 1.0);
  String last = null;
  for (TaggedWord tagLabel : sentence) {
    String tag = tagLabel.tag();
    tagIndex.add(tag);
    if (last == null) {
      initial.incrementCount(tag); // count sentence-initial tags
    } else {
      ruleCounter.incrementCount2D(last, tag); // count tag-bigram transitions
    }
    last = tag;
  }
}
public static TaggedWord verbToGerund(TaggedWord verb) {
  Morphology wordMorpher = new Morphology();
  String stem = wordMorpher.stem(verb.word());
  // Drop a single trailing vowel ("make" -> "mak") before appending "-ing";
  // "do" is exempted so it becomes "doing" rather than "ding".
  if (!stem.equals("do")) {
    stem = stem.replaceAll("[aeiou]?$", "");
  }
  return new TaggedWord(stem + "ing", "VBG");
}
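// Hypothetical calls illustrating the heuristic and its limits (not from the source):
//
//   verbToGerund(new TaggedWord("make", "VBP")) -> "making"  (trailing vowel dropped)
//   verbToGerund(new TaggedWord("do", "VBP"))   -> "doing"   (special case keeps the vowel)
//   verbToGerund(new TaggedWord("run", "VBP"))  -> "runing"  (no consonant doubling; a gap in the heuristic)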
/**
 * Turns a sentence into a flat phrasal tree. The structure is S -> tag*, and each tag in turn
 * dominates a word. The tag is either taken from the label or made "WD". The tag and phrasal
 * nodes carry a StringLabel.
 *
 * @param s The Sentence to make the Tree from
 * @param lf The LabelFactory with which to create the new Tree labels
 * @return The one phrasal level Tree
 */
public static Tree toFlatTree(Sentence<?> s, LabelFactory lf) {
  List<Tree> daughters = new ArrayList<Tree>(s.length());
  for (HasWord word : s) {
    Tree wordNode = new LabeledScoredTreeLeaf(lf.newLabel(word.word()));
    if (word instanceof TaggedWord) {
      TaggedWord taggedWord = (TaggedWord) word;
      wordNode =
          new LabeledScoredTreeNode(
              new StringLabel(taggedWord.tag()), Collections.singletonList(wordNode));
    } else {
      wordNode =
          new LabeledScoredTreeNode(lf.newLabel("WD"), Collections.singletonList(wordNode));
    }
    daughters.add(wordNode);
  }
  return new LabeledScoredTreeNode(new StringLabel("S"), daughters);
}
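// A minimal usage sketch (hypothetical; assumes the legacy generic Sentence class that this
// method's signature implies, which extends ArrayList, and a StringLabelFactory):
public static void toFlatTreeDemo() {
  Sentence<TaggedWord> s = new Sentence<TaggedWord>();
  s.add(new TaggedWord("Dogs", "NNS"));
  s.add(new TaggedWord("bark", "VBP"));
  Tree flat = toFlatTree(s, new StringLabelFactory());
  System.out.println(flat); // (S (NNS Dogs) (VBP bark))
}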
/** Trains this unknown-word model (UWM) on a single TaggedWord. */
public void train(TaggedWord tw, int loc, double weight) {
  IntTaggedWord iTW = new IntTaggedWord(tw.word(), tw.tag(), wordIndex, tagIndex);
  IntTaggedWord iT = new IntTaggedWord(nullWord, iTW.tag);
  IntTaggedWord iW = new IntTaggedWord(iTW.word, nullTag);
  seenCounter.incrementCount(iW, weight);
  IntTaggedWord i = NULL_ITW;
  if (treesRead > indexToStartUnkCounting) {
    // Start doing this once some way through the trees; treesRead is 1-based counting.
    if (seenCounter.getCount(iW) < 1.5) {
      // It's an entirely unknown word.
      int s = model.getSignatureIndex(iTW.word, loc, wordIndex.get(iTW.word));
      if (DOCUMENT_UNKNOWNS) {
        String wStr = wordIndex.get(iTW.word);
        String tStr = tagIndex.get(iTW.tag);
        String sStr = wordIndex.get(s);
        EncodingPrintWriter.err.println(
            "Unknown word/tag/sig:\t" + wStr + '\t' + tStr + '\t' + sStr, "UTF-8");
      }
      IntTaggedWord iTS = new IntTaggedWord(s, iTW.tag);
      IntTaggedWord iS = new IntTaggedWord(s, nullTag);
      unSeenCounter.incrementCount(iTS, weight);
      unSeenCounter.incrementCount(iT, weight);
      unSeenCounter.incrementCount(iS, weight);
      unSeenCounter.incrementCount(i, weight);
    }
    // else if seenCounter.getCount(iTW) < 2, it's a new tag for a known word:
    // do nothing for now.
  }
}
private void processOutgoingSequenceFIFOToInsertCandidate() {
  // Should only process if the last character added is enter or space, or after a specified timeout.
  StringOfDocChangeInserts sodci = chOut.getStringOfDocChangeInserts();
  int indexOfUnsentChanges = chOut.getFirstIndexForChanges();
  Vector v2 = c.getHistory().getParserWrapper().parseText(sodci.getString());
  Vector taggedWords = (Vector) v2.elementAt(0);
  int indexInSodciStringOfNextWordCandidate = 0;
  String sodciString = sodci.getString();

  Vector allInsertsSoFar = sodci.getSequence();
  Vector vToBeRemoved = new Vector();
  Vector vToBeAdded = new Vector();
  vToBeAdded.addElement(new DocInsert(0, Integer.toString(counter), null));
  vToBeAdded.addElement(new DocInsert(0, Integer.toString(counter), null));
  vToBeAdded.addElement(new DocInsert(0, Integer.toString(counter), null));
  vToBeAdded.addElement(new DocInsert(0, Integer.toString(counter), null));
  counter++;

  // Map each tagged word back onto the subsequence of document inserts that produced it.
  Vector allInsWords = new Vector();
  for (int i = 0; i < taggedWords.size(); i++) {
    TaggedWord tw = (TaggedWord) taggedWords.elementAt(i);
    Vector insWord = new Vector();
    int beginIndex = sodciString.indexOf(tw.word(), indexInSodciStringOfNextWordCandidate);
    if (beginIndex >= indexInSodciStringOfNextWordCandidate) {
      // The word was found in the string.
      int finishIndex = beginIndex + tw.word().length();
      System.out.println(i + ": found index at: " + beginIndex + ": " + finishIndex + ": ");
      Vector v3 = sodci.getSubSequence(beginIndex, finishIndex);
      // Check that the recovered subsequence actually spells the word.
      if (StringOfDocChangeInserts.getSubSequenceString(v3).equalsIgnoreCase(tw.word())) {
        insWord = v3;
      }
    }
    allInsWords.addElement(insWord);
  }

  // Look up possible WordNet substitutions (hypernyms) for each word.
  Vector allPossibleSubstitutions = new Vector();
  for (int i = 0; i < taggedWords.size(); i++) {
    TaggedWord tw = (TaggedWord) taggedWords.elementAt(i);
    Vector vInsWords = (Vector) allInsWords.elementAt(i);
    // To save processing time: check that the word's insert sequence exists and is long enough.
    boolean conductWordNetLookup = true;
    if (vInsWords == null) {
      conductWordNetLookup = false;
    } else if (vInsWords.size() < 3) {
      // Two-letter words cause all kinds of problems with WordNet.
      conductWordNetLookup = false;
    }
    // else if (!chOut.checkSequenceIsContinuousAndNotAlreadySentOrAlreadyChanged(vInsWords))
    //   conductWordNetLookup = false;
    if (conductWordNetLookup) {
      Vector v =
          c.getWordNetWrapper().getReplacementWord(tw.tag(), tw.word(), PointerType.HYPERNYM);
      allPossibleSubstitutions.addElement(v);
    } else {
      allPossibleSubstitutions.addElement(new Vector());
    }
  }

  // Debug report of each word and its candidate substitutions (prints disabled).
  for (int i = 0; i < taggedWords.size(); i++) {
    TaggedWord tw = (TaggedWord) taggedWords.elementAt(i);
    Vector v = (Vector) allInsWords.elementAt(i);
    Vector v3 = (Vector) allPossibleSubstitutions.elementAt(i);
    // js.print(i + ": " + tw.word() + "---");
    // if (v.size() == 0) js.print("Couldn't find word: " + v3.size());
    // else js.print(StringOfDocChangeInserts.getSubSequenceString(v) + ": ");
    // js.print("WORDNET: ");
    for (int j = 0; j < v3.size(); j++) {
      String s4 = (String) v3.elementAt(j);
      // js.print(s4 + "||||");
    }
    // js.println("");
  }

  /*
  // Draft (commented out in the original): replace each word with its first candidate
  // substitution in the outgoing sequence.
  for (int i = 0; i < taggedWords.size(); i++) {
    TaggedWord tw = (TaggedWord) taggedWords.elementAt(i);
    Vector v = (Vector) allInsWords.elementAt(i);
    Vector v3 = (Vector) allPossibleSubstitutions.elementAt(i);
    if (v3.size() != 0) {
      String textToSubstitute = (String) v3.elementAt(0);
      Vector replacementIns =
          StringOfDocChangeInserts.getInsEquivalentOfString(textToSubstitute + " ");
      chOut.i3_replaceSequenceWithSequenceChangingTimestampOfEnsuingSequenceUsingOldTurnAsBasisFortypingTime(
          v, replacementIns);
    }
  }
  if (taggedWords.size() > 10) System.exit(-1);
  */

  // Filter out the possible substitutions that have already occurred and can't be replaced.
  // The index is already given but not used: indexOfUnsentChanges
  // chOut.i3_insertChangesAt(vToBeAdded, indexOfUnsentChanges);
}
/**
 * Parse sentences and generate a .trees file.
 *
 * @param en file of English sentences, one per line
 * @param align word-alignment file, or a name starting with "no_align" to skip alignments
 * @param out output file for the trees
 * @param verbose if true, also print each tree to stderr
 */
public static void parse(String en, String align, String out, boolean verbose) {
  // Use alignments?
  boolean use_alignments = true;
  if (align.startsWith("no_align")) {
    use_alignments = false;
    System.err.println("Not using alignments.");
  } else {
    System.err.println("Using alignments from " + align);
  }

  // Set up the Stanford parser.
  String grammar = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
  String[] options = {"-outputFormat", "wordsAndTags, typedDependencies"};
  LexicalizedParser lp = LexicalizedParser.loadModel(grammar, options);
  TreebankLanguagePack tlp = lp.getOp().langpack();
  java.util.function.Predicate<java.lang.String> punctuationFilter = x -> true;
  GrammaticalStructureFactory gsf =
      new edu.stanford.nlp.trees.EnglishGrammaticalStructureFactory(punctuationFilter);

  // Read and tokenize the document.
  Iterable<List<? extends HasWord>> sentences;
  Reader r = new Reader(en);
  String line = null;
  List<List<? extends HasWord>> tmp = new ArrayList<List<? extends HasWord>>();
  while ((line = r.getNext()) != null) {
    Tokenizer<? extends HasWord> token =
        tlp.getTokenizerFactory().getTokenizer(new StringReader(line));
    List<? extends HasWord> sentence = token.tokenize();
    tmp.add(sentence);
  }
  sentences = tmp;

  // Set up the alignment file reader.
  Reader alignment = new Reader();
  if (use_alignments) {
    alignment = new Reader(align);
  }

  // Set up the tree file writer.
  Writer treeWriter = new Writer(out);

  // Parse.
  long start = System.currentTimeMillis();
  for (List<? extends HasWord> sentence : sentences) {
    Tree t = new Tree();
    System.err.println("parse Sentence :" + sentence + "...");
    System.err.println("-----------------------------------------------------------------------");
    edu.stanford.nlp.trees.Tree parse = lp.parse(sentence);

    // Lists for the root node and lexical nodes.
    List<Node> loneNodes = new LinkedList<Node>();
    List<Node> governingNodes = new LinkedList<Node>();

    // ROOT node.
    Node root = new Node(true, true);
    root.setTag("ROOT");
    t.setRoot(root);
    loneNodes.add(root);
    governingNodes.add(root);

    // Tagging: one lexical node and one governing node per word.
    int counter = 0;
    String surface = "";
    String tag = "";
    for (TaggedWord tw : parse.taggedYield()) {
      Node n = new Node();
      Node governingNode = new Node();
      n.setNodeID(++counter);
      surface = tw.value();
      tag = tw.tag();
      if (surface.startsWith("-LRB-")) {
        surface = "(";
      } else if (surface.startsWith("-RRB-")) {
        surface = ")";
        // Other PTB bracket codes (-LSB-, -RSB-, -LCB-, -RCB-) could be handled the same way.
      } else if (surface.startsWith("''")) {
        surface = "\"";
      }
      // Escape characters that would clash with the tree file format.
      tag = tag.replaceAll("#", "-NUM-");
      surface = surface.replaceAll("&", "-AMP-");
      surface = surface.replaceAll("#", "-NUM-");
      surface = surface.replaceAll(">", "-GRE-");
      surface = surface.replaceAll("=", "-EQU-");
      n.setInitialLexicalIndex(counter);
      governingNode.setInitialLexicalIndex(counter);
      n.setSurface(surface);
      n.setTag(tag);
      governingNode.setTag("_" + tag);
      governingNode.setLabel("_gov");
      loneNodes.add(n);
      governingNodes.add(governingNode);
      governingNode.setChild(n);
    }

    // Labeling via typed dependencies.
    int depIndex;
    int govIndex;
    GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
    List<TypedDependency> tdl = gs.typedDependencies(false);
    for (TypedDependency td : tdl) {
      depIndex = td.dep().index();
      govIndex = td.gov().index();
      Node dep = loneNodes.get(depIndex);
      Node gov = governingNodes.get(govIndex);
      Node depcopy = governingNodes.get(depIndex);
      Node govcopy = loneNodes.get(govIndex);
      dep.setLabel(td.reln().toString());
      depcopy.setLabel(td.reln().toString());
      govcopy.setLabel("head");
      gov.setChild(governingNodes.get(depIndex));
      governingNodes.get(depIndex).setParent(gov);
      dep.setParent(governingNodes.get(depIndex));
    }

    // Collapse the tree to remove unneeded governing nodes.
    Node gov;
    Node dep;
    Node parent;
    List<Node> children;
    for (int i = 1; i < governingNodes.size(); i++) { // start at index 1 to skip the root
      gov = governingNodes.get(i);
      dep = loneNodes.get(i);
      if (gov.getChildren().size() <= 1) {
        int k = 0;
        parent = gov.getParent();
        children = parent.getChildren();
        for (Node n : children) {
          if (n == gov) {
            gov.getParent().replaceChild(k, dep);
            dep.setParent(gov.getParent());
          }
          k++;
        }
      }
    }

    // Mark head nodes with the appropriate label.
    int k = 0;
    for (Node n : loneNodes) {
      if (k != 0) {
        if (n.getLabel().equals(n.getParent().getLabel())) { // compare labels by value, not reference
          n.setLabel("head");
        }
      } else {
        n.setLabel("null");
      }
      k++;
    }

    // Sort lexical children of each governing node into lexical order.
    for (Node n : governingNodes) {
      n.sortChildrenByInitialIndex();
    }

    // Combine with the alignment.
    if (use_alignments) {
      t.initialize(alignment.readNextAlign());
    } else {
      t.initializeUnaligned();
    }

    // Write the tree to file and echo it to the console.
    treeWriter.write(t);
    System.out.println(t.toSentence());
    if (verbose) {
      System.err.println(t.toString());
    }
    System.err.println("#######################################################################");
  }
  long stop = System.currentTimeMillis();
  System.err.println("...done! [" + (stop - start) / 1000 + " sec].");
  treeWriter.close();
}
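// A hypothetical invocation (file names are placeholders; Reader, Writer, Node and Tree here
// are the project's own classes, not Stanford's):
//
//   parse("corpus.en", "no_align", "corpus.trees", true);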
public void generatePOSLexDensityMatrices(int minDistance, int maxDistance) throws IOException {
  POSDensityMatrix = new HashMap<OrderedPair, Integer>();
  POSLexDensityMatrix = new HashMap<OrderedPair, Integer>();
  File[] files = inputDir.listFiles();
  for (File curFile : files) {
    if (!curFile.getName().endsWith(datafileExtension)) continue;
    System.out.print("Processing file: " + curFile + " ...");
    BufferedReader in = new BufferedReader(new FileReader(curFile));
    String line = in.readLine(); // discard the header line
    TURNS:
    while ((line = in.readLine()) != null) {
      String[] values = line.split("\\|", -1);
      if (values[0].equalsIgnoreCase("server")
          || values[1].equalsIgnoreCase("server")
          || values[2].equalsIgnoreCase("server")
          || values[9].isEmpty()) continue;
      String curTurn = values[8];
      String spellingCorrected = fixSpelling(curTurn);
      if (spellingCorrected.trim().isEmpty()) {
        continue;
      }
      int distance = 0;
      List<ArrayList<? extends HasWord>> sentences =
          MaxentTagger.tokenizeText(new StringReader(spellingCorrected));
      if (sentences.isEmpty()) {
        continue;
      }
      for (ArrayList<? extends HasWord> sent : sentences) {
        ArrayList<TaggedWord> taggedSentence = tagger.tagSentence(sent);
        boolean lastSentence = (sent == sentences.get(sentences.size() - 1));
        if (lastSentence) {
          taggedSentence.add(new TaggedWord("", "EOT")); // end-of-turn marker
        }
        if (taggedSentence.size() < 2) continue;
        TaggedWord prev = taggedSentence.get(0);
        for (int i = 1; i < taggedSentence.size(); i++) {
          TaggedWord cur = taggedSentence.get(i);
          distance++;
          if (maxDistance > 0 && distance > maxDistance) continue TURNS;
          if (distance < minDistance) {
            prev = cur;
            continue;
          }
          if (filter(cur.word())) continue;
          OrderedPair keyPOS = new OrderedPair(prev.tag(), cur.tag());
          OrderedPair keyLex =
              new OrderedPair(
                  prev.tag(),
                  (misspellings.containsKey(cur.word())
                      ? misspellings.get(cur.word())
                      : cur.word()));
          if (POSDensityMatrix.containsKey(keyPOS)) {
            POSDensityMatrix.put(keyPOS, POSDensityMatrix.get(keyPOS) + 1);
          } else {
            POSDensityMatrix.put(keyPOS, 1);
          }
          // POSLex doesn't make sense at the end of a turn.
          if (lastSentence && i == taggedSentence.size() - 1) break;
          if (POSLexDensityMatrix.containsKey(keyLex)) {
            POSLexDensityMatrix.put(keyLex, POSLexDensityMatrix.get(keyLex) + 1);
          } else {
            POSLexDensityMatrix.put(keyLex, 1);
          }
          prev = cur;
        }
      }
    }
    in.close();
    System.out.println("done.");
  }
}
public TreeMap<String, Float> averageDistancesFromTurnBeginning() throws IOException {
  TreeMap<String, Float> sumDistances = new TreeMap<String, Float>();
  TreeMap<String, Float> counts = new TreeMap<String, Float>();
  File[] files = inputDir.listFiles();
  for (File curFile : files) {
    if (!curFile.getName().endsWith(datafileExtension)) continue;
    System.out.print("Processing file: " + curFile + " ...");
    BufferedReader in = new BufferedReader(new FileReader(curFile));
    String line = in.readLine(); // discard the header line
    while ((line = in.readLine()) != null) {
      String[] values = line.split("\\|", -1);
      if (values[0].equalsIgnoreCase("server")
          || values[1].equalsIgnoreCase("server")
          || values[2].equalsIgnoreCase("server")
          || values[9].isEmpty()) continue;
      String curTurn = values[8];
      String spellingCorrected = fixSpelling(curTurn);
      float distance = 0.0f; // token position within the turn
      if (spellingCorrected.trim().isEmpty()) {
        continue;
      }
      List<ArrayList<? extends HasWord>> sentences =
          MaxentTagger.tokenizeText(new StringReader(spellingCorrected));
      if (sentences.isEmpty()) {
        continue;
      }
      for (ArrayList<? extends HasWord> sent : sentences) {
        ArrayList<TaggedWord> taggedSentence = tagger.tagSentence(sent);
        boolean lastSentence = (sent == sentences.get(sentences.size() - 1));
        if (lastSentence) {
          taggedSentence.add(new TaggedWord("", "EOT")); // end-of-turn marker
        }
        for (int i = 0; i < taggedSentence.size(); i++) {
          TaggedWord cur = taggedSentence.get(i);
          distance++;
          if (sumDistances.containsKey(cur.tag())) {
            sumDistances.put(cur.tag(), sumDistances.get(cur.tag()) + distance);
            counts.put(cur.tag(), counts.get(cur.tag()) + 1);
          } else {
            sumDistances.put(cur.tag(), distance);
            counts.put(cur.tag(), 1.0f);
          }
        }
      }
    }
    in.close();
  }
  TreeMap<String, Float> averages = new TreeMap<String, Float>();
  for (String tag : sumDistances.keySet()) {
    averages.put(tag, sumDistances.get(tag) / counts.get(tag));
  }
  return averages;
}
public static TaggedWord verbToBaseTense(TaggedWord verb) {
  Morphology wordMorpher = new Morphology();
  return new TaggedWord(wordMorpher.stem(verb.word()), "VB");
}
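// A hypothetical counterpart call to the gerund helper above (not from the source; Morphology's
// stem is expected to reduce an inflected form such as "running" to its lemma):
//
//   verbToBaseTense(new TaggedWord("running", "VBG"))  ->  "run"/VB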