public static void main(String[] args) throws Exception { if (args.length != 2) { System.err.println("usage: java TaggerDemo2 modelFile fileToTag"); return; } MaxentTagger tagger = new MaxentTagger(args[0]); TokenizerFactory<CoreLabel> ptbTokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "untokenizable=noneKeep"); BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(args[1]), "utf-8")); PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, "utf-8")); DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(r); documentPreprocessor.setTokenizerFactory(ptbTokenizerFactory); for (List<HasWord> sentence : documentPreprocessor) { List<TaggedWord> tSentence = tagger.tagSentence(sentence); pw.println(Sentence.listToString(tSentence, false)); } // print the adjectives in one more sentence. This shows how to get at words and tags in a // tagged sentence. List<HasWord> sent = Sentence.toWordList( "The", "slimy", "slug", "crawled", "over", "the", "long", ",", "green", "grass", "."); List<TaggedWord> taggedSent = tagger.tagSentence(sent); for (TaggedWord tw : taggedSent) { if (tw.tag().startsWith("JJ")) { pw.println(tw.word()); } } pw.close(); }
/**
 * Tags a one-token-per-line file and writes one {@code "word tag"} pair per line.
 *
 * <p>The input is read line by line. Non-empty lines are collected as the tokens of the
 * current sentence; an empty line ends the sentence, which is then tagged and written,
 * followed by an empty line. Tokens remaining at end of input (a file that does not end
 * with an empty line) are tagged and written as a final sentence instead of being dropped.
 *
 * <p>The output is encoded as UTF-8. NOTE(review): the input is read with
 * {@link FileReader}, i.e. in the platform default charset, which may not match the output
 * encoding; confirm the intended input encoding.
 *
 * @param model path of the tagger model passed to {@code MaxentTagger}
 * @param fileToTag path of the input file, one token per line
 * @param outfile path of the file to write
 * @throws Exception if the model cannot be loaded or a file cannot be read or written
 */
public static void generate(String model, String fileToTag, String outfile) throws Exception {
  MaxentTagger tagger = new MaxentTagger(model);
  // try-with-resources closes both streams even when reading or tagging throws.
  try (PrintWriter pw =
          new PrintWriter(new OutputStreamWriter(new FileOutputStream(outfile), "utf-8"));
      BufferedReader br = new BufferedReader(new FileReader(fileToTag))) {
    List<String> toks = new ArrayList<>();
    String line;
    while ((line = br.readLine()) != null) {
      if (line.length() == 0) {
        tagAndWriteSentence(tagger, toks, pw);
        toks.clear();
      } else {
        toks.add(line);
      }
    }
    // Flush the last sentence when the file has no terminating empty line.
    if (!toks.isEmpty()) {
      tagAndWriteSentence(tagger, toks, pw);
    }
  }
}

/** Tags the given tokens as one sentence and writes its "word tag" lines plus an empty line. */
private static void tagAndWriteSentence(MaxentTagger tagger, List<String> toks, PrintWriter pw) {
  List<HasWord> sent = Sentence.toWordList(toks.toArray(new String[toks.size()]));
  List<TaggedWord> taggedSent = tagger.tagSentence(sent);
  for (TaggedWord tw : taggedSent) {
    pw.println(tw.word() + " " + tw.tag());
  }
  pw.println();
}
/**
 * Builds a {@code VBG}-tagged word from a verb.
 *
 * <p>The verb's word is passed through {@code Morphology.stem}. Unless the result is exactly
 * {@code "do"}, one trailing vowel (a, e, i, o, u) is removed if present. The suffix
 * {@code "ing"} is then appended.
 *
 * @param verb the word to convert; only its word text is used, its tag is ignored
 * @return a new {@code TaggedWord} with the derived form and the tag {@code "VBG"}
 */
public static TaggedWord verbToGerund(TaggedWord verb) {
  String base = new Morphology().stem(verb.word());
  // "do" keeps its final vowel ("doing"); every other stem loses one trailing vowel.
  String root = base.equals("do") ? base : base.replaceAll("[aeiou]?$", "");
  return new TaggedWord(root + "ing", "VBG");
}
/**
 * Converts tagged words into {@code IntTaggedWord}s built against this object's
 * {@code wordIndex} and {@code tagIndex}.
 *
 * @param taggedWords the words to convert, in order
 * @return a new list holding one {@code IntTaggedWord} per input element, in the same order
 */
protected List<IntTaggedWord> listToEvents(List<TaggedWord> taggedWords) {
  List<IntTaggedWord> events = new ArrayList<IntTaggedWord>();
  for (TaggedWord tagged : taggedWords) {
    events.add(new IntTaggedWord(tagged.word(), tagged.tag(), wordIndex, tagIndex));
  }
  return events;
}
/**
 * Returns a copy of the list in which every tag has been replaced by
 * {@code tlp.basicCategory(tag)}. The input list and its elements are not modified.
 *
 * @param twList the tagged words to copy
 * @param tlp the language pack used to map each tag to its basic category
 * @return a new list of new {@code TaggedWord}s, in the same order as the input
 */
private static List<TaggedWord> cleanTags(List<TaggedWord> twList, TreebankLanguagePack tlp) {
  List<TaggedWord> cleaned = new ArrayList<TaggedWord>(twList.size());
  for (TaggedWord original : twList) {
    String basicTag = tlp.basicCategory(original.tag());
    cleaned.add(new TaggedWord(original.word(), basicTag));
  }
  return cleaned;
}
/** Trains this UWM on the Collection of trees. */ public void train(TaggedWord tw, int loc, double weight) { IntTaggedWord iTW = new IntTaggedWord(tw.word(), tw.tag(), wordIndex, tagIndex); IntTaggedWord iT = new IntTaggedWord(nullWord, iTW.tag); IntTaggedWord iW = new IntTaggedWord(iTW.word, nullTag); seenCounter.incrementCount(iW, weight); IntTaggedWord i = NULL_ITW; if (treesRead > indexToStartUnkCounting) { // start doing this once some way through trees; // treesRead is 1 based counting if (seenCounter.getCount(iW) < 1.5) { // it's an entirely unknown word int s = model.getSignatureIndex(iTW.word, loc, wordIndex.get(iTW.word)); if (DOCUMENT_UNKNOWNS) { String wStr = wordIndex.get(iTW.word); String tStr = tagIndex.get(iTW.tag); String sStr = wordIndex.get(s); EncodingPrintWriter.err.println( "Unknown word/tag/sig:\t" + wStr + '\t' + tStr + '\t' + sStr, "UTF-8"); } IntTaggedWord iTS = new IntTaggedWord(s, iTW.tag); IntTaggedWord iS = new IntTaggedWord(s, nullTag); unSeenCounter.incrementCount(iTS, weight); unSeenCounter.incrementCount(iT, weight); unSeenCounter.incrementCount(iS, weight); unSeenCounter.incrementCount(i, weight); // rules.add(iTS); // sigs.add(iS); } // else { // if (seenCounter.getCount(iTW) < 2) { // it's a new tag for a known word // do nothing for now // } // } } }
/**
 * Parses the pending outgoing insert string, locates each tagged word inside it, and looks up
 * WordNet hypernym replacement candidates for the words that were found.
 *
 * <p>As written, the method only computes the candidates: the code that would substitute them
 * into the outgoing sequence is commented out at the end of the body. The visible effects are
 * the increment of {@code counter}, the debug line printed to standard output for every word
 * found, and whatever the called project methods do internally.
 *
 * <p>NOTE(review): {@code indexOfUnsentChanges}, {@code allInsertsSoFar}, {@code vToBeRemoved}
 * and {@code vToBeAdded} are assigned but never read in this method. Their initialisers call
 * project methods whose side effects are not visible here, so confirm before removing them.
 */
private void processOutgoingSequenceFIFOToInsertCandidate() {
  // Should only process if last character added is enter or space or a specified timeout
  StringOfDocChangeInserts sodci = chOut.getStringOfDocChangeInserts();
  int indexOfUnsentChanges = chOut.getFirstIndexForChanges();
  // Element 0 of the parser result is cast to the vector of TaggedWord objects used below.
  Vector v2 = c.getHistory().getParserWrapper().parseText(sodci.getString());
  Vector taggedWords = (Vector) v2.elementAt(0);
  // NOTE(review): this start offset is never advanced, so every indexOf search below starts
  // at 0 and a repeated word always resolves to its first occurrence. Confirm whether it was
  // meant to move past each word that is found.
  int indexInSodciStringOfNextWordCandidate = 0;
  String sodciString = sodci.getString();
  // --------------------------
  Vector allInsertsSoFar = sodci.getSequence();
  Vector vToBeRemoved = new Vector();
  Vector vToBeAdded = new Vector();
  // Four DocInsert entries carrying the current counter value; the vector is not used afterwards.
  vToBeAdded.addElement(new DocInsert(0, Integer.toString(counter), null));
  vToBeAdded.addElement(new DocInsert(0, Integer.toString(counter), null));
  vToBeAdded.addElement(new DocInsert(0, Integer.toString(counter), null));
  vToBeAdded.addElement(new DocInsert(0, Integer.toString(counter), null));
  counter++;

  // Pass 1: for each tagged word, collect the sub-sequence of inserts that spells it.
  // allInsWords ends up parallel to taggedWords; an empty vector means "not found".
  Vector allInsWords = new Vector();
  for (int i = 0; i < taggedWords.size(); i++) {
    TaggedWord tw = (TaggedWord) taggedWords.elementAt(i);
    Vector insWord = new Vector();
    int beginIndex = sodciString.indexOf(tw.word(), indexInSodciStringOfNextWordCandidate);
    if (beginIndex >= indexInSodciStringOfNextWordCandidate) { // If it is found in the string
      int finishIndex = beginIndex + tw.word().length();
      System.out.println(i + ": found index at: " + beginIndex + ": " + finishIndex + ": ");
      Vector v3 = sodci.getSubSequence(beginIndex, finishIndex);
      // NOTE(review): this tests insWord, which is still the empty vector created above, not
      // v3, and the condition is negated. It looks like the intent was to accept v3 only when
      // its text equals the word - confirm.
      if (!StringOfDocChangeInserts.getSubSequenceString(insWord).equalsIgnoreCase(tw.word())) {
        insWord = v3; // To check that the words are actually equal
      }
    }
    allInsWords.addElement(insWord);
  }

  // Pass 2: build allPossibleSubstitutions, parallel to taggedWords, with the WordNet
  // replacement candidates for each word (an empty vector when the lookup is skipped).
  Vector allPossibleSubstitutions = new Vector();
  for (int i = 0; i < taggedWords.size(); i++) {
    TaggedWord tw = (TaggedWord) taggedWords.elementAt(i);
    Vector vInsWords = (Vector) allInsWords.elementAt(i);
    boolean conductWordNetLookup = true;
    // To save processing time: Check if word in vector exists, check that word has not
    // already been printed
    if (vInsWords == null) {
      conductWordNetLookup = false;
    } else if (vInsWords.size() < 3) { // Two letter words cause all kinds of problems with wordnet
      // Also covers words that were not found in pass 1 (empty vector).
      conductWordNetLookup = false;
    }
    // else if(!chOut.checkSequenceIsContinuousAndNotAlreadySentOrAlreadyChanged(vInsWords))
    // conductWordNetLookup = false;
    //
    if (conductWordNetLookup) {
      Vector v = c.getWordNetWrapper().getReplacementWord(tw.tag(), tw.word(), PointerType.HYPERNYM);
      allPossibleSubstitutions.addElement(v);
    } else {
      allPossibleSubstitutions.addElement(new Vector());
    }
  }

  // Pass 3: leftover debug dump. All printing is commented out, so this loop only reads the
  // vectors and casts each candidate to String.
  for (int i = 0; i < taggedWords.size(); i++) {
    TaggedWord tw = (TaggedWord) taggedWords.elementAt(i);
    Vector v = (Vector) allInsWords.elementAt(i);
    Vector v3 = ((Vector) allPossibleSubstitutions.elementAt(i));
    // js.print(i+": "+tw.word()+"---");
    if (v.size() == 0) {
      // js.print("Couldn't find word: "+v3.size());
    } else {
      // js.print(StringOfDocChangeInserts.getSubSequenceString(v)+": ");
    }
    // js.print("WORDNET: ");
    for (int j = 0; j < v3.size(); j++) {
      String s4 = (String) v3.elementAt(j);
      // js.print(s4+"||||");
    }
    // js.println("");
  }

  // Disabled code follows: an earlier copy of passes 2 and 3 with printing enabled, then the
  // loop that would replace each found word with its first candidate.
  /*
  Vector allPossibleSubstitutions = new Vector();
  for(int i=0;i<taggedWords.size();i++){
    TaggedWord tw = (TaggedWord)taggedWords.elementAt(i);
    Vector vInsWords = (Vector)allInsWords.elementAt(i);
    boolean conductWordNetLookup = true;
    // To save processing time: Check if word in vector exists, check that word has not already been printed
    if(vInsWords==null){
      conductWordNetLookup = false;
    }
    else if (vInsWords.size()<3){ //Two letter words cause all kinds of problems with wordnet
      conductWordNetLookup = false;
    }
    //else if(!chOut.checkSequenceIsContinuousAndNotAlreadySentOrAlreadyChanged(vInsWords)) conductWordNetLookup = false;
    //
    if(conductWordNetLookup){
      Vector v = c.getWordNetWrapper().getReplacementWord(tw.tag(),tw.word(),PointerType.HYPERNYM);
      allPossibleSubstitutions.addElement(v);
    }
    else{
      allPossibleSubstitutions.addElement(new Vector());
    }
  }
  for(int i=0;i<taggedWords.size();i++){
    TaggedWord tw = (TaggedWord)taggedWords.elementAt(i);
    Vector v = (Vector)allInsWords.elementAt(i);
    Vector v3 = ((Vector)allPossibleSubstitutions.elementAt(i));
    js.print(i+": "+tw.word()+"---");
    if(v.size()==0){
      js.print("Couldn't find word: "+v3.size());
    }
    else{
      js.print(StringOfDocChangeInserts.getSubSequenceString(v)+": ");
    }
    js.print("WORDNET: ");
    for(int j=0;j<v3.size();j++){
      String s4 = (String)v3.elementAt(j);
      js.print(s4+"||||");
    }
    js.println("");
  }
  /*
  for(int i=0;i<taggedWords.size();i++){
    TaggedWord tw = (TaggedWord)taggedWords.elementAt(i);
    Vector v = (Vector)allInsWords.elementAt(i);
    Vector v3 = ((Vector)allPossibleSubstitutions.elementAt(i));
    if(v3.size()!=0){
      String textToSubstitute = (String)v3.elementAt(0);
      Vector replacementIns = StringOfDocChangeInserts.getInsEquivalentOfString(textToSubstitute+" ");
      chOut.i3_replaceSequenceWithSequenceChangingTimestampOfEnsuingSequenceUsingOldTurnAsBasisFortypingTime(v,replacementIns);
    }
  }
  if(taggedWords.size()>10)System.exit(-1);
  */
  // Filter out the possible substitutions that have already occurred and can't be replaced
  // The index is already given but not used: indexOfUnsentChanges
  // chOut.i3_insertChangesAt(vToAdd,indexOfUnsentChanges);
}
public void generatePOSLexDensityMatrices(int minDistance, int maxDistance) throws IOException { POSDensityMatrix = new HashMap<OrderedPair, Integer>(); POSLexDensityMatrix = new HashMap<OrderedPair, Integer>(); File[] files = inputDir.listFiles(); for (File curFile : files) { if (!curFile.getName().endsWith(datafileExtension)) continue; System.out.print("Processing file: " + curFile + " ..."); BufferedReader in = new BufferedReader(new FileReader(curFile)); String line = in.readLine(); TURNS: while ((line = in.readLine()) != null) { String[] values = line.split("\\|", -1); if (values[0].equalsIgnoreCase("server") || values[1].equalsIgnoreCase("server") || values[2].equalsIgnoreCase("server") || values[9].isEmpty()) continue; String curTurn = values[8]; boolean debug = false; // System.out.println("Processing text: " + curTurn); String spellingCorrected = fixSpelling(curTurn); if (spellingCorrected.trim().isEmpty()) { // System.out.println("EMPTY. SKIPPING THIS."); continue; } int distance = 0; List<ArrayList<? extends HasWord>> sentences = MaxentTagger.tokenizeText(new StringReader(spellingCorrected)); if (sentences.isEmpty()) { continue; } for (ArrayList<? extends HasWord> sent : sentences) { // Sentence<? 
extends HasWord> sentCorrected ArrayList<TaggedWord> taggedSentence = tagger.tagSentence(sent); boolean lastSentence = (sent == sentences.get(sentences.size() - 1)); if (lastSentence) { taggedSentence.add(new TaggedWord("", "EOT")); } if (taggedSentence.size() < 2) continue; TaggedWord prev = taggedSentence.get(0); // System.out.print(prev.word() + ":" + prev.tag() + ", "); for (int i = 1; i < taggedSentence.size(); i++) { TaggedWord cur = taggedSentence.get(i); distance++; if (maxDistance > 0 && distance > maxDistance) continue TURNS; if (distance < minDistance) { prev = cur; continue; } // System.out.print(cur.word() + ":" + cur.tag() + ", "); if (filter(cur.word())) continue; OrderedPair keyPOS; OrderedPair keyLex; keyPOS = new OrderedPair(prev.tag(), cur.tag()); keyLex = new OrderedPair( prev.tag(), (misspellings.containsKey(cur.word()) ? misspellings.get(cur.word()) : cur.word())); if (POSDensityMatrix.containsKey(keyPOS)) { // System.out.println("putting "+key.tag1+","+key.tag2); POSDensityMatrix.put(keyPOS, POSDensityMatrix.get(keyPOS) + 1); } else { // System.out.println("putting "+key.tag1+","+key.tag2); POSDensityMatrix.put(keyPOS, 1); } // POSLex doesn't make sense at end of turn. if (lastSentence && i == taggedSentence.size() - 1) break; if (POSLexDensityMatrix.containsKey(keyLex)) { // System.out.println("putting "+key.tag1+","+key.tag2); POSLexDensityMatrix.put(keyLex, POSLexDensityMatrix.get(keyLex) + 1); } else { // System.out.println("putting "+key.tag1+","+key.tag2); POSLexDensityMatrix.put(keyLex, 1); } prev = cur; } } // System.out.println(); } System.out.println("done."); } }
/**
 * Builds a {@code VB}-tagged word from a verb by passing its word through
 * {@code Morphology.stem}.
 *
 * @param verb the word to convert; only its word text is used, its tag is ignored
 * @return a new {@code TaggedWord} holding the stemmed word and the tag {@code "VB"}
 */
public static TaggedWord verbToBaseTense(TaggedWord verb) {
  String stemmed = new Morphology().stem(verb.word());
  return new TaggedWord(stemmed, "VB");
}