Beispiel #1
0
  /**
   * Command-line demo: tags every sentence of a text file and prints the result to standard
   * output, then tags one hard-coded sentence and prints only its adjectives.
   *
   * <p>The input file is decoded as UTF-8 and the output is encoded as UTF-8.
   *
   * @param args {@code args[0]} is the tagger model file, {@code args[1]} is the file to tag;
   *     any other argument count prints a usage message and returns
   * @throws Exception if the model or the input file cannot be read
   */
  public static void main(String[] args) throws Exception {
    if (args.length != 2) {
      System.err.println("usage: java TaggerDemo2 modelFile fileToTag");
      return;
    }
    MaxentTagger tagger = new MaxentTagger(args[0]);
    TokenizerFactory<CoreLabel> ptbTokenizerFactory =
        PTBTokenizer.factory(new CoreLabelTokenFactory(), "untokenizable=noneKeep");

    // try-with-resources: the reader used to be leaked, and the writer was left unflushed and
    // unclosed whenever tagging threw. Closing pw also closes System.out, as before.
    try (BufferedReader r =
            new BufferedReader(new InputStreamReader(new FileInputStream(args[1]), "utf-8"));
        PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, "utf-8"))) {

      DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(r);
      documentPreprocessor.setTokenizerFactory(ptbTokenizerFactory);
      for (List<HasWord> sentence : documentPreprocessor) {
        List<TaggedWord> tSentence = tagger.tagSentence(sentence);
        pw.println(Sentence.listToString(tSentence, false));
      }

      // Print the adjectives in one more sentence. This shows how to get at words and tags in a
      // tagged sentence.
      List<HasWord> sent =
          Sentence.toWordList(
              "The", "slimy", "slug", "crawled", "over", "the", "long", ",", "green", "grass", ".");
      List<TaggedWord> taggedSent = tagger.tagSentence(sent);
      for (TaggedWord tw : taggedSent) {
        // Tags beginning with "JJ" are treated as adjectives.
        if (tw.tag().startsWith("JJ")) {
          pw.println(tw.word());
        }
      }
    }
  }
  /**
   * Tags a pre-tokenized file and writes one {@code word tag} pair per line to {@code outfile}.
   *
   * <p>The input holds one token per line; an empty line ends a sentence. Each tagged sentence
   * is followed by an empty line in the output. A final sentence that is not terminated by an
   * empty line is tagged as well (it used to be dropped silently).
   *
   * @param model path of the tagger model to load
   * @param fileToTag path of the one-token-per-line input file
   * @param outfile path of the output file, written as UTF-8
   * @throws Exception if the model cannot be loaded or a file cannot be read or written
   */
  public static void generate(String model, String fileToTag, String outfile) throws Exception {
    MaxentTagger tagger = new MaxentTagger(model);

    // NOTE(review): the input is decoded with the platform default charset (FileReader) while
    // the output is forced to UTF-8 -- confirm whether the input should be read as UTF-8 too.
    try (PrintWriter pw =
            new PrintWriter(new OutputStreamWriter(new FileOutputStream(outfile), "utf-8"));
        BufferedReader br = new BufferedReader(new FileReader(fileToTag))) {
      List<String> toks = new ArrayList<>();
      String line;
      while ((line = br.readLine()) != null) {
        if (line.length() == 0) {
          writeTaggedSentence(tagger, toks, pw);
          toks.clear();
        } else {
          toks.add(line);
        }
      }
      // Flush a trailing sentence that has no terminating blank line.
      if (!toks.isEmpty()) {
        writeTaggedSentence(tagger, toks, pw);
      }
    }
  }

  /** Tags the given tokens as one sentence and writes "word tag" lines plus a blank line. */
  private static void writeTaggedSentence(MaxentTagger tagger, List<String> toks, PrintWriter pw) {
    List<HasWord> sent = Sentence.toWordList(toks.toArray(new String[toks.size()]));
    List<TaggedWord> taggedSent = tagger.tagSentence(sent);
    for (TaggedWord tw : taggedSent) {
      pw.println(tw.word() + " " + tw.tag());
    }
    pw.println();
  }
 /**
  * Builds the gerund (tag {@code VBG}) of a verb from its stem.
  *
  * <p>The stem comes from {@code Morphology.stem}. Only a silent final "e" is removed before
  * appending "ing"; other final vowels are kept. The previous rule stripped any trailing vowel,
  * which produced forms such as "ging" (go), "seing" (see), "sking" (ski) and "liing" (lie).
  * Consonant doubling (run -> running) is not attempted, as before.
  *
  * @param verb the verb to convert; only its word is used
  * @return a new tagged word holding the "-ing" form, tagged {@code VBG}
  */
 public static TaggedWord verbToGerund(TaggedWord verb) {
   Morphology wordMorpher = new Morphology();
   String stem = wordMorpher.stem(verb.word());
   int len = stem.length();
   if (stem.endsWith("ie")) {
     // die -> dying, lie -> lying, tie -> tying
     stem = stem.substring(0, len - 2) + "y";
   } else if (len > 2 && stem.endsWith("e") && "eoy".indexOf(stem.charAt(len - 2)) < 0) {
     // Silent e is dropped (make -> making, argue -> arguing) but kept after e/o/y
     // (see -> seeing, canoe -> canoeing, dye -> dyeing) and in two-letter stems (be -> being).
     stem = stem.substring(0, len - 1);
   }
   return new TaggedWord(stem + "ing", "VBG");
 }
Beispiel #4
0
 /**
  * Converts tagged words into their integer-indexed form, preserving order.
  *
  * <p>Each word/tag pair is turned into an {@code IntTaggedWord} built against this object's
  * {@code wordIndex} and {@code tagIndex}.
  *
  * @param taggedWords the words to convert
  * @return a new list with one {@code IntTaggedWord} per input element
  */
 protected List<IntTaggedWord> listToEvents(List<TaggedWord> taggedWords) {
   List<IntTaggedWord> events = new ArrayList<IntTaggedWord>();
   for (int pos = 0, count = taggedWords.size(); pos < count; pos++) {
     TaggedWord tagged = taggedWords.get(pos);
     events.add(new IntTaggedWord(tagged.word(), tagged.tag(), wordIndex, tagIndex));
   }
   return events;
 }
 /**
  * Returns a copy of the list in which every tag is replaced by the value
  * {@code tlp.basicCategory} yields for it; words are carried over unchanged.
  *
  * @param twList the tagged words to copy
  * @param tlp the language pack used to map each tag to its basic category
  * @return a new list of new {@code TaggedWord} objects, in the same order as {@code twList}
  */
 private static List<TaggedWord> cleanTags(List<TaggedWord> twList, TreebankLanguagePack tlp) {
   List<TaggedWord> cleaned = new ArrayList<TaggedWord>(twList.size());
   for (TaggedWord source : twList) {
     String word = source.word();
     String basicTag = tlp.basicCategory(source.tag());
     cleaned.add(new TaggedWord(word, basicTag));
   }
   return cleaned;
 }
  /**
   * Updates the counters of this model with a single tagged word.
   *
   * <p>The word is always added to {@code seenCounter}. Once {@code treesRead} exceeds
   * {@code indexToStartUnkCounting}, a word whose seen count is still below 1.5 is treated as
   * unknown: its signature is looked up and {@code unSeenCounter} is incremented for the
   * signature/tag pair, the tag alone, the signature alone and the null entry.
   *
   * @param tw the tagged word to count
   * @param loc passed through to {@code model.getSignatureIndex}
   * @param weight the amount added to each counter that is incremented
   */
  public void train(TaggedWord tw, int loc, double weight) {
    final IntTaggedWord wordAndTag = new IntTaggedWord(tw.word(), tw.tag(), wordIndex, tagIndex);
    final IntTaggedWord tagOnly = new IntTaggedWord(nullWord, wordAndTag.tag);
    final IntTaggedWord wordOnly = new IntTaggedWord(wordAndTag.word, nullTag);
    seenCounter.incrementCount(wordOnly, weight);

    // Unknown-word statistics only start some way through the trees (treesRead is 1-based).
    if (!(treesRead > indexToStartUnkCounting)) {
      return;
    }
    // A known word (possibly with a new tag): nothing more to record for now.
    if (!(seenCounter.getCount(wordOnly) < 1.5)) {
      return;
    }

    // From here on the word counts as entirely unknown.
    final int signature =
        model.getSignatureIndex(wordAndTag.word, loc, wordIndex.get(wordAndTag.word));
    if (DOCUMENT_UNKNOWNS) {
      final String wordText = wordIndex.get(wordAndTag.word);
      final String tagText = tagIndex.get(wordAndTag.tag);
      final String signatureText = wordIndex.get(signature);
      EncodingPrintWriter.err.println(
          "Unknown word/tag/sig:\t" + wordText + '\t' + tagText + '\t' + signatureText, "UTF-8");
    }

    final IntTaggedWord signatureAndTag = new IntTaggedWord(signature, wordAndTag.tag);
    final IntTaggedWord signatureOnly = new IntTaggedWord(signature, nullTag);
    unSeenCounter.incrementCount(signatureAndTag, weight);
    unSeenCounter.incrementCount(tagOnly, weight);
    unSeenCounter.incrementCount(signatureOnly, weight);
    unSeenCounter.incrementCount(NULL_ITW, weight);
  }
  /**
   * Parses the pending outgoing text, aligns every tagged word with the inserts that typed it,
   * and asks the WordNet wrapper for replacement words ({@code PointerType.HYPERNYM}).
   *
   * <p>Work in progress: the replacement candidates are computed but not yet applied to
   * {@code chOut}. Besides the parser and WordNet calls, the only lasting effects are the
   * increment of {@code counter} and the debug output on standard output.
   *
   * <p>Changes from the earlier revision: the word/insert equality check now inspects the
   * extracted sub-sequence (it used to inspect an empty vector, so it never rejected anything),
   * and the search position advances past each aligned word so that a repeated word is no
   * longer matched against its first occurrence every time.
   */
  private void processOutgoingSequenceFIFOToInsertCandidate() {
    // TODO: only process if the last character added is enter or space, or after a timeout.
    StringOfDocChangeInserts sodci = chOut.getStringOfDocChangeInserts();

    Vector parseResult = c.getHistory().getParserWrapper().parseText(sodci.getString());
    Vector taggedWords = (Vector) parseResult.elementAt(0);

    counter++;

    Vector allInsWords = alignWordsWithInserts(sodci, taggedWords);
    Vector allPossibleSubstitutions = lookUpSubstitutions(taggedWords, allInsWords);

    // TODO: filter out the possible substitutions that have already been sent and can no longer
    // be replaced, then, for each word i with at least one candidate, replace allInsWords[i] by
    // StringOfDocChangeInserts.getInsEquivalentOfString(candidate + " ") via
    // chOut.i3_replaceSequenceWithSequenceChangingTimestampOfEnsuingSequenceUsingOldTurnAsBasisFortypingTime
    // and insert the changes at chOut.getFirstIndexForChanges() (chOut.i3_insertChangesAt).
  }

  /**
   * Maps each tagged word, in order, to the inserts that produced it in the pending text; a word
   * that cannot be located gets an empty vector.
   */
  private Vector alignWordsWithInserts(StringOfDocChangeInserts sodci, Vector taggedWords) {
    String sodciString = sodci.getString();
    Vector allInsWords = new Vector();
    int searchFrom = 0; // Index in sodciString where the next word may start.
    for (int i = 0; i < taggedWords.size(); i++) {
      String word = ((TaggedWord) taggedWords.elementAt(i)).word();
      Vector insWord = new Vector();
      int beginIndex = sodciString.indexOf(word, searchFrom);
      if (beginIndex >= searchFrom) { // If it is found in the string
        int finishIndex = beginIndex + word.length();
        System.out.println(i + ":  found index at: " + beginIndex + ": " + finishIndex + ": ");
        Vector candidate = sodci.getSubSequence(beginIndex, finishIndex);
        // Accept the inserts only if they really spell the word.
        if (StringOfDocChangeInserts.getSubSequenceString(candidate).equalsIgnoreCase(word)) {
          insWord = candidate;
          searchFrom = finishIndex;
        }
      }
      allInsWords.addElement(insWord);
    }
    return allInsWords;
  }

  /**
   * Returns, for each tagged word, the replacement words offered by the WordNet wrapper, or an
   * empty vector when the lookup is skipped.
   */
  private Vector lookUpSubstitutions(Vector taggedWords, Vector allInsWords) {
    Vector allPossibleSubstitutions = new Vector();
    for (int i = 0; i < taggedWords.size(); i++) {
      TaggedWord tw = (TaggedWord) taggedWords.elementAt(i);
      Vector vInsWords = (Vector) allInsWords.elementAt(i);
      // To save processing time the lookup is skipped for unaligned words and for words of
      // fewer than three inserts: two letter words cause all kinds of problems with WordNet.
      if (vInsWords.size() >= 3) {
        Vector replacements =
            c.getWordNetWrapper().getReplacementWord(tw.tag(), tw.word(), PointerType.HYPERNYM);
        allPossibleSubstitutions.addElement(replacements);
      } else {
        allPossibleSubstitutions.addElement(new Vector());
      }
    }
    return allPossibleSubstitutions;
  }
Beispiel #8
0
  /**
   * Rebuilds {@code POSDensityMatrix} and {@code POSLexDensityMatrix} from every file in
   * {@code inputDir} whose name ends with {@code datafileExtension}.
   *
   * <p>The first line of each file is skipped. Every following line is split on {@code '|'}; a
   * line is ignored when it has fewer than 10 fields, when any of its first three fields equals
   * "server" (ignoring case) or when field 9 is empty. Field 8 is taken as the turn text and
   * counted by {@code countTurn}.
   *
   * @param minDistance word pairs are only counted once the running token distance within the
   *     turn has reached this value
   * @param maxDistance if positive, the rest of a turn is skipped once the running token
   *     distance exceeds this value
   * @throws IOException if {@code inputDir} cannot be listed or a data file cannot be read
   */
  public void generatePOSLexDensityMatrices(int minDistance, int maxDistance) throws IOException {
    POSDensityMatrix = new HashMap<OrderedPair, Integer>();
    POSLexDensityMatrix = new HashMap<OrderedPair, Integer>();
    File[] files = inputDir.listFiles();
    if (files == null) {
      // listFiles() returns null when the path is not a directory or an I/O error occurs.
      throw new IOException("Cannot list data files in: " + inputDir);
    }
    for (File curFile : files) {
      if (!curFile.getName().endsWith(datafileExtension)) {
        continue;
      }
      System.out.print("Processing file: " + curFile + " ...");
      // try-with-resources: the reader used to be leaked for every processed file.
      try (BufferedReader in = new BufferedReader(new FileReader(curFile))) {
        in.readLine(); // The first line is never processed (presumably a header -- confirm).
        String line;
        while ((line = in.readLine()) != null) {
          String[] values = line.split("\\|", -1);
          if (values.length < 10) {
            continue; // Malformed or blank line; used to fail with an index error.
          }
          // NOTE(review): emptiness is tested on field 9 but the text is read from field 8.
          if (values[0].equalsIgnoreCase("server")
              || values[1].equalsIgnoreCase("server")
              || values[2].equalsIgnoreCase("server")
              || values[9].isEmpty()) {
            continue;
          }
          countTurn(values[8], minDistance, maxDistance);
        }
      }
      System.out.println("done.");
    }
  }

  /**
   * Spell-corrects, tokenizes and tags one turn, then counts (previous tag, tag) pairs in
   * {@code POSDensityMatrix} and (previous tag, word) pairs in {@code POSLexDensityMatrix}.
   */
  private void countTurn(String curTurn, int minDistance, int maxDistance) {
    String spellingCorrected = fixSpelling(curTurn);
    if (spellingCorrected.trim().isEmpty()) {
      return;
    }
    List<ArrayList<? extends HasWord>> sentences =
        MaxentTagger.tokenizeText(new StringReader(spellingCorrected));

    int distance = 0; // Tokens consumed so far, accumulated over all sentences of the turn.
    for (int s = 0; s < sentences.size(); s++) {
      ArrayList<TaggedWord> taggedSentence = tagger.tagSentence(sentences.get(s));
      boolean lastSentence = (s == sentences.size() - 1);
      if (lastSentence) {
        // Artificial end-of-turn token so the final real token is still paired with something.
        taggedSentence.add(new TaggedWord("", "EOT"));
      }
      if (taggedSentence.size() < 2) {
        continue;
      }
      TaggedWord prev = taggedSentence.get(0);
      for (int i = 1; i < taggedSentence.size(); i++) {
        TaggedWord cur = taggedSentence.get(i);
        distance++;

        if (maxDistance > 0 && distance > maxDistance) {
          return; // Abandon the rest of this turn.
        }
        if (distance < minDistance) {
          prev = cur;
          continue;
        }
        if (filter(cur.word())) {
          continue; // prev is deliberately left unchanged, as before.
        }

        OrderedPair keyPOS = new OrderedPair(prev.tag(), cur.tag());
        Integer posCount = POSDensityMatrix.get(keyPOS);
        POSDensityMatrix.put(keyPOS, posCount == null ? 1 : posCount + 1);

        // POSLex doesn't make sense at end of turn.
        if (lastSentence && i == taggedSentence.size() - 1) {
          break;
        }

        OrderedPair keyLex =
            new OrderedPair(
                prev.tag(),
                (misspellings.containsKey(cur.word())
                    ? misspellings.get(cur.word())
                    : cur.word()));
        Integer lexCount = POSLexDensityMatrix.get(keyLex);
        POSLexDensityMatrix.put(keyLex, lexCount == null ? 1 : lexCount + 1);

        prev = cur;
      }
    }
  }
 /**
  * Reduces a verb to the stem produced by {@code Morphology.stem} and tags it {@code VB}.
  *
  * @param verb the verb to convert; only its word is used
  * @return a new tagged word holding the stem, tagged {@code VB}
  */
 public static TaggedWord verbToBaseTense(TaggedWord verb) {
   String baseForm = new Morphology().stem(verb.word());
   return new TaggedWord(baseForm, "VB");
 }