Example #1
  public static void main(String[] args) throws Exception {
    if (args.length != 2) {
      System.err.println("usage: java TaggerDemo2 modelFile fileToTag");
      return;
    }
    MaxentTagger tagger = new MaxentTagger(args[0]);
    TokenizerFactory<CoreLabel> ptbTokenizerFactory =
        PTBTokenizer.factory(new CoreLabelTokenFactory(), "untokenizable=noneKeep");
    BufferedReader r =
        new BufferedReader(new InputStreamReader(new FileInputStream(args[1]), "utf-8"));

    PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, "utf-8"));
    DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(r);
    documentPreprocessor.setTokenizerFactory(ptbTokenizerFactory);
    for (List<HasWord> sentence : documentPreprocessor) {
      List<TaggedWord> tSentence = tagger.tagSentence(sentence);
      pw.println(Sentence.listToString(tSentence, false));
    }

    // print the adjectives in one more sentence. This shows how to get at words and tags in a
    // tagged sentence.
    List<HasWord> sent =
        Sentence.toWordList(
            "The", "slimy", "slug", "crawled", "over", "the", "long", ",", "green", "grass", ".");
    List<TaggedWord> taggedSent = tagger.tagSentence(sent);
    for (TaggedWord tw : taggedSent) {
      if (tw.tag().startsWith("JJ")) {
        pw.println(tw.word());
      }
    }

    pw.close();
  }
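For quick experiments, the tagger can also be driven without the file plumbing above: MaxentTagger.tagString() tokenizes and tags a raw string in one call. A minimal sketch (the model path is a placeholder for any trained .tagger file):

  import edu.stanford.nlp.tagger.maxent.MaxentTagger;

  public class QuickTagDemo {
    public static void main(String[] args) {
      // Placeholder path; substitute a real trained model file.
      MaxentTagger tagger = new MaxentTagger("models/english-left3words-distsim.tagger");
      // tagString() tokenizes and tags in one call, producing "word_TAG" output.
      System.out.println(tagger.tagString("The slimy slug crawled over the long, green grass."));
    }
  }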
Example #2
  public TreeMap<String, Integer> getOccurrencesOfTagsAfterTurnLength(int min) throws IOException {
    TreeMap<String, Integer> occurrences = new TreeMap<String, Integer>();

    File[] files = inputDir.listFiles();
    int totalTurns = 0;
    for (File curFile : files) {
      if (!curFile.getName().endsWith(datafileExtension)) continue;
      System.out.print("Processing file: " + curFile + " ...");
      BufferedReader in = new BufferedReader(new FileReader(curFile));
      String line = in.readLine(); // the first line is read and discarded

      while ((line = in.readLine()) != null) {
        String[] values = line.split("\\|", -1);
        if (values[0].equalsIgnoreCase("server")
            || values[1].equalsIgnoreCase("server")
            || values[2].equalsIgnoreCase("server")
            || values[9].isEmpty()) continue;
        String curTurn = values[8];

        String spellingCorrected = fixSpelling(curTurn);
        float distance = 0.0f;

        if (spellingCorrected.trim().isEmpty()) {
          //   System.out.println("EMPTY. SKIPPING THIS.");
          continue;
        }
        List<ArrayList<? extends HasWord>> sentences =
            MaxentTagger.tokenizeText(new StringReader(spellingCorrected));
        if (sentences.isEmpty()) {
          continue;
        }
        totalTurns++;
        for (ArrayList<? extends HasWord> sent : sentences) {
          ArrayList<TaggedWord> taggedSentence = tagger.tagSentence(sent);
          boolean lastSentence = (sent == sentences.get(sentences.size() - 1));
          if (lastSentence) {
            taggedSentence.add(new TaggedWord("", "EOT"));
          }

          for (int i = 0; i < taggedSentence.size(); i++) {
            TaggedWord cur = taggedSentence.get(i);
            distance++;
            if (distance >= min) {
              if (occurrences.containsKey(cur.tag())) {
                occurrences.put(cur.tag(), occurrences.get(cur.tag()) + 1);
              } else {
                occurrences.put(cur.tag(), 1);
              }
            }
          }
        }
      }
      in.close();
    }

    System.out.println("there were " + totalTurns + " turns in total.");

    return occurrences;
  }
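The tokenizeText/tagSentence pipeline used above (and again in Examples #13 and #14) can be exercised on its own. A minimal sketch, assuming a trained model file and a CoreNLP version in which tokenizeText returns List<List<HasWord>>:

  import java.io.StringReader;
  import java.util.List;
  import edu.stanford.nlp.ling.HasWord;
  import edu.stanford.nlp.ling.TaggedWord;
  import edu.stanford.nlp.tagger.maxent.MaxentTagger;

  public class TokenizeAndTagSketch {
    public static void main(String[] args) {
      // Placeholder path; substitute a real trained model file.
      MaxentTagger tagger = new MaxentTagger("models/english-left3words-distsim.tagger");
      // tokenizeText() splits raw text into tokenized sentences.
      List<List<HasWord>> sentences =
          MaxentTagger.tokenizeText(new StringReader("I see. The slug crawled."));
      for (List<HasWord> sent : sentences) {
        List<TaggedWord> tagged = tagger.tagSentence(sent);
        System.out.println(tagged); // e.g. [I/PRP, see/VBP, ./.]
      }
    }
  }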
  public static void generate(String model, String fileToTag, String outfile) throws Exception {

    MaxentTagger tagger = new MaxentTagger(model);
    PrintWriter pw =
        new PrintWriter(new OutputStreamWriter(new FileOutputStream(outfile), "utf-8"));

    BufferedReader br = new BufferedReader(new FileReader(fileToTag));
    String line = "";
    ArrayList<String> toks = new ArrayList<>();
    while ((line = br.readLine()) != null) {
      if (line.length() == 0) {
        String[] params = toks.toArray(new String[0]);
        List<HasWord> sent = Sentence.toWordList(params);
        List<TaggedWord> taggedSent = tagger.tagSentence(sent);
        for (TaggedWord tw : taggedSent) {
          pw.println(tw.word() + " " + tw.tag());
        }
        pw.println();
        toks = new ArrayList<>();
      } else {
        toks.add(line);
      }
    }
    // Tag any final sentence that is not followed by a blank line.
    if (!toks.isEmpty()) {
      List<HasWord> sent = Sentence.toWordList(toks.toArray(new String[0]));
      for (TaggedWord tw : tagger.tagSentence(sent)) {
        pw.println(tw.word() + " " + tw.tag());
      }
      pw.println();
    }
    br.close();
    pw.close();
  }
Example #4
 protected List<IntTaggedWord> listToEvents(List<TaggedWord> taggedWords) {
   List<IntTaggedWord> itwList = new ArrayList<IntTaggedWord>();
   for (TaggedWord tw : taggedWords) {
     IntTaggedWord iTW = new IntTaggedWord(tw.word(), tw.tag(), wordIndex, tagIndex);
     itwList.add(iTW);
   }
   return itwList;
 }
 private static List<TaggedWord> cleanTags(List<TaggedWord> twList, TreebankLanguagePack tlp) {
   int sz = twList.size();
   List<TaggedWord> l = new ArrayList<TaggedWord>(sz);
   for (int i = 0; i < sz; i++) {
     TaggedWord tw = twList.get(i);
     TaggedWord tw2 = new TaggedWord(tw.word(), tlp.basicCategory(tw.tag()));
     l.add(tw2);
   }
   return l;
 }
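cleanTags() delegates to TreebankLanguagePack.basicCategory(), which strips functional annotation from a category label. A small illustration, assuming the standard PennTreebankLanguagePack:

  import edu.stanford.nlp.trees.PennTreebankLanguagePack;
  import edu.stanford.nlp.trees.TreebankLanguagePack;

  public class BasicCategoryDemo {
    public static void main(String[] args) {
      TreebankLanguagePack tlp = new PennTreebankLanguagePack();
      // basicCategory() keeps only the part before annotation markers such as '-' and '='.
      System.out.println(tlp.basicCategory("NP-SBJ")); // prints "NP"
    }
  }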
  public ArrayList<String> getNounsFromSentence(String sentence) {
    ArrayList<TaggedWord> tw = parseSentenceTD(sentence);
    ArrayList<String> nouns = new ArrayList<String>();

    for (TaggedWord t : tw) {
      if (t.tag().startsWith("N")) {
        nouns.add(t.value());
      }
    }

    return nouns;
  }
  @Override
  public void train(List<TaggedWord> sentence) {
    lex.train(sentence, 1.0);

    String last = null;
    for (TaggedWord tagLabel : sentence) {
      String tag = tagLabel.tag();
      tagIndex.add(tag);
      if (last == null) {
        initial.incrementCount(tag);
      } else {
        ruleCounter.incrementCount2D(last, tag);
      }
      last = tag;
    }
  }
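train() accumulates initial-tag counts and tag-bigram counts, from which maximum-likelihood transition probabilities can be read off. A sketch using TwoDimensionalCounter as a stand-in for whatever counter type the class's ruleCounter field actually is:

  import edu.stanford.nlp.stats.TwoDimensionalCounter;

  public class TagBigramSketch {
    public static void main(String[] args) {
      TwoDimensionalCounter<String, String> ruleCounter = new TwoDimensionalCounter<>();
      // Simulate the counts train() would accumulate over tagged sentences.
      ruleCounter.incrementCount("DT", "NN");
      ruleCounter.incrementCount("DT", "NN");
      ruleCounter.incrementCount("DT", "JJ");
      // Maximum-likelihood estimate of P(NN | DT) = count(DT, NN) / count(DT, *).
      double p = ruleCounter.getCount("DT", "NN") / ruleCounter.getCounter("DT").totalCount();
      System.out.println(p); // 0.666...
    }
  }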
Example #8
 public static TaggedWord verbToGerund(TaggedWord verb) {
   Morphology wordMorpher = new Morphology();
   String stem = wordMorpher.stem(verb.word());
   if (!stem.equals("do")) {
     // Drop a trailing vowel (e.g. "make" -> "mak") so that "-ing" attaches cleanly.
     stem = stem.replaceAll("[aeiou]?$", "");
   }
   return new TaggedWord(stem + "ing", "VBG");
 }
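A usage sketch for verbToGerund; the method is repeated so the snippet compiles on its own:

  import edu.stanford.nlp.ling.TaggedWord;
  import edu.stanford.nlp.process.Morphology;

  public class GerundDemo {
    public static TaggedWord verbToGerund(TaggedWord verb) { // copied from Example #8
      Morphology wordMorpher = new Morphology();
      String stem = wordMorpher.stem(verb.word());
      if (!stem.equals("do")) {
        stem = stem.replaceAll("[aeiou]?$", "");
      }
      return new TaggedWord(stem + "ing", "VBG");
    }

    public static void main(String[] args) {
      System.out.println(verbToGerund(new TaggedWord("make", "VB")).word());  // making
      System.out.println(verbToGerund(new TaggedWord("crawl", "VB")).word()); // crawling
    }
  }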
Example #9
 /**
  * Turns a sentence into a flat phrasal tree. The structure is S -> tag*. And then each tag goes
  * to a word. The tag is either found from the label or made "WD". The tag and phrasal node have a
  * StringLabel.
  *
  * @param s The Sentence to make the Tree from
  * @param lf The LabelFactory with which to create the new Tree labels
  * @return The one phrasal level Tree
  */
 public static Tree toFlatTree(Sentence<?> s, LabelFactory lf) {
   List<Tree> daughters = new ArrayList<Tree>(s.length());
   for (HasWord word : s) {
     Tree wordNode = new LabeledScoredTreeLeaf(lf.newLabel(word.word()));
     if (word instanceof TaggedWord) {
       TaggedWord taggedWord = (TaggedWord) word;
       wordNode =
           new LabeledScoredTreeNode(
               new StringLabel(taggedWord.tag()), Collections.singletonList(wordNode));
     } else {
       wordNode =
           new LabeledScoredTreeNode(lf.newLabel("WD"), Collections.singletonList(wordNode));
     }
     daughters.add(wordNode);
   }
   return new LabeledScoredTreeNode(new StringLabel("S"), daughters);
 }
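An illustrative fragment for toFlatTree(), assuming the old-style generic Sentence class that the method signature requires, and a StringLabelFactory for the labels:

  // Sentence here is the old list-like edu.stanford.nlp.ling.Sentence<T extends HasWord>.
  Sentence<TaggedWord> s = new Sentence<TaggedWord>();
  s.add(new TaggedWord("The", "DT"));
  s.add(new TaggedWord("dog", "NN"));
  Tree flat = toFlatTree(s, new StringLabelFactory());
  System.out.println(flat); // (S (DT The) (NN dog))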
  /** Trains this unknown word model (UWM) on a single TaggedWord. */
  public void train(TaggedWord tw, int loc, double weight) {
    IntTaggedWord iTW = new IntTaggedWord(tw.word(), tw.tag(), wordIndex, tagIndex);
    IntTaggedWord iT = new IntTaggedWord(nullWord, iTW.tag);
    IntTaggedWord iW = new IntTaggedWord(iTW.word, nullTag);
    seenCounter.incrementCount(iW, weight);
    IntTaggedWord i = NULL_ITW;

    if (treesRead > indexToStartUnkCounting) {
      // start doing this once some way through trees;
      // treesRead is 1 based counting
      if (seenCounter.getCount(iW) < 1.5) {
        // it's an entirely unknown word
        int s = model.getSignatureIndex(iTW.word, loc, wordIndex.get(iTW.word));
        if (DOCUMENT_UNKNOWNS) {
          String wStr = wordIndex.get(iTW.word);
          String tStr = tagIndex.get(iTW.tag);
          String sStr = wordIndex.get(s);
          EncodingPrintWriter.err.println(
              "Unknown word/tag/sig:\t" + wStr + '\t' + tStr + '\t' + sStr, "UTF-8");
        }
        IntTaggedWord iTS = new IntTaggedWord(s, iTW.tag);
        IntTaggedWord iS = new IntTaggedWord(s, nullTag);
        unSeenCounter.incrementCount(iTS, weight);
        unSeenCounter.incrementCount(iT, weight);
        unSeenCounter.incrementCount(iS, weight);
        unSeenCounter.incrementCount(i, weight);
        // rules.add(iTS);
        // sigs.add(iS);
      }
      // else if seenCounter.getCount(iTW) < 2: it's a new tag for a known word; do nothing for now.
    }
  }
  private void processOutgoingSequenceFIFOToInsertCandidate() {

    // Should only process if last character added is enter or space or a specified timeout

    StringOfDocChangeInserts sodci = chOut.getStringOfDocChangeInserts();
    int indexOfUnsentChanges = chOut.getFirstIndexForChanges();

    Vector v2 = c.getHistory().getParserWrapper().parseText(sodci.getString());
    Vector taggedWords = (Vector) v2.elementAt(0);

    int indexInSodciStringOfNextWordCandidate = 0;
    String sodciString = sodci.getString();

    // --------------------------

    Vector allInsertsSoFar = sodci.getSequence();

    Vector vToBeRemoved = new Vector();
    Vector vToBeAdded = new Vector();

    vToBeAdded.addElement(new DocInsert(0, Integer.toString(counter), null));
    vToBeAdded.addElement(new DocInsert(0, Integer.toString(counter), null));
    vToBeAdded.addElement(new DocInsert(0, Integer.toString(counter), null));
    vToBeAdded.addElement(new DocInsert(0, Integer.toString(counter), null));

    counter++;

    Vector allInsWords = new Vector();
    for (int i = 0; i < taggedWords.size(); i++) {
      TaggedWord tw = (TaggedWord) taggedWords.elementAt(i);
      Vector insWord = new Vector();
      int beginIndex = sodciString.indexOf(tw.word(), indexInSodciStringOfNextWordCandidate);
      if (beginIndex >= indexInSodciStringOfNextWordCandidate) { // the word was found in the string
        int finishIndex = beginIndex + tw.word().length();
        System.out.println(i + ":  found index at: " + beginIndex + ": " + finishIndex + ": ");
        Vector v3 = sodci.getSubSequence(beginIndex, finishIndex);
        // Only accept the subsequence if it actually spells the tagged word.
        if (StringOfDocChangeInserts.getSubSequenceString(v3).equalsIgnoreCase(tw.word())) {
          insWord = v3;
          indexInSodciStringOfNextWordCandidate = finishIndex; // advance past this word
        }
      }
      allInsWords.addElement(insWord);
    }

    Vector allPossibleSubstitutions = new Vector();

    for (int i = 0; i < taggedWords.size(); i++) {
      TaggedWord tw = (TaggedWord) taggedWords.elementAt(i);
      Vector vInsWords = (Vector) allInsWords.elementAt(i);
      // To save processing time, skip the WordNet lookup if the word was not found in the insert
      // sequence or is too short (two-letter words cause all kinds of problems with WordNet).
      boolean conductWordNetLookup = true;
      if (vInsWords == null) {
        conductWordNetLookup = false;
      } else if (vInsWords.size() < 3) {
        conductWordNetLookup = false;
      }
      // else if(!chOut.checkSequenceIsContinuousAndNotAlreadySentOrAlreadyChanged(vInsWords))
      // conductWordNetLookup = false;

      if (conductWordNetLookup) {
        Vector v =
            c.getWordNetWrapper().getReplacementWord(tw.tag(), tw.word(), PointerType.HYPERNYM);
        allPossibleSubstitutions.addElement(v);
      } else {
        allPossibleSubstitutions.addElement(new Vector());
      }
    }

    for (int i = 0; i < taggedWords.size(); i++) {
      TaggedWord tw = (TaggedWord) taggedWords.elementAt(i);
      Vector v = (Vector) allInsWords.elementAt(i);
      Vector v3 = ((Vector) allPossibleSubstitutions.elementAt(i));
      // js.print(i+": "+tw.word()+"---");
      if (v.size() == 0) {
        //  js.print("Couldn't find word: "+v3.size());
      } else {
        //  js.print(StringOfDocChangeInserts.getSubSequenceString(v)+": ");
      }
      // js.print("WORDNET: ");
      for (int j = 0; j < v3.size(); j++) {
        String s4 = (String) v3.elementAt(j);
        //  js.print(s4+"||||");
      }
      // js.println("");
    }

     /*

     for(int i=0;i<taggedWords.size();i++){
       TaggedWord tw = (TaggedWord)taggedWords.elementAt(i);
       Vector v = (Vector)allInsWords.elementAt(i);
       Vector v3 = ((Vector)allPossibleSubstitutions.elementAt(i));
       if(v3.size()!=0){
            String textToSubstitute = (String)v3.elementAt(0);
            Vector replacementIns = StringOfDocChangeInserts.getInsEquivalentOfString(textToSubstitute+" ");
            chOut.i3_replaceSequenceWithSequenceChangingTimestampOfEnsuingSequenceUsingOldTurnAsBasisFortypingTime(v,replacementIns);
       }
     }


    if(taggedWords.size()>10)System.exit(-1); */
    // Filter out the possible substitutions that have already occurred and can't be replaced.
    // The index is already given but not used: indexOfUnsentChanges
    // chOut.i3_insertChangesAt(vToAdd,indexOfUnsentChanges);
  }
  /**
   * Parses each sentence of an input file and generates a .trees file.
   *
   * @param en file with the English sentences to parse, one per line
   * @param align alignment file, or a name starting with "no_align" to disable alignments
   * @param out output file for the generated trees
   * @param verbose if true, also print each tree to stderr
   */
  public static void parse(String en, String align, String out, boolean verbose) {

    // use alignments?
    boolean use_alignments = true;
    if (align.startsWith("no_align")) {
      use_alignments = false;
      System.err.println("Not using alignments.");
    } else {
      System.err.println("Using alignments from " + align);
    }

    // setup stanfordparser
    String grammar = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
    String[] options = {"-outputFormat", "wordsAndTags, typedDependencies"};
    LexicalizedParser lp = LexicalizedParser.loadModel(grammar, options);
    TreebankLanguagePack tlp = lp.getOp().langpack();
    java.util.function.Predicate<java.lang.String> punctuationFilter = x -> true;

    GrammaticalStructureFactory gsf =
        new edu.stanford.nlp.trees.EnglishGrammaticalStructureFactory(punctuationFilter);

    // read document
    Iterable<List<? extends HasWord>> sentences;
    Reader r = new Reader(en);
    String line = null;
    List<List<? extends HasWord>> tmp = new ArrayList<List<? extends HasWord>>();
    while ((line = r.getNext()) != null) {
      Tokenizer<? extends HasWord> token =
          tlp.getTokenizerFactory().getTokenizer(new StringReader(line));
      List<? extends HasWord> sentence = token.tokenize();
      tmp.add(sentence);
    }
    sentences = tmp;

    // set up alignment file reader
    Reader alignment = new Reader();
    if (use_alignments) {
      alignment = new Reader(align);
    }

    // set up tree file writer
    Writer treeWriter = new Writer(out);

    // parse
    long start = System.currentTimeMillis();
    // System.err.print("Parsing sentences ");
    int sentID = 0;
    for (List<? extends HasWord> sentence : sentences) {
      Tree t = new Tree();
      // t.setSentID(++sentID);
      System.err.println("parse Sentence :" + sentence + "...");
      // System.err.print(".");
      System.err.println("-----------------------------------------------------------------------");
      edu.stanford.nlp.trees.Tree parse = lp.parse(sentence);
      // parse.pennPrint();

      // List for root node and lexical nodes
      List<Node> loneNodes = new LinkedList<Node>();
      List<Node> governingNodes = new LinkedList<Node>();

      // ROOT node
      Node root = new Node(true, true);
      root.setTag("ROOT");
      t.setRoot(root);
      loneNodes.add(root);
      governingNodes.add(root);

      // tagging

      int counter = 0;
      String surface = "";
      String tag = "";

      for (TaggedWord tw : parse.taggedYield()) {
        Node n = new Node();
        Node governingNode = new Node();
        n.setNodeID(++counter);
        surface = tw.value();
        tag = tw.tag();
        if (surface.startsWith("-LRB-")) {
          surface = "(";
        } else if (surface.startsWith("-RRB-")) {
          surface = ")";
          // } else if (surface.startsWith("-LSB-")){
          //    surface = "[";
          // } else if (surface.startsWith("-RSB-")){
          //    surface = "]";
          // } else if (surface.startsWith("-LCB-")){
          //    surface = "{";
          // } else if (surface.startsWith("-RCB-")){
          //    surface = "}";
        } else if (surface.startsWith("''")) {
          surface = "\"";
        }
        tag = tag.replaceAll("#", "-NUM-");
        surface = surface.replaceAll("&", "-AMP-");
        surface = surface.replaceAll("#", "-NUM-");
        surface = surface.replaceAll(">", "-GRE-");
        surface = surface.replaceAll("=", "-EQU-");
        n.setInitialLexicalIndex(counter);
        governingNode.setInitialLexicalIndex(counter);
        n.setSurface(surface);
        // System.out.print("("+tw.value()+" : ");
        n.setTag(tag);
        governingNode.setTag("_" + tag);
        governingNode.setLabel("_gov");
        // System.out.print(tw.tag()+")");
        loneNodes.add(n);
        governingNodes.add(governingNode);
        governingNode.setChild(n);
      }

      // System.out.println("");

      // t.setSentLength(t.getNodes().size() - 1);
      // List<Node> loneNodes = new LinkedList<Node>();
      Node[] nodes = new Node[2000];
      // labeling
      int depIndex;
      int govIndex;
      String[] depInfo;
      String[] govInfo;
      GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
      List<TypedDependency> tdl = gs.typedDependencies(false);
      // List<TypedDependency> tdl = gs.typedDependenciesCCprocessed();
      for (TypedDependency td : tdl) {
        depIndex = td.dep().index();
        govIndex = td.gov().index();
        // System.out.println("Index1:"+depIndex);
        // System.out.println("Index2:"+govIndex);
        // if (nodes[depIndex] == null){
        //	System.out.println("Making node!");
        //	nodes[depIndex] = new Node();
        // }
        // if (nodes[govIndex] == null){
        //	System.out.println("Making node!");
        //	nodes[govIndex] = new Node();
        // }
        Node dep = loneNodes.get((depIndex));
        Node gov = governingNodes.get((govIndex));
        Node depcopy = governingNodes.get((depIndex));
        Node govcopy = loneNodes.get((govIndex));
        dep.setLabel(td.reln().toString());
        depcopy.setLabel(td.reln().toString());
        govcopy.setLabel("head");
        // System.out.println(td.toString());
        govInfo = td.gov().toString().split("/");
        depInfo = td.dep().toString().split("/");
        // System.out.println(td.gov().toString());
        // System.out.println(td.dep().toString());
        // dep.setSurface(depInfo[0]);
        // dep.setTag(depInfo[1]);
        gov.setChild(governingNodes.get(depIndex));
        governingNodes.get(depIndex).setParent(gov);
        // gov.setChild(dep);
        dep.setParent(governingNodes.get(depIndex));
      }
      // t.setRoot(nodes[0]);

      // Collapse tree to remove unneeded governing nodes:

      Node gov;
      Node dep;
      Node parent;
      List<Node> children;

      for (int i = 1; i < governingNodes.size(); i++) { // start with index 1 to skip root
        gov = governingNodes.get(i);
        dep = loneNodes.get(i);
        if (gov.getChildren().size() <= 1) {
          int k = 0;
          parent = gov.getParent();
          children = parent.getChildren();

          for (Node n : children) {
            if (n == gov) {
              gov.getParent().replaceChild(k, dep);
              dep.setParent(gov.getParent());
            }
            k++;
          }
        }
      }
      // Mark head nodes with appropriate label:
      int k = 0;
      for (Node n : loneNodes) {
        if (k != 0) {
          // Compare label values rather than object references.
          if (n.getLabel() != null && n.getLabel().equals(n.getParent().getLabel())) {
            n.setLabel("head");
          }
        } else {
          n.setLabel("null");
        }
        k++;
      }
      // Sort lexical children of each governing node in lexical order

      for (Node n : governingNodes) {
        n.sortChildrenByInitialIndex();
      }

      // combine with alignment
      if (use_alignments) {
        t.initialize(alignment.readNextAlign());
      } else {
        t.initializeUnaligned();
      }

      // write tree to file
      treeWriter.write(t);

      // print tree to console

      System.out.println(t.toSentence());
      if (verbose) {
        System.err.println(t.toString());
        // t.recursivePrint();
      }
      System.err.println("#######################################################################");
    }
    long stop = System.currentTimeMillis();
    System.err.println("...done! [" + (stop - start) / 1000 + " sec].");

    treeWriter.close();
  }
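The Stanford-parser core of parse() reduces to a few lines. A minimal sketch using the same englishPCFG model (exact output formatting depends on the CoreNLP version):

  import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
  import edu.stanford.nlp.trees.Tree;

  public class MiniParseDemo {
    public static void main(String[] args) {
      LexicalizedParser lp =
          LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
      Tree parse = lp.parse("The slug crawled over the grass.");
      parse.pennPrint();                        // the constituency tree
      System.out.println(parse.taggedYield());  // the List<TaggedWord> iterated above
    }
  }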
Example #13
  public void generatePOSLexDensityMatrices(int minDistance, int maxDistance) throws IOException {
    POSDensityMatrix = new HashMap<OrderedPair, Integer>();
    POSLexDensityMatrix = new HashMap<OrderedPair, Integer>();
    File[] files = inputDir.listFiles();
    for (File curFile : files) {
      if (!curFile.getName().endsWith(datafileExtension)) continue;
      System.out.print("Processing file: " + curFile + " ...");
      BufferedReader in = new BufferedReader(new FileReader(curFile));
      String line = in.readLine(); // the first line is read and discarded
      TURNS:
      while ((line = in.readLine()) != null) {
        String[] values = line.split("\\|", -1);
        if (values[0].equalsIgnoreCase("server")
            || values[1].equalsIgnoreCase("server")
            || values[2].equalsIgnoreCase("server")
            || values[9].isEmpty()) continue;

        String curTurn = values[8];

        // System.out.println("Processing text: " + curTurn);
        String spellingCorrected = fixSpelling(curTurn);

        if (spellingCorrected.trim().isEmpty()) {
          //   System.out.println("EMPTY. SKIPPING THIS.");
          continue;
        }
        int distance = 0;
        List<ArrayList<? extends HasWord>> sentences =
            MaxentTagger.tokenizeText(new StringReader(spellingCorrected));
        if (sentences.isEmpty()) {
          continue;
        }
        for (ArrayList<? extends HasWord> sent : sentences) {
          ArrayList<TaggedWord> taggedSentence = tagger.tagSentence(sent);
          boolean lastSentence = (sent == sentences.get(sentences.size() - 1));
          if (lastSentence) {
            taggedSentence.add(new TaggedWord("", "EOT"));
          }
          if (taggedSentence.size() < 2) continue;
          TaggedWord prev = taggedSentence.get(0);
          // System.out.print(prev.word() + ":" + prev.tag() + ", ");
          for (int i = 1; i < taggedSentence.size(); i++) {
            TaggedWord cur = taggedSentence.get(i);
            distance++;

            if (maxDistance > 0 && distance > maxDistance) continue TURNS;
            if (distance < minDistance) {
              prev = cur;

              continue;
            }
            // System.out.print(cur.word() + ":" + cur.tag() + ", ");
            if (filter(cur.word())) continue;

            OrderedPair keyPOS;
            OrderedPair keyLex;
            keyPOS = new OrderedPair(prev.tag(), cur.tag());
            keyLex =
                new OrderedPair(
                    prev.tag(),
                    (misspellings.containsKey(cur.word())
                        ? misspellings.get(cur.word())
                        : cur.word()));
            if (POSDensityMatrix.containsKey(keyPOS)) {
              POSDensityMatrix.put(keyPOS, POSDensityMatrix.get(keyPOS) + 1);
            } else {
              POSDensityMatrix.put(keyPOS, 1);
            }
            // POSLex doesn't make sense at end of turn.
            if (lastSentence && i == taggedSentence.size() - 1) break;
            if (POSLexDensityMatrix.containsKey(keyLex)) {
              POSLexDensityMatrix.put(keyLex, POSLexDensityMatrix.get(keyLex) + 1);
            } else {
              POSLexDensityMatrix.put(keyLex, 1);
            }

            prev = cur;
          }
        }
      }
      in.close();
      System.out.println("done.");
    }
  }
Example #14
  public TreeMap<String, Float> averageDistancesFromTurnBeginning() throws IOException {
    TreeMap<String, Float> sumDistances = new TreeMap<String, Float>();

    TreeMap<String, Float> counts = new TreeMap<String, Float>();
    File[] files = inputDir.listFiles();
    for (File curFile : files) {
      if (!curFile.getName().endsWith(datafileExtension)) continue;
      System.out.print("Processing file: " + curFile + " ...");
      BufferedReader in = new BufferedReader(new FileReader(curFile));
      String line = in.readLine(); // the first line is read and discarded

      while ((line = in.readLine()) != null) {
        String[] values = line.split("\\|", -1);
        if (values[0].equalsIgnoreCase("server")
            || values[1].equalsIgnoreCase("server")
            || values[2].equalsIgnoreCase("server")
            || values[9].isEmpty()) continue;
        String curTurn = values[8];

        String spellingCorrected = fixSpelling(curTurn);
        float distance = 0.0f;

        if (spellingCorrected.trim().isEmpty()) {
          //   System.out.println("EMPTY. SKIPPING THIS.");
          continue;
        }
        List<ArrayList<? extends HasWord>> sentences =
            MaxentTagger.tokenizeText(new StringReader(spellingCorrected));
        if (sentences.isEmpty()) {
          continue;
        }
        for (ArrayList<? extends HasWord> sent : sentences) {
          ArrayList<TaggedWord> taggedSentence = tagger.tagSentence(sent);
          boolean lastSentence = (sent == sentences.get(sentences.size() - 1));
          if (lastSentence) {
            taggedSentence.add(new TaggedWord("", "EOT"));
          }

          for (int i = 0; i < taggedSentence.size(); i++) {
            TaggedWord cur = taggedSentence.get(i);
            distance++;
            // if (cur.tag().equals("DT")) System.out.println("Turn was:"+spellingCorrected+"\nDT
            // Dist: "+distance);
            if (sumDistances.containsKey(cur.tag())) {
              sumDistances.put(cur.tag(), sumDistances.get(cur.tag()) + distance);
              counts.put(cur.tag(), counts.get(cur.tag()) + 1);

            } else {
              sumDistances.put(cur.tag(), distance);
              counts.put(cur.tag(), 1.0f);
            }
          }
        }
      }
      in.close();
    }
    // System.out.println(sumDistances);
    // System.out.println(counts);
    TreeMap<String, Float> averages = new TreeMap<String, Float>();
    for (String tag : sumDistances.keySet()) {
      averages.put(tag, sumDistances.get(tag) / counts.get(tag));
    }
    return averages;
  }
Example #15
 public static TaggedWord verbToBaseTense(TaggedWord verb) {
   Morphology wordMorpher = new Morphology();
   return new TaggedWord(wordMorpher.stem(verb.word()), "VB");
 }
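A usage sketch for verbToBaseTense; the method is repeated so the snippet compiles on its own:

  import edu.stanford.nlp.ling.TaggedWord;
  import edu.stanford.nlp.process.Morphology;

  public class BaseTenseDemo {
    public static TaggedWord verbToBaseTense(TaggedWord verb) { // copied from Example #15
      Morphology wordMorpher = new Morphology();
      return new TaggedWord(wordMorpher.stem(verb.word()), "VB");
    }

    public static void main(String[] args) {
      // Morphology.stem() lemmatizes the surface form; the tag is reset to base-form VB.
      System.out.println(verbToBaseTense(new TaggedWord("crawled", "VBD")).word()); // expected: crawl
    }
  }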