Example #1
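  /**
   * Compares a guessed parse against the gold parse: refuses null trees, warns (but still
   * evaluates) when the two yields differ in length, and delegates scoring to the superclass.
   */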
  @Override
  public void evaluate(Tree guess, Tree gold, PrintWriter pw) {
    if (gold == null || guess == null) {
      System.err.printf(
          "%s: Cannot compare against a null gold or guess tree!\n", this.getClass().getName());
      return;

    } else if (guess.yield().size() != gold.yield().size()) {
      System.err.println("Warning: yield differs:");
      System.err.println("Guess: " + Sentence.listToString(guess.yield()));
      System.err.println("Gold:  " + Sentence.listToString(gold.yield()));
    }

    super.evaluate(guess, gold, pw);
  }
Example #2
  /**
   * Tags every sentence in the given plain-text file with the bidirectional WSJ model and
   * returns the tagged sentences, or null if loading the model or reading the file fails.
   */
  public static ArrayList<ArrayList<TaggedWord>> tag(String sentencedoc) {
    // try-with-resources so the input file is always closed
    try (BufferedReader reader = new BufferedReader(new FileReader(sentencedoc))) {

      // load the pretrained bidirectional model trained on WSJ sections 0-18
      MaxentTagger tagger = new MaxentTagger("data/wsj-0-18-bidirectional-distsim.tagger");

      // split the document into sentences and tokenize each one
      List<List<HasWord>> sentences = MaxentTagger.tokenizeText(reader);

      ArrayList<ArrayList<TaggedWord>> taggedsentences =
          new ArrayList<ArrayList<TaggedWord>>(sentences.size());

      for (List<HasWord> sentence : sentences) {
        ArrayList<TaggedWord> tSentence = tagger.tagSentence(sentence);
        taggedsentences.add(tSentence);
        System.out.println(Sentence.listToString(tSentence, false));
      }

      return taggedsentences;
    } catch (IOException e) {
      e.printStackTrace();
    } catch (ClassNotFoundException e) {
      e.printStackTrace();
    }
    return null;
  }
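For reference, a minimal driver for the tag(...) helper above might look like the sketch below, added as a main method to the same class (which already imports ArrayList and TaggedWord); the input path is a placeholder, not a file from the original project.

  public static void main(String[] args) {
    // placeholder path: any plain-text file of English sentences will do
    ArrayList<ArrayList<TaggedWord>> tagged = tag("data/input.txt");
    if (tagged == null) {
      System.err.println("Tagging failed; see the stack trace printed above.");
      return;
    }
    System.out.println("Tagged " + tagged.size() + " sentence(s).");
  }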
Example #3
  public static void main(String[] args) throws Exception {
    if (args.length != 2) {
      System.err.println("usage: java TaggerDemo2 modelFile fileToTag");
      return;
    }
    MaxentTagger tagger = new MaxentTagger(args[0]);
    TokenizerFactory<CoreLabel> ptbTokenizerFactory =
        PTBTokenizer.factory(new CoreLabelTokenFactory(), "untokenizable=noneKeep");
    BufferedReader r =
        new BufferedReader(new InputStreamReader(new FileInputStream(args[1]), "utf-8"));

    PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, "utf-8"));
    DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(r);
    documentPreprocessor.setTokenizerFactory(ptbTokenizerFactory);
    for (List<HasWord> sentence : documentPreprocessor) {
      List<TaggedWord> tSentence = tagger.tagSentence(sentence);
      pw.println(Sentence.listToString(tSentence, false));
    }

    // print the adjectives in one more sentence. This shows how to get at words and tags in a
    // tagged sentence.
    List<HasWord> sent =
        Sentence.toWordList(
            "The", "slimy", "slug", "crawled", "over", "the", "long", ",", "green", "grass", ".");
    List<TaggedWord> taggedSent = tagger.tagSentence(sent);
    for (TaggedWord tw : taggedSent) {
      if (tw.tag().startsWith("JJ")) {
        pw.println(tw.word());
      }
    }

    pw.close();
  }
Example #4
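  /**
   * Applies hand-written post-processing rules to the tagged sentences and prints each
   * resulting sentence. Most rules are currently commented out; the only active check just
   * locates "of" followed by a singular noun without modifying the sentence.
   */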
  public static void handletaggedsentences(ArrayList<ArrayList<TaggedWord>> taggedsentences) {

    // rule 7: handle a noun that follows a plural demonstrative adjective ("these" or "those");
    // note that the only active check below currently looks for a different pattern.

    for (ArrayList<TaggedWord> sentence : taggedsentences) {
      for (int i = 0; i < sentence.size(); i++) {

        // preposition "of" immediately followed by a singular noun (NN); see the tagged example below
        if (i > 0) {
          // often/RB, become/VBN, robbery/NN, ,/,, sexual/JJ, harassment/NN, and/CC, other/JJ,
          // violence/NN, behavior/NN, of/IN, victim/NN
          if (sentence.get(i - 1).tag().equals("IN")
              && sentence.get(i - 1).word().equals("of")
              && sentence.get(i).tag().equals("NN")) {
            // NOTE: word1, word2, and the dummy assignment below are currently unused;
            // this branch only identifies the pattern and does not rewrite the sentence.
            String word1 = sentence.get(i - 1).word();
            String word2 = sentence.get(i).word();

            int a = 1;
          }
        }

        //				//rule 1: DT NN DT --> DT NN
        //				if (i<sentence.size()-2)
        //				{
        //					if (sentence.get(i).tag().equals("DT") && sentence.get(i+1).tag().equals("NN")
        //							&& sentence.get(i+2).tag().equals("DT"))
        //					{
        //						sentence.remove(i + 2);
        //					}
        //				}
        //
        //				//rule 2: JJ NN --> NN JJ
        //				if (i>0)
        //				{
        //					if (sentence.get(i).tag().equals("JJ") && sentence.get(i-1).tag().equals("NN"))
        //					{
        //						TaggedWord temptagged2 = sentence.get(i);
        //
        //						sentence.remove(i);
        //						sentence.add(i-1, temptagged2);
        //					}
        //				}

      }
    }

    // if any rules need to be run after the first set, go here:

    for (ArrayList<TaggedWord> sentence : taggedsentences) {
      // placeholder for second-pass rules; currently a no-op
      for (int i = 0; i < sentence.size(); i++) {}

      System.out.println(Sentence.listToString(sentence, false));
    }
  }
Example #5
  /** TODO: clearly this should be a default method in ParserQuery once Java 8 comes out */
  @Override
  public void restoreOriginalWords(Tree tree) {
    if (originalSentence == null || tree == null) {
      return;
    }
    List<Tree> leaves = tree.getLeaves();
    if (leaves.size() != originalSentence.size()) {
      throw new IllegalStateException(
          "originalWords and sentence of different sizes: "
              + originalSentence.size()
              + " vs. "
              + leaves.size()
              + "\n Orig: "
              + Sentence.listToString(originalSentence)
              + "\n Pars: "
              + Sentence.listToString(leaves));
    }
    // TODO: get rid of this cast
    Iterator<? extends Label> wordsIterator =
        (Iterator<? extends Label>) originalSentence.iterator();
    for (Tree leaf : leaves) {
      leaf.setLabel(wordsIterator.next());
    }
  }
  /** Reads an annotation from the given filename using the requested input. */
  public static List<Annotation> getAnnotations(
      StanfordCoreNLP tokenizer, Input inputFormat, String filename, boolean filterUnknown) {
    switch (inputFormat) {
      case TEXT:
        {
          String text = IOUtils.slurpFileNoExceptions(filename);
          Annotation annotation = new Annotation(text);
          tokenizer.annotate(annotation);
          List<Annotation> annotations = Generics.newArrayList();
          for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
            Annotation nextAnnotation =
                new Annotation(sentence.get(CoreAnnotations.TextAnnotation.class));
            nextAnnotation.set(
                CoreAnnotations.SentencesAnnotation.class, Collections.singletonList(sentence));
            annotations.add(nextAnnotation);
          }
          return annotations;
        }
      case TREES:
        {
          List<Tree> trees;
          if (filterUnknown) {
            trees = SentimentUtils.readTreesWithGoldLabels(filename);
            trees = SentimentUtils.filterUnknownRoots(trees);
          } else {
            trees = Generics.newArrayList();
            MemoryTreebank treebank = new MemoryTreebank("utf-8");
            treebank.loadPath(filename, null);
            for (Tree tree : treebank) {
              trees.add(tree);
            }
          }

          List<Annotation> annotations = Generics.newArrayList();
          for (Tree tree : trees) {
            CoreMap sentence = new Annotation(Sentence.listToString(tree.yield()));
            sentence.set(TreeCoreAnnotations.TreeAnnotation.class, tree);
            List<CoreMap> sentences = Collections.singletonList(sentence);
            Annotation annotation = new Annotation("");
            annotation.set(CoreAnnotations.SentencesAnnotation.class, sentences);
            annotations.add(annotation);
          }
          return annotations;
        }
      default:
        throw new IllegalArgumentException("Unknown format " + inputFormat);
    }
  }
  /** Parses each sentence of the given paragraph and writes the resulting parse trees to trees.txt. */
  public static void startParsing(String paragraph) throws FileNotFoundException, IOException {
    StringBuilder parseTrees = new StringBuilder();

    // Could we just split on newlines, since the paragraph has already been sentence-split?
    Reader reader = new StringReader(paragraph);
    DocumentPreprocessor dp = new DocumentPreprocessor(reader);
    List<String> sentenceList = new ArrayList<String>();

    for (List<HasWord> sentence : dp) {
      String sentenceString = Sentence.listToString(sentence);
      sentenceList.add(sentenceString);
    }

    for (String sentence : sentenceList) {
      //			System.out.println(sentence);
      parseTrees.append(createParseTree(sentence));
    }
    writeToFile(parseTrees.toString(), "trees.txt");
  }
  /**
   * for testing -- CURRENTLY BROKEN!!!
   *
   * @param args input dir and output filename
   * @throws IOException
   */
  public static void main(String[] args) throws IOException {
    if (args.length != 3) {
      throw new RuntimeException("args: treebankPath trainNums testNums");
    }

    ChineseTreebankParserParams ctpp = new ChineseTreebankParserParams();
    ctpp.charTags = true;
    // TODO: these options are getting clobbered by reading in the
    // parser object (unless it's a text file parser?)
    Options op = new Options(ctpp);
    op.doDep = false;
    op.testOptions.maxLength = 90;

    LexicalizedParser lp;
    try {
      FileFilter trainFilt = new NumberRangesFileFilter(args[1], false);

      lp = LexicalizedParser.trainFromTreebank(args[0], trainFilt, op);
      try {
        String filename = "chineseCharTagPCFG.ser.gz";
        System.err.println("Writing parser in serialized format to file " + filename + ' ');
        System.err.flush();
        ObjectOutputStream out = IOUtils.writeStreamFromString(filename);

        out.writeObject(lp);
        out.close();
        System.err.println("done.");
      } catch (IOException ioe) {
        ioe.printStackTrace();
      }
    } catch (IllegalArgumentException e) {
      lp = LexicalizedParser.loadModel(args[1], op);
    }

    FileFilter testFilt = new NumberRangesFileFilter(args[2], false);
    MemoryTreebank testTreebank = ctpp.memoryTreebank();
    testTreebank.loadPath(new File(args[0]), testFilt);
    PrintWriter pw =
        new PrintWriter(new OutputStreamWriter(new FileOutputStream("out.chi"), "GB18030"), true);
    WordCatEquivalenceClasser eqclass = new WordCatEquivalenceClasser();
    WordCatEqualityChecker eqcheck = new WordCatEqualityChecker();
    EquivalenceClassEval eval = new EquivalenceClassEval(eqclass, eqcheck);
    //    System.out.println("Preterminals:" + preterminals);
    System.out.println("Testing...");
    for (Tree gold : testTreebank) {
      Tree tree;
      try {
        tree = lp.parseTree(gold.yieldHasWord());
        if (tree == null) {
          System.out.println("Failed to parse " + gold.yieldHasWord());
          continue;
        }
      } catch (Exception e) {
        e.printStackTrace();
        continue;
      }
      gold = gold.firstChild();
      pw.println(Sentence.listToString(gold.preTerminalYield()));
      pw.println(Sentence.listToString(gold.yield()));
      gold.pennPrint(pw);

      pw.println(tree.preTerminalYield());
      pw.println(tree.yield());
      tree.pennPrint(pw);
      //      Collection allBrackets = WordCatConstituent.allBrackets(tree);
      //      Collection goldBrackets = WordCatConstituent.allBrackets(gold);
      //      eval.eval(allBrackets, goldBrackets);
      eval.displayLast();
    }
    System.out.println();
    System.out.println();
    eval.display();
  }