Example No. 1
 private static Sentence wordify(List<?> wList) {
   Sentence s = new Sentence();
   // Each element becomes a Word via its toString().
   for (Object obj : wList) {
     s.add(new Word(obj.toString()));
   }
   return s;
 }
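A minimal usage sketch for Example No. 1 (hypothetical caller; assumes wordify is visible and the legacy edu.stanford.nlp.ling.Sentence/Word classes are available):

 // Hypothetical driver: any element type works, since wordify only needs toString().
 private static void wordifyDemo() {
   List<String> tokens = Arrays.asList("The", "quick", "fox");
   Sentence s = wordify(tokens); // a Sentence of three Word objects
   System.out.println(s.size()); // prints 3
 }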
Example No. 2
  public static void main(String[] args) throws Exception {
    if (args.length != 2) {
      System.err.println("usage: java TaggerDemo2 modelFile fileToTag");
      return;
    }
    MaxentTagger tagger = new MaxentTagger(args[0]);
    TokenizerFactory<CoreLabel> ptbTokenizerFactory =
        PTBTokenizer.factory(new CoreLabelTokenFactory(), "untokenizable=noneKeep");
    BufferedReader r =
        new BufferedReader(new InputStreamReader(new FileInputStream(args[1]), "utf-8"));

    PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, "utf-8"));
    DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(r);
    documentPreprocessor.setTokenizerFactory(ptbTokenizerFactory);
    for (List<HasWord> sentence : documentPreprocessor) {
      List<TaggedWord> tSentence = tagger.tagSentence(sentence);
      pw.println(Sentence.listToString(tSentence, false));
    }

    // print the adjectives in one more sentence. This shows how to get at words and tags in a
    // tagged sentence.
    List<HasWord> sent =
        Sentence.toWordList(
            "The", "slimy", "slug", "crawled", "over", "the", "long", ",", "green", "grass", ".");
    List<TaggedWord> taggedSent = tagger.tagSentence(sent);
    for (TaggedWord tw : taggedSent) {
      if (tw.tag().startsWith("JJ")) {
        pw.println(tw.word());
      }
    }

    pw.close();
  }
Example No. 3
  private boolean LexicalAnalyzer(ArrayList<Word> words, int index, String newWord) {
    // Parse the original sentence.
    String[] sent = toSentence(words);
    List<CoreLabel> rawWords = Sentence.toCoreLabelList(sent);
    Tree parse = lp.apply(rawWords);
    String oldTree = parse.toString();

    // Substitute the candidate word and parse again.
    words.get(index).setNewValue(newWord);
    sent = toSentence(words);
    rawWords = Sentence.toCoreLabelList(sent);
    parse = lp.apply(rawWords);
    String newTree = parse.toString();

    // Splice the new word into the old tree string, then compare: if the two
    // parses match, the substitution preserved the sentence structure.
    oldTree = oldTree.replaceAll(words.get(index).getOrigValue() + "[)]", newWord + ")");
    System.out.println(oldTree + "\n" + newTree);

    if (oldTree.equals(newTree)) {
      // Capitalize the replacement if it is the first word of the sentence.
      if (index == 0) {
        String str = words.get(index).getNewValue();
        String cap = str.substring(0, 1).toUpperCase() + str.substring(1);
        words.get(index).setNewValue(cap);
      }
      return true;
    } else {
      words.get(index).setNewValue(null);
      return false;
    }
  }
Example No. 4
  @Override
  public void evaluate(Tree guess, Tree gold, PrintWriter pw) {
    if (gold == null || guess == null) {
      System.err.printf(
          "%s: Cannot compare against a null gold or guess tree!\n", this.getClass().getName());
      return;

    } else if (guess.yield().size() != gold.yield().size()) {
      System.err.println("Warning: yield differs:");
      System.err.println("Guess: " + Sentence.listToString(guess.yield()));
      System.err.println("Gold:  " + Sentence.listToString(gold.yield()));
    }

    super.evaluate(guess, gold, pw);
  }
Example No. 5
 public List<? extends HasWord> defaultTestSentence() {
   String[] sent = {
     "H", "MWX", "MTPLC", "LA", "RQ", "M", "H", "TWPEH", "H", "MBIFH", "ALA", "GM", "M", "DRKI",
     "H", "HERMH", "yyDOT"
   };
   return Sentence.toWordList(sent);
 }
Example No. 6
  public static void generate(String model, String fileToTag, String outfile) throws Exception {

    MaxentTagger tagger = new MaxentTagger(model);
    PrintWriter pw =
        new PrintWriter(new OutputStreamWriter(new FileOutputStream(outfile), "utf-8"));

    // Input format: one token per line; a blank line ends a sentence.
    BufferedReader br = new BufferedReader(new FileReader(fileToTag));
    String line;
    ArrayList<String> toks = new ArrayList<>();
    while ((line = br.readLine()) != null) {
      if (line.isEmpty()) {
        String[] params = toks.toArray(new String[0]);
        List<HasWord> sent = Sentence.toWordList(params);
        List<TaggedWord> taggedSent = tagger.tagSentence(sent);
        for (TaggedWord tw : taggedSent) {
          pw.println(tw.word() + " " + tw.tag());
        }
        pw.println();
        toks = new ArrayList<>();
      } else {
        toks.add(line);
      }
    }
    // Bug fix: tag the final sentence when the file lacks a trailing blank line.
    if (!toks.isEmpty()) {
      List<HasWord> sent = Sentence.toWordList(toks.toArray(new String[0]));
      for (TaggedWord tw : tagger.tagSentence(sent)) {
        pw.println(tw.word() + " " + tw.tag());
      }
      pw.println();
    }
    br.close();
    pw.close();
  }
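For reference, a tiny hypothetical fileToTag for generate, in the format the read loop implies (one token per line, a blank line between sentences):

The
dog
barks

It
runs

The output pairs each token with its model-assigned tag, mirroring the blank-line sentence breaks.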
Example No. 7
  public static ArrayList<ArrayList<TaggedWord>> tag(String sentencedoc) {
    try {
      MaxentTagger tagger = new MaxentTagger("data/wsj-0-18-bidirectional-distsim.tagger");

      List<List<HasWord>> sentences =
          MaxentTagger.tokenizeText(new BufferedReader(new FileReader(sentencedoc)));

      ArrayList<ArrayList<TaggedWord>> taggedsentences = new ArrayList<>(sentences.size());

      for (List<HasWord> sentence : sentences) {
        ArrayList<TaggedWord> tSentence = tagger.tagSentence(sentence);
        taggedsentences.add(tSentence);
        System.out.println(Sentence.listToString(tSentence, false));
      }

      return taggedsentences;
    } catch (IOException | ClassNotFoundException e) {
      e.printStackTrace();
      return null;
    }
  }
Example No. 8
 /** Return a default sentence for the language (for testing) */
 @Override
 public ArrayList<Word> defaultTestSentence() {
   // Note: "\u951f\u65a4\u62f7" decodes to the classic UTF-8-to-GBK mojibake
   // sequence, so four of these six tokens are corrupted; only "\u5b66\u6821"
   // (school) and "\u5b66\u4e60" (study) are real words. The original tokens
   // are not recoverable from this source.
   return Sentence.toUntaggedList(
       "\u951f\u65a4\u62f7",
       "\u951f\u65a4\u62f7",
       "\u5b66\u6821",
       "\u951f\u65a4\u62f7",
       "\u5b66\u4e60",
       "\u951f\u65a4\u62f7");
 }
Example No. 9
  public static void handletaggedsentences(ArrayList<ArrayList<TaggedWord>> taggedsentences) {

    // rule 7: if a noun follows a plural demonstrative adjective (these or those)
    // (not implemented here; see the sketch after this method)

    for (ArrayList<TaggedWord> sentence : taggedsentences) {
      for (int i = 0; i < sentence.size(); i++) {

        // Look for "of" (IN) followed by a singular noun (NN), e.g.
        // "... other/JJ violence/NN behavior/NN of/IN victim/NN"
        if (i > 0
            && sentence.get(i - 1).tag().equals("IN")
            && sentence.get(i - 1).word().equals("of")
            && sentence.get(i).tag().equals("NN")) {
          String word1 = sentence.get(i - 1).word();
          String word2 = sentence.get(i).word();
          int a = 1; // breakpoint anchor: word1/word2 are only inspected in a debugger
        }

        // rule 1 (disabled): DT NN DT --> DT NN (drop the trailing determiner)
        // rule 2 (disabled): JJ NN --> NN JJ (swap adjective and noun)
      }
    }

    // if any rules need to be run after the first set, go here:

    for (ArrayList<TaggedWord> sentence : taggedsentences) {
      System.out.println(Sentence.listToString(sentence, false));
    }
  }
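The "rule 7" named in the comment above is never implemented in the method body. A minimal sketch of what such a check could look like (the helper name and the DT-tag assumption for "these"/"those" are mine, not the original author's):

  // Sketch only: flag a noun that follows a plural demonstrative.
  // In the Penn Treebank tagset, "these" and "those" are tagged DT.
  private static boolean followsPluralDemonstrative(List<TaggedWord> sentence, int i) {
    if (i == 0) {
      return false;
    }
    TaggedWord prev = sentence.get(i - 1);
    String w = prev.word().toLowerCase();
    return prev.tag().equals("DT")
        && (w.equals("these") || w.equals("those"))
        && sentence.get(i).tag().startsWith("NN");
  }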
Example No. 10
 public void testInitialStateFromTagged() {
   String[] words = {"This", "is", "a", "short", "test", "."};
   String[] tags = {"DT", "VBZ", "DT", "JJ", "NN", "."};
   assertEquals(words.length, tags.length);
   List<TaggedWord> sentence = Sentence.toTaggedList(Arrays.asList(words), Arrays.asList(tags));
   State state = ShiftReduceParser.initialStateFromTaggedSentence(sentence);
   for (int i = 0; i < words.length; ++i) {
     assertEquals(tags[i], state.sentence.get(i).value());
     assertEquals(1, state.sentence.get(i).children().length);
     assertEquals(words[i], state.sentence.get(i).children()[0].value());
   }
 }
Example No. 11
 /** TODO: clearly this should be a default method in ParserQuery once Java 8 comes out */
 @Override
 public void restoreOriginalWords(Tree tree) {
   if (originalSentence == null || tree == null) {
     return;
   }
   List<Tree> leaves = tree.getLeaves();
   if (leaves.size() != originalSentence.size()) {
     throw new IllegalStateException(
         "originalWords and sentence of different sizes: "
             + originalSentence.size()
             + " vs. "
             + leaves.size()
             + "\n Orig: "
             + Sentence.listToString(originalSentence)
             + "\n Pars: "
             + Sentence.listToString(leaves));
   }
   // TODO: get rid of this cast
   Iterator<? extends Label> wordsIterator =
       (Iterator<? extends Label>) originalSentence.iterator();
   for (Tree leaf : leaves) {
     leaf.setLabel(wordsIterator.next());
   }
 }
Example No. 12
  /** Reads an annotation from the given filename using the requested input format. */
  public static List<Annotation> getAnnotations(
      StanfordCoreNLP tokenizer, Input inputFormat, String filename, boolean filterUnknown) {
    switch (inputFormat) {
      case TEXT:
        {
          String text = IOUtils.slurpFileNoExceptions(filename);
          Annotation annotation = new Annotation(text);
          tokenizer.annotate(annotation);
          List<Annotation> annotations = Generics.newArrayList();
          for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
            Annotation nextAnnotation =
                new Annotation(sentence.get(CoreAnnotations.TextAnnotation.class));
            nextAnnotation.set(
                CoreAnnotations.SentencesAnnotation.class, Collections.singletonList(sentence));
            annotations.add(nextAnnotation);
          }
          return annotations;
        }
      case TREES:
        {
          List<Tree> trees;
          if (filterUnknown) {
            trees = SentimentUtils.readTreesWithGoldLabels(filename);
            trees = SentimentUtils.filterUnknownRoots(trees);
          } else {
            trees = Generics.newArrayList();
            MemoryTreebank treebank = new MemoryTreebank("utf-8");
            treebank.loadPath(filename, null);
            for (Tree tree : treebank) {
              trees.add(tree);
            }
          }

          List<Annotation> annotations = Generics.newArrayList();
          for (Tree tree : trees) {
            CoreMap sentence = new Annotation(Sentence.listToString(tree.yield()));
            sentence.set(TreeCoreAnnotations.TreeAnnotation.class, tree);
            List<CoreMap> sentences = Collections.singletonList(sentence);
            Annotation annotation = new Annotation("");
            annotation.set(CoreAnnotations.SentencesAnnotation.class, sentences);
            annotations.add(annotation);
          }
          return annotations;
        }
      default:
        throw new IllegalArgumentException("Unknown format " + inputFormat);
    }
  }
Example No. 13
 /**
  * Turns a sentence into a flat phrasal tree. The structure is S -> tag*, where each tag in turn
  * dominates a single word. The tag is either taken from the word's label or made "WD". The tag
  * and phrasal node each have a StringLabel.
  *
  * @param s The Sentence to make the Tree from
  * @param lf The LabelFactory with which to create the new Tree labels
  * @return The one phrasal level Tree
  */
 public static Tree toFlatTree(Sentence<?> s, LabelFactory lf) {
   List<Tree> daughters = new ArrayList<Tree>(s.length());
   for (HasWord word : s) {
     Tree wordNode = new LabeledScoredTreeLeaf(lf.newLabel(word.word()));
     if (word instanceof TaggedWord) {
       TaggedWord taggedWord = (TaggedWord) word;
       wordNode =
           new LabeledScoredTreeNode(
               new StringLabel(taggedWord.tag()), Collections.singletonList(wordNode));
     } else {
       wordNode =
           new LabeledScoredTreeNode(lf.newLabel("WD"), Collections.singletonList(wordNode));
     }
     daughters.add(wordNode);
   }
   return new LabeledScoredTreeNode(new StringLabel("S"), daughters);
 }
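A short hypothetical driver showing the one-level tree the method builds (StringLabelFactory is an assumed LabelFactory choice, and the sentence contents are made up):

  // Sketch: two tagged words in, a flat (S (DT The) (NN dog)) tree out.
  Sentence<TaggedWord> s = new Sentence<TaggedWord>();
  s.add(new TaggedWord("The", "DT"));
  s.add(new TaggedWord("dog", "NN"));
  Tree flat = toFlatTree(s, new StringLabelFactory());
  flat.pennPrint(); // prints (S (DT The) (NN dog))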
Example No. 14
  public static void startParsing(String paragraph) throws FileNotFoundException, IOException {
    StringBuilder parseTrees = new StringBuilder();

    // Could we just split on newlines, given the paragraph is already sentence-split?
    Reader reader = new StringReader(paragraph);
    DocumentPreprocessor dp = new DocumentPreprocessor(reader);
    List<String> sentenceList = new ArrayList<String>();

    for (List<HasWord> sentence : dp) {
      String sentenceString = Sentence.listToString(sentence);
      sentenceList.add(sentenceString);
    }

    for (String sentence : sentenceList) {
      parseTrees.append(createParseTree(sentence));
    }
    writeToFile(parseTrees.toString(), "trees.txt");
  }
Example No. 15
  /**
   * for testing -- CURRENTLY BROKEN!!!
   *
   * @param args input dir and output filename
   * @throws IOException
   */
  public static void main(String[] args) throws IOException {
    if (args.length != 3) {
      throw new RuntimeException("args: treebankPath trainNums testNums");
    }

    ChineseTreebankParserParams ctpp = new ChineseTreebankParserParams();
    ctpp.charTags = true;
    // TODO: these options are getting clobbered by reading in the
    // parser object (unless it's a text file parser?)
    Options op = new Options(ctpp);
    op.doDep = false;
    op.testOptions.maxLength = 90;

    LexicalizedParser lp;
    try {
      FileFilter trainFilt = new NumberRangesFileFilter(args[1], false);

      lp = LexicalizedParser.trainFromTreebank(args[0], trainFilt, op);
      try {
        String filename = "chineseCharTagPCFG.ser.gz";
        System.err.println("Writing parser in serialized format to file " + filename + ' ');
        System.err.flush();
        ObjectOutputStream out = IOUtils.writeStreamFromString(filename);

        out.writeObject(lp);
        out.close();
        System.err.println("done.");
      } catch (IOException ioe) {
        ioe.printStackTrace();
      }
    } catch (IllegalArgumentException e) {
      lp = LexicalizedParser.loadModel(args[1], op);
    }

    FileFilter testFilt = new NumberRangesFileFilter(args[2], false);
    MemoryTreebank testTreebank = ctpp.memoryTreebank();
    testTreebank.loadPath(new File(args[0]), testFilt);
    PrintWriter pw =
        new PrintWriter(new OutputStreamWriter(new FileOutputStream("out.chi"), "GB18030"), true);
    WordCatEquivalenceClasser eqclass = new WordCatEquivalenceClasser();
    WordCatEqualityChecker eqcheck = new WordCatEqualityChecker();
    EquivalenceClassEval eval = new EquivalenceClassEval(eqclass, eqcheck);
    //    System.out.println("Preterminals:" + preterminals);
    System.out.println("Testing...");
    for (Tree gold : testTreebank) {
      Tree tree;
      try {
        tree = lp.parseTree(gold.yieldHasWord());
        if (tree == null) {
          System.out.println("Failed to parse " + gold.yieldHasWord());
          continue;
        }
      } catch (Exception e) {
        e.printStackTrace();
        continue;
      }
      gold = gold.firstChild();
      pw.println(Sentence.listToString(gold.preTerminalYield()));
      pw.println(Sentence.listToString(gold.yield()));
      gold.pennPrint(pw);

      pw.println(tree.preTerminalYield());
      pw.println(tree.yield());
      tree.pennPrint(pw);
      //      Collection allBrackets = WordCatConstituent.allBrackets(tree);
      //      Collection goldBrackets = WordCatConstituent.allBrackets(gold);
      //      eval.eval(allBrackets, goldBrackets);
      eval.displayLast();
    }
    System.out.println();
    System.out.println();
    eval.display();
  }
Example No. 16
 private static Sentence addLast(Sentence s) {
   Sentence s2 = new Sentence(s);
   // s2.add(new StringLabel(Lexicon.BOUNDARY));
   s2.add(new Word(Lexicon.BOUNDARY));
   return s2;
 }
Example No. 17
 private static Sentence cutLast(Sentence s) {
   return new Sentence(s.subList(0, s.size() - 1));
 }
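Examples No. 16 and 17 are inverses around the parser's boundary token; a quick sketch (the sentence contents are made up):

 // cutLast(addLast(s)) yields a copy of s: addLast appends Lexicon.BOUNDARY,
 // cutLast drops the final element.
 Sentence s = new Sentence();
 s.add(new Word("Hello"));
 Sentence bounded = addLast(s); // [Hello, <boundary>]
 Sentence restored = cutLast(bounded); // [Hello]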
Example No. 18
  public ArrayList<String> getKeyWordsDependency(String sentence, String keyword) {
    LexicalizedParser lp =
        LexicalizedParser.loadModel(
            "/home/mingrui/Desktop/englishPCFG.ser.gz",
            "-maxLength",
            "80",
            "-retainTmpSubcategories");
    TreebankLanguagePack tlp = new PennTreebankLanguagePack();
    // Uncomment the following line to obtain original Stanford Dependencies
    // tlp.setGenerateOriginalDependencies(true);
    GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
    String[] array = sentence.split("\\s+");
    Tree parse = lp.apply(Sentence.toWordList(array));
    GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
    Collection<TypedDependency> tdl = gs.typedDependenciesCCprocessed();
    ArrayList<String> keywordsDependency = new ArrayList<String>();
    ArrayList<String> keywordsDependencyWithLemmatization = new ArrayList<String>();
    for (TypedDependency t : tdl) {
      // A TypedDependency prints as "reln(governor-idx, dependent-idx)".
      String d = t.toString();
      String dependencyType = d.substring(0, d.indexOf("("));
      // Bug fix: the pair runs from the "(" to the closing ")", not back to "(".
      String pair = d.substring(d.indexOf("(") + 1, d.lastIndexOf(")"));
      String[] terms = pair.split(",");
      String term1 = terms[0].trim();
      String term2 = terms[1].trim();

      // Match keywords with the terms in the tuples, if matched, add the
      // tuple into the arraylist
      String[] wordsplitted = keyword.split(" ");
      for (String key : wordsplitted) {
        if (term1.equals(key)) {
          keywordsDependency.add(t.toString());
        }
        if (term2.equals(key)) {
          keywordsDependency.add(t.toString());
        }
      }
    }

    String lemmatizedKeywords = lemmatize(keyword);
    int lbefore = keyword.split(" ").length;
    int lafter = lemmatizedKeywords.split(" ").length;
    if (lbefore == lafter) {
      return keywordsDependency;
    } else {
      // Incomplete in the original: the lemmatized dependent is located but never
      // added to keywordsDependencyWithLemmatization, so this branch returns an
      // empty list.
      String[] split = keyword.split(" ");
      for (String s : split) {
        String[] lemmas = lemmatize(s).split(" ");
        // Compare the length of one keyword or key phrase before and after lemmatization.
        if (lemmas.length == s.split(" ").length) {
          continue;
        }
        for (String tuple : keywordsDependency) {
          // Find the tuple that contains the original keyword/key phrase.
          if (getTupleTerms(tuple)[0].equals(s)) {
            String dependent = getTupleTerms(tuple)[1];
          }
        }
      }
      return keywordsDependencyWithLemmatization;
    }
  }