Java LexicalizedParser 예제들, edu.stanford.nlp.parser.lexparser.LexicalizedParser Java 예제들

예제 #1

0

파일 보기

파일: StanfordParser.java 프로젝트: thomasancheriyil/Emotion-and-Sarcasm-Identification

  /**
   * Tokenizes and parses a sentence and returns the best parse tree.
   *
   * <p>The character-extent labelling that used to follow parsing is disabled, so the returned
   * tree is handed back exactly as the parser produced it (no BEGIN_KEY / END_KEY offsets are
   * set here).
   *
   * @param sentence a sentence
   * @return the best parse reported by the parser for the sentence
   * @throws IllegalStateException if the language pack or the parser has not been initialized
   */
  @SuppressWarnings("unchecked")
  public static Tree parseTree(String sentence) {
    if (tlp == null || parser == null) {
      throw new IllegalStateException("Parser has not been initialized");
    }

    log.debug("Parsing sentence");
    Tree tree;
    // parse() and getBestParse() are two separate calls on the shared parser, so both are made
    // while holding the parser's monitor.
    synchronized (parser) {
      Tokenizer tokenizer = tlp.getTokenizerFactory().getTokenizer(new StringReader(sentence));
      List<Word> words = tokenizer.tokenize();
      log.debug("Tokenization: " + words);
      parser.parse(new Sentence(words));
      tree = parser.getBestParse();
    }

    return tree;
  }

예제 #2

0

파일 보기

파일: ParseEssay.java 프로젝트: hogueyy/essayx

  /**
   * Checks whether substituting {@code newWord} for the word at {@code index} leaves the parse
   * of the sentence unchanged.
   *
   * <p>The sentence is parsed before and after the substitution. In the textual form of the
   * first parse, every occurrence of the original word directly followed by {@code ")"} is
   * replaced with the new word, and the result is compared with the second parse. Both strings
   * are printed to standard output.
   *
   * <p>When the trees match, the new value stays on the word (capitalised if {@code index} is 0).
   * Otherwise the word's new value is reset to {@code null}.
   *
   * @param words the words of the sentence
   * @param index position of the word to substitute
   * @param newWord the candidate replacement
   * @return {@code true} if the two tree strings are equal after the substitution
   */
  private boolean LexicalAnalyzer(ArrayList<Word> words, int index, String newWord) {
    Word target = words.get(index);

    // Parse the sentence as it currently stands.
    List<CoreLabel> rawWords = Sentence.toCoreLabelList(toSentence(words));
    String oldTree = lp.apply(rawWords).toString();

    // Apply the candidate word and parse again.
    target.setNewValue(newWord);
    rawWords = Sentence.toCoreLabelList(toSentence(words));
    String newTree = lp.apply(rawWords).toString();

    // Literal replacement: the words are plain text, so regex metacharacters in the original
    // word or '$' / '\' in the new word must not be interpreted.
    oldTree = oldTree.replace(target.getOrigValue() + ")", newWord + ")");
    System.out.println(oldTree + "\n" + newTree);

    if (!oldTree.equals(newTree)) {
      target.setNewValue(null);
      return false;
    }

    if (index == 0) {
      // Sentence-initial word: capitalise it, skipping empty or missing values.
      String str = target.getNewValue();
      if (str != null && !str.isEmpty()) {
        target.setNewValue(str.substring(0, 1).toUpperCase() + str.substring(1));
      }
    }
    return true;
  }

예제 #3

0

파일 보기

파일: Parser.java 프로젝트: kurokocon/ExpertSystem

  /**
   * Returns the tokens of a sentence that the parser tags as {@code NN} or {@code NNS}.
   *
   * <p>The input is split on single spaces, each piece is parsed as one word, and the returned
   * entries are the original pieces at the positions of the matching tags.
   *
   * @param string the sentence, with tokens separated by single spaces
   * @return the matching tokens in sentence order; empty if none match
   */
  public LinkedList<String> getKeyWrodsFromSentence(String string) {
    LinkedList<String> list = new LinkedList<String>();

    String[] sent = string.split(" ");
    List<HasWord> sentence = new ArrayList<HasWord>();
    for (String word : sent) {
      sentence.add(new Word(word));
    }

    Tree parse = lp.parse(sentence);

    // Read tag and position from the label itself rather than re-parsing its "tag-index"
    // string form, which breaks for tags that contain '-'.
    for (CoreLabel label : parse.taggedLabeledYield()) {
      String type = label.value();
      if ("NN".equals(type) || "NNS".equals(type)) {
        list.add(sent[label.index()]);
      }
    }
    return list;
  }

예제 #4

0

파일 보기

파일: LexicalSimilarity.java 프로젝트: sumitbhagwani/SententialSimilarity

 /**
  * Demo entry point: parses one hard-coded sentence, runs the tagged words through
  * {@code Preprocess}, and prints the word of every remaining entry on its own line.
  *
  * @param args unused
  */
 public static void main(String args[]) {
   String sentence =
       "The broader Standard & Poor's 500 Index <.SPX> shed 2.38 points, or 0.24 percent, at 995.10.";
   LexicalizedParser lp = new LexicalizedParser("englishPCFG.ser.gz");
   Tree parse = lp.apply(sentence);
   ArrayList<TaggedWord> processed = Preprocess(parse.taggedYield());
   for (TaggedWord tagged : processed) {
     System.out.println(tagged.word());
   }
 }

예제 #5

0

파일 보기

파일: StanfordParser.java 프로젝트: thomasancheriyil/Emotion-and-Sarcasm-Identification

  /**
   * Tokenizes and parses a sentence and returns the PCFG score as a confidence measure.
   *
   * @param sentence a sentence
   * @return the PCFG score the parser reports after parsing the sentence
   * @throws IllegalStateException if the language pack or the parser has not been initialized
   */
  @SuppressWarnings("unchecked")
  public static double getPCFGScore(String sentence) {
    if (tlp == null || parser == null) {
      throw new IllegalStateException("Parser has not been initialized");
    }

    log.debug("Parsing sentence");
    double score;
    // parse() and getPCFGScore() are two separate calls on the shared parser, so both are made
    // while holding the parser's monitor.
    synchronized (parser) {
      Tokenizer tokenizer = tlp.getTokenizerFactory().getTokenizer(new StringReader(sentence));
      List<Word> words = tokenizer.tokenize();
      log.debug("Tokenization: " + words);
      parser.parse(new Sentence(words));
      score = parser.getPCFGScore();
    }

    return score;
  }

예제 #6

0

파일 보기

파일: ParseEssay.java 프로젝트: hogueyy/essayx

  /**
   * Creates the essay parser: allocates the synonym list and output buffer, points WordNet at
   * its dictionary directory, and loads the English PCFG parser model.
   */
  public ParseEssay() {
    baos = new ByteArrayOutputStream();
    synonyms = new ArrayList<String>();

    // The dictionary location is set before the WordNet database instance is requested.
    System.setProperty("wordnet.database.dir", "../war/dict");
    database = WordNetDatabase.getFileInstance();

    lp = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
  }

예제 #7

0

파일 보기

파일: LexicalSimilarity.java 프로젝트: sumitbhagwani/SententialSimilarity

  /**
   * Tokenizes a sentence with a PTB tokenizer, parses it, and returns the tagged words of the
   * resulting tree.
   *
   * @param sentence the raw sentence text
   * @param lp the parser to apply to the tokens
   * @return the tagged yield of the parse
   */
  public static ArrayList<TaggedWord> StanfordParse(String sentence, LexicalizedParser lp) {
    TokenizerFactory<CoreLabel> ptbFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    List<CoreLabel> tokens = ptbFactory.getTokenizer(new StringReader(sentence)).tokenize();
    return lp.apply(tokens).taggedYield();
  }

예제 #8

0

파일 보기

파일: LexicalParsingEngine.java 프로젝트: ferojuras/EntitySearch

  /**
   * Parses the given text sentence by sentence and returns the typed dependencies of the last
   * sentence.
   *
   * <p>Each sentence's result overwrites the previous one, so only the final sentence's
   * dependencies are returned.
   *
   * @param text the text to split into sentences and parse
   * @return all typed dependencies of the last sentence, or {@code null} if the text yields no
   *     sentence or the language pack does not support grammatical structures
   */
  private Collection<TypedDependency> parseSentenceTDL(String text) {
    System.out.println("Parsing sentence...");

    TreebankLanguagePack languagePack = lp.treebankLanguagePack();
    GrammaticalStructureFactory structureFactory =
        languagePack.supportsGrammaticalStructures()
            ? languagePack.grammaticalStructureFactory()
            : null;

    Collection<TypedDependency> lastDependencies = null;
    for (List<HasWord> tokens : new DocumentPreprocessor(new StringReader(text))) {
      // Every sentence is parsed, even when no dependencies can be extracted from it.
      Tree tree = lp.apply(tokens);
      if (structureFactory == null) {
        continue;
      }
      lastDependencies = structureFactory.newGrammaticalStructure(tree).allTypedDependencies();
    }
    return lastDependencies;
  }

예제 #9

0

파일 보기

파일: StanfordParser.java 프로젝트: ag-sc/DeptDUDES

  /**
   * Tokenizes and parses a sentence and returns its CC-processed typed dependencies.
   *
   * <p>Calls {@code loadModels()} first when {@code pipeline} is still {@code null}.
   *
   * @param sentence the raw sentence text
   * @return the CC-processed typed dependencies of the parse
   */
  private static List<TypedDependency> getDependencies(String sentence) {
    if (pipeline == null) {
      loadModels();
    }

    TokenizerFactory<CoreLabel> ptbFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    List<CoreLabel> tokens = ptbFactory.getTokenizer(new StringReader(sentence)).tokenize();
    Tree tree = lp.apply(tokens);

    // The structure factory comes from the parser's own language pack.
    GrammaticalStructureFactory structureFactory =
        lp.treebankLanguagePack().grammaticalStructureFactory();
    return structureFactory.newGrammaticalStructure(tree).typedDependenciesCCprocessed();
  }

예제 #10

0

파일 보기

파일: ParserDemo3.java 프로젝트: danlassiter/gradable-adjective-corpus-analysis

  /**
   * Parses every line of the file named by the first argument and prints, for each line, the
   * original text followed by its Penn tree and collapsed typed dependencies.
   *
   * <p>Failures are reported on standard error; the closing banner is printed in every case.
   *
   * @param args command-line arguments; {@code args[0]} is the path of the input file
   */
  public static void main(String[] args) {
    System.out.println("\n\n\nSTART\n\n\n");
    if (args.length < 1) {
      // Report the missing argument explicitly instead of an index-out-of-bounds message.
      System.err.println("ERROR: expected the path of the input file as the first argument");
    } else {
      BufferedReader br = null;
      try {
        // Open the file whose path is passed as the first argument.
        br = new BufferedReader(new InputStreamReader(new FileInputStream(args[0])));

        // Prepare parser, tokenizer and tree printer.
        LexicalizedParser lp = new LexicalizedParser("englishPCFG.ser.gz");
        TokenizerFactory tf = PTBTokenizer.factory(false, new WordTokenFactory());
        TreePrint tp = new TreePrint("penn,typedDependenciesCollapsed");

        String sentence;
        while ((sentence = br.readLine()) != null) {
          System.out.println("\n\n\n\nORIGINAL:\n\n" + sentence);
          List tokens = tf.getTokenizer(new StringReader(sentence)).tokenize();
          lp.parse(tokens);
          Tree t = lp.getBestParse();
          System.out.println("\nPROCESSED:\n\n");
          tp.printTree(t);
        }
      } catch (Exception e) {
        System.err.println("ERROR: " + e.getMessage());
      } finally {
        // Close the input even when parsing fails part-way through the file.
        if (br != null) {
          try {
            br.close();
          } catch (java.io.IOException ignored) {
            // Nothing useful can be done if closing a stream opened for reading fails.
          }
        }
      }
    }
    System.out.println("\n\n\nTHE END\n\n\n");
  }

예제 #11

0

파일 보기

파일: LexicalParsingEngine.java 프로젝트: ferojuras/EntitySearch

  /**
   * Parses the given text sentence by sentence and returns the tagged words of the last
   * sentence.
   *
   * <p>Each sentence's result replaces the previous one, so only the final sentence's tagged
   * words are returned.
   *
   * @param text the text to split into sentences and parse
   * @return the tagged yield of the last sentence, or an empty list if the text yields none
   */
  private ArrayList<TaggedWord> parseSentenceTD(String text) {
    System.out.println("Parsing sentence...");

    ArrayList<TaggedWord> lastTagged = new ArrayList<TaggedWord>();
    DocumentPreprocessor sentences = new DocumentPreprocessor(new StringReader(text));
    for (List<HasWord> tokens : sentences) {
      lastTagged = lp.apply(tokens).taggedYield();
    }
    return lastTagged;
  }

예제 #12

0

파일 보기

파일: LexicalSimilarity.java 프로젝트: sumitbhagwani/SententialSimilarity

  /**
   * Scores the similarity of two sentences at phrase level.
   *
   * <p>Both sentences are parsed and split into phrases (size limit 2). Every phrase pair gets
   * an edge weight from {@code BestWordMatchEdgeWeight}; the weights fill a square matrix whose
   * side is the larger phrase count, with unused cells left at 0. The result is the "max"
   * Hungarian assignment value divided by the matrix side and multiplied by 5.
   *
   * @param sentence1 the first sentence
   * @param sentence2 the second sentence
   * @param discoRAM the similarity resource passed on to the edge-weight computation
   * @param lp the parser used for both sentences
   * @return the similarity score, or 0 when neither sentence yields a phrase
   */
  public static double LexicalSimilarity2Level(
      String sentence1, String sentence2, DISCOSimilarity discoRAM, LexicalizedParser lp) {
    final int phraseSizeLimit = 2;

    Tree firstParse = lp.apply(sentence1);
    Tree secondParse = lp.apply(sentence2);
    ArrayList<ArrayList<TaggedWord>> firstPhrases = getPhrases(firstParse, phraseSizeLimit);
    ArrayList<ArrayList<TaggedWord>> secondPhrases = getPhrases(secondParse, phraseSizeLimit);

    int rows = firstPhrases.size();
    int cols = secondPhrases.size();
    int side = Math.max(rows, cols);
    if (side == 0) {
      return 0;
    }

    // A freshly allocated double matrix is zero-filled, so padding cells keep weight 0.
    double[][] weights = new double[side][side];
    for (int r = 0; r < rows; r++) {
      ArrayList<TaggedWord> left = firstPhrases.get(r);
      for (int c = 0; c < cols; c++) {
        weights[r][c] = BestWordMatchEdgeWeight(left, secondPhrases.get(c), discoRAM);
      }
    }

    return HungarianAlgorithm.hgAlgorithm(weights, "max") / side * 5;
  }

예제 #13

0

파일 보기

파일: Parser.java 프로젝트: kurokocon/ExpertSystem

  /**
   * Diagnostic variant of the keyword extraction: parses the sentence and prints the Penn tree,
   * the CC-processed typed dependencies, the words, the tagged yield, and one line per tagged
   * label in the form {@code tag  index    token}.
   *
   * <p>Nothing is ever added to the returned list; the method exists for its console output.
   *
   * @param string the sentence, with tokens separated by single spaces
   * @return an empty list
   */
  public LinkedList<String> getKeyWrodsFromSentenceTest(String string) {
    LinkedList<String> list = new LinkedList<String>();

    String[] sent = string.split(" ");
    List<HasWord> sentence = new ArrayList<HasWord>();
    for (String word : sent) {
      sentence.add(new Word(word));
    }

    Tree parse = lp.parse(sentence);
    parse.pennPrint();
    GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);

    List<TypedDependency> tdl = gs.typedDependenciesCCprocessed();
    System.out.println(tdl);

    System.out.println();

    System.out.println("The words of the sentence:");
    for (Label lab : parse.yield()) {
      if (lab instanceof CoreLabel) {
        System.out.println(((CoreLabel) lab).toString(CoreLabel.OutputFormat.VALUE_MAP));
      } else {
        System.out.println(lab);
      }
    }
    System.out.println();
    System.out.println("tagged");
    System.out.println(parse.taggedYield());

    // Read tag and position from the label itself; splitting its "tag-index" string form on
    // '-' fails for tags that contain '-' themselves (for example -LRB-).
    for (CoreLabel label : parse.taggedLabeledYield()) {
      int position = label.index();
      System.out.println(label.value() + "  " + position + "    " + sent[position]);
    }

    return list;
  }

예제 #14

0

파일 보기

파일: Parser.java 프로젝트: kurokocon/ExpertSystem

/**
 * Thin wrapper around a Stanford lexicalized parser that extracts noun keywords from a
 * sentence, plus a diagnostic variant that prints the intermediate parse data.
 */
public class Parser {

  private final String grammar = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
  private final String[] options = {"-maxLength", "80", "-retainTmpSubcategories"};
  private final LexicalizedParser lp = LexicalizedParser.loadModel(grammar, options);
  private final TreebankLanguagePack tlp = lp.getOp().langpack();
  private final GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();

  public Parser() {}

  /**
   * Returns the tokens of a sentence that the parser tags as {@code NN} or {@code NNS}.
   *
   * <p>The input is split on single spaces, each piece is parsed as one word, and the returned
   * entries are the original pieces at the positions of the matching tags.
   *
   * @param string the sentence, with tokens separated by single spaces
   * @return the matching tokens in sentence order; empty if none match
   */
  public LinkedList<String> getKeyWrodsFromSentence(String string) {
    LinkedList<String> list = new LinkedList<String>();

    String[] sent = string.split(" ");
    List<HasWord> sentence = new ArrayList<HasWord>();
    for (String word : sent) {
      sentence.add(new Word(word));
    }

    Tree parse = lp.parse(sentence);

    // Read tag and position from the label itself rather than re-parsing its "tag-index"
    // string form, which breaks for tags that contain '-'.
    for (CoreLabel label : parse.taggedLabeledYield()) {
      String type = label.value();
      if ("NN".equals(type) || "NNS".equals(type)) {
        list.add(sent[label.index()]);
      }
    }
    return list;
  }

  /**
   * Diagnostic variant of the keyword extraction: parses the sentence and prints the Penn tree,
   * the CC-processed typed dependencies, the words, the tagged yield, and one line per tagged
   * label in the form {@code tag  index    token}.
   *
   * <p>Nothing is ever added to the returned list; the method exists for its console output.
   *
   * @param string the sentence, with tokens separated by single spaces
   * @return an empty list
   */
  public LinkedList<String> getKeyWrodsFromSentenceTest(String string) {
    LinkedList<String> list = new LinkedList<String>();

    String[] sent = string.split(" ");
    List<HasWord> sentence = new ArrayList<HasWord>();
    for (String word : sent) {
      sentence.add(new Word(word));
    }

    Tree parse = lp.parse(sentence);
    parse.pennPrint();
    GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);

    List<TypedDependency> tdl = gs.typedDependenciesCCprocessed();
    System.out.println(tdl);

    System.out.println();

    System.out.println("The words of the sentence:");
    for (Label lab : parse.yield()) {
      if (lab instanceof CoreLabel) {
        System.out.println(((CoreLabel) lab).toString(CoreLabel.OutputFormat.VALUE_MAP));
      } else {
        System.out.println(lab);
      }
    }
    System.out.println();
    System.out.println("tagged");
    System.out.println(parse.taggedYield());

    // Read tag and position from the label itself; splitting its "tag-index" string form on
    // '-' fails for tags that contain '-' themselves (for example -LRB-).
    for (CoreLabel label : parse.taggedLabeledYield()) {
      int position = label.index();
      System.out.println(label.value() + "  " + position + "    " + sent[position]);
    }

    return list;
  }

  /**
   * Runs both extraction variants on a fixed two-sentence example.
   *
   * @param args unused
   * @throws IOException kept from the original signature; nothing in this method raises it
   */
  public static void main(String[] args) throws IOException {
    Parser parser = new Parser();
    parser.getKeyWrodsFromSentence(
        "When athletes begin to exercise, their heart rates and respiration rates increase.  At what level of organization does the human body coordinate these functions?");
    parser.getKeyWrodsFromSentenceTest(
        "When athletes begin to exercise, their heart rates and respiration rates increase.  At what level of organization does the human body coordinate these functions?");
  }
}

예제 #15

0

파일 보기

파일: Parser.java 프로젝트: awemcdonald/Connections

		public HashMap<String,ArrayList<TreeData>> parseAllDocs() throws IOException{ 
			String grammar =  "./jsan_resources/englishPCFG.ser.gz";
			String[] options = { "-maxLength", "120", "-retainTmpSubcategories" };
//			LexicalizedParser lp = new LexicalizedParser(grammar, options);
			
			LexicalizedParser lp = new LexicalizedParser()
			TreebankLanguagePack tlp = new PennTreebankLanguagePack();
			
			GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
			Iterable<List<? extends HasWord>> sentences;
			ArrayList<HashMap<String,ArrayList<String>>> everything = new ArrayList<HashMap<String,ArrayList<String>>>(3); 
			everything.add(0,otherSampleStrings);
			everything.add(1,authorSampleStrings);
			everything.add(2,toModifyStrings);
			Iterator<HashMap<String,ArrayList<String>>> everythingIter = everything.iterator();
			int docTypeNumber = -1; // 0 for otherSampleStrings, 1 for authorSampleStrings, 2 for toModifyStrings
			int numLoaded = 0;
			while(everythingIter.hasNext()){
				docTypeNumber++;
				HashMap<String,ArrayList<String>> currentSampleStrings = docPathFinder();
				Set<String> currentDocStrings = currentSampleStrings.keySet();
				Iterator<String> docStrIter = currentDocStrings.iterator();
				String docID;
				ArrayList<String> sentenceTokens;
				allTreeProcessors[docTypeNumber]  = new TreeProcessor();
				allTreeProcessors[docTypeNumber].clearLoadedTreeDataMaps();
				numLoaded=0;
				while(docStrIter.hasNext()){
					docID = docStrIter.next();
					sentenceTokens = currentSampleStrings.get(docID);
					if(sentenceTokens == null){
						allTreeProcessors[docTypeNumber].loadTreeDataMap(docID, GRAMMAR_DIR, false);
						numLoaded++;
						continue;
					}
					//System.out.println(sentenceTokens.size()+", strIter.hasNext? -> "+strIter.hasNext());

					numSentences = sentenceTokens.size();
					//initialize(numSentences);
					Iterator<String> sentIter = sentenceTokens.iterator();
					List<List<? extends HasWord>> tmp = new ArrayList<List<? extends HasWord>>();
					String tempSent;
					while(sentIter.hasNext()){
						tempSent = sentIter.next();
						Tokenizer<? extends HasWord> toke = tlp.getTokenizerFactory().getTokenizer(new StringReader(tempSent));
						List<? extends HasWord> sentenceTokenized = toke.tokenize();
						tmp.add(sentenceTokenized);
					}
					
					sentences = tmp;
					//int numDone = 0;
					TreeProcessor.singleDocMap.clear();
					boolean willSaveResults = true;
					for (List<? extends HasWord> sentence : sentences) {
						Tree parse = lp.apply(sentence);
						//parse.pennPrint();
						//System.out.println(parse.treeSkeletonCopy().toString());
						//System.out.println(parse.taggedYield());
						//System.out.println();
						//printSubTrees(parse);
						//TreeContainer.recurseTree(parse,"breadth");
						allTreeProcessors[docTypeNumber].processTree(parse, 0, willSaveResults); 
						//System.out.println(tc.processedTrees.toString().replaceAll("\\]\\], \\(","\\]\\]\n\\("));
						//numDone++;
						//System.out.println("sent "+numDone+" of "+numSentences+" done ");
						//System.out.println(tc.processedTrees.toString());
						//in.nextLine();
						//TreeContainer.recurseTree(parse, "depth");
						//in.nextLine();
						//addTree(parse);
						//GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);//TODO: LOOK AT THIS
						//Collection tdl = gs.typedDependenciesCCprocessed(true);
						//System.out.println(tdl);
						//System.out.println();
					}
					if(willSaveResults == true)
						ObjectIO.writeObject(TreeProcessor.singleDocMap,docID, GRAMMAR_DIR);

					//System.out.println("After all sents: ");
					//System.out.println(tc.processedTrees.toString().replaceAll("\\]\\], \\(","\\]\\]\n\\("));
					//String sent3 = "This is one last test!";
					//Tree parse3 = lp.apply(sent3);
					//parse3.pennPrint();
					//System.out.println("After sorting and writing:");
					//System.out.println(tc.processedTrees.toString().replaceAll("\\]\\], \\(","\\]\\]\n\\("));
					//Scanner in = new Scanner(System.in);
					//System.out.println("First one done.");
					//in.nextLine();
					//viewTrees();
				}
				
				//TreeProcessor.writeTreeDataToCSV(sortedTD,docID);
				allTreeProcessors[docTypeNumber].unmergedMaps = new ArrayList<HashMap<String,TreeData>>(numLoaded+1);
				
			}	
			
			
			int i= 0;
			allParsedAndOrdered.clear();
			String[] docTypes = new String[]{"otherSample","authorSample","toModify"};
			for(i=0; i < 3; i++){
				allTreeProcessors[i].unmergedMaps.add(allTreeProcessors[i].processedTrees);
				allTreeProcessors[i].unmergedMaps.addAll(allTreeProcessors[i].loadedTreeDataMaps);
				allTreeProcessors[i].mergeTreeDataLists(allTreeProcessors[i].unmergedMaps);
				allParsedAndOrdered.put(docTypes[i],allTreeProcessors[i].sortTreeData(allTreeProcessors[i].mergedMap));
				
			}
			
			//ArrayList<TreeData> sortedTD = TreeContainer.sortTreeData(TreeContainer.allProcessedTrees);
			//TreeContainer.writeTreeDataToCSV(sortedTD,"ALL_AUTHORS");
			
			return allParsedAndOrdered;
		}

예제 #16

0

파일 보기

파일: StanfordParserWrapper.java 프로젝트: StatNLP/otedama

  /**
   * Parses every sentence of an input file into a dependency tree and writes the trees to a
   * .trees file.
   *
   * <p>For each sentence, one lexical node and one governing node are created per tagged word.
   * The basic typed dependencies link the governing nodes, governing nodes with at most one
   * child are collapsed into their lexical node, head nodes are labelled, and the tree is
   * initialised with or without alignment data before being written. The sentence form of each
   * tree is printed to standard output; progress messages go to standard error.
   *
   * @param en path of the input file; every entry returned by the project Reader is tokenized
   *     as one sentence
   * @param align path of the alignment file; a value starting with "no_align" disables
   *     alignments
   * @param out path of the .trees file to write
   * @param verbose if {@code true}, the string form of every tree is also printed to standard
   *     error
   */
  public static void parse(String en, String align, String out, boolean verbose) {

    // use alignments?
    boolean use_alignments = true;
    if (align.startsWith("no_align")) {
      use_alignments = false;
      System.err.println("Not using alignments.");
    } else {
      System.err.println("Using alignments from " + align);
    }

    // setup stanfordparser
    String grammar = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
    String[] options = {"-outputFormat", "wordsAndTags, typedDependencies"};
    LexicalizedParser lp = LexicalizedParser.loadModel(grammar, options);
    TreebankLanguagePack tlp = lp.getOp().langpack();
    // The filter accepts every string; presumably this keeps punctuation tokens in the
    // dependency structure (TODO confirm against the factory's documentation).
    java.util.function.Predicate<java.lang.String> punctuationFilter = x -> true;

    GrammaticalStructureFactory gsf =
        new edu.stanford.nlp.trees.EnglishGrammaticalStructureFactory(punctuationFilter);

    // read document
    Reader r = new Reader(en);
    String line = null;
    List<List<? extends HasWord>> sentences = new ArrayList<List<? extends HasWord>>();
    while ((line = r.getNext()) != null) {
      Tokenizer<? extends HasWord> token =
          tlp.getTokenizerFactory().getTokenizer(new StringReader(line));
      List<? extends HasWord> sentence = token.tokenize();
      sentences.add(sentence);
    }

    // set up alignment file reader
    Reader alignment = new Reader();
    if (use_alignments) {
      alignment = new Reader(align);
    }

    // set up tree file writer
    Writer treeWriter = new Writer(out);

    try {
      // parse
      long start = System.currentTimeMillis();
      for (List<? extends HasWord> sentence : sentences) {
        Tree t = new Tree();
        System.err.println("parse Sentence :" + sentence + "...");
        System.err.println(
            "-----------------------------------------------------------------------");
        edu.stanford.nlp.trees.Tree parse = lp.parse(sentence);

        // Index 0 of both lists is the ROOT node; index i (i >= 1) belongs to the i-th word.
        List<Node> loneNodes = new LinkedList<Node>();
        List<Node> governingNodes = new LinkedList<Node>();

        // ROOT node
        Node root = new Node(true, true);
        root.setTag("ROOT");
        t.setRoot(root);
        loneNodes.add(root);
        governingNodes.add(root);

        // tagging
        int counter = 0;
        String surface = "";
        String tag = "";

        for (TaggedWord tw : parse.taggedYield()) {
          Node n = new Node();
          Node governingNode = new Node();
          n.setNodeID(++counter);
          surface = tw.value();
          tag = tw.tag();
          // Map round-bracket and closing-quote tokens back to their surface characters;
          // square and curly bracket tokens are left untouched.
          if (surface.startsWith("-LRB-")) {
            surface = "(";
          } else if (surface.startsWith("-RRB-")) {
            surface = ")";
          } else if (surface.startsWith("''")) {
            surface = "\"";
          }
          tag = tag.replaceAll("#", "-NUM-");
          surface = surface.replaceAll("&", "-AMP-");
          surface = surface.replaceAll("#", "-NUM-");
          surface = surface.replaceAll(">", "-GRE-");
          surface = surface.replaceAll("=", "-EQU-");
          n.setInitialLexicalIndex(counter);
          governingNode.setInitialLexicalIndex(counter);
          n.setSurface(surface);
          n.setTag(tag);
          governingNode.setTag("_" + tag);
          governingNode.setLabel("_gov");
          loneNodes.add(n);
          governingNodes.add(governingNode);
          governingNode.setChild(n);
        }

        // labeling
        GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
        List<TypedDependency> tdl = gs.typedDependencies(false);
        for (TypedDependency td : tdl) {
          int depIndex = td.dep().index();
          int govIndex = td.gov().index();
          Node dep = loneNodes.get(depIndex);
          Node gov = governingNodes.get(govIndex);
          Node depcopy = governingNodes.get(depIndex);
          Node govcopy = loneNodes.get(govIndex);
          dep.setLabel(td.reln().toString());
          depcopy.setLabel(td.reln().toString());
          govcopy.setLabel("head");
          // Attach the dependent's governing node under the governor's governing node.
          gov.setChild(depcopy);
          depcopy.setParent(gov);
          dep.setParent(depcopy);
        }

        // Collapse tree to remove unneeded governing nodes:
        for (int i = 1; i < governingNodes.size(); i++) { // start with index 1 to skip root
          Node gov = governingNodes.get(i);
          Node dep = loneNodes.get(i);
          if (gov.getChildren().size() <= 1) {
            int k = 0;
            Node parent = gov.getParent();
            List<Node> children = parent.getChildren();

            for (Node n : children) {
              // Identity comparison is intended: find this exact governing node.
              if (n == gov) {
                gov.getParent().replaceChild(k, dep);
                dep.setParent(gov.getParent());
              }
              k++;
            }
          }
        }

        // Mark head nodes with appropriate label:
        int k = 0;
        for (Node n : loneNodes) {
          if (k != 0) {
            // Compare label contents, not references: two equal relation names are not
            // guaranteed to be the same object.
            if (n.getLabel() == null
                ? n.getParent().getLabel() == null
                : n.getLabel().equals(n.getParent().getLabel())) {
              n.setLabel("head");
            }
          } else {
            n.setLabel("null");
          }
          k++;
        }

        // Sort lexical children of each governing node in lexical order
        for (Node n : governingNodes) {
          n.sortChildrenByInitialIndex();
        }

        // combine with alignment
        if (use_alignments) {
          t.initialize(alignment.readNextAlign());
        } else {
          t.initializeUnaligned();
        }

        // write tree to file
        treeWriter.write(t);

        // print tree to console
        System.out.println(t.toSentence());
        if (verbose) {
          System.err.println(t.toString());
        }
        System.err.println(
            "#######################################################################");
      }
      long stop = System.currentTimeMillis();
      System.err.println("...done! [" + (stop - start) / 1000 + " sec].");
    } finally {
      // Close the output even when a sentence fails part-way through.
      treeWriter.close();
    }
  }

예제 #17

0

파일 보기

파일: StanfordParser.java 프로젝트: Tyler-Yates/TuringThesis

 /**
  * Tokenizes a sentence and returns the tree the parser produces for its tokens.
  *
  * @param sentence the raw sentence text
  * @return the parse tree of the sentence
  */
 private Tree getPosTree(String sentence) {
   final List<CoreLabel> tokens =
       tokenizerFactory.getTokenizer(new StringReader(sentence)).tokenize();
   return parser.apply(tokens);
 }

예제 #18

0

파일 보기

파일: Doubleq.java 프로젝트: vinodhkris/MultipleQuestionSplitter

  /**
   * Interactive demo: reads a question (or uses a default), parses it, tabulates its collapsed
   * typed dependencies, and prints a secondary question derived from one of the last two
   * dependency rows, followed by the original question with that part removed.
   *
   * <p>The row is chosen by comparing the {@code Dijikstraalgo.dijikstra} results for the last
   * and the second-to-last row over an adjacency matrix built by {@code formadj}.
   *
   * @param args unused
   * @throws IOException if reading from standard input fails
   */
  public static void main(String args[]) throws IOException {
    long startTime = System.currentTimeMillis();

    LexicalizedParser lp = new LexicalizedParser("englishPCFG.ser.gz");
    TokenizerFactory tf = PTBTokenizer.factory(false, new WordTokenFactory());
    TreePrint tp = new TreePrint("penn,typedDependenciesCollapsed");
    String sentence = "Where did the first President die ?";

    // One buffered reader serves both prompts: a second reader on System.in could miss input
    // that the first one has already buffered.
    BufferedReader console = new BufferedReader(new InputStreamReader(System.in));

    System.out.println("Enter the question or press enter for default : ");
    String tempInput = console.readLine();
    // readLine() returns null at end of input; treat that like an empty line.
    if (tempInput == null || tempInput.length() == 0) {
      System.out.println("The question is the default one : " + sentence);
    } else {
      sentence = tempInput;
      System.out.println("The question entered is : " + sentence);
    }

    String sentence1 = PreProcess.removeStopWords1(sentence);

    System.out.println(sentence1);
    // Record every remaining token, stripped of "'s" and non-letters, under its position.
    StringTokenizer st1 = new StringTokenizer(sentence1, " ");
    int n = 0;
    while (st1.hasMoreTokens()) {
      String temp1 = st1.nextToken();
      map.put(n, temp1.replaceAll("'s", "").replaceAll("[^A-Za-z]", ""));

      n++;
    }

    List tokens = tf.getTokenizer(new StringReader(sentence)).tokenize();
    lp.parse(tokens); // parse the tokens
    Tree t = lp.getBestParse(); // get the best parse tree

    tp.printTree(t);
    System.out.println("\nPROCESSED:\n\n");
    // dependencies only print
    TreebankLanguagePack tlp = new PennTreebankLanguagePack();
    GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
    GrammaticalStructure gs = gsf.newGrammaticalStructure(t);

    String dependency = gs.typedDependenciesCollapsed().toString();
    System.out.println("Dependencies :" + dependency);
    String wordForm = "yes";
    int i = -1;
    // Row layout: [0] relation name, [1] first "word-index" argument, [2] second argument.
    String s[][] = new String[20][3];

    if (wordForm.equals("yes")) {
      StringTokenizer st = new StringTokenizer(dependency, " ([)],");
      while (st.hasMoreTokens()) {
        String as = st.nextToken();
        System.out.println(as);
        // A token without '-' starts a new row; the two tokens after it fill that row.
        if (!as.contains("-")) {
          i++;
          s[i][0] = as;
        } else {
          s[i][1] = as;
          s[i][2] = st.nextToken();
        }
      }
    }

    length = i + 1;
    interchange1(s);
    System.out.println("The sorted version is ");
    for (int row = 0; row < length; row++) {
      for (int col = 0; col < 3; col++) {
        System.out.print(s[row][col] + " ");
      }
      System.out.println();
    }

    System.out.println("What answer type is required: ");
    String answtype = console.readLine();
    String secque = null;

    // dijikstra implementation: every cell starts at 100 before formadj fills in the matrix
    int adjmatrix[][] = new int[length][length];
    for (int row = 0; row < length; row++) {
      for (int col = 0; col < length; col++) {
        adjmatrix[row][col] = 100;
      }
    }
    formadj(adjmatrix, s);
    print(adjmatrix);
    // NOTE(review): makesentence is called twice per branch, as before; it is not visible here
    // whether it is free of side effects, so the calls are not merged.
    if (Dijikstraalgo.dijikstra(adjmatrix, length - 1)
            - Dijikstraalgo.dijikstra(adjmatrix, length - 2)
        >= 0) {
      System.out.println("Type 1");
      if (makesentence(s, length - 1) == null) {
        secque = s[length - 1][2] + " " + s[length - 1][1];
        System.out.println(answtype + " is " + s[length - 1][2] + " " + s[length - 1][1] + " ?");

      } else {
        secque = makesentence(s, length - 1);
        System.out.println(answtype + " is " + secque + " ?");
      }
    } else {
      System.out.println("Type 2");
      System.out.println(
          "Before entering the makesentence function(the cause of the null pointer exception) "
              + s[length - 2][0]
              + " "
              + s[length - 2][1]);
      if (makesentence(s, length - 2) == null) {

        secque = s[length - 2][2] + " " + s[length - 2][1];
        System.out.println(answtype + " is " + s[length - 2][2] + " " + s[length - 2][1] + " ?");
      } else {
        secque = makesentence(s, length - 2);

        System.out.println(answtype + " is " + secque + " ?");
      }
    }
    System.out.println(sentence.replace(secque.replaceAll("[^A-Za-z ]", ""), ""));

    long endTime = System.currentTimeMillis();
    System.out.println("The time elapsed is : " + (int) (endTime - startTime) / 1000);
    System.out.println("The end");
  }

예제 #19

0

파일 보기

파일: StanfordParser.java 프로젝트: Tyler-Yates/TuringThesis

/**
 * Bundles a Stanford lexicalized parser and a CRF sequence classifier behind a small API that
 * produces parse trees, collapsed typed dependencies, named entities and a coarse tense guess.
 *
 * <p>Both models are loaded when an instance is constructed.
 */
class StanfordParser {
  private final String PCG_MODEL = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
  private final TokenizerFactory<CoreLabel> tokenizerFactory =
      PTBTokenizer.factory(new CoreLabelTokenFactory(), "invertible=true");
  private final LexicalizedParser parser = LexicalizedParser.loadModel(PCG_MODEL);
  private final String serializedClassifier =
      "edu/stanford/nlp/models/ner/english.muc.7class.distsim.crf" + ".ser.gz";
  private final AbstractSequenceClassifier<CoreLabel> classifier =
      CRFClassifier.getClassifierNoExceptions(serializedClassifier);

  /**
   * Parses a sentence into a tree, its collapsed dependencies and its named entities.
   *
   * @param sentence the text to analyse
   * @param removePunctuation when true, punctuation is stripped from the text before any analysis
   * @return the combined analysis of the (possibly cleaned) sentence
   */
  public ParsedSentence parseSentence(String sentence, boolean removePunctuation) {
    final String text = removePunctuation ? cleanSentence(sentence) : sentence;
    final Tree tree = getPosTree(text);
    final Collection<TypedDependency> dependencies = getDependencies(tree);
    final Map<String, NamedEntity> entities = findNamedEntities(text);
    return new ParsedSentence(tree, dependencies, entities);
  }

  /**
   * Guesses the tense of a clause from the part-of-speech tag of its first token.
   *
   * @param clause the clause to inspect
   * @return FUTURE for a modal ("md"), PAST for "vbd"/"vbn", otherwise PRESENT
   */
  public Tense calculateTense(String clause) {
    final Tree tree = getPosTree(clause);
    final Tree firstLeaf = tree.getLeaves().get(0);
    // The parent of a leaf is its preterminal, whose label carries the POS tag.
    final String tag = firstLeaf.parent(tree).label().value().toLowerCase();
    switch (tag) {
      case "md":
        return Tense.FUTURE;
      case "vbd":
      case "vbn":
        return Tense.PAST;
      default:
        return Tense.PRESENT;
    }
  }

  /**
   * Maps every entity substring found by the classifier to its entity type.
   *
   * @param sentence the text to scan
   * @return surface text of each entity mapped to its {@code NamedEntity}; a repeated surface
   *     text keeps the last type seen
   */
  public Map<String, NamedEntity> findNamedEntities(String sentence) {
    final Map<String, NamedEntity> entitiesBySurface = new HashMap<>();
    for (final Triple<String, Integer, Integer> span : findNerSubstrings(sentence)) {
      final String surface = sentence.substring(span.second(), span.third());
      final NamedEntity type = NamedEntity.getNamedEntity(span.first());
      entitiesBySurface.put(surface, type);
    }
    return entitiesBySurface;
  }

  /** Returns (label, begin offset, end offset) triples for the entities in the sentence. */
  private List<Triple<String, Integer, Integer>> findNerSubstrings(String sentence) {
    return classifier.classifyToCharacterOffsets(sentence);
  }

  /** Drops all punctuation characters, then collapses runs of spaces into a single space. */
  private String cleanSentence(String sentence) {
    final String withoutPunctuation = sentence.replaceAll("\\p{Punct}", "");
    return withoutPunctuation.replaceAll("[ ]+", " ");
  }

  /** Tokenizes the sentence and applies the lexicalized parser to the tokens. */
  private Tree getPosTree(String sentence) {
    final List<CoreLabel> tokens =
        tokenizerFactory.getTokenizer(new StringReader(sentence)).tokenize();
    return parser.apply(tokens);
  }

  /** Derives the collapsed typed dependencies of a parse tree. */
  private Collection<TypedDependency> getDependencies(Tree sentenceParseTree) {
    final GrammaticalStructureFactory factory =
        new PennTreebankLanguagePack().grammaticalStructureFactory();
    return factory.newGrammaticalStructure(sentenceParseTree).typedDependenciesCollapsed();
  }
}

예제 #20

0

파일 보기

파일: TextSimplification.java 프로젝트: inkit-padhi/storyteller

 /**
  * Tokenizes the given text and applies the shared parser to the tokens.
  *
  * @param str text to parse
  * @return the tree produced by the parser
  */
 public static Tree parse(String str) {
   return parser.apply(tokenize(str));
 }

예제 #21

0

파일 보기

파일: TextSimplification.java 프로젝트: inkit-padhi/storyteller

/**
 * Command-line tool that resolves pronouns in a text file via Stanford dcoref and then writes a
 * parse tree for every resulting sentence.
 *
 * <p>Intermediate output goes to {@code AnaphoraResolved.txt}, parse trees to {@code trees.txt}.
 */
public class TextSimplification {

  /** Lower-cased pronouns that may be replaced by the representative mention of their chain. */
  public static List<String> replacementList = new ArrayList<String>();

  static {
    replacementList.add("he");
    replacementList.add("him");
    replacementList.add("his");
    replacementList.add("she");
    replacementList.add("her");
    replacementList.add("they");
    replacementList.add("them");
    replacementList.add("their");
    replacementList.add("i");
    replacementList.add("her's");
    replacementList.add("you");
    replacementList.add("your");
    replacementList.add("your's");
    replacementList.add("mine");
    replacementList.add("my");
    replacementList.add("us");
    replacementList.add("we");
    // "it", "its", "this" and "that" are deliberately not replaced.
  }

  /** Accumulated output of {@link #resolveAnaphora(String)}, one sentence per line. */
  public static String resolvedSentences = "";

  private static final String PCG_MODEL = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";

  private static final TokenizerFactory<CoreLabel> tokenizerFactory =
      PTBTokenizer.factory(new CoreLabelTokenFactory(), "invertible=true");

  private static final LexicalizedParser parser = LexicalizedParser.loadModel(PCG_MODEL);

  /**
   * Entry point.
   *
   * @param args {@code args[0]} is the path of the UTF-8 text file to process
   * @throws IOException if the input cannot be read or an output file cannot be written
   */
  public static void main(String[] args) throws IOException {

    // :TODO
    // * Do not consider roots with more than 2 words
    // * Root should not be he, she her, his, him etc...
    // * If it is, den take the last known gender noun and make it the root.

    if (args.length < 1) {
      // Fail with a usage hint instead of an ArrayIndexOutOfBoundsException.
      System.err.println("Usage: TextSimplification <input-text-file>");
      return;
    }

    String text = new String(Files.readAllBytes(Paths.get(args[0])), StandardCharsets.UTF_8);
    text = text.replace("\n", " ");

    // Resolve Anaphora
    System.out.println("Anaphora Resolution...");
    resolveAnaphora(text);
    System.out.println(
        "Anaphora Resolution Completed!\nIntermediate Output in \"AnaphoraResolved.txt\"");
    writeToFile(resolvedSentences, "AnaphoraResolved.txt");

    // Create ParseTrees
    System.out.println("Parse Tree Generation...");
    startParsing(resolvedSentences);
    // The message names the file that startParsing actually writes.
    System.out.println("Parse Tree Generation Completed!\nIntermediate Output in \"trees.txt\"");
  }

  /**
   * Runs the CoreNLP pipeline (including dcoref) on the text and appends the pronoun-resolved
   * sentences to {@link #resolvedSentences}.
   *
   * <p>A mention is replaced by its chain's representative mention when the mention is listed in
   * {@link #replacementList}, the representative is not, the representative has fewer than three
   * whitespace-separated words, and neither span contains the other. For "her"/"his" the
   * replacement gets a trailing {@code 's}.
   *
   * <p>NOTE(review): only sentences that carry at least one coreference mention are emitted;
   * confirm this is intended.
   *
   * @param text the raw text to process
   */
  public static void resolveAnaphora(String text) {

    RedwoodConfiguration.empty().capture(System.err).apply();

    Annotation document = new Annotation(text);
    Properties props = new Properties();
    props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
    props.put("dcoref.female", "female.unigram.txt");
    props.put("dcoref.male", "male.unigram.txt");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    pipeline.annotate(document);

    RedwoodConfiguration.current().clear().apply();

    Map<Integer, CorefChain> graph = document.get(CorefChainAnnotation.class);
    List<CoreMap> stnfrdSentences = document.get(SentencesAnnotation.class);

    // Group every (chain, mention) pair by the 1-based number of the sentence it occurs in.
    ImmutableMultimap.Builder<Integer, Pair<CorefChain, CorefMention>> records =
        ImmutableMultimap.builder();

    graph.forEach(
        (key, chain) ->
            chain
                .getMentionMap()
                .forEach(
                    (intPair, corefSet) ->
                        corefSet.forEach(
                            mention -> records.put(mention.sentNum, Pair.of(chain, mention)))));

    // Integer::compare avoids the overflow that subtraction-based comparison can produce.
    records.orderKeysBy(Integer::compare);

    StringBuilder resolved = new StringBuilder();
    records
        .build()
        .asMap()
        .forEach(
            (sentNum, mentionList) -> {
              CoreMap sentence = stnfrdSentences.get(sentNum - 1);
              List<CoreLabel> stnfrdtokens = sentence.get(TokensAnnotation.class);

              mentionList.forEach(
                  pair -> {
                    CorefChain chain = pair.getLeft();
                    CorefMention mention = pair.getRight();
                    String root = chain.getRepresentativeMention().mentionSpan;

                    if (!mention.mentionSpan.equalsIgnoreCase(root)
                        && (!root.contains(mention.mentionSpan)
                            && !mention.mentionSpan.contains(root))
                        && (!replacementList.contains(root.toLowerCase()))
                        && (root.split("\\s").length < 3)
                        && (replacementList.contains(mention.mentionSpan.toLowerCase()))) {
                      if (mention.mentionSpan.equalsIgnoreCase("her")
                          || mention.mentionSpan.equalsIgnoreCase("his")) {
                        root += "'s";
                      }
                      // Token indices in a mention are 1-based.
                      stnfrdtokens.get(mention.startIndex - 1).setOriginalText(root);
                    }
                  });

              for (CoreLabel token : stnfrdtokens) {
                resolved.append(token.originalText()).append(' ');
              }
              resolved.append('\n');
            });

    resolvedSentences += resolved.toString();
  }

  /**
   * Tokenizes the given text and applies the shared parser to the tokens.
   *
   * @param str text to parse
   * @return the tree produced by the parser
   */
  public static Tree parse(String str) {
    List<CoreLabel> tokens = tokenize(str);
    return parser.apply(tokens);
  }

  /** Splits the text into tokens with the shared tokenizer factory. */
  private static List<CoreLabel> tokenize(String str) {
    Tokenizer<CoreLabel> tokenizer = tokenizerFactory.getTokenizer(new StringReader(str));
    return tokenizer.tokenize();
  }

  /**
   * Splits the paragraph into sentences, parses each one and writes all trees to
   * {@code trees.txt}, one tree per line.
   *
   * @param paragraph the text to split and parse
   * @throws IOException if {@code trees.txt} cannot be written
   */
  public static void startParsing(String paragraph) throws FileNotFoundException, IOException {
    StringBuilder parseTrees = new StringBuilder();

    // Can we just split on new line as paragraph is already sentence splitted.
    DocumentPreprocessor dp = new DocumentPreprocessor(new StringReader(paragraph));
    for (List<HasWord> sentence : dp) {
      parseTrees.append(createParseTree(Sentence.listToString(sentence)));
    }
    writeToFile(parseTrees.toString(), "trees.txt");
  }

  /**
   * Writes the content to the named file as UTF-8, replacing any existing file.
   *
   * @param content text to write
   * @param filename path of the file to create or overwrite
   * @throws IOException if the file cannot be written
   */
  public static void writeToFile(String content, String filename) throws IOException {
    // Files.write creates or truncates the file and always releases the underlying stream;
    // UTF-8 matches the charset used to read the input.
    Files.write(Paths.get(filename), content.getBytes(StandardCharsets.UTF_8));
  }

  /**
   * Parses one sentence and renders its tree on a single line.
   *
   * @param sentence the sentence to parse
   * @return the tree's string form followed by a newline
   */
  public static String createParseTree(String sentence) {
    Tree tree = parse(sentence);
    return tree.toString() + "\n";
  }
}

예제 #22

0

파일 보기

파일: LexicalParsingEngine.java 프로젝트: ferojuras/EntitySearch

  public LexicalParsingEngine(String parserModel)
      throws FileNotFoundException, UnsupportedEncodingException {

    System.out.println("Initializing Lexical Parser...");
    lp = LexicalizedParser.loadModel(parserModel);
  }

예제 #23

0

파일 보기

파일: CharacterLevelTagExtender.java 프로젝트: automenta/corenlp

  /**
   * For testing -- CURRENTLY BROKEN!!!
   *
   * <p>Trains a character-tag parser from a treebank (or loads one if training is rejected with
   * an {@code IllegalArgumentException}), serializes it to {@code chineseCharTagPCFG.ser.gz},
   * then parses the test trees and writes gold and guessed analyses to {@code out.chi}
   * (GB18030 encoded).
   *
   * @param args exactly three values: treebankPath, trainNums (file number ranges used for
   *     training), testNums (file number ranges used for testing)
   * @throws IOException if the test treebank cannot be loaded or {@code out.chi} cannot be opened
   */
  public static void main(String[] args) throws IOException {
    if (args.length != 3) {
      throw new RuntimeException("args: treebankPath trainNums testNums");
    }

    ChineseTreebankParserParams ctpp = new ChineseTreebankParserParams();
    ctpp.charTags = true;
    // TODO: these options are getting clobbered by reading in the
    // parser object (unless it's a text file parser?)
    Options op = new Options(ctpp);
    op.doDep = false;
    op.testOptions.maxLength = 90;

    LexicalizedParser lp;
    try {
      FileFilter trainFilt = new NumberRangesFileFilter(args[1], false);

      lp = LexicalizedParser.trainFromTreebank(args[0], trainFilt, op);
      try {
        String filename = "chineseCharTagPCFG.ser.gz";
        System.err.println("Writing parser in serialized format to file " + filename + ' ');
        System.err.flush();
        ObjectOutputStream out = IOUtils.writeStreamFromString(filename);
        try {
          out.writeObject(lp);
        } finally {
          // Release the file handle even when serialization fails.
          out.close();
        }
        System.err.println("done.");
      } catch (IOException ioe) {
        // Failing to save the model is not fatal for the evaluation below.
        ioe.printStackTrace();
      }
    } catch (IllegalArgumentException e) {
      // args[1] was not a valid number range; treat it as a serialized parser path instead.
      lp = LexicalizedParser.loadModel(args[1], op);
    }

    FileFilter testFilt = new NumberRangesFileFilter(args[2], false);
    MemoryTreebank testTreebank = ctpp.memoryTreebank();
    testTreebank.loadPath(new File(args[0]), testFilt);
    WordCatEquivalenceClasser eqclass = new WordCatEquivalenceClasser();
    WordCatEqualityChecker eqcheck = new WordCatEqualityChecker();
    EquivalenceClassEval eval = new EquivalenceClassEval(eqclass, eqcheck);
    //    System.out.println("Preterminals:" + preterminals);
    System.out.println("Testing...");
    PrintWriter pw =
        new PrintWriter(new OutputStreamWriter(new FileOutputStream("out.chi"), "GB18030"), true);
    try {
      for (Tree gold : testTreebank) {
        Tree tree;
        try {
          tree = lp.parseTree(gold.yieldHasWord());
          if (tree == null) {
            System.out.println("Failed to parse " + gold.yieldHasWord());
            continue;
          }
        } catch (Exception e) {
          // A single unparsable tree should not abort the whole run.
          e.printStackTrace();
          continue;
        }
        gold = gold.firstChild();
        pw.println(Sentence.listToString(gold.preTerminalYield()));
        pw.println(Sentence.listToString(gold.yield()));
        gold.pennPrint(pw);

        pw.println(tree.preTerminalYield());
        pw.println(tree.yield());
        tree.pennPrint(pw);
        //      Collection allBrackets = WordCatConstituent.allBrackets(tree);
        //      Collection goldBrackets = WordCatConstituent.allBrackets(gold);
        //      eval.eval(allBrackets, goldBrackets);
        eval.displayLast();
      }
    } finally {
      // Closing flushes any buffered output and releases out.chi.
      pw.close();
    }
    System.out.println();
    System.out.println();
    eval.display();
  }

예제 #24

0

파일 보기

파일: AnalysisUtilities.java 프로젝트: kutschkem/SmithHeilmann_fork

  /**
   * Parses a sentence, preferring a parser socket server on localhost and falling back to an
   * in-process {@code LexicalizedParser}.
   *
   * <p>The server port is read from the {@code parserServerPort} property (default 5556). The
   * server's reply is read line by line; the last line is taken as the parse score and the
   * preceding lines as the tree. If the server cannot be used, the local parser is loaded on
   * first use from the {@code parserGrammarFile} property (default
   * {@code lib/englishPCFG.ser.gz}). {@code lastParse} and {@code lastParseScore} are updated as
   * a side effect.
   *
   * @param sentence the sentence to parse
   * @return the parse and its score; when no parse could be produced, an unsuccessful result
   *     holding the tree {@code (ROOT (. .))} and score -99999.0
   */
  public ParseResult parseSentence(String sentence) {
    // see if a parser socket server is available
    int port = Integer.parseInt(ARKref.getProperties().getProperty("parserServerPort", "5556"));
    String host = "127.0.0.1";
    Socket client = null;
    try {
      client = new Socket(host, port);

      PrintWriter pw = new PrintWriter(client.getOutputStream());
      BufferedReader br = new BufferedReader(new InputStreamReader(client.getInputStream()));
      pw.println(sentence);
      pw.flush(); // flush to complete the transmission

      StringBuilder response = new StringBuilder();
      String line;
      while ((line = br.readLine()) != null) {
        if (br.ready()) {
          // More input is pending, so this line is part of the tree.
          response.append(line.replaceAll("\\s+", " ")).append(' ');
        } else {
          // Nothing further buffered: treat this line as the trailing score.
          lastParseScore = Double.parseDouble(line);
        }
      }
      String result = response.toString();

      System.err.println("parser output:" + result);

      lastParse = readTreeFromString(result);
      boolean success = !Strings.normalizeWhitespace(result).equals("(ROOT (. .))");
      return new ParseResult(success, lastParse, lastParseScore);
    } catch (Exception ex) {
      // Expected when no server is running (or its reply is unusable): fall back to the local
      // parser below.
    } finally {
      if (client != null) {
        try {
          // Closing the socket also closes its input and output streams.
          client.close();
        } catch (Exception ignored) {
          // Nothing useful can be done if close fails.
        }
      }
    }

    // if socket server not available, then use a local parser object
    if (parser == null) {
      if (DEBUG) System.err.println("Could not connect to parser server.  Loading parser...");
      try {
        Options op = new Options();
        String serializedInputFileOrUrl =
            ClassLoader.getSystemResource(
                    ARKref.getProperties()
                        .getProperty("parserGrammarFile", "lib/englishPCFG.ser.gz"))
                .toExternalForm();
        parser = LexicalizedParser.loadModel(serializedInputFileOrUrl, op);
        parser.setOptionFlags("-outputFormat", "oneline");
      } catch (Exception e) {
        e.printStackTrace();
      }
    }

    // Only attempt a local parse when a parser is actually available.
    if (parser != null) {
      try {
        DocumentPreprocessor dp = new DocumentPreprocessor(new StringReader(sentence));

        LexicalizedParserQuery query = parser.parserQuery();

        if (query.parse(dp.iterator().next())) {
          lastParse = query.getBestParse();
          lastParseScore = query.getPCFGScore();
          // Round-trip through the "penn" printer so the tree is rebuilt by readTreeFromString.
          TreePrint tp = new TreePrint("penn", "", new PennTreebankLanguagePack());
          StringWriter sb = new StringWriter();
          PrintWriter pw = new PrintWriter(sb);
          tp.printTree(lastParse, pw);
          pw.flush();
          lastParse = readTreeFromString(sb.getBuffer().toString());

          return new ParseResult(true, lastParse, lastParseScore);
        }
      } catch (Exception e) {
        // Any failure (e.g. empty input) yields the default unsuccessful result below.
        if (DEBUG) e.printStackTrace();
      }
    }

    lastParse = readTreeFromString("(ROOT (. .))");
    lastParseScore = -99999.0;
    return new ParseResult(false, lastParse, lastParseScore);
  }

예제 #25

0

파일 보기

파일: Pivot.java 프로젝트: chmr123/phd

  /**
   * Parses the sentence and collects the typed dependencies (CC-processed) that involve any of
   * the given keywords.
   *
   * <p>Each dependency is rendered as {@code reln(gov-i, dep-j)}. A keyword matches a term when
   * it equals the term either with or without its trailing {@code -index} suffix. A dependency
   * is added once per matching side, so it can appear more than once in the result.
   *
   * <p>NOTE(review): the parser model is loaded from a hard-coded path on every call.
   *
   * <p>NOTE(review): when lemmatizing the keyword changes its word count, the unfinished
   * lemmatization branch runs and an empty list is returned.
   *
   * @param sentence whitespace-separated sentence to parse
   * @param keyword one or more space-separated keywords
   * @return string forms of the matching dependencies, or an empty list in the lemmatization
   *     case described above
   */
  public ArrayList<String> getKeyWordsDependency(String sentence, String keyword) {
    LexicalizedParser lp =
        LexicalizedParser.loadModel(
            "/home/mingrui/Desktop/englishPCFG.ser.gz",
            "-maxLength",
            "80",
            "-retainTmpSubcategories");
    TreebankLanguagePack tlp = new PennTreebankLanguagePack();
    // Uncomment the following line to obtain original Stanford Dependencies
    // tlp.setGenerateOriginalDependencies(true);
    GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
    String[] array = sentence.split("\\s+");
    Tree parse = lp.apply(Sentence.toWordList(array));
    GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
    Collection<TypedDependency> tdl = gs.typedDependenciesCCprocessed();
    ArrayList<String> keywordsDependency = new ArrayList<String>();
    ArrayList<String> keywordsDependencyWithLemmatization = new ArrayList<String>();

    String[] keywordParts = keyword.split(" ");
    for (TypedDependency t : tdl) {
      String d = t.toString();
      // The argument pair sits between the first '(' and the final ')'.
      int open = d.indexOf('(');
      int close = d.lastIndexOf(')');
      if (open < 0 || close <= open) {
        continue;
      }
      // Governor and dependent are separated by ", "; limit 2 keeps a ", " inside the
      // dependent from producing extra pieces.
      String[] terms = d.substring(open + 1, close).split(", ", 2);
      if (terms.length < 2) {
        continue;
      }
      String term1 = terms[0].trim();
      String term2 = terms[1].trim();
      // Strip the "-<index>" suffix (optionally followed by copy marks) to get the bare word.
      String word1 = term1.replaceFirst("-\\d+'*$", "");
      String word2 = term2.replaceFirst("-\\d+'*$", "");

      // Match keywords with the terms in the tuples, if matched, add the
      // tuple into the arraylist
      for (String key : keywordParts) {
        if (term1.equals(key) || word1.equals(key)) {
          keywordsDependency.add(d);
        }
        if (term2.equals(key) || word2.equals(key)) {
          keywordsDependency.add(d);
        }
      }
    }

    String lemmatizedKeywords = lemmatize(keyword);
    int lbefore = keywordParts.length;
    int lafter = lemmatizedKeywords.split(" ").length;
    if (lbefore == lafter) {
      return keywordsDependency;
    } else {
      // NOTE(review): this branch is unfinished: nothing is ever added to
      // keywordsDependencyWithLemmatization.
      for (String s : keywordParts) {
        String[] lemmas = lemmatize(s).split(" ");
        boolean sameLength = lemmas.length == s.split(" ").length;
        if (sameLength) { // Compare the length of one key_word or key_phrase before and after
          // lemmatization
          continue;
        } else {
          for (String tuple : keywordsDependency) {
            if (getTupleTerms(tuple)[0].equals(
                s)) { // Find the tuple that contains the original keyword/key_phrase
              String dependent = getTupleTerms(tuple)[1];
              // String[]
            }
          }
          // for(String l : lemma)
        }
      }
      return keywordsDependencyWithLemmatization;
    }
  }