/**
   * Read parse trees from a Reader.
   *
   * @param filename
   * @param in The <code>Reader</code>
   * @param simplifiedTagset If `true`, convert part-of-speech labels to a simplified version of the
   *     EAGLES tagset, where the tags do not include extensive morphological analysis
   * @param aggressiveNormalization Perform aggressive "normalization" on the trees read from the
   *     provided corpus documents: split multi-word tokens into their constituent words (and infer
   *     parts of speech of the constituent words).
   * @param retainNER Retain NER information in preterminals (for later use in
   *     `MultiWordPreprocessor) and add NER-specific parents to single-word NE tokens
   * @param detailedAnnotations Retain detailed tree node annotations. These annotations on parse
   *     tree constituents may be useful for e.g. training a parser.
   */
  public SpanishXMLTreeReader(
      String filename,
      Reader in,
      boolean simplifiedTagset,
      boolean aggressiveNormalization,
      boolean retainNER,
      boolean detailedAnnotations) {
    TreebankLanguagePack tlp = new SpanishTreebankLanguagePack();

    this.simplifiedTagset = simplifiedTagset;
    this.detailedAnnotations = detailedAnnotations;

    stream = new ReaderInputStream(in, tlp.getEncoding());
    treeFactory = new LabeledScoredTreeFactory();
    treeNormalizer =
        new SpanishTreeNormalizer(simplifiedTagset, aggressiveNormalization, retainNER);

    DocumentBuilder parser = XMLUtils.getXmlParser();
    try {
      final Document xml = parser.parse(stream);
      final Element root = xml.getDocumentElement();
      sentences = root.getElementsByTagName(NODE_SENT);
      sentIdx = 0;

    } catch (SAXException e) {
      System.err.println("Parse exception while reading " + filename);
      e.printStackTrace();
    } catch (IOException e) {
      e.printStackTrace();
    }
  }
  /**
   * Parses a sentence and returns the parse tree.
   *
   * @param sentence a sentence
   * @return Tree character offsets in keys BEGIN_KEY and END_KEY
   */
  @SuppressWarnings("unchecked")
  public static Tree parseTree(String sentence) {
    if (tlp == null || parser == null)
      throw new RuntimeException("Parser has not been initialized");

    // parse the sentence to produce stanford Tree
    log.debug("Parsing sentence");
    Tree tree = null;
    synchronized (parser) {
      Tokenizer tokenizer = tlp.getTokenizerFactory().getTokenizer(new StringReader(sentence));
      List<Word> words = tokenizer.tokenize();
      log.debug("Tokenization: " + words);
      parser.parse(new Sentence(words));
      tree = parser.getBestParse();
    }

    // label tree with character extents
    // log.debug("Setting character extents");
    // updateTreeLabels(tree, tree, new MutableInteger(), new MutableInteger(-1));
    // log.debug("Creating offset mapping");
    // List<RangeMap> mapping = createMapping(sentence);
    // log.debug(mapping.toString());
    // log.debug("Applying offset mapping");
    // mapOffsets(tree, mapping);

    return tree;
  }
 /**
  * Initializes static resources.
  *
  * @throws Exception
  */
 public static void initialize() throws Exception {
   if (parser != null) return;
   Properties properties = Properties.loadFromClassName(StanfordParser.class.getName());
   tlp = new PennTreebankLanguagePack();
   gsf = tlp.grammaticalStructureFactory();
   String modelFile = properties.getProperty("modelFile");
   if (modelFile == null) throw new Exception("Required property '" + "modelFile' is undefined");
   parser = new LexicalizedParser(modelFile);
 }
Exemple #4
0
 private void populateTagsToBaseTags(TreebankLanguagePack tlp) {
   int total = tagIndex.size();
   tagsToBaseTags = new int[total];
   for (int i = 0; i < total; i++) {
     String tag = tagIndex.get(i);
     String baseTag = tlp.basicCategory(tag);
     int j = tagIndex.indexOf(baseTag, true);
     tagsToBaseTags[i] = j;
   }
 }
 private static List<TaggedWord> cleanTags(List<TaggedWord> twList, TreebankLanguagePack tlp) {
   int sz = twList.size();
   List<TaggedWord> l = new ArrayList<TaggedWord>(sz);
   for (int i = 0; i < sz; i++) {
     TaggedWord tw = twList.get(i);
     TaggedWord tw2 = new TaggedWord(tw.word(), tlp.basicCategory(tw.tag()));
     l.add(tw2);
   }
   return l;
 }
  private Collection<TypedDependency> parseSentenceTDL(String text) {
    System.out.println("Parsing sentence...");

    Collection<TypedDependency> tdl = null;
    TreebankLanguagePack tlp = lp.treebankLanguagePack();
    GrammaticalStructureFactory gsf = null;
    if (tlp.supportsGrammaticalStructures()) {
      gsf = tlp.grammaticalStructureFactory();
    }

    Reader reader = new StringReader(text);

    for (List<HasWord> sentence : new DocumentPreprocessor(reader)) {
      Tree parse = lp.apply(sentence);
      if (gsf != null) {
        GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
        tdl = gs.allTypedDependencies();
      }
    }
    return tdl;
  }
Exemple #7
0
  private static List<TypedDependency> getDependencies(String sentence) {

    if (pipeline == null) {
      loadModels();
    }

    TokenizerFactory<CoreLabel> tokenizerFactory =
        PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    Tokenizer<CoreLabel> tok = tokenizerFactory.getTokenizer(new StringReader(sentence));
    List<CoreLabel> rawWords2 = tok.tokenize();
    Tree parse = lp.apply(rawWords2);
    //        parse.pennPrint();
    //
    //        System.out.println(parse.toString());

    TreebankLanguagePack tlp = lp.treebankLanguagePack(); // PennTreebankLanguagePack for English
    GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
    GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
    List<TypedDependency> tdl = gs.typedDependenciesCCprocessed();

    return tdl;
  }
  /**
   * Parses a sentence and returns the PCFG score as a confidence measure.
   *
   * @param sentence a sentence
   * @return PCFG score
   */
  @SuppressWarnings("unchecked")
  public static double getPCFGScore(String sentence) {
    if (tlp == null || parser == null)
      throw new RuntimeException("Parser has not been initialized");

    // parse the sentence to produce PCFG score
    log.debug("Parsing sentence");
    double score;
    synchronized (parser) {
      Tokenizer tokenizer = tlp.getTokenizerFactory().getTokenizer(new StringReader(sentence));
      List<Word> words = tokenizer.tokenize();
      log.debug("Tokenization: " + words);
      parser.parse(new Sentence(words));
      score = parser.getPCFGScore();
    }

    return score;
  }
Exemple #9
0
public class Parser {

  private String grammar = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
  private String[] options = {"-maxLength", "80", "-retainTmpSubcategories"};
  private LexicalizedParser lp = LexicalizedParser.loadModel(grammar, options);
  private TreebankLanguagePack tlp = lp.getOp().langpack();
  private GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();

  public Parser() {}

  public LinkedList<String> getKeyWrodsFromSentence(String string) {
    LinkedList<String> list = new LinkedList<String>();

    String[] sent = string.split(" ");
    List<HasWord> sentence = new ArrayList<HasWord>();
    for (String word : sent) sentence.add(new Word(word));

    Tree parse = lp.parse(sentence);
    GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);

    List<TypedDependency> tdl = gs.typedDependenciesCCprocessed();

    String[] current;
    String type, key;
    List<CoreLabel> labelsList = parse.taggedLabeledYield();
    for (Label l : labelsList) {
      current = l.toString().split("-");
      type = current[0];
      if (type.equals("NN") || type.equals("NNS")) {
        key = sent[Integer.parseInt(current[1])];
        list.add(key);
      }
    }
    return list;
  }

  public LinkedList<String> getKeyWrodsFromSentenceTest(String string) {

    LinkedList<String> list = new LinkedList<String>();

    String[] sent = string.split(" ");
    List<HasWord> sentence = new ArrayList<HasWord>();
    for (String word : sent) {
      sentence.add(new Word(word));
    }

    Tree parse = lp.parse(sentence);
    parse.pennPrint();
    GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);

    List<TypedDependency> tdl = gs.typedDependenciesCCprocessed();
    System.out.println(tdl);

    System.out.println();

    System.out.println("The words of the sentence:");
    for (Label lab : parse.yield()) {
      if (lab instanceof CoreLabel) {
        System.out.println(((CoreLabel) lab).toString(CoreLabel.OutputFormat.VALUE_MAP));
      } else {
        System.out.println(lab);
      }
    }
    System.out.println();
    System.out.println("tagged");
    System.out.println(parse.taggedYield());

    List<CoreLabel> temp = parse.taggedLabeledYield();
    for (Label l : temp) {
      String[] sss = l.toString().split("-");
      String type = sss[0];
      System.out.println(sss[0] + "  " + sss[1] + "    " + sent[Integer.parseInt(sss[1])]);
    }

    for (Iterator<String> ite = list.iterator(); ite.hasNext(); ) System.out.println(ite.next());
    return list;
  }

  public static void main(String[] args) throws IOException {
    Parser parser = new Parser();
    parser.getKeyWrodsFromSentence(
        "When athletes begin to exercise, their heart rates and respiration rates increase.  At what level of organization does the human body coordinate these functions?");
    parser.getKeyWrodsFromSentenceTest(
        "When athletes begin to exercise, their heart rates and respiration rates increase.  At what level of organization does the human body coordinate these functions?");
    // main2();

  }
}
  /**
   * parse sentence and generate .trees file
   *
   * @param en
   * @param align
   * @param out
   */
  public static void parse(String en, String align, String out, boolean verbose) {

    // use alignments?
    boolean use_alignments = true;
    if (align.startsWith("no_align")) {
      use_alignments = false;
      System.err.println("Not using alignments.");
    } else {
      System.err.println("Using alignments from " + align);
    }

    // setup stanfordparser
    String grammar = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
    String[] options = {"-outputFormat", "wordsAndTags, typedDependencies"};
    LexicalizedParser lp = LexicalizedParser.loadModel(grammar, options);
    TreebankLanguagePack tlp = lp.getOp().langpack();
    java.util.function.Predicate<java.lang.String> punctuationFilter = x -> true;

    GrammaticalStructureFactory gsf =
        new edu.stanford.nlp.trees.EnglishGrammaticalStructureFactory(punctuationFilter);

    // read document
    Iterable<List<? extends HasWord>> sentences;
    Reader r = new Reader(en);
    String line = null;
    List<List<? extends HasWord>> tmp = new ArrayList<List<? extends HasWord>>();
    while ((line = r.getNext()) != null) {
      Tokenizer<? extends HasWord> token =
          tlp.getTokenizerFactory().getTokenizer(new StringReader(line));
      List<? extends HasWord> sentence = token.tokenize();
      tmp.add(sentence);
    }
    sentences = tmp;

    // set up alignment file reader
    Reader alignment = new Reader();
    if (use_alignments) {
      alignment = new Reader(align);
    }

    // set up tree file writer
    Writer treeWriter = new Writer(out);

    // parse
    long start = System.currentTimeMillis();
    // System.err.print("Parsing sentences ");
    int sentID = 0;
    for (List<? extends HasWord> sentence : sentences) {
      Tree t = new Tree();
      // t.setSentID(++sentID);
      System.err.println("parse Sentence :" + sentence + "...");
      // System.err.print(".");
      System.err.println("-----------------------------------------------------------------------");
      edu.stanford.nlp.trees.Tree parse = lp.parse(sentence);
      // parse.pennPrint();

      // List for root node and lexical nodes
      List<Node> loneNodes = new LinkedList<Node>();
      List<Node> governingNodes = new LinkedList<Node>();

      // ROOT node
      Node root = new Node(true, true);
      root.setTag("ROOT");
      t.setRoot(root);
      loneNodes.add(root);
      governingNodes.add(root);

      // tagging

      int counter = 0;
      String surface = "";
      String tag = "";

      for (TaggedWord tw : parse.taggedYield()) {
        Node n = new Node();
        Node governingNode = new Node();
        n.setNodeID(++counter);
        surface = tw.value();
        tag = tw.tag();
        if (surface.startsWith("-LRB-")) {
          surface = "(";
        } else if (surface.startsWith("-RRB-")) {
          surface = ")";
          // } else if (surface.startsWith("-LSB-")){
          //    surface = "[";
          // } else if (surface.startsWith("-RSB-")){
          //    surface = "]";
          // } else if (surface.startsWith("-LCB-")){
          //    surface = "{";
          // } else if (surface.startsWith("-RCB-")){
          //    surface = "}";
        } else if (surface.startsWith("''")) {
          surface = "\"";
        }
        tag = tag.replaceAll("#", "-NUM-");
        surface = surface.replaceAll("&", "-AMP-");
        surface = surface.replaceAll("#", "-NUM-");
        surface = surface.replaceAll(">", "-GRE-");
        surface = surface.replaceAll("=", "-EQU-");
        n.setInitialLexicalIndex(counter);
        governingNode.setInitialLexicalIndex(counter);
        n.setSurface(surface);
        // System.out.print("("+tw.value()+" : ");
        n.setTag(tag);
        governingNode.setTag("_" + tag);
        governingNode.setLabel("_gov");
        // System.out.print(tw.tag()+")");
        loneNodes.add(n);
        governingNodes.add(governingNode);
        governingNode.setChild(n);
      }

      // System.out.println("");

      // t.setSentLength(t.getNodes().size() - 1);
      // List<Node> loneNodes = new LinkedList<Node>();
      Node[] nodes = new Node[2000];
      // labeling
      int depIndex;
      int govIndex;
      String[] depInfo;
      String[] govInfo;
      GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
      List<TypedDependency> tdl = gs.typedDependencies(false);
      // List<TypedDependency> tdl = gs.typedDependenciesCCprocessed();
      for (TypedDependency td : tdl) {
        depIndex = td.dep().index();
        govIndex = td.gov().index();
        // System.out.println("Index1:"+depIndex);
        // System.out.println("Index2:"+govIndex);
        // if (nodes[depIndex] == null){
        //	System.out.println("Making node!");
        //	nodes[depIndex] = new Node();
        // }
        // if (nodes[govIndex] == null){
        //	System.out.println("Making node!");
        //	nodes[govIndex] = new Node();
        // }
        Node dep = loneNodes.get((depIndex));
        Node gov = governingNodes.get((govIndex));
        Node depcopy = governingNodes.get((depIndex));
        Node govcopy = loneNodes.get((govIndex));
        dep.setLabel(td.reln().toString());
        depcopy.setLabel(td.reln().toString());
        govcopy.setLabel("head");
        // System.out.println(td.toString());
        govInfo = td.gov().toString().split("/");
        depInfo = td.dep().toString().split("/");
        // System.out.println(td.gov().toString());
        // System.out.println(td.dep().toString());
        // dep.setSurface(depInfo[0]);
        // dep.setTag(depInfo[1]);
        gov.setChild(governingNodes.get(depIndex));
        governingNodes.get(depIndex).setParent(gov);
        // gov.setChild(dep);
        dep.setParent(governingNodes.get(depIndex));
      }
      // t.setRoot(nodes[0]);

      // Collapse tree to remove unneeded governing nodes:

      Node gov;
      Node dep;
      Node parent;
      List<Node> children;

      for (int i = 1; i < governingNodes.size(); i++) { // start with index 1 to skip root
        gov = governingNodes.get(i);
        dep = loneNodes.get(i);
        if (gov.getChildren().size() <= 1) {
          int k = 0;
          parent = gov.getParent();
          children = parent.getChildren();

          for (Node n : children) {
            if (n == gov) {
              gov.getParent().replaceChild(k, dep);
              dep.setParent(gov.getParent());
            }
            k++;
          }
        }
      }
      // Mark head nodes with appropriate label:
      int k = 0;
      for (Node n : loneNodes) {
        if (k != 0) {
          if (n.getLabel() == n.getParent().getLabel()) {
            n.setLabel("head");
          }
        } else {
          n.setLabel("null");
        }
        k++;
      }
      // Sort lexical children of each governing node in lexical order

      for (Node n : governingNodes) {
        n.sortChildrenByInitialIndex();
      }

      // combine with alignment
      if (use_alignments) {
        t.initialize(alignment.readNextAlign());
      } else {
        t.initializeUnaligned();
      }

      // write tree to file
      treeWriter.write(t);

      // print tree to console

      System.out.println(t.toSentence());
      if (verbose) {
        System.err.println(t.toString());
        // t.recursivePrint();
      }
      System.err.println("#######################################################################");
    }
    long stop = System.currentTimeMillis();
    System.err.println("...done! [" + (stop - start) / 1000 + " sec].");

    treeWriter.close();
  }
 /**
  * Stores the passed-in TreebankLanguagePack and sets up charset encodings.
  *
  * @param tlp The treebank language pack to use
  */
 protected AbstractTreebankParserParams(TreebankLanguagePack tlp) {
   this.tlp = tlp;
   inputEncoding = tlp.getEncoding();
   outputEncoding = tlp.getEncoding();
 }
Exemple #12
0
 @Test
 public void testPTBTokenizerGerman() {
   String[] sample = {
     "Das TV-Duell von Kanzlerin Merkel und SPD-Herausforderer Steinbrück war eher lahm - können es die Spitzenleute der kleinen Parteien besser? ",
     "Die erquickende Sicherheit und Festigkeit in der Bewegung, den Vorrat von Kraft, kann ja die Versammlung nicht fühlen, hören will sie sie nicht, also muß sie sie sehen; und die sehe man einmal in einem Paar spitzen Schultern, zylindrischen Schenkeln, oder leeren Ärmeln, oder lattenförmigen Beinen."
   };
   String[][] tokenized = {
     {
       "Das",
       "TV-Duell",
       "von",
       "Kanzlerin",
       "Merkel",
       "und",
       "SPD-Herausforderer",
       "Steinbrück",
       "war",
       "eher",
       "lahm",
       "-",
       "können",
       "es",
       "die",
       "Spitzenleute",
       "der",
       "kleinen",
       "Parteien",
       "besser",
       "?",
     },
     {
       "Die",
       "erquickende",
       "Sicherheit",
       "und",
       "Festigkeit",
       "in",
       "der",
       "Bewegung",
       ",",
       "den",
       "Vorrat",
       "von",
       "Kraft",
       ",",
       "kann",
       "ja",
       "die",
       "Versammlung",
       "nicht",
       "fühlen",
       ",",
       "hören",
       "will",
       "sie",
       "sie",
       "nicht",
       ",",
       "also",
       "muß",
       "sie",
       "sie",
       "sehen",
       ";",
       "und",
       "die",
       "sehe",
       "man",
       "einmal",
       "in",
       "einem",
       "Paar",
       "spitzen",
       "Schultern",
       ",",
       "zylindrischen",
       "Schenkeln",
       ",",
       "oder",
       "leeren",
       "Ärmeln",
       ",",
       "oder",
       "lattenförmigen",
       "Beinen",
       "."
     }
   };
   TreebankLanguagePack tlp = new NegraPennLanguagePack();
   TokenizerFactory tokFactory = tlp.getTokenizerFactory();
   runOnTwoArrays(tokFactory, sample, tokenized);
 }
Exemple #13
0
 public String project(String tagStr) {
   // return tagStr;
   String ret = tlp.basicCategory(tagStr);
   // System.err.println("BCTP mapped " + tagStr + " to " + ret);
   return ret;
 }
  public static void main(String args[]) throws IOException {
    long startTime = System.currentTimeMillis();

    LexicalizedParser lp = new LexicalizedParser("englishPCFG.ser.gz");
    TokenizerFactory tf = PTBTokenizer.factory(false, new WordTokenFactory());
    TreePrint tp = new TreePrint("penn,typedDependenciesCollapsed");
    String sentence = "Where did the first President die ?";

    System.out.println("Enter the question or press enter for default : ");
    String tempInput;
    BufferedReader b1 = new BufferedReader(new InputStreamReader(System.in));
    tempInput = b1.readLine();
    if (tempInput.length() == 0)
      System.out.println("The question is the default one : " + sentence);
    else {
      sentence = tempInput;
      System.out.println("The question entered is : " + sentence);
    }

    String sentence1 = PreProcess.removeStopWords1(sentence);

    System.out.println(sentence1);
    StringTokenizer st1 = new StringTokenizer(sentence1, " ");
    int n = 0;
    while (st1.hasMoreTokens()) {
      String temp1 = st1.nextToken();
      //	System.out.println("temp replace all is
      // "+temp1.replaceAll("'s","").replaceAll("[^A-Za-z]",""));
      map.put(n, temp1.replaceAll("'s", "").replaceAll("[^A-Za-z]", ""));

      n++;
    }
    //	for(int s=0;s<n;s++)
    //		System.out.println(map.get(s));
    List tokens = tf.getTokenizer(new StringReader(sentence)).tokenize();
    lp.parse(tokens); // parse the tokens
    Tree t = lp.getBestParse(); // get the best parse tree\

    tp.printTree(t);
    System.out.println("\nPROCESSED:\n\n"); // tp.printTree(t); // print tree
    // dependencies only print
    TreebankLanguagePack tlp = new PennTreebankLanguagePack();
    GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
    GrammaticalStructure gs = gsf.newGrammaticalStructure(t);

    // dependencies

    //		Tree b = t.firstChild();
    //	System.out.println("\nFirst child of the tree is :\n\n"); tp.printTree(b);
    String dependency = gs.typedDependenciesCollapsed().toString();
    System.out.println("Dependencies :" + dependency);
    //	BufferedReader reader = new BufferedReader( new InputStreamReader(System.in) );
    //	String wordForm = reader.readLine();
    String wordForm = "yes";
    int i = -1;
    String s[][] = new String[20][3];

    if (wordForm.equals("yes")) {
      StringTokenizer st = new StringTokenizer(dependency, " ([)],");
      while (st.hasMoreTokens()) {
        String as = st.nextToken();
        System.out.println(as);
        if (!as.contains("-")) {
          i++;
          s[i][0] = as;
        } else {
          s[i][1] = as;
          s[i][2] = st.nextToken();
        }
      }
    }

    length = i + 1;
    interchange1(s);
    System.out.println("The sorted version is ");
    //	System.out.println("\n\n***********Li8 from here on***********");
    for (i = 0; i < length; i++) {
      for (int j = 0; j < 3; j++) {
        System.out.print(s[i][j] + " ");
      }
      System.out.println();
    }

    // int adjmatrix[][] = new int[length][length];
    System.out.println("What answer type is required: ");
    BufferedReader reader = new BufferedReader(new InputStreamReader(System.in));

    String answtype = reader.readLine();
    String[] temp;
    temp = sentence.split(" ", 2);
    int g = 0;
    int h = 0;
    String secque = null;

    // dijikstra implementation
    int adjmatrix[][] = new int[length][length];
    int j = 0;
    for (i = 0; i < length; i++) for (j = 0; j < length; j++) adjmatrix[i][j] = 100;
    formadj(adjmatrix, s);
    print(adjmatrix);
    //	Dijikstraalgo.dijikstra(adjmatrix,length-2);
    //	Dijikstraalgo.dijikstra(adjmatrix,length-1);
    if (Dijikstraalgo.dijikstra(adjmatrix, length - 1)
            - Dijikstraalgo.dijikstra(adjmatrix, length - 2)
        >= 0) {
      System.out.println("Type 1");
      if (makesentence(s, length - 1) == null) {
        secque = s[length - 1][2] + " " + s[length - 1][1];
        System.out.println(answtype + " is " + s[length - 1][2] + " " + s[length - 1][1] + " ?");

      } else {
        secque = makesentence(s, length - 1);
        System.out.println(answtype + " is " + secque + " ?");
      }
    } else {
      System.out.println("Type 2");
      System.out.println(
          "Before entering the makesentence function(the cause of the null pointer exception) "
              + s[length - 2][0]
              + " "
              + s[length - 2][1]);
      if (makesentence(s, length - 2) == null) {

        secque = s[length - 2][2] + " " + s[length - 2][1];
        System.out.println(answtype + " is " + s[length - 2][2] + " " + s[length - 2][1] + " ?");
      } else {
        //	System.out.println("null");
        secque = makesentence(s, length - 2);

        System.out.println(answtype + " is " + secque + " ?");
      }
    }
    //	System.out.println("Secque is "+secque.replaceAll("[^A-Za-z ]",""));
    System.out.println(sentence.replace(secque.replaceAll("[^A-Za-z ]", ""), ""));

    long endTime = System.currentTimeMillis();
    System.out.println("The time elapsed is : " + (int) (endTime - startTime) / 1000);
    System.out.println("The end");
  }
Exemple #15
0
		public HashMap<String,ArrayList<TreeData>> parseAllDocs() throws IOException{ 
			String grammar =  "./jsan_resources/englishPCFG.ser.gz";
			String[] options = { "-maxLength", "120", "-retainTmpSubcategories" };
//			LexicalizedParser lp = new LexicalizedParser(grammar, options);
			
			LexicalizedParser lp = new LexicalizedParser()
			TreebankLanguagePack tlp = new PennTreebankLanguagePack();
			
			GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
			Iterable<List<? extends HasWord>> sentences;
			ArrayList<HashMap<String,ArrayList<String>>> everything = new ArrayList<HashMap<String,ArrayList<String>>>(3); 
			everything.add(0,otherSampleStrings);
			everything.add(1,authorSampleStrings);
			everything.add(2,toModifyStrings);
			Iterator<HashMap<String,ArrayList<String>>> everythingIter = everything.iterator();
			int docTypeNumber = -1; // 0 for otherSampleStrings, 1 for authorSampleStrings, 2 for toModifyStrings
			int numLoaded = 0;
			while(everythingIter.hasNext()){
				docTypeNumber++;
				HashMap<String,ArrayList<String>> currentSampleStrings = docPathFinder();
				Set<String> currentDocStrings = currentSampleStrings.keySet();
				Iterator<String> docStrIter = currentDocStrings.iterator();
				String docID;
				ArrayList<String> sentenceTokens;
				allTreeProcessors[docTypeNumber]  = new TreeProcessor();
				allTreeProcessors[docTypeNumber].clearLoadedTreeDataMaps();
				numLoaded=0;
				while(docStrIter.hasNext()){
					docID = docStrIter.next();
					sentenceTokens = currentSampleStrings.get(docID);
					if(sentenceTokens == null){
						allTreeProcessors[docTypeNumber].loadTreeDataMap(docID, GRAMMAR_DIR, false);
						numLoaded++;
						continue;
					}
					//System.out.println(sentenceTokens.size()+", strIter.hasNext? -> "+strIter.hasNext());

					numSentences = sentenceTokens.size();
					//initialize(numSentences);
					Iterator<String> sentIter = sentenceTokens.iterator();
					List<List<? extends HasWord>> tmp = new ArrayList<List<? extends HasWord>>();
					String tempSent;
					while(sentIter.hasNext()){
						tempSent = sentIter.next();
						Tokenizer<? extends HasWord> toke = tlp.getTokenizerFactory().getTokenizer(new StringReader(tempSent));
						List<? extends HasWord> sentenceTokenized = toke.tokenize();
						tmp.add(sentenceTokenized);
					}
					
					sentences = tmp;
					//int numDone = 0;
					TreeProcessor.singleDocMap.clear();
					boolean willSaveResults = true;
					for (List<? extends HasWord> sentence : sentences) {
						Tree parse = lp.apply(sentence);
						//parse.pennPrint();
						//System.out.println(parse.treeSkeletonCopy().toString());
						//System.out.println(parse.taggedYield());
						//System.out.println();
						//printSubTrees(parse);
						//TreeContainer.recurseTree(parse,"breadth");
						allTreeProcessors[docTypeNumber].processTree(parse, 0, willSaveResults); 
						//System.out.println(tc.processedTrees.toString().replaceAll("\\]\\], \\(","\\]\\]\n\\("));
						//numDone++;
						//System.out.println("sent "+numDone+" of "+numSentences+" done ");
						//System.out.println(tc.processedTrees.toString());
						//in.nextLine();
						//TreeContainer.recurseTree(parse, "depth");
						//in.nextLine();
						//addTree(parse);
						//GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);//TODO: LOOK AT THIS
						//Collection tdl = gs.typedDependenciesCCprocessed(true);
						//System.out.println(tdl);
						//System.out.println();
					}
					if(willSaveResults == true)
						ObjectIO.writeObject(TreeProcessor.singleDocMap,docID, GRAMMAR_DIR);

					//System.out.println("After all sents: ");
					//System.out.println(tc.processedTrees.toString().replaceAll("\\]\\], \\(","\\]\\]\n\\("));
					//String sent3 = "This is one last test!";
					//Tree parse3 = lp.apply(sent3);
					//parse3.pennPrint();
					//System.out.println("After sorting and writing:");
					//System.out.println(tc.processedTrees.toString().replaceAll("\\]\\], \\(","\\]\\]\n\\("));
					//Scanner in = new Scanner(System.in);
					//System.out.println("First one done.");
					//in.nextLine();
					//viewTrees();
				}
				
				//TreeProcessor.writeTreeDataToCSV(sortedTD,docID);
				allTreeProcessors[docTypeNumber].unmergedMaps = new ArrayList<HashMap<String,TreeData>>(numLoaded+1);
				
			}	
			
			
			int i= 0;
			allParsedAndOrdered.clear();
			String[] docTypes = new String[]{"otherSample","authorSample","toModify"};
			for(i=0; i < 3; i++){
				allTreeProcessors[i].unmergedMaps.add(allTreeProcessors[i].processedTrees);
				allTreeProcessors[i].unmergedMaps.addAll(allTreeProcessors[i].loadedTreeDataMaps);
				allTreeProcessors[i].mergeTreeDataLists(allTreeProcessors[i].unmergedMaps);
				allParsedAndOrdered.put(docTypes[i],allTreeProcessors[i].sortTreeData(allTreeProcessors[i].mergedMap));
				
			}
			
			//ArrayList<TreeData> sortedTD = TreeContainer.sortTreeData(TreeContainer.allProcessedTrees);
			//TreeContainer.writeTreeDataToCSV(sortedTD,"ALL_AUTHORS");
			
			return allParsedAndOrdered;
		}
 private Collection<TypedDependency> getDependencies(Tree sentenceParseTree) {
   final TreebankLanguagePack tlp = new PennTreebankLanguagePack();
   final GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
   final GrammaticalStructure gs = gsf.newGrammaticalStructure(sentenceParseTree);
   return gs.typedDependenciesCollapsed();
 }
  public static void main(String[] args) {
    Options op = new Options(new EnglishTreebankParserParams());
    // op.tlpParams may be changed to something else later, so don't use it till
    // after options are parsed.

    System.out.println(StringUtils.toInvocationString("FactoredParser", args));

    String path = "/u/nlp/stuff/corpora/Treebank3/parsed/mrg/wsj";
    int trainLow = 200, trainHigh = 2199, testLow = 2200, testHigh = 2219;
    String serializeFile = null;

    int i = 0;
    while (i < args.length && args[i].startsWith("-")) {
      if (args[i].equalsIgnoreCase("-path") && (i + 1 < args.length)) {
        path = args[i + 1];
        i += 2;
      } else if (args[i].equalsIgnoreCase("-train") && (i + 2 < args.length)) {
        trainLow = Integer.parseInt(args[i + 1]);
        trainHigh = Integer.parseInt(args[i + 2]);
        i += 3;
      } else if (args[i].equalsIgnoreCase("-test") && (i + 2 < args.length)) {
        testLow = Integer.parseInt(args[i + 1]);
        testHigh = Integer.parseInt(args[i + 2]);
        i += 3;
      } else if (args[i].equalsIgnoreCase("-serialize") && (i + 1 < args.length)) {
        serializeFile = args[i + 1];
        i += 2;
      } else if (args[i].equalsIgnoreCase("-tLPP") && (i + 1 < args.length)) {
        try {
          op.tlpParams = (TreebankLangParserParams) Class.forName(args[i + 1]).newInstance();
        } catch (ClassNotFoundException e) {
          System.err.println("Class not found: " + args[i + 1]);
          throw new RuntimeException(e);
        } catch (InstantiationException e) {
          System.err.println("Couldn't instantiate: " + args[i + 1] + ": " + e.toString());
          throw new RuntimeException(e);
        } catch (IllegalAccessException e) {
          System.err.println("illegal access" + e);
          throw new RuntimeException(e);
        }
        i += 2;
      } else if (args[i].equals("-encoding")) {
        // sets encoding for TreebankLangParserParams
        op.tlpParams.setInputEncoding(args[i + 1]);
        op.tlpParams.setOutputEncoding(args[i + 1]);
        i += 2;
      } else {
        i = op.setOptionOrWarn(args, i);
      }
    }
    // System.out.println(tlpParams.getClass());
    TreebankLanguagePack tlp = op.tlpParams.treebankLanguagePack();

    op.trainOptions.sisterSplitters =
        new HashSet<String>(Arrays.asList(op.tlpParams.sisterSplitters()));
    //    BinarizerFactory.TreeAnnotator.setTreebankLang(tlpParams);
    PrintWriter pw = op.tlpParams.pw();

    op.testOptions.display();
    op.trainOptions.display();
    op.display();
    op.tlpParams.display();

    // setup tree transforms
    Treebank trainTreebank = op.tlpParams.memoryTreebank();
    MemoryTreebank testTreebank = op.tlpParams.testMemoryTreebank();
    // Treebank blippTreebank = ((EnglishTreebankParserParams) tlpParams).diskTreebank();
    // String blippPath = "/afs/ir.stanford.edu/data/linguistic-data/BLLIP-WSJ/";
    // blippTreebank.loadPath(blippPath, "", true);

    Timing.startTime();
    System.err.print("Reading trees...");
    testTreebank.loadPath(path, new NumberRangeFileFilter(testLow, testHigh, true));
    if (op.testOptions.increasingLength) {
      Collections.sort(testTreebank, new TreeLengthComparator());
    }

    trainTreebank.loadPath(path, new NumberRangeFileFilter(trainLow, trainHigh, true));
    Timing.tick("done.");

    System.err.print("Binarizing trees...");
    TreeAnnotatorAndBinarizer binarizer;
    if (!op.trainOptions.leftToRight) {
      binarizer =
          new TreeAnnotatorAndBinarizer(
              op.tlpParams, op.forceCNF, !op.trainOptions.outsideFactor(), true, op);
    } else {
      binarizer =
          new TreeAnnotatorAndBinarizer(
              op.tlpParams.headFinder(),
              new LeftHeadFinder(),
              op.tlpParams,
              op.forceCNF,
              !op.trainOptions.outsideFactor(),
              true,
              op);
    }

    CollinsPuncTransformer collinsPuncTransformer = null;
    if (op.trainOptions.collinsPunc) {
      collinsPuncTransformer = new CollinsPuncTransformer(tlp);
    }
    TreeTransformer debinarizer = new Debinarizer(op.forceCNF);
    List<Tree> binaryTrainTrees = new ArrayList<Tree>();

    if (op.trainOptions.selectiveSplit) {
      op.trainOptions.splitters =
          ParentAnnotationStats.getSplitCategories(
              trainTreebank,
              op.trainOptions.tagSelectiveSplit,
              0,
              op.trainOptions.selectiveSplitCutOff,
              op.trainOptions.tagSelectiveSplitCutOff,
              op.tlpParams.treebankLanguagePack());
      if (op.trainOptions.deleteSplitters != null) {
        List<String> deleted = new ArrayList<String>();
        for (String del : op.trainOptions.deleteSplitters) {
          String baseDel = tlp.basicCategory(del);
          boolean checkBasic = del.equals(baseDel);
          for (Iterator<String> it = op.trainOptions.splitters.iterator(); it.hasNext(); ) {
            String elem = it.next();
            String baseElem = tlp.basicCategory(elem);
            boolean delStr = checkBasic && baseElem.equals(baseDel) || elem.equals(del);
            if (delStr) {
              it.remove();
              deleted.add(elem);
            }
          }
        }
        System.err.println("Removed from vertical splitters: " + deleted);
      }
    }
    if (op.trainOptions.selectivePostSplit) {
      TreeTransformer myTransformer =
          new TreeAnnotator(op.tlpParams.headFinder(), op.tlpParams, op);
      Treebank annotatedTB = trainTreebank.transform(myTransformer);
      op.trainOptions.postSplitters =
          ParentAnnotationStats.getSplitCategories(
              annotatedTB,
              true,
              0,
              op.trainOptions.selectivePostSplitCutOff,
              op.trainOptions.tagSelectivePostSplitCutOff,
              op.tlpParams.treebankLanguagePack());
    }

    if (op.trainOptions.hSelSplit) {
      binarizer.setDoSelectiveSplit(false);
      for (Tree tree : trainTreebank) {
        if (op.trainOptions.collinsPunc) {
          tree = collinsPuncTransformer.transformTree(tree);
        }
        // tree.pennPrint(tlpParams.pw());
        tree = binarizer.transformTree(tree);
        // binaryTrainTrees.add(tree);
      }
      binarizer.setDoSelectiveSplit(true);
    }
    for (Tree tree : trainTreebank) {
      if (op.trainOptions.collinsPunc) {
        tree = collinsPuncTransformer.transformTree(tree);
      }
      tree = binarizer.transformTree(tree);
      binaryTrainTrees.add(tree);
    }
    if (op.testOptions.verbose) {
      binarizer.dumpStats();
    }

    List<Tree> binaryTestTrees = new ArrayList<Tree>();
    for (Tree tree : testTreebank) {
      if (op.trainOptions.collinsPunc) {
        tree = collinsPuncTransformer.transformTree(tree);
      }
      tree = binarizer.transformTree(tree);
      binaryTestTrees.add(tree);
    }
    Timing.tick("done."); // binarization
    BinaryGrammar bg = null;
    UnaryGrammar ug = null;
    DependencyGrammar dg = null;
    // DependencyGrammar dgBLIPP = null;
    Lexicon lex = null;
    Index<String> stateIndex = new HashIndex<String>();

    // extract grammars
    Extractor<Pair<UnaryGrammar, BinaryGrammar>> bgExtractor =
        new BinaryGrammarExtractor(op, stateIndex);
    // Extractor bgExtractor = new SmoothedBinaryGrammarExtractor();//new BinaryGrammarExtractor();
    // Extractor lexExtractor = new LexiconExtractor();

    // Extractor dgExtractor = new DependencyMemGrammarExtractor();

    if (op.doPCFG) {
      System.err.print("Extracting PCFG...");
      Pair<UnaryGrammar, BinaryGrammar> bgug = null;
      if (op.trainOptions.cheatPCFG) {
        List<Tree> allTrees = new ArrayList<Tree>(binaryTrainTrees);
        allTrees.addAll(binaryTestTrees);
        bgug = bgExtractor.extract(allTrees);
      } else {
        bgug = bgExtractor.extract(binaryTrainTrees);
      }
      bg = bgug.second;
      bg.splitRules();
      ug = bgug.first;
      ug.purgeRules();
      Timing.tick("done.");
    }
    System.err.print("Extracting Lexicon...");
    Index<String> wordIndex = new HashIndex<String>();
    Index<String> tagIndex = new HashIndex<String>();
    lex = op.tlpParams.lex(op, wordIndex, tagIndex);
    lex.train(binaryTrainTrees);
    Timing.tick("done.");

    if (op.doDep) {
      System.err.print("Extracting Dependencies...");
      binaryTrainTrees.clear();
      Extractor<DependencyGrammar> dgExtractor =
          new MLEDependencyGrammarExtractor(op, wordIndex, tagIndex);
      // dgBLIPP = (DependencyGrammar) dgExtractor.extract(new
      // ConcatenationIterator(trainTreebank.iterator(),blippTreebank.iterator()),new
      // TransformTreeDependency(tlpParams,true));

      // DependencyGrammar dg1 = dgExtractor.extract(trainTreebank.iterator(), new
      // TransformTreeDependency(op.tlpParams, true));
      // dgBLIPP=(DependencyGrammar)dgExtractor.extract(blippTreebank.iterator(),new
      // TransformTreeDependency(tlpParams));

      // dg = (DependencyGrammar) dgExtractor.extract(new
      // ConcatenationIterator(trainTreebank.iterator(),blippTreebank.iterator()),new
      // TransformTreeDependency(tlpParams));
      // dg=new DependencyGrammarCombination(dg1,dgBLIPP,2);
      dg =
          dgExtractor.extract(
              binaryTrainTrees); // uses information whether the words are known or not, discards
      // unknown words
      Timing.tick("done.");
      // System.out.print("Extracting Unknown Word Model...");
      // UnknownWordModel uwm = (UnknownWordModel)uwmExtractor.extract(binaryTrainTrees);
      // Timing.tick("done.");
      System.out.print("Tuning Dependency Model...");
      dg.tune(binaryTestTrees);
      // System.out.println("TUNE DEPS: "+tuneDeps);
      Timing.tick("done.");
    }

    BinaryGrammar boundBG = bg;
    UnaryGrammar boundUG = ug;

    GrammarProjection gp = new NullGrammarProjection(bg, ug);

    // serialization
    if (serializeFile != null) {
      System.err.print("Serializing parser...");
      LexicalizedParser.saveParserDataToSerialized(
          new ParserData(lex, bg, ug, dg, stateIndex, wordIndex, tagIndex, op), serializeFile);
      Timing.tick("done.");
    }

    // test: pcfg-parse and output

    ExhaustivePCFGParser parser = null;
    if (op.doPCFG) {
      parser = new ExhaustivePCFGParser(boundBG, boundUG, lex, op, stateIndex, wordIndex, tagIndex);
    }

    ExhaustiveDependencyParser dparser =
        ((op.doDep && !op.testOptions.useFastFactored)
            ? new ExhaustiveDependencyParser(dg, lex, op, wordIndex, tagIndex)
            : null);

    Scorer scorer =
        (op.doPCFG ? new TwinScorer(new ProjectionScorer(parser, gp, op), dparser) : null);
    // Scorer scorer = parser;
    BiLexPCFGParser bparser = null;
    if (op.doPCFG && op.doDep) {
      bparser =
          (op.testOptions.useN5)
              ? new BiLexPCFGParser.N5BiLexPCFGParser(
                  scorer, parser, dparser, bg, ug, dg, lex, op, gp, stateIndex, wordIndex, tagIndex)
              : new BiLexPCFGParser(
                  scorer,
                  parser,
                  dparser,
                  bg,
                  ug,
                  dg,
                  lex,
                  op,
                  gp,
                  stateIndex,
                  wordIndex,
                  tagIndex);
    }

    Evalb pcfgPE = new Evalb("pcfg  PE", true);
    Evalb comboPE = new Evalb("combo PE", true);
    AbstractEval pcfgCB = new Evalb.CBEval("pcfg  CB", true);

    AbstractEval pcfgTE = new TaggingEval("pcfg  TE");
    AbstractEval comboTE = new TaggingEval("combo TE");
    AbstractEval pcfgTEnoPunct = new TaggingEval("pcfg nopunct TE");
    AbstractEval comboTEnoPunct = new TaggingEval("combo nopunct TE");
    AbstractEval depTE = new TaggingEval("depnd TE");

    AbstractEval depDE =
        new UnlabeledAttachmentEval("depnd DE", true, null, tlp.punctuationWordRejectFilter());
    AbstractEval comboDE =
        new UnlabeledAttachmentEval("combo DE", true, null, tlp.punctuationWordRejectFilter());

    if (op.testOptions.evalb) {
      EvalbFormatWriter.initEVALBfiles(op.tlpParams);
    }

    // int[] countByLength = new int[op.testOptions.maxLength+1];

    // Use a reflection ruse, so one can run this without needing the
    // tagger.  Using a function rather than a MaxentTagger means we
    // can distribute a version of the parser that doesn't include the
    // entire tagger.
    Function<List<? extends HasWord>, ArrayList<TaggedWord>> tagger = null;
    if (op.testOptions.preTag) {
      try {
        Class[] argsClass = {String.class};
        Object[] arguments = new Object[] {op.testOptions.taggerSerializedFile};
        tagger =
            (Function<List<? extends HasWord>, ArrayList<TaggedWord>>)
                Class.forName("edu.stanford.nlp.tagger.maxent.MaxentTagger")
                    .getConstructor(argsClass)
                    .newInstance(arguments);
      } catch (Exception e) {
        System.err.println(e);
        System.err.println("Warning: No pretagging of sentences will be done.");
      }
    }

    for (int tNum = 0, ttSize = testTreebank.size(); tNum < ttSize; tNum++) {
      Tree tree = testTreebank.get(tNum);
      int testTreeLen = tree.yield().size();
      if (testTreeLen > op.testOptions.maxLength) {
        continue;
      }
      Tree binaryTree = binaryTestTrees.get(tNum);
      // countByLength[testTreeLen]++;
      System.out.println("-------------------------------------");
      System.out.println("Number: " + (tNum + 1));
      System.out.println("Length: " + testTreeLen);

      // tree.pennPrint(pw);
      // System.out.println("XXXX The binary tree is");
      // binaryTree.pennPrint(pw);
      // System.out.println("Here are the tags in the lexicon:");
      // System.out.println(lex.showTags());
      // System.out.println("Here's the tagnumberer:");
      // System.out.println(Numberer.getGlobalNumberer("tags").toString());

      long timeMil1 = System.currentTimeMillis();
      Timing.tick("Starting parse.");
      if (op.doPCFG) {
        // System.err.println(op.testOptions.forceTags);
        if (op.testOptions.forceTags) {
          if (tagger != null) {
            // System.out.println("Using a tagger to set tags");
            // System.out.println("Tagged sentence as: " +
            // tagger.processSentence(cutLast(wordify(binaryTree.yield()))).toString(false));
            parser.parse(addLast(tagger.apply(cutLast(wordify(binaryTree.yield())))));
          } else {
            // System.out.println("Forcing tags to match input.");
            parser.parse(cleanTags(binaryTree.taggedYield(), tlp));
          }
        } else {
          // System.out.println("XXXX Parsing " + binaryTree.yield());
          parser.parse(binaryTree.yieldHasWord());
        }
        // Timing.tick("Done with pcfg phase.");
      }
      if (op.doDep) {
        dparser.parse(binaryTree.yieldHasWord());
        // Timing.tick("Done with dependency phase.");
      }
      boolean bothPassed = false;
      if (op.doPCFG && op.doDep) {
        bothPassed = bparser.parse(binaryTree.yieldHasWord());
        // Timing.tick("Done with combination phase.");
      }
      long timeMil2 = System.currentTimeMillis();
      long elapsed = timeMil2 - timeMil1;
      System.err.println("Time: " + ((int) (elapsed / 100)) / 10.00 + " sec.");
      // System.out.println("PCFG Best Parse:");
      Tree tree2b = null;
      Tree tree2 = null;
      // System.out.println("Got full best parse...");
      if (op.doPCFG) {
        tree2b = parser.getBestParse();
        tree2 = debinarizer.transformTree(tree2b);
      }
      // System.out.println("Debinarized parse...");
      // tree2.pennPrint();
      // System.out.println("DepG Best Parse:");
      Tree tree3 = null;
      Tree tree3db = null;
      if (op.doDep) {
        tree3 = dparser.getBestParse();
        // was: but wrong Tree tree3db = debinarizer.transformTree(tree2);
        tree3db = debinarizer.transformTree(tree3);
        tree3.pennPrint(pw);
      }
      // tree.pennPrint();
      // ((Tree)binaryTrainTrees.get(tNum)).pennPrint();
      // System.out.println("Combo Best Parse:");
      Tree tree4 = null;
      if (op.doPCFG && op.doDep) {
        try {
          tree4 = bparser.getBestParse();
          if (tree4 == null) {
            tree4 = tree2b;
          }
        } catch (NullPointerException e) {
          System.err.println("Blocked, using PCFG parse!");
          tree4 = tree2b;
        }
      }
      if (op.doPCFG && !bothPassed) {
        tree4 = tree2b;
      }
      // tree4.pennPrint();
      if (op.doDep) {
        depDE.evaluate(tree3, binaryTree, pw);
        depTE.evaluate(tree3db, tree, pw);
      }
      TreeTransformer tc = op.tlpParams.collinizer();
      TreeTransformer tcEvalb = op.tlpParams.collinizerEvalb();
      if (op.doPCFG) {
        // System.out.println("XXXX Best PCFG was: ");
        // tree2.pennPrint();
        // System.out.println("XXXX Transformed best PCFG is: ");
        // tc.transformTree(tree2).pennPrint();
        // System.out.println("True Best Parse:");
        // tree.pennPrint();
        // tc.transformTree(tree).pennPrint();
        pcfgPE.evaluate(tc.transformTree(tree2), tc.transformTree(tree), pw);
        pcfgCB.evaluate(tc.transformTree(tree2), tc.transformTree(tree), pw);
        Tree tree4b = null;
        if (op.doDep) {
          comboDE.evaluate((bothPassed ? tree4 : tree3), binaryTree, pw);
          tree4b = tree4;
          tree4 = debinarizer.transformTree(tree4);
          if (op.nodePrune) {
            NodePruner np = new NodePruner(parser, debinarizer);
            tree4 = np.prune(tree4);
          }
          // tree4.pennPrint();
          comboPE.evaluate(tc.transformTree(tree4), tc.transformTree(tree), pw);
        }
        // pcfgTE.evaluate(tree2, tree);
        pcfgTE.evaluate(tcEvalb.transformTree(tree2), tcEvalb.transformTree(tree), pw);
        pcfgTEnoPunct.evaluate(tc.transformTree(tree2), tc.transformTree(tree), pw);

        if (op.doDep) {
          comboTE.evaluate(tcEvalb.transformTree(tree4), tcEvalb.transformTree(tree), pw);
          comboTEnoPunct.evaluate(tc.transformTree(tree4), tc.transformTree(tree), pw);
        }
        System.out.println("PCFG only: " + parser.scoreBinarizedTree(tree2b, 0));

        // tc.transformTree(tree2).pennPrint();
        tree2.pennPrint(pw);

        if (op.doDep) {
          System.out.println("Combo: " + parser.scoreBinarizedTree(tree4b, 0));
          // tc.transformTree(tree4).pennPrint(pw);
          tree4.pennPrint(pw);
        }
        System.out.println("Correct:" + parser.scoreBinarizedTree(binaryTree, 0));
        /*
        if (parser.scoreBinarizedTree(tree2b,true) < parser.scoreBinarizedTree(binaryTree,true)) {
          System.out.println("SCORE INVERSION");
          parser.validateBinarizedTree(binaryTree,0);
        }
        */
        tree.pennPrint(pw);
      } // end if doPCFG

      if (op.testOptions.evalb) {
        if (op.doPCFG && op.doDep) {
          EvalbFormatWriter.writeEVALBline(
              tcEvalb.transformTree(tree), tcEvalb.transformTree(tree4));
        } else if (op.doPCFG) {
          EvalbFormatWriter.writeEVALBline(
              tcEvalb.transformTree(tree), tcEvalb.transformTree(tree2));
        } else if (op.doDep) {
          EvalbFormatWriter.writeEVALBline(
              tcEvalb.transformTree(tree), tcEvalb.transformTree(tree3db));
        }
      }
    } // end for each tree in test treebank

    if (op.testOptions.evalb) {
      EvalbFormatWriter.closeEVALBfiles();
    }

    // op.testOptions.display();
    if (op.doPCFG) {
      pcfgPE.display(false, pw);
      System.out.println("Grammar size: " + stateIndex.size());
      pcfgCB.display(false, pw);
      if (op.doDep) {
        comboPE.display(false, pw);
      }
      pcfgTE.display(false, pw);
      pcfgTEnoPunct.display(false, pw);
      if (op.doDep) {
        comboTE.display(false, pw);
        comboTEnoPunct.display(false, pw);
      }
    }
    if (op.doDep) {
      depTE.display(false, pw);
      depDE.display(false, pw);
    }
    if (op.doPCFG && op.doDep) {
      comboDE.display(false, pw);
    }
    // pcfgPE.printGoodBad();
  }
Exemple #18
0
  public ArrayList<String> getKeyWordsDependency(String sentence, String keyword) {
    LexicalizedParser lp =
        LexicalizedParser.loadModel(
            "/home/mingrui/Desktop/englishPCFG.ser.gz",
            "-maxLength",
            "80",
            "-retainTmpSubcategories");
    TreebankLanguagePack tlp = new PennTreebankLanguagePack();
    // Uncomment the following line to obtain original Stanford Dependencies
    // tlp.setGenerateOriginalDependencies(true);
    GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
    String[] array = sentence.split("\\s+");
    Tree parse = lp.apply(Sentence.toWordList(array));
    GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
    Collection<TypedDependency> tdl = gs.typedDependenciesCCprocessed();
    ArrayList<String> keywordsDependency = new ArrayList<String>();
    ArrayList<String> keywordsDependencyWithLemmatization = new ArrayList<String>();
    // String lemmatizedKeyword = lemmatize(keyword);
    for (TypedDependency t : tdl) {
      String d = t.toString();
      String dependencyType = d.substring(0, d.indexOf("("));
      String pair = d.substring(d.indexOf("(") + 1, d.indexOf("("));
      String[] terms = pair.split(",");
      String term1 = terms[0].trim();
      String term2 = terms[1].trim();

      // Match keywords with the terms in the tuples, if matched, add the
      // tuple into the arraylist
      String[] wordsplitted = keyword.split(" ");
      for (String key : wordsplitted) {
        if (term1.equals(key)) {
          keywordsDependency.add(t.toString());
        }
        if (term2.equals(key)) {
          keywordsDependency.add(t.toString());
        }
      }
    }

    String lemmatizedKeywords = lemmatize(keyword);
    int lbefore = keyword.split(" ").length;
    int lafter = lemmatizedKeywords.split(" ").length;
    if (lbefore == lafter) {
      return keywordsDependency;
    } else {
      String[] split = keyword.split(" ");
      for (String s : split) {
        String[] lemmas = lemmatize(s).split(" ");
        boolean sameLength = lemmas.length == s.split(" ").length;
        if (sameLength) { // Compare the length of one key_word or key_phrase before and after
                          // lemmatization
          continue;
        } else {
          for (String tuple : keywordsDependency) {
            if (getTupleTerms(tuple)[0].equals(
                s)) { // Find the tuple that contains the original keyword/key_phrase
              String dependent = getTupleTerms(tuple)[1];
              // String[]
            }
          }
          // for(String l : lemma)
        }
      }
      return keywordsDependencyWithLemmatization;
    }
  }