Example #1
  public ParseEssay() {
    // Point JAWS at the WordNet dictionary directory before opening the database
    System.setProperty("wordnet.database.dir", "../war/dict");
    synonyms = new ArrayList<String>();
    database = WordNetDatabase.getFileInstance();
    baos = new ByteArrayOutputStream();
    // Load the serialized English PCFG model that ships with the Stanford Parser
    lp = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
  }
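Only the constructor of ParseEssay is shown here; the synonyms, database, baos, and lp fields are declared elsewhere in the class. As a rough, hedged sketch (class name, dictionary path, and query word are placeholders), this is how the JAWS WordNetDatabase opened above is typically queried for synonyms:

import edu.smu.tspell.wordnet.Synset;
import edu.smu.tspell.wordnet.WordNetDatabase;
import java.util.ArrayList;
import java.util.List;

class SynonymLookupSketch {
  public static void main(String[] args) {
    // JAWS reads the WordNet dictionary location from this system property.
    System.setProperty("wordnet.database.dir", "../war/dict");
    WordNetDatabase database = WordNetDatabase.getFileInstance();

    // Collect every word form from the synsets WordNet knows for the query term.
    List<String> synonyms = new ArrayList<String>();
    for (Synset synset : database.getSynsets("essay")) {
      for (String form : synset.getWordForms()) {
        if (!form.equalsIgnoreCase("essay")) {
          synonyms.add(form);
        }
      }
    }
    System.out.println(synonyms);
  }
}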
Example #2
public class Parser {

  private String grammar = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
  private String[] options = {"-maxLength", "80", "-retainTmpSubcategories"};
  private LexicalizedParser lp = LexicalizedParser.loadModel(grammar, options);
  private TreebankLanguagePack tlp = lp.getOp().langpack();
  private GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();

  public Parser() {}

  public LinkedList<String> getKeyWordsFromSentence(String string) {
    LinkedList<String> list = new LinkedList<String>();

    String[] sent = string.split(" ");
    List<HasWord> sentence = new ArrayList<HasWord>();
    for (String word : sent) sentence.add(new Word(word));

    Tree parse = lp.parse(sentence);
    GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);

    List<TypedDependency> tdl = gs.typedDependenciesCCprocessed();

    String[] current;
    String type, key;
    List<CoreLabel> labelsList = parse.taggedLabeledYield();
    for (Label l : labelsList) {
      current = l.toString().split("-");
      type = current[0];
      if (type.equals("NN") || type.equals("NNS")) {
        key = sent[Integer.parseInt(current[1])];
        list.add(key);
      }
    }
    return list;
  }

  public LinkedList<String> getKeyWordsFromSentenceTest(String string) {

    LinkedList<String> list = new LinkedList<String>();

    String[] sent = string.split(" ");
    List<HasWord> sentence = new ArrayList<HasWord>();
    for (String word : sent) {
      sentence.add(new Word(word));
    }

    Tree parse = lp.parse(sentence);
    parse.pennPrint();
    GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);

    List<TypedDependency> tdl = gs.typedDependenciesCCprocessed();
    System.out.println(tdl);

    System.out.println();

    System.out.println("The words of the sentence:");
    for (Label lab : parse.yield()) {
      if (lab instanceof CoreLabel) {
        System.out.println(((CoreLabel) lab).toString(CoreLabel.OutputFormat.VALUE_MAP));
      } else {
        System.out.println(lab);
      }
    }
    System.out.println();
    System.out.println("tagged");
    System.out.println(parse.taggedYield());

    List<CoreLabel> temp = parse.taggedLabeledYield();
    for (Label l : temp) {
      String[] sss = l.toString().split("-");
      String type = sss[0];
      System.out.println(sss[0] + "  " + sss[1] + "    " + sent[Integer.parseInt(sss[1])]);
    }

    for (Iterator<String> ite = list.iterator(); ite.hasNext(); ) System.out.println(ite.next());
    return list;
  }

  public static void main(String[] args) throws IOException {
    Parser parser = new Parser();
    parser.getKeyWordsFromSentence(
        "When athletes begin to exercise, their heart rates and respiration rates increase.  At what level of organization does the human body coordinate these functions?");
    parser.getKeyWordsFromSentenceTest(
        "When athletes begin to exercise, their heart rates and respiration rates increase.  At what level of organization does the human body coordinate these functions?");
    // main2();

  }
}
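getKeyWordsFromSentence recovers each noun's position by splitting the label string on "-", which assumes that neither the tag nor the index ever contains a hyphen. A hedged alternative sketch (class and method names are made up) that reads tag and token directly from the parse's tagged yield:

import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.trees.Tree;
import java.util.LinkedList;
import java.util.List;

class NounYieldSketch {
  // Collect the NN/NNS tokens of a parse without string-splitting labels.
  static List<String> nouns(Tree parse) {
    List<String> result = new LinkedList<String>();
    for (TaggedWord tw : parse.taggedYield()) {
      if (tw.tag().equals("NN") || tw.tag().equals("NNS")) {
        result.add(tw.value());
      }
    }
    return result;
  }
}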
Example #3
  /**
   * Parses each sentence of a document and generates a .trees file.
   *
   * @param en path to the input file with the English sentences, one per line
   * @param align path to the alignment file, or a name starting with "no_align" to skip alignments
   * @param out path of the .trees output file
   * @param verbose if true, also print each generated tree to stderr
   */
  public static void parse(String en, String align, String out, boolean verbose) {

    // use alignments?
    boolean use_alignments = true;
    if (align.startsWith("no_align")) {
      use_alignments = false;
      System.err.println("Not using alignments.");
    } else {
      System.err.println("Using alignments from " + align);
    }

    // setup stanfordparser
    String grammar = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
    String[] options = {"-outputFormat", "wordsAndTags, typedDependencies"};
    LexicalizedParser lp = LexicalizedParser.loadModel(grammar, options);
    TreebankLanguagePack tlp = lp.getOp().langpack();
    java.util.function.Predicate<java.lang.String> punctuationFilter = x -> true;

    GrammaticalStructureFactory gsf =
        new edu.stanford.nlp.trees.EnglishGrammaticalStructureFactory(punctuationFilter);

    // read document
    Iterable<List<? extends HasWord>> sentences;
    Reader r = new Reader(en);
    String line = null;
    List<List<? extends HasWord>> tmp = new ArrayList<List<? extends HasWord>>();
    while ((line = r.getNext()) != null) {
      Tokenizer<? extends HasWord> token =
          tlp.getTokenizerFactory().getTokenizer(new StringReader(line));
      List<? extends HasWord> sentence = token.tokenize();
      tmp.add(sentence);
    }
    sentences = tmp;

    // set up alignment file reader
    Reader alignment = new Reader();
    if (use_alignments) {
      alignment = new Reader(align);
    }

    // set up tree file writer
    Writer treeWriter = new Writer(out);

    // parse
    long start = System.currentTimeMillis();
    // System.err.print("Parsing sentences ");
    int sentID = 0;
    for (List<? extends HasWord> sentence : sentences) {
      Tree t = new Tree();
      // t.setSentID(++sentID);
      System.err.println("parse Sentence :" + sentence + "...");
      // System.err.print(".");
      System.err.println("-----------------------------------------------------------------------");
      edu.stanford.nlp.trees.Tree parse = lp.parse(sentence);
      // parse.pennPrint();

      // List for root node and lexical nodes
      List<Node> loneNodes = new LinkedList<Node>();
      List<Node> governingNodes = new LinkedList<Node>();

      // ROOT node
      Node root = new Node(true, true);
      root.setTag("ROOT");
      t.setRoot(root);
      loneNodes.add(root);
      governingNodes.add(root);

      // tagging

      int counter = 0;
      String surface = "";
      String tag = "";

      for (TaggedWord tw : parse.taggedYield()) {
        Node n = new Node();
        Node governingNode = new Node();
        n.setNodeID(++counter);
        surface = tw.value();
        tag = tw.tag();
        if (surface.startsWith("-LRB-")) {
          surface = "(";
        } else if (surface.startsWith("-RRB-")) {
          surface = ")";
          // } else if (surface.startsWith("-LSB-")){
          //    surface = "[";
          // } else if (surface.startsWith("-RSB-")){
          //    surface = "]";
          // } else if (surface.startsWith("-LCB-")){
          //    surface = "{";
          // } else if (surface.startsWith("-RCB-")){
          //    surface = "}";
        } else if (surface.startsWith("''")) {
          surface = "\"";
        }
        tag = tag.replaceAll("#", "-NUM-");
        surface = surface.replaceAll("&", "-AMP-");
        surface = surface.replaceAll("#", "-NUM-");
        surface = surface.replaceAll(">", "-GRE-");
        surface = surface.replaceAll("=", "-EQU-");
        n.setInitialLexicalIndex(counter);
        governingNode.setInitialLexicalIndex(counter);
        n.setSurface(surface);
        // System.out.print("("+tw.value()+" : ");
        n.setTag(tag);
        governingNode.setTag("_" + tag);
        governingNode.setLabel("_gov");
        // System.out.print(tw.tag()+")");
        loneNodes.add(n);
        governingNodes.add(governingNode);
        governingNode.setChild(n);
      }

      // System.out.println("");

      // t.setSentLength(t.getNodes().size() - 1);
      // List<Node> loneNodes = new LinkedList<Node>();
      Node[] nodes = new Node[2000];
      // labeling
      int depIndex;
      int govIndex;
      String[] depInfo;
      String[] govInfo;
      GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
      List<TypedDependency> tdl = gs.typedDependencies(false);
      // List<TypedDependency> tdl = gs.typedDependenciesCCprocessed();
      for (TypedDependency td : tdl) {
        depIndex = td.dep().index();
        govIndex = td.gov().index();
        // System.out.println("Index1:"+depIndex);
        // System.out.println("Index2:"+govIndex);
        // if (nodes[depIndex] == null){
        //	System.out.println("Making node!");
        //	nodes[depIndex] = new Node();
        // }
        // if (nodes[govIndex] == null){
        //	System.out.println("Making node!");
        //	nodes[govIndex] = new Node();
        // }
        Node dep = loneNodes.get((depIndex));
        Node gov = governingNodes.get((govIndex));
        Node depcopy = governingNodes.get((depIndex));
        Node govcopy = loneNodes.get((govIndex));
        dep.setLabel(td.reln().toString());
        depcopy.setLabel(td.reln().toString());
        govcopy.setLabel("head");
        // System.out.println(td.toString());
        govInfo = td.gov().toString().split("/");
        depInfo = td.dep().toString().split("/");
        // System.out.println(td.gov().toString());
        // System.out.println(td.dep().toString());
        // dep.setSurface(depInfo[0]);
        // dep.setTag(depInfo[1]);
        gov.setChild(governingNodes.get(depIndex));
        governingNodes.get(depIndex).setParent(gov);
        // gov.setChild(dep);
        dep.setParent(governingNodes.get(depIndex));
      }
      // t.setRoot(nodes[0]);

      // Collapse tree to remove unneeded governing nodes:

      Node gov;
      Node dep;
      Node parent;
      List<Node> children;

      for (int i = 1; i < governingNodes.size(); i++) { // start with index 1 to skip root
        gov = governingNodes.get(i);
        dep = loneNodes.get(i);
        if (gov.getChildren().size() <= 1) {
          int k = 0;
          parent = gov.getParent();
          children = parent.getChildren();

          for (Node n : children) {
            if (n == gov) {
              gov.getParent().replaceChild(k, dep);
              dep.setParent(gov.getParent());
            }
            k++;
          }
        }
      }
      // Mark head nodes with appropriate label:
      int k = 0;
      for (Node n : loneNodes) {
        if (k != 0) {
          // compare label text, not object identity
          if (n.getLabel() != null && n.getLabel().equals(n.getParent().getLabel())) {
            n.setLabel("head");
          }
        } else {
          n.setLabel("null");
        }
        k++;
      }
      // Sort lexical children of each governing node in lexical order

      for (Node n : governingNodes) {
        n.sortChildrenByInitialIndex();
      }

      // combine with alignment
      if (use_alignments) {
        t.initialize(alignment.readNextAlign());
      } else {
        t.initializeUnaligned();
      }

      // write tree to file
      treeWriter.write(t);

      // print tree to console

      System.out.println(t.toSentence());
      if (verbose) {
        System.err.println(t.toString());
        // t.recursivePrint();
      }
      System.err.println("#######################################################################");
    }
    long stop = System.currentTimeMillis();
    System.err.println("...done! [" + (stop - start) / 1000 + " sec].");

    treeWriter.close();
  }
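The -outputFormat flags passed to loadModel in this example are never used again, since the output goes through the custom Node/Tree/Writer classes. A small standalone sketch of printing those two formats directly with TreePrint (class name and sentence are placeholders; it assumes the default language pack TreePrint builds is sufficient):

import edu.stanford.nlp.ling.Sentence;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreePrint;

class WordsAndDepsPrintSketch {
  public static void main(String[] args) {
    LexicalizedParser lp =
        LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
    Tree parse = lp.apply(Sentence.toWordList("The cat sat on the mat .".split(" ")));
    // Print the tagged words followed by the typed dependencies.
    TreePrint tp = new TreePrint("wordsAndTags,typedDependencies");
    tp.printTree(parse);
  }
}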
Example #4
class StanfordParser {
  private final String PCG_MODEL = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
  private final TokenizerFactory<CoreLabel> tokenizerFactory =
      PTBTokenizer.factory(new CoreLabelTokenFactory(), "invertible=true");
  private final LexicalizedParser parser = LexicalizedParser.loadModel(PCG_MODEL);
  private final String serializedClassifier =
      "edu/stanford/nlp/models/ner/english.muc.7class.distsim.crf" + ".ser.gz";
  private final AbstractSequenceClassifier<CoreLabel> classifier =
      CRFClassifier.getClassifierNoExceptions(serializedClassifier);

  public ParsedSentence parseSentence(String sentence, boolean removePunctuation) {
    if (removePunctuation) {
      sentence = cleanSentence(sentence);
    }

    final Tree posTree = getPosTree(sentence);
    return new ParsedSentence(posTree, getDependencies(posTree), findNamedEntities(sentence));
  }

  public Tense calculateTense(String clause) {
    final Tree posTree = getPosTree(clause);
    final Tree word = posTree.getLeaves().get(0);
    final String pos = word.parent(posTree).label().value().toLowerCase();
    if (pos.equals("md")) {
      return Tense.FUTURE;
    }
    if (pos.equals("vbd") || pos.equals("vbn")) {
      return Tense.PAST;
    }
    return Tense.PRESENT;
  }

  public Map<String, NamedEntity> findNamedEntities(String sentence) {
    final Map<String, NamedEntity> namedEntities = new HashMap<>();
    final List<Triple<String, Integer, Integer>> nerSubstrings = findNerSubstrings(sentence);
    for (final Triple<String, Integer, Integer> substring : nerSubstrings) {
      namedEntities.put(
          sentence.substring(substring.second(), substring.third()),
          NamedEntity.getNamedEntity(substring.first()));
    }
    return namedEntities;
  }

  private List<Triple<String, Integer, Integer>> findNerSubstrings(String sentence) {
    return classifier.classifyToCharacterOffsets(sentence);
  }

  private String cleanSentence(String sentence) {
    return sentence.replaceAll("\\p{Punct}", "").replaceAll("[ ]+", " ");
  }

  private Tree getPosTree(String sentence) {
    final Tokenizer<CoreLabel> tokenizer =
        tokenizerFactory.getTokenizer(new StringReader(sentence));
    final List<CoreLabel> tokens = tokenizer.tokenize();
    return parser.apply(tokens);
  }

  private Collection<TypedDependency> getDependencies(Tree sentenceParseTree) {
    final TreebankLanguagePack tlp = new PennTreebankLanguagePack();
    final GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
    final GrammaticalStructure gs = gsf.newGrammaticalStructure(sentenceParseTree);
    return gs.typedDependenciesCollapsed();
  }
}
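findNamedEntities above wraps the CRF classifier's character-offset output in the project's own NamedEntity type. A stripped-down sketch of the same call using only Stanford classes (class name and sentence are placeholders):

import edu.stanford.nlp.ie.AbstractSequenceClassifier;
import edu.stanford.nlp.ie.crf.CRFClassifier;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.util.Triple;
import java.util.List;

class NerOffsetsSketch {
  public static void main(String[] args) {
    AbstractSequenceClassifier<CoreLabel> classifier =
        CRFClassifier.getClassifierNoExceptions(
            "edu/stanford/nlp/models/ner/english.muc.7class.distsim.crf.ser.gz");
    String sentence = "Stanford University is located in California.";
    // Each Triple is (label, beginOffset, endOffset) into the original string.
    List<Triple<String, Integer, Integer>> spans = classifier.classifyToCharacterOffsets(sentence);
    for (Triple<String, Integer, Integer> span : spans) {
      System.out.println(span.first() + ": " + sentence.substring(span.second(), span.third()));
    }
  }
}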
Example #5
  /**
   * For testing -- CURRENTLY BROKEN!!!
   *
   * @param args treebankPath trainNums testNums (see the argument check below)
   * @throws IOException
   */
  public static void main(String[] args) throws IOException {
    if (args.length != 3) {
      throw new RuntimeException("args: treebankPath trainNums testNums");
    }

    ChineseTreebankParserParams ctpp = new ChineseTreebankParserParams();
    ctpp.charTags = true;
    // TODO: these options are getting clobbered by reading in the
    // parser object (unless it's a text file parser?)
    Options op = new Options(ctpp);
    op.doDep = false;
    op.testOptions.maxLength = 90;

    LexicalizedParser lp;
    try {
      FileFilter trainFilt = new NumberRangesFileFilter(args[1], false);

      lp = LexicalizedParser.trainFromTreebank(args[0], trainFilt, op);
      try {
        String filename = "chineseCharTagPCFG.ser.gz";
        System.err.println("Writing parser in serialized format to file " + filename + ' ');
        System.err.flush();
        ObjectOutputStream out = IOUtils.writeStreamFromString(filename);

        out.writeObject(lp);
        out.close();
        System.err.println("done.");
      } catch (IOException ioe) {
        ioe.printStackTrace();
      }
    } catch (IllegalArgumentException e) {
      lp = LexicalizedParser.loadModel(args[1], op);
    }

    FileFilter testFilt = new NumberRangesFileFilter(args[2], false);
    MemoryTreebank testTreebank = ctpp.memoryTreebank();
    testTreebank.loadPath(new File(args[0]), testFilt);
    PrintWriter pw =
        new PrintWriter(new OutputStreamWriter(new FileOutputStream("out.chi"), "GB18030"), true);
    WordCatEquivalenceClasser eqclass = new WordCatEquivalenceClasser();
    WordCatEqualityChecker eqcheck = new WordCatEqualityChecker();
    EquivalenceClassEval eval = new EquivalenceClassEval(eqclass, eqcheck);
    //    System.out.println("Preterminals:" + preterminals);
    System.out.println("Testing...");
    for (Tree gold : testTreebank) {
      Tree tree;
      try {
        tree = lp.parseTree(gold.yieldHasWord());
        if (tree == null) {
          System.out.println("Failed to parse " + gold.yieldHasWord());
          continue;
        }
      } catch (Exception e) {
        e.printStackTrace();
        continue;
      }
      gold = gold.firstChild();
      pw.println(Sentence.listToString(gold.preTerminalYield()));
      pw.println(Sentence.listToString(gold.yield()));
      gold.pennPrint(pw);

      pw.println(tree.preTerminalYield());
      pw.println(tree.yield());
      tree.pennPrint(pw);
      //      Collection allBrackets = WordCatConstituent.allBrackets(tree);
      //      Collection goldBrackets = WordCatConstituent.allBrackets(gold);
      //      eval.eval(allBrackets, goldBrackets);
      eval.displayLast();
    }
    System.out.println();
    System.out.println();
    eval.display();
  }
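The manual ObjectOutputStream block in the middle of this driver can also be written with the parser's own convenience methods. A brief, hedged sketch of that save/load round trip (class name and file name are placeholders):

import edu.stanford.nlp.parser.lexparser.LexicalizedParser;

class SaveLoadSketch {
  public static void main(String[] args) {
    LexicalizedParser lp =
        LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
    // saveParserToSerialized writes the same gzip-serialized form that loadModel reads back.
    lp.saveParserToSerialized("myParser.ser.gz");
    LexicalizedParser reloaded = LexicalizedParser.loadModel("myParser.ser.gz");
    System.out.println("Reloaded: " + reloaded.getOp().langpack().getClass().getSimpleName());
  }
}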
Example #6
  public ParseResult parseSentence(String sentence) {
    String result = "";

    // see if a parser socket server is available
    int port = Integer.parseInt(ARKref.getProperties().getProperty("parserServerPort", "5556"));
    String host = "127.0.0.1";
    Socket client;
    PrintWriter pw;
    BufferedReader br;
    String line;
    try {
      client = new Socket(host, port);

      pw = new PrintWriter(client.getOutputStream());
      br = new BufferedReader(new InputStreamReader(client.getInputStream()));
      pw.println(sentence);
      pw.flush(); // flush to complete the transmission
      while ((line = br.readLine()) != null) {
        // if(!line.matches(".*\\S.*")){
        //        System.out.println();
        // }
        if (br.ready()) {
          line = line.replaceAll("\n", "");
          line = line.replaceAll("\\s+", " ");
          result += line + " ";
        } else {
          lastParseScore = Double.parseDouble(line);
        }
      }

      br.close();
      pw.close();
      client.close();

      System.err.println("parser output:" + result);

      lastParse = readTreeFromString(result);
      boolean success = !Strings.normalizeWhitespace(result).equals("(ROOT (. .))");
      return new ParseResult(success, lastParse, lastParseScore);
    } catch (Exception ex) {

      // ex.printStackTrace();
    }

    // if socket server not available, then use a local parser object
    if (parser == null) {
      if (DEBUG) System.err.println("Could not connect to parser server.  Loading parser...");
      try {
        Options op = new Options();
        String serializedInputFileOrUrl =
            ClassLoader.getSystemResource(
                    ARKref.getProperties()
                        .getProperty("parserGrammarFile", "lib/englishPCFG.ser.gz"))
                .toExternalForm();
        parser = LexicalizedParser.loadModel(serializedInputFileOrUrl, op);
        //				int maxLength = new Integer(ARKref.getProperties().getProperty("parserMaxLength",
        // "40")).intValue();
        //				parser.setMaxLength(maxLength);
        parser.setOptionFlags("-outputFormat", "oneline");
      } catch (Exception e) {
        e.printStackTrace();
      }
    }

    try {
      DocumentPreprocessor dp = new DocumentPreprocessor(new StringReader(sentence));

      LexicalizedParserQuery query = parser.parserQuery();

      if (query.parse(dp.iterator().next())) {
        lastParse = query.getBestParse();
        lastParseScore = query.getPCFGScore();
        TreePrint tp = new TreePrint("penn", "", new PennTreebankLanguagePack());
        StringWriter sb = new StringWriter();
        pw = new PrintWriter(sb);
        tp.printTree(lastParse, pw);
        pw.flush();
        lastParse = readTreeFromString(sb.getBuffer().toString());

        return new ParseResult(true, lastParse, lastParseScore);
      }
    } catch (Exception e) {
      // fall through and return the failure result below
    }

    lastParse = readTreeFromString("(ROOT (. .))");
    lastParseScore = -99999.0;
    return new ParseResult(false, lastParse, lastParseScore);
  }
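When no parser socket server is reachable, the method falls back to a local LexicalizedParser. A self-contained sketch of just that fallback path, without the ARKref configuration or the tree round trip through TreePrint (class name, model path, and sentence are placeholders):

import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.parser.lexparser.LexicalizedParserQuery;
import edu.stanford.nlp.process.DocumentPreprocessor;
import edu.stanford.nlp.trees.Tree;
import java.io.StringReader;
import java.util.List;

class LocalParseSketch {
  public static void main(String[] args) {
    LexicalizedParser parser =
        LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
    // DocumentPreprocessor splits the input into sentences; take the first one.
    DocumentPreprocessor dp = new DocumentPreprocessor(new StringReader("The cat sat on the mat."));
    List<HasWord> words = dp.iterator().next();

    LexicalizedParserQuery query = parser.parserQuery();
    if (query.parse(words)) {
      Tree best = query.getBestParse();
      best.pennPrint();
      System.out.println("PCFG score: " + query.getPCFGScore());
    }
  }
}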
Example #7
  public LexicalParsingEngine(String parserModel)
      throws FileNotFoundException, UnsupportedEncodingException {

    System.out.println("Initializing Lexical Parser...");
    lp = LexicalizedParser.loadModel(parserModel);
  }
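This fragment only loads the model; loadModel also accepts parser flags as trailing arguments, as Examples #2 and #9 do. A sketch of the same kind of constructor with a sentence-length cap (class name and flag values are illustrative):

import edu.stanford.nlp.parser.lexparser.LexicalizedParser;

class LexicalParsingEngineSketch {
  private final LexicalizedParser lp;

  public LexicalParsingEngineSketch(String parserModel) {
    System.out.println("Initializing Lexical Parser...");
    // Trailing strings are interpreted as parser option flags.
    lp = LexicalizedParser.loadModel(parserModel, "-maxLength", "80", "-retainTmpSubcategories");
  }
}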
Example #8
@SuppressWarnings("serial")
public class TextSimplification {

  public static List<String> replacementList =
      new ArrayList<String>() {
        {
          add("he");
          add("him");
          add("his");
          add("she");
          add("her");
          add("they");
          add("them");
          add("their");
          add("i");
          add("her's");
          add("you");
          add("your");
          add("your's");
          add("mine");
          add("my");
          add("us");
          add("we");
          //		add("it");
          //		add("its");
          //		add("this");
          //		add("that");
        }
      };

  public static String resolvedSentences = "";

  private static final String PCG_MODEL = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";

  private static final TokenizerFactory<CoreLabel> tokenizerFactory =
      PTBTokenizer.factory(new CoreLabelTokenFactory(), "invertible=true");

  private static final LexicalizedParser parser = LexicalizedParser.loadModel(PCG_MODEL);

  public static void main(String[] args) throws IOException {

    // TODO:
    // * Do not consider roots with more than 2 words.
    // * The root should not be he, she, her, his, him, etc.
    // * If it is, then take the last known gender noun and make it the root.

    String text = new String(Files.readAllBytes(Paths.get(args[0])), StandardCharsets.UTF_8);
    text = text.replace("\n", " ");

    // Resolve Anaphora
    System.out.println("Anaphora Resolution...");
    resolveAnaphora(text);
    System.out.println(
        "Anaphora Resolution Completed!\nIntermediate Output in \"AnaphoraResolved.txt\"");
    writeToFile(resolvedSentences, "AnaphoraResolved.txt");

    // Create ParseTrees
    System.out.println("Parse Tree Generation...");
    startParsing(resolvedSentences);
    System.out.println("Parse Tree Generation Completed!\nIntermediate Output in \"Tree.txt\"");
  }

  public static void resolveAnaphora(String text) {

    RedwoodConfiguration.empty().capture(System.err).apply();

    Annotation document = new Annotation(text);
    Properties props = new Properties();
    props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
    props.put("dcoref.female", "female.unigram.txt");
    props.put("dcoref.male", "male.unigram.txt");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    pipeline.annotate(document);

    RedwoodConfiguration.current().clear().apply();

    Map<Integer, CorefChain> graph = document.get(CorefChainAnnotation.class);
    List<CoreMap> stnfrdSentences = document.get(SentencesAnnotation.class);

    ImmutableMultimap.Builder<Integer, Pair<CorefChain, CorefMention>> records =
        ImmutableMultimap.builder();
    ImmutableMultimap.Builder<Integer, Pair<CorefChain, CorefMention>> recordsOrdered =
        ImmutableMultimap.builder();

    graph.forEach(
        (key, value) -> {
          value
              .getMentionMap()
              .forEach(
                  (intPair, corefSet) -> {
                    corefSet.forEach(
                        mention -> records.put(mention.sentNum, Pair.of(value, mention)));
                  });
        });

    recordsOrdered =
        records.orderKeysBy(
            new Comparator<Integer>() {
              @Override
              public int compare(Integer o1, Integer o2) {
                return o1 - o2;
              }
            });

    recordsOrdered
        .build()
        .asMap()
        .forEach(
            (sentNum, mentionList) -> {
              CoreMap sentence = stnfrdSentences.get(sentNum - 1);
              List<CoreLabel> stnfrdtokens = sentence.get(TokensAnnotation.class);

              mentionList.forEach(
                  pair -> {
                    CorefChain chain = pair.getLeft();
                    CorefMention mention = pair.getRight();
                    String root = chain.getRepresentativeMention().mentionSpan;

                    if (!mention.mentionSpan.equalsIgnoreCase(root)
                        && (!root.contains(mention.mentionSpan)
                            && !mention.mentionSpan.contains(root))
                        && (!replacementList.contains(root.toLowerCase()))
                        && (root.split("\\s").length < 3)
                        && (replacementList.contains(mention.mentionSpan.toLowerCase()))) {
                      if (mention.mentionSpan.equalsIgnoreCase("her")
                          || mention.mentionSpan.equalsIgnoreCase("his")) {
                        root += "'s";
                      }
                      stnfrdtokens.get(mention.startIndex - 1).setOriginalText(root);
                    }
                  });

              String sent = "";
              for (CoreLabel token : stnfrdtokens) {
                sent += token.originalText() + " ";
              }
              resolvedSentences += sent + "\n";
            });
  }

  public static Tree parse(String str) {
    List<CoreLabel> tokens = tokenize(str);
    Tree tree = parser.apply(tokens);
    return tree;
  }

  private static List<CoreLabel> tokenize(String str) {
    Tokenizer<CoreLabel> tokenizer = tokenizerFactory.getTokenizer(new StringReader(str));
    return tokenizer.tokenize();
  }

  public static void startParsing(String paragraph) throws FileNotFoundException, IOException {
    String parseTrees = "";

    // Could we simply split on newlines, since the paragraph is already sentence-split?
    Reader reader = new StringReader(paragraph);
    DocumentPreprocessor dp = new DocumentPreprocessor(reader);
    List<String> sentenceList = new ArrayList<String>();

    for (List<HasWord> sentence : dp) {
      String sentenceString = Sentence.listToString(sentence);
      sentenceList.add(sentenceString);
    }

    for (String sentence : sentenceList) {
      //			System.out.println(sentence);
      parseTrees += createParseTree(sentence);
    }
    writeToFile(parseTrees, "trees.txt");
  }

  public static void writeToFile(String content, String filename) throws IOException {
    File file = new File(filename);
    file.delete();

    FileWriter fout = new FileWriter(filename);
    fout.write(content);
    fout.close();
  }

  public static String createParseTree(String sentence) {
    Tree tree = parse(sentence);
    //		System.out.println(tree.toString());
    return (tree.toString() + "\n");
  }
}
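resolveAnaphora above walks the mention map grouped by sentence number. A more compact, hedged sketch of reading the same dcoref output, using the fact that each CorefChain exposes its representative mention and its mentions in textual order (class name and input text are placeholders):

import edu.stanford.nlp.dcoref.CorefChain;
import edu.stanford.nlp.dcoref.CorefCoreAnnotations.CorefChainAnnotation;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import java.util.Properties;

class CorefChainsSketch {
  public static void main(String[] args) {
    Properties props = new Properties();
    props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    Annotation document = new Annotation("Alice dropped her keys. She found them later.");
    pipeline.annotate(document);

    for (CorefChain chain : document.get(CorefChainAnnotation.class).values()) {
      System.out.println("Representative: " + chain.getRepresentativeMention().mentionSpan);
      chain
          .getMentionsInTextualOrder()
          .forEach(m -> System.out.println("  sentence " + m.sentNum + ": " + m.mentionSpan));
    }
  }
}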
Example #9
  public ArrayList<String> getKeyWordsDependency(String sentence, String keyword) {
    LexicalizedParser lp =
        LexicalizedParser.loadModel(
            "/home/mingrui/Desktop/englishPCFG.ser.gz",
            "-maxLength",
            "80",
            "-retainTmpSubcategories");
    TreebankLanguagePack tlp = new PennTreebankLanguagePack();
    // Uncomment the following line to obtain original Stanford Dependencies
    // tlp.setGenerateOriginalDependencies(true);
    GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
    String[] array = sentence.split("\\s+");
    Tree parse = lp.apply(Sentence.toWordList(array));
    GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
    Collection<TypedDependency> tdl = gs.typedDependenciesCCprocessed();
    ArrayList<String> keywordsDependency = new ArrayList<String>();
    ArrayList<String> keywordsDependencyWithLemmatization = new ArrayList<String>();
    // String lemmatizedKeyword = lemmatize(keyword);
    for (TypedDependency t : tdl) {
      String d = t.toString();
      String dependencyType = d.substring(0, d.indexOf("("));
      String pair = d.substring(d.indexOf("(") + 1, d.lastIndexOf(")"));
      String[] terms = pair.split(",");
      String term1 = terms[0].trim();
      String term2 = terms[1].trim();

      // Match keywords with the terms in the tuples, if matched, add the
      // tuple into the arraylist
      String[] wordsplitted = keyword.split(" ");
      for (String key : wordsplitted) {
        if (term1.equals(key)) {
          keywordsDependency.add(t.toString());
        }
        if (term2.equals(key)) {
          keywordsDependency.add(t.toString());
        }
      }
    }

    String lemmatizedKeywords = lemmatize(keyword);
    int lbefore = keyword.split(" ").length;
    int lafter = lemmatizedKeywords.split(" ").length;
    if (lbefore == lafter) {
      return keywordsDependency;
    } else {
      String[] split = keyword.split(" ");
      for (String s : split) {
        String[] lemmas = lemmatize(s).split(" ");
        boolean sameLength = lemmas.length == s.split(" ").length;
        if (sameLength) { // Compare the length of one key_word or key_phrase before and after
                          // lemmatization
          continue;
        } else {
          for (String tuple : keywordsDependency) {
            if (getTupleTerms(tuple)[0].equals(
                s)) { // Find the tuple that contains the original keyword/key_phrase
              String dependent = getTupleTerms(tuple)[1];
              // String[]
            }
          }
          // for(String l : lemma)
        }
      }
      return keywordsDependencyWithLemmatization;
    }
  }
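The keyword match in getKeyWordsDependency parses each dependency's toString() output. A hedged alternative sketch that works on the TypedDependency objects directly, assuming a CoreNLP version in which gov() and dep() return IndexedWord (class and method names are made up):

import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.trees.TypedDependency;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;

class KeywordDependencySketch {
  // Collect every dependency whose governor or dependent equals one of the keywords.
  static List<String> matches(Collection<TypedDependency> tdl, String keyword) {
    List<String> hits = new ArrayList<String>();
    String[] keys = keyword.split(" ");
    for (TypedDependency t : tdl) {
      IndexedWord gov = t.gov();
      IndexedWord dep = t.dep();
      for (String key : keys) {
        if (key.equals(gov.word()) || key.equals(dep.word())) {
          hits.add(t.toString());
          break;
        }
      }
    }
    return hits;
  }
}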