Example #1
  public static void main(String[] args) throws Exception {
    if (args.length != 2) {
      System.err.println("usage: java TaggerDemo2 modelFile fileToTag");
      return;
    }
    MaxentTagger tagger = new MaxentTagger(args[0]);
    TokenizerFactory<CoreLabel> ptbTokenizerFactory =
        PTBTokenizer.factory(new CoreLabelTokenFactory(), "untokenizable=noneKeep");
    BufferedReader r =
        new BufferedReader(new InputStreamReader(new FileInputStream(args[1]), "utf-8"));

    PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, "utf-8"));
    DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(r);
    documentPreprocessor.setTokenizerFactory(ptbTokenizerFactory);
    for (List<HasWord> sentence : documentPreprocessor) {
      List<TaggedWord> tSentence = tagger.tagSentence(sentence);
      pw.println(Sentence.listToString(tSentence, false));
    }

    // print the adjectives in one more sentence. This shows how to get at words and tags in a
    // tagged sentence.
    List<HasWord> sent =
        Sentence.toWordList(
            "The", "slimy", "slug", "crawled", "over", "the", "long", ",", "green", "grass", ".");
    List<TaggedWord> taggedSent = tagger.tagSentence(sent);
    for (TaggedWord tw : taggedSent) {
      if (tw.tag().startsWith("JJ")) {
        pw.println(tw.word());
      }
    }

    pw.close();
  }
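
For reference, a minimal in-memory variant of the same tagging flow, without the file plumbing. This is a sketch, not part of the original demo: the model path is an assumption and should point at a POS tagger model you actually have.

import edu.stanford.nlp.tagger.maxent.MaxentTagger;

public class TagStringSketch {
  public static void main(String[] args) {
    // Model path is an assumption; substitute your local tagger model file.
    MaxentTagger tagger = new MaxentTagger("english-left3words-distsim.tagger");
    // tagString tokenizes and tags in one call, returning word_TAG pairs.
    System.out.println(tagger.tagString("The slimy slug crawled over the long, green grass."));
  }
}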
Example #2

  public MUCMentionExtractor(Dictionaries dict, Properties props, Semantics semantics)
      throws Exception {
    super(dict, semantics);
    // Read the whole MUC corpus file named in the properties into memory:
    String fileName = props.getProperty(Constants.MUC_PROP);
    fileContents = IOUtils.slurpFile(fileName);
    currentOffset = 0;
    // CoreLabelTokenFactory(false) skips recording character offsets on the tokens:
    tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(false), "");
    stanfordProcessor = loadStanfordProcessor(props);
  }
Example #3

  public static ArrayList<TaggedWord> StanfordParse(String sentence, LexicalizedParser lp) {
    // Tokenize with the PTB tokenizer, parse, and read the POS-tagged
    // words off the parse tree's yield.
    TokenizerFactory<CoreLabel> tokenizerFactory =
        PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    List<CoreLabel> rawWords2 =
        tokenizerFactory.getTokenizer(new StringReader(sentence)).tokenize();
    Tree parse = lp.apply(rawWords2);
    ArrayList<TaggedWord> taggedWords = parse.taggedYield();

    return taggedWords;
  }
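
A hedged usage sketch for the method above, written as a main method in the same (unnamed) class, with the CoreNLP models jar on the classpath; the sample sentence is ours:

  public static void main(String[] args) {
    // Model path is the standard one inside the CoreNLP models jar.
    LexicalizedParser lp =
        LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
    for (TaggedWord tw : StanfordParse("The slimy slug crawled over the grass.", lp)) {
      System.out.println(tw.word() + "/" + tw.tag()); // e.g. slug/NN
    }
  }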
Example #4
  private static List<TypedDependency> getDependencies(String sentence) {

    // Lazily load the shared models; loadModels() is assumed to initialize
    // both pipeline and the static parser lp used below.
    if (pipeline == null) {
      loadModels();
    }

    TokenizerFactory<CoreLabel> tokenizerFactory =
        PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    Tokenizer<CoreLabel> tok = tokenizerFactory.getTokenizer(new StringReader(sentence));
    List<CoreLabel> rawWords2 = tok.tokenize();
    Tree parse = lp.apply(rawWords2);
    // parse.pennPrint(); // uncomment to inspect the parse tree

    TreebankLanguagePack tlp = lp.treebankLanguagePack(); // PennTreebankLanguagePack for English
    GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
    GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
    List<TypedDependency> tdl = gs.typedDependenciesCCprocessed();

    return tdl;
  }
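
A self-contained sketch of the same dependency extraction, with the lazy model loading replaced by a direct loadModel call so it can run standalone; the model path is the standard one in the CoreNLP models jar, and the sentence is ours:

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.TokenizerFactory;
import edu.stanford.nlp.trees.GrammaticalStructureFactory;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TypedDependency;
import java.io.StringReader;

public class DependencySketch {
  public static void main(String[] args) {
    LexicalizedParser lp =
        LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
    TokenizerFactory<CoreLabel> tokenizerFactory =
        PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    Tree parse = lp.apply(
        tokenizerFactory.getTokenizer(new StringReader("The dog chased the cat.")).tokenize());
    GrammaticalStructureFactory gsf = lp.treebankLanguagePack().grammaticalStructureFactory();
    for (TypedDependency td : gsf.newGrammaticalStructure(parse).typedDependenciesCCprocessed()) {
      // Each entry pairs a relation with its governor and dependent, e.g. nsubj(chased, dog).
      System.out.println(td);
    }
  }
}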
Example #5

  public static void main(String[] args) { // start of the main method
    System.out.println("\n\n\nSTART\n\n\n"); // print START
    try { // handle potential errors
      // Open the file whose path is passed
      // as the first argument of the main method:
      BufferedReader br =
          new BufferedReader(new InputStreamReader(new FileInputStream(args[0])));

      // Prepare Parser, Tokenizer and Tree printer:
      LexicalizedParser lp = LexicalizedParser.loadModel("englishPCFG.ser.gz");
      TokenizerFactory<Word> tf = PTBTokenizer.factory(new WordTokenFactory(), "");
      TreePrint tp = new TreePrint("penn,typedDependenciesCollapsed");

      String sentence;
      // For each line of the file,
      // retrieve it as a string called 'sentence':
      while ((sentence = br.readLine()) != null) {
        // print sentence:
        System.out.println("\n\n\n\nORIGINAL:\n\n" + sentence);
        // put tokens in a list and parse them; parse returns the best parse tree:
        List<Word> tokens = tf.getTokenizer(new StringReader(sentence)).tokenize();
        Tree t = lp.parse(tokens);
        System.out.println("\nPROCESSED:\n\n");
        tp.printTree(t); // print tree
      }
      br.close(); // close input file
    } catch (Exception e) { // catch error if any
      System.err.println("ERROR: " + e.getMessage()); // print error message
    }
    System.out.println("\n\n\nTHE END\n\n\n"); // print THE END
  } // end of the main method
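
The same pipeline reduced to a single hard-coded sentence, for quick experiments; a sketch only, with the model path being the standard one inside the CoreNLP models jar:

import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.WordTokenFactory;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreePrint;
import java.io.StringReader;
import java.util.List;

public class ParseOneSentence {
  public static void main(String[] args) {
    LexicalizedParser lp =
        LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
    List<Word> tokens = PTBTokenizer.factory(new WordTokenFactory(), "")
        .getTokenizer(new StringReader("The cat sat on the mat.")).tokenize();
    Tree tree = lp.parse(tokens);
    // Prints the Penn tree followed by the collapsed typed dependencies.
    new TreePrint("penn,typedDependenciesCollapsed").printTree(tree);
  }
}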
Example #6
class StanfordParser {
  private final String PCG_MODEL = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
  private final TokenizerFactory<CoreLabel> tokenizerFactory =
      PTBTokenizer.factory(new CoreLabelTokenFactory(), "invertible=true");
  private final LexicalizedParser parser = LexicalizedParser.loadModel(PCG_MODEL);
  private final String serializedClassifier =
      "edu/stanford/nlp/models/ner/english.muc.7class.distsim.crf.ser.gz";
  private final AbstractSequenceClassifier<CoreLabel> classifier =
      CRFClassifier.getClassifierNoExceptions(serializedClassifier);

  public ParsedSentence parseSentence(String sentence, boolean removePunctuation) {
    if (removePunctuation) {
      sentence = cleanSentence(sentence);
    }

    final Tree posTree = getPosTree(sentence);
    return new ParsedSentence(posTree, getDependencies(posTree), findNamedEntities(sentence));
  }

  // Heuristic: classify tense from the POS tag of the clause's first word only.
  public Tense calculateTense(String clause) {
    final Tree posTree = getPosTree(clause);
    final Tree word = posTree.getLeaves().get(0);
    final String pos = word.parent(posTree).label().value().toLowerCase();
    if (pos.equals("md")) {
      return Tense.FUTURE;
    }
    if (pos.equals("vbd") || pos.equals("vbn")) {
      return Tense.PAST;
    }
    return Tense.PRESENT;
  }

  public Map<String, NamedEntity> findNamedEntities(String sentence) {
    final Map<String, NamedEntity> namedEntities = new HashMap<>();
    final List<Triple<String, Integer, Integer>> nerSubstrings = findNerSubstrings(sentence);
    for (final Triple<String, Integer, Integer> substring : nerSubstrings) {
      namedEntities.put(
          sentence.substring(substring.second(), substring.third()),
          NamedEntity.getNamedEntity(substring.first()));
    }
    return namedEntities;
  }

  private List<Triple<String, Integer, Integer>> findNerSubstrings(String sentence) {
    return classifier.classifyToCharacterOffsets(sentence);
  }

  private String cleanSentence(String sentence) {
    return sentence.replaceAll("\\p{Punct}", "").replaceAll("[ ]+", " ");
  }

  private Tree getPosTree(String sentence) {
    final Tokenizer<CoreLabel> tokenizer =
        tokenizerFactory.getTokenizer(new StringReader(sentence));
    final List<CoreLabel> tokens = tokenizer.tokenize();
    return parser.apply(tokens);
  }

  private Collection<TypedDependency> getDependencies(Tree sentenceParseTree) {
    final TreebankLanguagePack tlp = new PennTreebankLanguagePack();
    final GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
    final GrammaticalStructure gs = gsf.newGrammaticalStructure(sentenceParseTree);
    return gs.typedDependenciesCollapsed();
  }
}
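
A hedged usage sketch for the class above; ParsedSentence, Tense, and NamedEntity are project-local types assumed to exist alongside it, and the CoreNLP parser and NER models must be on the classpath:

public class StanfordParserDemo {
  public static void main(String[] args) {
    StanfordParser parser = new StanfordParser();
    // "will" is typically tagged MD, so the first-word heuristic reports FUTURE.
    System.out.println(parser.calculateTense("will visit Paris"));
    // Named entities come back keyed by their surface span in the sentence.
    System.out.println(parser.findNamedEntities("Barack Obama visited Paris."));
  }
}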
Example #7

@SuppressWarnings("serial")
public class TextSimplification {

  public static List<String> replacementList =
      new ArrayList<String>() {
        {
          add("he");
          add("him");
          add("his");
          add("she");
          add("her");
          add("they");
          add("them");
          add("their");
          add("i");
          add("her's");
          add("you");
          add("your");
          add("your's");
          add("mine");
          add("my");
          add("us");
          add("we");
          //		add("it");
          //		add("its");
          //		add("this");
          //		add("that");
        }
      };

  public static String resolvedSentences = "";

  private static final String PCG_MODEL = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";

  private static final TokenizerFactory<CoreLabel> tokenizerFactory =
      PTBTokenizer.factory(new CoreLabelTokenFactory(), "invertible=true");

  private static final LexicalizedParser parser = LexicalizedParser.loadModel(PCG_MODEL);

  public static void main(String[] args) throws IOException {

    // TODO:
    // * Do not consider roots with more than 2 words.
    // * The root should not be he, she, her, his, him, etc.
    // * If it is, then take the last known gendered noun and make it the root.

    String text = new String(Files.readAllBytes(Paths.get(args[0])), StandardCharsets.UTF_8);
    text = text.replace("\n", " ");

    // Resolve Anaphora
    System.out.println("Anaphora Resolution...");
    resolveAnaphora(text);
    System.out.println(
        "Anaphora Resolution Completed!\nIntermediate Output in \"AnaphoraResolved.txt\"");
    writeToFile(resolvedSentences, "AnaphoraResolved.txt");

    // Create ParseTrees
    System.out.println("Parse Tree Generation...");
    startParsing(resolvedSentences);
    System.out.println("Parse Tree Generation Completed!\nIntermediate Output in \"trees.txt\"");
  }

  public static void resolveAnaphora(String text) {

    RedwoodConfiguration.empty().capture(System.err).apply();

    Annotation document = new Annotation(text);
    Properties props = new Properties();
    props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
    props.put("dcoref.female", "female.unigram.txt");
    props.put("dcoref.male", "male.unigram.txt");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    pipeline.annotate(document);

    RedwoodConfiguration.current().clear().apply();

    Map<Integer, CorefChain> graph = document.get(CorefChainAnnotation.class);
    List<CoreMap> stnfrdSentences = document.get(SentencesAnnotation.class);

    ImmutableMultimap.Builder<Integer, Pair<CorefChain, CorefMention>> records =
        ImmutableMultimap.builder();

    graph.forEach(
        (key, value) -> {
          value
              .getMentionMap()
              .forEach(
                  (intPair, corefSet) -> {
                    corefSet.forEach(
                        mention -> records.put(mention.sentNum, Pair.of(value, mention)));
                  });
        });

    ImmutableMultimap.Builder<Integer, Pair<CorefChain, CorefMention>> recordsOrdered =
        records.orderKeysBy(Comparator.naturalOrder());

    recordsOrdered
        .build()
        .asMap()
        .forEach(
            (sentNum, mentionList) -> {
              CoreMap sentence = stnfrdSentences.get(sentNum - 1);
              List<CoreLabel> stnfrdtokens = sentence.get(TokensAnnotation.class);

              mentionList.forEach(
                  pair -> {
                    CorefChain chain = pair.getLeft();
                    CorefMention mention = pair.getRight();
                    String root = chain.getRepresentativeMention().mentionSpan;

                    if (!mention.mentionSpan.equalsIgnoreCase(root)
                        && (!root.contains(mention.mentionSpan)
                            && !mention.mentionSpan.contains(root))
                        && (!replacementList.contains(root.toLowerCase()))
                        && (root.split("\\s").length < 3)
                        && (replacementList.contains(mention.mentionSpan.toLowerCase()))) {
                      if (mention.mentionSpan.equalsIgnoreCase("her")
                          || mention.mentionSpan.equalsIgnoreCase("his")) {
                        root += "'s";
                      }
                      stnfrdtokens.get(mention.startIndex - 1).setOriginalText(root);
                    }
                  });

              String sent = "";
              for (CoreLabel token : stnfrdtokens) {
                sent += token.originalText() + " ";
              }
              resolvedSentences += sent + "\n";
            });
  }

  public static Tree parse(String str) {
    List<CoreLabel> tokens = tokenize(str);
    Tree tree = parser.apply(tokens);
    return tree;
  }

  private static List<CoreLabel> tokenize(String str) {
    Tokenizer<CoreLabel> tokenizer = tokenizerFactory.getTokenizer(new StringReader(str));
    return tokenizer.tokenize();
  }

  public static void startParsing(String paragraph) throws FileNotFoundException, IOException {
    String parseTrees = "";

    // We could probably just split on newlines here, since the paragraph is already sentence-split.
    Reader reader = new StringReader(paragraph);
    DocumentPreprocessor dp = new DocumentPreprocessor(reader);
    List<String> sentenceList = new ArrayList<String>();

    for (List<HasWord> sentence : dp) {
      String sentenceString = Sentence.listToString(sentence);
      sentenceList.add(sentenceString);
    }

    for (String sentence : sentenceList) {
      //			System.out.println(sentence);
      parseTrees += createParseTree(sentence);
    }
    writeToFile(parseTrees, "trees.txt");
  }

  public static void writeToFile(String content, String filename) throws IOException {
    // try-with-resources closes the writer even on error; FileWriter
    // truncates any existing file, so no explicit delete is needed.
    try (FileWriter fout = new FileWriter(filename)) {
      fout.write(content);
    }
  }

  public static String createParseTree(String sentence) {
    Tree tree = parse(sentence);
    //		System.out.println(tree.toString());
    return (tree.toString() + "\n");
  }
}
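
For orientation, a minimal, self-contained sketch of the dcoref pipeline this class drives, printing each coreference chain with its representative mention; the sample text is ours:

import edu.stanford.nlp.dcoref.CorefChain;
import edu.stanford.nlp.dcoref.CorefCoreAnnotations.CorefChainAnnotation;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import java.util.Properties;

public class DcorefSketch {
  public static void main(String[] args) {
    Properties props = new Properties();
    props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    Annotation document = new Annotation("John went home because he was tired.");
    pipeline.annotate(document);
    for (CorefChain chain : document.get(CorefChainAnnotation.class).values()) {
      // The representative mention is what resolveAnaphora substitutes for pronouns.
      System.out.println(chain.getRepresentativeMention().mentionSpan + " -> " + chain);
    }
  }
}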
Example #8

  public static void main(String[] args) throws IOException {
    long startTime = System.currentTimeMillis();

    LexicalizedParser lp = LexicalizedParser.loadModel("englishPCFG.ser.gz");
    TokenizerFactory<Word> tf = PTBTokenizer.factory(new WordTokenFactory(), "");
    TreePrint tp = new TreePrint("penn,typedDependenciesCollapsed");
    String sentence = "Where did the first President die ?";

    System.out.println("Enter the question or press enter for default : ");
    String tempInput;
    BufferedReader b1 = new BufferedReader(new InputStreamReader(System.in));
    tempInput = b1.readLine();
    if (tempInput == null || tempInput.length() == 0) {
      System.out.println("The question is the default one : " + sentence);
    } else {
      sentence = tempInput;
      System.out.println("The question entered is : " + sentence);
    }

    String sentence1 = PreProcess.removeStopWords1(sentence);

    System.out.println(sentence1);
    StringTokenizer st1 = new StringTokenizer(sentence1, " ");
    int n = 0;
    while (st1.hasMoreTokens()) {
      String temp1 = st1.nextToken();
      // Strip possessives and non-letter characters before storing the token:
      map.put(n, temp1.replaceAll("'s", "").replaceAll("[^A-Za-z]", ""));

      n++;
    }
    List<Word> tokens = tf.getTokenizer(new StringReader(sentence)).tokenize();
    Tree t = lp.parse(tokens); // parse returns the best parse tree

    tp.printTree(t); // print tree
    System.out.println("\nPROCESSED:\n\n");
    // Print only the typed dependencies:
    TreebankLanguagePack tlp = new PennTreebankLanguagePack();
    GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
    GrammaticalStructure gs = gsf.newGrammaticalStructure(t);

    String dependency = gs.typedDependenciesCollapsed().toString();
    System.out.println("Dependencies :" + dependency);
    // The word-form switch is hard-coded to "yes" instead of being read from stdin:
    String wordForm = "yes";
    int i = -1;
    String[][] s = new String[20][3];

    if (wordForm.equals("yes")) {
      StringTokenizer st = new StringTokenizer(dependency, " ([)],");
      while (st.hasMoreTokens()) {
        String as = st.nextToken();
        System.out.println(as);
        if (!as.contains("-")) {
          i++;
          s[i][0] = as;
        } else {
          s[i][1] = as;
          s[i][2] = st.nextToken();
        }
      }
    }

    length = i + 1;
    interchange1(s);
    System.out.println("The sorted version is ");
    for (i = 0; i < length; i++) {
      for (int j = 0; j < 3; j++) {
        System.out.print(s[i][j] + " ");
      }
      System.out.println();
    }

    System.out.println("What answer type is required: ");
    BufferedReader reader = new BufferedReader(new InputStreamReader(System.in));

    String answtype = reader.readLine();
    String secque = null;

    // Dijkstra setup: 100 acts as "infinity" for missing edges.
    int[][] adjmatrix = new int[length][length];
    for (i = 0; i < length; i++) {
      for (int j = 0; j < length; j++) {
        adjmatrix[i][j] = 100;
      }
    }
    formadj(adjmatrix, s);
    print(adjmatrix);
    if (Dijikstraalgo.dijikstra(adjmatrix, length - 1)
            - Dijikstraalgo.dijikstra(adjmatrix, length - 2)
        >= 0) {
      System.out.println("Type 1");
      if (makesentence(s, length - 1) == null) {
        secque = s[length - 1][2] + " " + s[length - 1][1];
        System.out.println(answtype + " is " + s[length - 1][2] + " " + s[length - 1][1] + " ?");

      } else {
        secque = makesentence(s, length - 1);
        System.out.println(answtype + " is " + secque + " ?");
      }
    } else {
      System.out.println("Type 2");
      System.out.println(
          "Before entering the makesentence function (the cause of the null pointer exception) "
              + s[length - 2][0]
              + " "
              + s[length - 2][1]);
      if (makesentence(s, length - 2) == null) {

        secque = s[length - 2][2] + " " + s[length - 2][1];
        System.out.println(answtype + " is " + s[length - 2][2] + " " + s[length - 2][1] + " ?");
      } else {
        secque = makesentence(s, length - 2);

        System.out.println(answtype + " is " + secque + " ?");
      }
    }
    System.out.println(sentence.replace(secque.replaceAll("[^A-Za-z ]", ""), ""));

    long endTime = System.currentTimeMillis();
    System.out.println("The time elapsed is : " + (int) (endTime - startTime) / 1000);
    System.out.println("The end");
  }
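
Dijikstraalgo above is project-local and not shown. For reference, here is a generic textbook Dijkstra over the same adjacency-matrix convention (100 standing in for "no edge"), returning all shortest-path distances from a source node; this is a sketch of the standard algorithm, not the project's own implementation:

import java.util.Arrays;

public class DijkstraSketch {
  // Shortest-path distances from src over a matrix where 100 means "no edge".
  static int[] dijkstra(int[][] adj, int src) {
    int n = adj.length;
    int[] dist = new int[n];
    boolean[] done = new boolean[n];
    Arrays.fill(dist, Integer.MAX_VALUE);
    dist[src] = 0;
    for (int iter = 0; iter < n; iter++) {
      // Pick the closest not-yet-finalized node.
      int u = -1;
      for (int v = 0; v < n; v++) {
        if (!done[v] && (u == -1 || dist[v] < dist[u])) u = v;
      }
      if (dist[u] == Integer.MAX_VALUE) break; // remaining nodes are unreachable
      done[u] = true;
      // Relax all outgoing edges of u.
      for (int v = 0; v < n; v++) {
        if (adj[u][v] < 100 && dist[u] + adj[u][v] < dist[v]) {
          dist[v] = dist[u] + adj[u][v];
        }
      }
    }
    return dist;
  }
}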