public static void main(String[] args) throws Exception {
  if (args.length != 2) {
    System.err.println("usage: java TaggerDemo2 modelFile fileToTag");
    return;
  }
  MaxentTagger tagger = new MaxentTagger(args[0]);
  TokenizerFactory<CoreLabel> ptbTokenizerFactory =
      PTBTokenizer.factory(new CoreLabelTokenFactory(), "untokenizable=noneKeep");
  BufferedReader r =
      new BufferedReader(new InputStreamReader(new FileInputStream(args[1]), "utf-8"));
  PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, "utf-8"));
  DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(r);
  documentPreprocessor.setTokenizerFactory(ptbTokenizerFactory);
  for (List<HasWord> sentence : documentPreprocessor) {
    List<TaggedWord> tSentence = tagger.tagSentence(sentence);
    pw.println(Sentence.listToString(tSentence, false));
  }

  // Print the adjectives in one more sentence. This shows how to get at words and tags in a
  // tagged sentence.
  List<HasWord> sent =
      Sentence.toWordList(
          "The", "slimy", "slug", "crawled", "over", "the", "long", ",", "green", "grass", ".");
  List<TaggedWord> taggedSent = tagger.tagSentence(sent);
  for (TaggedWord tw : taggedSent) {
    if (tw.tag().startsWith("JJ")) {
      pw.println(tw.word());
    }
  }

  pw.close();
}
public MUCMentionExtractor(Dictionaries dict, Properties props, Semantics semantics)
    throws Exception {
  super(dict, semantics);
  // Read the whole MUC corpus file named in the properties into memory.
  String fileName = props.getProperty(Constants.MUC_PROP);
  fileContents = IOUtils.slurpFile(fileName);
  currentOffset = 0;
  // CoreLabel tokens without begin/end character offsets (addIndices = false),
  // using default PTB tokenizer options.
  tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(false), "");
  stanfordProcessor = loadStanfordProcessor(props);
}
public static ArrayList<TaggedWord> StanfordParse(String sentence, LexicalizedParser lp) {
  TokenizerFactory<CoreLabel> tokenizerFactory =
      PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
  List<CoreLabel> rawWords =
      tokenizerFactory.getTokenizer(new StringReader(sentence)).tokenize();
  Tree parse = lp.apply(rawWords);
  // taggedYield() returns the leaves of the parse tree paired with their POS tags.
  return parse.taggedYield();
}
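A minimal usage sketch for the helper above. The enclosing class name ParserUtilDemo and the host class ParserUtil are illustrative assumptions; the model path is the standard English PCFG model shipped with the Stanford Parser, so adjust it to your local setup if needed.

import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;

public class ParserUtilDemo {
  public static void main(String[] args) {
    // Model path is an assumption; point it at your local copy if needed.
    LexicalizedParser lp =
        LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
    // ParserUtil is a hypothetical class holding the StanfordParse method above.
    for (TaggedWord tw : ParserUtil.StanfordParse("The slimy slug crawled over the grass.", lp)) {
      System.out.println(tw.word() + "/" + tw.tag()); // e.g. slug/NN
    }
  }
}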
private static List<TypedDependency> getDependencies(String sentence) {
  if (pipeline == null) {
    loadModels();
  }
  TokenizerFactory<CoreLabel> tokenizerFactory =
      PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
  Tokenizer<CoreLabel> tok = tokenizerFactory.getTokenizer(new StringReader(sentence));
  List<CoreLabel> rawWords = tok.tokenize();
  Tree parse = lp.apply(rawWords);
  // parse.pennPrint();
  // System.out.println(parse.toString());
  TreebankLanguagePack tlp = lp.treebankLanguagePack(); // PennTreebankLanguagePack for English
  GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
  GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
  return gs.typedDependenciesCCprocessed();
}
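A hedged usage sketch: it assumes this fragment lives in the same class as getDependencies and its static lp/pipeline fields, and the sentence is purely illustrative.

// Somewhere in the same class as getDependencies:
List<TypedDependency> deps = getDependencies("The cat sat on the mat.");
for (TypedDependency td : deps) {
  // Each entry prints as reln(governor-index, dependent-index), e.g. nsubj(sat-3, cat-2).
  System.out.println(td);
}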
public static void main(String[] args) {
  System.out.println("\n\n\nSTART\n\n\n");
  try { // handle potential I/O and parsing errors
    // Open the file whose path is passed as the first argument of the main method.
    // (The DataInputStream wrapper in the original was redundant and has been dropped.)
    BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(args[0])));

    // Prepare parser, tokenizer and tree printer.
    LexicalizedParser lp = new LexicalizedParser("englishPCFG.ser.gz");
    TokenizerFactory tf = PTBTokenizer.factory(false, new WordTokenFactory());
    TreePrint tp = new TreePrint("penn,typedDependenciesCollapsed");

    // For each line of the file, retrieve it as a string called 'sentence'.
    String sentence;
    while ((sentence = br.readLine()) != null) {
      System.out.println("\n\n\n\nORIGINAL:\n\n" + sentence);

      // Put the tokens in a list, parse them, and print the best parse tree.
      List tokens = tf.getTokenizer(new StringReader(sentence)).tokenize();
      lp.parse(tokens);
      Tree t = lp.getBestParse();
      System.out.println("\nPROCESSED:\n\n");
      tp.printTree(t);
    }
    br.close();
  } catch (Exception e) {
    System.err.println("ERROR: " + e.getMessage());
  }
  System.out.println("\n\n\nTHE END\n\n\n");
}
class StanfordParser {

  private final String PCG_MODEL = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";

  private final TokenizerFactory<CoreLabel> tokenizerFactory =
      PTBTokenizer.factory(new CoreLabelTokenFactory(), "invertible=true");

  private final LexicalizedParser parser = LexicalizedParser.loadModel(PCG_MODEL);

  private final String serializedClassifier =
      "edu/stanford/nlp/models/ner/english.muc.7class.distsim.crf.ser.gz";

  private final AbstractSequenceClassifier<CoreLabel> classifier =
      CRFClassifier.getClassifierNoExceptions(serializedClassifier);

  public ParsedSentence parseSentence(String sentence, boolean removePunctuation) {
    if (removePunctuation) {
      sentence = cleanSentence(sentence);
    }
    final Tree posTree = getPosTree(sentence);
    return new ParsedSentence(posTree, getDependencies(posTree), findNamedEntities(sentence));
  }

  public Tense calculateTense(String clause) {
    final Tree posTree = getPosTree(clause);
    final Tree word = posTree.getLeaves().get(0);
    final String pos = word.parent(posTree).label().value().toLowerCase();
    if (pos.equals("md")) {
      return Tense.FUTURE;
    }
    if (pos.equals("vbd") || pos.equals("vbn")) {
      return Tense.PAST;
    }
    return Tense.PRESENT;
  }

  public Map<String, NamedEntity> findNamedEntities(String sentence) {
    final Map<String, NamedEntity> namedEntities = new HashMap<>();
    final List<Triple<String, Integer, Integer>> nerSubstrings = findNerSubstrings(sentence);
    for (final Triple<String, Integer, Integer> substring : nerSubstrings) {
      namedEntities.put(
          sentence.substring(substring.second(), substring.third()),
          NamedEntity.getNamedEntity(substring.first()));
    }
    return namedEntities;
  }

  private List<Triple<String, Integer, Integer>> findNerSubstrings(String sentence) {
    return classifier.classifyToCharacterOffsets(sentence);
  }

  private String cleanSentence(String sentence) {
    return sentence.replaceAll("\\p{Punct}", "").replaceAll("[ ]+", " ");
  }

  private Tree getPosTree(String sentence) {
    final Tokenizer<CoreLabel> tokenizer =
        tokenizerFactory.getTokenizer(new StringReader(sentence));
    final List<CoreLabel> tokens = tokenizer.tokenize();
    return parser.apply(tokens);
  }

  private Collection<TypedDependency> getDependencies(Tree sentenceParseTree) {
    final TreebankLanguagePack tlp = new PennTreebankLanguagePack();
    final GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
    final GrammaticalStructure gs = gsf.newGrammaticalStructure(sentenceParseTree);
    return gs.typedDependenciesCollapsed();
  }
}
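A short usage sketch for the wrapper above. ParsedSentence, NamedEntity, and Tense are project-local types referenced by the class (not part of the Stanford libraries), and the sentences are illustrative only. Note that calculateTense keys its heuristic off the POS tag of the first word of the clause.

// Assumes this code runs in a context where StanfordParser and its
// project-local helper types (ParsedSentence, NamedEntity, Tense) are visible.
StanfordParser sp = new StanfordParser();
ParsedSentence parsed = sp.parseSentence("Barack Obama visited Paris.", false);
Tense tense = sp.calculateTense("visited the museum"); // "visited" is VBD -> Tense.PAST
Map<String, NamedEntity> entities = sp.findNamedEntities("Barack Obama visited Paris.");
entities.forEach((span, type) -> System.out.println(span + " -> " + type));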
@SuppressWarnings("serial") public class TextSimplification { public static List<String> replacementList = new ArrayList<String>() { { add("he"); add("him"); add("his"); add("she"); add("her"); add("they"); add("them"); add("their"); add("i"); add("her's"); add("you"); add("your"); add("your's"); add("mine"); add("my"); add("us"); add("we"); // add("it"); // add("its"); // add("this"); // add("that"); } }; public static String resolvedSentences = ""; private static final String PCG_MODEL = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"; private static final TokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "invertible=true"); private static final LexicalizedParser parser = LexicalizedParser.loadModel(PCG_MODEL); public static void main(String[] args) throws IOException { // :TODO // * Do not consider roots with more than 2 words // * Root should not be he, she her, his, him etc... // * If it is, den take the last known gender noun and make it the root. String text = new String(Files.readAllBytes(Paths.get(args[0])), StandardCharsets.UTF_8); text = text.replace("\n", " "); // Resolve Anaphora System.out.println("Anaphora Resolution..."); resolveAnaphora(text); System.out.println( "Anaphora Resolution Completed!\nIntermediate Output in \"AnaphoraResolved.txt\""); writeToFile(resolvedSentences, "AnaphoraResolved.txt"); // Create ParseTrees System.out.println("Parse Tree Generation..."); startParsing((resolvedSentences)); System.out.println("Parse Tree Generation Completed!\nIntermediate Output in \"Tree.txt\""); } public static void resolveAnaphora(String text) { RedwoodConfiguration.empty().capture(System.err).apply(); Annotation document = new Annotation(text); Properties props = new Properties(); props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref"); props.put("dcoref.female", "female.unigram.txt"); props.put("dcoref.male", "male.unigram.txt"); StanfordCoreNLP pipeline = new StanfordCoreNLP(props); pipeline.annotate(document); RedwoodConfiguration.current().clear().apply(); Map<Integer, CorefChain> graph = document.get(CorefChainAnnotation.class); List<CoreMap> stnfrdSentences = document.get(SentencesAnnotation.class); ImmutableMultimap.Builder<Integer, Pair<CorefChain, CorefMention>> records = ImmutableMultimap.builder(); ImmutableMultimap.Builder<Integer, Pair<CorefChain, CorefMention>> recordsOrdered = ImmutableMultimap.builder(); graph.forEach( (key, value) -> { value .getMentionMap() .forEach( (intPair, corefSet) -> { corefSet.forEach( mention -> records.put(mention.sentNum, Pair.of(value, mention))); }); }); recordsOrdered = records.orderKeysBy( new Comparator<Integer>() { @Override public int compare(Integer o1, Integer o2) { return o1 - o2; } }); recordsOrdered .build() .asMap() .forEach( (sentNum, mentionList) -> { CoreMap sentence = stnfrdSentences.get(sentNum - 1); List<CoreLabel> stnfrdtokens = sentence.get(TokensAnnotation.class); mentionList.forEach( pair -> { CorefChain chain = pair.getLeft(); CorefMention mention = pair.getRight(); String root = chain.getRepresentativeMention().mentionSpan; if (!mention.mentionSpan.equalsIgnoreCase(root) && (!root.contains(mention.mentionSpan) && !mention.mentionSpan.contains(root)) && (!replacementList.contains(root.toLowerCase())) && (root.split("\\s").length < 3) && (replacementList.contains(mention.mentionSpan.toLowerCase()))) { if (mention.mentionSpan.equalsIgnoreCase("her") || mention.mentionSpan.equalsIgnoreCase("his")) { root += "'s"; } 
stnfrdtokens.get(mention.startIndex - 1).setOriginalText(root); } }); String sent = ""; for (CoreLabel token : stnfrdtokens) { sent += token.originalText() + " "; } ; resolvedSentences += sent + "\n"; }); } public static Tree parse(String str) { List<CoreLabel> tokens = tokenize(str); Tree tree = parser.apply(tokens); return tree; } private static List<CoreLabel> tokenize(String str) { Tokenizer<CoreLabel> tokenizer = tokenizerFactory.getTokenizer(new StringReader(str)); return tokenizer.tokenize(); } public static void startParsing(String paragraph) throws FileNotFoundException, IOException { String parseTrees = ""; // Can we just split on new line as paragraph is already sentence splitted. Reader reader = new StringReader(paragraph); DocumentPreprocessor dp = new DocumentPreprocessor(reader); List<String> sentenceList = new ArrayList<String>(); for (List<HasWord> sentence : dp) { String sentenceString = Sentence.listToString(sentence); sentenceList.add(sentenceString); } for (String sentence : sentenceList) { // System.out.println(sentence); parseTrees += createParseTree(sentence); } writeToFile(parseTrees, "trees.txt"); } public static void writeToFile(String content, String filename) throws IOException { File file = new File(filename); file.delete(); FileWriter fout = new FileWriter(filename); fout.write(content); fout.close(); } public static String createParseTree(String sentence) { Tree tree = parse(sentence); // System.out.println(tree.toString()); return (tree.toString() + "\n"); } }
public static void main(String[] args) throws IOException {
  long startTime = System.currentTimeMillis();

  LexicalizedParser lp = new LexicalizedParser("englishPCFG.ser.gz");
  TokenizerFactory tf = PTBTokenizer.factory(false, new WordTokenFactory());
  TreePrint tp = new TreePrint("penn,typedDependenciesCollapsed");

  String sentence = "Where did the first President die ?";
  System.out.println("Enter the question or press enter for default : ");
  BufferedReader b1 = new BufferedReader(new InputStreamReader(System.in));
  String tempInput = b1.readLine();
  if (tempInput.length() == 0) {
    System.out.println("The question is the default one : " + sentence);
  } else {
    sentence = tempInput;
    System.out.println("The question entered is : " + sentence);
  }

  // Strip stop words, then store the cleaned tokens by position.
  String sentence1 = PreProcess.removeStopWords1(sentence);
  System.out.println(sentence1);
  StringTokenizer st1 = new StringTokenizer(sentence1, " ");
  int n = 0;
  while (st1.hasMoreTokens()) {
    String temp1 = st1.nextToken();
    map.put(n, temp1.replaceAll("'s", "").replaceAll("[^A-Za-z]", ""));
    n++;
  }

  // Tokenize and parse the full question, then print the best parse tree.
  List tokens = tf.getTokenizer(new StringReader(sentence)).tokenize();
  lp.parse(tokens);
  Tree t = lp.getBestParse();
  tp.printTree(t);
  System.out.println("\nPROCESSED:\n\n");

  // Extract the collapsed typed dependencies.
  TreebankLanguagePack tlp = new PennTreebankLanguagePack();
  GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
  GrammaticalStructure gs = gsf.newGrammaticalStructure(t);
  // Tree b = t.firstChild();
  // System.out.println("\nFirst child of the tree is :\n\n"); tp.printTree(b);
  String dependency = gs.typedDependenciesCollapsed().toString();
  System.out.println("Dependencies :" + dependency);

  // Parse the dependency string into (relation, governor, dependent) triples.
  String wordForm = "yes";
  int i = -1;
  String[][] s = new String[20][3];
  if (wordForm.equals("yes")) {
    StringTokenizer st = new StringTokenizer(dependency, " ([)],");
    while (st.hasMoreTokens()) {
      String as = st.nextToken();
      System.out.println(as);
      if (!as.contains("-")) {
        i++;
        s[i][0] = as;
      } else {
        s[i][1] = as;
        s[i][2] = st.nextToken();
      }
    }
  }
  length = i + 1;

  interchange1(s);
  System.out.println("The sorted version is ");
  for (i = 0; i < length; i++) {
    for (int j = 0; j < 3; j++) {
      System.out.print(s[i][j] + " ");
    }
    System.out.println();
  }

  System.out.println("What answer type is required: ");
  BufferedReader reader = new BufferedReader(new InputStreamReader(System.in));
  String answtype = reader.readLine();

  // Build the adjacency matrix (100 = no edge) and run Dijkstra's algorithm.
  String secque = null;
  int[][] adjmatrix = new int[length][length];
  for (i = 0; i < length; i++) {
    for (int j = 0; j < length; j++) {
      adjmatrix[i][j] = 100;
    }
  }
  formadj(adjmatrix, s);
  print(adjmatrix);

  if (Dijikstraalgo.dijikstra(adjmatrix, length - 1)
          - Dijikstraalgo.dijikstra(adjmatrix, length - 2)
      >= 0) {
    System.out.println("Type 1");
    if (makesentence(s, length - 1) == null) {
      secque = s[length - 1][2] + " " + s[length - 1][1];
      System.out.println(answtype + " is " + s[length - 1][2] + " " + s[length - 1][1] + " ?");
    } else {
      secque = makesentence(s, length - 1);
      System.out.println(answtype + " is " + secque + " ?");
    }
  } else {
    System.out.println("Type 2");
    if (makesentence(s, length - 2) == null) {
      secque = s[length - 2][2] + " " + s[length - 2][1];
      System.out.println(answtype + " is " + s[length - 2][2] + " " + s[length - 2][1] + " ?");
    } else {
      secque = makesentence(s, length - 2);
      System.out.println(answtype + " is " + secque + " ?");
    }
  }

  System.out.println(sentence.replace(secque.replaceAll("[^A-Za-z ]", ""), ""));

  long endTime = System.currentTimeMillis();
  System.out.println("The time elapsed is : " + (int) (endTime - startTime) / 1000);
  System.out.println("The end");
}