public static ArrayList<TaggedWord> StanfordParse(String sentence, LexicalizedParser lp) {
  // Tokenize the raw sentence with the PTB tokenizer, parse it,
  // and return the POS-tagged yield of the best parse tree.
  TokenizerFactory<CoreLabel> tokenizerFactory =
      PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
  List<CoreLabel> rawWords =
      tokenizerFactory.getTokenizer(new StringReader(sentence)).tokenize();
  Tree parse = lp.apply(rawWords);
  return parse.taggedYield();
}
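A minimal sketch of how the helper above might be called. The model path and the sample sentence are assumptions; the constructor mirrors the new LexicalizedParser("englishPCFG.ser.gz") usage in the snippets below.

public static void demoStanfordParse() {
  // Load the serialized grammar (path is an assumption; adjust to your setup).
  LexicalizedParser lp = new LexicalizedParser("englishPCFG.ser.gz");
  ArrayList<TaggedWord> tagged = StanfordParse("The quick brown fox jumps.", lp);
  for (TaggedWord tw : tagged) {
    System.out.println(tw.word() + "/" + tw.tag()); // e.g. The/DT
  }
}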
// todo: give options for document splitting. A line or the whole file or
// sentence splitting as now
public Iterator<List<IN>> getIterator(Reader r) {
  Tokenizer<IN> tokenizer = tokenizerFactory.getTokenizer(r);
  // PTBTokenizer.newPTBTokenizer(r, false, true);
  List<IN> words = new ArrayList<IN>();
  IN previous = tokenFactory.makeToken();
  StringBuilder prepend = new StringBuilder();

  /*
   * This changes SGML tags into whitespace -- it should maybe be moved
   * elsewhere
   */
  while (tokenizer.hasNext()) {
    IN w = tokenizer.next();
    String word = w.get(CoreAnnotations.TextAnnotation.class);
    Matcher m = sgml.matcher(word);
    if (m.matches()) {
      // SGML tag: fold it (plus surrounding whitespace) into the
      // "after" context of the previous real token.
      String before = StringUtils.getNotNullString(w.get(CoreAnnotations.BeforeAnnotation.class));
      String after = StringUtils.getNotNullString(w.get(CoreAnnotations.AfterAnnotation.class));
      prepend.append(before).append(word);
      String previousTokenAfter =
          StringUtils.getNotNullString(previous.get(CoreAnnotations.AfterAnnotation.class));
      previous.set(CoreAnnotations.AfterAnnotation.class, previousTokenAfter + word + after);
      // previous.appendAfter(w.word() + w.after());
    } else {
      // Real token: carry any buffered SGML text in its "before" context.
      String before = StringUtils.getNotNullString(w.get(CoreAnnotations.BeforeAnnotation.class));
      if (prepend.length() > 0) {
        w.set(CoreAnnotations.BeforeAnnotation.class, prepend.toString() + before);
        // w.prependBefore(prepend.toString());
        prepend = new StringBuilder();
      }
      words.add(w);
      previous = w;
    }
  }

  List<List<IN>> sentences = wts.process(words);
  String after = "";
  IN last = null;
  for (List<IN> sentence : sentences) {
    int pos = 0;
    for (IN w : sentence) {
      w.set(CoreAnnotations.PositionAnnotation.class, Integer.toString(pos));
      pos++; // pos was declared but never advanced; increment so positions are distinct
      after = StringUtils.getNotNullString(w.get(CoreAnnotations.AfterAnnotation.class));
      w.remove(CoreAnnotations.AfterAnnotation.class);
      last = w;
    }
  }
  if (last != null) {
    last.set(CoreAnnotations.AfterAnnotation.class, after);
  }
  return sentences.iterator();
}
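The wts field above is presumably a WordToSentenceProcessor; under that assumption, here is a self-contained sketch of the same tokenize-then-sentence-split pipeline over CoreLabels.

import java.io.StringReader;
import java.util.List;

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.WordToSentenceProcessor;

public class SentenceSplitDemo {
  public static void main(String[] args) {
    // Tokenize a two-sentence string into CoreLabels...
    List<CoreLabel> tokens = PTBTokenizer.factory(new CoreLabelTokenFactory(), "")
        .getTokenizer(new StringReader("It works. It really does."))
        .tokenize();
    // ...then group the flat token list into sentences.
    WordToSentenceProcessor<CoreLabel> wts = new WordToSentenceProcessor<CoreLabel>();
    List<List<CoreLabel>> sentences = wts.process(tokens);
    System.out.println(sentences.size() + " sentences"); // expect: 2 sentences
  }
}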
public static void main(String[] args) { // start of the main method
  System.out.println("\n\n\nSTART\n\n\n"); // print START
  try { // device to handle potential errors
    // open the file whose path is passed
    // as the first argument of the main method:
    FileInputStream fis = new FileInputStream(args[0]);
    DataInputStream dis = new DataInputStream(fis);
    BufferedReader br = new BufferedReader(new InputStreamReader(dis));

    // prepare Parser, Tokenizer and Tree printer:
    LexicalizedParser lp = new LexicalizedParser("englishPCFG.ser.gz");
    TokenizerFactory<Word> tf = PTBTokenizer.factory(false, new WordTokenFactory());
    TreePrint tp = new TreePrint("penn,typedDependenciesCollapsed");

    String sentence; // initialization
    // for each line of the file,
    // retrieve it as a string called 'sentence':
    while ((sentence = br.readLine()) != null) {
      System.out.println("\n\n\n\nORIGINAL:\n\n" + sentence); // print sentence
      // put tokens in a list:
      List<Word> tokens = tf.getTokenizer(new StringReader(sentence)).tokenize();
      lp.parse(tokens); // parse the tokens
      Tree t = lp.getBestParse(); // get the best parse tree
      System.out.println("\nPROCESSED:\n\n");
      tp.printTree(t); // print tree
    }
    dis.close(); // close input file
  } catch (Exception e) { // catch error if any
    System.err.println("ERROR: " + e.getMessage()); // print error message
  }
  System.out.println("\n\n\nTHE END\n\n\n"); // print THE END
} // end of the main method
public static void main(String[] args) throws IOException {
  long startTime = System.currentTimeMillis();
  LexicalizedParser lp = new LexicalizedParser("englishPCFG.ser.gz");
  TokenizerFactory<Word> tf = PTBTokenizer.factory(false, new WordTokenFactory());
  TreePrint tp = new TreePrint("penn,typedDependenciesCollapsed");

  String sentence = "Where did the first President die ?";
  System.out.println("Enter the question or press enter for default : ");
  BufferedReader b1 = new BufferedReader(new InputStreamReader(System.in));
  String tempInput = b1.readLine();
  if (tempInput == null || tempInput.length() == 0) { // null guard: readLine() returns null on EOF
    System.out.println("The question is the default one : " + sentence);
  } else {
    sentence = tempInput;
    System.out.println("The question entered is : " + sentence);
  }

  String sentence1 = PreProcess.removeStopWords1(sentence);
  System.out.println(sentence1);
  StringTokenizer st1 = new StringTokenizer(sentence1, " ");
  int n = 0;
  while (st1.hasMoreTokens()) {
    String temp1 = st1.nextToken();
    // System.out.println("temp replace all is "
    //     + temp1.replaceAll("'s", "").replaceAll("[^A-Za-z]", ""));
    map.put(n, temp1.replaceAll("'s", "").replaceAll("[^A-Za-z]", ""));
    n++;
  }
  // for (int s = 0; s < n; s++)
  //   System.out.println(map.get(s));

  List<Word> tokens = tf.getTokenizer(new StringReader(sentence)).tokenize();
  lp.parse(tokens); // parse the tokens
  Tree t = lp.getBestParse(); // get the best parse tree
  tp.printTree(t);
  System.out.println("\nPROCESSED:\n\n");
  // tp.printTree(t); // print tree

  // print dependencies only
  TreebankLanguagePack tlp = new PennTreebankLanguagePack();
  GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
  GrammaticalStructure gs = gsf.newGrammaticalStructure(t); // dependencies
  // Tree b = t.firstChild();
  // System.out.println("\nFirst child of the tree is :\n\n"); tp.printTree(b);
  String dependency = gs.typedDependenciesCollapsed().toString();
  System.out.println("Dependencies :" + dependency);

  // BufferedReader reader = new BufferedReader(new InputStreamReader(System.in));
  // String wordForm = reader.readLine();
  String wordForm = "yes";
  int i = -1;
  String[][] s = new String[20][3];
  if (wordForm.equals("yes")) {
    // split the printed dependency list into relation / governor / dependent triples
    StringTokenizer st = new StringTokenizer(dependency, " ([)],");
    while (st.hasMoreTokens()) {
      String as = st.nextToken();
      System.out.println(as);
      if (!as.contains("-")) {
        i++;
        s[i][0] = as;
      } else {
        s[i][1] = as;
        s[i][2] = st.nextToken();
      }
    }
  }
  length = i + 1;
  interchange1(s);
  System.out.println("The sorted version is ");
  // System.out.println("\n\n***********Li8 from here on***********");
  for (i = 0; i < length; i++) {
    for (int j = 0; j < 3; j++) {
      System.out.print(s[i][j] + " ");
    }
    System.out.println();
  }

  // int adjmatrix[][] = new int[length][length];
  System.out.println("What answer type is required: ");
  BufferedReader reader = new BufferedReader(new InputStreamReader(System.in));
  String answtype = reader.readLine();
  String[] temp = sentence.split(" ", 2);
  int g = 0;
  int h = 0;
  String secque = null;

  // Dijkstra implementation
  int[][] adjmatrix = new int[length][length];
  int j = 0;
  for (i = 0; i < length; i++)
    for (j = 0; j < length; j++)
      adjmatrix[i][j] = 100;
  formadj(adjmatrix, s);
  print(adjmatrix);
  // Dijikstraalgo.dijikstra(adjmatrix, length - 2);
  // Dijikstraalgo.dijikstra(adjmatrix, length - 1);
  if (Dijikstraalgo.dijikstra(adjmatrix, length - 1)
          - Dijikstraalgo.dijikstra(adjmatrix, length - 2) >= 0) {
    System.out.println("Type 1");
    if (makesentence(s, length - 1) == null) {
      secque = s[length - 1][2] + " " + s[length - 1][1];
      System.out.println(answtype + " is " + s[length - 1][2] + " " + s[length - 1][1] + " ?");
    } else {
      secque = makesentence(s, length - 1);
      System.out.println(answtype + " is " + secque + " ?");
    }
  } else {
    System.out.println("Type 2");
    System.out.println("Before entering the makesentence function "
        + "(the cause of the null pointer exception) "
        + s[length - 2][0] + " " + s[length - 2][1]);
    if (makesentence(s, length - 2) == null) {
      secque = s[length - 2][2] + " " + s[length - 2][1];
      System.out.println(answtype + " is " + s[length - 2][2] + " " + s[length - 2][1] + " ?");
    } else {
      // System.out.println("null");
      secque = makesentence(s, length - 2);
      System.out.println(answtype + " is " + secque + " ?");
    }
  }
  // System.out.println("Secque is " + secque.replaceAll("[^A-Za-z ]", ""));
  System.out.println(sentence.replace(secque.replaceAll("[^A-Za-z ]", ""), ""));
  long endTime = System.currentTimeMillis();
  System.out.println("The time elapsed is : " + (int) (endTime - startTime) / 1000);
  System.out.println("The end");
}
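String-tokenizing gs.typedDependenciesCollapsed().toString() as done above is fragile; iterating the TypedDependency objects yields the same relation/governor/dependent triples directly. A sketch, assuming the accessor names of this generation of the API:

// Reuses the GrammaticalStructure gs built above.
for (TypedDependency td : gs.typedDependenciesCollapsed()) {
  String relation = td.reln().toString(); // e.g. "nsubj"
  String governor = td.gov().toString();  // e.g. "die-6" (word-index form)
  String dependent = td.dep().toString(); // e.g. "President-5"
  System.out.println(relation + "(" + governor + ", " + dependent + ")");
}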
public Document nextDoc() {
  List<List<CoreLabel>> allWords = new ArrayList<List<CoreLabel>>();
  List<Tree> allTrees = new ArrayList<Tree>();
  List<List<Mention>> allGoldMentions = new ArrayList<List<Mention>>();
  List<List<Mention>> allPredictedMentions;
  List<CoreMap> allSentences = new ArrayList<CoreMap>();
  Annotation docAnno = new Annotation("");

  // combine regex flags with | (bitwise or), the idiomatic form
  Pattern docPattern =
      Pattern.compile("<DOC>(.*?)</DOC>", Pattern.DOTALL | Pattern.CASE_INSENSITIVE);
  Pattern sentencePattern = Pattern.compile(
      "(<s>|<hl>|<dd>|<DATELINE>)(.*?)(</s>|</hl>|</dd>|</DATELINE>)",
      Pattern.DOTALL | Pattern.CASE_INSENSITIVE);
  Matcher docMatcher = docPattern.matcher(fileContents);
  if (!docMatcher.find(currentOffset)) return null;
  currentOffset = docMatcher.end();
  String doc = docMatcher.group(1);
  Matcher sentenceMatcher = sentencePattern.matcher(doc);
  String ner = null;

  // Maintain the current document ID.
  Pattern docIDPattern =
      Pattern.compile("<DOCNO>(.*?)</DOCNO>", Pattern.DOTALL | Pattern.CASE_INSENSITIVE);
  Matcher docIDMatcher = docIDPattern.matcher(doc);
  if (docIDMatcher.find()) currentDocumentID = docIDMatcher.group(1);
  else currentDocumentID = "documentAfter " + currentDocumentID;

  while (sentenceMatcher.find()) {
    String sentenceString = sentenceMatcher.group(2);
    List<CoreLabel> words =
        tokenizerFactory.getTokenizer(new StringReader(sentenceString)).tokenize();

    // FIXING TOKENIZATION PROBLEMS
    for (int i = 0; i < words.size(); i++) {
      CoreLabel w = words.get(i);
      if (i > 0 && w.word().equals("$")) {
        if (!words.get(i - 1).word().endsWith("PRP") && !words.get(i - 1).word().endsWith("WP"))
          continue;
        words.get(i - 1).set(TextAnnotation.class, words.get(i - 1).word() + "$");
        words.remove(i);
        i--;
      } else if (w.word().equals("\\/")) {
        if (words.get(i - 1).word().equals("</COREF>")) continue;
        w.set(TextAnnotation.class,
            words.get(i - 1).word() + "\\/" + words.get(i + 1).word());
        words.remove(i + 1);
        words.remove(i - 1);
      }
    }
    // END FIXING TOKENIZATION PROBLEMS

    List<CoreLabel> sentence = new ArrayList<CoreLabel>();
    // MUC accepts embedded coref mentions, so we need to keep a stack for the
    // mentions currently open
    Stack<Mention> stack = new Stack<Mention>();
    List<Mention> mentions = new ArrayList<Mention>();
    allWords.add(sentence);
    allGoldMentions.add(mentions);

    for (CoreLabel word : words) {
      String w = word.get(TextAnnotation.class);
      // found a regular token: WORD/POS
      if (!w.startsWith("<") && w.contains("\\/") && w.lastIndexOf("\\/") != w.length() - 2) {
        int i = w.lastIndexOf("\\/");
        String w1 = w.substring(0, i);
        // we do NOT set POS info here. We take the POS tags from the parser!
        word.set(TextAnnotation.class, w1);
        word.remove(CurrentAnnotation.class);
        if (Constants.USE_GOLD_NE) {
          if (ner != null) {
            word.set(NamedEntityTagAnnotation.class, ner);
          } else {
            word.set(NamedEntityTagAnnotation.class, "O");
          }
        }
        sentence.add(word);
      }
      // found the start SGML tag for a NE, e.g., "<ORGANIZATION>"
      else if (w.startsWith("<") && !w.startsWith("<COREF") && !w.startsWith("</")) {
        Pattern nerPattern = Pattern.compile("<(.*?)>");
        Matcher m = nerPattern.matcher(w);
        m.find();
        ner = m.group(1);
      }
      // found the end SGML tag for a NE, e.g., "</ORGANIZATION>"
      else if (w.startsWith("</") && !w.startsWith("</COREF")) {
        Pattern nerPattern = Pattern.compile("</(.*?)>");
        Matcher m = nerPattern.matcher(w);
        m.find();
        String ner1 = m.group(1);
        if (ner != null && !ner.equals(ner1))
          throw new RuntimeException("Unmatched NE labels in MUC file: " + ner + " v. " + ner1);
" + ner1); ner = null; } // found the start SGML tag for a coref mention else if (w.startsWith("<COREF")) { Mention mention = new Mention(); // position of this mention in the sentence mention.startIndex = sentence.size(); // extract GOLD info about this coref chain. needed for eval Pattern idPattern = Pattern.compile("ID=\\\"(.*?)\\\""); Pattern refPattern = Pattern.compile("REF=\\\"(.*?)\\\""); Matcher m = idPattern.matcher(w); m.find(); mention.mentionID = Integer.valueOf(m.group(1)); m = refPattern.matcher(w); if (m.find()) { mention.originalRef = Integer.valueOf(m.group(1)); } // open mention. keep track of all open mentions using the stack stack.push(mention); } // found the end SGML tag for a coref mention else if (w.equals("</COREF>")) { Mention mention = stack.pop(); mention.endIndex = sentence.size(); // this is a closed mention. add it to the final list of mentions // System.err.printf("Found MENTION: ID=%d, REF=%d\n", mention.mentionID, // mention.originalRef); mentions.add(mention); } else { word.remove(CurrentAnnotation.class); if (Constants.USE_GOLD_NE) { if (ner != null) { word.set(NamedEntityTagAnnotation.class, ner); } else { word.set(NamedEntityTagAnnotation.class, "O"); } } sentence.add(word); } } StringBuilder textContent = new StringBuilder(); for (int i = 0; i < sentence.size(); i++) { CoreLabel w = sentence.get(i); w.set(IndexAnnotation.class, i + 1); w.set(UtteranceAnnotation.class, 0); if (i > 0) textContent.append(" "); textContent.append(w.getString(TextAnnotation.class)); } CoreMap sentCoreMap = new Annotation(textContent.toString()); allSentences.add(sentCoreMap); sentCoreMap.set(TokensAnnotation.class, sentence); } // assign goldCorefClusterID HashMap<Integer, Mention> idMention = new HashMap<Integer, Mention>(); // temporary use for (int i = 0; i < allGoldMentions.size(); i++) { for (int j = 0; j < allGoldMentions.get(i).size(); j++) { Mention m = allGoldMentions.get(i).get(j); idMention.put(m.mentionID, m); } } for (int i = 0; i < allGoldMentions.size(); i++) { for (int j = 0; j < allGoldMentions.get(i).size(); j++) { Mention m = allGoldMentions.get(i).get(j); if (m.goldCorefClusterID == -1) { if (m.originalRef == -1) m.goldCorefClusterID = m.mentionID; else { Mention m2; int ref = m.originalRef; while (true) { m2 = idMention.get(ref); if (m2.goldCorefClusterID != -1) { m.goldCorefClusterID = m2.goldCorefClusterID; break; } else if (m2.originalRef == -1) { m2.goldCorefClusterID = m2.mentionID; m.goldCorefClusterID = m2.goldCorefClusterID; break; } else { ref = m2.originalRef; } } } } } } docAnno.set(SentencesAnnotation.class, allSentences); stanfordProcessor.annotate(docAnno); if (allSentences.size() != allWords.size()) throw new RuntimeException(); for (int i = 0; i < allSentences.size(); i++) { List<CoreLabel> annotatedSent = allSentences.get(i).get(TokensAnnotation.class); List<CoreLabel> unannotatedSent = allWords.get(i); List<Mention> mentionInSent = allGoldMentions.get(i); for (Mention m : mentionInSent) { m.dependency = allSentences.get(i).get(CollapsedDependenciesAnnotation.class); } if (annotatedSent.size() != unannotatedSent.size()) { throw new RuntimeException(); } int k = 0; for (int j = 0; j < annotatedSent.size(); j++, k++) { CoreLabel annotatedWord = annotatedSent.get(j); CoreLabel unannotatedWord = unannotatedSent.get(k); if (!annotatedWord .get(TextAnnotation.class) .equals(unannotatedWord.get(TextAnnotation.class))) { throw new RuntimeException(); } } allWords.set(i, annotatedSent); 
    allTrees.add(allSentences.get(i).get(TreeAnnotation.class));
  }

  // extract predicted mentions
  if (Constants.USE_GOLD_MENTIONS) allPredictedMentions = allGoldMentions;
  else allPredictedMentions = mentionFinder.extractPredictedMentions(docAnno, maxID, dictionaries);

  // add the relevant fields to mentions and order them for coref
  return arrange(docAnno, allWords, allTrees, allPredictedMentions, allGoldMentions, true);
}
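The goldCorefClusterID pass in nextDoc() is chain-following: every mention either starts its own cluster or points at an antecedent via REF. A toy standalone version of that resolution, with made-up mention IDs:

import java.util.HashMap;
import java.util.Map;

public class RefChainDemo {
  public static void main(String[] args) {
    // mentionID -> originalRef; -1 mirrors the "no antecedent" case above.
    Map<Integer, Integer> refs = new HashMap<Integer, Integer>();
    refs.put(1, -1); // mention 1 starts a cluster
    refs.put(2, 1);  // mention 2 refers back to 1
    refs.put(3, 2);  // mention 3 refers to 2, so it lands in 1's cluster too
    for (int id : refs.keySet()) {
      int cluster = id;
      // Walk the REF chain until a mention with no antecedent is reached.
      while (refs.get(cluster) != -1) {
        cluster = refs.get(cluster);
      }
      System.out.println("mention " + id + " -> cluster " + cluster);
    }
  }
}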
/**
 * Tokenize the sentence given as the first command-line argument,
 * and print the tokens to the console.
 *
 * @param args the first argument is the sentence to be tokenized
 */
public static void main(String[] args) {
  TokenizerFactory<Word> factory = PTBTokenizerFactory.newTokenizerFactory();
  Tokenizer<Word> tokenizer = factory.getTokenizer(new StringReader(args[0]));
  System.out.println(tokenizer.tokenize());
}
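A self-contained variant for a quick sanity check; the class name TokenizeDemo and the sample input are assumptions, and the expected output reflects standard Penn Treebank tokenization (contractions split in two):

import java.io.StringReader;

import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.process.PTBTokenizer.PTBTokenizerFactory;
import edu.stanford.nlp.process.Tokenizer;
import edu.stanford.nlp.process.TokenizerFactory;

public class TokenizeDemo {
  public static void main(String[] args) {
    TokenizerFactory<Word> factory = PTBTokenizerFactory.newTokenizerFactory();
    Tokenizer<Word> tokenizer = factory.getTokenizer(new StringReader("It can't be."));
    System.out.println(tokenizer.tokenize()); // [It, ca, n't, be, .]
  }
}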