/**
 * Parses {@code text} into sentences and returns the typed dependencies
 * produced by the lexicalized parser {@code lp} (an enclosing-class field).
 *
 * <p>Fixes over the previous version: dependencies from EVERY sentence are
 * accumulated (the old code reassigned {@code tdl} each iteration, silently
 * discarding all but the last sentence's dependencies), and an empty
 * collection is returned instead of {@code null} when the language pack
 * does not support grammatical structures or the text contains no
 * sentences, so callers need no null check.
 *
 * @param text raw input text; may contain one or more sentences
 * @return all typed dependencies found across all sentences; possibly
 *         empty, never {@code null}
 */
private Collection<TypedDependency> parseSentenceTDL(String text) {
    System.out.println("Parsing sentence...");
    Collection<TypedDependency> tdl = new ArrayList<TypedDependency>();
    TreebankLanguagePack tlp = lp.treebankLanguagePack();
    GrammaticalStructureFactory gsf = null;
    if (tlp.supportsGrammaticalStructures()) {
        gsf = tlp.grammaticalStructureFactory();
    }
    Reader reader = new StringReader(text);
    for (List<HasWord> sentence : new DocumentPreprocessor(reader)) {
        Tree parse = lp.apply(sentence);
        if (gsf != null) {
            GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
            // Accumulate rather than overwrite — the old bug dropped all
            // but the final sentence's dependencies.
            tdl.addAll(gs.allTypedDependencies());
        }
    }
    return tdl;
}
public HashMap<String,ArrayList<TreeData>> parseAllDocs() throws IOException{ String grammar = "./jsan_resources/englishPCFG.ser.gz"; String[] options = { "-maxLength", "120", "-retainTmpSubcategories" }; // LexicalizedParser lp = new LexicalizedParser(grammar, options); LexicalizedParser lp = new LexicalizedParser() TreebankLanguagePack tlp = new PennTreebankLanguagePack(); GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory(); Iterable<List<? extends HasWord>> sentences; ArrayList<HashMap<String,ArrayList<String>>> everything = new ArrayList<HashMap<String,ArrayList<String>>>(3); everything.add(0,otherSampleStrings); everything.add(1,authorSampleStrings); everything.add(2,toModifyStrings); Iterator<HashMap<String,ArrayList<String>>> everythingIter = everything.iterator(); int docTypeNumber = -1; // 0 for otherSampleStrings, 1 for authorSampleStrings, 2 for toModifyStrings int numLoaded = 0; while(everythingIter.hasNext()){ docTypeNumber++; HashMap<String,ArrayList<String>> currentSampleStrings = docPathFinder(); Set<String> currentDocStrings = currentSampleStrings.keySet(); Iterator<String> docStrIter = currentDocStrings.iterator(); String docID; ArrayList<String> sentenceTokens; allTreeProcessors[docTypeNumber] = new TreeProcessor(); allTreeProcessors[docTypeNumber].clearLoadedTreeDataMaps(); numLoaded=0; while(docStrIter.hasNext()){ docID = docStrIter.next(); sentenceTokens = currentSampleStrings.get(docID); if(sentenceTokens == null){ allTreeProcessors[docTypeNumber].loadTreeDataMap(docID, GRAMMAR_DIR, false); numLoaded++; continue; } //System.out.println(sentenceTokens.size()+", strIter.hasNext? -> "+strIter.hasNext()); numSentences = sentenceTokens.size(); //initialize(numSentences); Iterator<String> sentIter = sentenceTokens.iterator(); List<List<? extends HasWord>> tmp = new ArrayList<List<? extends HasWord>>(); String tempSent; while(sentIter.hasNext()){ tempSent = sentIter.next(); Tokenizer<? 
extends HasWord> toke = tlp.getTokenizerFactory().getTokenizer(new StringReader(tempSent)); List<? extends HasWord> sentenceTokenized = toke.tokenize(); tmp.add(sentenceTokenized); } sentences = tmp; //int numDone = 0; TreeProcessor.singleDocMap.clear(); boolean willSaveResults = true; for (List<? extends HasWord> sentence : sentences) { Tree parse = lp.apply(sentence); //parse.pennPrint(); //System.out.println(parse.treeSkeletonCopy().toString()); //System.out.println(parse.taggedYield()); //System.out.println(); //printSubTrees(parse); //TreeContainer.recurseTree(parse,"breadth"); allTreeProcessors[docTypeNumber].processTree(parse, 0, willSaveResults); //System.out.println(tc.processedTrees.toString().replaceAll("\\]\\], \\(","\\]\\]\n\\(")); //numDone++; //System.out.println("sent "+numDone+" of "+numSentences+" done "); //System.out.println(tc.processedTrees.toString()); //in.nextLine(); //TreeContainer.recurseTree(parse, "depth"); //in.nextLine(); //addTree(parse); //GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);//TODO: LOOK AT THIS //Collection tdl = gs.typedDependenciesCCprocessed(true); //System.out.println(tdl); //System.out.println(); } if(willSaveResults == true) ObjectIO.writeObject(TreeProcessor.singleDocMap,docID, GRAMMAR_DIR); //System.out.println("After all sents: "); //System.out.println(tc.processedTrees.toString().replaceAll("\\]\\], \\(","\\]\\]\n\\(")); //String sent3 = "This is one last test!"; //Tree parse3 = lp.apply(sent3); //parse3.pennPrint(); //System.out.println("After sorting and writing:"); //System.out.println(tc.processedTrees.toString().replaceAll("\\]\\], \\(","\\]\\]\n\\(")); //Scanner in = new Scanner(System.in); //System.out.println("First one done."); //in.nextLine(); //viewTrees(); } //TreeProcessor.writeTreeDataToCSV(sortedTD,docID); allTreeProcessors[docTypeNumber].unmergedMaps = new ArrayList<HashMap<String,TreeData>>(numLoaded+1); } int i= 0; allParsedAndOrdered.clear(); String[] docTypes = new 
String[]{"otherSample","authorSample","toModify"}; for(i=0; i < 3; i++){ allTreeProcessors[i].unmergedMaps.add(allTreeProcessors[i].processedTrees); allTreeProcessors[i].unmergedMaps.addAll(allTreeProcessors[i].loadedTreeDataMaps); allTreeProcessors[i].mergeTreeDataLists(allTreeProcessors[i].unmergedMaps); allParsedAndOrdered.put(docTypes[i],allTreeProcessors[i].sortTreeData(allTreeProcessors[i].mergedMap)); } //ArrayList<TreeData> sortedTD = TreeContainer.sortTreeData(TreeContainer.allProcessedTrees); //TreeContainer.writeTreeDataToCSV(sortedTD,"ALL_AUTHORS"); return allParsedAndOrdered; }