private static void getSyntaxTree() {
    syntaxTree = null;
    try {
        syntaxTree = treeReader.next();
    } catch (RuntimeException e) {
        // Malformed trees (e.g. unmatched parentheses) raise a RuntimeException
        System.out.println("exception " + e + " ::tree " + syntaxTree);
    }
}
private static void combineAllFile() {
    System.out.println("Combining all files");
    if (corpusFiles != null) {
        sortByNumber(corpusFiles);
        for (File file : corpusFiles) {
            builder = new StringBuilder();
            System.out.println("File Name: " + file.getName());
            getTreeReader(file);
            /*
             * Take at most 300 trees from each corpus file
             */
            int count = 0;
            while (treeReader.hasNext() && count < 300) {
                count++;
                formStringBuilder(treeReader.next());
            }
        }
    } else {
        builder = new StringBuilder();
        System.out.println("File Name: " + sickFile.getName());
        getTreeReader(sickFile);
        while (treeReader.hasNext()) {
            formStringBuilder(treeReader.next());
        }
    }
    formLSADocument();
    System.out.println("DONE");
}
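/*
 * Hypothetical sketch of the sortByNumber(...) helper used above, which is
 * defined elsewhere in the project. It assumes corpus files carry numeric
 * suffixes (e.g. "trees-2" sorts before "trees-10"); the comparator logic
 * below is an assumption, not the project's actual implementation.
 */
import java.io.File;
import java.util.Arrays;
import java.util.Comparator;

class SortByNumberSketch {

    static void sortByNumber(File[] files) {
        // Order files by the integer embedded in their names
        Arrays.sort(files, Comparator.comparingInt((File f) -> {
            String digits = f.getName().replaceAll("\\D+", "");
            return digits.isEmpty() ? 0 : Integer.parseInt(digits);
        }));
    }
}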
public static void main(String... args) throws Exception {
    /*
     * The inside and outside projection matrices
     */
    Object[] matrices = new Object[2];

    /*
     * Used to normalize the trees
     */
    PTBTreeNormaliser treeNormalizer = new PTBTreeNormaliser(true);

    /*
     * The serialized feature dictionary used to form the feature vectors
     */
    String featureDictionary = "/disk/scratch/s1444025/worddictionary/worddictionary.ser";

    /*
     * The directory holding the parse trees that are iterated over to
     * extract the feature vectors corresponding to all the nodes
     */
    String parsedTreeCorpus = "/afs/inf.ed.ac.uk/group/project/vsm.restored/trees";

    /*
     * The serialized dictionary bean contains the inside and outside
     * feature dictionaries that are used to form the feature vectors
     */
    VSMWordDictionaryBean dictionaryBean = VSMReadSerialWordDict.readSerializedDictionary(featureDictionary);

    System.out.println("***Getting word dictionary*****");
    Alphabet wordDictionary = dictionaryBean.getWordDictionary();
    System.out.println(wordDictionary);

    dprime = wordDictionary.size();
    d = wordDictionary.size();

    SparseMatrixLil PsiTPsi = new SparseMatrixLil(dprime, dprime);
    SparseMatrixLil PsiTPhi = new SparseMatrixLil(dprime, d);
    SparseMatrixLil PhiTPhi = new SparseMatrixLil(d, d);
    SparseMatrixLil PhiTPsi = new SparseMatrixLil(d, dprime);

    /*
     * The parsed tree corpus from which the feature vectors are extracted
     */
    File[] files = new File(parsedTreeCorpus).listFiles(new FileFilter() {
        @Override
        public boolean accept(File file) {
            return !file.isHidden();
        }
    });
    ArrayList<String> filePaths = VSMUtil.getFilePaths(files);

    /*
     * Serializes one feature vector bean per node; each bean stores the
     * inside and outside feature vectors for that node
     */
    VSMSerializeFeatureVectorBeanWord serializeBean = new VSMSerializeFeatureVectorBeanWord();

    /*
     * Matrices to store all the feature vectors; we take up to 300000
     * samples for a particular non-terminal
     */
    SparseMatrixLil Phi = new SparseMatrixLil(300000, d);
    SparseMatrixLil Psi = new SparseMatrixLil(300000, dprime);

    int count = 0;
    mainloop:
    for (String filePath : filePaths) {
        /*
         * Iterator over the trees in the file
         */
        PennTreeReader treeReader = VSMUtil.getTreeReader(filePath);

        while (treeReader.hasNext()) {
            Tree<String> syntaxTree = null;
            /*
             * Unmatched parentheses raise a RuntimeException; the BLLIP
             * corpus occasionally contains malformed parse trees
             */
            try {
                syntaxTree = treeReader.next();
            } catch (RuntimeException e) {
                System.out.println("exception " + e + " ::tree " + syntaxTree);
            }
            if (syntaxTree == null) {
                continue;
            }

            /*
             * Remove the top bracket and iterate over the nodes
             */
            syntaxTree = treeNormalizer.process(syntaxTree);
            Iterator<Tree<String>> nodeTrees = syntaxTree.iterator();

            no.uib.cipr.matrix.sparse.SparseVector psi = null;
            no.uib.cipr.matrix.sparse.SparseVector phi = null;
            Tree<String> insideTree = null;

            while (nodeTrees.hasNext()) {
                /*
                 * The inside tree for which we form feature vectors;
                 * only non-leaf nodes with the target label are used
                 */
                insideTree = nodeTrees.next();
                if (insideTree.isLeaf() || !insideTree.getLabel().equalsIgnoreCase("NNS")) {
                    continue;
                }

                /*
                 * The bean whose properties are stored in the .ser file
                 */
                VSMWordFeatureVectorBean vectorBean = new VSMWordFeatureVectorBean();
                System.out.println("****Extracting inside and outside feature vectors for node**** "
                        + insideTree.getLabel());

                /*
                 * The inside and outside feature vectors for this node
                 */
                psi = new VSMOutsideFeatureVectorWords().getOutsideFeatureVectorPsi(syntaxTree,
                        insideTree, wordDictionary, vectorBean);
                phi = new VSMInsideFeatureVectorWords().getInsideFeatureVectorPhi(insideTree,
                        wordDictionary, vectorBean);
                System.out.println("got the sparse vectors*** ");

                /*
                 * Fill the matrices only if both psi and phi are non-null
                 * for this node sample
                 */
                if (phi != null && psi != null) {
                    System.out.println(count);
                    System.out.println("****Filling in the matrices***");

                    /*
                     * Append the inside feature vector to the inside
                     * feature matrix
                     */
                    int[] indicesPhi = phi.getIndex();
                    double[] valuesPhi = phi.getData();
                    for (int i = 0; i < indicesPhi.length; i++) {
                        Phi.append(count, indicesPhi[i], valuesPhi[i]);
                    }

                    /*
                     * Append the outside feature vector to the outside
                     * feature matrix
                     */
                    int[] indicesPsi = psi.getIndex();
                    double[] valuesPsi = psi.getData();
                    for (int j = 0; j < indicesPsi.length; j++) {
                        Psi.append(count, indicesPsi[j], valuesPsi[j]);
                    }

                    /*
                     * Store the feature vectors in a bean that is
                     * serialized for future use; the vectors must be set
                     * before the local references are released
                     */
                    vectorBean.setPhi(phi);
                    vectorBean.setPsi(psi);
                    vectorBean.setInsideTree(insideTree);
                    vectorBean.setLabel(insideTree.getLabel());
                    vectorBean.setSyntaxTree(syntaxTree);
                    serializeBean.serializeWordVectorBean(vectorBean);
                    System.out.println("***Serialized the feature vector***");

                    phi = null;
                    psi = null;
                    System.gc();

                    count++;
                    /*
                     * Stop once the sample matrices are full
                     */
                    if (count == (Psi.rows - 1)) {
                        break mainloop;
                    }
                }
            }
        }
    }

    System.out.println("*****Done with matrices formation****");

    /*
     * Calculate the covariances, assuming the data is centered and
     * normalized
     */
    System.out.println("***Calculating Covariances****");
    PsiTPsi = Psi.t().mmul(Psi); // d' x d'
    PsiTPhi = Psi.t().mmul(Phi); // d' x d
    PhiTPhi = Phi.t().mmul(Phi); // d  x d
    PhiTPsi = Phi.t().mmul(Psi); // d  x d'
    System.out.println("****Done with it***");

    /*
     * Log and square-root transform
     */
    PsiTPsi = VSMUtil.createJeigenMatrix(transform(VSMUtil.createSparseMatrixMTJFromJeigen(PsiTPsi)));
    PsiTPhi = VSMUtil.createJeigenMatrix(transform(VSMUtil.createSparseMatrixMTJFromJeigen(PsiTPhi)));
    PhiTPhi = VSMUtil.createJeigenMatrix(transform(VSMUtil.createSparseMatrixMTJFromJeigen(PhiTPhi)));
    PhiTPsi = VSMUtil.createJeigenMatrix(transform(VSMUtil.createSparseMatrixMTJFromJeigen(PhiTPsi)));

    /*
     * Write the covariance matrices to a text file for inspection
     */
    System.out.println("****Writing the Covariance Matrices to the file***");
    VSMUtil.writeCovarMatrixSem(PsiTPsi, "NNS");
    VSMUtil.writeCovarMatrixSem(PsiTPhi, "NNS");
    VSMUtil.writeCovarMatrixSem(PhiTPhi, "NNS");
    VSMUtil.writeCovarMatrixSem(PhiTPsi, "NNS");
    System.out.println("***Done***");

    /*
     * Done with Psi and Phi; free up some space
     */
    Psi = null;
    Phi = null;
    System.gc();

    /*
     * The SVD template object has utility methods for preprocessing
     * before performing CCA
     */
    SVDTemplates1 svdTC = new SVDTemplates1(null);

    /*
     * Compute the CCA, passing the covariance matrices to the function
     */
    computeCCA2(MatrixFormatConversion.createSparseMatrixMTJFromJeigen(PsiTPhi),
            MatrixFormatConversion.createSparseMatrixMTJFromJeigen(PhiTPsi),
            MatrixFormatConversion.createSparseMatrixMTJFromJeigen(PhiTPhi),
            MatrixFormatConversion.createSparseMatrixMTJFromJeigen(PsiTPsi),
            svdTC, null, 0, 50, "NNS");

    /*
     * Write the projection matrices out to a file for inspection
     */
    matrices = VSMUtil.deserializeCCAVariantsRunSem("NNS");
    VSMUtil.writeEigenDictInsideSemantic(matrices, "NNS", d);
    VSMUtil.writeEigenDictOutsideSem(matrices, "NNS", dprime);

    matrices = null;
    PsiTPhi = null;
    PhiTPhi = null;
    PsiTPsi = null;
    PhiTPsi = null;
    System.gc();

    /*
     * The count map is the data structure that lets us store the .ser
     * files in proper directories with proper names, so that a future run
     * over more parse trees can resume where this one left off. Its
     * serialization is currently disabled.
     */
}
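/*
 * The transform(...) applied to the covariance matrices above is defined
 * elsewhere in the project. A minimal sketch of a log and square-root
 * damping, assuming MTJ's FlexCompRowMatrix and an element-wise
 * sqrt(log(1 + x)) over the stored entries; this is an assumption about
 * the transform, not the project's actual implementation.
 */
import no.uib.cipr.matrix.MatrixEntry;
import no.uib.cipr.matrix.sparse.FlexCompRowMatrix;

class CovarianceTransformSketch {

    static FlexCompRowMatrix transform(FlexCompRowMatrix m) {
        // MTJ iterates only over stored (non-zero) entries, so zeros stay zero
        for (MatrixEntry e : m) {
            e.set(Math.sqrt(Math.log1p(e.get())));
        }
        return m;
    }
}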
public static void main(String... args) throws Exception {
    System.out.println("+++Compiled New++++");

    nonTerminal = VSMUtil.getNonTerminal(args);
    LOGGER = VSMLogger.setup(FeatureVectors.class.getName() + "." + nonTerminal);

    featureDictionary = VSMContant.FEATURE_DICTIONARY + nonTerminal.toLowerCase() + "/dictionary.ser";
    wordDictionaryPath = VSMContant.WORD_DICT;

    LOGGER.info("Reading the feature dictionary object");
    dictionaryBean = ReadSerializedDictionary.readSerializedDictionary(featureDictionary, LOGGER);
    LOGGER.info("Reading the word dictionary object");
    wordDictBean = VSMReadSerialWordDict.readSerializedDictionary(wordDictionaryPath);

    outsideFeatureDictionary = dictionaryBean.getOutsideFeatureDictionary();
    insideFeatureDictionary = dictionaryBean.getInsideFeatureDictionary();
    wordDictionary = wordDictBean.getWordDictionary();
    LOGGER.info("Got the syntactic and semantic feature dictionaries, with word dictionary dimensions: "
            + wordDictionary.size());

    treeReader = VSMUtil.getTreeReader(VSMContant.SICK_TRIAL_TREES);
    LOGGER.info("Got training trees file iterator: " + treeReader);

    int treeCount = 0;
    while (treeReader.hasNext()) {
        getSyntaxTree();
        if (syntaxTree == null) {
            continue;
        }
        treeCount += 1;
        syntaxTree = treeNormalizer.process(syntaxTree);
        constituentsMap = syntaxTree.getConstituents();

        Iterator<Tree<String>> nodeTrees = syntaxTree.iterator();
        while (nodeTrees.hasNext()) {
            insideTree = nodeTrees.next();
            if (!insideTree.isLeaf() && insideTree.getLabel().equalsIgnoreCase(nonTerminal)) {
                createSparseVectors();
                serializeVectorBean(treeCount);
                System.out.println("Serialized the feature vector***");
            }
        }
    }
    LOGGER.info("Done creating the sparse vectors for the non-terminal: " + nonTerminal);
}
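/*
 * VSMUtil.getTreeReader(...) is used throughout but not shown here. A
 * minimal sketch, assuming the Berkeley parser's Trees.PennTreeReader,
 * whose constructor takes a java.io.Reader over Penn-Treebank-style
 * trees; the package path may differ across Berkeley parser releases,
 * and the real utility may add encoding handling or error recovery.
 */
import java.io.BufferedReader;
import java.io.FileReader;

import edu.berkeley.nlp.syntax.Trees.PennTreeReader;

class TreeReaderSketch {

    static PennTreeReader getTreeReader(String path) throws Exception {
        // PennTreeReader iterates over the parse trees in the file
        return new PennTreeReader(new BufferedReader(new FileReader(path)));
    }
}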
@Test
public void testFeatureVectorSerialization() throws Exception {
    VSMDictionaryBean matrixBean = VSMReadSerialMatrix
            .readSerializedDictionary("/Users/sameerkhurana10/Documents/featurematrixtest/dictionary.ser");
    ArrayList<Alphabet> updatedFilteredDictionaryOutside = matrixBean.getOutsideFeatureDictionary();
    ArrayList<Alphabet> updatedFilteredDictionaryInside = matrixBean.getInsideFeatureDictionary();

    /*
     * Getting all the tree files
     */
    File[] files = new File("/Users/sameerkhurana10/blipp_corpus/testtrees").listFiles();

    VSMSerializeFeatureVectorBean serializeBean = new VSMSerializeFeatureVectorBean();
    PTBTreeNormaliser treeNormalizer = new PTBTreeNormaliser(true);

    for (File file : files) {
        /*
         * Iterator over all the trees in the file
         */
        PennTreeReader treeReader = VSMUtil.getTreeReader(file.getAbsolutePath());

        while (treeReader.hasNext()) {
            Tree<String> syntaxTree = null;
            /*
             * Unmatched parentheses raise a RuntimeException
             */
            try {
                syntaxTree = treeReader.next();
            } catch (RuntimeException e) {
                System.out.println("exception " + e + " ::tree " + syntaxTree);
            }
            if (syntaxTree == null) {
                continue;
            }

            syntaxTree = treeNormalizer.process(syntaxTree);
            Map<Tree<String>, Constituent<String>> constituentsMap = syntaxTree.getConstituents();

            /*
             * Iterating over all the nodes of the tree
             */
            Iterator<Tree<String>> nodeTrees = syntaxTree.iterator();
            no.uib.cipr.matrix.sparse.SparseVector psi = null;
            no.uib.cipr.matrix.sparse.SparseVector phi = null;
            Tree<String> insideTree = null;

            while (nodeTrees.hasNext()) {
                /*
                 * The inside tree for which we want to form a feature
                 * vector and store it in the map
                 */
                insideTree = nodeTrees.next();
                System.out.println("****Serializing for node " + insideTree.getLabel());

                /*
                 * Static variables describing this node's context
                 */
                VSMUtil.setConstituentLength(constituentsMap.get(insideTree));
                VSMUtil.getNumberOfOutsideWordsLeft(insideTree, constituentsMap, syntaxTree);
                VSMUtil.getNumberOfOutsideWordsRight(insideTree, constituentsMap, syntaxTree);

                /*
                 * The foot-to-root path used for outside feature extraction
                 */
                Stack<Tree<String>> foottoroot = new Stack<Tree<String>>();
                foottoroot = VSMUtil.updateFoottorootPath(foottoroot, syntaxTree, insideTree, constituentsMap);

                /*
                 * Only do stuff if the inside tree is not a leaf
                 */
                if (insideTree.isLeaf()) {
                    continue;
                }

                /*
                 * The bean whose properties are stored in the .ser file
                 */
                VSMFeatureVectorBean vectorBean = new VSMFeatureVectorBean();

                /*
                 * The outside feature vector psi and inside feature vector phi
                 */
                psi = new VSMOutsideFeatureVector().getOutsideFeatureVectorPsi(foottoroot,
                        updatedFilteredDictionaryOutside, vectorBean);
                System.out.println("got the outside feature vector** " + psi);
                phi = new VSMInsideFeatureVector().getInsideFeatureVectorPhi(insideTree,
                        updatedFilteredDictionaryInside, vectorBean);
                System.out.println("got the inside feature vector*** " + phi);

                /*
                 * The feature vectors, the inside tree, the node label,
                 * the source tree, and the foot-to-root path
                 */
                vectorBean.setPhi(phi);
                vectorBean.setPsi(psi);
                vectorBean.setInsideTree(insideTree);
                vectorBean.setLabel(insideTree.getLabel());
                vectorBean.setSyntaxTree(syntaxTree);
                vectorBean.setFootToRoot(foottoroot);

                System.out.println("****here****" + vectorBean.getInsideFeatureVectorDim());
                System.out.println(vectorBean.getInsideTreeFeatureList());
                serializeBean.serializeVectorBean(vectorBean);
                System.out.println("Serialized the feature vector***");
            }
        }
    }

    /*
     * Serialize the count map
     */
    LinkedHashMap<String, Integer> countMap = VSMSerializeFeatureVectorBean.getCountMap();
    VSMCountMap countMapObject = new VSMCountMap();
    countMapObject.setCountMap(countMap);
    VSMSerializeCountMap.serializeCountMap(countMapObject);
    System.out.println("count map serialized");

    /*
     * Test the serialized object by reading it back
     */
    LinkedHashMap<String, Integer> countMapRetrieved = VSMReadSerialCountMap
            .readCountMapObj("/Users/sameerkhurana10/Documents/serialization/countMap.ser").getCountMap();
    System.out.println(countMapRetrieved);
}
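/*
 * VSMSerializeCountMap and VSMReadSerialCountMap are, as used above, thin
 * wrappers around standard Java object serialization. A minimal sketch of
 * that mechanism, serializing the map directly to the same fixed path the
 * test reads from; the real project classes wrap the map in a VSMCountMap
 * bean and may differ in detail.
 */
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.util.LinkedHashMap;

class CountMapSerializationSketch {

    static final String PATH = "/Users/sameerkhurana10/Documents/serialization/countMap.ser";

    static void serialize(LinkedHashMap<String, Integer> countMap) throws Exception {
        // Write the map with plain Java object serialization
        try (ObjectOutputStream out = new ObjectOutputStream(new FileOutputStream(PATH))) {
            out.writeObject(countMap);
        }
    }

    @SuppressWarnings("unchecked")
    static LinkedHashMap<String, Integer> read() throws Exception {
        // Read the map back from the .ser file
        try (ObjectInputStream in = new ObjectInputStream(new FileInputStream(PATH))) {
            return (LinkedHashMap<String, Integer>) in.readObject();
        }
    }
}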