/**
 * Counts the one-bits stored in a single row, column by column.
 *
 * <p>Every matrix held in {@code this.matrix} contributes: for each non-zero entry stored in its
 * {@code row}-th sparse row, {@code countBits} of the stored value is added to the tally of the
 * column that entry belongs to.
 *
 * @param row index of the row to inspect
 * @return array of length {@code columns()} with the summed one-bit count for each cell in the row
 */
public int[] getRowBitCount(int row) {
    int[] bits = new int[columns()];
    for (FlexCompRowMatrix cell : this.matrix) {
        SparseVector vector = cell.getRow(row);
        /*
         * getData()/getIndex() expose the internal arrays, which can be longer than the number
         * of entries actually stored. getUsed() gives that number directly, so there is no need
         * to guess the end of the stored region from a zero index.
         */
        double[] data = vector.getData();
        int[] indices = vector.getIndex();
        int used = vector.getUsed();
        for (int j = 0; j < used; j++) {
            if (data[j] != 0.0) {
                bits[indices[j]] += countBits(data[j]);
            }
        }
    }
    return bits;
}
public static void main(String... args) throws Exception { /* * The inside and outisde projection matrices I suppose */ Object[] matrices = new Object[2]; /* * Data structure to hold all the sparse vectors */ // ArrayList<SparseVector> phiList = new ArrayList<SparseVector>(); // ArrayList<SparseVector> psiList = new ArrayList<SparseVector>(); /* * Used to normalize the trees */ PTBTreeNormaliser treeNormalizer = new PTBTreeNormaliser(true); /* * Getting the feature dictionary path, i.e. the serialized file path. * This dictionary will be used to form the feature vectors. */ String featureDictionary = null; /* * This variable tells the code about the directory path where parse * trees are stored from which feature vectors need to be extracted * corresponding to all the nodes */ String parsedTreeCorpus = null; /* * The feature dictionary that needs to be used while extracting * features */ featureDictionary = "/disk/scratch/s1444025/worddictionary/worddictionary.ser"; /* * The directory that holds the parse trees that are iterated over to * extract the feature vector corresponding to the nodes */ parsedTreeCorpus = "/afs/inf.ed.ac.uk/group/project/vsm.restored/trees"; /* * Necessary to get the appropriate directory structure */ // countMapLoc = // "/afs/inf.ed.ac.uk/group/project/vsm/countmapnodesamples/countMap.ser"; /* * Getting the serialised dictionary bean object that contains the * inside and outside feature dictionaries which are used to form the * feature vectors */ VSMWordDictionaryBean dictionaryBean = VSMReadSerialWordDict.readSerializedDictionary(featureDictionary); /* * Getting the inside and outside feature dictionaries, that are used * for forming the feature vectors */ System.out.println("***Getting word dictionary*****"); Alphabet wordDictionary = dictionaryBean.getWordDictionary(); System.out.println(wordDictionary); // System.out.println(wordDictionary.size()); dprime = wordDictionary.size(); d = wordDictionary.size(); SparseMatrixLil PsiTPsi = new 
SparseMatrixLil(dprime, dprime); SparseMatrixLil PsiTPhi = new SparseMatrixLil(dprime, d); SparseMatrixLil PhiTPhi = new SparseMatrixLil(d, d); SparseMatrixLil PhiTPsi = new SparseMatrixLil(d, dprime); /* * The parsed tree corpus from where the feature vectors need to be * extracted corresponding to all the nodes */ File[] files = new File(parsedTreeCorpus) .listFiles( new FileFilter() { @Override public boolean accept(File file) { return !file.isHidden(); } }); ArrayList<String> filePaths = VSMUtil.getFilePaths(files); /* * The obect that is used to serialize the feature vector bean. The * feature vector bean storing the inside and outside feature vectors * corresponding to a particular node in a tree. Each feature vector * bean holds the feature vectors for one particular node */ VSMSerializeFeatureVectorBeanWord serializeBean = null; /* * If we already have a serialized count map object then we would want * to start from where we left */ // File fileCountMap = new File(countMapLoc); serializeBean = new VSMSerializeFeatureVectorBeanWord(); // } else { // VSMCountMap countMapObj = VSMReadSerialCountMap // .readCountMapObj(countMapLoc); // System.out.println("inside the count map***"); // serializeBean = new VSMSerializeFeatureVectorBeanWord( // countMapObj.getCountMap()); // } /* * Getting the data structure to store all the feature vectors in it, We * are taking 200000 samples for a particular non-terminal */ SparseMatrixLil Phi = new SparseMatrixLil(300000, d); SparseMatrixLil Psi = new SparseMatrixLil(300000, dprime); int count = 0; mainloop: for (String filePath : filePaths) { /* * Getting an iterator over the trees in the file */ PennTreeReader treeReader = VSMUtil.getTreeReader(filePath); /* * Iterating over all the trees */ while (treeReader.hasNext()) { /* * The syntax tree */ Tree<String> syntaxTree = null; /* * Unmatched parentheses exception. Does this mean that the * BLLIP corpus sometimes does not have correct parse trees? 
* Strange */ try { syntaxTree = treeReader.next(); } catch (RuntimeException e) { System.out.println("exception" + e + " ::tree " + syntaxTree); } /* * Do stuff only if the syntax tree is a valid one */ if (syntaxTree != null) { /* * Process the syntax tree to remove the top bracket */ syntaxTree = treeNormalizer.process(syntaxTree); /* * Iterator over the nodes of the tree */ Iterator<Tree<String>> nodeTrees = syntaxTree.iterator(); /* * Sparse Inside and outside feature vectors declared */ no.uib.cipr.matrix.sparse.SparseVector psi = null; no.uib.cipr.matrix.sparse.SparseVector phi = null; Tree<String> insideTree = null; /* * Iterating over all the nodes in a particular syntax tree */ while (nodeTrees.hasNext()) { /* * This is the inside tree for which we want to form a * feature vector and store it in the map */ insideTree = nodeTrees.next(); /* * Only do stuff if inside tree is not a leaf */ if (!insideTree.isLeaf() && insideTree.getLabel().equalsIgnoreCase("NNS")) { /* * Setting the object's properties that are stored * in the .ser file */ VSMWordFeatureVectorBean vectorBean = new VSMWordFeatureVectorBean(); System.out.println( "****Extracting inside and outside feature vectors for node**** " + insideTree.getLabel()); /* * Getting the inside and outside feature vectors * corresponding to the partcular node */ psi = new VSMOutsideFeatureVectorWords() .getOutsideFeatureVectorPsi( syntaxTree, insideTree, wordDictionary, vectorBean); // psiList.add(psi); phi = new VSMInsideFeatureVectorWords() .getInsideFeatureVectorPhi(insideTree, wordDictionary, vectorBean); // phiList.add(phi); System.out.println("got the sparse vectors*** "); /* * Inside sparse matrix formation for the particular * node. 
*/ /* * Do the below operation only if both psi and phi * are not null for the given node sample and also * if either psi pr phi are different than before * for this spample, if both are same then no need * to unecessarily fill up Psi and Phi */ if (phi != null && psi != null) { System.out.println(count); System.out.println("****Filling in the matrices***"); int[] indicesPhi = phi.getIndex(); double[] valuesPhi = phi.getData(); /* * Don't need the phi anymore in this iteration */ phi = null; /* * Putting the inside feature vector into the * inside feature matrix */ for (int i = 0; i < indicesPhi.length; i++) { Phi.append(count, indicesPhi[i], valuesPhi[i]); } indicesPhi = null; valuesPhi = null; /* * Outside sparse matrix formation for the * particular node */ int[] indicesPsi = psi.getIndex(); double[] valuesPsi = psi.getData(); psi = null; /* * Putting the outside feature vector into the * outside feature matrix */ for (int j = 0; j < indicesPsi.length; j++) { Psi.append(count, indicesPsi[j], valuesPsi[j]); } indicesPsi = null; valuesPsi = null; System.gc(); /* * Storing the feature vectors in a bean which * will be serialized for future use */ vectorBean.setPhi(phi); vectorBean.setPsi(psi); vectorBean.setInsideTree(insideTree); vectorBean.setLabel(insideTree.getLabel()); vectorBean.setSyntaxTree(syntaxTree); /* * Serialize the feature vector bean * corresponding to the particular node. 
The * feature vector bean contains the sparse * inside and outside feature vectors */ serializeBean.serializeWordVectorBean(vectorBean); System.out.println("***Serialized the feature vector***"); count++; /* * Break when we have 200000 samples */ if (count == (Psi.rows - 1)) { break mainloop; } } } } } } } /* * Call the CCA function here */ System.out.println("*****Done with matrices formation****"); /* * Just calculating the co-vavriance, assuming that the data is centered * and normalized */ System.out.println("***Calculating Covariances****"); PsiTPsi = Psi.t().mmul(Psi); // d' \times d' PsiTPhi = Psi.t().mmul(Phi); // d' \times d PhiTPhi = Phi.t().mmul(Phi); // d \times d PhiTPsi = Phi.t().mmul(Psi); // d \times d' System.out.println("****Done with it***"); /* * Log and square root transform */ PsiTPsi = VSMUtil.createJeigenMatrix(transform(VSMUtil.createSparseMatrixMTJFromJeigen(PsiTPsi))); PsiTPhi = VSMUtil.createJeigenMatrix(transform(VSMUtil.createSparseMatrixMTJFromJeigen(PsiTPhi))); PhiTPhi = VSMUtil.createJeigenMatrix(transform(VSMUtil.createSparseMatrixMTJFromJeigen(PhiTPhi))); PhiTPsi = VSMUtil.createJeigenMatrix(transform(VSMUtil.createSparseMatrixMTJFromJeigen(PhiTPsi))); /* * Writing the co-variance matrices in a text file to see what's going * on */ System.out.println("****Writing the Covarinace Matrices to the file***"); VSMUtil.writeCovarMatrixSem(PsiTPsi, "NNS"); VSMUtil.writeCovarMatrixSem(PsiTPhi, "NNS"); VSMUtil.writeCovarMatrixSem(PhiTPhi, "NNS"); VSMUtil.writeCovarMatrixSem(PhiTPsi, "NNS"); System.out.println("***Done***"); /* * Done with the Psi and Phi and freeing up some space */ Psi = null; Phi = null; System.gc(); /* * Getting the the similarity scoressvd template object that has utility * methods to do preprocessing before performing CCA */ SVDTemplates1 svdTC = new SVDTemplates1(null); /* * Function to compute the CCA, passing the covariance matrices to the * function */ computeCCA2( 
MatrixFormatConversion.createSparseMatrixMTJFromJeigen(PsiTPhi), MatrixFormatConversion.createSparseMatrixMTJFromJeigen(PhiTPsi), MatrixFormatConversion.createSparseMatrixMTJFromJeigen(PhiTPhi), MatrixFormatConversion.createSparseMatrixMTJFromJeigen(PsiTPsi), svdTC, null, 0, 50, "NNS"); /* * Writing the projection matrices out in a file to see what is in there */ matrices = VSMUtil.deserializeCCAVariantsRunSem("NNS"); VSMUtil.writeEigenDictInsideSemantic(matrices, "NNS", d); VSMUtil.writeEigenDictOutsideSem(matrices, "NNS", dprime); matrices = null; PsiTPhi = null; PhiTPhi = null; PsiTPsi = null; PhiTPsi = null; System.gc(); /* * We would also like to serialize the count map. The count map is the * data structure that helps us store the .ser files in proper * directories with proper names. So, if in future we want to extract * feature vectors corresponding to more parse trees, we will start from * where we left in the directory structure and file name */ /* * Getting the updated count map */ // countMap = VSMSerializeFeatureVectorBean.getCountMap(); // /* // * The object that will be serialized // */ // VSMCountMap countMapObject = new VSMCountMap(); // countMapObject.setCountMap(countMap); // // /* // * Serialize count map // */ // VSMSerializeCountMap.serializeCountMap(countMapObject); // System.out.println("*****count map serialized****"); }