예제 #1
0
  /**
   * Counts the one-bits stored in a single row, accumulated per column.
   *
   * <p>Each matrix in {@code this.matrix} contributes its {@code row}-th sparse row. Every
   * non-zero stored value is passed to {@code countBits} and the result is added to the counter
   * of the column index stored alongside that value.
   *
   * @param row index of the row to read from each underlying matrix
   * @return array of length {@code columns()} holding the accumulated bit count per column
   */
  public int[] getRowBitCount(int row) {
    int[] bits = new int[columns()];
    for (FlexCompRowMatrix cell : this.matrix) {
      SparseVector vector = cell.getRow(row);

      /*
       * Sparse vector: the index array says where the values are, the data array holds the
       * values themselves.
       */
      double[] data = vector.getData();
      int[] indices = vector.getIndex();

      /*
       * Never read past the shorter of the two arrays.
       * NOTE(review): some versions of the sparse-vector API appear to return an index array
       * trimmed to the used entries while the data array keeps its full capacity - confirm
       * against the library version in use.
       */
      int limit = Math.min(data.length, indices.length);
      for (int j = 0; j < limit; j++) {
        // An index of 0 anywhere but the first slot is treated as the start of the unused tail.
        if (j > 0 && indices[j] == 0) {
          break;
        }
        if (data[j] != 0.0) {
          bits[indices[j]] += countBits(data[j]);
        }
      }
    }
    return bits;
  }
  /**
   * Extracts inside (phi) and outside (psi) word feature vectors for every non-leaf node
   * labelled "NNS" in the parse-tree corpus, serializes one feature-vector bean per node,
   * computes the four cross-product matrices of the collected samples, transforms and writes
   * them out, and finally runs {@code computeCCA2} and writes the resulting projection
   * dictionaries.
   *
   * <p>The dictionary and corpus locations are hard-coded below. The static fields {@code d} and
   * {@code dprime} are both set to the size of the word dictionary.
   *
   * @param args ignored
   * @throws Exception if any of the called reading, serialization or CCA steps fails
   */
  public static void main(String... args) throws Exception {

    /*
     * Non-terminal whose nodes are sampled; also used to name the output files.
     */
    final String nonTerminal = "NNS";

    /*
     * Number of rows reserved in the sample matrices.
     */
    final int sampleCapacity = 300000;

    /*
     * Used to normalize the trees
     */
    PTBTreeNormaliser treeNormalizer = new PTBTreeNormaliser(true);

    /*
     * Path of the serialized feature dictionary used to form the feature
     * vectors
     */
    String featureDictionary = "/disk/scratch/s1444025/worddictionary/worddictionary.ser";

    /*
     * Directory holding the parse trees that are iterated over to extract
     * the feature vectors of their nodes
     */
    String parsedTreeCorpus = "/afs/inf.ed.ac.uk/group/project/vsm.restored/trees";

    /*
     * Reading the serialized dictionary bean and taking the word
     * dictionary out of it
     */
    VSMWordDictionaryBean dictionaryBean =
        VSMReadSerialWordDict.readSerializedDictionary(featureDictionary);

    System.out.println("***Getting word dictionary*****");
    Alphabet wordDictionary = dictionaryBean.getWordDictionary();
    System.out.println(wordDictionary);

    dprime = wordDictionary.size();
    d = wordDictionary.size();

    /*
     * All non-hidden entries of the corpus directory
     */
    File[] files =
        new File(parsedTreeCorpus)
            .listFiles(
                new FileFilter() {

                  @Override
                  public boolean accept(File file) {
                    return !file.isHidden();
                  }
                });

    ArrayList<String> filePaths = VSMUtil.getFilePaths(files);

    /*
     * Serializes one feature vector bean per sampled node
     */
    VSMSerializeFeatureVectorBeanWord serializeBean = new VSMSerializeFeatureVectorBeanWord();

    /*
     * Sample matrices: one row per sampled node, one column per dictionary
     * entry
     */
    SparseMatrixLil phiMatrix = new SparseMatrixLil(sampleCapacity, d);
    SparseMatrixLil psiMatrix = new SparseMatrixLil(sampleCapacity, dprime);

    int count = 0;

    mainloop:
    for (String filePath : filePaths) {

      /*
       * Getting an iterator over the trees in the file
       */
      PennTreeReader treeReader = VSMUtil.getTreeReader(filePath);

      while (treeReader.hasNext()) {

        /*
         * Reading a tree can fail with a runtime exception (the original
         * author observed unmatched parentheses); such trees are reported
         * and skipped. The tree printed in the message is always null at
         * this point.
         */
        Tree<String> syntaxTree = null;
        try {
          syntaxTree = treeReader.next();
        } catch (RuntimeException e) {
          System.out.println("exception" + e + " ::tree  " + syntaxTree);
        }
        if (syntaxTree == null) {
          continue;
        }

        /*
         * Normalize the tree before walking its nodes
         */
        syntaxTree = treeNormalizer.process(syntaxTree);

        Iterator<Tree<String>> nodeTrees = syntaxTree.iterator();
        while (nodeTrees.hasNext()) {

          /*
           * The inside tree for which the feature vectors are formed
           */
          Tree<String> insideTree = nodeTrees.next();

          /*
           * Only non-leaf nodes carrying the requested label are sampled
           */
          if (insideTree.isLeaf() || !insideTree.getLabel().equalsIgnoreCase(nonTerminal)) {
            continue;
          }

          VSMWordFeatureVectorBean vectorBean = new VSMWordFeatureVectorBean();

          System.out.println(
              "****Extracting inside and outside feature vectors for node****  "
                  + insideTree.getLabel());

          no.uib.cipr.matrix.sparse.SparseVector psi =
              new VSMOutsideFeatureVectorWords()
                  .getOutsideFeatureVectorPsi(syntaxTree, insideTree, wordDictionary, vectorBean);

          no.uib.cipr.matrix.sparse.SparseVector phi =
              new VSMInsideFeatureVectorWords()
                  .getInsideFeatureVectorPhi(insideTree, wordDictionary, vectorBean);

          System.out.println("got the sparse vectors*** ");

          /*
           * A sample is only recorded when both vectors are available
           */
          if (phi == null || psi == null) {
            continue;
          }

          System.out.println(count);

          System.out.println("****Filling in the matrices***");

          /*
           * Row `count` of each sample matrix receives this node's vector.
           * No explicit garbage collection is requested here: doing so once
           * per sample is very costly and does not affect the results.
           */
          appendFeatureRow(phiMatrix, count, phi);
          appendFeatureRow(psiMatrix, count, psi);

          /*
           * Storing the feature vectors in the bean that is serialized for
           * future use. The vectors must still be referenced here; clearing
           * them beforehand would serialize a bean with null vectors.
           */
          vectorBean.setPhi(phi);
          vectorBean.setPsi(psi);
          vectorBean.setInsideTree(insideTree);
          vectorBean.setLabel(insideTree.getLabel());
          vectorBean.setSyntaxTree(syntaxTree);

          serializeBean.serializeWordVectorBean(vectorBean);
          System.out.println("***Serialized the feature vector***");

          count++;

          /*
           * Stop once all but the last reserved row have been filled
           */
          if (count == (psiMatrix.rows - 1)) {
            break mainloop;
          }
        }
      }
    }

    System.out.println("*****Done with matrices formation****");

    /*
     * Just calculating the co-variance, assuming that the data is centered
     * and normalized
     */
    System.out.println("***Calculating Covariances****");
    SparseMatrixLil psiTPsi = psiMatrix.t().mmul(psiMatrix); // d' \times d'
    SparseMatrixLil psiTPhi = psiMatrix.t().mmul(phiMatrix); // d' \times d
    SparseMatrixLil phiTPhi = phiMatrix.t().mmul(phiMatrix); // d \times d
    SparseMatrixLil phiTPsi = phiMatrix.t().mmul(psiMatrix); // d \times d'
    System.out.println("****Done with it***");

    /*
     * Each matrix is converted to the MTJ representation, passed through
     * transform(...) and converted back
     */
    psiTPsi =
        VSMUtil.createJeigenMatrix(transform(VSMUtil.createSparseMatrixMTJFromJeigen(psiTPsi)));
    psiTPhi =
        VSMUtil.createJeigenMatrix(transform(VSMUtil.createSparseMatrixMTJFromJeigen(psiTPhi)));
    phiTPhi =
        VSMUtil.createJeigenMatrix(transform(VSMUtil.createSparseMatrixMTJFromJeigen(phiTPhi)));
    phiTPsi =
        VSMUtil.createJeigenMatrix(transform(VSMUtil.createSparseMatrixMTJFromJeigen(phiTPsi)));

    /*
     * Writing the co-variance matrices out for inspection
     */
    System.out.println("****Writing the Covarinace Matrices to the file***");
    VSMUtil.writeCovarMatrixSem(psiTPsi, nonTerminal);
    VSMUtil.writeCovarMatrixSem(psiTPhi, nonTerminal);
    VSMUtil.writeCovarMatrixSem(phiTPhi, nonTerminal);
    VSMUtil.writeCovarMatrixSem(phiTPsi, nonTerminal);
    System.out.println("***Done***");

    /*
     * The sample matrices are no longer needed; release them before the
     * CCA step
     */
    psiMatrix = null;
    phiMatrix = null;
    System.gc();

    SVDTemplates1 svdTC = new SVDTemplates1(null);

    /*
     * Computing the CCA from the covariance matrices
     */
    computeCCA2(
        MatrixFormatConversion.createSparseMatrixMTJFromJeigen(psiTPhi),
        MatrixFormatConversion.createSparseMatrixMTJFromJeigen(phiTPsi),
        MatrixFormatConversion.createSparseMatrixMTJFromJeigen(phiTPhi),
        MatrixFormatConversion.createSparseMatrixMTJFromJeigen(psiTPsi),
        svdTC,
        null,
        0,
        50,
        nonTerminal);

    /*
     * Reading the projection matrices back and writing them out for
     * inspection
     */
    Object[] matrices = VSMUtil.deserializeCCAVariantsRunSem(nonTerminal);
    VSMUtil.writeEigenDictInsideSemantic(matrices, nonTerminal, d);
    VSMUtil.writeEigenDictOutsideSem(matrices, nonTerminal, dprime);
  }

  /**
   * Appends every stored entry of a sparse vector to one row of a sample matrix.
   *
   * @param target matrix that receives the entries
   * @param row row of {@code target} to append to
   * @param vector sparse vector whose index/value pairs are appended
   */
  private static void appendFeatureRow(
      SparseMatrixLil target, int row, no.uib.cipr.matrix.sparse.SparseVector vector) {
    int[] indices = vector.getIndex();
    double[] values = vector.getData();
    for (int k = 0; k < indices.length; k++) {
      target.append(row, indices[k], values[k]);
    }
  }