private static void getSynaxTree() {
   syntaxTree = null;
   try {
     syntaxTree = treeReader.next();
   } catch (RuntimeException e) {
     System.out.println("exception" + e + " ::tree  " + syntaxTree);
   }
 }
Ejemplo n.º 2
0
  private static void combineAllFile() {

    System.out.println("Combining all FILES");

    if (corpusFiles != null) {
      sortByNumber(corpusFiles);
      for (File file : corpusFiles) {
        builder = new StringBuilder();
        System.out.println("File Name: " + file.getName());
        getTreeReader(file);
        int count = 0;
        inner:
        while (treeReader.hasNext()) {
          count++;
          formStringBuilder(treeReader.next());
          if (count == 300) {
            count = 0;
            break inner;
          }
        }
      }
    } else {
      builder = new StringBuilder();
      System.out.println("File Name: " + sickFile.getName());
      getTreeReader(sickFile);
      while (treeReader.hasNext()) {
        formStringBuilder(treeReader.next());
      }
    }

    // builder.append("\n");
    // builder.append("DOCSTART-X-0");
    // builder.append("\n");
    // builder.append("\n");
    formLSADocument();

    System.out.println("DONE");
  }
  public static void main(String... args) throws Exception {

    /*
     * The inside and outisde projection matrices I suppose
     */
    Object[] matrices = new Object[2];

    /*
     * Data structure to hold all the sparse vectors
     */
    // ArrayList<SparseVector> phiList = new ArrayList<SparseVector>();
    // ArrayList<SparseVector> psiList = new ArrayList<SparseVector>();

    /*
     * Used to normalize the trees
     */

    PTBTreeNormaliser treeNormalizer = new PTBTreeNormaliser(true);

    /*
     * Getting the feature dictionary path, i.e. the serialized file path.
     * This dictionary will be used to form the feature vectors.
     */
    String featureDictionary = null;

    /*
     * This variable tells the code about the directory path where parse
     * trees are stored from which feature vectors need to be extracted
     * corresponding to all the nodes
     */
    String parsedTreeCorpus = null;

    /*
     * The feature dictionary that needs to be used while extracting
     * features
     */
    featureDictionary = "/disk/scratch/s1444025/worddictionary/worddictionary.ser";
    /*
     * The directory that holds the parse trees that are iterated over to
     * extract the feature vector corresponding to the nodes
     */
    parsedTreeCorpus = "/afs/inf.ed.ac.uk/group/project/vsm.restored/trees";
    /*
     * Necessary to get the appropriate directory structure
     */
    // countMapLoc =
    // "/afs/inf.ed.ac.uk/group/project/vsm/countmapnodesamples/countMap.ser";

    /*
     * Getting the serialised dictionary bean object that contains the
     * inside and outside feature dictionaries which are used to form the
     * feature vectors
     */
    VSMWordDictionaryBean dictionaryBean =
        VSMReadSerialWordDict.readSerializedDictionary(featureDictionary);

    /*
     * Getting the inside and outside feature dictionaries, that are used
     * for forming the feature vectors
     */
    System.out.println("***Getting word dictionary*****");
    Alphabet wordDictionary = dictionaryBean.getWordDictionary();
    System.out.println(wordDictionary);
    // System.out.println(wordDictionary.size());

    dprime = wordDictionary.size();
    d = wordDictionary.size();

    SparseMatrixLil PsiTPsi = new SparseMatrixLil(dprime, dprime);
    SparseMatrixLil PsiTPhi = new SparseMatrixLil(dprime, d);
    SparseMatrixLil PhiTPhi = new SparseMatrixLil(d, d);
    SparseMatrixLil PhiTPsi = new SparseMatrixLil(d, dprime);

    /*
     * The parsed tree corpus from where the feature vectors need to be
     * extracted corresponding to all the nodes
     */
    File[] files =
        new File(parsedTreeCorpus)
            .listFiles(
                new FileFilter() {

                  @Override
                  public boolean accept(File file) {
                    return !file.isHidden();
                  }
                });

    ArrayList<String> filePaths = VSMUtil.getFilePaths(files);

    /*
     * The obect that is used to serialize the feature vector bean. The
     * feature vector bean storing the inside and outside feature vectors
     * corresponding to a particular node in a tree. Each feature vector
     * bean holds the feature vectors for one particular node
     */
    VSMSerializeFeatureVectorBeanWord serializeBean = null;

    /*
     * If we already have a serialized count map object then we would want
     * to start from where we left
     */
    // File fileCountMap = new File(countMapLoc);

    serializeBean = new VSMSerializeFeatureVectorBeanWord();
    // } else {
    // VSMCountMap countMapObj = VSMReadSerialCountMap
    // .readCountMapObj(countMapLoc);
    // System.out.println("inside the count map***");
    // serializeBean = new VSMSerializeFeatureVectorBeanWord(
    // countMapObj.getCountMap());
    // }

    /*
     * Getting the data structure to store all the feature vectors in it, We
     * are taking 200000 samples for a particular non-terminal
     */
    SparseMatrixLil Phi = new SparseMatrixLil(300000, d);
    SparseMatrixLil Psi = new SparseMatrixLil(300000, dprime);

    int count = 0;

    mainloop:
    for (String filePath : filePaths) {

      /*
       * Getting an iterator over the trees in the file
       */
      PennTreeReader treeReader = VSMUtil.getTreeReader(filePath);

      /*
       * Iterating over all the trees
       */
      while (treeReader.hasNext()) {

        /*
         * The syntax tree
         */
        Tree<String> syntaxTree = null;

        /*
         * Unmatched parentheses exception. Does this mean that the
         * BLLIP corpus sometimes does not have correct parse trees?
         * Strange
         */
        try {
          syntaxTree = treeReader.next();
        } catch (RuntimeException e) {
          System.out.println("exception" + e + " ::tree  " + syntaxTree);
        }

        /*
         * Do stuff only if the syntax tree is a valid one
         */
        if (syntaxTree != null) {

          /*
           * Process the syntax tree to remove the top bracket
           */
          syntaxTree = treeNormalizer.process(syntaxTree);

          /*
           * Iterator over the nodes of the tree
           */
          Iterator<Tree<String>> nodeTrees = syntaxTree.iterator();

          /*
           * Sparse Inside and outside feature vectors declared
           */
          no.uib.cipr.matrix.sparse.SparseVector psi = null;
          no.uib.cipr.matrix.sparse.SparseVector phi = null;
          Tree<String> insideTree = null;

          /*
           * Iterating over all the nodes in a particular syntax tree
           */
          while (nodeTrees.hasNext()) {

            /*
             * This is the inside tree for which we want to form a
             * feature vector and store it in the map
             */
            insideTree = nodeTrees.next();

            /*
             * Only do stuff if inside tree is not a leaf
             */
            if (!insideTree.isLeaf() && insideTree.getLabel().equalsIgnoreCase("NNS")) {

              /*
               * Setting the object's properties that are stored
               * in the .ser file
               */
              VSMWordFeatureVectorBean vectorBean = new VSMWordFeatureVectorBean();

              System.out.println(
                  "****Extracting inside and outside feature vectors for node****  "
                      + insideTree.getLabel());

              /*
               * Getting the inside and outside feature vectors
               * corresponding to the partcular node
               */

              psi =
                  new VSMOutsideFeatureVectorWords()
                      .getOutsideFeatureVectorPsi(
                          syntaxTree, insideTree, wordDictionary, vectorBean);
              // psiList.add(psi);

              phi =
                  new VSMInsideFeatureVectorWords()
                      .getInsideFeatureVectorPhi(insideTree, wordDictionary, vectorBean);
              // phiList.add(phi);

              System.out.println("got the sparse vectors*** ");

              /*
               * Inside sparse matrix formation for the particular
               * node.
               */

              /*
               * Do the below operation only if both psi and phi
               * are not null for the given node sample and also
               * if either psi pr phi are different than before
               * for this spample, if both are same then no need
               * to unecessarily fill up Psi and Phi
               */

              if (phi != null && psi != null) {

                System.out.println(count);

                System.out.println("****Filling in the matrices***");

                int[] indicesPhi = phi.getIndex();
                double[] valuesPhi = phi.getData();
                /*
                 * Don't need the phi anymore in this iteration
                 */
                phi = null;
                /*
                 * Putting the inside feature vector into the
                 * inside feature matrix
                 */
                for (int i = 0; i < indicesPhi.length; i++) {
                  Phi.append(count, indicesPhi[i], valuesPhi[i]);
                }
                indicesPhi = null;
                valuesPhi = null;

                /*
                 * Outside sparse matrix formation for the
                 * particular node
                 */
                int[] indicesPsi = psi.getIndex();
                double[] valuesPsi = psi.getData();
                psi = null;

                /*
                 * Putting the outside feature vector into the
                 * outside feature matrix
                 */
                for (int j = 0; j < indicesPsi.length; j++) {
                  Psi.append(count, indicesPsi[j], valuesPsi[j]);
                }
                indicesPsi = null;
                valuesPsi = null;
                System.gc();

                /*
                 * Storing the feature vectors in a bean which
                 * will be serialized for future use
                 */
                vectorBean.setPhi(phi);
                vectorBean.setPsi(psi);
                vectorBean.setInsideTree(insideTree);
                vectorBean.setLabel(insideTree.getLabel());
                vectorBean.setSyntaxTree(syntaxTree);

                /*
                 * Serialize the feature vector bean
                 * corresponding to the particular node. The
                 * feature vector bean contains the sparse
                 * inside and outside feature vectors
                 */
                serializeBean.serializeWordVectorBean(vectorBean);
                System.out.println("***Serialized the feature vector***");

                count++;

                /*
                 * Break when we have 200000 samples
                 */
                if (count == (Psi.rows - 1)) {
                  break mainloop;
                }
              }
            }
          }
        }
      }
    }

    /*
     * Call the CCA function here
     */

    System.out.println("*****Done with matrices formation****");

    /*
     * Just calculating the co-vavriance, assuming that the data is centered
     * and normalized
     */

    System.out.println("***Calculating Covariances****");
    PsiTPsi = Psi.t().mmul(Psi); // d' \times d'
    PsiTPhi = Psi.t().mmul(Phi); // d' \times d
    PhiTPhi = Phi.t().mmul(Phi); // d \times d
    PhiTPsi = Phi.t().mmul(Psi); // d \times d'
    System.out.println("****Done with it***");

    /*
     * Log and square root transform
     */
    PsiTPsi =
        VSMUtil.createJeigenMatrix(transform(VSMUtil.createSparseMatrixMTJFromJeigen(PsiTPsi)));
    PsiTPhi =
        VSMUtil.createJeigenMatrix(transform(VSMUtil.createSparseMatrixMTJFromJeigen(PsiTPhi)));
    PhiTPhi =
        VSMUtil.createJeigenMatrix(transform(VSMUtil.createSparseMatrixMTJFromJeigen(PhiTPhi)));
    PhiTPsi =
        VSMUtil.createJeigenMatrix(transform(VSMUtil.createSparseMatrixMTJFromJeigen(PhiTPsi)));

    /*
     * Writing the co-variance matrices in a text file to see what's going
     * on
     */
    System.out.println("****Writing the Covarinace Matrices to the file***");
    VSMUtil.writeCovarMatrixSem(PsiTPsi, "NNS");
    VSMUtil.writeCovarMatrixSem(PsiTPhi, "NNS");
    VSMUtil.writeCovarMatrixSem(PhiTPhi, "NNS");
    VSMUtil.writeCovarMatrixSem(PhiTPsi, "NNS");
    System.out.println("***Done***");

    /*
     * Done with the Psi and Phi and freeing up some space
     */
    Psi = null;
    Phi = null;
    System.gc();
    /*
     * Getting the the similarity scoressvd template object that has utility
     * methods to do preprocessing before performing CCA
     */
    SVDTemplates1 svdTC = new SVDTemplates1(null);

    /*
     * Function to compute the CCA, passing the covariance matrices to the
     * function
     */
    computeCCA2(
        MatrixFormatConversion.createSparseMatrixMTJFromJeigen(PsiTPhi),
        MatrixFormatConversion.createSparseMatrixMTJFromJeigen(PhiTPsi),
        MatrixFormatConversion.createSparseMatrixMTJFromJeigen(PhiTPhi),
        MatrixFormatConversion.createSparseMatrixMTJFromJeigen(PsiTPsi),
        svdTC,
        null,
        0,
        50,
        "NNS");

    /*
     * Writing the projection matrices out in a file to see what is in there
     */
    matrices = VSMUtil.deserializeCCAVariantsRunSem("NNS");
    VSMUtil.writeEigenDictInsideSemantic(matrices, "NNS", d);
    VSMUtil.writeEigenDictOutsideSem(matrices, "NNS", dprime);
    matrices = null;
    PsiTPhi = null;
    PhiTPhi = null;
    PsiTPsi = null;
    PhiTPsi = null;
    System.gc();

    /*
     * We would also like to serialize the count map. The count map is the
     * data structure that helps us store the .ser files in proper
     * directories with proper names. So, if in future we want to extract
     * feature vectors corresponding to more parse trees, we will start from
     * where we left in the directory structure and file name
     */

    /*
     * Getting the updated count map
     */
    // countMap = VSMSerializeFeatureVectorBean.getCountMap();
    // /*
    // * The object that will be serialized
    // */
    // VSMCountMap countMapObject = new VSMCountMap();
    // countMapObject.setCountMap(countMap);
    //
    // /*
    // * Serialize count map
    // */
    // VSMSerializeCountMap.serializeCountMap(countMapObject);
    // System.out.println("*****count map serialized****");
  }
  public static void main(String... args) throws Exception {

    System.out.println("+++Compiled New++++");

    nonTerminal = VSMUtil.getNonTerminal(args);

    LOGGER = VSMLogger.setup(FeatureVectors.class.getName() + "." + nonTerminal);

    featureDictionary =
        VSMContant.FEATURE_DICTIONARY + nonTerminal.toLowerCase() + "/dictionary.ser";

    wordDictionaryPath = VSMContant.WORD_DICT;

    LOGGER.info("Reading the Feature Dictionary Object");
    dictionaryBean = ReadSerializedDictionary.readSerializedDictionary(featureDictionary, LOGGER);

    LOGGER.info("Reading the word dictionary object");
    wordDictBean = VSMReadSerialWordDict.readSerializedDictionary(wordDictionaryPath);

    outsideFeatureDictionary = dictionaryBean.getOutsideFeatureDictionary();

    insideFeatureDictionary = dictionaryBean.getInsideFeatureDictionary();

    wordDictionary = wordDictBean.getWordDictionary();

    LOGGER.info(
        "Got the syntactic and semantic feature dictionaries, with word dictionary dimensions: "
            + wordDictionary.size());

    treeReader = VSMUtil.getTreeReader(VSMContant.SICK_TRIAL_TREES);

    LOGGER.info("GOT Training Trees File Iterator: " + treeReader);

    int treeCount = 0;

    while (treeReader.hasNext()) {

      getSynaxTree();

      if (syntaxTree != null) {

        treeCount += 1;

        syntaxTree = treeNormalizer.process(syntaxTree);

        constituentsMap = syntaxTree.getConstituents();

        Iterator<Tree<String>> nodeTrees = syntaxTree.iterator();

        while (nodeTrees.hasNext()) {

          insideTree = nodeTrees.next();

          if (!insideTree.isLeaf() && insideTree.getLabel().equalsIgnoreCase(nonTerminal)) {

            createSparseVectors();

            serializeVectorBean(treeCount);

            System.out.println("Serialized the feature vector***");
          }
        }
      }
    }

    LOGGER.info("Done Creating the Sparse Vectors For the Non Terminal: " + nonTerminal);
  }
  @Test
  public void testFeatureVectorSerialization() throws Exception {

    // VSMFeatureMatrixBean matrixBean = VSMReadSerialMatrix
    // .readFeatureMatrix("/Users/sameerkhurana10/Documents/featurematrix/dictionary.ser");
    VSMDictionaryBean matrixBean =
        VSMReadSerialMatrix.readSerializedDictionary(
            "/Users/sameerkhurana10/Documents/featurematrixtest/dictionary.ser");

    ArrayList<Alphabet> updateFilteredDcitionaryOutside = matrixBean.getOutsideFeatureDictionary();
    ArrayList<Alphabet> updatedFilteredDictionaryInside = matrixBean.getInsideFeatureDictionary();

    /*
     * Getting all the tree files
     */
    // File[] files = new File("/Users/sameerkhurana10/blipp_corpus/trees")
    // .listFiles();
    // File[] files = new File("/Users/sameerkhurana10/blipp_corpus/trees")
    // .listFiles();
    File[] files = new File("/Users/sameerkhurana10/blipp_corpus/testtrees").listFiles();

    /*
     * Getting the iterator over all the trees in the file specified by the
     * URI
     */

    VSMSerializeFeatureVectorBean serializeBean = new VSMSerializeFeatureVectorBean();

    Trees.StandardTreeNormalizer obj = new Trees.StandardTreeNormalizer();
    PTBTreeNormaliser treeNormalizer = new PTBTreeNormaliser(true);
    for (File file : files) {
      PennTreeReader treeReader = VSMUtil.getTreeReader(file.getAbsolutePath());
      /*
       * Iterating over all the trees
       */
      while (treeReader.hasNext()) {
        /*
         * Get the tree
         */
        Tree<String> syntaxTree = null;
        /*
         * Unmatched parentheses exception
         */
        try {
          syntaxTree = treeReader.next();
        } catch (RuntimeException e) {
          System.out.println("exception" + e + " ::tree  " + syntaxTree);
        }
        /*
         * Do stuff only if the syntax tree is a valid one
         */
        if (syntaxTree != null) {

          /*
           * Processed syntax tree
           */
          syntaxTree = treeNormalizer.process(syntaxTree);

          Map<Tree<String>, Constituent<String>> constituentsMap = syntaxTree.getConstituents();
          /*
           * Iterator over the nodes of the tree
           */
          Iterator<Tree<String>> nodeTrees = syntaxTree.iterator();
          /*
           * Iterating over all the nodes
           */
          // double[] psi = null;
          // double[] phi = null;
          no.uib.cipr.matrix.sparse.SparseVector psi = null;
          no.uib.cipr.matrix.sparse.SparseVector phi = null;
          Tree<String> insideTree = null;
          while (nodeTrees.hasNext()) {
            /*
             * This is the inside tree for which we want to form a
             * feature vector and store it in the map
             */
            insideTree = nodeTrees.next();
            System.out.println("****Serializing for node  " + insideTree.getLabel());
            /*
             * Setting some static variables for the particular node
             * feature
             */
            VSMUtil.setConstituentLength(constituentsMap.get(insideTree));
            VSMUtil.getNumberOfOutsideWordsLeft(insideTree, constituentsMap, syntaxTree);
            VSMUtil.getNumberOfOutsideWordsRight(insideTree, constituentsMap, syntaxTree);

            /*
             * Creating the footoroot path for outside feature
             * extraction
             */
            Stack<Tree<String>> foottoroot = new Stack<Tree<String>>();
            foottoroot =
                VSMUtil.updateFoottorootPath(foottoroot, syntaxTree, insideTree, constituentsMap);

            /*
             * Only do stuff if inside tree is not a leaf
             */
            if (!insideTree.isLeaf()) {

              /*
               * Setting the object's properties that are stored
               * in the .ser file
               */
              VSMFeatureVectorBean vectorBean = new VSMFeatureVectorBean();

              // System.out.println(":::::::" + insideTree);
              /*
               * Getting the inside feature vector phi
               */
              psi =
                  new VSMOutsideFeatureVector()
                      .getOutsideFeatureVectorPsi(
                          foottoroot, updateFilteredDcitionaryOutside, vectorBean);
              System.out.println("got the outside feature vector** " + psi);
              phi =
                  new VSMInsideFeatureVector()
                      .getInsideFeatureVectorPhi(
                          insideTree, updatedFilteredDictionaryInside, vectorBean);
              System.out.println("got the outside feature vector*** " + phi);

              /*
               * THe inside feature vector //
               */
              vectorBean.setPhi(phi);
              /*
               * // * The outside feature vector //
               */
              vectorBean.setPsi(psi);
              // /*
              // * The inside tree from which the inside feature
              // vector
              // * is extracted
              // */
              vectorBean.setInsideTree(insideTree);
              // /*
              // * The label of the node for which the inside and
              // * outside feature vectors are extracted
              // */
              vectorBean.setLabel(insideTree.getLabel());
              // /*
              // * The tree from which the inside and outside
              // feature
              // * vectors are extracted
              // */
              vectorBean.setSyntaxTree(syntaxTree);
              // /*
              // * Setting the outside constituent trees from
              // which
              // the
              // * outside feature vector is extracted
              // */
              vectorBean.setFootToRoot(foottoroot);
              // /*
              // * Read the count map from the file, if it not
              // null
              // then
              // * call the other constructor, otherwise call the
              // empty
              // * constructor
              // */
              // // String fileURI =
              // //
              // "/Users/sameerkhurana10/Documents/serialization/countMap.ser";
              // // File file = new File(fileURI);
              // // if (!file.exists()) {
              // // VSMSerializeFeatureVectorBean serializeBean =
              // new
              // // VSMSerializeFeatureVectorBean();
              // // serializeBean.serializePhiBean(vectorBean);
              // // System.out.println("****does not exist***");
              // // } else {
              // // LinkedHashMap<String, Integer> countMap =
              // // VSMReadSerialCountMap
              // // .readCountMapObj(fileURI).getCountMap();
              // // VSMSerializeFeatureVectorBean serializeBean =
              // new
              // // VSMSerializeFeatureVectorBean(
              // // countMap);
              System.out.println("****heer here****" + vectorBean.getInsideFeatureVectorDim());
              System.out.println(vectorBean.getInsideTreeFeatureList());
              serializeBean.serializeVectorBean(vectorBean);
              System.out.println("****heer here****" + vectorBean.getInsideFeatureVectorDim());
              System.out.println(vectorBean.getInsideTreeFeatureList());
              System.out.println("Serialized the feature vector***");
              // // }
              //
              // //
              // outsideFeatureMatrix.put(insideTree.getLabel(),
              // psi);
              // }
              // }
              // }
              // }
              //

            }
          }
        }
      }
    }

    // /*
    // * Serialize the count map
    // */
    LinkedHashMap<String, Integer> countMap = VSMSerializeFeatureVectorBean.getCountMap();
    // /*
    // * The object that will be serialized
    // */
    VSMCountMap countMapObject = new VSMCountMap();
    countMapObject.setCountMap(countMap);
    // /*
    // * Serialize count Map
    // */
    VSMSerializeCountMap.serializeCountMap(countMapObject);
    System.out.println("count map serialized");
    // /*
    // * Test the serialized object, read the object
    // */
    LinkedHashMap<String, Integer> countMapRetireved =
        VSMReadSerialCountMap.readCountMapObj(
                "/Users/sameerkhurana10/Documents/serialization/countMap.ser")
            .getCountMap();
    System.out.println(countMapRetireved);
  }