Java VSMWordDictionaryBean Examples

Programming Language: Java

Namespace/Package Name: VSMSerialization

Examples at hotexamples.com: 2

Java VSMWordDictionaryBean - 2 examples found. These are the top rated real world Java examples of VSMSerialization.VSMWordDictionaryBean extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

getWordDictionary(2)

Example #1

Show file

File: VSMFeatureVectorsWordNNS.java Project: sameerkhurana10/vsm

  /**
   * Extracts inside (phi) and outside (psi) word feature vectors for every non-leaf {@code NNS}
   * node found in the parsed-tree corpus, serializes one feature-vector bean per sample, stacks
   * the vectors row-wise into the {@code Phi}/{@code Psi} sample matrices, and finally computes
   * the (uncentered) covariance products and hands them to {@code computeCCA2}.
   *
   * <p>The dictionary path and the corpus directory are hard-coded below; {@code args} is not
   * read.
   *
   * @param args ignored
   * @throws Exception if reading the dictionary, reading the corpus, serialization or the CCA
   *     step fails (propagated unchanged from the helpers that are called)
   */
  public static void main(String... args) throws Exception {

    /*
     * Used to normalize the trees
     */
    PTBTreeNormaliser treeNormalizer = new PTBTreeNormaliser(true);

    /*
     * Path of the serialized word dictionary that is used to form the
     * feature vectors
     */
    String featureDictionary = "/disk/scratch/s1444025/worddictionary/worddictionary.ser";

    /*
     * The directory that holds the parse trees that are iterated over to
     * extract the feature vectors corresponding to the nodes
     */
    String parsedTreeCorpus = "/afs/inf.ed.ac.uk/group/project/vsm.restored/trees";

    /*
     * Getting the serialised dictionary bean object that contains the word
     * dictionary used to form the feature vectors
     */
    VSMWordDictionaryBean dictionaryBean =
        VSMReadSerialWordDict.readSerializedDictionary(featureDictionary);

    System.out.println("***Getting word dictionary*****");
    Alphabet wordDictionary = dictionaryBean.getWordDictionary();
    System.out.println(wordDictionary);

    /*
     * Both the inside and the outside feature spaces are indexed by the
     * same word dictionary, hence both dimensions are equal
     */
    dprime = wordDictionary.size();
    d = wordDictionary.size();

    /*
     * The parsed tree corpus from where the feature vectors need to be
     * extracted. File.listFiles returns null when the path is not a
     * directory or cannot be read, so fail early with a clear message
     * instead of passing null on.
     */
    File[] files =
        new File(parsedTreeCorpus)
            .listFiles(
                new FileFilter() {

                  @Override
                  public boolean accept(File file) {
                    return !file.isHidden();
                  }
                });
    if (files == null) {
      throw new IllegalStateException(
          "Cannot list parsed tree corpus directory: " + parsedTreeCorpus);
    }

    ArrayList<String> filePaths = VSMUtil.getFilePaths(files);

    /*
     * The object that is used to serialize the feature vector bean. Each
     * feature vector bean holds the inside and outside feature vectors for
     * one particular node. (Resuming from a previously serialized count
     * map is currently disabled, a fresh serializer is always used.)
     */
    VSMSerializeFeatureVectorBeanWord serializeBean = new VSMSerializeFeatureVectorBeanWord();

    /*
     * Sample matrices: one row per NNS sample, one column per dictionary
     * entry. They are allocated with 300000 rows.
     */
    SparseMatrixLil Phi = new SparseMatrixLil(300000, d);
    SparseMatrixLil Psi = new SparseMatrixLil(300000, dprime);

    /*
     * Number of samples stored so far; also the row index of the next
     * sample in Phi and Psi
     */
    int count = 0;

    mainloop:
    for (String filePath : filePaths) {

      /*
       * Getting an iterator over the trees in the file
       */
      PennTreeReader treeReader = VSMUtil.getTreeReader(filePath);

      while (treeReader.hasNext()) {

        /*
         * The reader may throw on malformed trees (e.g. unmatched
         * parentheses); such trees are reported and skipped. The tree
         * itself is not available at this point, so the file is reported.
         */
        Tree<String> syntaxTree;
        try {
          syntaxTree = treeReader.next();
        } catch (RuntimeException e) {
          System.out.println("exception" + e + " ::file  " + filePath);
          continue;
        }

        /*
         * Do stuff only if the syntax tree is a valid one
         */
        if (syntaxTree == null) {
          continue;
        }

        /*
         * Process the syntax tree to remove the top bracket
         */
        syntaxTree = treeNormalizer.process(syntaxTree);

        /*
         * Iterating over all the nodes in this particular syntax tree
         */
        Iterator<Tree<String>> nodeTrees = syntaxTree.iterator();
        while (nodeTrees.hasNext()) {

          /*
           * This is the inside tree for which we want to form a feature
           * vector; only non-leaf NNS nodes are sampled
           */
          Tree<String> insideTree = nodeTrees.next();
          if (insideTree.isLeaf() || !insideTree.getLabel().equalsIgnoreCase("NNS")) {
            continue;
          }

          /*
           * The bean whose properties are stored in the .ser file
           */
          VSMWordFeatureVectorBean vectorBean = new VSMWordFeatureVectorBean();

          System.out.println(
              "****Extracting inside and outside feature vectors for node****  "
                  + insideTree.getLabel());

          /*
           * Getting the outside and inside feature vectors corresponding
           * to the particular node (outside first, as both calls receive
           * the same bean)
           */
          no.uib.cipr.matrix.sparse.SparseVector psi =
              new VSMOutsideFeatureVectorWords()
                  .getOutsideFeatureVectorPsi(syntaxTree, insideTree, wordDictionary, vectorBean);

          no.uib.cipr.matrix.sparse.SparseVector phi =
              new VSMInsideFeatureVectorWords()
                  .getInsideFeatureVectorPhi(insideTree, wordDictionary, vectorBean);

          System.out.println("got the sparse vectors*** ");

          /*
           * A sample is only usable when both vectors could be formed
           */
          if (phi == null || psi == null) {
            continue;
          }

          System.out.println(count);

          System.out.println("****Filling in the matrices***");

          /*
           * Putting the inside feature vector into row `count` of the
           * inside feature matrix
           */
          int[] indicesPhi = phi.getIndex();
          double[] valuesPhi = phi.getData();
          for (int i = 0; i < indicesPhi.length; i++) {
            Phi.append(count, indicesPhi[i], valuesPhi[i]);
          }

          /*
           * Putting the outside feature vector into row `count` of the
           * outside feature matrix
           */
          int[] indicesPsi = psi.getIndex();
          double[] valuesPsi = psi.getData();
          for (int j = 0; j < indicesPsi.length; j++) {
            Psi.append(count, indicesPsi[j], valuesPsi[j]);
          }

          /*
           * Storing the feature vectors in the bean which will be
           * serialized for future use. The vectors must still be
           * referenced here: clearing them before this point would
           * serialize a bean without any feature vectors.
           */
          vectorBean.setPhi(phi);
          vectorBean.setPsi(psi);
          vectorBean.setInsideTree(insideTree);
          vectorBean.setLabel(insideTree.getLabel());
          vectorBean.setSyntaxTree(syntaxTree);

          /*
           * Serialize the feature vector bean corresponding to the
           * particular node
           */
          serializeBean.serializeWordVectorBean(vectorBean);
          System.out.println("***Serialized the feature vector***");

          count++;

          /*
           * Stop sampling one row short of the allocated capacity
           */
          if (count == (Psi.rows - 1)) {
            break mainloop;
          }
        }
      }
    }

    System.out.println("*****Done with matrices formation****");

    /*
     * Just calculating the co-variance, assuming that the data is centered
     * and normalized
     */
    System.out.println("***Calculating Covariances****");
    SparseMatrixLil PsiTPsi = Psi.t().mmul(Psi); // d' \times d'
    SparseMatrixLil PsiTPhi = Psi.t().mmul(Phi); // d' \times d
    SparseMatrixLil PhiTPhi = Phi.t().mmul(Phi); // d \times d
    SparseMatrixLil PhiTPsi = Phi.t().mmul(Psi); // d \times d'
    System.out.println("****Done with it***");

    /*
     * Apply `transform` to every covariance matrix (described by the
     * original author as a log and square root transform)
     */
    PsiTPsi =
        VSMUtil.createJeigenMatrix(transform(VSMUtil.createSparseMatrixMTJFromJeigen(PsiTPsi)));
    PsiTPhi =
        VSMUtil.createJeigenMatrix(transform(VSMUtil.createSparseMatrixMTJFromJeigen(PsiTPhi)));
    PhiTPhi =
        VSMUtil.createJeigenMatrix(transform(VSMUtil.createSparseMatrixMTJFromJeigen(PhiTPhi)));
    PhiTPsi =
        VSMUtil.createJeigenMatrix(transform(VSMUtil.createSparseMatrixMTJFromJeigen(PhiTPsi)));

    /*
     * Writing the co-variance matrices in a text file to see what's going
     * on
     */
    System.out.println("****Writing the Covarinace Matrices to the file***");
    VSMUtil.writeCovarMatrixSem(PsiTPsi, "NNS");
    VSMUtil.writeCovarMatrixSem(PsiTPhi, "NNS");
    VSMUtil.writeCovarMatrixSem(PhiTPhi, "NNS");
    VSMUtil.writeCovarMatrixSem(PhiTPsi, "NNS");
    System.out.println("***Done***");

    /*
     * Done with Psi and Phi; drop the references before the CCA step so
     * that the sample matrices can be reclaimed while it runs
     */
    Psi = null;
    Phi = null;
    System.gc();

    /*
     * Getting the svd template object that is passed on to the CCA
     * computation
     */
    SVDTemplates1 svdTC = new SVDTemplates1(null);

    /*
     * Function to compute the CCA, passing the covariance matrices to the
     * function
     */
    computeCCA2(
        MatrixFormatConversion.createSparseMatrixMTJFromJeigen(PsiTPhi),
        MatrixFormatConversion.createSparseMatrixMTJFromJeigen(PhiTPsi),
        MatrixFormatConversion.createSparseMatrixMTJFromJeigen(PhiTPhi),
        MatrixFormatConversion.createSparseMatrixMTJFromJeigen(PsiTPsi),
        svdTC,
        null,
        0,
        50,
        "NNS");

    /*
     * Writing the projection matrices out in a file to see what is in there
     */
    Object[] matrices = VSMUtil.deserializeCCAVariantsRunSem("NNS");
    VSMUtil.writeEigenDictInsideSemantic(matrices, "NNS", d);
    VSMUtil.writeEigenDictOutsideSem(matrices, "NNS", dprime);

    /*
     * NOTE(review): serializing the count map (so that a later run can
     * continue with the directory structure and file names where this run
     * stopped) is currently disabled.
     */
  }

Example #2

Show file

File: SentenceFeatureVectorsTrial.java Project: sameerkhurana10/vsm

  /**
   * Reads the feature dictionary for the requested non-terminal and the word dictionary, then
   * walks every tree delivered by the tree reader and, for each non-leaf node whose label matches
   * the non-terminal (case-insensitively), creates and serializes its sparse vectors.
   *
   * <p>All intermediate state (dictionaries, current tree, current node, logger) lives in static
   * members of the enclosing class, which the helper methods called here operate on.
   *
   * @param args forwarded to {@code VSMUtil.getNonTerminal} to determine the non-terminal
   * @throws Exception propagated unchanged from the readers and helpers that are called
   */
  public static void main(String... args) throws Exception {

    System.out.println("+++Compiled New++++");

    nonTerminal = VSMUtil.getNonTerminal(args);

    // One logger per non-terminal
    String loggerName = FeatureVectors.class.getName() + "." + nonTerminal;
    LOGGER = VSMLogger.setup(loggerName);

    featureDictionary =
        VSMContant.FEATURE_DICTIONARY + nonTerminal.toLowerCase() + "/dictionary.ser";
    wordDictionaryPath = VSMContant.WORD_DICT;

    // Load both serialized dictionary beans
    LOGGER.info("Reading the Feature Dictionary Object");
    dictionaryBean = ReadSerializedDictionary.readSerializedDictionary(featureDictionary, LOGGER);
    LOGGER.info("Reading the word dictionary object");
    wordDictBean = VSMReadSerialWordDict.readSerializedDictionary(wordDictionaryPath);

    // Unpack the dictionaries the helper methods work with
    outsideFeatureDictionary = dictionaryBean.getOutsideFeatureDictionary();
    insideFeatureDictionary = dictionaryBean.getInsideFeatureDictionary();
    wordDictionary = wordDictBean.getWordDictionary();

    LOGGER.info(
        "Got the syntactic and semantic feature dictionaries, with word dictionary dimensions: "
            + wordDictionary.size());

    treeReader = VSMUtil.getTreeReader(VSMContant.SICK_TRIAL_TREES);
    LOGGER.info("GOT Training Trees File Iterator: " + treeReader);

    // Counts the valid trees seen so far; passed on when a vector bean is serialized
    int treeCount = 0;

    while (treeReader.hasNext()) {

      // Fetches the next tree into the static syntaxTree member
      getSynaxTree();
      if (syntaxTree == null) {
        continue;
      }

      treeCount++;

      syntaxTree = treeNormalizer.process(syntaxTree);
      constituentsMap = syntaxTree.getConstituents();

      for (Iterator<Tree<String>> nodes = syntaxTree.iterator(); nodes.hasNext(); ) {

        insideTree = nodes.next();

        // Skip leaves and nodes that carry a different label
        if (insideTree.isLeaf() || !insideTree.getLabel().equalsIgnoreCase(nonTerminal)) {
          continue;
        }

        createSparseVectors();
        serializeVectorBean(treeCount);
        System.out.println("Serialized the feature vector***");
      }
    }

    LOGGER.info("Done Creating the Sparse Vectors For the Non Terminal: " + nonTerminal);
  }