Пример #1
0
 /**
  * Finds the largest per-term gene count among the GO terms held in {@code subGoTerms}.
  *
  * @return the highest value reported by {@code countNumberOfGenesForGOTerm} over all terms, or
  *     {@code Double.NEGATIVE_INFINITY} when there is no term to inspect
  */
 private double getMaxOntology() {
   double largest = Double.NEGATIVE_INFINITY;
   for (GOTerm candidate : this.subGoTerms) {
     double geneCount = this.annotations.countNumberOfGenesForGOTerm(candidate.getGOid());
     if (geneCount > largest) {
       largest = geneCount;
     }
   }
   return largest;
 }
Пример #2
0
 // **************************************************************************
 // **************************************************************************
 // UTILITY FUNCTIONS
 // **************************************************************************
 // **************************************************************************
 /**
  * Returns the number of genes annotated to a GO term, memoised per numeric term id.
  *
  * @param currentGoTerm the term whose annotation count is wanted
  * @return the value reported by {@code countNumberOfGenesForGOTerm}; after the first request it
  *     is served from the {@code numAnnotations} cache
  */
 private int getNumberOfAnnotations(GOTerm currentGoTerm) {
   int termId = currentGoTerm.getNumericId();
   if (!this.numAnnotations.containsKey(termId)) {
     // First request for this term: ask the annotation store and remember the answer.
     int counted = this.annotations.countNumberOfGenesForGOTerm(currentGoTerm.getGOid());
     this.numAnnotations.put(termId, counted);
     return counted;
   }
   return this.numAnnotations.get(termId);
 }
Пример #3
0
  /**
   * Records in {@code leafs} the numeric id of every term in {@code subGoTerms} that
   * {@code isLeafISM} reports as a leaf for each of the configured relations.
   */
  private void setAllLeafs() {
    nextTerm:
    for (GOTerm candidate : this.subGoTerms) {
      for (String relation : this.relations) {
        if (!this.isLeafISM(candidate, relation)) {
          // Failing the leaf test for a single relation disqualifies the term.
          continue nextTerm;
        }
      }
      this.leafs.add(candidate.getNumericId());
    }
  }
Пример #4
0
  /**
   * Builds the transition-probability matrix P over the terms in {@code subGoTerms}.
   *
   * <p>The matrix is sparse, so only non-zero entries are written. Leaf terms get a 1 on their own
   * diagonal position; afterwards every term is handed to
   * {@code setTransitionProbabilitiesNonLeaf} together with its N and N* annotation counts.
   *
   * @return the populated square matrix, sized by {@code getNumGoTerms()}
   */
  private Matrix initialiseTransitionProbabilities() {
    Matrix transitions = new SparseMatrix(this.getNumGoTerms(), this.getNumGoTerms());

    // Pass 1: mark each leaf with a 1 at (index, index).
    for (GOTerm term : this.subGoTerms) {
      int termId = term.getNumericId();
      if (!this.leafs.contains(termId)) {
        continue;
      }
      int diagonal = this.goTermIndex.get(termId);
      transitions.set(diagonal, diagonal, 1.0f);
    }

    // Pass 2: fill in the parent-to-child entries from the annotation counts.
    for (GOTerm term : this.subGoTerms) {
      int annotated = this.getNumberOfAnnotations(term);
      int annotatedStar = this.getNumberOfAnnotationsStar(term);
      this.setTransitionProbabilitiesNonLeaf(term, annotated, annotatedStar, transitions);
    }
    return transitions;
  }
Пример #5
0
  /**
   * Returns N* for a term: its own annotation count minus the number of distinct proteins
   * annotated to any child returned by {@code getChildrenForRelation} (all relations pooled).
   * Results are memoised per numeric term id in {@code numAnnotationsStar}.
   *
   * <p>NOTE(review): the subtraction stands in for a real set difference, which presumably relies
   * on every protein of a child also being counted on the parent; confirm against the
   * annotation data.
   *
   * @param currentGoTerm the term to evaluate
   * @return the term's annotation count minus the size of the children's pooled protein set
   * @throws IllegalArgumentException declared by the signature; not thrown directly in this body
   */
  private int getNumberOfAnnotationsStar(GOTerm currentGoTerm) throws IllegalArgumentException {
    int termId = currentGoTerm.getNumericId();
    if (this.numAnnotationsStar.containsKey(termId)) {
      return this.numAnnotationsStar.get(termId);
    }

    // Not cached yet: start from the term's own count.
    int ownCount = this.getNumberOfAnnotations(currentGoTerm);

    // Pool the children of every relation; the set removes duplicates.
    Set<GOTerm> offspring = new HashSet<GOTerm>();
    for (String relation : this.relations) {
      offspring.addAll(currentGoTerm.getChildrenForRelation(relation));
    }

    // Distinct proteins annotated to at least one child.
    Set<String> offspringProteins = new HashSet<String>();
    for (GOTerm child : offspring) {
      offspringProteins.addAll(this.annotations.getProteinsForGOTerm(child.getGOid()));
    }

    int exclusiveCount = ownCount - offspringProteins.size();
    this.numAnnotationsStar.put(termId, exclusiveCount);
    return exclusiveCount;
  }
Пример #6
0
  /* We write this to conform to the definition of a leaf in ISM.
   * Basically, we need to find out whether the children of the current node belong
   * to the "ontology". By "ontology" we mean the subset of nodes with annotations.
   * The easiest way of doing this is: fetch the children, iterate through them
   * and find the number of annotations. If none of the children have annotations,
   * then it is a leaf; otherwise, it is not.
   */
  private boolean isLeafISM(GOTerm currentTerm, String relation) {
    // A term with no children under this relation never enters the loop and is
    // therefore reported as a leaf.
    for (GOTerm child : currentTerm.getChildrenForRelation(relation)) {
      if (this.getNumberOfAnnotations(child) > 0) {
        // At least one child carries annotations, so the term is not a leaf.
        return false;
      }
    }
    return true;
  }
Пример #7
0
  /**
   * Builds matrix A, with one row per GO term and as many columns as {@code RWC}.
   *
   * <p>For every term whose N* is positive, each protein annotated to the term but to none of its
   * children receives the weight {@code 1 / k}, where {@code k} is the number of GO terms listed
   * for that protein by {@code getGOTermScoresForProteinId}. All other entries are left unset.
   *
   * @return the populated matrix
   * @throws IOException declared by the signature; presumably raised by a collaborator, since
   *     nothing in this body throws it explicitly
   */
  private Matrix getMatrixA() throws IOException {
    Matrix weights = new Matrix(this.getNumGoTerms(), this.RWC.getColumnDimension());

    for (GOTerm term : this.subGoTerms) {
      // A non-positive N* means the term is skipped.
      if (this.getNumberOfAnnotationsStar(term) <= 0) {
        continue;
      }

      // Start from every protein annotated to the term itself.
      Set<String> ownOnly =
          new HashSet<String>(this.annotations.getProteinsForGOTerm(term.getGOid()));

      // Pool the children across all relations.
      Set<GOTerm> offspring = new HashSet<GOTerm>();
      for (String relation : this.relations) {
        offspring.addAll(term.getChildrenForRelation(relation));
      }

      // Collect everything annotated to at least one child.
      Set<String> offspringProteins = new HashSet<String>();
      for (GOTerm child : offspring) {
        offspringProteins.addAll(this.annotations.getProteinsForGOTerm(child.getGOid()));
      }

      // What survives the removal is annotated to the term but to none of its children.
      ownOnly.removeAll(offspringProteins);

      for (String protein : ownOnly) {
        // The paper's notion of "directly" annotated is ambiguous; here the weight is spread
        // over every GO term recorded for the protein.
        int termsForProtein =
            this.annotations.getGOTermScoresForProteinId(protein).keySet().size();
        int row = this.goTermIndex.get(term.getNumericId());
        int column = this.proteinIndices.get(protein);
        weights.set(row, column, 1.0f / termsForProtein);
      }
    }

    this.logger.showMessage(
        "Matrix A computed. % of sparseness = " + weights.getSparsenessPercentage());
    return weights;
  }
Пример #8
0
  /**
   * Prepares an ISM computation: stores the inputs, builds the lookup tables between GO term ids
   * and matrix positions, detects leaf terms, optionally indexes proteins, and allocates
   * {@code RWC}.
   *
   * <p>The statement order matters: later steps read fields assigned by earlier ones.
   *
   * @param ISM_currentGoTerms the GO terms this instance works on
   * @param HSM matrix kept as {@code HSM}; its dimensions also size {@code RWC}
   * @param ISM_currentRelations relation names used when collecting children of a term
   * @param ISM_Annotations annotation store queried for gene and protein data
   * @param termwise {@code true} for term-wise similarity; when {@code false} the proteins of all
   *     terms are indexed for the gene-wise variant
   * @param wJaccard selects weighted versus unweighted Jaccard; stored as
   *     {@code weightedJaccard}
   * @param logger logger kept for later messages
   */
  public ISM_validImplementation(
      GOTerm[] ISM_currentGoTerms,
      Matrix HSM,
      String[] ISM_currentRelations,
      Assignment ISM_Annotations,
      boolean termwise,
      boolean wJaccard,
      TinyLogger logger) {
    // Annotation source plus the two memo tables used by the annotation counters.
    this.annotations = ISM_Annotations;
    this.numAnnotations = new HashMap<Integer, Integer>();
    this.numAnnotationsStar = new HashMap<Integer, Integer>();

    // Terms and relations; the maximum is computed from the terms and annotations set above.
    this.subGoTerms = ISM_currentGoTerms;
    this.relations = ISM_currentRelations;
    this.maxNumberOfAnnotations = this.getMaxOntology();

    this.HSM = HSM;

    // Two-way mapping between numeric GO ids and matrix positions.
    this.goTermIndex = new HashMap<Integer, Integer>();
    this.gotermIdByIndex = new HashMap<Integer, Integer>();
    for (int position = 0; position < this.getNumGoTerms(); position++) {
      int numericId = this.subGoTerms[position].getNumericId();
      this.goTermIndex.put(numericId, position);
      this.gotermIdByIndex.put(position, numericId);
    }
    // Requested only after the term indices above have been filled.
    this.allIndices = this.getAllIndices();

    // Leaf detection, then the leaf indices derived from it.
    this.leafs = new HashSet<Integer>();
    this.setAllLeafs();
    this.leafIndices = this.getLeafIndices();

    // Mode switches: term-wise vs. gene-wise, weighted vs. unweighted Jaccard.
    this.termwise = termwise;
    this.weightedJaccard = wJaccard;

    // Protein positions are only needed for the gene-wise similarity.
    if (!this.termwise) {
      this.proteinIndices = new HashMap<String, Integer>();
      for (GOTerm term : this.subGoTerms) {
        for (String protein : this.annotations.getProteinsForGOTerm(term.getGOid())) {
          if (this.proteinIndices.containsKey(protein)) {
            continue; // already numbered
          }
          // The next free position equals the current size of the table.
          this.proteinIndices.put(protein, this.proteinIndices.size());
        }
      }
    }

    // RWC mirrors the shape of HSM. An earlier, now disabled, variant sized it by the number of
    // GO terms (term-wise) or by the number of indexed proteins (gene-wise).
    this.RWC = new Matrix(HSM.getRowDimension(), HSM.getColumnDimension());

    // Convergence limit.
    this.epsilon = 0.001;

    this.logger = logger;
  }
Пример #9
0
  /**
   * Writes the transition entries from a term to each of its annotated children into {@code P}.
   *
   * <p>Formula: {@code P(v,c) = (1 - N_v* / N_v) * N_c / sum(N_u)}, where the sum runs over all
   * children {@code u} of {@code v}. Children are pooled over every relation, so the walk does not
   * distinguish between relation types. The value is stored at row = child index, column = parent
   * index.
   *
   * @param currentGoTerm the parent term v
   * @param N_v annotation count of the parent
   * @param N_vStar N* of the parent
   * @param P matrix receiving the entries; the visible caller passes a sparse matrix, so zero
   *     entries are simply not written
   */
  private void setTransitionProbabilitiesNonLeaf(
      GOTerm currentGoTerm, int N_v, int N_vStar, Matrix P) {
    if (N_v == N_vStar) {
      // The leading factor (1 - N_v* / N_v) is zero, so every entry would be zero; nothing has
      // to be stored.
      return;
    }

    float leadingFactor = 1.0f - ((float) N_vStar / (float) N_v);

    // All children reachable through any relation, without duplicates.
    Set<GOTerm> offspring = new HashSet<GOTerm>();
    for (String relation : this.relations) {
      offspring.addAll(currentGoTerm.getChildrenForRelation(relation));
    }

    // Denominator: total annotation count over the children, shared by every entry below.
    int annotationsInChildren = 0;
    for (GOTerm child : offspring) {
      annotationsInChildren += this.getNumberOfAnnotations(child);
    }
    // If the total is zero the reciprocal is infinite, but it is then never used, assuming
    // counts are non-negative: no child can pass the "> 0" test below.
    final float inverseChildTotal = 1.0f / annotationsInChildren;

    // Visit the children in ascending numeric-id order.
    Map<Integer, GOTerm> byNumericId = new TreeMap<Integer, GOTerm>();
    for (GOTerm child : offspring) {
      byNumericId.put(child.getNumericId(), child);
    }

    for (GOTerm child : byNumericId.values()) {
      int childCount = this.getNumberOfAnnotations(child);
      if (childCount <= 0) {
        // Children without annotations would contribute a zero entry, so they are skipped.
        continue;
      }
      float probability = leadingFactor * (childCount * inverseChildTotal);
      int parentColumn = this.goTermIndex.get(currentGoTerm.getNumericId());
      int childRow = this.goTermIndex.get(child.getNumericId());
      P.set(childRow, parentColumn, probability);
    }
  }