/**
   * Calculates the centroid pivot of a node based on the list of points that it contains (tbe two
   * lists of its children are provided).
   *
   * @param list1 The point index list of first child.
   * @param list2 The point index list of second child.
   * @param insts The insts object on which the tree is being built (for header information).
   * @return The centroid pivot of the node.
   */
  public Instance calcPivot(MyIdxList list1, MyIdxList list2, Instances insts) {
    int classIdx = m_Instances.classIndex();
    double[] attrVals = new double[insts.numAttributes()];

    Instance temp;
    for (int i = 0; i < list1.length(); i++) {
      temp = insts.instance(((ListNode) list1.get(i)).idx);
      for (int k = 0; k < temp.numValues(); k++) {
        if (temp.index(k) == classIdx) continue;
        attrVals[k] += temp.valueSparse(k);
      }
    }
    for (int j = 0; j < list2.length(); j++) {
      temp = insts.instance(((ListNode) list2.get(j)).idx);
      for (int k = 0; k < temp.numValues(); k++) {
        if (temp.index(k) == classIdx) continue;
        attrVals[k] += temp.valueSparse(k);
      }
    }
    for (int j = 0, numInsts = list1.length() + list2.length(); j < attrVals.length; j++) {
      attrVals[j] /= numInsts;
    }
    temp = new DenseInstance(1.0, attrVals);
    return temp;
  }
예제 #2
0
  /**
   * Compute the MI between instances and attributes
   *
   * @param m the term-document matrix
   * @param input object that describes the statistics about the training data
   */
  private void MI(Matrix m, Input input) {
    int minDimSize =
        m.getColumnDimension() < m.getRowDimension() ? m.getColumnDimension() : m.getRowDimension();
    if (minDimSize < 2) {
      System.err.println("Warning : This is not a JOINT distribution");
      input.Hx = Entropy(m);
      input.Hy = 0;
      input.Ixy = 0;
      return;
    }

    input.Hx = Entropy(input.Px);
    input.Hy = Entropy(input.Py);

    double entropy = input.Hx + input.Hy;
    for (int i = 0; i < m_numInstances; i++) {
      Instance inst = m_data.instance(i);
      for (int v = 0; v < inst.numValues(); v++) {
        double tmp = m.get(inst.index(v), i);
        if (tmp <= 0) {
          continue;
        }
        entropy += tmp * Math.log(tmp);
      }
    }
    input.Ixy = entropy;
    if (m_verbose) {
      System.out.println("Ixy = " + input.Ixy);
    }
  }
예제 #3
0
  /**
   * @param inst
   * @return
   * @throws Exception
   */
  public double SVMOutput(Instance inst) throws Exception {

    double result = -m_b;
    // Is the machine linear?
    if (m_weights != null) {
      // Is weight vector stored in sparse format?
      for (int i = 0; i < inst.numValues(); i++) {
        if (inst.index(i) != m_classIndex) {
          result += m_weights[inst.index(i)] * inst.valueSparse(i);
        }
      }
    } else {
      for (int i = m_supportVectors.getNext(-1); i != -1; i = m_supportVectors.getNext(i)) {
        result += (m_alpha[i] - m_alphaStar[i]) * m_kernel.eval(-1, i, inst);
      }
    }
    return result;
  }
예제 #4
0
 /**
  * Transpose the document-term matrix to term-document matrix
  *
  * @param data instances with document-term info
  * @return a term-document matrix transposed from the input dataset
  */
 private Matrix getTransposedMatrix(Instances data) {
   double[][] temp = new double[data.numAttributes()][data.numInstances()];
   for (int i = 0; i < data.numInstances(); i++) {
     Instance inst = data.instance(i);
     for (int v = 0; v < inst.numValues(); v++) {
       temp[inst.index(v)][i] = inst.valueSparse(v);
     }
   }
   Matrix My_x = new Matrix(temp);
   return My_x;
 }
예제 #5
0
  /**
   * Calculates the distance between two instances
   *
   * @param test the first instance
   * @param train the second instance
   * @return the distance between the two given instances, between 0 and 1
   */
  protected double distance(Instance first, Instance second) {

    double distance = 0;
    int firstI, secondI;

    for (int p1 = 0, p2 = 0; p1 < first.numValues() || p2 < second.numValues(); ) {
      if (p1 >= first.numValues()) {
        firstI = m_instances.numAttributes();
      } else {
        firstI = first.index(p1);
      }
      if (p2 >= second.numValues()) {
        secondI = m_instances.numAttributes();
      } else {
        secondI = second.index(p2);
      }
      if (firstI == m_instances.classIndex()) {
        p1++;
        continue;
      }
      if (secondI == m_instances.classIndex()) {
        p2++;
        continue;
      }
      double diff;
      if (firstI == secondI) {
        diff = difference(firstI, first.valueSparse(p1), second.valueSparse(p2));
        p1++;
        p2++;
      } else if (firstI > secondI) {
        diff = difference(secondI, 0, second.valueSparse(p2));
        p2++;
      } else {
        diff = difference(firstI, first.valueSparse(p1), 0);
        p1++;
      }
      distance += diff * diff;
    }

    return Math.sqrt(distance / m_instances.numAttributes());
  }
예제 #6
0
  /**
   * log(N!) + (for all the words)(log(Pi^ni) - log(ni!))
   *
   * <p>where N is the total number of words Pi is the probability of obtaining word i ni is the
   * number of times the word at index i occurs in the document
   *
   * @param inst The instance to be classified
   * @param classIndex The index of the class we are calculating the probability with respect to
   * @return The log of the probability of the document occuring given the class
   */
  private double probOfDocGivenClass(Instance inst, int classIndex) {
    double answer = 0;
    // double totalWords = 0; //no need as we are not calculating the factorial at all.

    double freqOfWordInDoc; // should be double
    for (int i = 0; i < inst.numValues(); i++)
      if (inst.index(i) != inst.classIndex()) {
        freqOfWordInDoc = inst.valueSparse(i);
        // totalWords += freqOfWordInDoc;
        answer +=
            (freqOfWordInDoc
                * m_probOfWordGivenClass[classIndex][
                    inst.index(i)]); // - lnFactorial(freqOfWordInDoc));
      }

    // answer += lnFactorial(totalWords);//The factorial terms don't make
    // any difference to the classifier's
    // accuracy, so not needed.

    return answer;
  }
예제 #7
0
 /**
  * Compute the JS divergence between an instance and a cluster, used for training data
  *
  * @param instIdx index of the instance
  * @param input statistics of the input data
  * @param T the whole partition
  * @param t index of the cluster
  * @param pi1
  * @param pi2
  * @return the JS divergence
  */
 private double JS(int instIdx, Input input, Partition T, int t, double pi1, double pi2) {
   if (Math.min(pi1, pi2) <= 0) {
     System.out.format(
         "Warning: zero or negative weights in JS calculation! (pi1 %s, pi2 %s)\n", pi1, pi2);
     return 0;
   }
   Instance inst = m_data.instance(instIdx);
   double kl1 = 0.0, kl2 = 0.0, tmp = 0.0;
   for (int i = 0; i < inst.numValues(); i++) {
     tmp = input.Py_x.get(inst.index(i), instIdx);
     if (tmp != 0) {
       kl1 += tmp * Math.log(tmp / (tmp * pi1 + pi2 * T.Py_t.get(inst.index(i), t)));
     }
   }
   for (int i = 0; i < m_numAttributes; i++) {
     if ((tmp = T.Py_t.get(i, t)) != 0) {
       kl2 += tmp * Math.log(tmp / (input.Py_x.get(i, instIdx) * pi1 + pi2 * tmp));
     }
   }
   return pi1 * kl1 + pi2 * kl2;
 }
예제 #8
0
파일: ItemSet.java 프로젝트: ch93/myWEKA
  /**
   * Checks if an instance contains an item set.
   *
   * @param instance the instance to be tested
   * @return true if the given instance contains this item set
   */
  public boolean containedByTreatZeroAsMissing(Instance instance) {

    if (instance instanceof weka.core.SparseInstance) {
      int numInstVals = instance.numValues();
      int numItemSetVals = m_items.length;

      for (int p1 = 0, p2 = 0; p1 < numInstVals || p2 < numItemSetVals; ) {
        int instIndex = Integer.MAX_VALUE;
        if (p1 < numInstVals) {
          instIndex = instance.index(p1);
        }
        int itemIndex = p2;

        if (m_items[itemIndex] > -1) {
          if (itemIndex != instIndex) {
            return false;
          } else {
            if (instance.isMissingSparse(p1)) {
              return false;
            }
            if (m_items[itemIndex] != (int) instance.valueSparse(p1)) {
              return false;
            }
          }

          p1++;
          p2++;
        } else {
          if (itemIndex < instIndex) {
            p2++;
          } else if (itemIndex == instIndex) {
            p2++;
            p1++;
          }
        }
      }
    } else {
      for (int i = 0; i < instance.numAttributes(); i++) {
        if (m_items[i] > -1) {
          if (instance.isMissing(i) || (int) instance.value(i) == 0) {
            return false;
          }
          if (m_items[i] != (int) instance.value(i)) {
            return false;
          }
        }
      }
    }

    return true;
  }
예제 #9
0
 /**
  * Compute the JS divergence between an instance and a cluster, used for test data
  *
  * @param inst instance to be clustered
  * @param t index of the cluster
  * @param pi1
  * @param pi2
  * @return the JS divergence
  */
 private double JS(Instance inst, int t, double pi1, double pi2) {
   if (Math.min(pi1, pi2) <= 0) {
     System.out.format(
         "Warning: zero or negative weights in JS calculation! (pi1 %s, pi2 %s)\n", pi1, pi2);
     return 0;
   }
   double sum = Utils.sum(inst.toDoubleArray());
   double kl1 = 0.0, kl2 = 0.0, tmp = 0.0;
   for (int i = 0; i < inst.numValues(); i++) {
     tmp = inst.valueSparse(i) / sum;
     if (tmp != 0) {
       kl1 += tmp * Math.log(tmp / (tmp * pi1 + pi2 * bestT.Py_t.get(inst.index(i), t)));
     }
   }
   for (int i = 0; i < m_numAttributes; i++) {
     if ((tmp = bestT.Py_t.get(i, t)) != 0) {
       kl2 += tmp * Math.log(tmp / (inst.value(i) * pi1 / sum + pi2 * tmp));
     }
   }
   return pi1 * kl1 + pi2 * kl2;
 }
예제 #10
0
  /**
   * Generates the classifier.
   *
   * @param instances set of instances serving as training data
   * @throws Exception if the classifier has not been generated successfully
   */
  public void buildClassifier(Instances instances) throws Exception {
    // can classifier handle the data?
    getCapabilities().testWithFail(instances);

    // remove instances with missing class
    instances = new Instances(instances);
    instances.deleteWithMissingClass();

    m_headerInfo = new Instances(instances, 0);
    m_numClasses = instances.numClasses();
    m_numAttributes = instances.numAttributes();
    m_probOfWordGivenClass = new double[m_numClasses][];

    /*
      initialising the matrix of word counts
      NOTE: Laplace estimator introduced in case a word that does not appear for a class in the
      training set does so for the test set
    */
    for (int c = 0; c < m_numClasses; c++) {
      m_probOfWordGivenClass[c] = new double[m_numAttributes];
      for (int att = 0; att < m_numAttributes; att++) {
        m_probOfWordGivenClass[c][att] = 1;
      }
    }

    // enumerate through the instances
    Instance instance;
    int classIndex;
    double numOccurences;
    double[] docsPerClass = new double[m_numClasses];
    double[] wordsPerClass = new double[m_numClasses];

    java.util.Enumeration enumInsts = instances.enumerateInstances();
    while (enumInsts.hasMoreElements()) {
      instance = (Instance) enumInsts.nextElement();
      classIndex = (int) instance.value(instance.classIndex());
      docsPerClass[classIndex] += instance.weight();

      for (int a = 0; a < instance.numValues(); a++)
        if (instance.index(a) != instance.classIndex()) {
          if (!instance.isMissing(a)) {
            numOccurences = instance.valueSparse(a) * instance.weight();
            if (numOccurences < 0)
              throw new Exception("Numeric attribute values must all be greater or equal to zero.");
            wordsPerClass[classIndex] += numOccurences;
            m_probOfWordGivenClass[classIndex][instance.index(a)] += numOccurences;
          }
        }
    }

    /*
      normalising probOfWordGivenClass values
      and saving each value as the log of each value
    */
    for (int c = 0; c < m_numClasses; c++)
      for (int v = 0; v < m_numAttributes; v++)
        m_probOfWordGivenClass[c][v] =
            Math.log(m_probOfWordGivenClass[c][v] / (wordsPerClass[c] + m_numAttributes - 1));

    /*
      calculating Pr(H)
      NOTE: Laplace estimator introduced in case a class does not get mentioned in the set of
      training instances
    */
    final double numDocs = instances.sumOfWeights() + m_numClasses;
    m_probOfClass = new double[m_numClasses];
    for (int h = 0; h < m_numClasses; h++)
      m_probOfClass[h] = (double) (docsPerClass[h] + 1) / numDocs;
  }