예제 #1
0
파일: C45.java 프로젝트: alei76/tctm
 /**
  * Computes the minimum description length (MDL) score of this node.
  *
  * <p>The score is the sum of three terms, where {@code n} is the value of {@code getSize()}
  * and {@code k} is the size of the target alphabet of this node's instance list:
  * <ol>
  *   <li>{@code n * baseEntropy}, with the base entropy taken from {@code getGainRatio()};
  *   <li>{@code (k - 1) * ln(n / 2) / (2 * GainRatio.log2)};
  *   <li>{@code ln(pi^(k/2) / gamma(k/2)) / GainRatio.log2}.
  * </ol>
  * Dividing by {@code GainRatio.log2} presumably converts natural logarithms to base 2
  * (assumes that constant is {@code ln 2} -- confirm against {@code GainRatio}).
  *
  * @return the MDL score of this node
  */
 public double getMDL() {
   final int classCount = m_ilist.getTargetAlphabet().size();

   // Term 1: node size weighted by the base entropy.
   final double entropyCost = getSize() * getGainRatio().getBaseEntropy();

   // Term 2: depends on the class count and the log of half the node size.
   final double parameterCost =
       ((classCount - 1) * Math.log(getSize() / 2.0)) / (2 * GainRatio.log2);

   // Term 3: log of the ratio pi^(k/2) / gamma(k/2).
   final double halfClassCount = classCount / 2.0;
   final double volumeRatio = Math.pow(Math.PI, halfClassCount) / Maths.gamma(halfClassCount);
   final double volumeCost = Math.log(volumeRatio) / GainRatio.log2;

   // Summed left to right, matching the order in which the terms are accumulated.
   return entropyCost + parameterCost + volumeCost;
 }
  /**
   * Splits a document list into a training set and a test set at the given index.
   *
   * <p>Two new instance lists are created, both built on the data and target alphabets of
   * {@code documents}, and stored in the {@code training} and {@code test} fields. Documents
   * at positions {@code [0, testStartIndex)} are then added to the training list, and
   * documents at positions {@code [testStartIndex, documents.size())} to the test list.
   *
   * @param documents the complete list of documents to divide
   * @param testStartIndex position of the first document that goes into the test list
   */
  public void divideDocuments(InstanceList documents, int testStartIndex) {
    final Alphabet features = documents.getDataAlphabet();
    final Alphabet labels = documents.getTargetAlphabet();

    // The fields are replaced up front, before any document is copied over.
    this.training = new InstanceList(features, labels);
    this.test = new InstanceList(features, labels);

    // Everything before the split point is training data.
    for (int i = 0; i < testStartIndex; i++) {
      this.training.add(documents.get(i));
    }

    // The split point and everything after it is test data.
    for (int j = testStartIndex; j < documents.size(); j++) {
      this.test.add(documents.get(j));
    }
  }