コード例 #1
0
ファイル: Discretizer.java プロジェクト: Jungwon/mltk
 /**
  * Converts one attribute of a dataset to its binned form, in place.
  *
  * <p>The attribute stored at position {@code attIndex} is replaced by a
  * {@code BinnedAttribute} that keeps the original name and index, and every
  * instance's value for that attribute is overwritten with the bin index that
  * {@code bins.getIndex} reports for the old value.
  *
  * @param instances the dataset to discretize; modified in place.
  * @param attIndex the position of the attribute in the dataset's attribute list.
  * @param bins the bins used to map raw values to bin indices.
  */
 public static void discretize(Instances instances, int attIndex, Bins bins) {
   final Attribute source = instances.getAttributes().get(attIndex);
   // The value column is addressed by the attribute's own index, which is
   // not necessarily the same as its position in the attribute list.
   final int column = source.getIndex();

   final BinnedAttribute replacement = new BinnedAttribute(source.getName(), bins);
   replacement.setIndex(column);
   instances.getAttributes().set(attIndex, replacement);

   for (Instance row : instances) {
     final int bin = bins.getIndex(row.getValue(column));
     row.setValue(column, bin);
   }
 }
コード例 #2
0
ファイル: Discretizer.java プロジェクト: Jungwon/mltk
  /**
   * Computes bins for a specified attribute.
   *
   * <p>One {@code Element} is built per instance from its weight and its value
   * for the attribute; the elements are sorted and summarized by
   * {@code getStats} into a list of pairs. From the way the pairs are used
   * below, {@code v1} is taken as a candidate boundary value and {@code v2} as
   * the weight attached to it (presumably one pair per distinct value — confirm
   * against {@code getStats}).
   *
   * <p>If there are no more pairs than {@code maxNumBins}, every pair becomes
   * its own bin. Otherwise pairs are greedily grouped into bins of roughly
   * equal total weight, re-deriving the target bin weight after each bin is
   * closed from the weight and bin budget that remain.
   *
   * @param instances the dataset to discretize.
   * @param attIndex the attribute index.
   * @param maxNumBins the number of bins.
   * @return the computed bins (boundaries and medians, one entry per bin).
   */
  public static Bins computeBins(Instances instances, int attIndex, int maxNumBins) {
    Attribute attribute = instances.getAttributes().get(attIndex);
    List<Element<Double>> list = new ArrayList<>();
    for (Instance instance : instances) {
      list.add(new Element<Double>(instance.getWeight(), instance.getValue(attribute)));
    }
    Collections.sort(list);
    List<DoublePair> stats = new ArrayList<>();
    getStats(list, stats);
    Bins bins = new Bins();
    if (stats.size() <= maxNumBins) {
      // Few enough candidate values: each one is its own bin, so its boundary
      // and its median coincide.
      double[] a = new double[stats.size()];
      for (int i = 0; i < a.length; i++) {
        a[i] = stats.get(i).v1;
      }
      bins.boundaries = a;
      // Separate copy so that a later write through one field cannot silently
      // change the other.
      bins.medians = a.clone();
    } else {
      double totalWeight = 0;
      for (DoublePair stat : stats) {
        totalWeight += stat.v2;
      }
      // Target weight per bin; totalWeight is reused below as "weight not yet
      // visited".
      double binSize = totalWeight / maxNumBins;
      List<Double> boundaryList = new ArrayList<>();
      List<Double> medianList = new ArrayList<>();
      int start = 0; // index of the first pair in the currently open bin
      double weight = 0; // accumulated weight of the currently open bin
      for (int i = 0; i < stats.size(); i++) {
        weight += stats.get(i).v2;
        totalWeight -= stats.get(i).v2;
        if (weight >= binSize) {
          if (i == start) {
            // A single pair already fills the bin on its own.
            boundaryList.add(stats.get(start).v1);
            medianList.add(stats.get(start).v1);
            weight = 0;
            start = i + 1;
          } else {
            // d1: overshoot if pair i is included; d2: undershoot if it is not.
            double d1 = weight - binSize;
            double d2 = stats.get(i).v2 - d1;
            if (d1 < d2) {
              // Including pair i lands closer to the target: close the bin at i.
              boundaryList.add(stats.get(i).v1);
              medianList.add(getMedian(stats, start, weight / 2));
              start = i + 1;
              weight = 0;
            } else {
              // Excluding pair i lands closer: close the bin at i - 1 and let
              // pair i open the next bin.
              weight -= stats.get(i).v2;
              boundaryList.add(stats.get(i - 1).v1);
              medianList.add(getMedian(stats, start, weight / 2));
              start = i;
              weight = stats.get(i).v2;
            }
          }
          // Spread what is left (unvisited weight plus the open bin) evenly
          // over the bins that remain in the budget.
          binSize = (totalWeight + weight) / (maxNumBins - boundaryList.size());
        }
      }
      // A bin can still be open here, either because the loop ended before the
      // target weight was reached, or because the final pair was just split off
      // into a fresh bin above. Close it so the largest values always get their
      // own boundary and median instead of being dropped.
      if (start < stats.size()) {
        boundaryList.add(stats.get(stats.size() - 1).v1);
        medianList.add(getMedian(stats, start, weight / 2));
      }
      bins.boundaries = new double[boundaryList.size()];
      bins.medians = new double[medianList.size()];
      for (int i = 0; i < bins.boundaries.length; i++) {
        bins.boundaries[i] = boundaryList.get(i);
        bins.medians[i] = medianList.get(i);
      }
    }

    return bins;
  }