Example #1
0
 /** Accumulates per-cluster row counts and summed distances over this chunk's rows. */
 @Override
 public void map(Key key) {
   assert key.home();
   _rows = new long[_clusters.length];
   _dist = new double[_clusters.length];
   ValueArray ary = DKV.get(_arykey).get();
   AutoBuffer chunkBits = ary.getChunk(key);
   int rowCount = ary.rpc(ValueArray.getChunkIndex(key));
   double[] point = new double[_cols.length - 1];
   ClusterDist nearest = new ClusterDist();
   for (int r = 0; r < rowCount; r++) {
     // Decode the row, find its nearest cluster, and fold it into the tallies.
     KMeans.datad(ary, chunkBits, r, _cols, _normalized, point);
     KMeans.closest(_clusters, point, nearest);
     _rows[nearest._cluster]++;
     _dist[nearest._cluster] += nearest._dist;
   }
   // Clear inputs so they are not shipped back with the task result.
   _arykey = null;
   _cols = null;
   _clusters = null;
 }
Example #2
0
 /** Delegates the clustering call, unchanged, to the wrapped k-means implementation. */
 @Override
 protected double cluster(
     DataSet dataSet,
     List<Double> accelCache,
     int k,
     List<Vec> means,
     int[] assignment,
     boolean exactTotal,
     ExecutorService threadpool,
     boolean returnError) {
   double result =
       kmeans.cluster(
           dataSet, accelCache, k, means, assignment, exactTotal, threadpool, returnError);
   return result;
 }
  /**
   * Runs the Shogun k-means example: loads the training matrix, clusters it into {@code para}
   * centers, and returns the trained model together with its cluster centers.
   */
  static ArrayList run(int para) {
    modshogun.init_shogun_with_defaults();
    init_random(17);
    int numClusters = para;

    DoubleMatrix trainData = Load.load_numbers("../data/fm_train_real.dat");

    RealFeatures trainFeats = new RealFeatures(trainData);
    EuclidianDistance dist = new EuclidianDistance(trainFeats, trainFeats);

    KMeans model = new KMeans(numClusters, dist);
    model.train();

    DoubleMatrix centers = model.get_cluster_centers();
    model.get_radiuses();

    ArrayList out = new ArrayList();
    out.add(model);
    out.add(centers);

    modshogun.exit_shogun();
    return out;
  }
Example #4
0
 /**
  * Single row scoring, on properly ordered data. Will return NaN if any data element contains a
  * NaN. Returns the cluster-number, which is mostly an internal value. Last data element refers to
  * the response variable, which is not used for k-means.
  */
 @Override
 protected double score0(double[] data) {
   int features = data.length - 1;
   // Apply the same per-column normalization used during training before scoring.
   for (int col = 0; col < features; col++) {
     ValueArray.Column column = _va._cols[col];
     double v = data[col];
     if (_normalized) {
       v -= column._mean;
       if (column._sigma != 0.0 && !Double.isNaN(column._sigma)) {
         v /= column._sigma;
       }
     }
     data[col] = v;
   }
   data[features] = Double.NaN; // Response variable column not used
   return KMeans.closest(_clusters, data, new ClusterDist())._cluster;
 }
Example #5
0
 /**
  * Creates a new ValueArray with classes. New ValueArray is not aligned with source one
  * unfortunately so have to send results to each chunk owner using Atomic.
  */
 @Override
 public void map(Key key) {
   assert key.home();
   // Skip all work if the job was cancelled; the field cleanup below still runs.
   if (Job.isRunning(_job.self())) {
     ValueArray va = DKV.get(_arykey).get();
     AutoBuffer bits = va.getChunk(key);
     long startRow = va.startRow(ValueArray.getChunkIndex(key));
     int rows = va.rpc(ValueArray.getChunkIndex(key));
     // Rows per chunk of the *destination* array (ROW_SIZE bytes per row).
     int rpc = (int) (ValueArray.CHUNK_SZ / ROW_SIZE);
     // Destination chunk currently being buffered, and the first source row that maps into it.
     long chunk = ValueArray.chknum(startRow, va.numRows(), ROW_SIZE);
     long updatedChk = chunk;
     long updatedRow = startRow;
     double[] values = new double[_cols.length - 1];
     ClusterDist cd = new ClusterDist();
     // Buffer of cluster assignments for the current destination chunk; `count` is its fill level.
     int[] clusters = new int[rows];
     int count = 0;
     for (int row = 0; row < rows; row++) {
       KMeans.datad(va, bits, row, _cols, _normalized, values);
       KMeans.closest(_clusters, values, cd);
       chunk = ValueArray.chknum(startRow + row, va.numRows(), ROW_SIZE);
       // Crossing into a new destination chunk: flush the buffered assignments to its owner.
       if (chunk != updatedChk) {
         updateClusters(clusters, count, updatedChk, va.numRows(), rpc, updatedRow);
         updatedChk = chunk;
         updatedRow = startRow + row;
         count = 0;
       }
       clusters[count++] = cd._cluster;
     }
     // Flush whatever remains for the last destination chunk.
     if (count > 0) updateClusters(clusters, count, chunk, va.numRows(), rpc, updatedRow);
     _job.updateProgress(1);
   }
   // Clear inputs so they are not shipped back with the task result.
   _job = null;
   _arykey = null;
   _cols = null;
   _clusters = null;
 }
  /**
   * This method implements the clustering-based misclassified sample exploitation technique. In
   * this method we sample around each of the clusters by creating queries such as the following:
   * SELECT rowc, colc FROM testing WHERE (rowc >= 662.5 AND rowc <= 702.5 AND colc >= 992 AND colc
   * <= 1053 )
   *
   * @param numberOfClusters the number of clusters to be explored.
   * @param misclassified the misclassified samples whose cluster neighborhoods are sampled
   * @return an arrayList containing the samples that we have selected from the misclassified
   *     sampling areas
   * @throws Exception
   */
  @Override
  public ArrayList<Tuple> getNearestMissclassified(
      int numberOfClusters, ArrayList<Tuple> misclassified) throws Exception {
    ArrayList<Tuple> rand = new ArrayList<Tuple>();

    double percenAroundMiscl = Global.PERCENT_AROUND_MISCLASSIFIED;

    KMeans k = new KMeans();
    ArrayList<Tuple> centroids = k.getCentroids(numberOfClusters);
    int[] assignments = null;
    if (k.centroidsExist) {
      assignments = k.assignments;
    }
    System.out.println(
        "Exploring the clusters of misclassified. Number of clusters: " + numberOfClusters);
    // total number of tuples we should spend on the misclassified phase.
    for (int im = 0; im < centroids.size(); im++) {
      Tuple t = centroids.get(im);
      // Build: SELECT key, attr1, ..., attrN FROM table WHERE (ranges...) — StringBuilder avoids
      // repeated String reallocation in the nested loops below.
      StringBuilder query = new StringBuilder("SELECT " + Global.OBJECT_KEY + " , ");
      for (int i = 0; i < Global.attributes.size(); i++) {
        if (i != Global.attributes.size() - 1) {
          query.append(Global.attributes.get(i).getName()).append(" , ");
        } else {
          query.append(Global.attributes.get(i).getName());
        }
      }
      query.append(" FROM ").append(Global.TABLE_NAME).append(" WHERE ");
      for (int i = 0; i < Global.attributes.size(); i++) {
        ArrayList<String> valuesForQuery =
            getClosest(
                Global.attributes.get(i).getDomain(), percenAroundMiscl, (double) t.valueAt(i));
        if (k.centroidsExist) {
          // Widen the sampling window until it contains at least one misclassified point.
          while (!checkOK(misclassified, valuesForQuery, assignments, im, i)) {
            percenAroundMiscl++;
            valuesForQuery =
                getClosest(
                    Global.attributes.get(i).getDomain(), percenAroundMiscl, (double) t.valueAt(i));
          }
        }
        query
            .append(" (")
            .append(Global.attributes.get(i).getName())
            .append(" >= ")
            .append(valuesForQuery.get(0))
            .append(" AND ")
            .append(Global.attributes.get(i).getName())
            .append(" <= ")
            .append(valuesForQuery.get(1))
            .append(") ");
        if (i < Global.attributes.size() - 1) {
          query.append(" AND ");
        }
      }
      query.append(" ORDER BY RANDOM() LIMIT ").append(Global.RANDOM_AROUND_MISCLASSIFIED);
      Connection connection = DBConnection.getConnection();
      // try-with-resources closes the Statement and ResultSet even on failure;
      // previously both leaked on every cluster iteration.
      try (Statement statement =
              connection.createStatement(
                  ResultSet.TYPE_SCROLL_SENSITIVE, ResultSet.CONCUR_READ_ONLY);
          ResultSet rs = statement.executeQuery(query.toString())) {
        while (rs.next()) {
          Object key = rs.getString(1);
          Object[] attrValues = new Object[Global.attributes.size()];
          for (int m = 1; m <= Global.attributes.size(); m++) {
            attrValues[m - 1] = rs.getString(m + 1);
          }
          rand.add(new Tuple(key, attrValues));
        }
      }
    }
    return rand; // returns an ArrayList with the k nearest neighbors
  }