/**
 * Per-chunk pass over the source rows: assigns every row to its closest cluster and
 * accumulates, per cluster, the row count ({@code _rows}) and the summed distance
 * ({@code _dist}) for the reduce phase.
 */
@Override
public void map(Key key) {
  assert key.home();
  _rows = new long[_clusters.length];
  _dist = new double[_clusters.length];
  ValueArray va = DKV.get(_arykey).get();
  AutoBuffer bits = va.getChunk(key);
  int rowCount = va.rpc(ValueArray.getChunkIndex(key));
  // Scratch buffers reused across rows; last column (response) is excluded.
  double[] point = new double[_cols.length - 1];
  ClusterDist nearest = new ClusterDist();
  for (int r = 0; r < rowCount; r++) {
    KMeans.datad(va, bits, r, _cols, _normalized, point);
    KMeans.closest(_clusters, point, nearest);
    _rows[nearest._cluster]++;
    _dist[nearest._cluster] += nearest._dist;
  }
  // Drop the inputs so only the accumulated results travel back over the wire.
  _arykey = null;
  _cols = null;
  _clusters = null;
}
/**
 * Runs one clustering pass by delegating unchanged to the wrapped {@code kmeans}
 * implementation.
 *
 * @param dataSet the data to cluster
 * @param accelCache acceleration cache, forwarded as-is
 * @param k number of clusters
 * @param means initial/output cluster means
 * @param assignment output array of per-point cluster assignments
 * @param exactTotal forwarded flag controlling exact total computation
 * @param threadpool executor used for parallel work
 * @param returnError whether to compute and return the clustering error
 * @return whatever the wrapped {@code kmeans.cluster(...)} returns
 */
@Override
protected double cluster(
    DataSet dataSet,
    List<Double> accelCache,
    int k,
    List<Vec> means,
    int[] assignment,
    boolean exactTotal,
    ExecutorService threadpool,
    boolean returnError) {
  return kmeans.cluster(
      dataSet, accelCache, k, means, assignment, exactTotal, threadpool, returnError);
}
/**
 * Shogun k-means example: loads the training matrix, trains k-means with {@code para}
 * clusters under a Euclidean distance, and returns the trained model plus its centers.
 *
 * @param para the number of clusters k
 * @return a raw ArrayList holding [0] the trained KMeans model and [1] the center matrix
 */
static ArrayList run(int para) {
  modshogun.init_shogun_with_defaults();
  int clusterCount = para;
  init_random(17); // fixed seed so the example is reproducible
  DoubleMatrix trainMatrix = Load.load_numbers("../data/fm_train_real.dat");
  RealFeatures trainFeatures = new RealFeatures(trainMatrix);
  EuclidianDistance euclidean = new EuclidianDistance(trainFeatures, trainFeatures);
  KMeans kmeans = new KMeans(clusterCount, euclidean);
  kmeans.train();
  DoubleMatrix centers = kmeans.get_cluster_centers();
  kmeans.get_radiuses(); // called for parity with the reference example; result unused
  ArrayList result = new ArrayList();
  result.add(kmeans);
  result.add(centers);
  modshogun.exit_shogun();
  return result;
}
/** * Single row scoring, on properly ordered data. Will return NaN if any data element contains a * NaN. Returns the cluster-number, which is mostly an internal value. Last data element refers to * the response variable, which is not used for k-means. */ @Override protected double score0(double[] data) { for (int i = 0; i < data.length - 1; i++) { // Normalize the data before scoring ValueArray.Column C = _va._cols[i]; double d = data[i]; if (_normalized) { d -= C._mean; if (C._sigma != 0.0 && !Double.isNaN(C._sigma)) d /= C._sigma; } data[i] = d; } data[data.length - 1] = Double.NaN; // Response variable column not used return KMeans.closest(_clusters, data, new ClusterDist())._cluster; }
/**
 * Creates a new ValueArray with classes. The new ValueArray is not aligned with the
 * source one, unfortunately, so results have to be sent to each destination chunk's
 * owner using Atomic ({@code updateClusters}). Assignments are buffered per destination
 * chunk and flushed whenever the destination chunk boundary is crossed.
 */
@Override public void map(Key key) {
  assert key.home();
  // Do nothing if the job has been cancelled; the field cleanup below still runs.
  if (Job.isRunning(_job.self())) {
    ValueArray va = DKV.get(_arykey).get();
    AutoBuffer bits = va.getChunk(key);
    // First global row of this source chunk and the number of rows it holds.
    long startRow = va.startRow(ValueArray.getChunkIndex(key));
    int rows = va.rpc(ValueArray.getChunkIndex(key));
    // Rows per chunk of the destination array (fixed-size ROW_SIZE records).
    int rpc = (int) (ValueArray.CHUNK_SZ / ROW_SIZE);
    // Destination chunk currently being accumulated, and the first source row
    // whose assignment belongs to it.
    long chunk = ValueArray.chknum(startRow, va.numRows(), ROW_SIZE);
    long updatedChk = chunk;
    long updatedRow = startRow;
    double[] values = new double[_cols.length - 1]; // response column excluded
    ClusterDist cd = new ClusterDist(); // reused scratch for closest-cluster lookup
    int[] clusters = new int[rows]; // buffered assignments for the current destination chunk
    int count = 0; // number of buffered assignments
    for (int row = 0; row < rows; row++) {
      KMeans.datad(va, bits, row, _cols, _normalized, values);
      KMeans.closest(_clusters, values, cd);
      // Destination chunk for this row; flush the buffer whenever it changes.
      chunk = ValueArray.chknum(startRow + row, va.numRows(), ROW_SIZE);
      if (chunk != updatedChk) {
        updateClusters(clusters, count, updatedChk, va.numRows(), rpc, updatedRow);
        updatedChk = chunk;
        updatedRow = startRow + row;
        count = 0;
      }
      clusters[count++] = cd._cluster;
    }
    // Flush the final, possibly partial, buffer.
    if (count > 0) updateClusters(clusters, count, chunk, va.numRows(), rpc, updatedRow);
    _job.updateProgress(1);
  }
  // Drop references so they are not shipped back over the wire.
  _job = null;
  _arykey = null;
  _cols = null;
  _clusters = null;
}
/** * This method implements the clustering-based misclassified sample exploitation technique. In * this method we sample around each of the clusters by creating queries such as the following: * SELECT rowc, colc FROM testing WHERE (rowc >= 662.5 AND rowc <= 702.5 AND colc >= 992 AND colc * <= 1053 ) * * @param numberOfClusters the number of clusters to be explored. * @return an arrayList containing the samples that we have selected from the misclassified * sampling areas * @throws Exception */ @Override public ArrayList<Tuple> getNearestMissclassified( int numberOfClusters, ArrayList<Tuple> misclassified) throws Exception { Statement statement; ResultSet rs; ArrayList<Tuple> rand = new ArrayList<Tuple>(); double percenAroundMiscl = Global.PERCENT_AROUND_MISCLASSIFIED; KMeans k = new KMeans(); ArrayList<Tuple> centroids = k.getCentroids(numberOfClusters); int[] assignments = null; if (k.centroidsExist == true) { assignments = k.assignments; } System.out.println( "Exploring the clusters of misclassified. Number of clusters: " + numberOfClusters); // total number of tuples we should spend on the misclassified phase. 
for (int im = 0; im < centroids.size(); im++) { // System.out.println("This is the centroid: "+centroids.get(0)); ArrayList<Tuple> h = new ArrayList<Tuple>(); Tuple t = centroids.get(im); String query = "SELECT " + Global.OBJECT_KEY + " , "; for (int i = 0; i < Global.attributes.size(); i++) { if (i != Global.attributes.size() - 1) query += Global.attributes.get(i).getName() + " , "; else query += Global.attributes.get(i).getName(); } query += " FROM " + Global.TABLE_NAME + " WHERE "; for (int i = 0; i < Global.attributes.size(); i++) { ArrayList<String> valuesForQuery = getClosest( Global.attributes.get(i).getDomain(), percenAroundMiscl, (double) t.valueAt(i)); if (k.centroidsExist == true) { while (checkOK(misclassified, valuesForQuery, assignments, im, i) == false) { percenAroundMiscl++; valuesForQuery = getClosest( Global.attributes.get(i).getDomain(), percenAroundMiscl, (double) t.valueAt(i)); } } query += " (" + Global.attributes.get(i).getName() + " >= " + valuesForQuery.get(0) + " AND " + Global.attributes.get(i).getName() + " <= " + valuesForQuery.get(1) + ") "; if (i < Global.attributes.size() - 1) { query += " AND "; } } query += " ORDER BY RANDOM() LIMIT " + Global.RANDOM_AROUND_MISCLASSIFIED; Connection connection = DBConnection.getConnection(); statement = connection.createStatement(ResultSet.TYPE_SCROLL_SENSITIVE, ResultSet.CONCUR_READ_ONLY); rs = statement.executeQuery(query); while (rs.next()) { Object key = rs.getString(1); Object[] attrValues = new Object[Global.attributes.size()]; for (int m = 1; m <= Global.attributes.size(); m++) { attrValues[m - 1] = rs.getString(m + 1); } Tuple tuple = new Tuple(key, attrValues); h.add(tuple); } rand.addAll(h); } return rand; // returns an ArrayList with the k nearest neighbors }