/** * This method implements the clustering-based misclassified sample exploitation technique. In * this method we sample around each of the clusters by creating queries such as the following: * SELECT rowc, colc FROM testing WHERE (rowc >= 662.5 AND rowc <= 702.5 AND colc >= 992 AND colc * <= 1053 ) * * @param numberOfClusters the number of clusters to be explored. * @return an arrayList containing the samples that we have selected from the misclassified * sampling areas * @throws Exception */ @Override public ArrayList<Tuple> getNearestMissclassified( int numberOfClusters, ArrayList<Tuple> misclassified) throws Exception { Statement statement; ResultSet rs; ArrayList<Tuple> rand = new ArrayList<Tuple>(); double percenAroundMiscl = Global.PERCENT_AROUND_MISCLASSIFIED; KMeans k = new KMeans(); ArrayList<Tuple> centroids = k.getCentroids(numberOfClusters); int[] assignments = null; if (k.centroidsExist == true) { assignments = k.assignments; } System.out.println( "Exploring the clusters of misclassified. Number of clusters: " + numberOfClusters); // total number of tuples we should spend on the misclassified phase. for (int im = 0; im < centroids.size(); im++) { // System.out.println("This is the centroid: "+centroids.get(0)); ArrayList<Tuple> h = new ArrayList<Tuple>(); Tuple t = centroids.get(im); String query = "SELECT " + Global.OBJECT_KEY + " , "; for (int i = 0; i < Global.attributes.size(); i++) { if (i != Global.attributes.size() - 1) query += Global.attributes.get(i).getName() + " , "; else query += Global.attributes.get(i).getName(); } query += " FROM " + Global.TABLE_NAME + " WHERE "; for (int i = 0; i < Global.attributes.size(); i++) { ArrayList<String> valuesForQuery = getClosest( Global.attributes.get(i).getDomain(), percenAroundMiscl, (double) t.valueAt(i)); if (k.centroidsExist == true) { while (checkOK(misclassified, valuesForQuery, assignments, im, i) == false) { percenAroundMiscl++; valuesForQuery = getClosest( Global.attributes.get(i).getDomain(), percenAroundMiscl, (double) t.valueAt(i)); } } query += " (" + Global.attributes.get(i).getName() + " >= " + valuesForQuery.get(0) + " AND " + Global.attributes.get(i).getName() + " <= " + valuesForQuery.get(1) + ") "; if (i < Global.attributes.size() - 1) { query += " AND "; } } query += " ORDER BY RANDOM() LIMIT " + Global.RANDOM_AROUND_MISCLASSIFIED; Connection connection = DBConnection.getConnection(); statement = connection.createStatement(ResultSet.TYPE_SCROLL_SENSITIVE, ResultSet.CONCUR_READ_ONLY); rs = statement.executeQuery(query); while (rs.next()) { Object key = rs.getString(1); Object[] attrValues = new Object[Global.attributes.size()]; for (int m = 1; m <= Global.attributes.size(); m++) { attrValues[m - 1] = rs.getString(m + 1); } Tuple tuple = new Tuple(key, attrValues); h.add(tuple); } rand.addAll(h); } return rand; // returns an ArrayList with the k nearest neighbors }
/** * This method calculates the density of a grid cell according to this type: * numberOfUniqueExistingTuples/numberOfUniqueCombinations (in the cell) * * @param theBoundary the boundaries of the specific cell we are calculating the density for * @return the density of the cell. * @throws SQLException * @throws IOException * @throws NumberFormatException */ private double findDensity(Tuple center) throws SQLException, NumberFormatException, IOException { double density = 0.0; // calculates the size of each grid double gridLength = 100 / (double) grids.getGridNumber(); String query = "SELECT count(*) AS count_1 FROM ( SELECT "; for (int i = 0; i < Global.attributes.size(); i++) { if (i != Global.attributes.size() - 1) query += Global.attributes.get(i).getName() + " , "; else query += Global.attributes.get(i).getName(); } query += " FROM " + Global.TABLE_NAME + " WHERE "; for (int i = 0; i < Global.attributes.size(); i++) { query += Global.attributes.get(i).getName() + " >= " + getClosest( Global.attributes.get(i).getDomain(), gridLength / 2, Double.parseDouble("" + center.valueAt(i)), false) + " AND " + Global.attributes.get(i).getName() + " <= " + getClosest( Global.attributes.get(i).getDomain(), gridLength / 2, Double.parseDouble("" + center.valueAt(i)), true); if (i < Global.attributes.size() - 1) { query += " AND "; } } query += " GROUP BY "; for (int i = 0; i < Global.attributes.size(); i++) { if (i != Global.attributes.size() - 1) query += Global.attributes.get(i).getName() + " , "; else query += Global.attributes.get(i).getName(); } query += ") AS anon_1"; // System.out.println("Query: "+query); Connection connection = DBConnection.getConnection(); java.sql.Statement statement = connection.createStatement(ResultSet.TYPE_SCROLL_SENSITIVE, ResultSet.CONCUR_READ_ONLY); java.sql.ResultSet rs = statement.executeQuery(query); String result = ""; while (rs.next()) { result = rs.getString(1); } int existing = Integer.parseInt(result); rs.close(); statement.close(); ArrayList<Integer> uniques = new ArrayList<Integer>(); for (int i = 0; i < Global.attributes.size(); i++) { String query1 = "SELECT count(distinct " + Global.attributes.get(i).getName(); query1 += ") FROM " + Global.TABLE_NAME + " WHERE "; query1 += Global.attributes.get(i).getName() + " >= " + getClosest( Global.attributes.get(i).getDomain(), gridLength / 2, Double.parseDouble("" + center.valueAt(i)), false) + " AND " + Global.attributes.get(i).getName() + " <= " + getClosest( Global.attributes.get(i).getDomain(), gridLength / 2, Double.parseDouble("" + center.valueAt(i)), true); java.sql.Statement statement1 = connection.createStatement(ResultSet.TYPE_SCROLL_SENSITIVE, ResultSet.CONCUR_READ_ONLY); java.sql.ResultSet rs1 = statement1.executeQuery(query1); String result2 = ""; while (rs1.next()) { result2 = rs1.getString(1); } uniques.add(Integer.parseInt(result2)); rs1.close(); statement1.close(); } int possibleCombinations = uniques.get(0); // System.out.println("First attr unique values in the cell: "+uniques.get(0)); // System.out.println("Second attr unique values in the cell: "+uniques.get(1)); // System.out.println("Here should be 2: "+uniques.size()); for (int i = 1; i < uniques.size(); i++) { possibleCombinations *= uniques.get(i); } // System.out.println("Possible combinations: "+ possibleCombinations); density = (double) existing / (double) possibleCombinations; return density; }