/** * Remove FV based on its threshold, if it is over <b> threshold </b> then it would be removed. * Let the FVs that has entropy below threshold remains. * * @param fv_list * @param threshold * @return Remaining FVs */ public Map<FV, Collection<FV>> applyThresholdRemoval( Map<FV, Collection<FV>> fv_list, double threshold) { Map<FV, Collection<FV>> result = new HashMap<FV, Collection<FV>>(); List<Double> entropies = generateEntropy(fv_list); result.putAll(fv_list); // See mean, Q1~Q3 values for entropy threshold Double[] temp = new Double[entropies.size()]; temp = entropies.toArray(temp); Double mean = MathHelper.getInstance().calculateAverage(temp); Double stdev = MathHelper.getInstance().calculateStdev(mean, temp); Double[] q = MathHelper.getInstance().calculateQuartile(temp); // System.out.println("Mean: " + mean); // System.out.println("Stdev: " + stdev); // System.out.println("Mean + Stdev: " + (mean + stdev)); // System.out.println("Mean - Stdev: " + (mean - stdev)); // System.out.println("Q1: " + q[0]); // System.out.println("Q2: " + q[1]); // System.out.println("Q3: " + q[2]); // threshold = mean + stdev; // Force using specified threshold switch (thr_alg) { case Mean: threshold = mean; break; case MeanMin: threshold = mean - stdev; break; case MeanPlus: threshold = mean + stdev; break; case Q1: threshold = q[0]; break; case Q2: threshold = q[1]; break; case Q3: threshold = q[2]; break; default: break; } // Apply removal for (FV k : fv_list.keySet()) { if (k.getEntropy() > threshold) result.remove(k); } return result; }
// Instead of top-k, we select top-k percents data public Map<FV, Collection<FV>> applyCorrelationRemoval( Map<FV, Collection<FV>> fv_list, double topk, Double[][] CM, List<Double> corrValues) { Map<FV, Collection<FV>> result = new HashMap<FV, Collection<FV>>(); generateEntropy(fv_list); Double mean; Double stdev; Double[] q = new Double[3]; // Q1, Q2, Q3 Double[] temp = new Double[corrValues.size()]; temp = corrValues.toArray(temp); // Calculate mean, Q1~Q3 mean = MathHelper.getInstance().calculateAverage(temp); stdev = MathHelper.getInstance().calculateStdev(mean, temp); q = MathHelper.getInstance().calculateQuartile(temp); // System.out.println("Mean: " + mean); // System.out.println("Stdev: " + stdev); // System.out.println("Mean + Stdev: " + (mean + stdev)); // System.out.println("Mean - Stdev: " + (mean - stdev)); // System.out.println("Q1: " + q[0]); // System.out.println("Q2: " + q[1]); // System.out.println("Q3: " + q[2]); // Selecting correlation threshold double corrThreshold = q[1]; // Use Q3 / Q2 / Mean+Stdev as corr // threshold switch (thr_alg) { case Mean: corrThreshold = mean; break; case MeanMin: corrThreshold = mean - stdev; break; case MeanPlus: corrThreshold = mean + stdev; break; case Q1: corrThreshold = q[0]; break; case Q2: corrThreshold = q[1]; break; case Q3: corrThreshold = q[2]; break; default: break; } /** Create several list for each correlated features */ List<List<FV>> correlatedFV = new ArrayList<>(); Set<FV> fvs = fv_list.keySet(); int[] selectedColumns = new int[CM.length]; for (int i = 0; i < CM.length; i++) { for (int j = i + 1; j < CM.length; j++) { /** * For each correlated feature, put all FV into list and do selection based on top-k percent */ if (CM[i][j] >= corrThreshold) { // Mark selectedColumns selectedColumns[i] = 1; selectedColumns[j] = 0; // Add into list List<FV> list = new ArrayList<>(); for (FV fv : fvs) { if (fv.getFeature() == i || fv.getFeature() == j) { list.add(fv); } } selectSubsetTopKPercent(topk, correlatedFV, list); } } } /** Add remaining columns which are not selected (not correlated) */ for (int i = 0; i < selectedColumns.length; i++) { if (selectedColumns[i] == 0) { List<FV> list = new ArrayList<>(); for (FV fv : fvs) { if (fv.getFeature() == i) { list.add(fv); } } selectSubsetTopKPercent(topk, correlatedFV, list); } } /** Merge all correlated FV set and non-correlated FV set */ for (List<FV> list : correlatedFV) { for (FV fv : list) { result.put(fv, null); } } /* * TODO Correlation FVS 1. Create several list for each correlated * features 2. For each correlated feature, put all FV into list and do * selection based on top-k percent 3. When doing selection, check on * every list (because every correlated feature would have different * list and non-correlated feature, would have a separate list). For * example: there is 5 features A, B, C, D, and E. Then, correlated * pairs are: A with B, A with C, A with D, B with D, and C with D. Then * we would have several list for them: AB list, AC list, AD list, BD * list, CD list, and E list. (E is a non-correlated feature, thus have * a separate list by itself). 4. Evaluate the parameters: top-k * percents and threshold selection (for correlation). */ return result; }