Exemplo n.º 1
0
 /**
  * Drops every FV whose entropy is strictly greater than a cut-off and returns the survivors.
  *
  * <p>The cut-off is derived from the entropies returned by {@code generateEntropy(fv_list)}
  * according to the {@code thr_alg} field: {@code Mean}, {@code MeanMin} (mean - stdev),
  * {@code MeanPlus} (mean + stdev), or one of the quartiles {@code Q1}/{@code Q2}/{@code Q3}.
  * For any other {@code thr_alg} value the caller-supplied {@code threshold} is used as is.
  *
  * @param fv_list FVs to filter, each mapped to its associated collection
  * @param threshold cut-off used only when {@code thr_alg} matches none of the listed cases
  * @return a new map holding the entries of {@code fv_list} whose key entropy does not exceed
  *     the cut-off
  */
 public Map<FV, Collection<FV>> applyThresholdRemoval(
     Map<FV, Collection<FV>> fv_list, double threshold) {
   List<Double> entropies = generateEntropy(fv_list);
   // Start from a full copy and strip the high-entropy keys afterwards.
   Map<FV, Collection<FV>> remaining = new HashMap<>(fv_list);

   // Summary statistics of the entropies: mean, standard deviation and quartiles.
   Double[] entropyArray = entropies.toArray(new Double[entropies.size()]);
   MathHelper math = MathHelper.getInstance();
   Double mean = math.calculateAverage(entropyArray);
   Double stdev = math.calculateStdev(mean, entropyArray);
   Double[] quartiles = math.calculateQuartile(entropyArray);

   // Pick the effective cut-off; the caller's value survives only in the default case.
   double cutoff = threshold;
   switch (thr_alg) {
     case Mean:
       cutoff = mean;
       break;
     case MeanMin:
       cutoff = mean - stdev;
       break;
     case MeanPlus:
       cutoff = mean + stdev;
       break;
     case Q1:
       cutoff = quartiles[0];
       break;
     case Q2:
       cutoff = quartiles[1];
       break;
     case Q3:
       cutoff = quartiles[2];
       break;
     default:
       // Keep the caller-supplied threshold.
       break;
   }

   // Remove every FV whose entropy lies above the cut-off.
   for (FV candidate : fv_list.keySet()) {
     if (candidate.getEntropy() > cutoff) {
       remaining.remove(candidate);
     }
   }
   return remaining;
 }
Exemplo n.º 2
0
 /**
  * Selects FVs per group of correlated features; instead of top-k, the top-k percent of each
  * group is selected.
  *
  * <p>Procedure:
  *
  * <ol>
  *   <li>{@code generateEntropy(fv_list)} is invoked; its return value is not needed here
  *       (presumably it refreshes the entropy stored on each FV - TODO confirm).
  *   <li>A correlation threshold is derived from {@code corrValues}: the median (Q2) by default,
  *       or the statistic chosen by the {@code thr_alg} field ({@code Mean}, {@code MeanMin},
  *       {@code MeanPlus}, {@code Q1}, {@code Q2}, {@code Q3}).
  *   <li>For every feature pair {@code (i, j)} with {@code i < j} and
  *       {@code CM[i][j] >= threshold}, the FVs whose feature index is {@code i} or {@code j} are
  *       gathered into one list and passed to {@code selectSubsetTopKPercent}.
  *   <li>Every feature that takes part in no correlated pair gets a list of its own, which is
  *       passed to {@code selectSubsetTopKPercent} as well.
  *   <li>All FVs found in the resulting lists become keys of the returned map.
  * </ol>
  *
  * <p>Example: with features A, B, C, D, E and correlated pairs AB, AC, AD, BD, CD the lists are
  * AB, AC, AD, BD, CD and E (E is not correlated with anything, so it has a separate list).
  *
  * @param fv_list FVs to select from, each mapped to its associated collection
  * @param topk percentage forwarded to {@code selectSubsetTopKPercent}
  * @param CM correlation matrix; only the upper triangle ({@code j > i}) is read
  * @param corrValues correlation values from which the threshold statistics are computed
  * @return a new map whose keys are the selected FVs, each mapped to the collection it has in
  *     {@code fv_list}
  */
 public Map<FV, Collection<FV>> applyCorrelationRemoval(
     Map<FV, Collection<FV>> fv_list, double topk, Double[][] CM, List<Double> corrValues) {
   Map<FV, Collection<FV>> result = new HashMap<FV, Collection<FV>>();
   generateEntropy(fv_list);

   // Calculate mean, standard deviation and Q1~Q3 of the correlation values.
   Double[] temp = corrValues.toArray(new Double[corrValues.size()]);
   Double mean = MathHelper.getInstance().calculateAverage(temp);
   Double stdev = MathHelper.getInstance().calculateStdev(mean, temp);
   Double[] q = MathHelper.getInstance().calculateQuartile(temp); // Q1, Q2, Q3

   // Select the correlation threshold; Q2 is the fallback when thr_alg matches no case.
   double corrThreshold = q[1];
   switch (thr_alg) {
     case Mean:
       corrThreshold = mean;
       break;
     case MeanMin:
       corrThreshold = mean - stdev;
       break;
     case MeanPlus:
       corrThreshold = mean + stdev;
       break;
     case Q1:
       corrThreshold = q[0];
       break;
     case Q2:
       corrThreshold = q[1];
       break;
     case Q3:
       corrThreshold = q[2];
       break;
     default:
       break;
   }

   // One list per correlated feature pair, plus one per non-correlated feature.
   List<List<FV>> correlatedFV = new ArrayList<>();
   Set<FV> fvs = fv_list.keySet();
   // correlated[c] is true once feature c has appeared in at least one correlated pair.
   boolean[] correlated = new boolean[CM.length];
   for (int i = 0; i < CM.length; i++) {
     for (int j = i + 1; j < CM.length; j++) {
       if (CM[i][j] >= corrThreshold) {
         // Both members of the pair are correlated. Marking only i would let j be treated
         // as a non-correlated feature below and receive an extra stand-alone list.
         correlated[i] = true;
         correlated[j] = true;
         // Put all FVs of both features into one list and select its top-k percent.
         List<FV> list = new ArrayList<>();
         for (FV fv : fvs) {
           if (fv.getFeature() == i || fv.getFeature() == j) {
             list.add(fv);
           }
         }
         selectSubsetTopKPercent(topk, correlatedFV, list);
       }
     }
   }

   // Add the remaining features, i.e. those that are not correlated with any other feature.
   for (int i = 0; i < correlated.length; i++) {
     if (!correlated[i]) {
       List<FV> list = new ArrayList<>();
       for (FV fv : fvs) {
         if (fv.getFeature() == i) {
           list.add(fv);
         }
       }
       selectSubsetTopKPercent(topk, correlatedFV, list);
     }
   }

   // Merge all correlated and non-correlated FV sets, keeping each FV's original collection
   // rather than a null value.
   for (List<FV> list : correlatedFV) {
     for (FV fv : list) {
       result.put(fv, fv_list.get(fv));
     }
   }
   /*
    * TODO Evaluate the parameters: top-k percent and threshold selection (for correlation).
    */
   return result;
 }