/*
     Each operation is keyed by the name of the new field to be created.
     The operation itself takes one of the following forms:
         FIELD_NAME              -- a field to be used as a dimension (a factor or group)
         FIELD_NAME : base       -- a field to be used as a dimension AND a base for percentages
         FIELD_NAME : transform  -- a measure to use to transform the field (e.g. 'mean', 'count', ...)

         Note that an empty field is legal for the count transform
  */
  public static Dataset transform(Dataset base, String command) {
    // A dataset without rows is handed back untouched
    if (base.rowCount() == 0) return base;

    // A command that yields no operations also leaves the dataset as it is
    List<String[]> operations = map(command);
    if (operations.isEmpty()) return base;

    // Collections that receive the decoded operations
    List<MeasureField> measures = new ArrayList<>();
    List<DimensionField> dimensions = new ArrayList<>();
    List<Field> percentBase = new ArrayList<>();

    // Remember whether the command itself defines any of the special fields
    boolean hasCount = false;
    boolean hasRow = false;
    boolean hasSelection = false;

    for (String[] op : operations) {
      String target = op[0];
      hasCount |= target.equals("#count");
      hasRow |= target.equals("#row");
      hasSelection |= target.equals("#selection");

      // op[1] is split on ':' into the source field, an optional role and an optional option
      String[] parts = op[1].split(":");
      Field source = base.field(parts[0].trim());

      // A bare field name is a plain dimension
      if (parts.length == 1) {
        dimensions.add(new DimensionField(source, target));
        continue;
      }

      // "base" marks a dimension that is also recorded as a percentage base
      String role = parts[1].trim();
      if (role.equals("base")) {
        dimensions.add(new DimensionField(source, target));
        percentBase.add(source);
        continue;
      }

      // Anything else is a measure; a third part, when present, is stored as its option
      MeasureField measure = new MeasureField(source, target, role);
      if (parts.length > 2) {
        measure.option = parts[2].trim();
      }
      measures.add(measure);
    }

    Collections.sort(measures);
    Collections.sort(dimensions);

    // The special fields are appended after sorting whenever the command did not define them
    if (!hasCount) {
      measures.add(new MeasureField(base.field("#count"), "#count", "sum"));
    }
    if (!hasRow) {
      measures.add(new MeasureField(base.field("#row"), "#row", "list"));
    }
    if (!hasSelection) {
      measures.add(new MeasureField(base.field("#selection"), "#selection", "mode"));
    }

    Field[] summarized = new Summarize(measures, dimensions, percentBase, base.rowCount()).make();
    return base.replaceFields(summarized);
  }
Exemple #2
0
  @Test
  public void testSimpleCount() {
    // Count with an empty source field: a single summary row is expected
    String expectedLower = "count|#count|#row -- 25|25|1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, \u2026";
    Dataset lowerCase = Summarize.transform(data, "count = : count");
    assertEquals(expectedLower, CannedData.dump(lowerCase));
    assertEquals(true, lowerCase.fields[0].isNumeric());

    // The new field's name is expected to keep the case given in the command
    String expectedUpper = "COUNT|#count|#row -- 25|25|1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, \u2026";
    Dataset upperCase = Summarize.transform(data, "COUNT = : count");
    assertEquals(expectedUpper, CannedData.dump(upperCase));

    // Counting a named field is expected to give the same single row
    String expectedNamed = "g|#count|#row -- 25|25|1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, \u2026";
    Dataset namedField = Summarize.transform(data, "g=gender:count");
    assertEquals(expectedNamed, CannedData.dump(namedField));
  }
Exemple #3
0
  @Test
  public void testGroups() {
    // One grouping dimension: one expected row per gender value
    String expectedByGender =
        "gender|count|#count|#row -- "
            + "Female|12|12|3, 4, 8, 9, 10, 11, 14, 20, 21, 23, 24, 25 -- "
            + "Male|13|13|1, 2, 5, 6, 7, 12, 13, 15, 16, 17, 18, \u2026";
    Dataset byGender = Summarize.transform(data, "gender=gender; count=:count");
    assertEquals(expectedByGender, CannedData.dump(byGender));

    // Two grouping dimensions: the expected output lists educ before gender
    String expectedByBoth =
        "educ|gender|count|#count|#row -- "
            + "8|Female|1|1|4 -- 8|Male|1|1|12 -- 12|Female|5|5|3, 8, 10, 20, 24 -- 12|Male|4|4|15, 16, 19, 22 -- "
            + "15|Female|4|4|9, 14, 23, 25 -- 15|Male|6|6|1, 5, 6, 7, 13, 17 -- 16|Female|2|2|11, 21 -- 16|Male|2|2|2, 18";
    Dataset byGenderAndEduc = Summarize.transform(data, "gender=gender; educ=educ; count=:count");
    assertEquals(expectedByBoth, CannedData.dump(byGenderAndEduc));
  }
Exemple #4
0
 @Test
 public void testPercentOverall() {
   // Percentages requested with the "overall" option while gender is the base dimension
   String command = "gender=gender:base; #percent=#count:percent:overall";
   String expected =
       "gender|#percent|#count|#row -- "
           + "Female|48%|12|3, 4, 8, 9, 10, 11, 14, 20, 21, 23, 24, 25 -- "
           + "Male|52%|13|1, 2, 5, 6, 7, 12, 13, 15, 16, 17, 18, \u2026";
   Dataset summary = Summarize.transform(data, command);
   assertEquals(expected, CannedData.dump(summary));
 }
Exemple #5
0
 /*
  * Possible summaries are:
  * [numeric] mean, min, max, range, iqr, median, stddev
  * [any] count, valid, mode, unique
  */
 @Test
 public void testRangeStats() {
   // The range and iqr summaries of salary are expected to format as "low…high" per gender
   String command = "gender=gender; a=salary:range; b=salary:iqr";
   String expected =
       "gender|a|b|#count|#row -- "
           + "Female|16,950\u202638,850|21,675\u202629,100|12|3, 4, 8, 9, 10, 11, 14, 20, 21, 23, 24, 25 -- "
           + "Male|21,750\u2026103,750|28,350\u202645,000|13|1, 2, 5, 6, 7, 12, 13, 15, 16, 17, 18, \u2026";
   Dataset summary = Summarize.transform(data, command);
   assertEquals(expected, CannedData.dump(summary));
 }
Exemple #6
0
 @Test
 public void testEmptyData() {
   // A two-row column "x" paired with a constant field "y" built from a null value
   Field x = Fields.makeColumnField("x", null, new Object[] {1, 2});
   Field y = Fields.makeConstantField("y", null, null, 2);
   Dataset source = Dataset.make(new Field[] {x, y});

   // Expected: '?' for min, sum, iqr and median of y, and 0 for valid
   String command = "x=x; y1=y:min; y2=y:sum; y3=y:iqr; y4=y:valid; y5=y:median";
   String expected = "x|y1|y2|y3|y4|y5|#count|#row -- " + "1|?|?|?|0|?|1|1 -- 2|?|?|?|0|?|1|2";
   Dataset summary = Summarize.transform(source, command);
   assertEquals(expected, CannedData.dump(summary));
 }
Exemple #7
0
 @Test
 public void testPercentDifferentBases() {
   // jobcat is the percentage base while gender is a plain dimension
   String command = "gender=gender; jobcat=jobcat:base; #percent=#count:percent";
   String expected =
       "gender|jobcat|#percent|#count|#row -- "
           + "Female|Clerical|52.2%|12|3, 4, 8, 9, 10, 11, 14, 20, 21, 23, 24, 25 -- "
           + "Male|Clerical|47.8%|11|2, 5, 6, 7, 12, 13, 15, 16, 17, 19, 22 -- "
           + "Male|Manager|100%|2|1, 18";
   Dataset summary = Summarize.transform(data, command);
   assertEquals(expected, CannedData.dump(summary));
 }
Exemple #8
0
 @Test
 public void testSimpleStatsNonNumeric() {
   // Summaries applied to the jobcat field, grouped by gender; several are expected to show '?'
   String command =
       "gender = gender; a = jobcat: mean; b = jobcat:min;  d = jobcat: valid; e = jobcat:median; f = jobcat:stddev; g = jobcat:unique ; h = jobcat:mode";
   String expected =
       "gender|a|b|d|e|f|g|h|#count|#row -- "
           + "Female|Clerical|?|12|?|?|1|Clerical|12|3, 4, 8, 9, 10, 11, 14, 20, 21, 23, 24, 25 -- "
           + "Male|Clerical|?|13|?|?|2|Clerical|13|1, 2, 5, 6, 7, 12, 13, 15, 16, 17, 18, \u2026";
   Dataset summary = Summarize.transform(data, command);
   assertEquals(expected, CannedData.dump(summary));
 }
Exemple #9
0
 /*
  * Possible summaries are:
  * [numeric] mean, min, max, range, iqr, median, stddev
  * [any] count, valid, mode, unique
  */
 @Test
 public void testSimpleStats() {
   // Summaries applied to the educ field, grouped by gender
   String command =
       "gender = gender; a = educ: mean; b = educ:min; c = educ:max; d = educ: valid; e = educ:median; f = educ:stddev; g = educ:unique ; h = educ:mode";
   String expected =
       "gender|a|b|c|d|e|f|g|h|#count|#row -- "
           + "Female|13.333333|8|16|12|13.5|2.3868326|4|12|12|3, 4, 8, 9, 10, 11, 14, 20, 21, 23, 24, 25 -- "
           + "Male|13.692308|8|16|13|15|2.3232382|4|15|13|1, 2, 5, 6, 7, 12, 13, 15, 16, 17, 18, \u2026";
   Dataset summary = Summarize.transform(data, command);
   assertEquals(expected, CannedData.dump(summary));
 }
Exemple #10
0
  @Test
  public void testListedDatesPreserveFormat() {
    // Field "a" is expected to format as years, field "b" as days
    Field yearsField = Fields.makeColumnField("a", "A", new Object[] {"1932-1-1", "2033-2-2"});
    Field daysField = Fields.makeColumnField("b", "B", new Object[] {"1932-1-1", "1932-2-2"});
    Dataset source = Dataset.make(new Field[] {yearsField, daysField});
    assertEquals("1932", source.fields[0].valueFormatted(0));
    assertEquals("Jan 1, 1932", source.fields[1].valueFormatted(0));

    // After a "list" summary, each listed entry should still use its field's date format
    Dataset listed = Summarize.transform(source, "a = a:list; b = b:list");
    assertEquals("1932, 2033", listed.fields[0].valueFormatted(0));
    assertEquals("Jan 1 1932, Feb 2 1932", listed.fields[1].valueFormatted(0));
  }