Exemplo n.º 1
0
 @Test
 public void testSampleBy() {
   DataFrame df = context.range(0, 100, 1, 2).select(col("id").mod(3).as("key"));
   DataFrame sampled = df.stat().<Integer>sampleBy("key", ImmutableMap.of(0, 0.1, 1, 0.2), 0L);
   Row[] actual = sampled.groupBy("key").count().orderBy("key").collect();
   Row[] expected = {RowFactory.create(0, 5), RowFactory.create(1, 8)};
   Assert.assertArrayEquals(expected, actual);
 }
Exemplo n.º 2
0
 @Test
 public void testSampleBy() {
   DataFrame df = context.range(0, 100, 1, 2).select(col("id").mod(3).as("key"));
   DataFrame sampled = df.stat().<Integer>sampleBy("key", ImmutableMap.of(0, 0.1, 1, 0.2), 0L);
   Row[] actual = sampled.groupBy("key").count().orderBy("key").collect();
   Assert.assertEquals(0, actual[0].getLong(0));
   Assert.assertTrue(0 <= actual[0].getLong(1) && actual[0].getLong(1) <= 8);
   Assert.assertEquals(1, actual[1].getLong(0));
   Assert.assertTrue(2 <= actual[1].getLong(1) && actual[1].getLong(1) <= 13);
 }
Exemplo n.º 3
0
  @Test
  public void pivot() {
    DataFrame df = context.table("courseSales");
    Row[] actual =
        df.groupBy("year")
            .pivot("course", Arrays.<Object>asList("dotNET", "Java"))
            .agg(sum("earnings"))
            .orderBy("year")
            .collect();

    Assert.assertEquals(2012, actual[0].getInt(0));
    Assert.assertEquals(15000.0, actual[0].getDouble(1), 0.01);
    Assert.assertEquals(20000.0, actual[0].getDouble(2), 0.01);

    Assert.assertEquals(2013, actual[1].getInt(0));
    Assert.assertEquals(48000.0, actual[1].getDouble(1), 0.01);
    Assert.assertEquals(30000.0, actual[1].getDouble(2), 0.01);
  }
Exemplo n.º 4
0
  /** See SPARK-5904. Abstract vararg methods defined in Scala do not work in Java. */
  @Test
  public void testVarargMethods() {
    DataFrame df = context.table("testData");

    df.toDF("key1", "value1");

    df.select("key", "value");
    df.select(col("key"), col("value"));
    df.selectExpr("key", "value + 1");

    df.sort("key", "value");
    df.sort(col("key"), col("value"));
    df.orderBy("key", "value");
    df.orderBy(col("key"), col("value"));

    df.groupBy("key", "value").agg(col("key"), col("value"), sum("value"));
    df.groupBy(col("key"), col("value")).agg(col("key"), col("value"), sum("value"));
    df.agg(first("key"), sum("value"));

    df.groupBy().avg("key");
    df.groupBy().mean("key");
    df.groupBy().max("key");
    df.groupBy().min("key");
    df.groupBy().stddev("key");
    df.groupBy().sum("key");

    // Varargs in column expressions
    df.groupBy().agg(countDistinct("key", "value"));
    df.groupBy().agg(countDistinct(col("key"), col("value")));
    df.select(coalesce(col("key")));

    // Varargs with mathfunctions
    DataFrame df2 = context.table("testData2");
    df2.select(exp("a"), exp("b"));
    df2.select(exp(log("a")));
    df2.select(pow("a", "a"), pow("b", 2.0));
    df2.select(pow(col("a"), col("b")), exp("b"));
    df2.select(sin("a"), acos("b"));

    df2.select(rand(), acos("b"));
    df2.select(col("*"), randn(5L));
  }