/**
 * Exercises {@code DataFrameStatFunctions.sampleBy} on a derived "key" column
 * (id mod 3) with per-key fractions of 0.1 for key 0 and 0.2 for key 1, seed 0.
 *
 * The sampled row counts are checked against tolerance ranges rather than exact
 * values: the exact counts depend on the sampler's random number generation, so
 * pinning them makes the test fail on any change to that implementation even
 * when the sampling itself is still correct.
 */
@Test
public void testSampleBy() {
  DataFrame df = context.range(0, 100, 1, 2).select(col("id").mod(3).as("key"));
  DataFrame sampled = df.stat().<Integer>sampleBy("key", ImmutableMap.of(0, 0.1, 1, 0.2), 0L);
  Row[] actual = sampled.groupBy("key").count().orderBy("key").collect();

  // Only the two keys listed in the fraction map are expected in the result.
  Assert.assertEquals(2, actual.length);

  // Key 0, sampled with fraction 0.1.
  Assert.assertEquals(0, actual[0].getLong(0));
  long key0Count = actual[0].getLong(1);
  Assert.assertTrue(0 <= key0Count && key0Count <= 8);

  // Key 1, sampled with fraction 0.2.
  Assert.assertEquals(1, actual[1].getLong(0));
  long key1Count = actual[1].getLong(1);
  Assert.assertTrue(2 <= key1Count && key1Count <= 13);
}
/**
 * Samples the "key" column (id mod 3) with fractions 0.1 for key 0 and 0.2 for
 * key 1 using seed 0, then verifies that the per-key row counts, ordered by
 * key, fall inside the accepted ranges.
 */
@Test
public void testSampleBy() {
  DataFrame keyed = context.range(0, 100, 1, 2).select(col("id").mod(3).as("key"));
  DataFrame sampled =
    keyed.stat().<Integer>sampleBy("key", ImmutableMap.of(0, 0.1, 1, 0.2), 0L);
  Row[] countsByKey = sampled.groupBy("key").count().orderBy("key").collect();

  // First row: key 0, count expected within [0, 8].
  Row firstRow = countsByKey[0];
  Assert.assertEquals(0, firstRow.getLong(0));
  long firstCount = firstRow.getLong(1);
  Assert.assertTrue(0 <= firstCount && firstCount <= 8);

  // Second row: key 1, count expected within [2, 13].
  Row secondRow = countsByKey[1];
  Assert.assertEquals(1, secondRow.getLong(0));
  long secondCount = secondRow.getLong(1);
  Assert.assertTrue(2 <= secondCount && secondCount <= 13);
}
/**
 * Pivots the "courseSales" table on "course" (restricted to the values
 * "dotNET" and "Java"), summing "earnings" per "year", and checks the two
 * resulting rows in year order.
 */
@Test
public void pivot() {
  DataFrame courseSales = context.table("courseSales");
  DataFrame earningsByYear = courseSales
    .groupBy("year")
    .pivot("course", Arrays.<Object>asList("dotNET", "Java"))
    .agg(sum("earnings"));
  Row[] rows = earningsByYear.orderBy("year").collect();

  // Row layout: year, dotNET earnings, Java earnings.
  Row row2012 = rows[0];
  Assert.assertEquals(2012, row2012.getInt(0));
  Assert.assertEquals(15000.0, row2012.getDouble(1), 0.01);
  Assert.assertEquals(20000.0, row2012.getDouble(2), 0.01);

  Row row2013 = rows[1];
  Assert.assertEquals(2013, row2013.getInt(0));
  Assert.assertEquals(48000.0, row2013.getDouble(1), 0.01);
  Assert.assertEquals(30000.0, row2013.getDouble(2), 0.01);
}
/** See SPARK-5904. Abstract vararg methods defined in Scala do not work in Java. */ @Test public void testVarargMethods() { DataFrame df = context.table("testData"); df.toDF("key1", "value1"); df.select("key", "value"); df.select(col("key"), col("value")); df.selectExpr("key", "value + 1"); df.sort("key", "value"); df.sort(col("key"), col("value")); df.orderBy("key", "value"); df.orderBy(col("key"), col("value")); df.groupBy("key", "value").agg(col("key"), col("value"), sum("value")); df.groupBy(col("key"), col("value")).agg(col("key"), col("value"), sum("value")); df.agg(first("key"), sum("value")); df.groupBy().avg("key"); df.groupBy().mean("key"); df.groupBy().max("key"); df.groupBy().min("key"); df.groupBy().stddev("key"); df.groupBy().sum("key"); // Varargs in column expressions df.groupBy().agg(countDistinct("key", "value")); df.groupBy().agg(countDistinct(col("key"), col("value"))); df.select(coalesce(col("key"))); // Varargs with mathfunctions DataFrame df2 = context.table("testData2"); df2.select(exp("a"), exp("b")); df2.select(exp(log("a"))); df2.select(pow("a", "a"), pow("b", 2.0)); df2.select(pow(col("a"), col("b")), exp("b")); df2.select(sin("a"), acos("b")); df2.select(rand(), acos("b")); df2.select(col("*"), randn(5L)); }