private static void runBasicDataSourceExample(SparkSession spark) { // $example on:generic_load_save_functions$ Dataset<Row> usersDF = spark.read().load("examples/src/main/resources/users.parquet"); usersDF.select("name", "favorite_color").write().save("namesAndFavColors.parquet"); // $example off:generic_load_save_functions$ // $example on:manual_load_options$ Dataset<Row> peopleDF = spark.read().format("json").load("examples/src/main/resources/people.json"); peopleDF.select("name", "age").write().format("parquet").save("namesAndAges.parquet"); // $example off:manual_load_options$ // $example on:direct_sql$ Dataset<Row> sqlDF = spark.sql("SELECT * FROM parquet.`examples/src/main/resources/users.parquet`"); // $example off:direct_sql$ }
@Test public void testVectorAssembler() { StructType schema = createStructType( new StructField[] { createStructField("id", IntegerType, false), createStructField("x", DoubleType, false), createStructField("y", new VectorUDT(), false), createStructField("name", StringType, false), createStructField("z", new VectorUDT(), false), createStructField("n", LongType, false) }); Row row = RowFactory.create( 0, 0.0, Vectors.dense(1.0, 2.0), "a", Vectors.sparse(2, new int[] {1}, new double[] {3.0}), 10L); Dataset<Row> dataset = sqlContext.createDataFrame(Arrays.asList(row), schema); VectorAssembler assembler = new VectorAssembler() .setInputCols(new String[] {"x", "y", "z", "n"}) .setOutputCol("features"); Dataset<Row> output = assembler.transform(dataset); Assert.assertEquals( Vectors.sparse(6, new int[] {1, 2, 4, 5}, new double[] {1.0, 2.0, 3.0, 10.0}), output.select("features").first().<Vector>getAs(0)); }
@Test public void testSelect() { List<Integer> data = Arrays.asList(2, 6); Dataset<Integer> ds = context.createDataset(data, Encoders.INT()); Dataset<Tuple2<Integer, String>> selected = ds.select(expr("value + 1"), col("value").cast("string")) .as(Encoders.tuple(Encoders.INT(), Encoders.STRING())); Assert.assertEquals(Arrays.asList(tuple2(3, "2"), tuple2(7, "6")), selected.collectAsList()); }