/**
 * Demonstrates Spark SQL's generic load/save API. The {@code $example on/off$} markers delimit
 * snippets that are extracted verbatim into the Spark documentation, so the code between them
 * must stay exactly as published.
 *
 * <p>Covers three patterns: loading/saving with the default format (Parquet), explicitly
 * selecting a format via {@code format(...)}, and querying a file directly with SQL.
 *
 * @param spark the active session used to read, write, and run SQL
 */
private static void runBasicDataSourceExample(SparkSession spark) {
   // $example on:generic_load_save_functions$
   Dataset<Row> usersDF = spark.read().load("examples/src/main/resources/users.parquet");
   usersDF.select("name", "favorite_color").write().save("namesAndFavColors.parquet");
   // $example off:generic_load_save_functions$
   // $example on:manual_load_options$
   Dataset<Row> peopleDF =
       spark.read().format("json").load("examples/src/main/resources/people.json");
   peopleDF.select("name", "age").write().format("parquet").save("namesAndAges.parquet");
   // $example off:manual_load_options$
   // $example on:direct_sql$
   Dataset<Row> sqlDF =
       spark.sql("SELECT * FROM parquet.`examples/src/main/resources/users.parquet`");
   // $example off:direct_sql$
   // NOTE(review): sqlDF is never used here — presumably the doc snippet only needs to show the
   // query; confirm the full example class doesn't display it (e.g. via sqlDF.show()).
 }
 @Test
 public void testVectorAssembler() {
   StructType schema =
       createStructType(
           new StructField[] {
             createStructField("id", IntegerType, false),
             createStructField("x", DoubleType, false),
             createStructField("y", new VectorUDT(), false),
             createStructField("name", StringType, false),
             createStructField("z", new VectorUDT(), false),
             createStructField("n", LongType, false)
           });
   Row row =
       RowFactory.create(
           0,
           0.0,
           Vectors.dense(1.0, 2.0),
           "a",
           Vectors.sparse(2, new int[] {1}, new double[] {3.0}),
           10L);
   Dataset<Row> dataset = sqlContext.createDataFrame(Arrays.asList(row), schema);
   VectorAssembler assembler =
       new VectorAssembler()
           .setInputCols(new String[] {"x", "y", "z", "n"})
           .setOutputCol("features");
   Dataset<Row> output = assembler.transform(dataset);
   Assert.assertEquals(
       Vectors.sparse(6, new int[] {1, 2, 4, 5}, new double[] {1.0, 2.0, 3.0, 10.0}),
       output.select("features").first().<Vector>getAs(0));
 }
Example 3
  @Test
  public void testSelect() {
    Dataset<Integer> ds = context.createDataset(Arrays.asList(2, 6), Encoders.INT());

    // Project each value into (value + 1, value rendered as a string) and decode the
    // untyped result back into a strongly typed tuple Dataset.
    Dataset<Tuple2<Integer, String>> selected =
        ds.select(expr("value + 1"), col("value").cast("string"))
            .as(Encoders.tuple(Encoders.INT(), Encoders.STRING()));

    List<Tuple2<Integer, String>> expected = Arrays.asList(tuple2(3, "2"), tuple2(7, "6"));
    Assert.assertEquals(expected, selected.collectAsList());
  }