private static void runBasicParquetExample(SparkSession spark) { // $example on:basic_parquet_example$ Dataset<Row> peopleDF = spark.read().json("examples/src/main/resources/people.json"); // DataFrames can be saved as Parquet files, maintaining the schema information peopleDF.write().parquet("people.parquet"); // Read in the Parquet file created above. // Parquet files are self-describing so the schema is preserved // The result of loading a parquet file is also a DataFrame Dataset<Row> parquetFileDF = spark.read().parquet("people.parquet"); // Parquet files can also be used to create a temporary view and then used in SQL statements parquetFileDF.createOrReplaceTempView("parquetFile"); Dataset<Row> namesDF = spark.sql("SELECT name FROM parquetFile WHERE age BETWEEN 13 AND 19"); Dataset<String> namesDS = namesDF.map( new MapFunction<Row, String>() { public String call(Row row) { return "Name: " + row.getString(0); } }, Encoders.STRING()); namesDS.show(); // +------------+ // | value| // +------------+ // |Name: Justin| // +------------+ // $example off:basic_parquet_example$ }
@Test public void testCommonOperation() { List<String> data = Arrays.asList("hello", "world"); Dataset<String> ds = context.createDataset(data, Encoders.STRING()); Assert.assertEquals("hello", ds.first()); Dataset<String> filtered = ds.filter( new FilterFunction<String>() { @Override public boolean call(String v) throws Exception { return v.startsWith("h"); } }); Assert.assertEquals(Arrays.asList("hello"), filtered.collectAsList()); Dataset<Integer> mapped = ds.map( new MapFunction<String, Integer>() { @Override public Integer call(String v) throws Exception { return v.length(); } }, Encoders.INT()); Assert.assertEquals(Arrays.asList(5, 5), mapped.collectAsList()); Dataset<String> parMapped = ds.mapPartitions( new MapPartitionsFunction<String, String>() { @Override public Iterable<String> call(Iterator<String> it) throws Exception { List<String> ls = new LinkedList<String>(); while (it.hasNext()) { ls.add(it.next().toUpperCase()); } return ls; } }, Encoders.STRING()); Assert.assertEquals(Arrays.asList("HELLO", "WORLD"), parMapped.collectAsList()); Dataset<String> flatMapped = ds.flatMap( new FlatMapFunction<String, String>() { @Override public Iterable<String> call(String s) throws Exception { List<String> ls = new LinkedList<String>(); for (char c : s.toCharArray()) { ls.add(String.valueOf(c)); } return ls; } }, Encoders.STRING()); Assert.assertEquals( Arrays.asList("h", "e", "l", "l", "o", "w", "o", "r", "l", "d"), flatMapped.collectAsList()); }