private static void runParquetSchemaMergingExample(SparkSession spark) { // $example on:schema_merging$ List<Square> squares = new ArrayList<>(); for (int value = 1; value <= 5; value++) { Square square = new Square(); square.setValue(value); square.setSquare(value * value); squares.add(square); } // Create a simple DataFrame, store into a partition directory Dataset<Row> squaresDF = spark.createDataFrame(squares, Square.class); squaresDF.write().parquet("data/test_table/key=1"); List<Cube> cubes = new ArrayList<>(); for (int value = 6; value <= 10; value++) { Cube cube = new Cube(); cube.setValue(value); cube.setCube(value * value * value); cubes.add(cube); } // Create another DataFrame in a new partition directory, // adding a new column and dropping an existing column Dataset<Row> cubesDF = spark.createDataFrame(cubes, Cube.class); cubesDF.write().parquet("data/test_table/key=2"); // Read the partitioned table Dataset<Row> mergedDF = spark.read().option("mergeSchema", true).parquet("data/test_table"); mergedDF.printSchema(); // The final schema consists of all 3 columns in the Parquet files together // with the partitioning column appeared in the partition directory paths // root // |-- value: int (nullable = true) // |-- square: int (nullable = true) // |-- cube: int (nullable = true) // |-- key: int (nullable = true) // $example off:schema_merging$ }
private static void runJsonDatasetExample(SparkSession spark) { // $example on:json_dataset$ // A JSON dataset is pointed to by path. // The path can be either a single text file or a directory storing text files Dataset<Row> people = spark.read().json("examples/src/main/resources/people.json"); // The inferred schema can be visualized using the printSchema() method people.printSchema(); // root // |-- age: long (nullable = true) // |-- name: string (nullable = true) // Creates a temporary view using the DataFrame people.createOrReplaceTempView("people"); // SQL statements can be run by using the sql methods provided by spark Dataset<Row> namesDF = spark.sql("SELECT name FROM people WHERE age BETWEEN 13 AND 19"); namesDF.show(); // +------+ // | name| // +------+ // |Justin| // +------+ // Alternatively, a DataFrame can be created for a JSON dataset represented by // an RDD[String] storing one JSON object per string. List<String> jsonData = Arrays.asList("{\"name\":\"Yin\",\"address\":{\"city\":\"Columbus\",\"state\":\"Ohio\"}}"); JavaRDD<String> anotherPeopleRDD = new JavaSparkContext(spark.sparkContext()).parallelize(jsonData); Dataset anotherPeople = spark.read().json(anotherPeopleRDD); anotherPeople.show(); // +---------------+----+ // | address|name| // +---------------+----+ // |[Columbus,Ohio]| Yin| // +---------------+----+ // $example off:json_dataset$ }