/**
 * Converts a matrix block into a Spark DataFrame, packing OBJECT-typed
 * column groups into ML dense vectors.
 *
 * @param sqlctx SQL context used to create the data frame
 * @param mb source matrix block
 * @param containsID true if an ID column should be prepended
 * @param schema value types of the output columns
 * @return the created data frame
 * @throws DMLRuntimeException if the conversion fails
 */
@SuppressWarnings("resource")
private DataFrame createDataFrame(
    SQLContext sqlctx, MatrixBlock mb, boolean containsID, ValueType[] schema)
    throws DMLRuntimeException {
  // create in-memory list of rows
  List<Row> list = new ArrayList<Row>();
  int off = (containsID ? 1 : 0);
  // colsVector (a field of the enclosing class) columns are packed into one vector column
  int clen = mb.getNumColumns() + off - colsVector + 1;

  for (int i = 0; i < mb.getNumRows(); i++) {
    Object[] row = new Object[clen];
    if (containsID)
      row[0] = (double) (i + 1); // the ID column is declared as DoubleType below
    for (int j = 0, j2 = 0; j < mb.getNumColumns(); j++, j2++) {
      if (schema[j2] != ValueType.OBJECT) {
        row[j2 + off] = UtilFunctions.doubleToObject(schema[j2], mb.quickGetValue(i, j));
      } else {
        // slice out a group of colsVector columns and wrap it as a dense ML vector
        double[] tmp = DataConverter.convertToDoubleVector(
            mb.sliceOperations(i, i, j, j + colsVector - 1, new MatrixBlock()));
        row[j2 + off] = new DenseVector(tmp);
        j += colsVector - 1;
      }
    }
    list.add(RowFactory.create(row));
  }

  // create data frame schema
  List<StructField> fields = new ArrayList<StructField>();
  if (containsID)
    fields.add(DataTypes.createStructField(
        RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
  for (int j = 0; j < schema.length; j++) {
    DataType dt = null;
    switch (schema[j]) {
      case STRING: dt = DataTypes.StringType; break;
      case DOUBLE: dt = DataTypes.DoubleType; break;
      case INT:    dt = DataTypes.LongType;   break;
      case OBJECT: dt = new VectorUDT();      break;
      default: throw new RuntimeException("Unsupported value type: " + schema[j]);
    }
    fields.add(DataTypes.createStructField("C" + (j + 1), dt, true));
  }
  StructType dfSchema = DataTypes.createStructType(fields);

  // create rdd and data frame
  JavaSparkContext sc = new JavaSparkContext(sqlctx.sparkContext());
  JavaRDD<Row> rowRDD = sc.parallelize(list);
  return sqlctx.createDataFrame(rowRDD, dfSchema);
}
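A minimal caller sketch for the converter above. The 2x2 all-double block and the variable names are illustrative assumptions, not part of the original code; MatrixBlock and ValueType are the SystemML types already used by the method.

// Hypothetical usage from within the same class (sqlctx assumed available):
MatrixBlock mb = new MatrixBlock(2, 2, false); // 2x2 dense block
mb.quickSetValue(0, 0, 1.0);
mb.quickSetValue(1, 1, 2.0);
ValueType[] schema = new ValueType[] { ValueType.DOUBLE, ValueType.DOUBLE };
DataFrame df = createDataFrame(sqlctx, mb, true, schema);
df.show(); // ID column plus C1, C2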
/**
 * Converts a Hadoop sequence file of atom-pair records into a partitioned,
 * snappy-compressed Parquet file.
 *
 * @param args args[0] is the path of the Hadoop sequence file
 * @throws FileNotFoundException if the input file cannot be found
 */
public static void main(String[] args) throws FileNotFoundException {
  String path = args[0];
  JavaSparkContext sc = getSparkContext(); // sc is an existing JavaSparkContext
  SQLContext sqlContext = new org.apache.spark.sql.SQLContext(sc);
  sqlContext.setConf("spark.sql.parquet.compression.codec", "snappy");
  sqlContext.setConf("spark.sql.parquet.filterPushdown", "true");

  long start = System.nanoTime();

  // read sequence file, group values by key, and map each group to a Row
  JavaRDD<Row> rowRDD = sc.sequenceFile(path, Text.class, Text.class)
      // .sample(false, 0.01, 123)
      .mapToPair(t -> new Tuple2<String, String>(t._1.toString(), t._2.toString()))
      .groupByKey()
      .map(new HadoopToParqRow())
      .cache();

  // create data fields of features for the DataFrame
  List<StructField> fields = new ArrayList<StructField>();
  fields.add(DataTypes.createStructField("index", DataTypes.StringType, false));
  fields.add(DataTypes.createStructField("chainId1", DataTypes.StringType, false));
  fields.add(DataTypes.createStructField("chainId2", DataTypes.StringType, false));
  fields.add(DataTypes.createStructField("Rnum1", DataTypes.StringType, false));
  fields.add(DataTypes.createStructField("Rnum2", DataTypes.StringType, false));
  fields.add(DataTypes.createStructField("Ins1", DataTypes.StringType, false));
  fields.add(DataTypes.createStructField("Ins2", DataTypes.StringType, false));
  fields.add(DataTypes.createStructField("res1", DataTypes.StringType, false));
  fields.add(DataTypes.createStructField("res2", DataTypes.StringType, false));
  fields.add(DataTypes.createStructField("atom1", DataTypes.StringType, false));
  fields.add(DataTypes.createStructField("atom2", DataTypes.StringType, false));
  fields.add(DataTypes.createStructField("element1", DataTypes.StringType, false));
  fields.add(DataTypes.createStructField("element2", DataTypes.StringType, false));
  fields.add(DataTypes.createStructField("distance", DataTypes.IntegerType, false));
  fields.add(DataTypes.createStructField(
      "pdbId", DataTypes.createArrayType(DataTypes.StringType), false));
  StructType schema = DataTypes.createStructType(fields);

  // apply the schema to the RDD and write one Parquet file per "index" partition
  DataFrame dataFrame = sqlContext.createDataFrame(rowRDD, schema);
  dataFrame
      .coalesce(1)
      .write()
      .mode(SaveMode.Overwrite)
      .partitionBy("index")
      .parquet("/Users/hina/Data/ExampleFiles/seq.parquet");

  sc.close();
  System.out.println("Time: " + (System.nanoTime() - start) / 1E9 + " sec.");
}
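Since the snippet enables spark.sql.parquet.filterPushdown, a simple predicate on the written file can be evaluated inside the Parquet scan. A hedged read-back sketch using the Spark 1.x DataFrameReader API; the threshold is illustrative:

// Hypothetical read-back of the file written above:
DataFrame distances = sqlContext.read()
    .parquet("/Users/hina/Data/ExampleFiles/seq.parquet")
    .filter("distance < 5"); // candidate for Parquet filter pushdown
distances.show();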
public StructType getQuerySchema(SolrQuery query) throws Exception {
  CloudSolrClient solrServer = getSolrClient(zkHost);

  // build up a schema based on the fields requested
  String fieldList = query.getFields();
  String[] fields = null;
  if (fieldList != null) {
    fields = fieldList.split(",");
  } else {
    // no explicit field list: fetch 10 docs from Solr and extract a field list from those
    SolrQuery probeForFieldsQuery = query.getCopy();
    probeForFieldsQuery.remove("distrib");
    probeForFieldsQuery.set("collection", collection);
    probeForFieldsQuery.set("fl", "*");
    probeForFieldsQuery.setStart(0);
    probeForFieldsQuery.setRows(10);
    QueryResponse probeForFieldsResp = solrServer.query(probeForFieldsQuery);
    SolrDocumentList hits = probeForFieldsResp.getResults();
    Set<String> fieldSet = new TreeSet<String>();
    for (SolrDocument hit : hits)
      fieldSet.addAll(hit.getFieldNames());
    fields = fieldSet.toArray(new String[0]);
  }

  if (fields == null || fields.length == 0)
    throw new IllegalArgumentException(
        "Query (" + query + ") does not specify any fields needed to build a schema!");

  Set<String> liveNodes = solrServer.getZkStateReader().getClusterState().getLiveNodes();
  if (liveNodes.isEmpty())
    throw new RuntimeException("No live nodes found for cluster: " + zkHost);
  String solrBaseUrl =
      solrServer.getZkStateReader().getBaseUrlForNodeName(liveNodes.iterator().next());
  if (!solrBaseUrl.endsWith("/"))
    solrBaseUrl += "/";

  // map each Solr field type to a Spark SQL type, defaulting to String
  Map<String, String> fieldTypeMap = getFieldTypes(fields, solrBaseUrl, collection);
  List<StructField> listOfFields = new ArrayList<StructField>();
  for (String field : fields) {
    String fieldType = fieldTypeMap.get(field);
    DataType dataType = (fieldType != null) ? solrDataTypes.get(fieldType) : null;
    if (dataType == null)
      dataType = DataTypes.StringType;
    listOfFields.add(DataTypes.createStructField(field, dataType, true));
  }
  return DataTypes.createStructType(listOfFields);
}
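A hypothetical caller for the schema builder above; the field names are illustrative assumptions:

// Build a schema for an explicit field list, or let the method probe Solr:
SolrQuery query = new SolrQuery("*:*");
query.setFields("id", "title", "price"); // omit this line to trigger the 10-doc probe
StructType schema = getQuerySchema(query);
schema.printTreeString();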
private DataFrame artistsAsDataFrame() {
  String input = TestUtils.sampleArtistsDat();
  JavaRDD<String> data = sc.textFile(input);

  StructType schema = DataTypes.createStructType(new StructField[] {
      DataTypes.createStructField("id", DataTypes.IntegerType, false),
      DataTypes.createStructField("name", DataTypes.StringType, false),
      DataTypes.createStructField("url", DataTypes.StringType, true),
      DataTypes.createStructField("pictures", DataTypes.StringType, true),
      DataTypes.createStructField("time", DataTypes.TimestampType, true) });

  // split each tab-separated line, then convert the fields into a typed Row
  JavaRDD<Row> rowData = data
      .map(new Function<String, String[]>() {
        @Override
        public String[] call(String line) throws Exception {
          return line.split("\t");
        }
      })
      .map(new Function<String[], Row>() {
        @Override
        public Row call(String[] r) throws Exception {
          return RowFactory.create(
              Integer.parseInt(r[0]), r[1], r[2], r[3],
              new Timestamp(DatatypeConverter.parseDateTime(r[4]).getTimeInMillis()));
        }
      });

  return sqc.createDataFrame(rowData, schema);
}
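With Java 8 the two anonymous Function classes above collapse into a single lambda. A sketch under the assumption that the same sc/sqc fields and tab-separated input are available:

// Equivalent single-pass lambda version:
JavaRDD<Row> rowData = sc.textFile(input).map(line -> {
  String[] r = line.split("\t");
  return RowFactory.create(
      Integer.parseInt(r[0]), r[1], r[2], r[3],
      new Timestamp(DatatypeConverter.parseDateTime(r[4]).getTimeInMillis()));
});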
public static void main(String[] args) {
  SparkConf conf = SparkConfBuilder.buildLocal("movie-lense-df");
  JavaSparkContext sc = new JavaSparkContext(conf);
  SQLContext sqlContext = new SQLContext(sc);

  // u.data: user id | item id | rating | timestamp
  JavaRDD<Row> dataRdd = sc.textFile("data/ml-100k/u.data")
      .map(line -> {
        String[] col = line.split("\t");
        return RowFactory.create(
            Integer.valueOf(col[0]), Integer.valueOf(col[1]),
            Integer.valueOf(col[2]), Long.valueOf(col[3]));
      });

  List<StructField> dataFields = new ArrayList<>();
  dataFields.add(DataTypes.createStructField("userId", DataTypes.IntegerType, false));
  dataFields.add(DataTypes.createStructField("itemId", DataTypes.IntegerType, true));
  dataFields.add(DataTypes.createStructField("rating", DataTypes.IntegerType, true));
  dataFields.add(DataTypes.createStructField("timestamp", DataTypes.LongType, true));
  StructType dataSchema = DataTypes.createStructType(dataFields);
  DataFrame data = sqlContext.createDataFrame(dataRdd, dataSchema);
  data.show();

  // u.item: movie id | movie title | release date | video release date |
  // IMDb URL | unknown | Action | Adventure | Animation |
  // Children's | Comedy | Crime | Documentary | Drama | Fantasy |
  // Film-Noir | Horror | Musical | Mystery | Romance | Sci-Fi |
  // Thriller | War | Western
  JavaRDD<Row> itemRdd = sc.textFile("data/ml-100k/u.item")
      .map(line -> {
        String[] col = line.split("\\|");
        return RowFactory.create(
            Integer.valueOf(col[0]), col[1], col[2], col[3], col[4],
            toBool(col[5]), toBool(col[6]), toBool(col[7]), toBool(col[8]),
            toBool(col[9]), toBool(col[10]), toBool(col[11]), toBool(col[12]),
            toBool(col[13]), toBool(col[14]), toBool(col[15]), toBool(col[16]),
            toBool(col[17]), toBool(col[18]), toBool(col[19]), toBool(col[20]),
            toBool(col[21]), toBool(col[22]), toBool(col[23]));
      });

  List<StructField> itemFields = new ArrayList<>();
  itemFields.add(DataTypes.createStructField("movieId", DataTypes.IntegerType, false));
  itemFields.add(DataTypes.createStructField("movieTitle", DataTypes.StringType, true));
  itemFields.add(DataTypes.createStructField("releaseDate", DataTypes.StringType, true));
  itemFields.add(DataTypes.createStructField("videoReleaseDate", DataTypes.StringType, true));
  itemFields.add(DataTypes.createStructField("imdbUrl", DataTypes.StringType, true));
  itemFields.add(DataTypes.createStructField("unknown", DataTypes.BooleanType, true));
  itemFields.add(DataTypes.createStructField("action", DataTypes.BooleanType, true));
  itemFields.add(DataTypes.createStructField("adventure", DataTypes.BooleanType, true));
  itemFields.add(DataTypes.createStructField("animation", DataTypes.BooleanType, true));
  itemFields.add(DataTypes.createStructField("childrens", DataTypes.BooleanType, true));
  itemFields.add(DataTypes.createStructField("comedy", DataTypes.BooleanType, true));
  itemFields.add(DataTypes.createStructField("crime", DataTypes.BooleanType, true));
  itemFields.add(DataTypes.createStructField("documentary", DataTypes.BooleanType, true));
  itemFields.add(DataTypes.createStructField("drama", DataTypes.BooleanType, true));
  itemFields.add(DataTypes.createStructField("fantasy", DataTypes.BooleanType, true));
  itemFields.add(DataTypes.createStructField("filmNoir", DataTypes.BooleanType, true));
  itemFields.add(DataTypes.createStructField("horror", DataTypes.BooleanType, true));
  itemFields.add(DataTypes.createStructField("musical", DataTypes.BooleanType, true));
  itemFields.add(DataTypes.createStructField("mystery", DataTypes.BooleanType, true));
  itemFields.add(DataTypes.createStructField("romance", DataTypes.BooleanType, true));
  itemFields.add(DataTypes.createStructField("sciFi", DataTypes.BooleanType, true));
  itemFields.add(DataTypes.createStructField("thriller", DataTypes.BooleanType, true));
  itemFields.add(DataTypes.createStructField("war", DataTypes.BooleanType, true));
  itemFields.add(DataTypes.createStructField("western", DataTypes.BooleanType, true));
  StructType itemSchema = DataTypes.createStructType(itemFields);
  DataFrame item = sqlContext.createDataFrame(itemRdd, itemSchema);
  item.show();

  // u.user: user id | age | gender | occupation | zip code
  JavaRDD<Row> userRdd = sc.textFile("data/ml-100k/u.user")
      .map(line -> {
        String[] col = line.split("\\|");
        return RowFactory.create(
            Integer.valueOf(col[0]), Integer.valueOf(col[1]), col[2], col[3], col[4]);
      });

  List<StructField> userFields = new ArrayList<>();
  userFields.add(DataTypes.createStructField("userId", DataTypes.IntegerType, false));
  userFields.add(DataTypes.createStructField("age", DataTypes.IntegerType, true));
  userFields.add(DataTypes.createStructField("gender", DataTypes.StringType, true));
  userFields.add(DataTypes.createStructField("occupation", DataTypes.StringType, true));
  userFields.add(DataTypes.createStructField("zipCode", DataTypes.StringType, true));
  StructType userSchema = DataTypes.createStructType(userFields);
  DataFrame user = sqlContext.createDataFrame(userRdd, userSchema);
  user.show();
}
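The example relies on a toBool helper that is not shown. A plausible minimal version, assuming the MovieLens genre flags arrive as "0"/"1" strings:

// Hypothetical helper (assumption: genre columns are "0"/"1"):
private static Boolean toBool(String s) {
  return "1".equals(s.trim());
}

// The three frames can then be combined, e.g. (illustrative):
// data.join(item, data.col("itemId").equalTo(item.col("movieId"))).show();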