/**
   * Converts a SystemML MatrixBlock into a Spark DataFrame, packing consecutive
   * OBJECT-typed columns into DenseVector columns of width colsVector.
   *
   * @param sqlctx SQL context used to create the data frame
   * @param mb matrix block to convert
   * @param containsID if true, prepend a 1-based row-ID column
   * @param schema value type of each output column
   * @return the resulting data frame with columns C1..Cn (plus an optional ID column)
   * @throws DMLRuntimeException if slicing or conversion fails
   */
  @SuppressWarnings("resource")
  private DataFrame createDataFrame(
      SQLContext sqlctx, MatrixBlock mb, boolean containsID, ValueType[] schema)
      throws DMLRuntimeException {
    // create in-memory list of rows
    List<Row> list = new ArrayList<Row>();
    int off = (containsID ? 1 : 0);
    // colsVector (a field of the enclosing class) is the width of packed vector columns
    int clen = mb.getNumColumns() + off - colsVector + 1;

    for (int i = 0; i < mb.getNumRows(); i++) {
      Object[] row = new Object[clen];
      if (containsID) row[0] = (double) (i + 1); // ID column is declared as DoubleType below
      for (int j = 0, j2 = 0; j < mb.getNumColumns(); j++, j2++) {
        if (schema[j2] != ValueType.OBJECT) {
          row[j2 + off] = UtilFunctions.doubleToObject(schema[j2], mb.quickGetValue(i, j));
        } else {
          double[] tmp =
              DataConverter.convertToDoubleVector(
                  mb.sliceOperations(i, i, j, j + colsVector - 1, new MatrixBlock()));
          row[j2 + off] = new DenseVector(tmp);
          j += colsVector - 1;
        }
      }
      list.add(RowFactory.create(row));
    }

    // create data frame schema
    List<StructField> fields = new ArrayList<StructField>();
    if (containsID)
      fields.add(
          DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
    for (int j = 0; j < schema.length; j++) {
      DataType dt = null;
      switch (schema[j]) {
        case STRING:
          dt = DataTypes.StringType;
          break;
        case DOUBLE:
          dt = DataTypes.DoubleType;
          break;
        case INT:
          dt = DataTypes.LongType;
          break;
        case OBJECT:
          dt = new VectorUDT();
          break;
        default:
          throw new RuntimeException("Unsupported value type: " + schema[j]);
      }
      fields.add(DataTypes.createStructField("C" + (j + 1), dt, true));
    }
    StructType dfSchema = DataTypes.createStructType(fields);

    // create rdd and data frame
    JavaSparkContext sc = new JavaSparkContext(sqlctx.sparkContext());
    JavaRDD<Row> rowRDD = sc.parallelize(list);
    return sqlctx.createDataFrame(rowRDD, dfSchema);
  }
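A minimal usage sketch for the converter above (hypothetical names; assumes the enclosing class's colsVector field is 1, i.e. no packed vector columns):
  private void createDataFrameExample() throws DMLRuntimeException {
    // Local Spark context just for illustration.
    JavaSparkContext jsc =
        new JavaSparkContext(new SparkConf().setMaster("local[*]").setAppName("mb-to-df"));
    SQLContext sqlctx = new SQLContext(jsc);
    // Build a 2x3 block from a plain array via SystemML's DataConverter.
    MatrixBlock mb = DataConverter.convertToMatrixBlock(new double[][] {{1, 2, 3}, {4, 5, 6}});
    ValueType[] schema = {ValueType.DOUBLE, ValueType.DOUBLE, ValueType.DOUBLE};
    DataFrame df = createDataFrame(sqlctx, mb, true, schema);
    df.show(); // ID column (RDDConverterUtils.DF_ID_COLUMN) followed by C1..C3
    jsc.close();
  }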
Example #2
  public StructType getQuerySchema(SolrQuery query) throws Exception {
    CloudSolrClient solrServer = getSolrClient(zkHost);
    // Build up a schema based on the fields requested
    String fieldList = query.getFields();
    String[] fields = null;
    if (fieldList != null) {
      fields = fieldList.split(",");
    } else {
      // just go out to Solr and get 10 docs and extract a field list from that
      SolrQuery probeForFieldsQuery = query.getCopy();
      probeForFieldsQuery.remove("distrib");
      probeForFieldsQuery.set("collection", collection);
      probeForFieldsQuery.set("fl", "*");
      probeForFieldsQuery.setStart(0);
      probeForFieldsQuery.setRows(10);
      QueryResponse probeForFieldsResp = solrServer.query(probeForFieldsQuery);
      SolrDocumentList hits = probeForFieldsResp.getResults();
      Set<String> fieldSet = new TreeSet<String>();
      for (SolrDocument hit : hits) fieldSet.addAll(hit.getFieldNames());
      fields = fieldSet.toArray(new String[0]);
    }

    if (fields == null || fields.length == 0)
      throw new IllegalArgumentException(
          "Query (" + query + ") does not specify any fields needed to build a schema!");

    Set<String> liveNodes = solrServer.getZkStateReader().getClusterState().getLiveNodes();
    if (liveNodes.isEmpty())
      throw new RuntimeException("No live nodes found for cluster: " + zkHost);
    String solrBaseUrl =
        solrServer.getZkStateReader().getBaseUrlForNodeName(liveNodes.iterator().next());
    if (!solrBaseUrl.endsWith("/")) solrBaseUrl += "/";

    Map<String, String> fieldTypeMap = getFieldTypes(fields, solrBaseUrl, collection);
    List<StructField> listOfFields = new ArrayList<StructField>();
    for (String field : fields) {
      String fieldType = fieldTypeMap.get(field);
      DataType dataType = (fieldType != null) ? solrDataTypes.get(fieldType) : null;
      if (dataType == null) dataType = DataTypes.StringType;
      listOfFields.add(DataTypes.createStructField(field, dataType, true));
    }

    return DataTypes.createStructType(listOfFields);
  }
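A hypothetical caller for getQuerySchema, assuming this object was constructed with a valid zkHost and collection:
  public void printQuerySchemaExample() throws Exception {
    SolrQuery q = new SolrQuery("*:*");
    // An explicit field list skips the 10-document probe above.
    q.setFields("id", "title", "price");
    StructType schema = getQuerySchema(q);
    System.out.println(schema.treeString());
  }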
  private DataFrame artistsAsDataFrame() {
    String input = TestUtils.sampleArtistsDat();
    JavaRDD<String> data = sc.textFile(input);

    StructType schema =
        DataTypes.createStructType(
            new StructField[] {
              DataTypes.createStructField("id", DataTypes.IntegerType, false),
              DataTypes.createStructField("name", DataTypes.StringType, false),
              DataTypes.createStructField("url", DataTypes.StringType, true),
              DataTypes.createStructField("pictures", DataTypes.StringType, true),
              DataTypes.createStructField("time", DataTypes.TimestampType, true)
            });

    JavaRDD<Row> rowData =
        data.map(
                new Function<String, String[]>() {
                  @Override
                  public String[] call(String line) throws Exception {
                    return line.split("\t");
                  }
                })
            .map(
                new Function<String[], Row>() {
                  @Override
                  public Row call(String[] r) throws Exception {
                    return RowFactory.create(
                        Integer.parseInt(r[0]),
                        r[1],
                        r[2],
                        r[3],
                        new Timestamp(DatatypeConverter.parseDateTime(r[4]).getTimeInMillis()));
                  }
                });

    return sqc.createDataFrame(rowData, schema);
  }
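A small illustrative follow-up, assuming the same sqc SQLContext field used above:
  private void queryArtistsExample() {
    DataFrame artists = artistsAsDataFrame();
    // Spark 1.x API: register the frame so it can be queried with SQL.
    artists.registerTempTable("artists");
    sqc.sql("SELECT name, url FROM artists WHERE id < 10").show();
  }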
  /**
   * @param args path of the Hadoop sequence file (args[0])
   * @throws FileNotFoundException
   */
  public static void main(String[] args) throws FileNotFoundException {
    String path = args[0];
    JavaSparkContext sc = getSparkContext();
    // sc is an existing JavaSparkContext.
    SQLContext sqlContext = new org.apache.spark.sql.SQLContext(sc);
    sqlContext.setConf("spark.sql.parquet.compression.codec", "snappy");
    sqlContext.setConf("spark.sql.parquet.filterPushdown", "true");
    long start = System.nanoTime();
    // read sequence file and map
    JavaRDD<Row> rowRDD =
        sc.sequenceFile(path, Text.class, Text.class)
            // .sample(false, 0.01, 123)
            .mapToPair(t -> new Tuple2<String, String>(t._1.toString(), t._2.toString()))
            .groupByKey()
            .map(new HadoopToParqRow())
            .cache();

    List<StructField> fields =
        new ArrayList<StructField>(); // create data fields of features for the DataFrame
    fields.add(DataTypes.createStructField("index", DataTypes.StringType, false));
    fields.add(DataTypes.createStructField("chainId1", DataTypes.StringType, false));
    fields.add(DataTypes.createStructField("chainId2", DataTypes.StringType, false));
    fields.add(DataTypes.createStructField("Rnum1", DataTypes.StringType, false));
    fields.add(DataTypes.createStructField("Rnum2", DataTypes.StringType, false));
    fields.add(DataTypes.createStructField("Ins1", DataTypes.StringType, false));
    fields.add(DataTypes.createStructField("Ins2", DataTypes.StringType, false));
    fields.add(DataTypes.createStructField("res1", DataTypes.StringType, false));
    fields.add(DataTypes.createStructField("res2", DataTypes.StringType, false));
    fields.add(DataTypes.createStructField("atom1", DataTypes.StringType, false));
    fields.add(DataTypes.createStructField("atom2", DataTypes.StringType, false));
    fields.add(DataTypes.createStructField("element1", DataTypes.StringType, false));
    fields.add(DataTypes.createStructField("element2", DataTypes.StringType, false));
    fields.add(DataTypes.createStructField("distance", DataTypes.IntegerType, false));
    fields.add(
        DataTypes.createStructField(
            "pdbId", DataTypes.createArrayType(DataTypes.StringType), false));
    StructType schema = DataTypes.createStructType(fields);

    // Apply the schema to the RDD.
    DataFrame dataFrame = sqlContext.createDataFrame(rowRDD, schema);
    dataFrame
        .coalesce(1)
        .write()
        .mode(SaveMode.Overwrite)
        .partitionBy("index")
        .parquet("/Users/hina/Data/ExampleFiles/seq.parquet");
    sc.close();
    System.out.println("Time: " + (System.nanoTime() - start) / 1E9 + " sec.");
  }
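A sketch of a read-back check for the Parquet output, under the same assumptions as above (output path and getSparkContext() helper):
  public static void readBackParquetExample() {
    JavaSparkContext sc = getSparkContext();
    SQLContext sqlContext = new SQLContext(sc);
    DataFrame parquet = sqlContext.read().parquet("/Users/hina/Data/ExampleFiles/seq.parquet");
    parquet.printSchema(); // should match the schema built above
    System.out.println("rows: " + parquet.count());
    sc.close();
  }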
Example #5
  public static void main(String[] args) {
    SparkConf conf = SparkConfBuilder.buildLocal("movie-lense-df");
    JavaSparkContext sc = new JavaSparkContext(conf);
    SQLContext sqlContext = new SQLContext(sc);

    // user id | item id | rating | timestamp
    JavaRDD<Row> dataRdd =
        sc.textFile("data/ml-100k/u.data")
            .map(
                line -> {
                  String[] col = line.split("\t");
                  return RowFactory.create(
                      Integer.valueOf(col[0]),
                      Integer.valueOf(col[1]),
                      Integer.valueOf(col[2]),
                      Long.valueOf(col[3]));
                });

    List<StructField> dataFields = new ArrayList<>();
    dataFields.add(DataTypes.createStructField("userId", DataTypes.IntegerType, false));
    dataFields.add(DataTypes.createStructField("itemId", DataTypes.IntegerType, true));
    dataFields.add(DataTypes.createStructField("rating", DataTypes.IntegerType, true));
    dataFields.add(DataTypes.createStructField("timestamp", DataTypes.LongType, true));
    StructType dataSchema = DataTypes.createStructType(dataFields);

    DataFrame data = sqlContext.createDataFrame(dataRdd, dataSchema);

    data.show();

    // movie id | movie title | release date | video release date | IMDb URL |
    // unknown | Action | Adventure | Animation | Children's | Comedy | Crime |
    // Documentary | Drama | Fantasy | Film-Noir | Horror | Musical | Mystery |
    // Romance | Sci-Fi | Thriller | War | Western
    JavaRDD<Row> itemRdd =
        sc.textFile("data/ml-100k/u.item")
            .map(
                line -> {
                  String[] col = line.split("\\|");
                  return RowFactory.create(
                      Integer.valueOf(col[0]),
                      col[1],
                      col[2],
                      col[3],
                      col[4],
                      toBool(col[5]),
                      toBool(col[6]),
                      toBool(col[7]),
                      toBool(col[8]),
                      toBool(col[9]),
                      toBool(col[10]),
                      toBool(col[11]),
                      toBool(col[12]),
                      toBool(col[13]),
                      toBool(col[14]),
                      toBool(col[15]),
                      toBool(col[16]),
                      toBool(col[17]),
                      toBool(col[18]),
                      toBool(col[19]),
                      toBool(col[20]),
                      toBool(col[21]),
                      toBool(col[22]),
                      toBool(col[23]));
                });

    List<StructField> itemFields = new ArrayList<>();
    itemFields.add(DataTypes.createStructField("movieId", DataTypes.IntegerType, false));
    itemFields.add(DataTypes.createStructField("movieTitle", DataTypes.StringType, true));
    itemFields.add(DataTypes.createStructField("releaseDate", DataTypes.StringType, true));
    itemFields.add(DataTypes.createStructField("videoReleaseDate", DataTypes.StringType, true));
    itemFields.add(DataTypes.createStructField("imdbUrl", DataTypes.StringType, true));
    itemFields.add(DataTypes.createStructField("unknown", DataTypes.BooleanType, true));
    itemFields.add(DataTypes.createStructField("action", DataTypes.BooleanType, true));
    itemFields.add(DataTypes.createStructField("adventure", DataTypes.BooleanType, true));
    itemFields.add(DataTypes.createStructField("animation", DataTypes.BooleanType, true));
    itemFields.add(DataTypes.createStructField("childrens", DataTypes.BooleanType, true));
    itemFields.add(DataTypes.createStructField("comedy", DataTypes.BooleanType, true));
    itemFields.add(DataTypes.createStructField("crime", DataTypes.BooleanType, true));
    itemFields.add(DataTypes.createStructField("documentary", DataTypes.BooleanType, true));
    itemFields.add(DataTypes.createStructField("drama", DataTypes.BooleanType, true));
    itemFields.add(DataTypes.createStructField("fantasy", DataTypes.BooleanType, true));
    itemFields.add(DataTypes.createStructField("filmNoir", DataTypes.BooleanType, true));
    itemFields.add(DataTypes.createStructField("horror", DataTypes.BooleanType, true));
    itemFields.add(DataTypes.createStructField("musical", DataTypes.BooleanType, true));
    itemFields.add(DataTypes.createStructField("mystery", DataTypes.BooleanType, true));
    itemFields.add(DataTypes.createStructField("romance", DataTypes.BooleanType, true));
    itemFields.add(DataTypes.createStructField("sciFi", DataTypes.BooleanType, true));
    itemFields.add(DataTypes.createStructField("thriller", DataTypes.BooleanType, true));
    itemFields.add(DataTypes.createStructField("war", DataTypes.BooleanType, true));
    itemFields.add(DataTypes.createStructField("western", DataTypes.BooleanType, true));
    StructType itemSchema = DataTypes.createStructType(itemFields);

    DataFrame item = sqlContext.createDataFrame(itemRdd, itemSchema);

    item.show();
    // user id | age | gender | occupation | zip code
    JavaRDD<Row> userRdd =
        sc.textFile("data/ml-100k/u.user")
            .map(
                line -> {
                  String[] col = line.split("\\|");
                  return RowFactory.create(
                      Integer.valueOf(col[0]), Integer.valueOf(col[1]), col[2], col[3], col[4]);
                });

    List<StructField> userFields = new ArrayList<>();
    userFields.add(DataTypes.createStructField("userId", DataTypes.IntegerType, false));
    userFields.add(DataTypes.createStructField("age", DataTypes.IntegerType, true));
    userFields.add(DataTypes.createStructField("gender", DataTypes.StringType, true));
    userFields.add(DataTypes.createStructField("occupation", DataTypes.StringType, true));
    userFields.add(DataTypes.createStructField("zipCode", DataTypes.StringType, true));
    StructType userSchema = DataTypes.createStructType(userFields);

    DataFrame user = sqlContext.createDataFrame(userRdd, userSchema);

    user.show();
  }
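A possible follow-on with the frames built above (hypothetical helper, not part of the original example): join ratings to movie titles and average them per title.
  private static void averageRatingPerTitle(DataFrame data, DataFrame item) {
    DataFrame joined = data.join(item, data.col("itemId").equalTo(item.col("movieId")));
    // One row per title with its mean rating.
    joined.groupBy("movieTitle").avg("rating").show(10);
  }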