Example #1
0
  /**
   * Reads a Hadoop sequence file with a local Spark context and prints a 1% sample (fixed seed)
   * of the protein chains it contains: first the full chain objects, then only their coordinate
   * arrays. The elapsed wall-clock time is printed once the Spark context has been closed.
   *
   * @param args {@code args[0]} is the path of the Hadoop sequence file to read
   * @throws FileNotFoundException declared for callers; no statement in this method throws it
   *     directly
   * @throws IllegalArgumentException if no path argument is supplied
   */
  public static void main(String[] args) throws FileNotFoundException {
    if (args == null || args.length < 1) {
      throw new IllegalArgumentException(
          "Usage: " + Demo.class.getSimpleName() + " <hadoopSequenceFile>");
    }
    String path = args[0];
    int numPartitions = NUM_THREADS * NUM_TASKS_PER_THREAD;

    // This is the default 2 line structure for spark programs in java
    // The spark.executor.memory can only take the maximum java heapspace set by -Xmx
    SparkConf conf =
        new SparkConf()
            .setMaster("local[" + NUM_THREADS + "]")
            .setAppName(Demo.class.getSimpleName())
            .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");

    JavaSparkContext sc = new JavaSparkContext(conf);

    long start = System.nanoTime();

    // Close the context even when one of the Spark jobs below fails; otherwise the local
    // Spark context would be left running.
    try {
      // if you need both the coordinates and the sequences, use this section of code
      // read sequence file and map to <pdbId.chainId, SimplePolymerChain> pairs
      List<Tuple2<String, SimplePolymerChain>> chains =
          sc.sequenceFile(path, Text.class, ArrayWritable.class, numPartitions)
              .sample(false, 0.01, 123) // without replacement, 1% of records, seed 123
              .mapToPair(new HadoopToSimpleChainMapper())
              .filter(t -> t._2.isProtein())
              .collect();

      for (Tuple2<String, SimplePolymerChain> t : chains) {
        System.out.println(t._1 + ": " + t._2);
      }

      // if you need just the coordinates, use this section of code
      // read sequence file, map to <pdbId.chainId, SimplePolymerChain> pairs, then keep only
      // the coordinate array of each protein chain
      List<Tuple2<String, Point3d[]>> coordinates =
          sc.sequenceFile(path, Text.class, ArrayWritable.class, numPartitions)
              .sample(false, 0.01, 123) // same sample as above (same fraction and seed)
              .mapToPair(new HadoopToSimpleChainMapper())
              .filter(t -> t._2.isProtein())
              .mapToPair(t -> new Tuple2<String, Point3d[]>(t._1, t._2.getCoordinates()))
              .collect();

      for (Tuple2<String, Point3d[]> t : coordinates) {
        System.out.println(t._1 + ": " + Arrays.toString(t._2));
      }
    } finally {
      sc.close();
    }

    System.out.println("Time: " + (System.nanoTime() - start) / 1E9 + " sec.");
  }
  /**
   * Reads a Hadoop sequence file of text key/value pairs, groups the values by key, converts
   * each group to a {@code Row} via {@code HadoopToParqRow}, and writes the result as a single
   * Parquet data set partitioned by the {@code index} column. Any existing output is
   * overwritten. The elapsed wall-clock time is printed once the Spark context has been closed.
   *
   * @param args {@code args[0]} is the path of the Hadoop sequence file; {@code args[1]} is
   *     optional and names the Parquet output location (defaults to the path that was
   *     previously hard-coded)
   * @throws FileNotFoundException declared for callers; no statement in this method throws it
   *     directly
   * @throws IllegalArgumentException if no input path is supplied
   */
  public static void main(String[] args) throws FileNotFoundException {
    if (args == null || args.length < 1) {
      throw new IllegalArgumentException("Usage: <hadoopSequenceFile> [parquetOutputPath]");
    }
    String path = args[0];
    // Keep the historical output location as the default so existing invocations still work.
    String outputPath = args.length > 1 ? args[1] : "/Users/hina/Data/ExampleFiles/seq.parquet";

    JavaSparkContext sc = getSparkContext();
    long start;

    // Close the context even when reading, converting or writing fails.
    try {
      SQLContext sqlContext = new SQLContext(sc);
      sqlContext.setConf("spark.sql.parquet.compression.codec", "snappy");
      sqlContext.setConf("spark.sql.parquet.filterPushdown", "true");
      start = System.nanoTime();

      // read sequence file and map
      JavaRDD<Row> rowRDD =
          sc.sequenceFile(path, Text.class, Text.class)
              .mapToPair(t -> new Tuple2<String, String>(t._1.toString(), t._2.toString()))
              .groupByKey()
              .map(new HadoopToParqRow())
              .cache();

      // Schema of the DataFrame: thirteen non-nullable string columns, followed by an integer
      // distance and an array of strings. Column order must match the rows built above.
      String[] stringColumns = {
        "index", "chainId1", "chainId2", "Rnum1", "Rnum2", "Ins1", "Ins2",
        "res1", "res2", "atom1", "atom2", "element1", "element2"
      };
      List<StructField> fields = new ArrayList<>();
      for (String column : stringColumns) {
        fields.add(DataTypes.createStructField(column, DataTypes.StringType, false));
      }
      fields.add(DataTypes.createStructField("distance", DataTypes.IntegerType, false));
      fields.add(
          DataTypes.createStructField(
              "pdbId", DataTypes.createArrayType(DataTypes.StringType), false));
      StructType schema = DataTypes.createStructType(fields);

      // Apply the schema to the RDD.
      DataFrame dataFrame = sqlContext.createDataFrame(rowRDD, schema);
      dataFrame
          .coalesce(1)
          .write()
          .mode(SaveMode.Overwrite)
          .partitionBy("index")
          .parquet(outputPath);
    } finally {
      sc.close();
    }
    System.out.println("Time: " + (System.nanoTime() - start) / 1E9 + " sec.");
  }