public static void main(String[] args) throws FileNotFoundException {
    String path = args[0];

    // This is the default two-line setup for Spark programs in Java.
    // spark.executor.memory can only use up to the maximum Java heap space set by -Xmx.
    SparkConf conf = new SparkConf()
            .setMaster("local[" + NUM_THREADS + "]")
            .setAppName(Demo.class.getSimpleName())
            .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    JavaSparkContext sc = new JavaSparkContext(conf);

    long start = System.nanoTime();

    // If you need both the coordinates and the sequences, use this section of code.
    // Read the sequence file and map it to <PdbId.chainId, SimplePolymerChain> pairs.
    List<Tuple2<String, SimplePolymerChain>> chains = sc
            .sequenceFile(path, Text.class, ArrayWritable.class, NUM_THREADS * NUM_TASKS_PER_THREAD)
            .sample(false, 0.01, 123)
            .mapToPair(new HadoopToSimpleChainMapper()) // convert input to <pdbId.chainId, SimplePolymerChain> pairs
            .filter(t -> t._2.isProtein())
            .collect();

    for (Tuple2<String, SimplePolymerChain> t : chains) {
        System.out.println(t._1 + ": " + t._2);
    }

    // If you need just the coordinates, use this section of code.
    // Read the sequence file and map it to <PdbId.chainId, C-alpha coordinate array> pairs.
    List<Tuple2<String, Point3d[]>> coordinates = sc
            .sequenceFile(path, Text.class, ArrayWritable.class, NUM_THREADS * NUM_TASKS_PER_THREAD)
            .sample(false, 0.01, 123)
            .mapToPair(new HadoopToSimpleChainMapper()) // convert input to <pdbId.chainId, SimplePolymerChain> pairs
            .filter(t -> t._2.isProtein())
            .mapToPair(t -> new Tuple2<String, Point3d[]>(t._1, t._2.getCoordinates()))
            .collect();

    for (Tuple2<String, Point3d[]> t : coordinates) {
        System.out.println(t._1 + ": " + Arrays.toString(t._2));
    }

    sc.close();

    System.out.println("Time: " + (System.nanoTime() - start) / 1E9 + " sec.");
}
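Instead of collecting and printing the raw chains, the same pipeline can compute a per-chain statistic before the collect. The snippet below is a minimal sketch meant to sit inside the same main method; it assumes the same sc, path, NUM_THREADS, NUM_TASKS_PER_THREAD, and HadoopToSimpleChainMapper as above, and the variable name chainLengths is illustrative only.

    // Minimal sketch: count the number of C-alpha coordinates per protein chain.
    // Assumes the same SparkContext, input path, and mapper as in the example above.
    List<Tuple2<String, Integer>> chainLengths = sc
            .sequenceFile(path, Text.class, ArrayWritable.class, NUM_THREADS * NUM_TASKS_PER_THREAD)
            .sample(false, 0.01, 123)
            .mapToPair(new HadoopToSimpleChainMapper())
            .filter(t -> t._2.isProtein())
            .mapToPair(t -> new Tuple2<String, Integer>(t._1, t._2.getCoordinates().length))
            .collect();

    for (Tuple2<String, Integer> t : chainLengths) {
        System.out.println(t._1 + " has " + t._2 + " C-alpha atoms");
    }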
/**
 * @param args path of the Hadoop sequence file
 * @throws FileNotFoundException
 */
public static void main(String[] args) throws FileNotFoundException {
    String path = args[0];

    JavaSparkContext sc = getSparkContext();

    // sc is an existing JavaSparkContext.
    SQLContext sqlContext = new org.apache.spark.sql.SQLContext(sc);
    sqlContext.setConf("spark.sql.parquet.compression.codec", "snappy");
    sqlContext.setConf("spark.sql.parquet.filterPushdown", "true");

    long start = System.nanoTime();

    // read the sequence file and map each group of records to a Row
    JavaRDD<Row> rowRDD = sc
            .sequenceFile(path, Text.class, Text.class)
            // .sample(false, 0.01, 123)
            .mapToPair(t -> new Tuple2<String, String>(t._1.toString(), t._2.toString()))
            .groupByKey()
            .map(new HadoopToParqRow())
            .cache();

    // create the data fields (features) for the DataFrame schema
    List<StructField> fields = new ArrayList<StructField>();
    fields.add(DataTypes.createStructField("index", DataTypes.StringType, false));
    fields.add(DataTypes.createStructField("chainId1", DataTypes.StringType, false));
    fields.add(DataTypes.createStructField("chainId2", DataTypes.StringType, false));
    fields.add(DataTypes.createStructField("Rnum1", DataTypes.StringType, false));
    fields.add(DataTypes.createStructField("Rnum2", DataTypes.StringType, false));
    fields.add(DataTypes.createStructField("Ins1", DataTypes.StringType, false));
    fields.add(DataTypes.createStructField("Ins2", DataTypes.StringType, false));
    fields.add(DataTypes.createStructField("res1", DataTypes.StringType, false));
    fields.add(DataTypes.createStructField("res2", DataTypes.StringType, false));
    fields.add(DataTypes.createStructField("atom1", DataTypes.StringType, false));
    fields.add(DataTypes.createStructField("atom2", DataTypes.StringType, false));
    fields.add(DataTypes.createStructField("element1", DataTypes.StringType, false));
    fields.add(DataTypes.createStructField("element2", DataTypes.StringType, false));
    fields.add(DataTypes.createStructField("distance", DataTypes.IntegerType, false));
    fields.add(DataTypes.createStructField("pdbId", DataTypes.createArrayType(DataTypes.StringType), false));
    StructType schema = DataTypes.createStructType(fields);

    // apply the schema to the RDD and write the result as a partitioned Parquet file
    DataFrame dataFrame = sqlContext.createDataFrame(rowRDD, schema);
    dataFrame
            .coalesce(1)
            .write()
            .mode(SaveMode.Overwrite)
            .partitionBy("index")
            .parquet("/Users/hina/Data/ExampleFiles/seq.parquet");

    sc.close();

    System.out.println("Time: " + (System.nanoTime() - start) / 1E9 + " sec.");
}
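After the Parquet file has been written, it can be read back and queried through the same SQLContext; with spark.sql.parquet.filterPushdown enabled, column predicates are pushed down to the Parquet scan. The snippet below is a minimal sketch, not part of the original program: it reuses the output path from the example above, the column names follow the schema defined there, and the 5-unit distance cutoff is an arbitrary illustrative value.

    // Minimal sketch: read the Parquet output back and filter it.
    // Assumes the same sqlContext and the output path written above.
    DataFrame contacts = sqlContext.read().parquet("/Users/hina/Data/ExampleFiles/seq.parquet");

    // keep short contacts between two different chains (column names as in the schema above)
    DataFrame filtered = contacts
            .filter(contacts.col("distance").lt(5))
            .filter(contacts.col("chainId1").notEqual(contacts.col("chainId2")));

    filtered.show(10);
    System.out.println("Rows after filtering: " + filtered.count());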