// @Override public void performSourceMapReduce(JavaRDD<KeyValueObject<KEYIN, VALUEIN>> pInputs) { // if not commented out this line forces mappedKeys to be realized // pInputs = SparkUtilities.realizeAndReturn(pInputs,getCtx()); JavaSparkContext ctx2 = SparkUtilities.getCurrentContext(); System.err.println("Starting Score Mapping"); JavaPairRDD<K, Tuple2<K, V>> kkv = performMappingPart(pInputs); // kkv = SparkUtilities.realizeAndReturn(kkv, ctx2); // mappedKeys = mappedKeys.persist(StorageLevel.MEMORY_AND_DISK_2()); // // if not commented out this line forces mappedKeys to be realized // mappedKeys = SparkUtilities.realizeAndReturn(mappedKeys, ctx2); // // // convert to tuples // // JavaPairRDD<K, Tuple2<K, V>> kkv = mappedKeys.mapToPair(new KeyValuePairFunction<K, // V>()); // // kkv = kkv.persist(StorageLevel.MEMORY_AND_DISK_2()); // // if not commented out this line forces mappedKeys to be realized // kkv = SparkUtilities.realizeAndReturn(kkv, ctx2); // if not commented out this line forces kvJavaPairRDD to be realized // kkv = SparkUtilities.realizeAndReturn(kkv ); System.err.println("Starting Score Reduce"); IReducerFunction reduce = getReduce(); // for some reason the compiler thnks K or V is not Serializable JavaPairRDD<K, Tuple2<K, V>> kkv1 = kkv; // JavaPairRDD<? extends Serializable, Tuple2<? extends Serializable, ? extends Serializable>> // kkv1 = (JavaPairRDD<? extends Serializable, Tuple2<? extends Serializable, ? 
extends // Serializable>>)kkv; //noinspection unchecked JavaPairRDD<K, KeyAndValues<K, V>> reducedSets = (JavaPairRDD<K, KeyAndValues<K, V>>) KeyAndValues.combineByKey(kkv1); // if not commented out this line forces kvJavaPairRDD to be realized reducedSets = SparkUtilities.realizeAndReturn(reducedSets); PartitionAdaptor<K> prt = new PartitionAdaptor<K>(getPartitioner()); reducedSets = reducedSets.partitionBy(prt); reducedSets = reducedSets.sortByKey(); // if not commented out this line forces kvJavaPairRDD to be realized reducedSets = SparkUtilities.realizeAndReturn(reducedSets); ReduceFunctionAdaptor f = new ReduceFunctionAdaptor(ctx2, reduce); JavaRDD<KeyValueObject<KOUT, VOUT>> reducedOutput = reducedSets.flatMap(f); // JavaPairRDD<K, V> kvJavaPairRDD = asTuples.partitionBy(sparkPartitioner); // if not commented out this line forces kvJavaPairRDD to be realized // kvJavaPairRDD = SparkUtilities.realizeAndReturn(kvJavaPairRDD,getCtx()); // if not commented out this line forces kvJavaPairRDD to be realized // reducedOutput = SparkUtilities.realizeAndReturn(reducedOutput, ctx2); output = reducedOutput; }
// @Override public void performSingleReturnMapReduce(JavaRDD<KeyValueObject<KEYIN, VALUEIN>> pInputs) { // if not commented out this line forces mappedKeys to be realized // pInputs = SparkUtilities.realizeAndReturn(pInputs,getCtx()); JavaPairRDD<K, Tuple2<K, V>> kkv = performMappingPart(pInputs); // if not commented out this line forces kvJavaPairRDD to be realized kkv = SparkUtilities.realizeAndReturn(kkv); PartitionAdaptor<K> prt = new PartitionAdaptor<K>(getPartitioner()); kkv = kkv.partitionBy(prt); IReducerFunction reduce = getReduce(); /** we can guarantee one output per input */ SingleOutputReduceFunctionAdaptor<K, V, KOUT, VOUT> f = new SingleOutputReduceFunctionAdaptor((ISingleOutputReducerFunction) reduce); JavaRDD<KeyValueObject<KOUT, VOUT>> reduced = kkv.map(f); // if not commented out this line forces kvJavaPairRDD to be realized reduced = SparkUtilities.realizeAndReturn(reduced); output = reduced; }
/**
 * Dispatches a map/reduce run based on the runtime type of the data source.
 * Sources may be very implementation specific: a Spark RDD, a hadoop Path, or any
 * Iterable are accepted; anything else is rejected.
 *
 * @param source some source of data - might be a hadoop directory or a Spark RDD -
 *               this will be cast internally
 * @param otherData currently unused extra arguments
 * @throws IllegalArgumentException when the source type is not supported
 */
@Override
public void mapReduceSource(@Nonnull final Object source, final Object... otherData) {
    // NOTE: branch order matters - check the most specific type first
    if (source instanceof JavaRDD) {
        performSourceMapReduce((JavaRDD) source);
    }
    else if (source instanceof Path) {
        performMapReduce((Path) source);
    }
    else if (source instanceof java.lang.Iterable) {
        performSourceMapReduce(SparkUtilities.fromIterable((Iterable) source));
    }
    else {
        throw new IllegalArgumentException("cannot handle source of class " + source.getClass());
    }
}