예제 #1
0
  // @Override
  /**
   * Maps and then reduces the supplied key/value RDD, storing the result in {@code output}.
   *
   * <p>Steps, in order: map through {@code performMappingPart}, group with
   * {@code KeyAndValues.combineByKey}, partition using the configured partitioner, sort by key,
   * and flat-map each group through a {@code ReduceFunctionAdaptor}. Progress markers are written
   * to {@code System.err} before the mapping and the reduce stages.
   *
   * @param pInputs RDD of input key/value objects passed to the mapping step
   */
  public void performSourceMapReduce(JavaRDD<KeyValueObject<KEYIN, VALUEIN>> pInputs) {
    JavaSparkContext sparkContext = SparkUtilities.getCurrentContext();

    System.err.println("Starting Score Mapping");
    JavaPairRDD<K, Tuple2<K, V>> mapped = performMappingPart(pInputs);

    System.err.println("Starting Score Reduce");
    IReducerFunction reducer = getReduce();

    // The combineByKey result is cast (unchecked) to the pair-RDD type used below; the original
    // author noted the compiler does not accept K or V as Serializable at this point.
    //noinspection unchecked
    JavaPairRDD<K, KeyAndValues<K, V>> grouped =
        (JavaPairRDD<K, KeyAndValues<K, V>>) KeyAndValues.combineByKey(mapped);

    // Force the grouped RDD to be realized before it is repartitioned.
    grouped = SparkUtilities.realizeAndReturn(grouped);

    // NOTE(review): sortByKey directly after partitionBy may re-shuffle with its own partitioner
    // and so discard the custom partitioning - confirm this ordering is intended.
    PartitionAdaptor<K> partitionAdaptor = new PartitionAdaptor<K>(getPartitioner());
    grouped = grouped.partitionBy(partitionAdaptor).sortByKey();

    // Force the partitioned and sorted RDD to be realized before the reduce step.
    grouped = SparkUtilities.realizeAndReturn(grouped);

    // Raw adaptor type kept on purpose: it is what the flatMap call below compiles against.
    ReduceFunctionAdaptor reduceAdaptor = new ReduceFunctionAdaptor(sparkContext, reducer);
    JavaRDD<KeyValueObject<KOUT, VOUT>> reducedOutput = grouped.flatMap(reduceAdaptor);

    output = reducedOutput;
  }
예제 #2
0
  // @Override
  /**
   * Map/reduce variant for reducers that emit exactly one output per input, so the reduce stage
   * is applied with {@code map} rather than {@code flatMap}. The result is stored in
   * {@code output}.
   *
   * <p>The reducer returned by {@code getReduce()} is cast to
   * {@code ISingleOutputReducerFunction}; a reducer of any other type fails that cast.
   *
   * @param pInputs RDD of input key/value objects passed to the mapping step
   */
  public void performSingleReturnMapReduce(JavaRDD<KeyValueObject<KEYIN, VALUEIN>> pInputs) {
    JavaPairRDD<K, Tuple2<K, V>> mapped = performMappingPart(pInputs);

    // Force the mapped pairs to be realized before partitioning.
    mapped = SparkUtilities.realizeAndReturn(mapped);
    mapped = mapped.partitionBy(new PartitionAdaptor<K>(getPartitioner()));

    // One output per input is guaranteed, so each pair maps to a single result object.
    IReducerFunction reducer = getReduce();
    SingleOutputReduceFunctionAdaptor<K, V, KOUT, VOUT> singleOutputAdaptor =
        new SingleOutputReduceFunctionAdaptor((ISingleOutputReducerFunction) reducer);
    JavaRDD<KeyValueObject<KOUT, VOUT>> singleOutputs = mapped.map(singleOutputAdaptor);

    // Force the reduced RDD to be realized before publishing it.
    output = SparkUtilities.realizeAndReturn(singleOutputs);
  }
예제 #3
0
 /**
  * Runs map/reduce over a source whose concrete type is resolved at runtime.
  *
  * <p>Supported sources, checked in this order: a {@code JavaRDD} (used directly), a
  * {@code Path} (handed to {@code performMapReduce}), or any {@code Iterable} (converted with
  * {@code SparkUtilities.fromIterable} first).
  *
  * @param source some source of data - a Spark RDD, a path, or an iterable; cast internally
  * @param otherData additional arguments; not used by this implementation
  * @throws IllegalArgumentException if {@code source} is none of the supported types
  */
 @Override
 public void mapReduceSource(@Nonnull final Object source, final Object... otherData) {
   if (source instanceof JavaRDD) {
     performSourceMapReduce((JavaRDD) source);
   } else if (source instanceof Path) {
     performMapReduce((Path) source);
   } else if (source instanceof java.lang.Iterable) {
     // Iterables are turned into an RDD and then follow the RDD path.
     performSourceMapReduce(SparkUtilities.fromIterable((Iterable) source));
   } else {
     throw new IllegalArgumentException("cannot handle source of class " + source.getClass());
   }
 }