Example 1
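An integration test showing that a single PGroupedTable can be ungrouped more than once, with each resulting PTable written to its own output directory in the same pipeline run.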
  @Test
  public void testPGroupedTableToMultipleOutputs() throws IOException {
    Pipeline pipeline = new MRPipeline(MRPipelineIT.class, tmpDir.getDefaultConfiguration());
    PGroupedTable<String, String> groupedLineTable =
        pipeline
            .readTextFile(tmpDir.copyResourceFileName("set1.txt"))
            .by(IdentityFn.<String>getInstance(), Writables.strings())
            .groupByKey();

    PTable<String, String> ungroupedTableA = groupedLineTable.ungroup();
    PTable<String, String> ungroupedTableB = groupedLineTable.ungroup();

    File outputDirA = tmpDir.getFile("output_a");
    File outputDirB = tmpDir.getFile("output_b");

    pipeline.writeTextFile(ungroupedTableA, outputDirA.getAbsolutePath());
    pipeline.writeTextFile(ungroupedTableB, outputDirB.getAbsolutePath());
    PipelineResult result = pipeline.done();
    // Each completed stage should report a non-trivial name and id
    for (StageResult stageResult : result.getStageResults()) {
      assertTrue(stageResult.getStageName().length() > 1);
      assertTrue(stageResult.getStageId().length() > 1);
    }

    // Verify that output from a single PGroupedTable can be sent to multiple collections
    assertTrue(new File(outputDirA, "part-r-00000").exists());
    assertTrue(new File(outputDirB, "part-r-00000").exists());
  }
Example 2
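A utility method (in the style of Crunch's PTables helper class) that converts a PCollection of key/value Pairs into a PTable by running an identity DoFn under the corresponding table PType.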
 /**
  * Convert the given {@code PCollection<Pair<K, V>>} to a {@code PTable<K, V>}.
  *
  * @param pcollect The {@code PCollection} to convert
  * @return A {@code PTable} that contains the same data as the input {@code PCollection}
  */
 public static <K, V> PTable<K, V> asPTable(PCollection<Pair<K, V>> pcollect) {
   PType<Pair<K, V>> pt = pcollect.getPType();
   PTypeFamily ptf = pt.getFamily();
   PTableType<K, V> ptt = ptf.tableOf(pt.getSubTypes().get(0), pt.getSubTypes().get(1));
   DoFn<Pair<K, V>, Pair<K, V>> id = IdentityFn.getInstance();
   return pcollect.parallelDo("asPTable", id, ptt);
 }
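A minimal usage sketch (assumed for illustration; the in-memory pipeline and the sample data below are not part of the original example):

  // Requires org.apache.crunch.{PCollection, PTable, Pair},
  // org.apache.crunch.impl.mem.MemPipeline, and org.apache.crunch.types.writable.Writables
  PCollection<Pair<String, Long>> pairs = MemPipeline.typedCollectionOf(
      Writables.pairs(Writables.strings(), Writables.longs()),
      Pair.of("a", 1L), Pair.of("b", 2L));
  PTable<String, Long> table = asPTable(pairs); // same data, now usable as a table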
Example 3
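A convenience constructor for AvroType that defaults both the input and output map functions to the identity function, so records pass through unmapped.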
 public AvroType(Class<T> typeClass, Schema schema, DeepCopier<T> deepCopier, PType... ptypes) {
   this(typeClass, schema, IdentityFn.getInstance(), IdentityFn.getInstance(), deepCopier, ptypes);
 }
Example 4
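The monitor loop of a Spark-backed pipeline runtime (judging from the SparkCollection and JavaRDDLike usage): each pass selects the PCollections whose target dependencies are satisfied, materializes them as RDDs, writes each RDD to its output targets, and repeats until every target is written or the run fails.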
 private void monitorLoop() {
   status.set(Status.RUNNING);
   long start = System.currentTimeMillis();
   // Map each output collection to its target dependencies, ordered by plan depth
   Map<PCollectionImpl<?>, Set<Target>> targetDeps = Maps.newTreeMap(DEPTH_COMPARATOR);
   Set<Target> unfinished = Sets.newHashSet();
   for (PCollectionImpl<?> pcollect : outputTargets.keySet()) {
     targetDeps.put(pcollect, pcollect.getTargetDependencies());
     unfinished.addAll(outputTargets.get(pcollect));
   }
   runCallables(unfinished);
   while (!targetDeps.isEmpty() && doneSignal.getCount() > 0) {
     Set<Target> allTargets = Sets.newHashSet();
     for (PCollectionImpl<?> pcollect : targetDeps.keySet()) {
       allTargets.addAll(outputTargets.get(pcollect));
     }
     // A collection is ready to run once none of its target dependencies
     // are among the targets still waiting to be written
     Map<PCollectionImpl<?>, JavaRDDLike<?, ?>> pcolToRdd = Maps.newTreeMap(DEPTH_COMPARATOR);
     for (PCollectionImpl<?> pcollect : targetDeps.keySet()) {
       if (Sets.intersection(allTargets, targetDeps.get(pcollect)).isEmpty()) {
         JavaRDDLike<?, ?> rdd = ((SparkCollection) pcollect).getJavaRDDLike(this);
         pcolToRdd.put(pcollect, rdd);
       }
     }
     distributeFiles();
     for (Map.Entry<PCollectionImpl<?>, JavaRDDLike<?, ?>> e : pcolToRdd.entrySet()) {
       JavaRDDLike<?, ?> rdd = e.getValue();
       PType<?> ptype = e.getKey().getPType();
       Set<Target> targets = outputTargets.get(e.getKey());
       if (targets.size() > 1) {
         // Cache the RDD so it is not recomputed once per target
         rdd.rdd().cache();
       }
       for (Target t : targets) {
         // Ship a fresh copy of the configuration to the workers for this target
         Configuration conf = new Configuration(getConfiguration());
         getRuntimeContext().setConf(sparkContext.broadcast(WritableUtils.toByteArray(conf)));
         if (t instanceof MapReduceTarget) { // TODO: check this earlier
           Converter c = t.getConverter(ptype);
           IdentityFn ident = IdentityFn.getInstance();
           JavaPairRDD<?, ?> outRDD;
           if (rdd instanceof JavaRDD) {
             outRDD =
                 ((JavaRDD) rdd)
                     .map(
                         new MapFunction(
                             c.applyPTypeTransforms() ? ptype.getOutputMapFn() : ident, ctxt))
                     .mapToPair(new OutputConverterFunction(c));
           } else {
             outRDD =
                 ((JavaPairRDD) rdd)
                     .map(
                         new PairMapFunction(
                             c.applyPTypeTransforms() ? ptype.getOutputMapFn() : ident, ctxt))
                     .mapToPair(new OutputConverterFunction(c));
           }
           try {
             Job job = new Job(conf);
             if (t instanceof PathTarget) {
               PathTarget pt = (PathTarget) t;
               pt.configureForMapReduce(job, ptype, pt.getPath(), "out0");
               CrunchOutputs.OutputConfig outConfig =
                   CrunchOutputs.getNamedOutputs(job.getConfiguration()).get("out0");
               job.setOutputFormatClass(outConfig.bundle.getFormatClass());
               job.setOutputKeyClass(outConfig.keyClass);
               job.setOutputValueClass(outConfig.valueClass);
               outConfig.bundle.configure(job.getConfiguration());
               Path tmpPath = pipeline.createTempPath();
               outRDD.saveAsNewAPIHadoopFile(
                   tmpPath.toString(),
                   c.getKeyClass(),
                   c.getValueClass(),
                   job.getOutputFormatClass(),
                   job.getConfiguration());
               pt.handleOutputs(job.getConfiguration(), tmpPath, -1);
             } else { // non-path MapReduceTarget: write via the old-API Hadoop dataset
               MapReduceTarget mrt = (MapReduceTarget) t;
               mrt.configureForMapReduce(job, ptype, new Path("/tmp"), "out0");
               CrunchOutputs.OutputConfig outConfig =
                   CrunchOutputs.getNamedOutputs(job.getConfiguration()).get("out0");
               job.setOutputFormatClass(outConfig.bundle.getFormatClass());
               job.setOutputKeyClass(outConfig.keyClass);
               job.setOutputValueClass(outConfig.valueClass);
               outRDD.saveAsHadoopDataset(new JobConf(job.getConfiguration()));
             }
           } catch (Exception et) {
             LOG.error("Spark Exception", et);
             status.set(Status.FAILED);
             set(PipelineResult.EMPTY);
             doneSignal.countDown();
           }
         }
       }
       unfinished.removeAll(targets);
     }
     if (status.get() == Status.RUNNING) {
       // Record where finished outputs were materialized, then drop them from
       // the dependency map so the next pass can schedule downstream work
       for (PCollectionImpl<?> output : pcolToRdd.keySet()) {
         if (toMaterialize.containsKey(output)) {
           MaterializableIterable mi = toMaterialize.get(output);
           if (mi.isSourceTarget()) {
             output.materializeAt((SourceTarget) mi.getSource());
           }
         }
         targetDeps.remove(output);
       }
     }
     runCallables(unfinished);
   }
   // Report success only if the pipeline neither failed nor was killed
   if (status.get() != Status.FAILED && status.get() != Status.KILLED) {
     status.set(Status.SUCCEEDED);
     set(
         new PipelineResult(
             ImmutableList.of(
                 new PipelineResult.StageResult(
                     "Spark", getCounters(), start, System.currentTimeMillis())),
             Status.SUCCEEDED));
   } else {
     set(PipelineResult.EMPTY);
   }
   doneSignal.countDown();
 }