@Test
public void testPGroupedTableToMultipleOutputs() throws IOException {
  Pipeline pipeline = new MRPipeline(MRPipelineIT.class, tmpDir.getDefaultConfiguration());
  PGroupedTable<String, String> groupedLineTable = pipeline
      .readTextFile(tmpDir.copyResourceFileName("set1.txt"))
      .by(IdentityFn.<String>getInstance(), Writables.strings())
      .groupByKey();

  PTable<String, String> ungroupedTableA = groupedLineTable.ungroup();
  PTable<String, String> ungroupedTableB = groupedLineTable.ungroup();

  File outputDirA = tmpDir.getFile("output_a");
  File outputDirB = tmpDir.getFile("output_b");

  pipeline.writeTextFile(ungroupedTableA, outputDirA.getAbsolutePath());
  pipeline.writeTextFile(ungroupedTableB, outputDirB.getAbsolutePath());
  PipelineResult result = pipeline.done();

  for (StageResult stageResult : result.getStageResults()) {
    assertTrue(stageResult.getStageName().length() > 1);
    assertTrue(stageResult.getStageId().length() > 1);
  }

  // Verify that output from a single PGroupedTable can be sent to multiple collections
  assertTrue(new File(outputDirA, "part-r-00000").exists());
  assertTrue(new File(outputDirB, "part-r-00000").exists());
}
/**
 * Convert the given {@code PCollection<Pair<K, V>>} to a {@code PTable<K, V>}.
 *
 * @param pcollect The {@code PCollection} to convert
 * @return A {@code PTable} that contains the same data as the input {@code PCollection}
 */
public static <K, V> PTable<K, V> asPTable(PCollection<Pair<K, V>> pcollect) {
  PType<Pair<K, V>> pt = pcollect.getPType();
  PTypeFamily ptf = pt.getFamily();
  PTableType<K, V> ptt = ptf.tableOf(pt.getSubTypes().get(0), pt.getSubTypes().get(1));
  DoFn<Pair<K, V>, Pair<K, V>> id = IdentityFn.getInstance();
  return pcollect.parallelDo("asPTable", id, ptt);
}
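// Hypothetical usage sketch for the conversion above (not a definitive API reference):
// the method name "toLineLengthTable", the "input.txt" path, and the line-to-pair mapping
// are illustrative assumptions; asPTable is assumed to be exposed as a static utility
// (PTables.asPTable in Crunch).
static PTable<String, Long> toLineLengthTable(Pipeline pipeline) {
  PCollection<String> lines = pipeline.readTextFile("input.txt");
  // Build a PCollection<Pair<String, Long>> of (line, length) pairs.
  PCollection<Pair<String, Long>> pairs = lines.parallelDo(
      new MapFn<String, Pair<String, Long>>() {
        @Override
        public Pair<String, Long> map(String line) {
          return Pair.of(line, (long) line.length());
        }
      },
      Writables.pairs(Writables.strings(), Writables.longs()));
  // Reinterpret the pair collection as a table keyed by the line text;
  // keys may repeat, so callers can follow up with groupByKey() if needed.
  return PTables.asPTable(pairs);
}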
public AvroType(Class<T> typeClass, Schema schema, DeepCopier<T> deepCopier, PType... ptypes) {
  this(typeClass, schema, IdentityFn.getInstance(), IdentityFn.getInstance(), deepCopier, ptypes);
}
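// Minimal sketch of why IdentityFn serves as the no-op input/output mapping above:
// it is a MapFn<T, T> whose map() returns its argument unchanged.
IdentityFn<String> id = IdentityFn.getInstance();
String out = id.map("record"); // out == "record"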
private void monitorLoop() {
  status.set(Status.RUNNING);
  long start = System.currentTimeMillis();
  Map<PCollectionImpl<?>, Set<Target>> targetDeps = Maps.newTreeMap(DEPTH_COMPARATOR);
  Set<Target> unfinished = Sets.newHashSet();
  for (PCollectionImpl<?> pcollect : outputTargets.keySet()) {
    targetDeps.put(pcollect, pcollect.getTargetDependencies());
    unfinished.addAll(outputTargets.get(pcollect));
  }
  runCallables(unfinished);
  while (!targetDeps.isEmpty() && doneSignal.getCount() > 0) {
    Set<Target> allTargets = Sets.newHashSet();
    for (PCollectionImpl<?> pcollect : targetDeps.keySet()) {
      allTargets.addAll(outputTargets.get(pcollect));
    }
    // Select collections whose target dependencies are no longer pending; only those
    // can be converted into RDDs and written out during this pass.
    Map<PCollectionImpl<?>, JavaRDDLike<?, ?>> pcolToRdd = Maps.newTreeMap(DEPTH_COMPARATOR);
    for (PCollectionImpl<?> pcollect : targetDeps.keySet()) {
      if (Sets.intersection(allTargets, targetDeps.get(pcollect)).isEmpty()) {
        JavaRDDLike<?, ?> rdd = ((SparkCollection) pcollect).getJavaRDDLike(this);
        pcolToRdd.put(pcollect, rdd);
      }
    }
    distributeFiles();
    for (Map.Entry<PCollectionImpl<?>, JavaRDDLike<?, ?>> e : pcolToRdd.entrySet()) {
      JavaRDDLike<?, ?> rdd = e.getValue();
      PType<?> ptype = e.getKey().getPType();
      Set<Target> targets = outputTargets.get(e.getKey());
      if (targets.size() > 1) {
        // Cache RDDs that feed more than one target so they are not recomputed per target.
        rdd.rdd().cache();
      }
      for (Target t : targets) {
        Configuration conf = new Configuration(getConfiguration());
        getRuntimeContext().setConf(sparkContext.broadcast(WritableUtils.toByteArray(conf)));
        if (t instanceof MapReduceTarget) { // TODO: check this earlier
          Converter c = t.getConverter(ptype);
          IdentityFn ident = IdentityFn.getInstance();
          JavaPairRDD<?, ?> outRDD;
          if (rdd instanceof JavaRDD) {
            outRDD = ((JavaRDD) rdd)
                .map(new MapFunction(c.applyPTypeTransforms() ? ptype.getOutputMapFn() : ident, ctxt))
                .mapToPair(new OutputConverterFunction(c));
          } else {
            outRDD = ((JavaPairRDD) rdd)
                .map(new PairMapFunction(c.applyPTypeTransforms() ? ptype.getOutputMapFn() : ident, ctxt))
                .mapToPair(new OutputConverterFunction(c));
          }
          try {
            Job job = new Job(conf);
            if (t instanceof PathTarget) {
              PathTarget pt = (PathTarget) t;
              pt.configureForMapReduce(job, ptype, pt.getPath(), "out0");
              CrunchOutputs.OutputConfig outConfig =
                  CrunchOutputs.getNamedOutputs(job.getConfiguration()).get("out0");
              job.setOutputFormatClass(outConfig.bundle.getFormatClass());
              job.setOutputKeyClass(outConfig.keyClass);
              job.setOutputValueClass(outConfig.valueClass);
              outConfig.bundle.configure(job.getConfiguration());
              Path tmpPath = pipeline.createTempPath();
              outRDD.saveAsNewAPIHadoopFile(
                  tmpPath.toString(),
                  c.getKeyClass(),
                  c.getValueClass(),
                  job.getOutputFormatClass(),
                  job.getConfiguration());
              pt.handleOutputs(job.getConfiguration(), tmpPath, -1);
            } else { // if (t instanceof MapReduceTarget) {
              MapReduceTarget mrt = (MapReduceTarget) t;
              mrt.configureForMapReduce(job, ptype, new Path("/tmp"), "out0");
              CrunchOutputs.OutputConfig outConfig =
                  CrunchOutputs.getNamedOutputs(job.getConfiguration()).get("out0");
              job.setOutputFormatClass(outConfig.bundle.getFormatClass());
              job.setOutputKeyClass(outConfig.keyClass);
              job.setOutputValueClass(outConfig.valueClass);
              outRDD.saveAsHadoopDataset(new JobConf(job.getConfiguration()));
            }
          } catch (Exception et) {
            LOG.error("Spark Exception", et);
            status.set(Status.FAILED);
            set(PipelineResult.EMPTY);
            doneSignal.countDown();
          }
        }
      }
      unfinished.removeAll(targets);
    }
    if (status.get() == Status.RUNNING) {
      for (PCollectionImpl<?> output : pcolToRdd.keySet()) {
        if (toMaterialize.containsKey(output)) {
          MaterializableIterable mi = toMaterialize.get(output);
          if (mi.isSourceTarget()) {
            output.materializeAt((SourceTarget) mi.getSource());
          }
        }
        targetDeps.remove(output);
      }
    }
    runCallables(unfinished);
  }
  // Mark the run as succeeded only if it was neither failed nor killed.
  if (status.get() != Status.FAILED && status.get() != Status.KILLED) {
    status.set(Status.SUCCEEDED);
    set(new PipelineResult(
        ImmutableList.of(
            new PipelineResult.StageResult("Spark", getCounters(), start, System.currentTimeMillis())),
        Status.SUCCEEDED));
  } else {
    set(PipelineResult.EMPTY);
  }
  doneSignal.countDown();
}