public Map<String, Pipe> getTailsByName() {
  Map<String, Pipe> tails = Maps.newHashMap();

  for (Pipe pipe : getTails()) {
    tails.put(pipe.getName(), pipe);
  }

  return tails;
}
private static void collectNames(Pipe[] pipes, Set<String> names) {
  for (Pipe pipe : pipes) {
    if (pipe instanceof SubAssembly)
      names.addAll(Arrays.asList(((SubAssembly) pipe).getTailNames()));
    else
      names.add(pipe.getName());

    collectNames(SubAssembly.unwind(pipe.getPrevious()), names);
  }
}
/**
 * Method getHeads returns the first Pipe instances in this pipe assembly.
 *
 * @return Pipe[] the first Pipe instances of this pipe assembly
 */
public Pipe[] getHeads() {
  Pipe[] pipes = getPrevious();

  if (pipes.length == 0)
    return new Pipe[] {this};

  if (pipes.length == 1)
    return pipes[0].getHeads();

  Set<Pipe> heads = new HashSet<Pipe>();

  for (Pipe pipe : pipes)
    Collections.addAll(heads, pipe.getHeads());

  return heads.toArray(new Pipe[heads.size()]);
}
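// Hedged usage sketch for getHeads() (not from the source): it walks back from a tail
// pipe to the original head pipes of a branched assembly. The pipe and field names
// below are illustrative assumptions.
static void getHeadsUsageSketch() {
  Pipe lhs = new Pipe("lhs");
  Pipe rhs = new Pipe("rhs");
  Pipe tail = new GroupBy("joined", Pipe.pipes(lhs, rhs), new Fields("id"));

  Pipe[] heads = tail.getHeads(); // contains the "lhs" and "rhs" head pipes
}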
/**
 * Get the name of this pipe. Guaranteed non-null.
 *
 * @return String the name of this pipe
 */
public String getName() {
  if (name != null)
    return name;

  if (previous != null)
    return previous.getName();

  return "ANONYMOUS";
}
/**
 * Opens the sink of the flow. Only valid after evaluation has completed.
 *
 * @return a tuple entry iterator over the sink, or null if no flow has been evaluated
 * @throws IOException
 */
public TupleEntryIterator openSink() throws IOException {
  if (flow == null) {
    return null;
  }

  return flow.openSink(pipe.getName());
  // Hfs hfs = new Hfs(Fields.ALL, path);
  // return hfs.openForRead(mConfiguration.jobConf);
}
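// Hedged usage sketch (not from the source): after the flow has completed, the
// openSink() method above can be used to drain the sink tuples. The surrounding
// setup and the consuming logic are assumptions.
void consumeSinkSketch() throws IOException {
  TupleEntryIterator iterator = openSink();

  if (iterator == null)
    return; // no flow has been evaluated yet

  try {
    while (iterator.hasNext()) {
      TupleEntry entry = iterator.next();
      // process the entry here
    }
  } finally {
    iterator.close();
  }
}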
static Pipe resolvePrevious(Pipe pipe) {
  if (pipe instanceof Group || pipe instanceof Operator)
    return pipe;

  Pipe[] pipes = pipe.getPrevious();

  if (pipes.length > 1)
    throw new IllegalStateException("cannot resolve SubAssemblies with multiple tails at this time");

  // at most one previous pipe remains at this point
  for (Pipe previous : pipes) {
    if (previous instanceof Group || previous instanceof Operator)
      return previous;

    return resolvePrevious(previous);
  }

  return pipe;
}
/**
 * Constructor Unique creates a new Unique instance.
 *
 * @param name         of type String
 * @param pipe         of type Pipe
 * @param uniqueFields of type Fields
 * @param include      of type Include
 * @param threshold    of type int
 */
@ConstructorProperties({"name", "pipe", "uniqueFields", "include", "threshold"})
public Unique(String name, Pipe pipe, Fields uniqueFields, Include include, int threshold) {
  this(name, Pipe.pipes(pipe), uniqueFields, include, threshold);
}
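// Hedged usage sketch for the constructor above: de-duplicate a stream on a single
// field. The pipe name, field name, and threshold are illustrative assumptions, and
// Include.ALL assumes the enum offers an ALL option, as in Cascading's own Unique.
static Pipe uniqueUsageSketch() {
  Pipe events = new Pipe("events");
  return new Unique("unique-urls", events, new Fields("url"), Include.ALL, 10000);
}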
/**
 * Creates and processes a flow identified by {@code flowIdentificator}; the results are stored
 * at {@code output} under the result named {@code resultName}.
 */
private boolean processFlow(String resultName, String flowIdentificator, String output)
    throws IOException {
  boolean hasNewInferences = false;
  String flowName = resultName + flowIdentificator;

  Map<String, Tap> sources = prepareSourceTaps();

  SequenceFile sinkScheme = new SequenceFile(fields);
  // sinkScheme.setNumSinkParts(1); // FIXME
  Tap headSink = new Hfs(sinkScheme, output, true);

  Map<String, Tap> sinks = new HashMap<String, Tap>();
  List<Pipe> pipes = new ArrayList<Pipe>();
  sinks.put(pipe.getName(), headSink);
  pipes.add(pipe);

  if (mConfiguration.doPredicateIndexing) {
    // calculate the count of the result and write it in the configuration;
    // if the predicate is a variable then we also have to split the result
    // and put it in the right location
    setupPredicateCounts(pipe, sinks, pipes);
  }

  flow = new FlowConnector(mConfiguration.flowProperties)
      .connect(flowName, sources, sinks, pipes.toArray(new Pipe[0]));

  if (flow != null) {
    // flow.writeDOT("flow.dot");
  }

  flow.complete();

  try {
    TupleEntryIterator iterator = flow.openSink(pipe.getName());
    if (iterator.hasNext()) {
      hasNewInferences = true;
    }
    iterator.close();
  } catch (IOException e) {
    logger.error("io exception", e);
    throw new RuntimeException("io exception", e);
  }

  if (!hasNewInferences) {
    deleteResults(new Path(path));
  } else {
    // merge part files FIXME
    FileSystem fs = FileSystem.get(mConfiguration.hadoopConfiguration);

    // delete empty results (could be from reducers running on no data)
    int index = 0;
    while (true) {
      String value = String.valueOf(index);
      String file = path + "/" + "part-" + "00000".substring(0, 5 - value.length()) + value;
      Path filePath = new Path(file);

      if (fs.exists(filePath)) {
        Tap source = new Hfs(new Fields(0, 1, 2), file);
        TupleEntryIterator tei = source.openForRead(mConfiguration.jobConf);
        boolean noData = !tei.hasNext();
        tei.close();

        if (noData) {
          logger.info("delete empty result : " + file);
          fs.delete(filePath, false);
        }
      } else {
        break;
      }

      index++;
    }
  }

  if (hasNewInferences && mConfiguration.doPredicateIndexing) {
    FileSystem fs = FileSystem.get(mConfiguration.hadoopConfiguration);

    // update counts in configuration
    List<PredicateCount> predicateCounts = Utils.readPredicateCounts(flow, "predicatesPipe");
    distributedFileSystemManager.addPredicates(predicateCounts);

    if (ruleStreams.getHeadStream().getPredicate() == null) {
      // split result to the right locations (for variable predicate)
      Tap source = new Hfs(sinkScheme, output, true);
      Utils.splitStreamPerPredicates(mConfiguration, distributedFileSystemManager, source,
          predicateCounts, resultName, flowIdentificator);

      fs.delete(new Path(output), true);
    }

    distributedFileSystemManager.savePredicateConfig();
    String predicateGroupsTempPath =
        distributedFileSystemManager.getPredicateGroupsTempPath(mConfiguration.resultsName);
    fs.delete(new Path(predicateGroupsTempPath), true);
  }

  return hasNewInferences;
}
/**
 * Creates a new GroupBy instance that will first merge the given pipes, then group on the given
 * groupFields field names.
 *
 * @param groupName   of type String
 * @param lhsPipe     of type Pipe
 * @param rhsPipe     of type Pipe
 * @param groupFields of type Fields
 */
public GroupBy(String groupName, Pipe lhsPipe, Pipe rhsPipe, Fields groupFields) {
  super(groupName, Pipe.pipes(lhsPipe, rhsPipe), groupFields);
}
/**
 * Creates a new GroupBy instance that will first merge the given pipes, then group on the given
 * groupFields field names.
 *
 * @param lhsPipe     of type Pipe
 * @param rhsPipe     of type Pipe
 * @param groupFields of type Fields
 */
public GroupBy(Pipe lhsPipe, Pipe rhsPipe, Fields groupFields) {
  super(Pipe.pipes(lhsPipe, rhsPipe), groupFields);
}
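// Hedged usage sketch for the merge-then-group constructors above: two upstream
// branches are merged and grouped on a shared field. The pipe, group, and field
// names are illustrative assumptions, not from the source.
static Pipe groupByUsageSketch() {
  Pipe lhs = new Pipe("lhs");
  Pipe rhs = new Pipe("rhs");
  return new GroupBy("merged", lhs, rhs, new Fields("word"));
}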
@Override
HadoopFlow createFlow() throws IOException {
  // copy flowDef
  FlowDef def = FlowDef.flowDef();

  if (flowDef != null) {
    def.addSinks(flowDef.getSinksCopy())
        .addSources(flowDef.getSourcesCopy())
        .addTraps(flowDef.getTrapsCopy())
        .addTails(flowDef.getTailsArray())
        .setAssertionLevel(flowDef.getAssertionLevel())
        .setDebugLevel(flowDef.getDebugLevel())
        .addCheckpoints(flowDef.getCheckpointsCopy())
        .addTags(flowDef.getTags())
        .setName(flowDef.getName());
  }

  Set<Pipe> heads = new LinkedHashSet<Pipe>();

  if (tails != null) {
    for (Pipe pipe : tails) {
      Collections.addAll(heads, pipe.getHeads());
    }
  }

  Pipe pipe = null;

  if (heads.size() == 1) {
    pipe = heads.iterator().next();
  }

  if (sources != null && sources.size() == 1) {
    Tap tap = sources.remove(MARKER);
    if (tap != null) {
      sources.put(pipe.getName(), tap);
    }
  }

  if (sinks != null && sinks.size() == 1) {
    Tap tap = sinks.remove(MARKER);
    if (tap != null) {
      sinks.put(pipe.getName(), tap);
    }
  }

  def.addSources(sources).addSinks(sinks).addTraps(traps);

  if (tails != null) {
    def.addTails(tails);
  }

  if (StringUtils.hasText(beanName)) {
    def.addTag(beanName);

    if (!StringUtils.hasText(def.getName())) {
      def.setName(beanName);
    }
  }

  Configuration cfg = ConfigurationUtils.createFrom(configuration, properties);
  Properties props = ConfigurationUtils.asProperties(cfg);

  if (jarSetup) {
    if (jar != null) {
      AppProps.setApplicationJarPath(props, ResourceUtils.decode(jar.getURI().toString()));
    } else if (jarClass != null) {
      AppProps.setApplicationJarClass(props, jarClass);
    } else {
      // auto-detection based on the classpath
      ClassLoader cascadingCL = Cascade.class.getClassLoader();
      Resource cascadingCore = ResourceUtils.findContainingJar(Cascade.class);
      Resource cascadingHadoop =
          ResourceUtils.findContainingJar(cascadingCL, "cascading/flow/hadoop/HadoopFlow.class");
      // find jgrapht
      Resource jgrapht = ResourceUtils.findContainingJar(cascadingCL, "org/jgrapht/Graph.class");

      Assert.notNull(cascadingCore, "Cannot find cascading-core.jar");
      Assert.notNull(cascadingHadoop, "Cannot find cascading-hadoop.jar");
      Assert.notNull(jgrapht, "Cannot find jgraphts-jdk.jar");

      if (log.isDebugEnabled()) {
        log.debug("Auto-detecting Cascading Libs ["
            + Arrays.toString(new Resource[] {cascadingCore, cascadingHadoop, jgrapht}) + "]");
      }

      ConfigurationUtils.addLibs(cfg, cascadingCore, cascadingHadoop, jgrapht);

      // config changed, reinit properties
      props = ConfigurationUtils.asProperties(cfg);
    }
  }

  if (jobPoolingInterval != null) {
    FlowProps.setJobPollingInterval(props, jobPoolingInterval);
  }

  if (maxConcurrentSteps != null) {
    FlowProps.setMaxConcurrentSteps(props, maxConcurrentSteps);
  }

  HadoopFlow flow = (HadoopFlow) new HadoopFlowConnector(props).connect(def);

  return flow;
}