/**
 * Creates a pipe that counts predicates and registers it, together with its
 * sink, in the given sink map and pipe list.
 */
private void setupPredicateCounts(Pipe pipe, Map<String, Tap> sinks, List<Pipe> pipes) throws IOException {
    String predicateGroupsTempPath =
        distributedFileSystemManager.getPredicateGroupsTempPath(mConfiguration.resultsName);

    // clear any leftover output from a previous run
    FileSystem fs = FileSystem.get(mConfiguration.hadoopConfiguration);
    if (fs.exists(new Path(predicateGroupsTempPath))) {
        fs.delete(new Path(predicateGroupsTempPath), true);
    }

    Tap predicatesSink = new Hfs(new Fields(0, 1), predicateGroupsTempPath);
    Pipe predicatesPipe = Utils.buildPredicateCountPipe(pipe);
    sinks.put("predicatesPipe", predicatesSink);
    pipes.add(predicatesPipe);
}
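/*
 * Hedged sketch, not part of the original class: Utils.buildPredicateCountPipe
 * is defined elsewhere, but in Cascading 1.x it could plausibly be a
 * group-and-count over the predicate field, along these lines. The field
 * position (1) and the output field name ("count") are assumptions. Note that
 * FlowConnector.connect(...) binds sinks to tail pipes by name, so the pipe
 * must actually be named "predicatesPipe" for the sinks.put("predicatesPipe",
 * predicatesSink) entry above to resolve.
 */
private static Pipe buildPredicateCountPipeSketch(Pipe head) {
    Pipe counts = new Pipe("predicatesPipe", head);             // name matches the sink key above
    counts = new GroupBy(counts, new Fields(1));                // group tuples by predicate (assumed field 1)
    counts = new Every(counts, new Count(new Fields("count"))); // append a count per predicate group
    return counts;
}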
/**
 * Creates and processes a flow identified by {@code flowIdentificator};
 * results are stored at {@code output} under the result named {@code resultName}.
 *
 * @return whether the flow produced any new inferences
 */
private boolean processFlow(String resultName, String flowIdentificator, String output) throws IOException {
    boolean hasNewInferences = false;
    String flowName = resultName + flowIdentificator;

    Map<String, Tap> sources = prepareSourceTaps();
    SequenceFile sinkScheme = new SequenceFile(fields);
    // sinkScheme.setNumSinkParts(1); // FIXME
    Tap headSink = new Hfs(sinkScheme, output, true);

    Map<String, Tap> sinks = new HashMap<String, Tap>();
    List<Pipe> pipes = new ArrayList<Pipe>();
    sinks.put(pipe.getName(), headSink);
    pipes.add(pipe);

    if (mConfiguration.doPredicateIndexing) {
        // Calculate the counts of the result and record them in the configuration.
        // If the predicate is a variable, the result also has to be split and
        // written to the right locations.
        setupPredicateCounts(pipe, sinks, pipes);
    }

    flow = new FlowConnector(mConfiguration.flowProperties)
        .connect(flowName, sources, sinks, pipes.toArray(new Pipe[0]));
    // flow.writeDOT("flow.dot"); // uncomment to dump the flow graph for debugging
    flow.complete();

    // the flow produced new inferences iff its head sink is non-empty
    try {
        TupleEntryIterator iterator = flow.openSink(pipe.getName());
        if (iterator.hasNext()) {
            hasNewInferences = true;
        }
        iterator.close();
    } catch (IOException e) {
        logger.error("io exception", e);
        throw new RuntimeException("io exception", e);
    }

    if (!hasNewInferences) {
        deleteResults(new Path(path));
    } else {
        // merge part files FIXME
        FileSystem fs = FileSystem.get(mConfiguration.hadoopConfiguration);
        // Delete empty results (they can come from reducers that received no
        // data). Part files are named part-00000, part-00001, ... so scan
        // upward until the first missing index.
        int index = 0;
        while (true) {
            String value = String.valueOf(index);
            // zero-pad the index to five digits, e.g. part-00007
            String file = path + "/" + "part-" + "00000".substring(0, 5 - value.length()) + value;
            Path filePath = new Path(file);
            if (!fs.exists(filePath)) {
                break;
            }
            Tap source = new Hfs(new Fields(0, 1, 2), file);
            TupleEntryIterator tei = source.openForRead(mConfiguration.jobConf);
            boolean noData = !tei.hasNext();
            tei.close();
            if (noData) {
                logger.info("delete empty result : " + file);
                fs.delete(filePath, false);
            }
            index++;
        }
    }

    if (hasNewInferences && mConfiguration.doPredicateIndexing) {
        FileSystem fs = FileSystem.get(mConfiguration.hadoopConfiguration);
        // update the predicate counts in the configuration
        List<PredicateCount> predicateCounts = Utils.readPredicateCounts(flow, "predicatesPipe");
        distributedFileSystemManager.addPredicates(predicateCounts);

        if (ruleStreams.getHeadStream().getPredicate() == null) {
            // The head predicate is a variable: split the result to the right
            // location for each predicate, then drop the unsplit output.
            Tap source = new Hfs(sinkScheme, output, true);
            Utils.splitStreamPerPredicates(mConfiguration, distributedFileSystemManager, source,
                predicateCounts, resultName, flowIdentificator);
            fs.delete(new Path(output), true);
        }

        distributedFileSystemManager.savePredicateConfig();
        String predicateGroupsTempPath =
            distributedFileSystemManager.getPredicateGroupsTempPath(mConfiguration.resultsName);
        fs.delete(new Path(predicateGroupsTempPath), true);
    }

    return hasNewInferences;
}
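/*
 * Hedged usage sketch, not part of the original class: the hasNewInferences
 * return value suggests processFlow is meant to be driven to a fixpoint,
 * re-running the flow until an iteration derives nothing new. A caller could
 * iterate along these lines; the output path scheme and the
 * mConfiguration.resultsPath field are assumptions, not part of this class.
 */
private void runToFixpointSketch(String resultName) throws IOException {
    int iteration = 0;
    boolean newInferences;
    do {
        String flowIdentificator = "_" + iteration++;
        String output = mConfiguration.resultsPath + "/" + resultName + flowIdentificator;
        newInferences = processFlow(resultName, flowIdentificator, output);
    } while (newInferences); // stop once an iteration adds no new inferences
}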