/**
 * Returns the taps for previously stored inferences, keyed by literal id
 * (or "main" when predicate indexing is disabled). Paths that do not exist
 * yet are simply skipped.
 */
private Map<String, Tap> getInferencesTap(Scheme scheme) {
  Map<String, Tap> inferencesTap = new HashMap<String, Tap>();
  try {
    String path = null;
    FileSystem fs = FileSystem.get(mConfiguration.hadoopConfiguration);
    if (mConfiguration.doPredicateIndexing) {
      LiteralFields headStream = ruleStreams.getHeadStream();
      path = distributedFileSystemManager.getInferencesPath(headStream);
      if (fs.exists(new Path(path))) {
        inferencesTap.put(headStream.getId().toString(), new Hfs(scheme, path));
      }
      for (LiteralFields fields : ruleStreams.getBodyStreams()) {
        path = distributedFileSystemManager.getInferencesPath(fields);
        if (fs.exists(new Path(path))) {
          inferencesTap.put(fields.getId().toString(), new Hfs(scheme, path));
        }
      }
    } else {
      path = distributedFileSystemManager.getInferencesPath();
      if (fs.exists(new Path(path))) {
        inferencesTap.put("main", new Hfs(scheme, path));
      }
    }
  } catch (IOException e) {
    logger.error("io exception", e);
    throw new RuntimeException("io exception", e);
  }
  return inferencesTap;
}
/**
 * Evaluates this flow assembly.
 *
 * @param evaluationContext the evaluation context (stratum, iteration, rule number)
 * @return true if new inferences were stored, false otherwise
 */
public boolean evaluate(EvaluationContext evaluationContext) {
  this.distributedFileSystemManager = new DistributedFileSystemManager(mConfiguration);
  String flowIdentificator =
      "_"
          + evaluationContext.getStratumNumber()
          + "_"
          + evaluationContext.getIterationNumber()
          + "_"
          + evaluationContext.getRuleNumber();
  String resultName =
      mConfiguration.resultsName != null ? mConfiguration.resultsName : "inference";
  if (ruleStreams.getHeadStream().getPredicate() != null || !mConfiguration.doPredicateIndexing) {
    path =
        distributedFileSystemManager.getInferencesPath(
            ruleStreams.getHeadStream(), resultName, flowIdentificator);
  } else {
    path = distributedFileSystemManager.getTempInferencesPath(resultName, flowIdentificator);
  }
  try {
    return processFlow(resultName, flowIdentificator, path);
  } catch (IOException e) {
    logger.error("io exception creating flow", e);
    throw new RuntimeException("io exception creating flow", e);
  }
}
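// A minimal sketch of how evaluate() is typically driven (not part of this
// class; the surrounding loop and the EvaluationContext constructor shown
// here are assumptions): the caller re-evaluates the rule once per iteration
// until a fixpoint is reached, i.e. until no new inferences are stored.
//
//   boolean newInferences = true;
//   int iteration = 0;
//   while (newInferences) {
//     newInferences = flowAssembly.evaluate(
//         new EvaluationContext(stratumNumber, iteration++, ruleNumber));
//   }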
/*
 * Prepares the source taps, one MultiSourceTap per stream.
 */
private Map<String, Tap> prepareSourceTaps() {
  SequenceFile sourceScheme = new SequenceFile(fields);
  Map<String, List<Tap>> sources = new HashMap<String, List<Tap>>();
  if (mConfiguration.doPredicateIndexing) {
    LiteralFields headStream = ruleStreams.getHeadStream();
    prepareIndexedSource(sourceScheme, sources, headStream);
    for (LiteralFields fields : ruleStreams.getBodyStreams()) {
      prepareIndexedSource(sourceScheme, sources, fields);
    }
  } else {
    Tap factsTap = new Hfs(sourceScheme, distributedFileSystemManager.getFactsPath());
    sources.put("main", new ArrayList<Tap>());
    sources.get("main").add(factsTap);
    Map<String, Tap> inferencesTaps = getInferencesTap(sourceScheme);
    if (inferencesTaps.containsKey("main")) {
      sources.get("main").add(inferencesTaps.get("main"));
    }
  }
  Map<String, Tap> sourceTaps = new HashMap<String, Tap>();
  for (String name : sources.keySet()) {
    // we can assume that the number of fields is the same as the head's tuple size + 1 (the predicate)
    sourceTaps.put(name, new MultiSourceTap(sources.get(name).toArray(new Tap[0])));
  }
  return sourceTaps;
}
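// Each entry in the map built above bundles a facts tap with the matching
// inferences tap (when one exists) into a single logical input: Cascading's
// MultiSourceTap presents the concatenation of its child taps as one source,
// so downstream pipes read facts and previously stored inferences uniformly.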
/*
 * Puts into the sources the taps for the predicate-indexed storage.
 */
private void prepareIndexedSource(
    SequenceFile sourceScheme, Map<String, List<Tap>> sources, LiteralFields fields) {
  IPredicate predicate = fields.getPredicate();
  String literalId = fields.getId().toString();
  sources.put(literalId, new ArrayList<Tap>());
  if (predicate == null) {
    sources
        .get(literalId)
        .add(new Hfs(sourceScheme, distributedFileSystemManager.getFactsPath()));
  } else {
    sources
        .get(literalId)
        .add(new Hfs(sourceScheme, distributedFileSystemManager.getFactsPath(fields)));
  }
  Map<String, Tap> inferencesTaps = getInferencesTap(sourceScheme);
  if (inferencesTaps.containsKey(literalId)) {
    sources.get(literalId).add(inferencesTaps.get(literalId));
  }
}
/*
 * Creates a pipe for the predicate counts and registers its sink.
 */
private void setupPredicateCounts(Pipe pipe, Map<String, Tap> sinks, List<Pipe> pipes)
    throws IOException {
  String predicateGroupsTempPath =
      distributedFileSystemManager.getPredicateGroupsTempPath(mConfiguration.resultsName);
  FileSystem fs = FileSystem.get(mConfiguration.hadoopConfiguration);
  if (fs.exists(new Path(predicateGroupsTempPath))) {
    fs.delete(new Path(predicateGroupsTempPath), true);
  }
  Tap predicatesSink = new Hfs(new Fields(0, 1), predicateGroupsTempPath);
  Pipe predicatesPipe = Utils.buildPredicateCountPipe(pipe);
  sinks.put("predicatesPipe", predicatesSink);
  pipes.add(predicatesPipe);
}
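// Note the name coupling: the sink is registered under "predicatesPipe",
// which must match both the name of the tail pipe built by
// Utils.buildPredicateCountPipe() (Cascading matches sink-map keys to tail
// pipe names) and the name that processFlow() later passes to
// Utils.readPredicateCounts(flow, "predicatesPipe").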
/*
 * Creates and processes a flow identified by {@code flowIdentificator}.
 * Results are stored at {@code output} under the result named {@code resultName}.
 */
private boolean processFlow(String resultName, String flowIdentificator, String output)
    throws IOException {
  boolean hasNewInferences = false;
  String flowName = resultName + flowIdentificator;
  Map<String, Tap> sources = prepareSourceTaps();
  SequenceFile sinkScheme = new SequenceFile(fields);
  // sinkScheme.setNumSinkParts(1); // FIXME
  Tap headSink = new Hfs(sinkScheme, output, true);
  Map<String, Tap> sinks = new HashMap<String, Tap>();
  List<Pipe> pipes = new ArrayList<Pipe>();
  sinks.put(pipe.getName(), headSink);
  pipes.add(pipe);
  if (mConfiguration.doPredicateIndexing) {
    // calculate the count of the result and write it in the configuration;
    // if the predicate is a variable we also have to split the result and
    // store each part in the right location
    setupPredicateCounts(pipe, sinks, pipes);
  }
  flow =
      new FlowConnector(mConfiguration.flowProperties)
          .connect(flowName, sources, sinks, pipes.toArray(new Pipe[0]));
  // flow.writeDOT("flow.dot"); // useful when debugging the assembly
  flow.complete();

  try {
    TupleEntryIterator iterator = flow.openSink(pipe.getName());
    if (iterator.hasNext()) {
      hasNewInferences = true;
    }
    iterator.close();
  } catch (IOException e) {
    logger.error("io exception", e);
    throw new RuntimeException("io exception", e);
  }

  if (!hasNewInferences) {
    deleteResults(new Path(path));
  } else {
    // FIXME merge part files
    FileSystem fs = FileSystem.get(mConfiguration.hadoopConfiguration);
    // delete empty results (these can come from reducers that ran on no data)
    int index = 0;
    while (true) {
      String file = path + "/part-" + String.format("%05d", index);
      Path filePath = new Path(file);
      if (!fs.exists(filePath)) {
        break;
      }
      Tap source = new Hfs(new Fields(0, 1, 2), file);
      TupleEntryIterator tei = source.openForRead(mConfiguration.jobConf);
      boolean noData = !tei.hasNext();
      tei.close();
      if (noData) {
        logger.info("delete empty result : " + file);
        fs.delete(filePath, false);
      }
      index++;
    }
  }

  if (hasNewInferences && mConfiguration.doPredicateIndexing) {
    FileSystem fs = FileSystem.get(mConfiguration.hadoopConfiguration);
    // update the predicate counts in the configuration
    List<PredicateCount> predicateCounts = Utils.readPredicateCounts(flow, "predicatesPipe");
    distributedFileSystemManager.addPredicates(predicateCounts);
    if (ruleStreams.getHeadStream().getPredicate() == null) {
      // split the result to the right locations (the head has a variable predicate)
      Tap source = new Hfs(sinkScheme, output, true);
      Utils.splitStreamPerPredicates(
          mConfiguration,
          distributedFileSystemManager,
          source,
          predicateCounts,
          resultName,
          flowIdentificator);
      fs.delete(new Path(output), true);
    }
    distributedFileSystemManager.savePredicateConfig();
    String predicateGroupsTempPath =
        distributedFileSystemManager.getPredicateGroupsTempPath(mConfiguration.resultsName);
    fs.delete(new Path(predicateGroupsTempPath), true);
  }
  return hasNewInferences;
}
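// Background on the cleanup loop above: Hadoop reducers write their output as
// sequentially numbered part files (part-00000, part-00001, ...), so probing
// indices in order and stopping at the first missing file visits every part
// file exactly once. Parts that yield no tuples (reducers that received no
// data) are deleted, leaving only non-empty results under the output path.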