Example #1
 public Map<String, Pipe> getTailsByName() {
   Map<String, Pipe> tails = Maps.newHashMap();
   for (Pipe pipe : getTails()) {
     tails.put(pipe.getName(), pipe);
   }
   return tails;
 }
Example #2
  private static void collectNames(Pipe[] pipes, Set<String> names) {
    for (Pipe pipe : pipes) {
      if (pipe instanceof SubAssembly)
        names.addAll(Arrays.asList(((SubAssembly) pipe).getTailNames()));
      else names.add(pipe.getName());

      collectNames(SubAssembly.unwind(pipe.getPrevious()), names);
    }
  }
Example #3
  /**
   * Method getHeads returns the first Pipe instances in this pipe assembly.
   *
   * @return the first Pipe instances (type Pipe[]) of this pipe assembly
   */
  public Pipe[] getHeads() {
    Pipe[] pipes = getPrevious();

    if (pipes.length == 0) return new Pipe[] {this};

    if (pipes.length == 1) return pipes[0].getHeads();

    Set<Pipe> heads = new HashSet<Pipe>();

    for (Pipe pipe : pipes) Collections.addAll(heads, pipe.getHeads());

    return heads.toArray(new Pipe[heads.size()]);
  }
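A minimal usage sketch of getHeads(): walking back from a merged tail surfaces every head exactly once. The pipe names and grouping field below are hypothetical, assuming a Cascading 2.x classpath.

import cascading.pipe.GroupBy;
import cascading.pipe.Pipe;
import cascading.tuple.Fields;

public class HeadsSketch {
  public static void main(String[] args) {
    // two independent heads feeding a single merge/group
    Pipe lhs = new Pipe("lhs");
    Pipe rhs = new Pipe("rhs");
    Pipe tail = new GroupBy("merged", lhs, rhs, new Fields("key"));

    // getHeads() recurses through getPrevious() and de-duplicates via a Set,
    // so both original heads come back
    for (Pipe head : tail.getHeads())
      System.out.println(head.getName()); // prints "lhs" and "rhs" (order not guaranteed)
  }
}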
Example #4
  /**
   * Get the name of this pipe. Guaranteed non-null.
   *
   * @return String the name of this pipe
   */
  public String getName() {
    if (name != null) return name;

    if (previous != null) return previous.getName();

    return "ANONYMOUS";
  }
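A small sketch of the fallback order above: an operator built without its own name reports the name of its previous pipe. The Each/Identity pairing and the pipe name are illustrative only.

import cascading.operation.Identity;
import cascading.pipe.Each;
import cascading.pipe.Pipe;
import cascading.tuple.Fields;

public class NameFallbackSketch {
  public static void main(String[] args) {
    Pipe events = new Pipe("events");

    // this Each carries no explicit name, so getName() falls back to "events"
    Pipe step = new Each(events, Fields.ALL, new Identity());

    System.out.println(step.getName()); // "events"
  }
}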
 /**
  * Opens the sink of the flow. Only valid after evaluation has completed.
  *
  * @return a tuple entry iterator
  * @throws IOException
  */
 public TupleEntryIterator openSink() throws IOException {
   if (flow == null) {
     return null;
   }
   return flow.openSink(pipe.getName());
   // Hfs hfs = new Hfs(Fields.ALL, path);
   // return hfs.openForRead(mConfiguration.jobConf);
 }
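A hedged sketch of how the returned iterator is typically consumed once the flow has completed; the class and method names here are made up.

import java.io.IOException;

import cascading.flow.Flow;
import cascading.tuple.TupleEntryIterator;

public class SinkSketch {
  /** Prints every TupleEntry the completed flow wrote to the sink bound to the given tail name. */
  static void dumpSink(Flow flow, String tailName) throws IOException {
    TupleEntryIterator iterator = flow.openSink(tailName);
    try {
      while (iterator.hasNext())
        System.out.println(iterator.next());
    } finally {
      iterator.close();
    }
  }
}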
Example #6
  static Pipe resolvePrevious(Pipe pipe) {
    if (pipe instanceof Group || pipe instanceof Operator) return pipe;

    Pipe[] pipes = pipe.getPrevious();

    if (pipes.length > 1)
      throw new IllegalStateException(
          "cannot resolve SubAssemblies with multiple tails at this time");

    for (Pipe previous : pipes) {
      if (previous instanceof Group || previous instanceof Operator) return previous;

      return resolvePrevious(previous);
    }

    return pipe;
  }
Example #7
File: Unique.java  Project: bs1/cascading
 /**
  * Constructor Unique creates a new Unique instance.
  *
  * @param name of type String
  * @param pipe of type Pipe
  * @param uniqueFields of type Fields
  * @param include of type Include
  * @param threshold of type int
  */
 @ConstructorProperties({"name", "pipe", "uniqueFields", "include", "threshold"})
 public Unique(String name, Pipe pipe, Fields uniqueFields, Include include, int threshold) {
   this(name, Pipe.pipes(pipe), uniqueFields, include, threshold);
 }
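A brief construction sketch with invented pipe and field names. It uses the simpler name/pipe/uniqueFields overload that the stock cascading.pipe.assembly.Unique also provides; the Include and threshold arguments documented above are extra tuning knobs of the fuller constructor.

import cascading.pipe.Pipe;
import cascading.pipe.assembly.Unique;
import cascading.tuple.Fields;

public class UniqueSketch {
  public static void main(String[] args) {
    Pipe events = new Pipe("events");

    // keep one tuple per distinct "url" value; in the stock implementation the extra
    // Include/threshold arguments control null handling and a duplicate-filter flush threshold
    Pipe distinct = new Unique("distinct-urls", events, new Fields("url"));

    // `distinct` can now feed a GroupBy, further Each steps, or a sink tap
  }
}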
   /**
    * Creates and processes a flow identified by {@code flowIdentificator}; the results are
    * stored at {@code output} under the result named {@code resultName}.
    */
  private boolean processFlow(String resultName, String flowIdentificator, String output)
      throws IOException {
    boolean hasNewInferences = false;
    String flowName = resultName + flowIdentificator;
    Map<String, Tap> sources = prepareSourceTaps();

    SequenceFile sinkScheme = new SequenceFile(fields);
    // sinkScheme.setNumSinkParts(1); //FIXME
    Tap headSink = new Hfs(sinkScheme, output, true);

    Map<String, Tap> sinks = new HashMap<String, Tap>();
    List<Pipe> pipes = new ArrayList<Pipe>();
    sinks.put(pipe.getName(), headSink);
    pipes.add(pipe);
    if (mConfiguration.doPredicateIndexing) {
      // calculate the count of the result and write it in the configuration
      // if the predicate is a variable then we have to split also the result and put it in the
      // right location
      setupPredicateCounts(pipe, sinks, pipes);
    }

    flow =
        new FlowConnector(mConfiguration.flowProperties)
            .connect(flowName, sources, sinks, pipes.toArray(new Pipe[0]));
    if (flow != null) {
      // flow.writeDOT("flow.dot");
    }
    flow.complete();

    try {
      TupleEntryIterator iterator = flow.openSink(pipe.getName());
      if (iterator.hasNext()) {
        hasNewInferences = true;
      }
      iterator.close();
    } catch (IOException e) {
      logger.error("io exception", e);
      throw new RuntimeException("io exception", e);
    }
    if (!hasNewInferences) {
      deleteResults(new Path(path));
    } else {
      // merge part files FIXME
      FileSystem fs = FileSystem.get(mConfiguration.hadoopConfiguration);

      // delete empty results (could be from reducers running on no data)
      int index = 0;
      while (true) {
        String value = String.valueOf(index);
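        // zero-pad the index to five digits: part-00000, part-00001, ...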
        String file = path + "/" + "part-" + "00000".substring(0, 5 - value.length()) + value;
        Path filePath = new Path(file);
        if (fs.exists(filePath)) {
          Tap source = new Hfs(new Fields(0, 1, 2), file);
          TupleEntryIterator tei = source.openForRead(mConfiguration.jobConf);
          boolean noData = !tei.hasNext();
          tei.close();
          if (noData) {
            logger.info("delete empty result : " + file);
            fs.delete(filePath, false);
          }
        } else {
          break;
        }
        index++;
      }
    }

    if (hasNewInferences && mConfiguration.doPredicateIndexing) {
      FileSystem fs = FileSystem.get(mConfiguration.hadoopConfiguration);

      // update counts in configuration
      List<PredicateCount> predicateCounts = Utils.readPredicateCounts(flow, "predicatesPipe");

      distributedFileSystemManager.addPredicates(predicateCounts);

      if (ruleStreams.getHeadStream().getPredicate() == null) {
        // split result to the right locations (for variable predicate)
        Tap source = new Hfs(sinkScheme, output, true);
        Utils.splitStreamPerPredicates(
            mConfiguration,
            distributedFileSystemManager,
            source,
            predicateCounts,
            resultName,
            flowIdentificator);

        fs.delete(new Path(output), true);
      }

      distributedFileSystemManager.savePredicateConfig();
      String predicateGroupsTempPath =
          distributedFileSystemManager.getPredicateGroupsTempPath(mConfiguration.resultsName);
      fs.delete(new Path(predicateGroupsTempPath), true);
    }

    return hasNewInferences;
  }
Example #9
 /**
  * Creates a new GroupBy instance that will first merge the given pipes, then group on the given
  * groupFields field names.
  *
  * @param groupName of type String
  * @param lhsPipe of type Pipe
  * @param rhsPipe of type Pipe
  * @param groupFields of type Fields
  */
 public GroupBy(String groupName, Pipe lhsPipe, Pipe rhsPipe, Fields groupFields) {
   super(groupName, Pipe.pipes(lhsPipe, rhsPipe), groupFields);
 }
Example #10
 /**
  * Creates a new GroupBy instance that will first merge the given pipes, then group on the given
  * groupFields field names.
  *
  * @param lhsPipe of type Pipe
  * @param rhsPipe of type Pipe
  * @param groupFields of type Fields
  */
 public GroupBy(Pipe lhsPipe, Pipe rhsPipe, Fields groupFields) {
   super(Pipe.pipes(lhsPipe, rhsPipe), groupFields);
 }
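A construction sketch for the merging GroupBy with invented stream and field names; the named variant in the previous example only adds an explicit group name.

import cascading.pipe.GroupBy;
import cascading.pipe.Pipe;
import cascading.tuple.Fields;

public class MergeGroupSketch {
  public static void main(String[] args) {
    Pipe clicks = new Pipe("clicks");
    Pipe views = new Pipe("views");

    // merge both incoming streams, then group the union on "user_id"
    // (each source must declare a "user_id" field once taps are bound)
    Pipe byUser = new GroupBy(clicks, views, new Fields("user_id"));

    System.out.println(byUser.getHeads().length); // 2 -- both branches stay reachable from the tail
  }
}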
  @Override
  HadoopFlow createFlow() throws IOException {
    // copy flowDef
    FlowDef def = FlowDef.flowDef();

    if (flowDef != null) {
      def.addSinks(flowDef.getSinksCopy())
          .addSources(flowDef.getSourcesCopy())
          .addTraps(flowDef.getTrapsCopy())
          .addTails(flowDef.getTailsArray())
          .setAssertionLevel(flowDef.getAssertionLevel())
          .setDebugLevel(flowDef.getDebugLevel())
          .addCheckpoints(flowDef.getCheckpointsCopy())
          .addTags(flowDef.getTags())
          .setName(flowDef.getName());
    }

    Set<Pipe> heads = new LinkedHashSet<Pipe>();

    if (tails != null) {
      for (Pipe pipe : tails) {
        Collections.addAll(heads, pipe.getHeads());
      }
    }

    Pipe pipe = null;

    if (heads.size() == 1) {
      pipe = heads.iterator().next();
    }

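    // if exactly one source/sink was registered under the placeholder MARKER key,
    // re-key it to the name of the single head pipe so the FlowDef can resolve it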
    if (sources != null && sources.size() == 1) {
      Tap tap = sources.remove(MARKER);
      if (tap != null) {
        sources.put(pipe.getName(), tap);
      }
    }

    if (sinks != null && sinks.size() == 1) {
      Tap tap = sinks.remove(MARKER);
      if (tap != null) {
        sinks.put(pipe.getName(), tap);
      }
    }

    def.addSources(sources).addSinks(sinks).addTraps(traps);

    if (tails != null) {
      def.addTails(tails);
    }

    if (StringUtils.hasText(beanName)) {
      def.addTag(beanName);

      if (!StringUtils.hasText(def.getName())) {
        def.setName(beanName);
      }
    }

    Configuration cfg = ConfigurationUtils.createFrom(configuration, properties);
    Properties props = ConfigurationUtils.asProperties(cfg);

    if (jarSetup) {
      if (jar != null) {
        AppProps.setApplicationJarPath(props, ResourceUtils.decode(jar.getURI().toString()));
      } else if (jarClass != null) {
        AppProps.setApplicationJarClass(props, jarClass);
      } else {
        // auto-detection based on the classpath
        ClassLoader cascadingCL = Cascade.class.getClassLoader();
        Resource cascadingCore = ResourceUtils.findContainingJar(Cascade.class);
        Resource cascadingHadoop =
            ResourceUtils.findContainingJar(cascadingCL, "cascading/flow/hadoop/HadoopFlow.class");
        // find jgrapht
        Resource jgrapht = ResourceUtils.findContainingJar(cascadingCL, "org/jgrapht/Graph.class");

        Assert.notNull(cascadingCore, "Cannot find cascading-core.jar");
        Assert.notNull(cascadingHadoop, "Cannot find cascading-hadoop.jar");
        Assert.notNull(jgrapht, "Cannot find jgraphts-jdk.jar");

        if (log.isDebugEnabled()) {
          log.debug(
              "Auto-detecting Cascading Libs ["
                  + Arrays.toString(new Resource[] {cascadingCore, cascadingHadoop, jgrapht})
                  + "]");
        }

        ConfigurationUtils.addLibs(cfg, cascadingCore, cascadingHadoop, jgrapht);

        // config changed, reinit properties
        props = ConfigurationUtils.asProperties(cfg);
      }
    }

    if (jobPoolingInterval != null) {
      FlowProps.setJobPollingInterval(props, jobPoolingInterval);
    }

    if (maxConcurrentSteps != null) {
      FlowProps.setMaxConcurrentSteps(props, maxConcurrentSteps);
    }

    HadoopFlow flow = (HadoopFlow) new HadoopFlowConnector(props).connect(def);

    return flow;
  }