@Test
  public void testInFlow() throws Exception {
    FileSystem fs = FileSystem.get(new Configuration());
    fs.delete(new Path(TMP_DIR), true);

    Hfs input =
        new Hfs(new SequenceFile(new Fields("constant", "first", "second")), TMP_DIR + "/inputs");
    TupleEntryCollector collector = input.openForWrite(new HadoopFlowProcess());
    collector.add(new Tuple("constant 1", "a", "b"));
    collector.add(new Tuple("constant 2", "c", "d"));
    collector.close();

    Hfs output = new Hfs(new SequenceFile(new Fields("constant", "output")), TMP_DIR + "/outputs");

    Pipe pipe =
        Pump.prime()
            .each(new Extrude("output"), "first", "second")
            .retain("constant", "output")
            .toPipe();
    FlowDef flow = new FlowDef().addSource("input", input).addTailSink(pipe, output);
    CascadingHelper.setTestMode();
    CascadingHelper.get().getFlowConnector().connect(flow).complete();

    List<String> results = new ArrayList<String>();
    TupleEntryIterator iterator = output.openForRead(new HadoopFlowProcess());
    while (iterator.hasNext()) {
      TupleEntry tupleEntry = iterator.next();
      results.add(tupleEntry.getString(0) + "\t" + tupleEntry.getString(1));
    }
    iterator.close();
    assertEquals(
        Arrays.asList("constant 1\ta", "constant 1\tb", "constant 2\tc", "constant 2\td"), results);
  }
 static void setObject(TupleEntry entry, Comparable<?> field, Object object) {
   if (object != null && entry.getFields().getType(field) instanceof CoercibleType) {
     entry.setObject(field, object.toString());
   } else {
     entry.setObject(field, object);
   }
 }
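The helper above stores a value's string form whenever the target field is declared with a CoercibleType, and the raw object otherwise. A minimal usage sketch, assuming Cascading's DateType as the coercible type (imports: cascading.tuple.Fields/Tuple/TupleEntry, cascading.tuple.type.DateType, java.util.Date, java.util.TimeZone); the field names, format, and values are illustrative:

  // sketch: "ts" is backed by a CoercibleType (DateType); "name" has no declared type
  static void setObjectSketch() {
    Fields fields =
        new Fields("ts", new DateType("yyyy-MM-dd", TimeZone.getTimeZone("UTC")))
            .append(new Fields("name"));
    TupleEntry entry = new TupleEntry(fields, Tuple.size(fields.size()));

    setObject(entry, "ts", new Date());   // the DateType slot gets the Date's toString() form
    setObject(entry, "name", new Date()); // the untyped slot gets the Date object itself
  }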
Example #3
  @Override
  public void receive(Duct previous, final Grouping<TupleEntry, TupleEntryIterator> grouping) {
    try {
      // we want to null out any 'values' before and after the iterator begins/ends;
      // this allows buffers to emit tuples before next() and after hasNext() returns false
      final TupleEntry tupleEntry = grouping.joinIterator.getTupleEntry();
      incomingEntry = tupleEntry;

      // if Fields.NONE is declared on the CoGroup, we don't provide arguments, only the
      // joinerClosure
      if (!tupleEntry.getFields().isNone()) {
        final Tuple valueNulledTuple = Tuples.setOnEmpty(tupleEntry, grouping.key);
        tupleEntry.setTuple(valueNulledTuple);

        operationCall.setArgumentsIterator(
            createArgumentsIterator(grouping, tupleEntry, valueNulledTuple));
      }

      operationCall.setOutputCollector(outputCollector);
      operationCall.setJoinerClosure(grouping.joinerClosure);
      operationCall.setGroup(grouping.key);

      buffer.operate(flowProcess, operationCall);
    } catch (CascadingException exception) {
      handleException(exception, argumentsEntry);
    } catch (Throwable throwable) {
      handleException(
          new OperatorException(
              every,
              "operator Every failed executing operation: " + every.getOperation(),
              throwable),
          argumentsEntry);
    }
  }
Example #4
  private void writeObject(ObjectOutputStream s) throws IOException {
    // Make sure anything in memory has been flushed to _tupleEntry
    commit();

    TupleEntry te = getTupleEntry();
    s.writeObject(te.getFields());
    s.writeObject(te.getTuple());
  }
 @Override
 public void aggregate(FlowProcess flowProcess, AggregatorCall aggregatorCall) {
   TupleEntry entry = aggregatorCall.getArguments();
   if (entry.getInteger(0) < min) {
     min = entry.getInteger(0);
   }
   if (entry.getInteger(1) > max) {
     max = entry.getInteger(1);
   }
 }
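The aggregate() override above is only the per-tuple step; a Cascading Aggregator also needs start() and complete(). A hedged sketch of how the enclosing operation might look, mirroring the snippet's use of instance fields (the class name, declared output fields, and emitted tuple are assumptions):

import cascading.flow.FlowProcess;
import cascading.operation.Aggregator;
import cascading.operation.AggregatorCall;
import cascading.operation.BaseOperation;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntry;

public class MinMaxAggregator extends BaseOperation<Void> implements Aggregator<Void> {
  private int min;
  private int max;

  public MinMaxAggregator() {
    super(2, new Fields("min", "max")); // assumed output field names
  }

  @Override
  public void start(FlowProcess flowProcess, AggregatorCall<Void> aggregatorCall) {
    // reset state at the start of each group
    min = Integer.MAX_VALUE;
    max = Integer.MIN_VALUE;
  }

  @Override
  public void aggregate(FlowProcess flowProcess, AggregatorCall<Void> aggregatorCall) {
    TupleEntry entry = aggregatorCall.getArguments();
    min = Math.min(min, entry.getInteger(0));
    max = Math.max(max, entry.getInteger(1));
  }

  @Override
  public void complete(FlowProcess flowProcess, AggregatorCall<Void> aggregatorCall) {
    aggregatorCall.getOutputCollector().add(new Tuple(min, max));
  }
}

In production code the per-group state usually lives in the AggregatorCall context rather than instance fields, since Cascading may reuse one operation instance across parallel pipelines.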
Example #6
  /**
   * Set the data container to be {@code tupleEntry}.
   *
   * @param tupleEntry data for the datum
   * @param checkFields whether to verify that the entry's fields match the current fields
   */
  protected void setTupleEntry(TupleEntry tupleEntry, boolean checkFields) {
    if (checkFields && !tupleEntry.getFields().equals(getFields())) {
      throw new IllegalArgumentException(
          "Fields must be the same as for current value: "
              + tupleEntry.getFields()
              + "/"
              + _tupleEntry.getFields());
    }

    _tupleEntry = tupleEntry;
    reset();
  }
Example #7
  @Override
  public void sink(
      FlowProcess<? extends Properties> flowProcess, SinkCall<PrintWriter, OutputStream> sinkCall)
      throws IOException {
    TupleEntry tupleEntry = sinkCall.getOutgoingEntry();

    Iterable<String> strings = tupleEntry.asIterableOf(String.class);

    delimitedParser.joinLine(strings, sinkCall.getContext());

    sinkCall.getContext().println();
  }
  public void testEvaluation() throws Exception {
    CascadingRuleCompiler crc = new CascadingRuleCompiler(defaultConfiguration);
    IDistributedCompiledRule dcr = crc.compile(rules.get(0));
    dcr.evaluate(new EvaluationContext(1, 1, 1));
    FlowAssembly fa = dcr.getFlowAssembly();

    TupleEntryIterator tei = fa.openSink();
    int size = 0;
    while (tei.hasNext()) {
      TupleEntry te = tei.next();
      logger.info(te.getTuple().toString());
      size++;
    }
    tei.close();
    assertEquals(1, size);
  }
Example #9
 @Override
 public int hashCode() {
   final int prime = 31;
   int result = 1;
   result = prime * result + ((_tupleEntry == null) ? 0 : _tupleEntry.hashCode());
   return result;
 }
Example #10
  public void setTuple(Tuple tuple) {
    if (getFields().size() != tuple.size()) {
      throw new IllegalArgumentException("Size of tuple doesn't match current fields");
    }

    _tupleEntry.setTuple(tuple);
    reset();
  }
Example #11
  @Override
  public boolean equals(Object obj) {
    if (this == obj) return true;
    if (obj == null) return false;
    if (getClass() != obj.getClass()) return false;

    BaseDatum other = (BaseDatum) obj;
    if (_tupleEntry == null) {
      return other._tupleEntry == null;
    } else if (!_tupleEntry.getFields().equals(other._tupleEntry.getFields())) {
      return false;
    } else if (!_tupleEntry.getTuple().equals(other._tupleEntry.getTuple())) {
      return false;
    }

    return true;
  }
Example #12
  /**
   * Writes the outgoing tuple entry to MongoDB as a BSON document, keyed by the configured key
   * column (or a new ObjectId when no key mapping is defined).
   *
   * @param flowProcess the current flow process
   * @param sinkCall provides the outgoing entry and the output collector
   * @throws IOException if writing to the output collector fails
   */
  @Override
  public void sink(
      FlowProcess<JobConf> flowProcess, SinkCall<BSONWritable[], OutputCollector> sinkCall)
      throws IOException {
    TupleEntry tupleEntry = sinkCall.getOutgoingEntry();
    OutputCollector outputCollector = sinkCall.getOutput();

    String keyFieldName = this.fieldMappings.get(this.keyColumnName);
    Object key;

    // if fieldMappings doesn't have keyColumnName ("_id") field, then use new ObjectId() as key
    if (keyFieldName == null) {
      key = new ObjectId();
    } else {
      key = tupleEntry.selectTuple(new Fields(keyFieldName)).get(0);
    }
    // Object key = tupleEntry.selectTuple(new
    // Fields(this.fieldMappings.get(this.keyColumnName))).get(0);

    BasicDBObject dbObject = new BasicDBObject();

    for (String columnFieldName : columnFieldNames) {
      String columnFieldMapping = fieldMappings.get(columnFieldName);
      Object tupleEntryValue = null;

      try {
        // columnFieldMapping is null if no corresponding field name is defined in the mappings;
        // only write the field value back to Mongo if the field is also defined there (i.e. not null)
        if (columnFieldMapping != null) {
          tupleEntryValue = tupleEntry.get(columnFieldMapping);
        }
      } catch (FieldsResolverException e) {
        logger.error("Couldn't resolve field: {}", columnFieldName);
      }

      if (tupleEntryValue != null && !columnFieldName.equals(keyColumnName)) {
        // logger.info("Putting for output: {} {}", columnFieldName, tupleEntryValue);
        dbObject.put(columnFieldName, tupleEntryValue);
      }
    }
    logger.info("Putting key for output: {} {}", key, dbObject);
    // outputCollector.collect(new ObjectId(), dbObject);
    outputCollector.collect(key, dbObject);
  }
Example #13
  public static <T> void populateOutputTupleEntry(
      CombinerDefinition<T> definition, TupleEntry output, Tuple resultTuple) {
    // set the ID so we can differentiate later
    output.setRaw(MultiCombiner.ID_FIELD, definition.getId());

    // our tuples are of the form groupFields+outputFields, set the TupleEntry fields appropriately
    Fields groupFields = definition.getGroupFields();
    int index = 0;
    for (int i = 0; i < groupFields.size(); i++) {
      output.setRaw(groupFields.get(i), resultTuple.getObject(index));
      index++;
    }
    Fields outputFields = definition.getOutputFields();
    for (int i = 0; i < outputFields.size(); i++) {
      output.setRaw(outputFields.get(i), resultTuple.getObject(index));
      index++;
    }
  }
    static Tuple coerceToString(SinkCall<?, ?> sinkCall) {
      TupleEntry entry = sinkCall.getOutgoingEntry();
      Fields fields = entry.getFields();
      Tuple tuple = entry.getTuple();

      if (fields.hasTypes()) {
        Type[] types = new Type[fields.size()];
        for (int index = 0; index < fields.size(); index++) {
          Type type = fields.getType(index);
          if (type instanceof CoercibleType<?>) {
            types[index] = String.class;
          } else {
            types[index] = type;
          }
        }

        tuple = entry.getCoercedTuple(types);
      }
      return tuple;
    }
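The coercion above swaps each CoercibleType slot for String before the tuple reaches the sink; getCoercedTuple() performs the actual conversion. A small hedged illustration, reusing the cascading.tuple classes and java.lang.reflect.Type, and assuming a DateType-backed field (format and value are made up):

  // DateType's canonical form is a Long (epoch millis); coercing to String renders the formatted date
  Fields fields = new Fields("ts", new DateType("yyyy-MM-dd", TimeZone.getTimeZone("UTC")));
  TupleEntry entry = new TupleEntry(fields, new Tuple(0L));
  Tuple coerced = entry.getCoercedTuple(new Type[] {String.class}); // -> ("1970-01-01")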
  @Override
  public void operate(FlowProcess flowProcess, BufferCall<NullContext> bufferCall) {
    TupleEntry group = bufferCall.getGroup();
    String protocolAndDomain = group.getString(0);
    LOGGER.info("Processing tuple group: " + group);

    DiskQueue<GroupedUrlDatum> urls = new DiskQueue<GroupedUrlDatum>(MAX_URLS_IN_MEMORY);
    Iterator<TupleEntry> values = bufferCall.getArgumentsIterator();
    while (values.hasNext()) {
      urls.add(new GroupedUrlDatum(new TupleEntry(values.next())));
    }

    try {
      Runnable doRobots =
          new ProcessRobotsTask(
              protocolAndDomain,
              _scorer,
              urls,
              _fetcher,
              _parser,
              bufferCall.getOutputCollector(),
              _flowProcess);
      _executor.execute(doRobots);
    } catch (RejectedExecutionException e) {
      // should never happen.
      LOGGER.error("Robots handling pool rejected our request for " + protocolAndDomain);
      _flowProcess.increment(FetchCounters.DOMAINS_REJECTED, 1);
      _flowProcess.increment(FetchCounters.URLS_REJECTED, urls.size());
      ProcessRobotsTask.emptyQueue(
          urls, GroupingKey.DEFERRED_GROUPING_KEY, bufferCall.getOutputCollector(), flowProcess);
    } catch (Throwable t) {
      LOGGER.error(
          "Caught an unexpected throwable - robots handling rejected our request for "
              + protocolAndDomain,
          t);
      _flowProcess.increment(FetchCounters.DOMAINS_REJECTED, 1);
      _flowProcess.increment(FetchCounters.URLS_REJECTED, urls.size());
      ProcessRobotsTask.emptyQueue(
          urls, GroupingKey.DEFERRED_GROUPING_KEY, bufferCall.getOutputCollector(), flowProcess);
    }
  }
  @Override
  public void operate(FlowProcess flowProcess, BufferCall bufferCall) {
    Iterator<TupleEntry> it = bufferCall.getArgumentsIterator();
    HyperLogLog merged = null;

    try {
      while (it.hasNext()) {
        TupleEntry tupleEntry = it.next();
        byte[] serialized = (byte[]) tupleEntry.getObject(0);

        HyperLogLog hll = HyperLogLog.Builder.build(serialized);
        if (merged == null) {
          merged = hll;
        } else {
          merged = (HyperLogLog) merged.merge(hll);
        }
      }

      if (merged != null) {
        // assumed: emit the merged estimate so the result isn't silently discarded
        bufferCall.getOutputCollector().add(new Tuple(merged.cardinality()));
      }
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
  }
  @Test
  public void testTempPath() throws Exception {
    BasePlatform platform = new HadoopPlatform(HadoopPlatformTest.class);

    BasePath tempDir = platform.getTempDir();

    // Verify we can write and then read
    BasePath testDir = platform.makePath(tempDir, UUID.randomUUID().toString());

    Scheme scheme = platform.makeBinaryScheme(new Fields("name", "age"));
    Tap tap = platform.makeTap(scheme, testDir);
    TupleEntryCollector writer = tap.openForWrite(platform.makeFlowProcess());
    writer.add(new Tuple("ken", 37));
    writer.close();

    TupleEntryIterator iter = tap.openForRead(platform.makeFlowProcess());
    assertTrue(iter.hasNext());
    TupleEntry te = iter.next();
    assertEquals("ken", te.getString("name"));
    assertFalse(iter.hasNext());
    iter.close();
  }
Example #18
  public void operate(FlowProcess flowProcess, BufferCall<TupleEntryCollector> bufferCall) {
    if (bufferCall.getJoinerClosure() != null)
      throw new IllegalStateException("joiner closure should be null");

    if (insertHeader) bufferCall.getOutputCollector().add(new Tuple(value));

    Iterator<TupleEntry> iterator = bufferCall.getArgumentsIterator();

    while (iterator.hasNext()) {
      TupleEntry arguments = iterator.next(); // must be called

      if (expectedSize != -1 && arguments.size() != expectedSize)
        throw new RuntimeException("arguments wrong size");

      if (path != null) bufferCall.getContext().add(arguments);

      if (value != null) bufferCall.getOutputCollector().add(new Tuple(value));
      else bufferCall.getOutputCollector().add(arguments); // copy
    }

    if (insertFooter) bufferCall.getOutputCollector().add(new Tuple(value));

    iterator.hasNext(); // regression check: calling hasNext() after exhaustion must be safe
  }
  /**
   * Writes the given tuple entry to the sink tap; throws a StackException to flag a hard failure.
   *
   * @param tupleEntry the entry to write to the sink
   */
  private void operateSink(TupleEntry tupleEntry) {
    try {
      if (outputCollector != null) {
        getFlowProcess().keepAlive();
        ((Tap) getFlowElement()).sink(tupleEntry, outputCollector);
      } else {
        ((Tap) getFlowElement()).sink(tupleEntry, lastOutput);
      }

      getFlowProcess().increment(StepCounters.Tuples_Written, 1);
    } catch (OutOfMemoryError error) {
      throw new StackException("out of memory, try increasing task memory allocation", error);
    } catch (IOException exception) {
      throw new StackException("io exception writing to tap: " + sink.toString(), exception);
    } catch (TapException exception) {
      throw new StackException("exception writing to tap: " + sink.toString(), exception);
    } catch (Throwable throwable) {
      if (throwable instanceof CascadingException) throw (CascadingException) throwable;

      throw new FlowException("internal error: " + tupleEntry.getTuple().print(), throwable);
    }
  }
Example #20
 protected void performOperation(Tuple[] context, TupleEntry entry) {
   if (context[0] == null) context[0] = new Tuple(entry.getTuple());
 }
Example #21
 public Fields getFields() {
   return _tupleEntry.getFields();
 }
 protected Tuple getValue(TupleEntry tupleEntry) {
   return tupleEntry.selectTuple(getValueFields());
 }
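selectTuple() extracts just the values for the requested Fields, in the requested order. A minimal illustration with made-up field names and values:

  TupleEntry entry =
      new TupleEntry(new Fields("user", "score", "ts"), new Tuple("ken", 42, 1700000000L));
  Tuple scoreOnly = entry.selectTuple(new Fields("score")); // -> a one-element Tuple holding 42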
Example #23
 public UrlDatum(TupleEntry tupleEntry) {
   super(tupleEntry);
   validateFields(tupleEntry.getFields(), FIELDS);
 }
Example #24
 protected void validateFields(TupleEntry tupleEntry, Fields myFields) {
   if (!tupleEntry.getFields().contains(myFields)) {
     throw new IllegalArgumentException("Fields passed to constructor don't contain " + myFields);
   }
 }
 static void setObject(TupleEntry entry, Comparable<?> field, Object object) {
   entry.setObject(field, object);
 }