@Test
public void testInFlow() throws Exception {
  // Clean any output from prior runs, then stage two input tuples.
  FileSystem fileSystem = FileSystem.get(new Configuration());
  fileSystem.delete(new Path(TMP_DIR), true);

  Hfs source =
      new Hfs(new SequenceFile(new Fields("constant", "first", "second")), TMP_DIR + "/inputs");
  TupleEntryCollector writer = source.openForWrite(new HadoopFlowProcess());
  writer.add(new Tuple("constant 1", "a", "b"));
  writer.add(new Tuple("constant 2", "c", "d"));
  writer.close();

  Hfs sink = new Hfs(new SequenceFile(new Fields("constant", "output")), TMP_DIR + "/outputs");

  // Extrude fans each (first, second) pair out into separate "output" rows.
  Pipe pipe =
      Pump.prime()
          .each(new Extrude("output"), "first", "second")
          .retain("constant", "output")
          .toPipe();

  FlowDef flowDef = new FlowDef().addSource("input", source).addTailSink(pipe, sink);

  CascadingHelper.setTestMode();
  CascadingHelper.get().getFlowConnector().connect(flowDef).complete();

  // Read back the sink and verify every constant/output pairing is present, in order.
  List<String> actual = new ArrayList<String>();
  TupleEntryIterator tuples = sink.openForRead(new HadoopFlowProcess());
  while (tuples.hasNext()) {
    TupleEntry entry = tuples.next();
    actual.add(entry.getString(0) + "\t" + entry.getString(1));
  }
  assertEquals(
      Arrays.asList("constant 1\ta", "constant 1\tb", "constant 2\tc", "constant 2\td"),
      actual);
}
/**
 * Stores {@code object} into {@code entry} at {@code field}, first stringifying the value when
 * the field's declared type is a {@link CoercibleType}.
 */
static void setObject(TupleEntry entry, Comparable<?> field, Object object) {
  boolean coerce =
      object != null && entry.getFields().getType(field) instanceof CoercibleType;
  entry.setObject(field, coerce ? object.toString() : object);
}
/**
 * Runs the wrapped Buffer once for the given grouping: wires up the arguments iterator, output
 * collector, joiner closure and group key on the operation call, then invokes the buffer.
 * Failures are routed through handleException, wrapping non-Cascading throwables in an
 * OperatorException.
 */
@Override
public void receive(Duct previous, final Grouping<TupleEntry, TupleEntryIterator> grouping) {
  try {
    // we want to null out any 'values' before and after the iterator begins/ends
    // this allows buffers to emit tuples before next() and when hasNext() return false;
    final TupleEntry tupleEntry = grouping.joinIterator.getTupleEntry();
    incomingEntry = tupleEntry;

    // if Fields.NONE are declared on the CoGroup, we don't provide arguments, only the
    // joinerClosure
    if (!tupleEntry.getFields().isNone()) {
      // Blank all non-key positions so a buffer never sees stale values outside the iterator.
      final Tuple valueNulledTuple = Tuples.setOnEmpty(tupleEntry, grouping.key);
      tupleEntry.setTuple(valueNulledTuple);
      operationCall.setArgumentsIterator(
          createArgumentsIterator(grouping, tupleEntry, valueNulledTuple));
    }

    // Hand the buffer its collaborators, then run it once for this grouping.
    operationCall.setOutputCollector(outputCollector);
    operationCall.setJoinerClosure(grouping.joinerClosure);
    operationCall.setGroup(grouping.key);

    buffer.operate(flowProcess, operationCall);
  } catch (CascadingException exception) {
    handleException(exception, argumentsEntry);
  } catch (Throwable throwable) {
    handleException(
        new OperatorException(
            every,
            "operator Every failed executing operation: " + every.getOperation(),
            throwable),
        argumentsEntry);
  }
}
private void writeObject(ObjectOutputStream s) throws IOException { // Make sure anything in memory has been flushed to _tupleEntry commit(); TupleEntry te = getTupleEntry(); s.writeObject(te.getFields()); s.writeObject(te.getTuple()); }
/**
 * Tracks a running minimum of argument field 0 and a running maximum of argument field 1 across
 * the group's tuples.
 *
 * @param flowProcess the current flow process (unused)
 * @param aggregatorCall carries the argument TupleEntry for this tuple
 */
@Override
public void aggregate(FlowProcess flowProcess, AggregatorCall aggregatorCall) {
  TupleEntry arguments = aggregatorCall.getArguments();
  // Read each field once (the original fetched and coerced each value twice), and let
  // Math.min/Math.max express the intent directly.
  int first = arguments.getInteger(0);
  int second = arguments.getInteger(1);
  min = Math.min(min, first);
  max = Math.max(max, second);
}
/**
 * Replaces the backing data container with {@code tupleEntry}.
 *
 * @param tupleEntry new data for the datum
 * @param checkFields when true, require the incoming fields to equal the current fields
 * @throws IllegalArgumentException if checkFields is set and the fields differ
 */
protected void setTupleEntry(TupleEntry tupleEntry, boolean checkFields) {
  if (checkFields) {
    Fields incoming = tupleEntry.getFields();
    if (!incoming.equals(getFields())) {
      throw new IllegalArgumentException(
          "Fields must be the same as for current value: "
              + incoming
              + "/"
              + _tupleEntry.getFields());
    }
  }
  _tupleEntry = tupleEntry;
  reset();
}
/**
 * Writes the outgoing tuple as one delimited text line followed by a line terminator.
 *
 * @param flowProcess the current flow process (unused)
 * @param sinkCall carries the outgoing entry and the PrintWriter context
 * @throws IOException declared for the Scheme contract
 */
@Override
public void sink(
    FlowProcess<? extends Properties> flowProcess,
    SinkCall<PrintWriter, OutputStream> sinkCall)
    throws IOException {
  PrintWriter out = sinkCall.getContext();
  Iterable<String> values = sinkCall.getOutgoingEntry().asIterableOf(String.class);
  delimitedParser.joinLine(values, out);
  out.println();
}
/** Compiles the first rule, evaluates it, and asserts exactly one tuple reaches the sink. */
public void testEvaluation() throws Exception {
  CascadingRuleCompiler compiler = new CascadingRuleCompiler(defaultConfiguration);
  IDistributedCompiledRule compiled = compiler.compile(rules.get(0));
  compiled.evaluate(new EvaluationContext(1, 1, 1));

  TupleEntryIterator sinkTuples = compiled.getFlowAssembly().openSink();
  int count = 0;
  while (sinkTuples.hasNext()) {
    // Log each result tuple for debugging, then count it.
    logger.info(sinkTuples.next().getTuple().toString());
    count++;
  }
  assertEquals(1, count);
}
/** Hash code derived solely from the wrapped TupleEntry; a null entry contributes 0. */
@Override
public int hashCode() {
  int entryHash = (_tupleEntry == null) ? 0 : _tupleEntry.hashCode();
  // Equivalent to the conventional prime-seeded form: 31 * 1 + entryHash.
  return 31 + entryHash;
}
/**
 * Replaces the tuple held by the backing entry.
 *
 * @param tuple new values; its arity must match the current fields
 * @throws IllegalArgumentException if the sizes differ
 */
public void setTuple(Tuple tuple) {
  if (tuple.size() != getFields().size()) {
    throw new IllegalArgumentException("Size of tuple doesn't match current fields");
  }
  _tupleEntry.setTuple(tuple);
  reset();
}
/**
 * Two datums are equal when their backing TupleEntries have equal fields and equal tuples (or
 * both entries are null).
 *
 * <p>Bug fixed: the original dereferenced {@code other._tupleEntry} without a null check, so
 * comparing a datum that has an entry against one that doesn't threw NullPointerException
 * instead of returning false.
 */
@Override
public boolean equals(Object obj) {
  if (this == obj) return true;
  if (obj == null || getClass() != obj.getClass()) return false;
  BaseDatum other = (BaseDatum) obj;
  if (_tupleEntry == null) {
    return other._tupleEntry == null;
  }
  if (other._tupleEntry == null) {
    return false; // asymmetric null case previously NPE'd
  }
  return _tupleEntry.getFields().equals(other._tupleEntry.getFields())
      && _tupleEntry.getTuple().equals(other._tupleEntry.getTuple());
}
/** * @param flowProcess * @param sinkCall * @throws IOException */ @Override public void sink( FlowProcess<JobConf> flowProcess, SinkCall<BSONWritable[], OutputCollector> sinkCall) throws IOException { TupleEntry tupleEntry = sinkCall.getOutgoingEntry(); OutputCollector outputCollector = sinkCall.getOutput(); String keyFieldName = this.fieldMappings.get(this.keyColumnName); Object key; // if fieldMappings doesn't have keyColumnName ("_id") field, then use new ObjectId() as key if (keyFieldName == null) { key = new ObjectId(); } else { key = tupleEntry.selectTuple(new Fields(keyFieldName)).get(0); } // Object key = tupleEntry.selectTuple(new // Fields(this.fieldMappings.get(this.keyColumnName))).get(0); BasicDBObject dbObject = new BasicDBObject(); for (String columnFieldName : columnFieldNames) { String columnFieldMapping = fieldMappings.get(columnFieldName); Object tupleEntryValue = null; try { if (columnFieldMapping != null) { // columnFieldMapping is null if no corresponding field name defined in Mappings. // only write the field value back to mongo if the field also defined in Mappings (ie. not // null) tupleEntryValue = tupleEntry.get(columnFieldMapping); } } catch (FieldsResolverException e) { logger.error("Couldn't resolve field: {}", columnFieldName); } if (tupleEntryValue != null && columnFieldName != keyColumnName) { // logger.info("Putting for output: {} {}", columnFieldName, tupleEntryValue); dbObject.put(columnFieldName, tupleEntryValue); } } logger.info("Putting key for output: {} {}", key, dbObject); // outputCollector.collect(new ObjectId(), dbObject); outputCollector.collect(key, dbObject); }
public static <T> void populateOutputTupleEntry( CombinerDefinition<T> definition, TupleEntry output, Tuple resultTuple) { // set the ID so we can differentiate later output.setRaw(MultiCombiner.ID_FIELD, definition.getId()); // our tuples are of the form groupFields+outputFields, set the TupleEntry fields appropriately Fields groupFields = definition.getGroupFields(); int index = 0; for (int i = 0; i < groupFields.size(); i++) { output.setRaw(groupFields.get(i), resultTuple.getObject(index)); index++; } Fields outputFields = definition.getOutputFields(); for (int i = 0; i < outputFields.size(); i++) { output.setRaw(outputFields.get(i), resultTuple.getObject(index)); index++; } }
/**
 * Returns the outgoing tuple with every CoercibleType-declared position coerced to String.
 * Entries whose fields carry no type information are returned unchanged.
 */
static Tuple coerceToString(SinkCall<?, ?> sinkCall) {
  TupleEntry entry = sinkCall.getOutgoingEntry();
  Fields fields = entry.getFields();
  if (!fields.hasTypes()) {
    return entry.getTuple();
  }
  // Build a target-type array where each coercible slot becomes String.
  Type[] targetTypes = new Type[fields.size()];
  for (int i = 0; i < targetTypes.length; i++) {
    Type declared = fields.getType(i);
    targetTypes[i] = (declared instanceof CoercibleType<?>) ? String.class : declared;
  }
  return entry.getCoercedTuple(targetTypes);
}
/**
 * Buffer operator: drains all URLs for one protocol+domain group into a disk-backed queue and
 * hands them to an asynchronous ProcessRobotsTask on the shared executor. If the executor
 * rejects the task (or anything else goes wrong), the group's URLs are re-emitted under the
 * deferred grouping key and rejection counters are bumped.
 */
@Override
public void operate(FlowProcess flowProcess, BufferCall<NullContext> bufferCall) {
  TupleEntry group = bufferCall.getGroup();
  String protocolAndDomain = group.getString(0);
  LOGGER.info("Processing tuple group: " + group);

  // Spill past MAX_URLS_IN_MEMORY to disk so a huge domain can't exhaust the heap.
  DiskQueue<GroupedUrlDatum> urls = new DiskQueue<GroupedUrlDatum>(MAX_URLS_IN_MEMORY);
  Iterator<TupleEntry> values = bufferCall.getArgumentsIterator();
  while (values.hasNext()) {
    // Defensive copy of each entry — presumably because the iterator reuses its TupleEntry;
    // TODO confirm.
    urls.add(new GroupedUrlDatum(new TupleEntry(values.next())));
  }

  try {
    Runnable doRobots =
        new ProcessRobotsTask(
            protocolAndDomain,
            _scorer,
            urls,
            _fetcher,
            _parser,
            bufferCall.getOutputCollector(),
            _flowProcess);
    _executor.execute(doRobots);
  } catch (RejectedExecutionException e) {
    // should never happen.
    LOGGER.error("Robots handling pool rejected our request for " + protocolAndDomain);
    _flowProcess.increment(FetchCounters.DOMAINS_REJECTED, 1);
    _flowProcess.increment(FetchCounters.URLS_REJECTED, urls.size());
    // NOTE(review): counters use the field _flowProcess while emptyQueue gets the method
    // parameter flowProcess — confirm whether mixing the two is intentional.
    ProcessRobotsTask.emptyQueue(
        urls, GroupingKey.DEFERRED_GROUPING_KEY, bufferCall.getOutputCollector(), flowProcess);
  } catch (Throwable t) {
    LOGGER.error(
        "Caught an unexpected throwable - robots handling rejected our request for "
            + protocolAndDomain,
        t);
    _flowProcess.increment(FetchCounters.DOMAINS_REJECTED, 1);
    _flowProcess.increment(FetchCounters.URLS_REJECTED, urls.size());
    ProcessRobotsTask.emptyQueue(
        urls, GroupingKey.DEFERRED_GROUPING_KEY, bufferCall.getOutputCollector(), flowProcess);
  }
}
/**
 * Merges the serialized HyperLogLog sketches of every argument tuple in the group into a single
 * estimator.
 *
 * <p>NOTE(review): the merged result is discarded — {@code merged} goes out of scope when the
 * method returns and bufferCall's output collector is never used. Confirm whether an emit/store
 * step is missing here.
 */
@Override
public void operate(FlowProcess flowProcess, BufferCall bufferCall) {
  Iterator<TupleEntry> it = bufferCall.getArgumentsIterator();
  HyperLogLog merged = null;
  try {
    while (it.hasNext()) {
      TupleEntry tupleEntry = it.next();
      // Field 0 carries a serialized HLL sketch.
      byte[] serialized = (byte[]) tupleEntry.getObject(0);
      HyperLogLog hll = HyperLogLog.Builder.build(serialized);
      if (merged == null) {
        merged = hll;
      } else {
        merged = (HyperLogLog) merged.merge(hll);
      }
    }
  } catch (Exception e) {
    // Deserialization/merge failures abort the group.
    throw new RuntimeException(e);
  }
}
@Test public void testTempPath() throws Exception { BasePlatform platform = new HadoopPlatform(HadoopPlatformTest.class); BasePath tempDir = platform.getTempDir(); // Verify we can write and then read BasePath testDir = platform.makePath(tempDir, UUID.randomUUID().toString()); Scheme scheme = platform.makeBinaryScheme(new Fields("name", "age")); Tap tap = platform.makeTap(scheme, testDir); TupleEntryCollector writer = tap.openForWrite(platform.makeFlowProcess()); writer.add(new Tuple("ken", 37)); writer.close(); TupleEntryIterator iter = tap.openForRead(platform.makeFlowProcess()); assertTrue(iter.hasNext()); TupleEntry te = iter.next(); assertEquals("ken", te.getString("name")); assertFalse(iter.hasNext()); iter.close(); }
/**
 * Test buffer: optionally emits a header tuple, echoes each argument (or a fixed value),
 * optionally records arguments into the context collector, then optionally emits a footer.
 * Validates that no joiner closure is present and that each argument has the expected arity.
 */
public void operate(FlowProcess flowProcess, BufferCall<TupleEntryCollector> bufferCall) {
  if (bufferCall.getJoinerClosure() != null)
    throw new IllegalStateException("joiner closure should be null");

  if (insertHeader) bufferCall.getOutputCollector().add(new Tuple(value));

  Iterator<TupleEntry> iterator = bufferCall.getArgumentsIterator();

  while (iterator.hasNext()) {
    TupleEntry arguments = iterator.next(); // must be called

    if (expectedSize != -1 && arguments.size() != expectedSize)
      throw new RuntimeException("arguments wrong size");

    // Side-channel capture of arguments when a path is configured.
    if (path != null) bufferCall.getContext().add(arguments);

    if (value != null) bufferCall.getOutputCollector().add(new Tuple(value));
    else bufferCall.getOutputCollector().add(arguments); // copy
  }

  if (insertFooter) bufferCall.getOutputCollector().add(new Tuple(value));

  // Deliberate extra call after exhaustion — regression guard that hasNext() stays harmless.
  iterator.hasNext(); // regression
}
/**
 * Throws a StackException to flag a hard failure
 *
 * <p>Sinks the tuple to this element's Tap — via the explicit output collector when one is set,
 * otherwise via lastOutput — and maps low-level failures onto StackException so the operator
 * stack can unwind. CascadingExceptions are rethrown untouched; any other Throwable is wrapped
 * in a FlowException carrying the offending tuple.
 *
 * @param tupleEntry the tuple to write
 */
private void operateSink(TupleEntry tupleEntry) {
  try {
    if (outputCollector != null) {
      // Signal liveness before a potentially slow write so the task isn't killed as hung.
      getFlowProcess().keepAlive();
      ((Tap) getFlowElement()).sink(tupleEntry, outputCollector);
    } else {
      ((Tap) getFlowElement()).sink(tupleEntry, lastOutput);
    }

    getFlowProcess().increment(StepCounters.Tuples_Written, 1);
  } catch (OutOfMemoryError error) {
    throw new StackException("out of memory, try increasing task memory allocation", error);
  } catch (IOException exception) {
    throw new StackException("io exception writing to tap: " + sink.toString(), exception);
  } catch (TapException exception) {
    throw new StackException("exception writing to tap: " + sink.toString(), exception);
  } catch (Throwable throwable) {
    if (throwable instanceof CascadingException) throw (CascadingException) throwable;

    throw new FlowException("internal error: " + tupleEntry.getTuple().print(), throwable);
  }
}
/** Captures the first tuple seen into context[0]; subsequent tuples leave it untouched. */
protected void performOperation(Tuple[] context, TupleEntry entry) {
  if (context[0] != null) {
    return;
  }
  context[0] = new Tuple(entry.getTuple());
}
/** Returns the Fields of the wrapped TupleEntry. */
public Fields getFields() {
  TupleEntry entry = _tupleEntry;
  return entry.getFields();
}
/** Projects the configured value fields out of the given entry as a Tuple. */
protected Tuple getValue(TupleEntry tupleEntry) {
  Fields valueFields = getValueFields();
  return tupleEntry.selectTuple(valueFields);
}
/**
 * Wraps an existing TupleEntry as a UrlDatum after checking it carries this datum's fields.
 *
 * @param tupleEntry backing entry; must contain FIELDS
 */
public UrlDatum(TupleEntry tupleEntry) {
  super(tupleEntry);
  // NOTE(review): passes Fields as the first argument — a validateFields(Fields, Fields)
  // overload, distinct from the validateFields(TupleEntry, Fields) variant seen elsewhere;
  // confirm that overload exists in the hierarchy.
  validateFields(tupleEntry.getFields(), FIELDS);
}
/**
 * Ensures the entry's fields include every field in {@code myFields}.
 *
 * @throws IllegalArgumentException when a required field is missing
 */
protected void validateFields(TupleEntry tupleEntry, Fields myFields) {
  boolean hasAllFields = tupleEntry.getFields().contains(myFields);
  if (!hasAllFields) {
    throw new IllegalArgumentException("Fields passed to constructor don't contain " + myFields);
  }
}
/** Stores {@code object} into {@code entry} under {@code field}, with no type coercion. */
static void setObject(TupleEntry entry, Comparable<?> field, Object object) {
  // Direct pass-through to the entry's own setter.
  entry.setObject(field, object);
}