@Test
public void testInFlow() throws Exception {
  // Start from a clean slate so leftovers from a prior run can't pollute the assertions.
  FileSystem fs = FileSystem.get(new Configuration());
  fs.delete(new Path(TMP_DIR), true);

  // Write two input tuples: a constant plus two values to be extruded.
  Hfs input =
      new Hfs(new SequenceFile(new Fields("constant", "first", "second")), TMP_DIR + "/inputs");
  TupleEntryCollector collector = input.openForWrite(new HadoopFlowProcess());
  collector.add(new Tuple("constant 1", "a", "b"));
  collector.add(new Tuple("constant 2", "c", "d"));
  collector.close();

  Hfs output = new Hfs(new SequenceFile(new Fields("constant", "output")), TMP_DIR + "/outputs");

  // Extrude emits one "output" tuple per incoming value field, so each input row
  // should fan out into two result rows sharing the same "constant".
  Pipe pipe = Pump.prime()
      .each(new Extrude("output"), "first", "second")
      .retain("constant", "output")
      .toPipe();

  FlowDef flow = new FlowDef().addSource("input", input).addTailSink(pipe, output);
  CascadingHelper.setTestMode();
  CascadingHelper.get().getFlowConnector().connect(flow).complete();

  // Read back the sink and flatten each entry to "constant<TAB>output".
  List<String> results = new ArrayList<String>();
  TupleEntryIterator iterator = output.openForRead(new HadoopFlowProcess());
  try {
    while (iterator.hasNext()) {
      TupleEntry tupleEntry = iterator.next();
      results.add(tupleEntry.getString(0) + "\t" + tupleEntry.getString(1));
    }
  } finally {
    // BUGFIX: the iterator was never closed, leaking the underlying tap reader
    // (and skipping cleanup entirely if an assertion/read failed mid-loop).
    iterator.close();
  }

  assertEquals(
      Arrays.asList("constant 1\ta", "constant 1\tb", "constant 2\tc", "constant 2\td"),
      results);
}
// Buffer operation: one invocation per grouping key. The key (argument 0 of the
// group TupleEntry) is the protocol+domain string; all URLs grouped under it
// arrive through the arguments iterator.
@Override public void operate(FlowProcess flowProcess, BufferCall<NullContext> bufferCall) {
    TupleEntry group = bufferCall.getGroup();
    String protocolAndDomain = group.getString(0);
    LOGGER.info("Processing tuple group: " + group);

    // Spool the grouped URLs to a disk-backed queue so an arbitrarily large group
    // doesn't blow out the heap; only MAX_URLS_IN_MEMORY entries stay in RAM.
    DiskQueue<GroupedUrlDatum> urls = new DiskQueue<GroupedUrlDatum>(MAX_URLS_IN_MEMORY);
    Iterator<TupleEntry> values = bufferCall.getArgumentsIterator();
    while (values.hasNext()) {
        // Copy each TupleEntry — the iterator may reuse the underlying object.
        urls.add(new GroupedUrlDatum(new TupleEntry(values.next())));
    }

    try {
        // Hand the whole queue to a background task that checks robots.txt and
        // then fetches; results flow back through the buffer's output collector.
        Runnable doRobots = new ProcessRobotsTask(
            protocolAndDomain, _scorer, urls, _fetcher, _parser,
            bufferCall.getOutputCollector(), _flowProcess);
        _executor.execute(doRobots);
    } catch (RejectedExecutionException e) {
        // should never happen.
        LOGGER.error("Robots handling pool rejected our request for " + protocolAndDomain);
        _flowProcess.increment(FetchCounters.DOMAINS_REJECTED, 1);
        _flowProcess.increment(FetchCounters.URLS_REJECTED, urls.size());
        // On rejection, drain the queue so every URL is still emitted downstream,
        // tagged with the "deferred" grouping key instead of being dropped.
        // NOTE(review): counters use the _flowProcess field but emptyQueue gets the
        // flowProcess parameter — looks inconsistent; confirm which is intended.
        ProcessRobotsTask.emptyQueue(
            urls, GroupingKey.DEFERRED_GROUPING_KEY, bufferCall.getOutputCollector(), flowProcess);
    } catch (Throwable t) {
        // Catch-all so a single bad group can't kill the whole buffer operation.
        LOGGER.error(
            "Caught an unexpected throwable - robots handling rejected our request for "
                + protocolAndDomain, t);
        _flowProcess.increment(FetchCounters.DOMAINS_REJECTED, 1);
        _flowProcess.increment(FetchCounters.URLS_REJECTED, urls.size());
        // Same deferred-emission fallback as the rejection path above.
        ProcessRobotsTask.emptyQueue(
            urls, GroupingKey.DEFERRED_GROUPING_KEY, bufferCall.getOutputCollector(), flowProcess);
    }
}
@Test public void testTempPath() throws Exception { BasePlatform platform = new HadoopPlatform(HadoopPlatformTest.class); BasePath tempDir = platform.getTempDir(); // Verify we can write and then read BasePath testDir = platform.makePath(tempDir, UUID.randomUUID().toString()); Scheme scheme = platform.makeBinaryScheme(new Fields("name", "age")); Tap tap = platform.makeTap(scheme, testDir); TupleEntryCollector writer = tap.openForWrite(platform.makeFlowProcess()); writer.add(new Tuple("ken", 37)); writer.close(); TupleEntryIterator iter = tap.openForRead(platform.makeFlowProcess()); assertTrue(iter.hasNext()); TupleEntry te = iter.next(); assertEquals("ken", te.getString("name")); assertFalse(iter.hasNext()); iter.close(); }