/** * Create a Cascading Flow that will parse a set of mbox files and emit a tab-separated text file * with fields for the msgId, author, email address, etc. * * <p>Note this Flow will only run locally, since we're using the cascading.utils LocalPlatform. * * @param options Settings for the flow * @return Flow suitable for execution * @throws Exception */ public static Flow createFlow(ParseEmailArchivesOptions options) throws Exception { BasePlatform platform = new LocalPlatform(ParseEmailArchivesWorkflow.class); // We'll read individual file paths from the input file. BasePath inputPath = platform.makePath(options.getFileList()); Tap sourceTap = platform.makeTap(platform.makeTextScheme(), inputPath); Pipe emailPipe = new Pipe("emails"); emailPipe = new Each(emailPipe, new Fields("line"), new MboxSplitterFunction()); emailPipe = new Each(emailPipe, new ParseEmail()); BasePath outputPath = platform.makePath(options.getOutputDir()); TextLineScheme scheme = new TextLineScheme(false); Tap sinkTap = platform.makeTap(scheme, outputPath, SinkMode.REPLACE); FlowConnector flowConnector = platform.makeFlowConnector(); Flow flow = flowConnector.connect(sourceTap, sinkTap, emailPipe); return flow; }
@Test public void testTempPath() throws Exception { BasePlatform platform = new HadoopPlatform(HadoopPlatformTest.class); BasePath tempDir = platform.getTempDir(); // Verify we can write and then read BasePath testDir = platform.makePath(tempDir, UUID.randomUUID().toString()); Scheme scheme = platform.makeBinaryScheme(new Fields("name", "age")); Tap tap = platform.makeTap(scheme, testDir); TupleEntryCollector writer = tap.openForWrite(platform.makeFlowProcess()); writer.add(new Tuple("ken", 37)); writer.close(); TupleEntryIterator iter = tap.openForRead(platform.makeFlowProcess()); assertTrue(iter.hasNext()); TupleEntry te = iter.next(); assertEquals("ken", te.getString("name")); assertFalse(iter.hasNext()); iter.close(); }
@Test public void test() throws Exception { GenerateTermsOptions options = generateTerms("build/test/GenerateTermsFlowTest/test"); // Verify that we get expected results in the output BasePlatform platform = options.getPlatform(GenerateTermsFlowTest.class); Tap tap = platform.makeTap( platform.makeBinaryScheme(WikiTermDatum.FIELDS), options.getWorkingSubdirPath(WorkingConfig.TERMS_SUBDIR_NAME)); TupleEntryIterator iter = tap.openForRead(platform.makeFlowProcess()); WikiTermDatum datum = new WikiTermDatum(); while (iter.hasNext()) { datum.setTupleEntry(iter.next()); // TODO verify that each field looks correct? // System.out.println(datum.getTuple()); } // Verify we got the expected number of results. Map<String, Long> counters = options.getCounters(GenerateTermsFlow.class); String counterName = WorkflowOptions.getFlowCounterName(WikiwordsCounters.ARTICLES); assertEquals(15, (long) counters.get(counterName)); }