public void testSinkDeclaredFields() throws IOException {
  if (!new File(inputFileCross).exists())
    fail("data file not found");

  copyFromLocal(inputFileCross);

  Tap source = new Hfs(new TextLine(new Fields("line")), inputFileCross);

  Pipe pipe = new Pipe("test");

  pipe = new Each(pipe, new RegexSplitter(new Fields("first", "second", "third"), "\\s"), Fields.ALL);

  // the sink sources "line" but writes the split fields in a different order
  Tap sink = new Hfs(new TextLine(new Fields("line"), new Fields("second", "first", "third")),
      outputPath + "/declaredsinks", true);

  Flow flow = new FlowConnector(getProperties()).connect(source, sink, pipe);

  // flow.writeDOT( "declaredsinks.dot" );

  flow.complete();

  validateLength(flow, 37, null);

  TupleEntryIterator iterator = flow.openSink();

  String line = iterator.next().getString(0);
  assertTrue("not equal: wrong values", line.matches("[a-z]\t[0-9]\t[A-Z]"));

  iterator.close();
}
public void testNullsFromScheme() throws IOException {
  if (!new File(inputFileComments).exists())
    fail("data file not found");

  copyFromLocal(inputFileComments);

  Tap source = new Hfs(new CommentScheme(new Fields("line")), inputFileComments);

  Pipe pipe = new Pipe("test");

  pipe = new Each(pipe, new Identity());

  Tap sink = new Hfs(new TextLine(1), outputPath + "/testnulls", true);

  Flow flow = new FlowConnector(getProperties()).connect(source, sink, pipe);

  flow.complete();

  validateLength(flow, 5, null);

  TupleEntryIterator iterator = flow.openSink();

  assertEquals("not equal: tuple.get(1)", "1 a", iterator.next().get(1));

  iterator.close();

  // confirm the tuple iterator can handle nulls from the source
  validateLength(flow.openSource(), 5);
}
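// CommentScheme is not defined in this file. A minimal sketch of what it
// might look like, assuming the Cascading 1.x Scheme API these tests use:
// a TextLine subclass that returns null for comment lines, which is exactly
// the null-from-source behavior the test above exercises.
public class CommentScheme extends TextLine {
  public CommentScheme(Fields sourceFields) {
    super(sourceFields);
  }

  @Override
  public Tuple source(Object key, Object value) {
    // returning null drops the comment line from the tuple stream
    if (value.toString().trim().startsWith("#"))
      return null;

    return super.source(key, value);
  }
}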
public void testTemplateTapView() throws IOException {
  if (!new File(inputFileJoined).exists())
    fail("data file not found");

  copyFromLocal(inputFileJoined);

  Tap source = new Hfs(new TextLine(new Fields("line")), inputFileJoined);

  Pipe pipe = new Pipe("test");

  pipe = new Each(pipe, new RegexSplitter(new Fields("number", "lower", "upper"), "\t"));

  Tap sink = new Hfs(new SequenceFile(new Fields("upper")), outputPath + "/testtemplatesview", true);

  // template the output path on "number" and "lower", keeping only "upper" in the files
  sink = new TemplateTap((Hfs) sink, "%s-%s", new Fields("number", "lower"), 1);

  Flow flow = new FlowConnector(getProperties()).connect(source, sink, pipe);

  flow.complete();

  Tap test = new Hfs(new SequenceFile(new Fields("upper")), sink.getPath().toString() + "/1-a");
  validateLength(flow.openTapForRead(test), 1, 1);

  test = new Hfs(new SequenceFile(new Fields("upper")), sink.getPath().toString() + "/2-b");
  validateLength(flow.openTapForRead(test), 1, 1);

  TupleEntryIterator input = flow.openTapForRead(test); // open 2-b

  assertEquals("wrong value", "B", input.next().get(0));

  input.close();
}
public void testInFlow() throws Exception {
  FileSystem.get(new Configuration()).delete(new Path("/tmp/input"), true);
  FileSystem.get(new Configuration()).delete(new Path("/tmp/output"), true);

  Hfs inTap = new Hfs(new ProtobufScheme("value", Example.Person.class), "/tmp/input");

  TupleEntryCollector collector = inTap.openForWrite(new HadoopFlowProcess());
  collector.add(new TupleEntry(new Fields("value"), new Tuple(BRYAN.build())));
  collector.add(new TupleEntry(new Fields("value"), new Tuple(LUCAS.build())));
  collector.close();

  Pipe inPipe = new Pipe("in");
  Pipe p = new Each(inPipe, new Fields("value"), new ExpandProto(Example.Person.class),
      new Fields("id", "name", "email", "position"));

  Hfs sink = new Hfs(new TextLine(), "/tmp/output");
  new HadoopFlowConnector().connect(inTap, sink, p).complete();

  TupleEntryIterator iter = sink.openForRead(new HadoopFlowProcess());
  List<Tuple> results = new ArrayList<Tuple>();
  while (iter.hasNext()) {
    results.add(iter.next().getTupleCopy());
  }

  assertEquals(2, results.size());
  assertEquals(
      new Tuple(0, 1, "bryan", "*****@*****.**", Example.Person.Position.CEO.getNumber()).toString(),
      results.get(0).toString());
  assertEquals(new Tuple(25, 2, "lucas", null, null).toString(), results.get(1).toString());
}
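// ExpandProto ships with the cascading.protobuf library and is not shown
// here. As a rough sketch only (not the library's actual code), the core of
// such a Function unpacks each protobuf field into its own tuple slot,
// leaving unset fields null -- which is why the "lucas" row above ends with
// two nulls. Assumes cascading.operation.* and com.google.protobuf.* imports.
public class ExpandProtoSketch extends BaseOperation implements Function {
  public ExpandProtoSketch(Fields fieldDeclaration) {
    super(1, fieldDeclaration);
  }

  @Override
  public void operate(FlowProcess flowProcess, FunctionCall functionCall) {
    Message message = (Message) functionCall.getArguments().getObject(0);
    Tuple result = new Tuple();

    for (Descriptors.FieldDescriptor field : message.getDescriptorForType().getFields()) {
      // unset optional fields become nulls in the output tuple;
      // enum fields would additionally need mapping to their numeric value
      result.add(message.hasField(field) ? message.getField(field) : null);
    }

    functionCall.getOutputCollector().add(result);
  }
}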
@Test
public void testInFlow() throws Exception {
  FileSystem fs = FileSystem.get(new Configuration());
  fs.delete(new Path(TMP_DIR), true);

  Hfs input = new Hfs(new SequenceFile(new Fields("constant", "first", "second")), TMP_DIR + "/inputs");
  TupleEntryCollector collector = input.openForWrite(new HadoopFlowProcess());
  collector.add(new Tuple("constant 1", "a", "b"));
  collector.add(new Tuple("constant 2", "c", "d"));
  collector.close();

  Hfs output = new Hfs(new SequenceFile(new Fields("constant", "output")), TMP_DIR + "/outputs");

  Pipe pipe = Pump.prime()
      .each(new Extrude("output"), "first", "second")
      .retain("constant", "output")
      .toPipe();

  FlowDef flow = new FlowDef().addSource("input", input).addTailSink(pipe, output);

  CascadingHelper.setTestMode();
  CascadingHelper.get().getFlowConnector().connect(flow).complete();

  List<String> results = new ArrayList<String>();
  TupleEntryIterator iterator = output.openForRead(new HadoopFlowProcess());
  while (iterator.hasNext()) {
    TupleEntry tupleEntry = iterator.next();
    results.add(tupleEntry.getString(0) + "\t" + tupleEntry.getString(1));
  }

  assertEquals(
      Arrays.asList("constant 1\ta", "constant 1\tb", "constant 2\tc", "constant 2\td"),
      results);
}
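// Extrude is a project-specific Function not shown in this file. A plausible
// sketch under that assumption: it emits one single-field output tuple per
// argument value, so the row ("a", "b") becomes two rows -- matching the
// four expected result lines asserted above.
public class ExtrudeSketch extends BaseOperation implements Function {
  public ExtrudeSketch(String fieldName) {
    super(new Fields(fieldName));
  }

  @Override
  public void operate(FlowProcess flowProcess, FunctionCall functionCall) {
    TupleEntry arguments = functionCall.getArguments();

    // emit one tuple per incoming argument value
    for (int i = 0; i < arguments.size(); i++) {
      functionCall.getOutputCollector().add(new Tuple(arguments.getObject(i)));
    }
  }
}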
private void assertHeaders(Tap output, Flow flow) throws IOException {
  TupleEntryIterator iterator =
      flow.openTapForRead(getPlatform().getTextFile(new Fields("line"), output.getIdentifier()));

  // expected value first, per the JUnit convention
  assertEquals("first,second,third,fourth,fifth", iterator.next().getObject(0));

  iterator.close();
}
protected List<Tuple> getAllTuples(Tap sink) throws IOException {
  List<Tuple> ret = Lists.newArrayList();

  TupleEntryIterator tupleEntryIterator = sink.openForRead(CascadingUtil.get().getFlowProcess());
  while (tupleEntryIterator.hasNext()) {
    // copy the tuple; the iterator may reuse the underlying instance
    ret.add(new Tuple(tupleEntryIterator.next().getTuple()));
  }

  return ret;
}
public void testEvaluation() throws Exception {
  CascadingRuleCompiler crc = new CascadingRuleCompiler(defaultConfiguration);
  IDistributedCompiledRule dcr = crc.compile(rules.get(0));
  dcr.evaluate(new EvaluationContext(1, 1, 1));

  FlowAssembly fa = dcr.getFlowAssembly();
  TupleEntryIterator tei = fa.openSink();

  int size = 0;
  while (tei.hasNext()) {
    TupleEntry te = tei.next();
    logger.info(te.getTuple().toString());
    size++;
  }

  assertEquals(1, size);
}
@Test
public void testTempPath() throws Exception {
  BasePlatform platform = new HadoopPlatform(HadoopPlatformTest.class);

  BasePath tempDir = platform.getTempDir();

  // Verify we can write and then read
  BasePath testDir = platform.makePath(tempDir, UUID.randomUUID().toString());

  Scheme scheme = platform.makeBinaryScheme(new Fields("name", "age"));
  Tap tap = platform.makeTap(scheme, testDir);
  TupleEntryCollector writer = tap.openForWrite(platform.makeFlowProcess());
  writer.add(new Tuple("ken", 37));
  writer.close();

  TupleEntryIterator iter = tap.openForRead(platform.makeFlowProcess());
  assertTrue(iter.hasNext());
  TupleEntry te = iter.next();
  assertEquals("ken", te.getString("name"));
  assertFalse(iter.hasNext());
  iter.close();
}
@Test
public void test() throws Exception {
  GenerateTermsOptions options = generateTerms("build/test/GenerateTermsFlowTest/test");

  // Verify that we get expected results in the output
  BasePlatform platform = options.getPlatform(GenerateTermsFlowTest.class);
  Tap tap = platform.makeTap(
      platform.makeBinaryScheme(WikiTermDatum.FIELDS),
      options.getWorkingSubdirPath(WorkingConfig.TERMS_SUBDIR_NAME));
  TupleEntryIterator iter = tap.openForRead(platform.makeFlowProcess());

  WikiTermDatum datum = new WikiTermDatum();
  while (iter.hasNext()) {
    datum.setTupleEntry(iter.next());
    // TODO verify that each field looks correct?
    // System.out.println(datum.getTuple());
  }

  // Verify we got the expected number of results.
  Map<String, Long> counters = options.getCounters(GenerateTermsFlow.class);
  String counterName = WorkflowOptions.getFlowCounterName(WikiwordsCounters.ARTICLES);
  assertEquals(15, (long) counters.get(counterName));
}
/*
 * Creates and processes a flow identified by {@code flowIdentificator};
 * results are stored at {@code output} under the result named {@code resultName}.
 */
private boolean processFlow(String resultName, String flowIdentificator, String output)
    throws IOException {
  boolean hasNewInferences = false;
  String flowName = resultName + flowIdentificator;

  Map<String, Tap> sources = prepareSourceTaps();

  SequenceFile sinkScheme = new SequenceFile(fields);
  // sinkScheme.setNumSinkParts(1); //FIXME
  Tap headSink = new Hfs(sinkScheme, output, true);

  Map<String, Tap> sinks = new HashMap<String, Tap>();
  List<Pipe> pipes = new ArrayList<Pipe>();
  sinks.put(pipe.getName(), headSink);
  pipes.add(pipe);

  if (mConfiguration.doPredicateIndexing) {
    // calculate the count of the result and record it in the configuration;
    // if the predicate is a variable, the result must also be split and
    // written to the right location
    setupPredicateCounts(pipe, sinks, pipes);
  }

  flow = new FlowConnector(mConfiguration.flowProperties)
      .connect(flowName, sources, sinks, pipes.toArray(new Pipe[0]));
  // if (flow != null)
  //   flow.writeDOT("flow.dot");

  flow.complete();

  try {
    TupleEntryIterator iterator = flow.openSink(pipe.getName());
    if (iterator.hasNext()) {
      hasNewInferences = true;
    }
    iterator.close();
  } catch (IOException e) {
    logger.error("io exception", e);
    throw new RuntimeException("io exception", e);
  }

  if (!hasNewInferences) {
    deleteResults(new Path(path));
  } else {
    // merge part files FIXME
    FileSystem fs = FileSystem.get(mConfiguration.hadoopConfiguration);

    // delete empty results (can occur when reducers run on no data)
    int index = 0;
    while (true) {
      String value = String.valueOf(index);
      String file = path + "/" + "part-" + "00000".substring(0, 5 - value.length()) + value;
      Path filePath = new Path(file);

      if (fs.exists(filePath)) {
        Tap source = new Hfs(new Fields(0, 1, 2), file);
        TupleEntryIterator tei = source.openForRead(mConfiguration.jobConf);
        boolean noData = !tei.hasNext();
        tei.close();

        if (noData) {
          logger.info("delete empty result : " + file);
          fs.delete(filePath, false);
        }
      } else {
        break;
      }

      index++;
    }
  }

  if (hasNewInferences && mConfiguration.doPredicateIndexing) {
    FileSystem fs = FileSystem.get(mConfiguration.hadoopConfiguration);

    // update counts in configuration
    List<PredicateCount> predicateCounts = Utils.readPredicateCounts(flow, "predicatesPipe");
    distributedFileSystemManager.addPredicates(predicateCounts);

    if (ruleStreams.getHeadStream().getPredicate() == null) {
      // split the result to the right locations (for a variable predicate)
      Tap source = new Hfs(sinkScheme, output, true);
      Utils.splitStreamPerPredicates(mConfiguration, distributedFileSystemManager, source,
          predicateCounts, resultName, flowIdentificator);

      fs.delete(new Path(output), true);
    }

    distributedFileSystemManager.savePredicateConfig();
    String predicateGroupsTempPath =
        distributedFileSystemManager.getPredicateGroupsTempPath(mConfiguration.resultsName);
    fs.delete(new Path(predicateGroupsTempPath), true);
  }

  return hasNewInferences;
}
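// The manual zero-padding above builds Hadoop part-file names such as
// part-00000, part-00001, ... It can be expressed more directly; a sketch of
// an equivalent hypothetical helper (same result for 0 <= index <= 99999):
private static String partFileName(String basePath, int index) {
  // %05d left-pads the index with zeros to five digits
  return basePath + String.format("/part-%05d", index);
}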
private void runQuotedText(String path, String inputData, String delimiter, boolean useAll)
    throws IOException {
  Object[][] results = new Object[][] {
      {"foo", "bar", "baz", "bin", 1L},
      {"foo", "bar", "baz", "bin", 2L},
      {"foo", "bar" + delimiter + "bar", "baz", "bin", 3L},
      {"foo", "bar\"" + delimiter + "bar", "baz", "bin", 4L},
      {"foo", "bar\"\"" + delimiter + "bar", "baz", "bin", 5L},
      {null, null, "baz", null, 6L},
      {null, null, null, null, 7L},
      {"foo", null, null, null, 8L},
      {null, null, null, null, 9L},
      {"f", null, null, null, 10L}, // this one is quoted, single char
      {"f", null, null, ",bin", 11L},
      {"f", null, null, "bin,", 11L}
  };

  // with Fields.ALL every value round-trips as a String
  if (useAll) {
    for (int i = 0; i < results.length; i++) {
      Object[] result = results[i];
      for (int j = 0; j < result.length; j++)
        result[j] = result[j] != null ? result[j].toString() : null;
    }
  }

  Tuple[] tuples = new Tuple[results.length];
  for (int i = 0; i < results.length; i++)
    tuples[i] = new Tuple(results[i]);

  Class[] types = new Class[] {String.class, String.class, String.class, String.class, long.class};
  Fields fields = new Fields("first", "second", "third", "fourth", "fifth");

  if (useAll) {
    types = null;
    fields = Fields.ALL;
  }

  Tap input = getPlatform()
      .getDelimitedFile(fields, false, delimiter, "\"", types, inputData, SinkMode.KEEP);
  Tap output = getPlatform()
      .getDelimitedFile(fields, false, delimiter, "\"", types,
          getOutputPath("quoted/" + path + useAll), SinkMode.REPLACE);

  Pipe pipe = new Pipe("pipe");

  Flow flow = getPlatform().getFlowConnector().connect(input, output, pipe);

  flow.complete();

  validateLength(flow, results.length, 5);

  // validate that input parsing matches the expected tuples, and that the
  // sink results round-trip to the same values
  TupleEntryIterator iterator = flow.openSource();

  int count = 0;
  while (iterator.hasNext()) {
    Tuple tuple = iterator.next().getTuple();
    assertEquals(tuples[count++], tuple);
  }

  iterator = flow.openSink();

  count = 0;
  while (iterator.hasNext()) {
    Tuple tuple = iterator.next().getTuple();
    assertEquals(tuples[count++], tuple);
  }
}
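// For reference, a few input lines consistent with the expected results
// above, assuming a comma delimiter, '"' as the quote character, and empty
// fields parsing as null (rows 1, 3, 4, and 10 of the table; illustrative
// only, not the actual test fixture):
String[] sampleQuotedLines = {
    "foo,bar,baz,bin,1",             // plain fields, no quoting needed
    "foo,\"bar,bar\",baz,bin,3",     // embedded delimiter forces quoting
    "foo,\"bar\"\",bar\",baz,bin,4", // embedded quote is doubled inside quotes
    "\"f\",,,,10"                    // quoted single char, trailing empty fields
};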