public void testSinkDeclaredFields() throws IOException { if (!new File(inputFileCross).exists()) fail("data file not found"); copyFromLocal(inputFileCross); Tap source = new Hfs(new TextLine(new Fields("line")), inputFileCross); Pipe pipe = new Pipe("test"); pipe = new Each( pipe, new RegexSplitter(new Fields("first", "second", "third"), "\\s"), Fields.ALL); Tap sink = new Hfs( new TextLine(new Fields("line"), new Fields("second", "first", "third")), outputPath + "/declaredsinks", true); Flow flow = new FlowConnector(getProperties()).connect(source, sink, pipe); // flow.writeDOT( "declaredsinks.dot" ); flow.complete(); validateLength(flow, 37, null); TupleEntryIterator iterator = flow.openSink(); String line = iterator.next().getString(0); assertTrue("not equal: wrong values", line.matches("[a-z]\t[0-9]\t[A-Z]")); iterator.close(); }
public void testTemplateTapView() throws IOException { if (!new File(inputFileJoined).exists()) fail("data file not found"); copyFromLocal(inputFileJoined); Tap source = new Hfs(new TextLine(new Fields("line")), inputFileJoined); Pipe pipe = new Pipe("test"); pipe = new Each(pipe, new RegexSplitter(new Fields("number", "lower", "upper"), "\t")); Tap sink = new Hfs(new SequenceFile(new Fields("upper")), outputPath + "/testtemplatesview", true); sink = new TemplateTap((Hfs) sink, "%s-%s", new Fields("number", "lower"), 1); Flow flow = new FlowConnector(getProperties()).connect(source, sink, pipe); flow.complete(); Tap test = new Hfs(new SequenceFile(new Fields("upper")), sink.getPath().toString() + "/1-a"); validateLength(flow.openTapForRead(test), 1, 1); test = new Hfs(new SequenceFile(new Fields("upper")), sink.getPath().toString() + "/2-b"); validateLength(flow.openTapForRead(test), 1, 1); TupleEntryIterator input = flow.openTapForRead(test); // open 2-b assertEquals("wrong value", "B", input.next().get(0)); input.close(); }
public void testNullsFromScheme() throws IOException { if (!new File(inputFileComments).exists()) fail("data file not found"); copyFromLocal(inputFileComments); Tap source = new Hfs(new CommentScheme(new Fields("line")), inputFileComments); Pipe pipe = new Pipe("test"); pipe = new Each(pipe, new Identity()); Tap sink = new Hfs(new TextLine(1), outputPath + "/testnulls", true); Flow flow = new FlowConnector(getProperties()).connect(source, sink, pipe); flow.complete(); validateLength(flow, 5, null); TupleEntryIterator iterator = flow.openSink(); assertEquals("not equal: tuple.get(1)", "1 a", iterator.next().get(1)); iterator.close(); // confirm the tuple iterator can handle nulls from the source validateLength(flow.openSource(), 5); }
/**
 * Asserts that the first line of {@code output} is the expected CSV header row.
 *
 * @param output tap whose identifier points at the written text file
 * @param flow   flow used to open the tap for reading
 * @throws IOException if the tap cannot be opened
 */
private void assertHeaders(Tap output, Flow flow) throws IOException {
  TupleEntryIterator iterator =
      flow.openTapForRead(getPlatform().getTextFile(new Fields("line"), output.getIdentifier()));
  try {
    // JUnit convention: expected value first, actual value second
    assertEquals("first,second,third,fourth,fifth", iterator.next().getObject(0));
  } finally {
    // close even when the assertion fails
    iterator.close();
  }
}
@Test public void testTempPath() throws Exception { BasePlatform platform = new HadoopPlatform(HadoopPlatformTest.class); BasePath tempDir = platform.getTempDir(); // Verify we can write and then read BasePath testDir = platform.makePath(tempDir, UUID.randomUUID().toString()); Scheme scheme = platform.makeBinaryScheme(new Fields("name", "age")); Tap tap = platform.makeTap(scheme, testDir); TupleEntryCollector writer = tap.openForWrite(platform.makeFlowProcess()); writer.add(new Tuple("ken", 37)); writer.close(); TupleEntryIterator iter = tap.openForRead(platform.makeFlowProcess()); assertTrue(iter.hasNext()); TupleEntry te = iter.next(); assertEquals("ken", te.getString("name")); assertFalse(iter.hasNext()); iter.close(); }
/* * creates and processes a flow identified by {@code flowIdentificator} * results are stored at {@code output} under the result named {@code resultName} */ private boolean processFlow(String resultName, String flowIdentificator, String output) throws IOException { boolean hasNewInferences = false; String flowName = resultName + flowIdentificator; Map<String, Tap> sources = prepareSourceTaps(); SequenceFile sinkScheme = new SequenceFile(fields); // sinkScheme.setNumSinkParts(1); //FIXME Tap headSink = new Hfs(sinkScheme, output, true); Map<String, Tap> sinks = new HashMap<String, Tap>(); List<Pipe> pipes = new ArrayList<Pipe>(); sinks.put(pipe.getName(), headSink); pipes.add(pipe); if (mConfiguration.doPredicateIndexing) { // calculate the count of the result and write it in the configuration // if the predicate is a variable then we have to split also the result and put it in the // right location setupPredicateCounts(pipe, sinks, pipes); } flow = new FlowConnector(mConfiguration.flowProperties) .connect(flowName, sources, sinks, pipes.toArray(new Pipe[0])); if (flow != null) { // flow.writeDOT("flow.dot"); } flow.complete(); try { TupleEntryIterator iterator = flow.openSink(pipe.getName()); if (iterator.hasNext()) { hasNewInferences = true; } iterator.close(); } catch (IOException e) { logger.error("io exception", e); throw new RuntimeException("io exception", e); } if (!hasNewInferences) { deleteResults(new Path(path)); } else { // merge part files FIXME FileSystem fs = FileSystem.get(mConfiguration.hadoopConfiguration); // delete empty results (could be from reducers running on no data) int index = 0; while (true) { String value = String.valueOf(index); String file = path + "/" + "part-" + "00000".substring(0, 5 - value.length()) + value; Path filePath = new Path(file); if (fs.exists(filePath)) { Tap source = new Hfs(new Fields(0, 1, 2), file); TupleEntryIterator tei = source.openForRead(mConfiguration.jobConf); boolean noData = !tei.hasNext(); tei.close(); if 
(noData) { logger.info("delete empty result : " + file); fs.delete(filePath, false); } } else { break; } index++; } } if (hasNewInferences && mConfiguration.doPredicateIndexing) { FileSystem fs = FileSystem.get(mConfiguration.hadoopConfiguration); // update counts in configuration List<PredicateCount> predicateCounts = Utils.readPredicateCounts(flow, "predicatesPipe"); distributedFileSystemManager.addPredicates(predicateCounts); if (ruleStreams.getHeadStream().getPredicate() == null) { // split result to the right locations (for variable predicate) Tap source = new Hfs(sinkScheme, output, true); Utils.splitStreamPerPredicates( mConfiguration, distributedFileSystemManager, source, predicateCounts, resultName, flowIdentificator); fs.delete(new Path(output), true); } distributedFileSystemManager.savePredicateConfig(); String predicateGroupsTempPath = distributedFileSystemManager.getPredicateGroupsTempPath(mConfiguration.resultsName); fs.delete(new Path(predicateGroupsTempPath), true); } return hasNewInferences; }