예제 #1
0
  public void testSinkDeclaredFields() throws IOException {
    if (!new File(inputFileCross).exists()) fail("data file not found");

    copyFromLocal(inputFileCross);

    Tap source = new Hfs(new TextLine(new Fields("line")), inputFileCross);

    Pipe pipe = new Pipe("test");

    pipe =
        new Each(
            pipe, new RegexSplitter(new Fields("first", "second", "third"), "\\s"), Fields.ALL);

    Tap sink =
        new Hfs(
            new TextLine(new Fields("line"), new Fields("second", "first", "third")),
            outputPath + "/declaredsinks",
            true);

    Flow flow = new FlowConnector(getProperties()).connect(source, sink, pipe);

    //    flow.writeDOT( "declaredsinks.dot" );

    flow.complete();

    validateLength(flow, 37, null);

    TupleEntryIterator iterator = flow.openSink();

    String line = iterator.next().getString(0);
    assertTrue("not equal: wrong values", line.matches("[a-z]\t[0-9]\t[A-Z]"));

    iterator.close();
  }
예제 #2
0
  public void testTemplateTapView() throws IOException {
    if (!new File(inputFileJoined).exists()) fail("data file not found");

    copyFromLocal(inputFileJoined);

    Tap source = new Hfs(new TextLine(new Fields("line")), inputFileJoined);

    Pipe pipe = new Pipe("test");

    pipe = new Each(pipe, new RegexSplitter(new Fields("number", "lower", "upper"), "\t"));

    Tap sink =
        new Hfs(new SequenceFile(new Fields("upper")), outputPath + "/testtemplatesview", true);

    sink = new TemplateTap((Hfs) sink, "%s-%s", new Fields("number", "lower"), 1);

    Flow flow = new FlowConnector(getProperties()).connect(source, sink, pipe);

    flow.complete();

    Tap test = new Hfs(new SequenceFile(new Fields("upper")), sink.getPath().toString() + "/1-a");
    validateLength(flow.openTapForRead(test), 1, 1);

    test = new Hfs(new SequenceFile(new Fields("upper")), sink.getPath().toString() + "/2-b");
    validateLength(flow.openTapForRead(test), 1, 1);

    TupleEntryIterator input = flow.openTapForRead(test); // open 2-b

    assertEquals("wrong value", "B", input.next().get(0));

    input.close();
  }
예제 #3
0
  public void testNullsFromScheme() throws IOException {
    if (!new File(inputFileComments).exists()) fail("data file not found");

    copyFromLocal(inputFileComments);

    Tap source = new Hfs(new CommentScheme(new Fields("line")), inputFileComments);

    Pipe pipe = new Pipe("test");

    pipe = new Each(pipe, new Identity());

    Tap sink = new Hfs(new TextLine(1), outputPath + "/testnulls", true);

    Flow flow = new FlowConnector(getProperties()).connect(source, sink, pipe);

    flow.complete();

    validateLength(flow, 5, null);

    TupleEntryIterator iterator = flow.openSink();

    assertEquals("not equal: tuple.get(1)", "1 a", iterator.next().get(1));

    iterator.close();

    // confirm the tuple iterator can handle nulls from the source
    validateLength(flow.openSource(), 5);
  }
예제 #4
0
  private void assertHeaders(Tap output, Flow flow) throws IOException {
    TupleEntryIterator iterator =
        flow.openTapForRead(getPlatform().getTextFile(new Fields("line"), output.getIdentifier()));

    assertEquals(iterator.next().getObject(0), "first,second,third,fourth,fifth");

    iterator.close();
  }
  @Test
  public void testTempPath() throws Exception {
    BasePlatform platform = new HadoopPlatform(HadoopPlatformTest.class);

    BasePath tempDir = platform.getTempDir();

    // Verify we can write and then read
    BasePath testDir = platform.makePath(tempDir, UUID.randomUUID().toString());

    Scheme scheme = platform.makeBinaryScheme(new Fields("name", "age"));
    Tap tap = platform.makeTap(scheme, testDir);
    TupleEntryCollector writer = tap.openForWrite(platform.makeFlowProcess());
    writer.add(new Tuple("ken", 37));
    writer.close();

    TupleEntryIterator iter = tap.openForRead(platform.makeFlowProcess());
    assertTrue(iter.hasNext());
    TupleEntry te = iter.next();
    assertEquals("ken", te.getString("name"));
    assertFalse(iter.hasNext());
    iter.close();
  }
  /*
   * creates and processes a flow identified by {@code flowIdentificator}
   * results are stored at {@code output} under the result named {@code resultName}
   */
  private boolean processFlow(String resultName, String flowIdentificator, String output)
      throws IOException {
    boolean hasNewInferences = false;
    String flowName = resultName + flowIdentificator;
    Map<String, Tap> sources = prepareSourceTaps();

    SequenceFile sinkScheme = new SequenceFile(fields);
    // sinkScheme.setNumSinkParts(1); //FIXME
    Tap headSink = new Hfs(sinkScheme, output, true);

    Map<String, Tap> sinks = new HashMap<String, Tap>();
    List<Pipe> pipes = new ArrayList<Pipe>();
    sinks.put(pipe.getName(), headSink);
    pipes.add(pipe);
    if (mConfiguration.doPredicateIndexing) {
      // calculate the count of the result and write it in the configuration
      // if the predicate is a variable then we have to split also the result and put it in the
      // right location
      setupPredicateCounts(pipe, sinks, pipes);
    }

    flow =
        new FlowConnector(mConfiguration.flowProperties)
            .connect(flowName, sources, sinks, pipes.toArray(new Pipe[0]));
    if (flow != null) {
      // flow.writeDOT("flow.dot");
    }
    flow.complete();

    try {
      TupleEntryIterator iterator = flow.openSink(pipe.getName());
      if (iterator.hasNext()) {
        hasNewInferences = true;
      }
      iterator.close();
    } catch (IOException e) {
      logger.error("io exception", e);
      throw new RuntimeException("io exception", e);
    }
    if (!hasNewInferences) {
      deleteResults(new Path(path));
    } else {
      // merge part files FIXME
      FileSystem fs = FileSystem.get(mConfiguration.hadoopConfiguration);

      // delete empty results (could be from reducers running on no data)
      int index = 0;
      while (true) {
        String value = String.valueOf(index);
        String file = path + "/" + "part-" + "00000".substring(0, 5 - value.length()) + value;
        Path filePath = new Path(file);
        if (fs.exists(filePath)) {
          Tap source = new Hfs(new Fields(0, 1, 2), file);
          TupleEntryIterator tei = source.openForRead(mConfiguration.jobConf);
          boolean noData = !tei.hasNext();
          tei.close();
          if (noData) {
            logger.info("delete empty result : " + file);
            fs.delete(filePath, false);
          }
        } else {
          break;
        }
        index++;
      }
    }

    if (hasNewInferences && mConfiguration.doPredicateIndexing) {
      FileSystem fs = FileSystem.get(mConfiguration.hadoopConfiguration);

      // update counts in configuration
      List<PredicateCount> predicateCounts = Utils.readPredicateCounts(flow, "predicatesPipe");

      distributedFileSystemManager.addPredicates(predicateCounts);

      if (ruleStreams.getHeadStream().getPredicate() == null) {
        // split result to the right locations (for variable predicate)
        Tap source = new Hfs(sinkScheme, output, true);
        Utils.splitStreamPerPredicates(
            mConfiguration,
            distributedFileSystemManager,
            source,
            predicateCounts,
            resultName,
            flowIdentificator);

        fs.delete(new Path(output), true);
      }

      distributedFileSystemManager.savePredicateConfig();
      String predicateGroupsTempPath =
          distributedFileSystemManager.getPredicateGroupsTempPath(mConfiguration.resultsName);
      fs.delete(new Path(predicateGroupsTempPath), true);
    }

    return hasNewInferences;
  }