Пример #1
0
  /**
   * Splits each input line into the fields first/second/third and writes them through a
   * {@code TextLine} sink whose declared sink fields are ordered second, first, third. The first
   * line read back from the sink must match {@code [a-z]\t[0-9]\t[A-Z]}.
   *
   * @throws IOException propagated from the copy, flow, or sink-reading calls
   */
  public void testSinkDeclaredFields() throws IOException {
    File localInput = new File(inputFileCross);
    if (!localInput.exists()) {
      fail("data file not found");
    }

    copyFromLocal(inputFileCross);

    Tap source = new Hfs(new TextLine(new Fields("line")), inputFileCross);

    // Head pipe wrapped directly by the splitter; all fields are passed downstream.
    RegexSplitter splitter = new RegexSplitter(new Fields("first", "second", "third"), "\\s");
    Pipe assembly = new Each(new Pipe("test"), splitter, Fields.ALL);

    TextLine sinkScheme = new TextLine(new Fields("line"), new Fields("second", "first", "third"));
    Tap sink = new Hfs(sinkScheme, outputPath + "/declaredsinks", true);

    Flow flow = new FlowConnector(getProperties()).connect(source, sink, assembly);

    // For debugging the plan: flow.writeDOT( "declaredsinks.dot" );

    flow.complete();

    validateLength(flow, 37, null);

    TupleEntryIterator sinkEntries = flow.openSink();

    String firstLine = sinkEntries.next().getString(0);
    boolean reordered = firstLine.matches("[a-z]\t[0-9]\t[A-Z]");
    assertTrue("not equal: wrong values", reordered);

    sinkEntries.close();
  }
Пример #2
0
  /**
   * Runs an identity flow from a {@code CommentScheme} source into a {@code TextLine} sink, checks
   * the value at position 1 of the first sink entry, and finally re-reads the source through the
   * flow.
   *
   * @throws IOException propagated from the copy, flow, or tap-reading calls
   */
  public void testNullsFromScheme() throws IOException {
    boolean inputPresent = new File(inputFileComments).exists();
    if (!inputPresent) {
      fail("data file not found");
    }

    copyFromLocal(inputFileComments);

    Tap source = new Hfs(new CommentScheme(new Fields("line")), inputFileComments);

    Pipe assembly = new Each(new Pipe("test"), new Identity());

    Tap sink = new Hfs(new TextLine(1), outputPath + "/testnulls", true);

    Flow flow = new FlowConnector(getProperties()).connect(source, sink, assembly);
    flow.complete();

    validateLength(flow, 5, null);

    TupleEntryIterator sinkEntries = flow.openSink();
    Object secondValue = sinkEntries.next().get(1);
    assertEquals("not equal: tuple.get(1)", "1 a", secondValue);
    sinkEntries.close();

    // The tuple iterator must cope with nulls handed back by the source scheme.
    validateLength(flow.openSource(), 5);
  }
Пример #3
0
  /**
   * Writes the "upper" field through a {@code TemplateTap} whose path template {@code "%s-%s"} is
   * filled from the "number" and "lower" fields, then reads the {@code 1-a} and {@code 2-b}
   * sub-paths back and checks the value stored under {@code 2-b}.
   *
   * @throws IOException propagated from the copy, flow, or tap-reading calls
   */
  public void testTemplateTapView() throws IOException {
    if (!new File(inputFileJoined).exists()) {
      fail("data file not found");
    }

    copyFromLocal(inputFileJoined);

    Tap source = new Hfs(new TextLine(new Fields("line")), inputFileJoined);

    RegexSplitter splitter = new RegexSplitter(new Fields("number", "lower", "upper"), "\t");
    Pipe assembly = new Each(new Pipe("test"), splitter);

    // The parent tap only stores "upper"; the template tap wraps it.
    Hfs parent =
        new Hfs(new SequenceFile(new Fields("upper")), outputPath + "/testtemplatesview", true);
    Tap sink = new TemplateTap(parent, "%s-%s", new Fields("number", "lower"), 1);

    Flow flow = new FlowConnector(getProperties()).connect(source, sink, assembly);
    flow.complete();

    String templateRoot = sink.getPath().toString();

    Tap firstPartition = new Hfs(new SequenceFile(new Fields("upper")), templateRoot + "/1-a");
    validateLength(flow.openTapForRead(firstPartition), 1, 1);

    Tap secondPartition = new Hfs(new SequenceFile(new Fields("upper")), templateRoot + "/2-b");
    validateLength(flow.openTapForRead(secondPartition), 1, 1);

    // Re-open 2-b to inspect the stored value itself.
    TupleEntryIterator reopened = flow.openTapForRead(secondPartition);
    assertEquals("wrong value", "B", reopened.next().get(0));
    reopened.close();
  }
  /**
   * Writes two {@code Example.Person} messages to {@code /tmp/input}, expands them with
   * {@code ExpandProto} into the fields id, name, email and position, writes the result as text
   * to {@code /tmp/output}, and compares the string form of the two tuples read back.
   *
   * @throws Exception propagated from the file system, collector, flow, or iterator calls
   */
  public void testInFlow() throws Exception {
    // Remove leftovers from earlier runs before writing fresh input.
    FileSystem fs = FileSystem.get(new Configuration());
    fs.delete(new Path("/tmp/input"), true);
    fs.delete(new Path("/tmp/output"), true);

    Hfs inTap = new Hfs(new ProtobufScheme("value", Example.Person.class), "/tmp/input");
    TupleEntryCollector collector = inTap.openForWrite(new HadoopFlowProcess());
    collector.add(new TupleEntry(new Fields("value"), new Tuple(BRYAN.build())));
    collector.add(new TupleEntry(new Fields("value"), new Tuple(LUCAS.build())));
    collector.close();

    Pipe inPipe = new Pipe("in");
    Pipe p =
        new Each(
            inPipe,
            new Fields("value"),
            new ExpandProto(Example.Person.class),
            new Fields("id", "name", "email", "position"));

    Hfs sink = new Hfs(new TextLine(), "/tmp/output");
    new HadoopFlowConnector().connect(inTap, sink, p).complete();

    List<Tuple> results = new ArrayList<Tuple>();
    TupleEntryIterator iter = sink.openForRead(new HadoopFlowProcess());
    try {
      while (iter.hasNext()) {
        // Copy the tuple so the stored value does not alias the iterator's entry.
        results.add(iter.next().getTupleCopy());
      }
    } finally {
      // Release the underlying reader even when reading fails part-way through.
      iter.close();
    }
    assertEquals(2, results.size());

    assertEquals(
        new Tuple(0, 1, "bryan", "*****@*****.**", Example.Person.Position.CEO.getNumber())
            .toString(),
        results.get(0).toString());
    assertEquals(new Tuple(25, 2, "lucas", null, null).toString(), results.get(1).toString());
  }
  /**
   * Writes two (constant, first, second) tuples under {@code TMP_DIR}, runs a pipe that applies
   * {@code Extrude("output")} to the "first" and "second" fields and retains "constant" and
   * "output", then checks the four expected "constant\toutput" rows in order.
   *
   * @throws Exception propagated from the file system, collector, flow, or iterator calls
   */
  @Test
  public void testInFlow() throws Exception {
    FileSystem fs = FileSystem.get(new Configuration());
    fs.delete(new Path(TMP_DIR), true);

    Hfs input =
        new Hfs(new SequenceFile(new Fields("constant", "first", "second")), TMP_DIR + "/inputs");
    TupleEntryCollector collector = input.openForWrite(new HadoopFlowProcess());
    collector.add(new Tuple("constant 1", "a", "b"));
    collector.add(new Tuple("constant 2", "c", "d"));
    collector.close();

    Hfs output = new Hfs(new SequenceFile(new Fields("constant", "output")), TMP_DIR + "/outputs");

    Pipe pipe =
        Pump.prime()
            .each(new Extrude("output"), "first", "second")
            .retain("constant", "output")
            .toPipe();
    FlowDef flow = new FlowDef().addSource("input", input).addTailSink(pipe, output);
    CascadingHelper.setTestMode();
    CascadingHelper.get().getFlowConnector().connect(flow).complete();

    List<String> results = new ArrayList<String>();
    TupleEntryIterator iterator = output.openForRead(new HadoopFlowProcess());
    try {
      while (iterator.hasNext()) {
        TupleEntry tupleEntry = iterator.next();
        results.add(tupleEntry.getString(0) + "\t" + tupleEntry.getString(1));
      }
    } finally {
      // Release the underlying reader even when reading fails part-way through.
      iterator.close();
    }
    assertEquals(
        Arrays.asList("constant 1\ta", "constant 1\tb", "constant 2\tc", "constant 2\td"), results);
  }
Пример #6
0
  /**
   * Re-reads {@code output} as a text file with a single "line" field and asserts that its first
   * line is the header {@code first,second,third,fourth,fifth}.
   *
   * @param output tap whose identifier locates the file to re-read
   * @param flow flow used to open the text tap for reading
   * @throws IOException propagated from opening or closing the iterator
   */
  private void assertHeaders(Tap output, Flow flow) throws IOException {
    TupleEntryIterator iterator =
        flow.openTapForRead(getPlatform().getTextFile(new Fields("line"), output.getIdentifier()));

    try {
      // Expected value goes first so a failure message reports expected/actual correctly.
      assertEquals("first,second,third,fourth,fifth", iterator.next().getObject(0));
    } finally {
      // Close the reader even when the assertion fails.
      iterator.close();
    }
  }
Пример #7
0
 /**
  * Reads every entry of {@code sink} and returns a copy of each tuple, in iteration order.
  *
  * @param sink tap to read from
  * @return a new list holding one copied {@code Tuple} per entry; empty if the tap has no entries
  * @throws IOException propagated from opening, reading, or closing the tap
  */
 protected List<Tuple> getAllTuples(Tap sink) throws IOException {
   List<Tuple> ret = Lists.newArrayList();
   TupleEntryIterator tupleEntryIterator = sink.openForRead(CascadingUtil.get().getFlowProcess());
   try {
     while (tupleEntryIterator.hasNext()) {
       // Copy-construct so the stored tuple does not alias the iterator's entry.
       ret.add(new Tuple(tupleEntryIterator.next().getTuple()));
     }
   } finally {
     // Previously the iterator was never closed, leaking the underlying reader.
     tupleEntryIterator.close();
   }
   return ret;
 }
  /**
   * Compiles the first rule, evaluates it with {@code EvaluationContext(1, 1, 1)}, and asserts
   * that the sink of the resulting flow assembly contains exactly one tuple. Each tuple is logged
   * at info level.
   *
   * @throws Exception propagated from compilation, evaluation, or sink reading
   */
  public void testEvaluation() throws Exception {
    CascadingRuleCompiler crc = new CascadingRuleCompiler(defaultConfiguration);
    IDistributedCompiledRule dcr = crc.compile(rules.get(0));
    dcr.evaluate(new EvaluationContext(1, 1, 1));
    FlowAssembly fa = dcr.getFlowAssembly();

    int size = 0;
    TupleEntryIterator tei = fa.openSink();
    try {
      while (tei.hasNext()) {
        TupleEntry te = tei.next();
        logger.info(te.getTuple().toString());
        size++;
      }
    } finally {
      // Previously the iterator was never closed, leaking the underlying reader.
      tei.close();
    }
    assertEquals(1, size);
  }
  /**
   * Writes a single ("ken", 37) tuple to a randomly named path under the platform's temp
   * directory and reads it back, expecting exactly one entry whose "name" field is "ken".
   *
   * @throws Exception propagated from the platform, collector, or iterator calls
   */
  @Test
  public void testTempPath() throws Exception {
    BasePlatform platform = new HadoopPlatform(HadoopPlatformTest.class);

    // Round-trip target: a uniquely named path below the platform temp dir.
    BasePath testDir = platform.makePath(platform.getTempDir(), UUID.randomUUID().toString());

    Tap tap = platform.makeTap(platform.makeBinaryScheme(new Fields("name", "age")), testDir);

    TupleEntryCollector out = tap.openForWrite(platform.makeFlowProcess());
    out.add(new Tuple("ken", 37));
    out.close();

    TupleEntryIterator in = tap.openForRead(platform.makeFlowProcess());
    assertTrue(in.hasNext());
    String name = in.next().getString("name");
    assertEquals("ken", name);
    assertFalse(in.hasNext());
    in.close();
  }
  /**
   * Calls {@code generateTerms} for the test working directory, iterates every entry of the terms
   * sub-directory through a {@code WikiTermDatum}, and asserts that the ARTICLES flow counter
   * equals 15.
   *
   * @throws Exception propagated from the workflow, platform, or iterator calls
   */
  @Test
  public void test() throws Exception {
    GenerateTermsOptions options = generateTerms("build/test/GenerateTermsFlowTest/test");

    // Verify that we get expected results in the output
    BasePlatform platform = options.getPlatform(GenerateTermsFlowTest.class);
    Tap tap =
        platform.makeTap(
            platform.makeBinaryScheme(WikiTermDatum.FIELDS),
            options.getWorkingSubdirPath(WorkingConfig.TERMS_SUBDIR_NAME));
    TupleEntryIterator iter = tap.openForRead(platform.makeFlowProcess());
    try {
      WikiTermDatum datum = new WikiTermDatum();
      while (iter.hasNext()) {
        datum.setTupleEntry(iter.next());
        // TODO verify that each field looks correct?
      }
    } finally {
      // Previously the iterator was never closed, leaking the underlying reader.
      iter.close();
    }

    // Verify we got the expected number of results.
    Map<String, Long> counters = options.getCounters(GenerateTermsFlow.class);
    String counterName = WorkflowOptions.getFlowCounterName(WikiwordsCounters.ARTICLES);
    assertEquals(15, (long) counters.get(counterName));
  }
  /**
   * Creates and processes a flow identified by {@code flowIdentificator}.
   * Results are stored at {@code output} under the result named {@code resultName}.
   *
   * <p>After the flow completes, the head sink is probed for at least one tuple. When it is
   * empty the results at {@code path} are deleted; otherwise empty {@code part-NNNNN} files under
   * {@code path} are removed. When predicate indexing is enabled and there were new inferences,
   * predicate counts are read from the flow and registered, and the predicate configuration is
   * saved.
   *
   * @param resultName name of the result; also the first part of the flow name
   * @param flowIdentificator suffix appended to {@code resultName} to form the flow name
   * @param output location the head sink writes to
   * @return {@code true} if the head sink contained at least one tuple
   * @throws IOException propagated from the file system and tap calls
   */
  private boolean processFlow(String resultName, String flowIdentificator, String output)
      throws IOException {
    boolean hasNewInferences = false;
    String flowName = resultName + flowIdentificator;
    Map<String, Tap> sources = prepareSourceTaps();

    SequenceFile sinkScheme = new SequenceFile(fields);
    // sinkScheme.setNumSinkParts(1); //FIXME
    Tap headSink = new Hfs(sinkScheme, output, true);

    // The head pipe and its sink are always part of the flow; further pipes/sinks may be added
    // by setupPredicateCounts below.
    Map<String, Tap> sinks = new HashMap<String, Tap>();
    List<Pipe> pipes = new ArrayList<Pipe>();
    sinks.put(pipe.getName(), headSink);
    pipes.add(pipe);
    if (mConfiguration.doPredicateIndexing) {
      // calculate the count of the result and write it in the configuration
      // if the predicate is a variable then we have to split also the result and put it in the
      // right location
      setupPredicateCounts(pipe, sinks, pipes);
    }

    flow =
        new FlowConnector(mConfiguration.flowProperties)
            .connect(flowName, sources, sinks, pipes.toArray(new Pipe[0]));
    // NOTE(review): this guard is empty (debug output is commented out), and flow.complete()
    // below is called whether or not flow is null.
    if (flow != null) {
      // flow.writeDOT("flow.dot");
    }
    flow.complete();

    // Probe the head sink: a single available tuple is enough to count as new inferences.
    // NOTE(review): the iterator is not closed if hasNext() throws, and the IOException is
    // rethrown as RuntimeException although this method declares IOException.
    try {
      TupleEntryIterator iterator = flow.openSink(pipe.getName());
      if (iterator.hasNext()) {
        hasNewInferences = true;
      }
      iterator.close();
    } catch (IOException e) {
      logger.error("io exception", e);
      throw new RuntimeException("io exception", e);
    }
    // NOTE(review): the cleanup below works on the field `path`, not on the `output` parameter
    // the head sink was written to; confirm both refer to the same location.
    if (!hasNewInferences) {
      deleteResults(new Path(path));
    } else {
      // merge part files FIXME
      FileSystem fs = FileSystem.get(mConfiguration.hadoopConfiguration);

      // delete empty results (could be from reducers running on no data)
      // Walks part-00000, part-00001, ... and stops at the first index whose file is missing.
      int index = 0;
      while (true) {
        String value = String.valueOf(index);
        // Left-pads the index with zeros to five digits.
        String file = path + "/" + "part-" + "00000".substring(0, 5 - value.length()) + value;
        Path filePath = new Path(file);
        if (fs.exists(filePath)) {
          Tap source = new Hfs(new Fields(0, 1, 2), file);
          TupleEntryIterator tei = source.openForRead(mConfiguration.jobConf);
          boolean noData = !tei.hasNext();
          tei.close();
          if (noData) {
            logger.info("delete empty result : " + file);
            fs.delete(filePath, false);
          }
        } else {
          break;
        }
        index++;
      }
    }

    if (hasNewInferences && mConfiguration.doPredicateIndexing) {
      FileSystem fs = FileSystem.get(mConfiguration.hadoopConfiguration);

      // update counts in configuration
      List<PredicateCount> predicateCounts = Utils.readPredicateCounts(flow, "predicatesPipe");

      distributedFileSystemManager.addPredicates(predicateCounts);

      if (ruleStreams.getHeadStream().getPredicate() == null) {
        // split result to the right locations (for variable predicate)
        Tap source = new Hfs(sinkScheme, output, true);
        Utils.splitStreamPerPredicates(
            mConfiguration,
            distributedFileSystemManager,
            source,
            predicateCounts,
            resultName,
            flowIdentificator);

        // The unsplit output is removed once it has been passed to splitStreamPerPredicates.
        fs.delete(new Path(output), true);
      }

      distributedFileSystemManager.savePredicateConfig();
      String predicateGroupsTempPath =
          distributedFileSystemManager.getPredicateGroupsTempPath(mConfiguration.resultsName);
      fs.delete(new Path(predicateGroupsTempPath), true);
    }

    return hasNewInferences;
  }
Пример #12
0
  /**
   * Copies a quoted, delimited input file through an empty pipe and checks that both the parsed
   * source and the written sink yield the expected tuples in order.
   *
   * @param path name fragment used to build the output path ({@code "quoted/" + path + useAll})
   * @param inputData location of the delimited input file
   * @param delimiter field delimiter; also embedded in some expected values
   * @param useAll when {@code true}, expected values are compared as strings and the taps are
   *     declared with {@code Fields.ALL} and no types
   * @throws IOException propagated from opening, reading, or closing the flow's taps
   */
  private void runQuotedText(String path, String inputData, String delimiter, boolean useAll)
      throws IOException {
    Object[][] results =
        new Object[][] {
          {"foo", "bar", "baz", "bin", 1L},
          {"foo", "bar", "baz", "bin", 2L},
          {"foo", "bar" + delimiter + "bar", "baz", "bin", 3L},
          {"foo", "bar\"" + delimiter + "bar", "baz", "bin", 4L},
          {"foo", "bar\"\"" + delimiter + "bar", "baz", "bin", 5L},
          {null, null, "baz", null, 6L},
          {null, null, null, null, 7L},
          {"foo", null, null, null, 8L},
          {null, null, null, null, 9L},
          {"f", null, null, null, 10L}, // this one is quoted, single char
          {"f", null, null, ",bin", 11L},
          {"f", null, null, "bin,", 11L}
        };

    if (useAll) {
      // Without declared types every non-null value is expected back as its string form.
      for (int i = 0; i < results.length; i++) {
        Object[] result = results[i];

        for (int j = 0; j < result.length; j++) {
          result[j] = result[j] != null ? result[j].toString() : null;
        }
      }
    }

    Tuple[] tuples = new Tuple[results.length];

    for (int i = 0; i < results.length; i++) {
      tuples[i] = new Tuple(results[i]);
    }

    Class[] types =
        new Class[] {String.class, String.class, String.class, String.class, long.class};
    Fields fields = new Fields("first", "second", "third", "fourth", "fifth");

    if (useAll) {
      types = null;
      fields = Fields.ALL;
    }

    Tap input =
        getPlatform()
            .getDelimitedFile(fields, false, delimiter, "\"", types, inputData, SinkMode.KEEP);
    Tap output =
        getPlatform()
            .getDelimitedFile(
                fields,
                false,
                delimiter,
                "\"",
                types,
                getOutputPath("quoted/" + path + "" + useAll),
                SinkMode.REPLACE);

    Pipe pipe = new Pipe("pipe");

    Flow flow = getPlatform().getFlowConnector().connect(input, output, pipe);

    flow.complete();

    validateLength(flow, results.length, 5);

    // validate input parsing compares to expected, and results compare to expected
    assertTuplesInOrder(flow.openSource(), tuples);
    assertTuplesInOrder(flow.openSink(), tuples);
  }

  /**
   * Asserts that the tuples produced by {@code iterator} equal {@code expected} position by
   * position, and always closes the iterator afterwards.
   */
  private void assertTuplesInOrder(TupleEntryIterator iterator, Tuple[] expected)
      throws IOException {
    try {
      int count = 0;
      while (iterator.hasNext()) {
        Tuple tuple = iterator.next().getTuple();
        assertEquals(expected[count++], tuple);
      }
    } finally {
      // Previously neither the source nor the sink iterator was closed.
      iterator.close();
    }
  }