Example #1
  @Override
  public int run(final String[] args) throws Exception {
    createTable();
    final Configuration config = getConf();
    final Pipeline pipeline =
        new MRPipeline(CrunchStockDateInserter.class, "PipelineWithFilterFn", config);
    PCollection<String> lines = pipeline.readTextFile(Constants.HDFS_INPUT_PATH + "/2004_2014.csv");
    PCollection<Put> resultPut = CrunchUtils.returnDates(lines);
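    // Note: PCollection#getSize() returns an estimated size in bytes, not a record count.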
    System.out.println("********** size ************ : " + resultPut.getSize());

    pipeline.write(resultPut, new HBaseTarget(Constants.STOCK_DATES_TABLE));
    PipelineResult result = pipeline.done();
    return result.succeeded() ? 0 : 1;
  }
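The CrunchUtils.returnDates transform used above is not shown on this page. A minimal sketch of what such a transform could look like, assuming crunch-hbase's HBaseTypes.puts() PType, the HBase 1.x Put API, and a hypothetical CSV layout in which the first field is a date that becomes the row key (the column family and qualifier names here are placeholders):

  public static PCollection<Put> returnDates(PCollection<String> lines) {
    return lines.parallelDo(new MapFn<String, Put>() {
      @Override
      public Put map(String line) {
        // Hypothetical layout: field 0 of the CSV line is the date row key.
        String[] fields = line.split(",");
        Put put = new Put(Bytes.toBytes(fields[0]));
        put.addColumn(Bytes.toBytes("d"), Bytes.toBytes("present"), Bytes.toBytes("y"));
        return put;
      }
    }, HBaseTypes.puts());
  }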
Example #2
  public int run(String[] args) throws Exception {
    Pipeline pipeline = new MRPipeline(SecondarySortingExample.class);
    // Read input
    PCollection<String> lines = pipeline.readTextFile(args[0]);
    // Split each line and count them
    PTable<String, Long> wordcount = lines.parallelDo(new Tokenizer(), Writables.strings()).count();
    // Sort by the first column (the word) in descending order
    PCollection<Pair<String, Long>> sorted =
        Sort.sortPairs(wordcount, ColumnOrder.by(1, Sort.Order.DESCENDING));
    // Write the output to the second path argument (writing to args[0] would clobber the input)
    sorted.write(To.textFile(args[1]));
    // Kick off execution
    PipelineResult result = pipeline.done();
    return result.succeeded() ? 0 : 1;
  }
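The Tokenizer DoFn referenced above is not shown. A minimal sketch, assuming plain whitespace tokenization:

  public static class Tokenizer extends DoFn<String, String> {
    @Override
    public void process(String line, Emitter<String> emitter) {
      // Emit every whitespace-delimited token on the line.
      for (String word : line.split("\\s+")) {
        if (!word.isEmpty()) {
          emitter.emit(word);
        }
      }
    }
  }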
Example #3
  @Override
  public int execute(Configuration conf) throws IOException {
    Pipeline p = pipelineParams.create(KMeansSketchCommand.class, conf);
    List<Vector> initial = null;
    if (initVectorsPath != null) {
      initial = getInitialVectors(p);
    }

    PCollection<Vector> input = inputParams.getVectors(p);
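    // Fall back to seeding with the first input vector when no initial centers were provided.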
    if (initial == null || initial.isEmpty()) {
      initial = Lists.newArrayList();
      initial.add(input.materialize().iterator().next());
    }
    KMeansParallel kmp = new KMeansParallel(randomParams.getRandom(), indexBits, indexSamples);
    Crossfold cf = new Crossfold(crossFolds);

    List<List<Weighted<Vector>>> wv =
        kmp.initialization(input, numIterations, samplesPerIteration, initial, cf);
    AvroIO.write(toWeightedCenters(wv), new File(outputFile));
    p.done();

    return 0;
  }
Example #4
  @Override
  public int execute(Configuration conf) throws Exception {
    Pipeline p = pipelineParams.create(SampleCommand.class, conf);
    PCollection<Record> elements = inputParams.getRecords(p);

    if (sampleSize > 0 && samplingProbability > 0.0) {
      throw new IllegalArgumentException("--size and --prob are mutually exclusive options.");
    }
    PCollection<Record> sample;
    if (sampleSize > 0) {
      sample = ReservoirSampling.sample(elements, sampleSize);
    } else if (samplingProbability > 0.0 && samplingProbability < 1.0) {
      sample = Sample.sample(elements, samplingProbability);
    } else {
      throw new IllegalArgumentException(
          String.format(
              "Invalid input args: sample size = %d, sample prob = %.4f",
              sampleSize, samplingProbability));
    }
    outputParams.write(sample, sampleFile);

    PipelineResult pr = p.done();
    return pr.succeeded() ? 0 : 1;
  }
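For comparison, stock Crunch exposes both operations on org.apache.crunch.lib.Sample, so the same branch could be written without the external ReservoirSampling helper (a sketch, assuming a Crunch version that ships Sample.reservoirSample):

    PCollection<Record> sample;
    if (sampleSize > 0) {
      sample = Sample.reservoirSample(elements, sampleSize);
    } else {
      sample = Sample.sample(elements, samplingProbability);
    }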
Example #5
  @Override
  public int execute(Configuration conf) throws Exception {
    Pipeline p = pipelineParams.create(SummaryCommand.class, conf);
    PCollection<Record> records = inputParams.getRecords(p);

    Spec spec = null;
    List<Integer> symbolicColumns = Lists.newArrayList();
    List<Integer> ignoredColumns = Lists.newArrayList();
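    // If a header file is supplied, derive the column spec (and the symbolic/ignored column lists) from it.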
    if (headerFile != null) {
      spec = Specs.readFromHeaderFile(headerFile, ignoredColumns, symbolicColumns);
    }

    Summarizer summarizer =
        new Summarizer()
            .spec(spec)
            .defaultToSymbolic(false)
            .exceptionColumns(symbolicColumns)
            .ignoreColumns(ignoredColumns);
    Summary summary = summarizer.build(records).getValue();
    summaryParams.save(summary, summaryFile);

    p.done();
    return 0;
  }
Example #6
 @After
 public void tearDown() {
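   // Run any remaining jobs and clean up the pipeline's temporary files after each test.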
   pipeline.done();
 }
Example #7
 private void runCheckpointPipeline(View<Record> inputView, View<Record> outputView) {
   Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
   PCollection<GenericData.Record> data = pipeline.read(CrunchDatasets.asSource(inputView));
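   // WriteMode.CHECKPOINT persists this output so a later run can resume from it
   // if it already exists and is newer than the pipeline's inputs.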
   pipeline.write(data, CrunchDatasets.asTarget(outputView), Target.WriteMode.CHECKPOINT);
   pipeline.done();
 }
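A hypothetical invocation of the helper above, assuming the views are loaded with Kite's Datasets.load (the dataset URIs are placeholders):

   View<GenericData.Record> inputView =
       Datasets.load("dataset:hdfs:/tmp/data/source", GenericData.Record.class);
   View<GenericData.Record> outputView =
       Datasets.load("dataset:hdfs:/tmp/data/checkpoint", GenericData.Record.class);
   runCheckpointPipeline(inputView, outputView);

Re-running the pipeline against an unchanged inputView should then reuse the checkpointed output rather than recomputing it.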