Example #1
0
  @Override
  public int execute(Configuration conf) throws IOException {
    Pipeline p = pipelineParams.create(KMeansSketchCommand.class, conf);
    List<Vector> initial = null;
    if (initVectorsPath != null) {
      initial = getInitialVectors(p);
    }

    PCollection<Vector> input = inputParams.getVectors(p);
    if (initial == null || initial.isEmpty()) {
      initial = Lists.newArrayList();
      initial.add(input.materialize().iterator().next());
    }
    KMeansParallel kmp = new KMeansParallel(randomParams.getRandom(), indexBits, indexSamples);
    Crossfold cf = new Crossfold(crossFolds);

    List<List<Weighted<Vector>>> wv =
        kmp.initialization(input, numIterations, samplesPerIteration, initial, cf);
    AvroIO.write(toWeightedCenters(wv), new File(outputFile));
    p.done();

    return 0;
  }
Example #2
0
  @Override
  public int execute(Configuration conf) throws Exception {
    Pipeline p = pipelineParams.create(SampleCommand.class, conf);
    PCollection<Record> elements = inputParams.getRecords(p);

    if (sampleSize > 0 && samplingProbability > 0.0) {
      throw new IllegalArgumentException("--size and --prob are mutually exclusive options.");
    }
    PCollection<Record> sample;
    if (sampleSize > 0) {
      sample = ReservoirSampling.sample(elements, sampleSize);
    } else if (samplingProbability > 0.0 && samplingProbability < 1.0) {
      sample = Sample.sample(elements, samplingProbability);
    } else {
      throw new IllegalArgumentException(
          String.format(
              "Invalid input args: sample size = %d, sample prob = %.4f",
              sampleSize, samplingProbability));
    }
    outputParams.write(sample, sampleFile);

    PipelineResult pr = p.done();
    return pr.succeeded() ? 0 : 1;
  }
Example #3
0
  @Override
  public int execute(Configuration conf) throws Exception {
    Pipeline p = pipelineParams.create(SummaryCommand.class, conf);
    PCollection<Record> records = inputParams.getRecords(p);

    Spec spec = null;
    List<Integer> symbolicColumns = Lists.newArrayList();
    List<Integer> ignoredColumns = Lists.newArrayList();
    if (headerFile != null) {
      spec = Specs.readFromHeaderFile(headerFile, ignoredColumns, symbolicColumns);
    }

    Summarizer summarizer =
        new Summarizer()
            .spec(spec)
            .defaultToSymbolic(false)
            .exceptionColumns(symbolicColumns)
            .ignoreColumns(ignoredColumns);
    Summary summary = summarizer.build(records).getValue();
    summaryParams.save(summary, summaryFile);

    p.done();
    return 0;
  }