/**
 * Runs the k-means sketch command: obtains the input vectors, picks the initial
 * vectors, runs {@code KMeansParallel.initialization} across the configured
 * cross-folds, and writes the converted result to {@code outputFile} through
 * {@code AvroIO.write}.
 *
 * <p>Initial vectors come from {@code getInitialVectors} when
 * {@code initVectorsPath} is set. If that yields nothing (or the path is unset),
 * the first vector of the materialized input is used as the sole initial vector.
 *
 * @param conf the configuration passed to {@code pipelineParams.create}
 * @return always {@code 0}; failures are reported through exceptions
 * @throws IOException propagated from the helpers invoked here
 * @throws IllegalArgumentException if no initial vectors were supplied and the
 *     input contains no vectors to fall back on
 */
@Override
public int execute(Configuration conf) throws IOException {
  Pipeline p = pipelineParams.create(KMeansSketchCommand.class, conf);

  List<Vector> initial = null;
  if (initVectorsPath != null) {
    initial = getInitialVectors(p);
  }

  PCollection<Vector> input = inputParams.getVectors(p);
  if (initial == null || initial.isEmpty()) {
    initial = Lists.newArrayList();
    // Take only the first materialized vector. The loop form avoids calling
    // next() on an empty iterator, which would raise an uninformative
    // NoSuchElementException.
    for (Vector first : input.materialize()) {
      initial.add(first);
      break;
    }
    if (initial.isEmpty()) {
      throw new IllegalArgumentException(
          "No initial vectors were provided and the input contains no vectors");
    }
  }

  KMeansParallel kmp = new KMeansParallel(randomParams.getRandom(), indexBits, indexSamples);
  Crossfold cf = new Crossfold(crossFolds);
  List<List<Weighted<Vector>>> wv =
      kmp.initialization(input, numIterations, samplesPerIteration, initial, cf);

  AvroIO.write(toWeightedCenters(wv), new File(outputFile));
  p.done();
  return 0;
}
/**
 * Runs the sample command: reads the input records, draws a sample either by
 * fixed size ({@code ReservoirSampling.sample}) or by probability
 * ({@code Sample.sample}), writes it to {@code sampleFile}, and finishes the
 * pipeline.
 *
 * <p>Exactly one of the two options must be usable: a positive
 * {@code sampleSize}, or a {@code samplingProbability} strictly between 0 and 1.
 *
 * @param conf the configuration passed to {@code pipelineParams.create}
 * @return {@code 0} if the pipeline result reports success, {@code 1} otherwise
 * @throws IllegalArgumentException if both options are set, or if neither is valid
 * @throws Exception propagated from the helpers invoked here
 */
@Override
public int execute(Configuration conf) throws Exception {
  Pipeline pipeline = pipelineParams.create(SampleCommand.class, conf);
  PCollection<Record> records = inputParams.getRecords(pipeline);

  boolean bySize = sampleSize > 0;
  boolean probPositive = samplingProbability > 0.0;

  if (bySize && probPositive) {
    throw new IllegalArgumentException("--size and --prob are mutually exclusive options.");
  }

  // Probability sampling is only meaningful on the open interval (0, 1).
  boolean byProbability = probPositive && samplingProbability < 1.0;
  if (!bySize && !byProbability) {
    throw new IllegalArgumentException(
        String.format(
            "Invalid input args: sample size = %d, sample prob = %.4f",
            sampleSize, samplingProbability));
  }

  PCollection<Record> sampled;
  if (bySize) {
    sampled = ReservoirSampling.sample(records, sampleSize);
  } else {
    sampled = Sample.sample(records, samplingProbability);
  }

  outputParams.write(sampled, sampleFile);

  PipelineResult result = pipeline.done();
  if (result.succeeded()) {
    return 0;
  }
  return 1;
}
/**
 * Runs the summary command: reads the input records, builds a {@code Summary}
 * with a {@code Summarizer}, and saves it to {@code summaryFile} through
 * {@code summaryParams.save}.
 *
 * <p>When {@code headerFile} is set, the spec is read from it, and the same call
 * receives the ignored-column and symbolic-column lists to fill in. Otherwise the
 * spec stays {@code null} and both lists stay empty.
 *
 * @param conf the configuration passed to {@code pipelineParams.create}
 * @return always {@code 0}; failures are reported through exceptions
 * @throws Exception propagated from the helpers invoked here
 */
@Override
public int execute(Configuration conf) throws Exception {
  Pipeline pipeline = pipelineParams.create(SummaryCommand.class, conf);
  PCollection<Record> input = inputParams.getRecords(pipeline);

  List<Integer> symbolic = Lists.newArrayList();
  List<Integer> ignored = Lists.newArrayList();

  // Both lists are handed to the header reader so it can record column indices.
  Spec headerSpec = (headerFile == null)
      ? null
      : Specs.readFromHeaderFile(headerFile, ignored, symbolic);

  Summarizer summarizer = new Summarizer()
      .spec(headerSpec)
      .defaultToSymbolic(false)
      .exceptionColumns(symbolic)
      .ignoreColumns(ignored);

  Summary result = summarizer.build(input).getValue();
  summaryParams.save(result, summaryFile);

  pipeline.done();
  return 0;
}