public static void main(String[] args) {
  WordCountOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(WordCountOptions.class);
  Pipeline p = Pipeline.create(options);

  // Concepts #2 and #3: Our pipeline applies the composite CountWords transform, and passes the
  // static FormatAsTextFn() to the MapElements transform.
  p.apply("ReadLines", TextIO.Read.from(options.getInputFile()))
      .apply(new CountWords())
      .apply(MapElements.via(new FormatAsTextFn()))
      .apply("WriteCounts", TextIO.Write.to(options.getOutput()));

  p.run().waitUntilFinish();
}
@Test
public void testE2EBigtableWrite() throws Exception {
  final String tableName = bigtableOptions.getInstanceName().toTableNameStr(tableId);
  final String instanceName = bigtableOptions.getInstanceName().toString();
  final int numRows = 1000;
  final List<KV<ByteString, ByteString>> testData = generateTableData(numRows);

  createEmptyTable(instanceName, tableId);

  Pipeline p = Pipeline.create(options);
  p.apply(CountingInput.upTo(numRows))
      .apply(
          ParDo.of(
              new DoFn<Long, KV<ByteString, Iterable<Mutation>>>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  // Map each generated index to its test row key plus a single SetCell mutation.
                  int index = c.element().intValue();
                  Iterable<Mutation> mutations =
                      ImmutableList.of(
                          Mutation.newBuilder()
                              .setSetCell(
                                  Mutation.SetCell.newBuilder()
                                      .setValue(testData.get(index).getValue())
                                      .setFamilyName(COLUMN_FAMILY_NAME))
                              .build());
                  c.output(KV.of(testData.get(index).getKey(), mutations));
                }
              }))
      .apply(BigtableIO.write().withBigtableOptions(bigtableOptions).withTableId(tableId));
  p.run();

  // Test number of column families and column family name equality
  Table table = getTable(tableName);
  assertThat(table.getColumnFamilies().keySet(), Matchers.hasSize(1));
  assertThat(table.getColumnFamilies(), Matchers.hasKey(COLUMN_FAMILY_NAME));

  // Test table data equality
  List<KV<ByteString, ByteString>> tableData = getTableData(tableName);
  assertThat(tableData, Matchers.containsInAnyOrder(testData.toArray()));
}
@Test
public void testSequenceFile() throws Exception {
  populateFile();

  Pipeline p = Pipeline.create(pipelineOptions.getOptions());

  // Read the populated SequenceFile through HadoopIO using SequenceFileInputFormat.
  @SuppressWarnings("unchecked")
  Class<? extends FileInputFormat<IntWritable, Text>> inputFormatClass =
      (Class<? extends FileInputFormat<IntWritable, Text>>)
          (Class<?>) SequenceFileInputFormat.class;
  HadoopIO.Read.Bound<IntWritable, Text> read =
      HadoopIO.Read.from(
          inputFile.getAbsolutePath(), inputFormatClass, IntWritable.class, Text.class);
  PCollection<KV<IntWritable, Text>> input =
      p.apply(read)
          .setCoder(
              KvCoder.of(WritableCoder.of(IntWritable.class), WritableCoder.of(Text.class)));

  // Write the same records back out through HadoopIO as a single unsharded SequenceFile.
  @SuppressWarnings("unchecked")
  Class<? extends FileOutputFormat<IntWritable, Text>> outputFormatClass =
      (Class<? extends FileOutputFormat<IntWritable, Text>>)
          (Class<?>) TemplatedSequenceFileOutputFormat.class;
  @SuppressWarnings("unchecked")
  HadoopIO.Write.Bound<IntWritable, Text> write =
      HadoopIO.Write.to(
          outputFile.getAbsolutePath(), outputFormatClass, IntWritable.class, Text.class);
  input.apply(write.withoutSharding());
  p.run();

  // Read the output back with a raw SequenceFile reader and verify every key/value pair.
  IntWritable key = new IntWritable();
  Text value = new Text();
  try (Reader reader =
      new Reader(new Configuration(), Reader.file(new Path(outputFile.toURI())))) {
    int i = 0;
    while (reader.next(key, value)) {
      assertEquals(i, key.get());
      assertEquals("value-" + i, value.toString());
      i++;
    }
  }
}