public static void main(String[] args) {
    PipelineOptionsFactory.register(KafkaStreamingWordCountOptions.class);
    KafkaStreamingWordCountOptions options =
        PipelineOptionsFactory.fromArgs(args).as(KafkaStreamingWordCountOptions.class);
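    // Run in streaming mode on the Flink runner, checkpointing every second and
    // retrying failed executions after a short delay.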
    options.setJobName("KafkaExample - WindowSize: " + options.getWindowSize() + " seconds");
    options.setStreaming(true);
    options.setCheckpointingInterval(1000L);
    options.setNumberOfExecutionRetries(5);
    options.setExecutionRetryDelay(3000L);
    options.setRunner(FlinkPipelineRunner.class);

    System.out.println(
        options.getKafkaTopic()
            + " "
            + options.getZookeeper()
            + " "
            + options.getBroker()
            + " "
            + options.getGroup());
    Pipeline pipeline = Pipeline.create(options);

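    // Kafka consumer configuration: ZooKeeper quorum, broker list, and consumer group.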
    Properties p = new Properties();
    p.setProperty("zookeeper.connect", options.getZookeeper());
    p.setProperty("bootstrap.servers", options.getBroker());
    p.setProperty("group.id", options.getGroup());

    // The Flink Kafka consumer that reads the program's input
    // from a Kafka topic.
    FlinkKafkaConsumer08<String> kafkaConsumer =
        new FlinkKafkaConsumer08<>(options.getKafkaTopic(), new SimpleStringSchema(), p);

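    // Read an unbounded stream from the Kafka topic, split each line into words,
    // and assign them to fixed-size windows.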
    PCollection<String> words =
        pipeline
            .apply(Read.from(new UnboundedFlinkSource<>(kafkaConsumer)).named("StreamingWordCount"))
            .apply(ParDo.of(new ExtractWordsFn()))
            .apply(
                Window.<String>into(
                        FixedWindows.of(Duration.standardSeconds(options.getWindowSize())))
                    .triggering(AfterWatermark.pastEndOfWindow())
                    .withAllowedLateness(Duration.ZERO)
                    .discardingFiredPanes());

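    // Count the occurrences of each word within every window.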
    PCollection<KV<String, Long>> wordCounts = words.apply(Count.<String>perElement());

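    // Format each (word, count) pair as text and write the results to a local file.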
    wordCounts.apply(ParDo.of(new FormatAsStringFn())).apply(TextIO.Write.to("./outputKafka.txt"));

    pipeline.run();
  }
  @Test
  public void testReadEmptyCollectionSideInput() throws Exception {
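    // Reading an empty collection-valued side input should yield no elements.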
    SideInputInfo sideInputInfo = createCollectionSideInputInfo(createSideInputSource());

    assertThatContains(
        SideInputUtils.readSideInput(
            PipelineOptionsFactory.create(), sideInputInfo, new BatchModeExecutionContext()));
  }
  @Test
  public void testReadSingletonSideInput() throws Exception {
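    // A singleton side input created from the value 42 should read back as 42.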
    SideInputInfo sideInputInfo = createSingletonSideInputInfo(createSideInputSource(42));

    assertEquals(
        42,
        SideInputUtils.readSideInput(
            PipelineOptionsFactory.create(), sideInputInfo, new BatchModeExecutionContext()));
  }
  @Test
  public void testCreateNormalParDoFn() throws Exception {
    String stringState = "some state";
    long longState = 42L;

    TestDoFn fn = new TestDoFn(stringState, longState);

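    // Serialize the DoFn, wrapped in a DoFnInfo with the default windowing strategy,
    // into the JSON-encoded form the factory expects under "serialized_fn".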
    String serializedFn =
        StringUtils.byteArrayToJsonString(
            SerializableUtils.serializeToByteArray(
                new DoFnInfo(fn, WindowingStrategy.globalDefault())));

    CloudObject cloudUserFn = CloudObject.forClassName("DoFn");
    addString(cloudUserFn, "serialized_fn", serializedFn);

    String tag = "output";
    MultiOutputInfo multiOutputInfo = new MultiOutputInfo();
    multiOutputInfo.setTag(tag);
    List<MultiOutputInfo> multiOutputInfos = Arrays.asList(multiOutputInfo);

    PipelineOptions options = PipelineOptionsFactory.create();
    DataflowExecutionContext context = BatchModeExecutionContext.fromOptions(options);
    CounterSet counters = new CounterSet();
    StateSampler stateSampler = new StateSampler("test", counters.getAddCounterMutator());
    ParDoFn parDoFn =
        factory.create(
            options,
            cloudUserFn,
            "name",
            "transformName",
            null,
            multiOutputInfos,
            1,
            context,
            counters.getAddCounterMutator(),
            stateSampler);

    // Test that the factory created the correct class
    assertThat(parDoFn, instanceOf(NormalParDoFn.class));

    // Test that the DoFnInfo reflects the one passed in
    NormalParDoFn normalParDoFn = (NormalParDoFn) parDoFn;
    DoFnInfo doFnInfo = normalParDoFn.getDoFnInfo();
    DoFn actualDoFn = doFnInfo.getDoFn();
    assertThat(actualDoFn, instanceOf(TestDoFn.class));
    assertThat(doFnInfo.getWindowingStrategy().getWindowFn(), instanceOf(GlobalWindows.class));
    assertThat(
        doFnInfo.getWindowingStrategy().getTrigger().getSpec(), instanceOf(DefaultTrigger.class));

    // Test that the deserialized user DoFn is as expected
    TestDoFn actualTestDoFn = (TestDoFn) actualDoFn;
    assertEquals(stringState, actualTestDoFn.stringState);
    assertEquals(longState, actualTestDoFn.longState);
    assertEquals(context, normalParDoFn.getExecutionContext());
  }
  @Test
  public void testSplitsWithSmallBlocks() throws Exception {
    PipelineOptions options = PipelineOptionsFactory.create();
    // Test reading from an object file with many small random-sized blocks.
    List<Bird> expected = createRandomRecords(DEFAULT_RECORD_COUNT);
    String filename =
        generateTestFile(
            "tmp.avro",
            expected,
            SyncBehavior.SYNC_RANDOM,
            100 /* max records/block */,
            AvroCoder.of(Bird.class),
            DataFileConstants.NULL_CODEC);
    File file = new File(filename);

    // Small minimum bundle size
    AvroSource<Bird> source =
        AvroSource.from(filename).withSchema(Bird.class).withMinBundleSize(100L);

    // Assert that the source produces the expected records
    assertEquals(expected, SourceTestUtils.readFromSource(source, options));

    List<? extends BoundedSource<Bird>> splits;
    int nonEmptySplits;

    // Split with the minimum bundle size
    splits = source.splitIntoBundles(100L, options);
    assertTrue(splits.size() > 2);
    SourceTestUtils.assertSourcesEqualReferenceSource(source, splits, options);
    nonEmptySplits = 0;
    for (BoundedSource<Bird> subSource : splits) {
      if (SourceTestUtils.readFromSource(subSource, options).size() > 0) {
        nonEmptySplits += 1;
      }
    }
    assertTrue(nonEmptySplits > 2);

    // Split with larger bundle size
    splits = source.splitIntoBundles(file.length() / 4, options);
    assertTrue(splits.size() > 2);
    SourceTestUtils.assertSourcesEqualReferenceSource(source, splits, options);
    nonEmptySplits = 0;
    for (BoundedSource<Bird> subSource : splits) {
      if (SourceTestUtils.readFromSource(subSource, options).size() > 0) {
        nonEmptySplits += 1;
      }
    }
    assertTrue(nonEmptySplits > 2);

    // Split with the file length
    splits = source.splitIntoBundles(file.length(), options);
    assertTrue(splits.size() == 1);
    SourceTestUtils.assertSourcesEqualReferenceSource(source, splits, options);
  }
  public static void main(String[] args) throws Exception {
    Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
    Pipeline pipeline = Pipeline.create(options);
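    // Documents are keyed by URI; register a coder that encodes URIs via their string form.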
    pipeline.getCoderRegistry().registerCoder(URI.class, StringDelegateCoder.of(URI.class));

    pipeline
        .apply(new ReadDocuments(listInputDocuments(options)))
        .apply(new ComputeTfIdf())
        .apply(new WriteTfIdf(options.getOutput()));

    pipeline.run();
  }
 @Test
 public void testCreateUnknownParDoFn() throws Exception {
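   // The factory should reject a CloudObject naming an unknown DoFn class.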
   CloudObject cloudUserFn = CloudObject.forClassName("UnknownKindOfDoFn");
   try {
     CounterSet counters = new CounterSet();
     StateSampler stateSampler = new StateSampler("test", counters.getAddCounterMutator());
     factory.create(
         PipelineOptionsFactory.create(),
         cloudUserFn,
         "name",
         "transformName",
         null,
         null,
         1,
         BatchModeExecutionContext.fromOptions(PipelineOptionsFactory.create()),
         counters.getAddCounterMutator(),
         stateSampler);
     fail("should have thrown an exception");
   } catch (Exception exn) {
     assertThat(exn.toString(), Matchers.containsString("No known ParDoFnFactory"));
   }
 }
  /** Sets up and starts the streaming pipeline. */
  public static void main(String[] args) {
    PubsubFileInjectorOptions options =
        PipelineOptionsFactory.fromArgs(args).withValidation().as(PubsubFileInjectorOptions.class);

    Pipeline pipeline = Pipeline.create(options);

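    // Read each line of the input files and publish it to the output Pub/Sub topic,
    // processing up to 20 elements of a bundle in parallel.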
    pipeline
        .apply(TextIO.Read.from(options.getInput()))
        .apply(
            IntraBundleParallelization.of(PubsubFileInjector.publish(options.getOutputTopic()))
                .withMaxParallelism(20));

    pipeline.run();
  }
  private static DataflowPipelineOptions buildPipelineOptions() throws IOException {
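    // Stub out GCS access so option validation succeeds without network calls.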
    GcsUtil mockGcsUtil = mock(GcsUtil.class);
    when(mockGcsUtil.bucketExists(any(GcsPath.class))).thenReturn(true);
    when(mockGcsUtil.isGcsPatternSupported(anyString())).thenCallRealMethod();

    DataflowPipelineOptions options = PipelineOptionsFactory.as(DataflowPipelineOptions.class);
    options.setGcpCredential(new TestCredential());
    options.setJobName("some-job-name");
    options.setProject("some-project");
    options.setTempLocation(GcsPath.fromComponents("somebucket", "some/path").toString());
    options.setFilesToStage(new LinkedList<String>());
    options.setDataflowClient(buildMockDataflow(new IsValidCreateRequest()));
    options.setGcsUtil(mockGcsUtil);
    return options;
  }
  @Test
  public void testMultiGraphPipelineSerialization() throws IOException {
    Pipeline p = DataflowPipeline.create(buildPipelineOptions());

    PCollection<Integer> input = p.begin().apply(Create.of(1, 2, 3));

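    // Apply two independent composite transforms to the same input so the
    // pipeline graph has multiple output branches.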
    input.apply(new UnrelatedOutputCreator());
    input.apply(new UnboundOutputCreator());

    DataflowPipelineTranslator t =
        DataflowPipelineTranslator.fromOptions(
            PipelineOptionsFactory.as(DataflowPipelineOptions.class));

    // Check that translation doesn't fail.
    t.translate(p, Collections.<DataflowPackage>emptyList());
  }
  @SuppressWarnings("rawtypes")
  private static ParDoFn createCombineValuesFn(String phase, Combine.KeyedCombineFn combineFn)
      throws Exception {
    // This partially mirrors the work that
    // com.google.cloud.dataflow.sdk.transforms.Combine.translateHelper
    // does, at least for the KeyedCombineFn. The phase is generated
    // by the back-end.
    CloudObject spec = CloudObject.forClassName("CombineValuesFn");
    addString(
        spec, PropertyNames.SERIALIZED_FN, byteArrayToJsonString(serializeToByteArray(combineFn)));
    addString(spec, PropertyNames.PHASE, phase);

    return parDoFnFactory.create(
        PipelineOptionsFactory.create(),
        spec,
        "name",
        "transformName",
        null, // no side inputs
        null, // no side outputs
        1, // single main output
        DataflowExecutionContext.withoutSideInputs(),
        (new CounterSet()).getAddCounterMutator(),
        null);
  }
 private TestDataflowPipelineOptions buildTestPipelineOptions() {
   TestDataflowPipelineOptions options =
       PipelineOptionsFactory.as(TestDataflowPipelineOptions.class);
   options.setGcpCredential(new TestCredential());
   return options;
 }