コード例 #1
0
ファイル: TfIdf.java プロジェクト: ssesha/DataflowJavaSDK
    /**
     * Reads every document in {@code uris} and merges the results into a single collection.
     *
     * <p>One {@code TextIO.Read} transform is applied per URI. Each element it produces is keyed
     * by that URI, and the per-document collections are flattened together.
     *
     * @param input the input whose pipeline the read transforms are applied to
     * @return the flattened collection of (URI, element) pairs across all documents
     */
    @Override
    public PCollection<KV<URI, String>> apply(PInput input) {
      Pipeline pipeline = input.getPipeline();

      // Create one TextIO.Read transform for each document
      // and add its output to a PCollectionList
      PCollectionList<KV<URI, String>> urisToLines = PCollectionList.empty(pipeline);

      // TextIO.Read supports:
      //  - file: URIs and paths locally
      //  - gs: URIs on the service
      for (final URI uri : uris) {
        String uriString;
        // URI.getScheme() returns null for a URI without a scheme, so compare with the
        // constant on the left to avoid a NullPointerException; such URIs are passed
        // through as their string form.
        if ("file".equals(uri.getScheme())) {
          uriString = new File(uri).getPath();
        } else {
          uriString = uri.toString();
        }

        PCollection<KV<URI, String>> oneUriToLines =
            pipeline
                .apply(TextIO.Read.from(uriString).named("TextIO.Read(" + uriString + ")"))
                .apply(WithKeys.<URI, String>of(uri).setName("WithKeys(" + uriString + ")"));

        urisToLines = urisToLines.and(oneUriToLines);
      }

      return urisToLines.apply(Flatten.<KV<URI, String>>pCollections());
    }
  /**
   * Verifies that translating a pipeline which reads from a path containing the recursive
   * wildcard "**" is rejected, since recursive wildcards are not supported.
   */
  @Test
  public void testBadWildcardRecursive() throws Exception {
    final String recursivePattern = "gs://bucket/foo**/baz";

    DataflowPipelineOptions pipelineOptions = buildPipelineOptions();
    Pipeline p = DataflowPipeline.create(pipelineOptions);
    DataflowPipelineTranslator translator =
        DataflowPipelineTranslator.fromOptions(pipelineOptions);

    p.apply(TextIO.Read.from(recursivePattern));

    // Declare the expected failure first; the translate call below is the one that should throw.
    // (thrown is declared outside this method - presumably a JUnit ExpectedException rule.)
    thrown.expect(IllegalArgumentException.class);
    thrown.expectMessage("Unsupported wildcard usage");
    translator.translate(p, Collections.<DataflowPackage>emptyList());
  }
コード例 #3
0
  /**
   * Builds the file-injector pipeline from the command-line arguments and runs it.
   *
   * <p>The pipeline reads from the configured input and passes the result to
   * {@code PubsubFileInjector.publish} for the configured output topic, with the
   * intra-bundle parallelism capped at 20.
   *
   * @param args command-line arguments, validated and parsed as {@code PubsubFileInjectorOptions}
   */
  public static void main(String[] args) {
    final int maxParallelism = 20;

    PubsubFileInjectorOptions injectorOptions =
        PipelineOptionsFactory.fromArgs(args).withValidation().as(PubsubFileInjectorOptions.class);
    Pipeline injector = Pipeline.create(injectorOptions);

    // Read the input, then publish with bounded parallelism.
    injector
        .apply(TextIO.Read.from(injectorOptions.getInput()))
        .apply(
            IntraBundleParallelization.of(
                    PubsubFileInjector.publish(injectorOptions.getOutputTopic()))
                .withMaxParallelism(maxParallelism));

    injector.run();
  }
コード例 #4
0
 /**
  * Launches the batch injector job that feeds the streaming pipeline.
  *
  * <p>A non-streaming copy of the current options is used to build a pipeline that reads
  * {@code inputFile} and publishes to {@code topic}. The resulting job is recorded in
  * {@code jobsToCancel}.
  *
  * @param inputFile the text file to read from
  * @param topic the Google Cloud Pub/Sub topic to inject data into
  */
 public void runInjectorPipeline(String inputFile, String topic) {
   final int maxParallelism = 20;

   // Derive batch-mode options from the current ones; the original options object is not modified
   // by the setters below, which act on the clone.
   DataflowPipelineOptions batchOptions = options.cloneAs(DataflowPipelineOptions.class);
   batchOptions.setStreaming(false);
   batchOptions.setNumWorkers(
       options.as(ExamplePubsubTopicOptions.class).getInjectorNumWorkers());
   String injectorJobName = options.getJobName() + "-injector";
   batchOptions.setJobName(injectorJobName);

   Pipeline injector = Pipeline.create(batchOptions);
   injector
       .apply(TextIO.Read.from(inputFile))
       .apply(
           IntraBundleParallelization.of(PubsubFileInjector.publish(topic))
               .withMaxParallelism(maxParallelism));

   // Keep a handle on the launched job by adding it to jobsToCancel.
   jobsToCancel.add((DataflowPipelineJob) injector.run());
 }
 /**
  * Applies a {@code TextIO.Read} for {@code path} to the pipeline under the name
  * {@code "Read(" + path + ")"}.
  *
  * @param pipeline the pipeline to apply the read transform to
  * @param path the location passed to {@code TextIO.Read.from}
  */
 private void applyRead(Pipeline pipeline, String path) {
   String transformName = "Read(" + path + ")";
   pipeline.apply(transformName, TextIO.Read.from(path));
 }