@Override
public PCollection<KV<URI, String>> apply(PInput input) {
  Pipeline pipeline = input.getPipeline();

  // Create one TextIO.Read transform for each document
  // and add its output to a PCollectionList.
  PCollectionList<KV<URI, String>> urisToLines =
      PCollectionList.empty(pipeline);

  // TextIO.Read supports:
  //  - file: URIs and paths locally
  //  - gs: URIs on the service
  for (final URI uri : uris) {
    String uriString;
    if (uri.getScheme().equals("file")) {
      uriString = new File(uri).getPath();
    } else {
      uriString = uri.toString();
    }

    PCollection<KV<URI, String>> oneUriToLines = pipeline
        .apply(TextIO.Read.from(uriString)
            .named("TextIO.Read(" + uriString + ")"))
        .apply(WithKeys.<URI, String>of(uri)
            .setName("WithKeys(" + uriString + ")"));

    urisToLines = urisToLines.and(oneUriToLines);
  }

  return urisToLines.apply(Flatten.<KV<URI, String>>pCollections());
}
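// A minimal usage sketch (assumption: the apply() above lives in a composite
// PTransform<PInput, PCollection<KV<URI, String>>> subclass, hypothetically
// named ReadDocuments, whose constructor stores the Set<URI> 'uris' it reads):
//
//   Pipeline p = Pipeline.create(options);
//   PCollection<KV<URI, String>> urisToLines =
//       p.apply(new ReadDocuments(documentUris));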
/** Recursive wildcards are not supported. This tests "**". */
@Test
public void testBadWildcardRecursive() throws Exception {
  DataflowPipelineOptions options = buildPipelineOptions();
  Pipeline pipeline = DataflowPipeline.create(options);
  DataflowPipelineTranslator t = DataflowPipelineTranslator.fromOptions(options);

  pipeline.apply(TextIO.Read.from("gs://bucket/foo**/baz"));

  // Check that translation does fail.
  thrown.expect(IllegalArgumentException.class);
  thrown.expectMessage("Unsupported wildcard usage");
  t.translate(pipeline, Collections.<DataflowPackage>emptyList());
}
/** Sets up and starts the streaming pipeline. */
public static void main(String[] args) {
  PubsubFileInjectorOptions options = PipelineOptionsFactory.fromArgs(args)
      .withValidation()
      .as(PubsubFileInjectorOptions.class);
  Pipeline pipeline = Pipeline.create(options);

  pipeline
      .apply(TextIO.Read.from(options.getInput()))
      .apply(IntraBundleParallelization.of(PubsubFileInjector.publish(options.getOutputTopic()))
          .withMaxParallelism(20));

  pipeline.run();
}
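// Note: IntraBundleParallelization wraps the publish DoFn so that elements
// within a bundle are processed on multiple threads; withMaxParallelism(20)
// caps the number of concurrent Pub/Sub publish calls per bundle at 20.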
/**
 * Runs the batch injector for the streaming pipeline.
 *
 * <p>The injector pipeline will read from the given text file, and inject data into the Google
 * Cloud Pub/Sub topic.
 */
public void runInjectorPipeline(String inputFile, String topic) {
  DataflowPipelineOptions copiedOptions = options.cloneAs(DataflowPipelineOptions.class);
  copiedOptions.setStreaming(false);
  copiedOptions.setNumWorkers(
      options.as(ExamplePubsubTopicOptions.class).getInjectorNumWorkers());
  copiedOptions.setJobName(options.getJobName() + "-injector");

  Pipeline injectorPipeline = Pipeline.create(copiedOptions);
  injectorPipeline
      .apply(TextIO.Read.from(inputFile))
      .apply(IntraBundleParallelization.of(PubsubFileInjector.publish(topic))
          .withMaxParallelism(20));

  DataflowPipelineJob injectorJob = (DataflowPipelineJob) injectorPipeline.run();
  jobsToCancel.add(injectorJob);
}
private void applyRead(Pipeline pipeline, String path) {
  pipeline.apply("Read(" + path + ")", TextIO.Read.from(path));
}
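// Note: this helper uses the two-argument apply(name, transform) overload to
// set the step name, which has the same effect as the .named(...) style used
// in the snippets above.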