private BigQueryReader createTyped(CloudObject spec, PipelineOptions options) throws Exception {
  Bigquery client = Transport.newBigQueryClient(options.as(BigQueryOptions.class)).build();
  String query = getString(spec, PropertyNames.BIGQUERY_QUERY, null);
  if (query != null) {
    // A query was specified: execute it in the pipeline's project and read its results.
    String project = options.as(GcpOptions.class).getProject();
    Boolean flatten = getBoolean(spec, PropertyNames.BIGQUERY_FLATTEN_RESULTS, true);
    return BigQueryReader.fromQuery(query, project, client, flatten);
  } else {
    // No query was specified, so a fully qualified table reference is required.
    String tableId = getString(spec, PropertyNames.BIGQUERY_TABLE, null);
    checkArgument(tableId != null, "Either a table or a query has to be specified");
    String project = getString(spec, PropertyNames.BIGQUERY_PROJECT);
    String dataset = getString(spec, PropertyNames.BIGQUERY_DATASET);
    return BigQueryReader.fromTable(
        new TableReference().setProjectId(project).setDatasetId(dataset).setTableId(tableId),
        client);
  }
}
private Datastore getDatastore(PipelineOptions pipelineOptions) {
  DatastoreOptions.Builder builder = new DatastoreOptions.Builder()
      .host(host)
      .dataset(datasetId)
      .initializer(new RetryHttpRequestInitializer());

  Credential credential = pipelineOptions.as(GcpOptions.class).getGcpCredential();
  if (credential != null) {
    builder.credential(credential);
  }
  return DatastoreFactory.get().create(builder.build());
}
@Override
public List<Source> splitIntoBundles(long desiredBundleSizeBytes, PipelineOptions options)
    throws Exception {
  // Users may request a limit on the number of results. We can currently support this by
  // simply disabling parallel reads and using only a single split.
  if (query.hasLimit()) {
    return ImmutableList.of(this);
  }

  long numSplits;
  try {
    numSplits = Math.round(((double) getEstimatedSizeBytes(options)) / desiredBundleSizeBytes);
  } catch (Exception e) {
    // Fallback in case estimated size is unavailable. TODO: fix this, it's horrible.
    // 1. Try Dataflow's numWorkers, which will be 0 for other runners.
    DataflowPipelineWorkerPoolOptions poolOptions =
        options.as(DataflowPipelineWorkerPoolOptions.class);
    if (poolOptions.getNumWorkers() > 0) {
      LOG.warn("Estimated size unavailable, using the number of workers {}",
          poolOptions.getNumWorkers(), e);
      numSplits = poolOptions.getNumWorkers();
    } else {
      // 2. Default to 12 in the unknown case.
      numSplits = 12;
    }
  }

  // If the desiredBundleSize or number of workers results in 1 split, simply return
  // a source that reads from the original query.
  if (numSplits <= 1) {
    return ImmutableList.of(this);
  }

  List<Query> datastoreSplits;
  try {
    datastoreSplits = getSplitQueries(Ints.checkedCast(numSplits), options);
  } catch (IllegalArgumentException | DatastoreException e) {
    LOG.warn("Unable to parallelize the given query: {}", query, e);
    return ImmutableList.of(this);
  }

  ImmutableList.Builder<Source> splits = ImmutableList.builder();
  for (Query splitQuery : datastoreSplits) {
    splits.add(new Source(host, datasetId, splitQuery, namespace));
  }
  return splits.build();
}
@Override
public DatastoreWriter createWriter(PipelineOptions options) throws Exception {
  DatastoreOptions.Builder builder = new DatastoreOptions.Builder()
      .host(sink.host)
      .dataset(sink.datasetId)
      .initializer(new RetryHttpRequestInitializer());

  Credential credential = options.as(GcpOptions.class).getGcpCredential();
  if (credential != null) {
    builder.credential(credential);
  }
  Datastore datastore = DatastoreFactory.get().create(builder.build());
  return new DatastoreWriter(this, datastore);
}
public static DataflowPathValidator fromOptions(PipelineOptions options) {
  return new DataflowPathValidator(options.as(DataflowPipelineOptions.class));
}
@Override
public void startBundle(final Receiver... receivers) throws Exception {
  // Expect one receiver for the main output plus one per declared side output.
  if (receivers.length != sideOutputTags.size() + 1) {
    throw new AssertionError("unexpected number of receivers for DoFn");
  }

  StepContext stepContext = null;
  if (executionContext != null) {
    stepContext = executionContext.getOrCreateStepContext(stepName, transformName, stateSampler);
  }

  @SuppressWarnings("unchecked")
  DoFnInfo<Object, Object> doFnInfo = (DoFnInfo<Object, Object>) getDoFnInfo();

  // Routes each output to the receiver registered for its tag, creating receivers
  // on demand for undeclared outputs.
  OutputManager outputManager = new OutputManager() {
    final Map<TupleTag<?>, OutputReceiver> undeclaredOutputs = new HashMap<>();

    @Nullable
    private Receiver getReceiverOrNull(TupleTag<?> tag) {
      if (tag.equals(mainOutputTag)) {
        return receivers[0];
      } else if (sideOutputTags.contains(tag)) {
        return receivers[sideOutputTags.indexOf(tag) + 1];
      } else {
        return undeclaredOutputs.get(tag);
      }
    }

    @Override
    public <T> void output(TupleTag<T> tag, WindowedValue<T> output) {
      Receiver receiver = getReceiverOrNull(tag);
      if (receiver == null) {
        // A new undeclared output.
        // TODO: plumb through the operationName, so that we can
        // name implicit outputs after it.
        String outputName = "implicit-" + tag.getId();
        // TODO: plumb through the counter prefix, so we can
        // make it available to the OutputReceiver class in case
        // it wants to use it in naming output counters. (It
        // doesn't today.)
        OutputReceiver undeclaredReceiver = new OutputReceiver();
        ElementCounter outputCounter =
            new DataflowOutputCounter(outputName, addCounterMutator);
        undeclaredReceiver.addOutputCounter(outputCounter);
        undeclaredOutputs.put(tag, undeclaredReceiver);
        receiver = undeclaredReceiver;
      }
      try {
        receiver.process(output);
      } catch (Throwable t) {
        throw Throwables.propagate(t);
      }
    }
  };

  // Streaming pipelines that consume side inputs need the side-input-aware runner;
  // everything else uses the standard DoFnRunner.
  if (options.as(StreamingOptions.class).isStreaming() && !sideInputReader.isEmpty()) {
    fnRunner = new StreamingSideInputDoFnRunner<Object, Object, BoundedWindow>(
        options,
        doFnInfo,
        sideInputReader,
        outputManager,
        mainOutputTag,
        sideOutputTags,
        stepContext,
        addCounterMutator);
  } else {
    fnRunner = DoFnRunner.create(
        options,
        doFnInfo.getDoFn(),
        sideInputReader,
        outputManager,
        mainOutputTag,
        sideOutputTags,
        stepContext,
        addCounterMutator,
        doFnInfo.getWindowingStrategy());
  }

  fnRunner.startBundle();
}