private BigQueryReader createTyped(CloudObject spec, PipelineOptions options) throws Exception {
    Bigquery client = Transport.newBigQueryClient(options.as(BigQueryOptions.class)).build();

    String query = getString(spec, PropertyNames.BIGQUERY_QUERY, null);
    if (query != null) {
      String project = options.as(GcpOptions.class).getProject();
      Boolean flatten = getBoolean(spec, PropertyNames.BIGQUERY_FLATTEN_RESULTS, true);
      return BigQueryReader.fromQuery(query, project, client, flatten);
    } else {
      String tableId = getString(spec, PropertyNames.BIGQUERY_TABLE, null);
      checkArgument(tableId != null, "Either a table or a query has to be specified");
      String project = getString(spec, PropertyNames.BIGQUERY_PROJECT);
      String dataset = getString(spec, PropertyNames.BIGQUERY_DATASET);
      return BigQueryReader.fromTable(
          new TableReference().setProjectId(project).setDatasetId(dataset).setTableId(tableId),
          client);
    }
  }
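For context, a spec for this factory could be assembled with the same Structs helpers that back the getString/getBoolean calls above. A minimal sketch, in which the class name and query text are placeholders and addString/addBoolean are assumed to be the SDK's Structs utilities:

    // Sketch only: build a query-backed spec and hand it to createTyped.
    // "BigQuerySource" and the query text are illustrative placeholders.
    CloudObject spec = CloudObject.forClassName("BigQuerySource");
    addString(spec, PropertyNames.BIGQUERY_QUERY,
        "SELECT word FROM [publicdata:samples.shakespeare]");
    addBoolean(spec, PropertyNames.BIGQUERY_FLATTEN_RESULTS, true);
    BigQueryReader reader = createTyped(spec, PipelineOptionsFactory.create());

If the query property is absent, the factory instead requires the table, project, and dataset properties and falls back to a table-based read.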
Example #2

    private Datastore getDatastore(PipelineOptions pipelineOptions) {
      DatastoreOptions.Builder builder =
          new DatastoreOptions.Builder()
              .host(host)
              .dataset(datasetId)
              .initializer(new RetryHttpRequestInitializer());

      Credential credential = pipelineOptions.as(GcpOptions.class).getGcpCredential();
      if (credential != null) {
        builder.credential(credential);
      }
      return DatastoreFactory.get().create(builder.build());
    }
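A minimal sketch of exercising this factory; PipelineOptionsFactory.create() is the SDK's standard entry point, and getGcpCredential() may legitimately return null (for example, against a local Datastore emulator), in which case the builder above is left unauthenticated:

      // Sketch only: host and datasetId are fields of the enclosing class.
      PipelineOptions pipelineOptions = PipelineOptionsFactory.create();
      Datastore datastore = getDatastore(pipelineOptions);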
Example #3
    @Override
    public List<Source> splitIntoBundles(long desiredBundleSizeBytes, PipelineOptions options)
        throws Exception {
      // Users may request a limit on the number of results. We can currently support this by
      // simply disabling parallel reads and using only a single split.
      if (query.hasLimit()) {
        return ImmutableList.of(this);
      }

      long numSplits;
      try {
        numSplits = Math.round(((double) getEstimatedSizeBytes(options)) / desiredBundleSizeBytes);
      } catch (Exception e) {
        // Fallback in case estimated size is unavailable. TODO: fix this, it's horrible.

        // 1. Try Dataflow's numWorkers, which will be 0 for other runners.
        DataflowPipelineWorkerPoolOptions poolOptions =
            options.as(DataflowPipelineWorkerPoolOptions.class);
        if (poolOptions.getNumWorkers() > 0) {
          LOG.warn(
              "Estimated size of unavailable, using the number of workers {}",
              poolOptions.getNumWorkers(),
              e);
          numSplits = poolOptions.getNumWorkers();
        } else {
          // 2. Default to 12 in the unknown case.
          numSplits = 12;
        }
      }

      // If the desiredBundleSize or number of workers results in 1 split, simply return
      // a source that reads from the original query.
      if (numSplits <= 1) {
        return ImmutableList.of(this);
      }

      List<Query> datastoreSplits;
      try {
        datastoreSplits = getSplitQueries(Ints.checkedCast(numSplits), options);
      } catch (IllegalArgumentException | DatastoreException e) {
        LOG.warn("Unable to parallelize the given query: {}", query, e);
        return ImmutableList.of(this);
      }

      ImmutableList.Builder<Source> splits = ImmutableList.builder();
      for (Query splitQuery : datastoreSplits) {
        splits.add(new Source(host, datasetId, splitQuery, namespace));
      }
      return splits.build();
    }
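As a worked example of the rounding above (all byte counts invented for illustration), a 640 MB estimate against a 64 MB desired bundle size yields ten splits, while a 40 MB estimate rounds to one and falls through to the single-source return:

      // Sketch only: the arithmetic behind numSplits, with invented sizes.
      long desiredBundleSizeBytes = 64L << 20;  // 64 MB per bundle
      long bigEstimate = 640L << 20;            // 640 MB
      long smallEstimate = 40L << 20;           // 40 MB
      long many = Math.round((double) bigEstimate / desiredBundleSizeBytes);   // 10 splits
      long one = Math.round((double) smallEstimate / desiredBundleSizeBytes);  // round(0.625) = 1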
Example #4
    @Override
    public DatastoreWriter createWriter(PipelineOptions options) throws Exception {
      DatastoreOptions.Builder builder =
          new DatastoreOptions.Builder()
              .host(sink.host)
              .dataset(sink.datasetId)
              .initializer(new RetryHttpRequestInitializer());
      Credential credential = options.as(GcpOptions.class).getGcpCredential();
      if (credential != null) {
        builder.credential(credential);
      }
      Datastore datastore = DatastoreFactory.get().create(builder.build());

      return new DatastoreWriter(this, datastore);
    }
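Examples #2 and #4 construct the Datastore client identically; a hypothetical helper (buildDatastore is not part of the code shown here) could consolidate the duplication:

    // Sketch only: shared client construction for Examples #2 and #4.
    private static Datastore buildDatastore(
        String host, String datasetId, PipelineOptions options) {
      DatastoreOptions.Builder builder =
          new DatastoreOptions.Builder()
              .host(host)
              .dataset(datasetId)
              .initializer(new RetryHttpRequestInitializer());
      Credential credential = options.as(GcpOptions.class).getGcpCredential();
      if (credential != null) {
        builder.credential(credential);
      }
      return DatastoreFactory.get().create(builder.build());
    }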
Example #5
  public static DataflowPathValidator fromOptions(PipelineOptions options) {
    return new DataflowPathValidator(options.as(DataflowPipelineOptions.class));
  }
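Usage is a single call; a sketch, assuming the options were created through PipelineOptionsFactory:

  // Sketch only: as(...) returns a PipelineOptions view accepted by fromOptions.
  PipelineOptions options = PipelineOptionsFactory.as(DataflowPipelineOptions.class);
  DataflowPathValidator validator = DataflowPathValidator.fromOptions(options);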
Example #6
  @Override
  public void startBundle(final Receiver... receivers) throws Exception {
    if (receivers.length != sideOutputTags.size() + 1) {
      throw new AssertionError("unexpected number of receivers for DoFn");
    }

    StepContext stepContext = null;
    if (executionContext != null) {
      stepContext = executionContext.getOrCreateStepContext(stepName, transformName, stateSampler);
    }

    @SuppressWarnings("unchecked")
    DoFnInfo<Object, Object> doFnInfo = (DoFnInfo<Object, Object>) getDoFnInfo();

    OutputManager outputManager =
        new OutputManager() {
          final Map<TupleTag<?>, OutputReceiver> undeclaredOutputs = new HashMap<>();

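          // receivers[0] carries the main output; declared side outputs follow
          // in declaration order, offset by one, and anything else is treated
          // as an undeclared output below.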
          @Nullable
          private Receiver getReceiverOrNull(TupleTag<?> tag) {
            if (tag.equals(mainOutputTag)) {
              return receivers[0];
            } else if (sideOutputTags.contains(tag)) {
              return receivers[sideOutputTags.indexOf(tag) + 1];
            } else {
              return undeclaredOutputs.get(tag);
            }
          }

          @Override
          public <T> void output(TupleTag<T> tag, WindowedValue<T> output) {
            Receiver receiver = getReceiverOrNull(tag);
            if (receiver == null) {
              // A new undeclared output.
              // TODO: plumb through the operationName, so that we can
              // name implicit outputs after it.
              String outputName = "implicit-" + tag.getId();
              // TODO: plumb through the counter prefix, so we can
              // make it available to the OutputReceiver class in case
              // it wants to use it in naming output counters.  (It
              // doesn't today.)
              OutputReceiver undeclaredReceiver = new OutputReceiver();
              ElementCounter outputCounter =
                  new DataflowOutputCounter(outputName, addCounterMutator);
              undeclaredReceiver.addOutputCounter(outputCounter);
              undeclaredOutputs.put(tag, undeclaredReceiver);
              receiver = undeclaredReceiver;
            }

            try {
              receiver.process(output);
            } catch (Throwable t) {
              throw Throwables.propagate(t);
            }
          }
        };

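    // Streaming pipelines that consume side inputs need the specialized
    // runner; every other case uses the generic DoFnRunner.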
    if (options.as(StreamingOptions.class).isStreaming() && !sideInputReader.isEmpty()) {
      fnRunner =
          new StreamingSideInputDoFnRunner<Object, Object, BoundedWindow>(
              options,
              doFnInfo,
              sideInputReader,
              outputManager,
              mainOutputTag,
              sideOutputTags,
              stepContext,
              addCounterMutator);
    } else {
      fnRunner =
          DoFnRunner.create(
              options,
              doFnInfo.getDoFn(),
              sideInputReader,
              outputManager,
              mainOutputTag,
              sideOutputTags,
              stepContext,
              addCounterMutator,
              doFnInfo.getWindowingStrategy());
    }

    fnRunner.startBundle();
  }
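After startBundle() returns, the surrounding worker code presumably feeds each element to fnRunner and eventually closes the bundle via fnRunner.finishBundle(), neither of which is shown in this excerpt.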