Example #1
0
 /**
  * Creates a new {@link Source} that reads the results of the given query.
  *
  * <p>This object is left unmodified; the returned source reuses this source's host, dataset ID
  * and namespace.
  *
  * <p><b>Note:</b> Normally, a Cloud Dataflow job will read from Cloud Datastore in parallel
  * across many workers. However, when the {@link Query} is configured with a limit using {@link
  * com.google.api.services.datastore.DatastoreV1.Query.Builder#setLimit(int)}, then all returned
  * results will be read by a single Dataflow worker in order to ensure correct data.
  *
  * @param query the query to read; rejected by the precondition checks if it is null or if it
  *     sets a limit that is not positive
  * @return a new {@link Source} configured with {@code query}
  */
 public Source withQuery(Query query) {
   checkNotNull(query, "query");
   // A query without a limit is always acceptable; an explicit limit must be strictly positive.
   boolean limitIsAcceptable = !query.hasLimit() || query.getLimit() > 0;
   checkArgument(limitIsAcceptable, "Invalid query limit %s: must be positive", query.getLimit());
   return new Source(host, datasetId, query, namespace);
 }
Example #2
0
    /**
     * Estimates the size, in bytes, of the data this source's query would read.
     *
     * <p>Datastore provides no way to get a good estimate of how large the result of a query will
     * be. As a rough approximation, this fetches the statistics of the whole entity kind being
     * queried, using the {@code __Stat_Kind__} system table (or {@code __Ns_Stat_Kind__} when a
     * namespace is set), which requires that exactly 1 kind is specified in the query.
     *
     * <p>See https://cloud.google.com/datastore/docs/concepts/stats
     *
     * @param options pipeline options used to obtain the Datastore client
     * @return {@code mockEstimateSizeBytes} when it is set; otherwise the {@code entity_bytes}
     *     property of the first statistics entity returned for the queried kind
     * @throws UnsupportedOperationException if the query does not specify exactly 1 kind
     * @throws NoSuchElementException if no statistics entity is returned for the kind
     */
    @Override
    public long getEstimatedSizeBytes(PipelineOptions options) throws Exception {
      if (mockEstimateSizeBytes != null) {
        return mockEstimateSizeBytes;
      }

      // Validate first so that unsupported queries fail before a Datastore client is requested.
      if (query.getKindCount() != 1) {
        throw new UnsupportedOperationException(
            "Can only estimate size for queries specifying exactly 1 kind.");
      }
      String ourKind = query.getKind(0).getName();

      Datastore datastore = getDatastore(options);
      long latestTimestamp = queryLatestStatisticsTimestamp(datastore);

      // Deliberately not named "query": that would shadow the field holding the user's query.
      Query.Builder statQuery = Query.newBuilder();
      if (namespace == null) {
        statQuery.addKindBuilder().setName("__Stat_Kind__");
      } else {
        statQuery.addKindBuilder().setName("__Ns_Stat_Kind__");
      }
      // Restrict to the statistics row for our kind taken at the latest statistics update.
      statQuery.setFilter(
          makeFilter(
              makeFilter("kind_name", EQUAL, makeValue(ourKind)).build(),
              makeFilter("timestamp", EQUAL, makeValue(latestTimestamp)).build()));
      RunQueryRequest request = makeRequest(statQuery.build());

      // System.nanoTime() is monotonic, so the logged duration is immune to wall-clock changes.
      long startNanos = System.nanoTime();
      RunQueryResponse response = datastore.runQuery(request);
      LOG.info(
          "Query for per-kind statistics took {}ms", (System.nanoTime() - startNanos) / 1000000L);

      QueryResultBatch batch = response.getBatch();
      if (batch.getEntityResultCount() == 0) {
        throw new NoSuchElementException(
            "Datastore statistics for kind " + ourKind + " unavailable");
      }
      Entity entity = batch.getEntityResult(0).getEntity();
      return getPropertyMap(entity).get("entity_bytes").getIntegerValue();
    }
Example #3
0
    /**
     * Fetches the timestamp of the latest Datastore statistics update.
     *
     * <p>Datastore system tables with statistics are periodically updated. This method queries the
     * {@code __Stat_Total__} table ordered by {@code timestamp} descending with a limit of 1, and
     * reads the {@code timestamp} property of the first entity returned.
     *
     * @param datastore the client used to run the query
     * @return the {@code timestamp} property of the newest statistics entity, in microseconds
     * @throws DatastoreException if the statistics query cannot be run
     * @throws NoSuchElementException if the query returns no entities
     */
    private long queryLatestStatisticsTimestamp(Datastore datastore) throws DatastoreException {
      Query.Builder query = Query.newBuilder();
      query.addKindBuilder().setName("__Stat_Total__");
      query.addOrder(makeOrder("timestamp", DESCENDING));
      query.setLimit(1);
      RunQueryRequest request = makeRequest(query.build());

      // System.nanoTime() is monotonic, so the logged duration is immune to wall-clock changes.
      long startNanos = System.nanoTime();
      RunQueryResponse response = datastore.runQuery(request);
      LOG.info(
          "Query for latest stats timestamp of dataset {} took {}ms",
          datasetId,
          (System.nanoTime() - startNanos) / 1000000L);

      QueryResultBatch batch = response.getBatch();
      if (batch.getEntityResultCount() == 0) {
        throw new NoSuchElementException(
            "Datastore total statistics for dataset " + datasetId + " unavailable");
      }
      Entity entity = batch.getEntityResult(0).getEntity();
      return getPropertyMap(entity).get("timestamp").getTimestampMicrosecondsValue();
    }