/**
 * Creates a copy of this {@link Source} that reads the results of the given query instead.
 *
 * <p>This object itself is left unchanged.
 *
 * <p><b>Note:</b> A Cloud Dataflow job normally reads from Cloud Datastore in parallel across
 * many workers. If the {@link Query} carries a limit (set via {@link
 * com.google.api.services.datastore.DatastoreV1.Query.Builder#setLimit(int)}), all returned
 * results are instead read by a single Dataflow worker in order to ensure correct data.
 *
 * @param query the query to read from; must be non-null and, if it has a limit, the limit must
 *     be positive
 * @return a new {@link Source} with the same host, dataset and namespace, reading {@code query}
 */
public Source withQuery(Query query) {
  checkNotNull(query, "query");

  // A limit is optional, but when present it has to be strictly positive.
  boolean hasNonPositiveLimit = query.hasLimit() && query.getLimit() <= 0;
  checkArgument(
      !hasNonPositiveLimit, "Invalid query limit %s: must be positive", query.getLimit());

  return new Source(host, datasetId, query, namespace);
}
@Override public long getEstimatedSizeBytes(PipelineOptions options) throws Exception { // Datastore provides no way to get a good estimate of how large the result of a query // will be. As a rough approximation, we attempt to fetch the statistics of the whole // entity kind being queried, using the __Stat_Kind__ system table, assuming exactly 1 kind // is specified in the query. // // See https://cloud.google.com/datastore/docs/concepts/stats if (mockEstimateSizeBytes != null) { return mockEstimateSizeBytes; } Datastore datastore = getDatastore(options); if (query.getKindCount() != 1) { throw new UnsupportedOperationException( "Can only estimate size for queries specifying exactly 1 kind."); } String ourKind = query.getKind(0).getName(); long latestTimestamp = queryLatestStatisticsTimestamp(datastore); Query.Builder query = Query.newBuilder(); if (namespace == null) { query.addKindBuilder().setName("__Stat_Kind__"); } else { query.addKindBuilder().setName("__Ns_Stat_Kind__"); } query.setFilter( makeFilter( makeFilter("kind_name", EQUAL, makeValue(ourKind)).build(), makeFilter("timestamp", EQUAL, makeValue(latestTimestamp)).build())); RunQueryRequest request = makeRequest(query.build()); long now = System.currentTimeMillis(); RunQueryResponse response = datastore.runQuery(request); LOG.info("Query for per-kind statistics took {}ms", System.currentTimeMillis() - now); QueryResultBatch batch = response.getBatch(); if (batch.getEntityResultCount() == 0) { throw new NoSuchElementException( "Datastore statistics for kind " + ourKind + " unavailable"); } Entity entity = batch.getEntityResult(0).getEntity(); return getPropertyMap(entity).get("entity_bytes").getIntegerValue(); }
/**
 * Datastore system tables with statistics are periodically updated. This method fetches the
 * latest timestamp of statistics update using the {@code __Stat_Total__} table, or the
 * namespace-specific {@code __Stat_Ns_Total__} table when this source has a namespace.
 *
 * @param datastore the client used to run the statistics query
 * @return the {@code timestamp} property of the most recent total-statistics entity, as returned
 *     by {@code getTimestampMicrosecondsValue()}
 * @throws DatastoreException if the statistics query fails
 * @throws NoSuchElementException if the dataset has no total-statistics entity
 */
private long queryLatestStatisticsTimestamp(Datastore datastore) throws DatastoreException {
  // Deliberately not named "query": that would shadow the field holding the user's query.
  Query.Builder timestampQuery = Query.newBuilder();
  // Use the namespace-specific totals kind when a namespace is set, mirroring how the per-kind
  // size estimate switches to a namespace-specific statistics kind.
  // NOTE(review): assumes makeRequest scopes the request to this namespace — confirm.
  if (namespace == null) {
    timestampQuery.addKindBuilder().setName("__Stat_Total__");
  } else {
    timestampQuery.addKindBuilder().setName("__Stat_Ns_Total__");
  }
  // Newest statistics entity first; only that one is needed.
  timestampQuery.addOrder(makeOrder("timestamp", DESCENDING));
  timestampQuery.setLimit(1);
  RunQueryRequest request = makeRequest(timestampQuery.build());

  long now = System.currentTimeMillis();
  RunQueryResponse response = datastore.runQuery(request);
  LOG.info(
      "Query for latest stats timestamp of dataset {} took {}ms",
      datasetId,
      System.currentTimeMillis() - now);

  QueryResultBatch batch = response.getBatch();
  if (batch.getEntityResultCount() == 0) {
    throw new NoSuchElementException(
        "Datastore total statistics for dataset " + datasetId + " unavailable");
  }
  Entity entity = batch.getEntityResult(0).getEntity();
  return getPropertyMap(entity).get("timestamp").getTimestampMicrosecondsValue();
}