@Override public long getEstimatedSizeBytes(PipelineOptions options) throws Exception { // Datastore provides no way to get a good estimate of how large the result of a query // will be. As a rough approximation, we attempt to fetch the statistics of the whole // entity kind being queried, using the __Stat_Kind__ system table, assuming exactly 1 kind // is specified in the query. // // See https://cloud.google.com/datastore/docs/concepts/stats if (mockEstimateSizeBytes != null) { return mockEstimateSizeBytes; } Datastore datastore = getDatastore(options); if (query.getKindCount() != 1) { throw new UnsupportedOperationException( "Can only estimate size for queries specifying exactly 1 kind."); } String ourKind = query.getKind(0).getName(); long latestTimestamp = queryLatestStatisticsTimestamp(datastore); Query.Builder query = Query.newBuilder(); if (namespace == null) { query.addKindBuilder().setName("__Stat_Kind__"); } else { query.addKindBuilder().setName("__Ns_Stat_Kind__"); } query.setFilter( makeFilter( makeFilter("kind_name", EQUAL, makeValue(ourKind)).build(), makeFilter("timestamp", EQUAL, makeValue(latestTimestamp)).build())); RunQueryRequest request = makeRequest(query.build()); long now = System.currentTimeMillis(); RunQueryResponse response = datastore.runQuery(request); LOG.info("Query for per-kind statistics took {}ms", System.currentTimeMillis() - now); QueryResultBatch batch = response.getBatch(); if (batch.getEntityResultCount() == 0) { throw new NoSuchElementException( "Datastore statistics for kind " + ourKind + " unavailable"); } Entity entity = batch.getEntityResult(0).getEntity(); return getPropertyMap(entity).get("entity_bytes").getIntegerValue(); }
/**
 * Fetches the timestamp of the most recent statistics update.
 *
 * <p>Datastore system tables with statistics are periodically updated. This issues a query
 * against the {@code __Stat_Total__} table, ordered by {@code timestamp} descending and limited
 * to one result, and reads the {@code timestamp} property of that entity.
 *
 * @param datastore client used to run the statistics query
 * @return the {@code timestamp} property of the newest entity, as a microseconds value
 * @throws DatastoreException if running the query fails
 * @throws NoSuchElementException if the query returns no entities
 */
private long queryLatestStatisticsTimestamp(Datastore datastore) throws DatastoreException {
  Query.Builder totalStatsQuery = Query.newBuilder();
  totalStatsQuery.addKindBuilder().setName("__Stat_Total__");
  // Newest first, and only the newest is needed.
  totalStatsQuery.addOrder(makeOrder("timestamp", DESCENDING));
  totalStatsQuery.setLimit(1);
  RunQueryRequest totalStatsRequest = makeRequest(totalStatsQuery.build());

  long startMillis = System.currentTimeMillis();
  RunQueryResponse totalStatsResponse = datastore.runQuery(totalStatsRequest);
  long elapsedMillis = System.currentTimeMillis() - startMillis;
  LOG.info(
      "Query for latest stats timestamp of dataset {} took {}ms", datasetId, elapsedMillis);

  QueryResultBatch totalStatsBatch = totalStatsResponse.getBatch();
  if (totalStatsBatch.getEntityResultCount() > 0) {
    Entity newestStats = totalStatsBatch.getEntityResult(0).getEntity();
    return getPropertyMap(newestStats).get("timestamp").getTimestampMicrosecondsValue();
  }
  throw new NoSuchElementException(
      "Datastore total statistics for dataset " + datasetId + " unavailable");
}
/** * Returns an iterator over the next batch of records for the query and updates the cursor to * get the next batch as needed. Query has specified limit and offset from InputSplit. */ private Iterator<EntityResult> getIteratorAndMoveCursor() throws DatastoreException { Query.Builder query = source.query.toBuilder().clone(); query.setLimit(Math.min(userLimit, QUERY_BATCH_LIMIT)); if (currentBatch != null && currentBatch.hasEndCursor()) { query.setStartCursor(currentBatch.getEndCursor()); } RunQueryRequest request = source.makeRequest(query.build()); RunQueryResponse response = datastore.runQuery(request); currentBatch = response.getBatch(); // MORE_RESULTS_AFTER_LIMIT is not implemented yet: // https://groups.google.com/forum/#!topic/gcd-discuss/iNs6M1jA2Vw, so // use result count to determine if more results might exist. int numFetch = currentBatch.getEntityResultCount(); if (source.query.hasLimit()) { verify( userLimit >= numFetch, "Expected userLimit %s >= numFetch %s, because query limit %s should be <= userLimit", userLimit, numFetch, query.getLimit()); userLimit -= numFetch; } moreResults = // User-limit does not exist (so userLimit == MAX_VALUE) and/or has not been satisfied. (userLimit > 0) // All indications from the API are that there are/may be more results. && ((numFetch == QUERY_BATCH_LIMIT) || (currentBatch.getMoreResults() == NOT_FINISHED)); // May receive a batch of 0 results if the number of records is a multiple // of the request limit. if (numFetch == 0) { return null; } return currentBatch.getEntityResultList().iterator(); }