Example #1
0
    @Override
    public long getEstimatedSizeBytes(PipelineOptions options) throws Exception {
      // Datastore provides no way to get a good estimate of how large the result of a query
      // will be. As a rough approximation, we attempt to fetch the statistics of the whole
      // entity kind being queried, using the __Stat_Kind__ system table, assuming exactly 1 kind
      // is specified in the query.
      //
      // See https://cloud.google.com/datastore/docs/concepts/stats
      if (mockEstimateSizeBytes != null) {
        return mockEstimateSizeBytes;
      }

      Datastore datastore = getDatastore(options);
      if (query.getKindCount() != 1) {
        throw new UnsupportedOperationException(
            "Can only estimate size for queries specifying exactly 1 kind.");
      }
      String ourKind = query.getKind(0).getName();
      long latestTimestamp = queryLatestStatisticsTimestamp(datastore);
      Query.Builder query = Query.newBuilder();
      if (namespace == null) {
        query.addKindBuilder().setName("__Stat_Kind__");
      } else {
        query.addKindBuilder().setName("__Ns_Stat_Kind__");
      }
      query.setFilter(
          makeFilter(
              makeFilter("kind_name", EQUAL, makeValue(ourKind)).build(),
              makeFilter("timestamp", EQUAL, makeValue(latestTimestamp)).build()));
      RunQueryRequest request = makeRequest(query.build());

      long now = System.currentTimeMillis();
      RunQueryResponse response = datastore.runQuery(request);
      LOG.info("Query for per-kind statistics took {}ms", System.currentTimeMillis() - now);

      QueryResultBatch batch = response.getBatch();
      if (batch.getEntityResultCount() == 0) {
        throw new NoSuchElementException(
            "Datastore statistics for kind " + ourKind + " unavailable");
      }
      Entity entity = batch.getEntityResult(0).getEntity();
      return getPropertyMap(entity).get("entity_bytes").getIntegerValue();
    }
Example #2
0
    /**
     * Datastore system tables with statistics are periodically updated. This method fetches the
     * latest timestamp of statistics update using the {@code __Stat_Total__} table.
     */
    private long queryLatestStatisticsTimestamp(Datastore datastore) throws DatastoreException {
      Query.Builder query = Query.newBuilder();
      query.addKindBuilder().setName("__Stat_Total__");
      query.addOrder(makeOrder("timestamp", DESCENDING));
      query.setLimit(1);
      RunQueryRequest request = makeRequest(query.build());

      long now = System.currentTimeMillis();
      RunQueryResponse response = datastore.runQuery(request);
      LOG.info(
          "Query for latest stats timestamp of dataset {} took {}ms",
          datasetId,
          System.currentTimeMillis() - now);
      QueryResultBatch batch = response.getBatch();
      if (batch.getEntityResultCount() == 0) {
        throw new NoSuchElementException(
            "Datastore total statistics for dataset " + datasetId + " unavailable");
      }
      Entity entity = batch.getEntityResult(0).getEntity();
      return getPropertyMap(entity).get("timestamp").getTimestampMicrosecondsValue();
    }
Example #3
0
    /**
     * Returns an iterator over the next batch of records for the query and updates the cursor to
     * get the next batch as needed. Query has specified limit and offset from InputSplit.
     */
    private Iterator<EntityResult> getIteratorAndMoveCursor() throws DatastoreException {
      Query.Builder query = source.query.toBuilder().clone();
      query.setLimit(Math.min(userLimit, QUERY_BATCH_LIMIT));
      if (currentBatch != null && currentBatch.hasEndCursor()) {
        query.setStartCursor(currentBatch.getEndCursor());
      }

      RunQueryRequest request = source.makeRequest(query.build());
      RunQueryResponse response = datastore.runQuery(request);

      currentBatch = response.getBatch();

      // MORE_RESULTS_AFTER_LIMIT is not implemented yet:
      // https://groups.google.com/forum/#!topic/gcd-discuss/iNs6M1jA2Vw, so
      // use result count to determine if more results might exist.
      int numFetch = currentBatch.getEntityResultCount();
      if (source.query.hasLimit()) {
        verify(
            userLimit >= numFetch,
            "Expected userLimit %s >= numFetch %s, because query limit %s should be <= userLimit",
            userLimit,
            numFetch,
            query.getLimit());
        userLimit -= numFetch;
      }
      moreResults =
          // User-limit does not exist (so userLimit == MAX_VALUE) and/or has not been satisfied.
          (userLimit > 0)
              // All indications from the API are that there are/may be more results.
              && ((numFetch == QUERY_BATCH_LIMIT)
                  || (currentBatch.getMoreResults() == NOT_FINISHED));

      // May receive a batch of 0 results if the number of records is a multiple
      // of the request limit.
      if (numFetch == 0) {
        return null;
      }

      return currentBatch.getEntityResultList().iterator();
    }