Example #1
0
 /**
  * Writes a debug-level summary of a mutation batch: the number of mutations, the table they are
  * headed for, the number of cells they contain and the estimated total size of those cells.
  *
  * @param htable table the batch is being sent to; only its name is read
  * @param mutations the batch to summarize
  */
 private static void logMutationSize(HTableInterface htable, List<Mutation> mutations) {
   long totalBytes = 0;
   int cellCount = 0;
   for (Mutation mutation : mutations) {
     // A mutation without a family map carries no cells to count
     // (per the original note, presumably a Delete of the whole row).
     if (mutation.getFamilyCellMap() == null) {
       continue;
     }
     for (Entry<byte[], List<Cell>> familyEntry : mutation.getFamilyCellMap().entrySet()) {
       List<Cell> cells = familyEntry.getValue();
       if (cells == null) {
         continue;
       }
       for (Cell cell : cells) {
         totalBytes += CellUtil.estimatedSizeOf(cell);
         cellCount++;
       }
     }
   }
   StringBuilder message = new StringBuilder("Sending ");
   message.append(mutations.size());
   message.append(" mutations for ");
   message.append(Bytes.toString(htable.getTableName()));
   message.append(" with ");
   message.append(cellCount);
   message.append(" key values of total size ");
   message.append(totalBytes);
   message.append(" bytes");
   logger.debug(message.toString());
 }
Example #2
0
 /**
  * Adds a {@link KeyValue} to the cell list that the given {@link Mutation} holds for the
  * KeyValue's column family, creating and registering that list first when the family has no
  * entry yet.
  *
  * <p>The cell is inserted directly into the mutation's family map. The {@code builder} argument
  * is not used by this implementation.
  *
  * @param m mutation that receives the cell
  * @param builder not used
  * @param kv cell to add
  */
 @SuppressWarnings("javadoc")
 public static void addQuietly(Mutation m, KeyValueBuilder builder, KeyValue kv) {
   byte[] familyName = CellUtil.cloneFamily(kv);
   List<Cell> existing = m.getFamilyCellMap().get(familyName);
   if (existing != null) {
     existing.add(kv);
     return;
   }
   // First cell for this family: register a fresh list, then fill it.
   List<Cell> created = new ArrayList<Cell>();
   m.getFamilyCellMap().put(familyName, created);
   created.add(kv);
 }
Example #3
0
 /**
  * Builds a {@link ScanRanges} with a single slot that contains one key range per mutation, each
  * obtained from the mutation's row key.
  *
  * @param mutations mutations whose row keys are turned into key ranges
  * @return scan ranges created over {@code SchemaUtil.VAR_BINARY_SCHEMA}
  * @throws SQLException declared by the signature; propagated from the calls made here, if any
  */
 public static ScanRanges newScanRanges(List<Mutation> mutations) throws SQLException {
   List<KeyRange> rowKeyRanges = Lists.newArrayListWithExpectedSize(mutations.size());
   for (Mutation mutation : mutations) {
     byte[] rowKey = mutation.getRow();
     rowKeyRanges.add(PVarbinary.INSTANCE.getKeyRange(rowKey));
   }
   // All ranges go into one slot.
   List<List<KeyRange>> slots = Collections.singletonList(rowKeyRanges);
   return ScanRanges.create(
       SchemaUtil.VAR_BINARY_SCHEMA, slots, ScanUtil.SINGLE_COLUMN_SLOT_SPAN);
 }
 /**
  * Resolves the table name to use: the supplied physical table name when there is one, otherwise
  * the name assembled from the schema and table components of the table header row's key.
  *
  * @param tableMetaData metadata mutations from which the table header row is taken
  * @param physicalTableName explicit table name, or null to derive it from the metadata
  * @return the table name as bytes
  */
 private static byte[] getTableName(List<Mutation> tableMetaData, byte[] physicalTableName) {
   if (physicalTableName == null) {
     byte[] headerRowKey = MetaDataUtil.getTableHeaderRow(tableMetaData).getRow();
     // Split the header row key into its var-char components.
     byte[][] keyParts = new byte[3][];
     SchemaUtil.getVarChars(headerRowKey, keyParts);
     return SchemaUtil.getTableNameAsBytes(
         keyParts[PhoenixDatabaseMetaData.SCHEMA_NAME_INDEX],
         keyParts[PhoenixDatabaseMetaData.TABLE_NAME_INDEX]);
   }
   return physicalTableName;
 }
 /**
  * Applies a list of mutations to a region in one batch call, first tagging every mutation with
  * the index UUID when one is supplied.
  *
  * @param region region that receives the batch
  * @param mutations mutations to apply
  * @param indexUUID value for the {@code PhoenixIndexCodec.INDEX_UUID} attribute, or null to
  *     leave the mutations untagged
  * @throws IOException if the batch call fails
  */
 private static void commitBatch(HRegion region, List<Mutation> mutations, byte[] indexUUID)
     throws IOException {
   if (indexUUID != null) {
     for (Mutation mutation : mutations) {
       mutation.setAttribute(PhoenixIndexCodec.INDEX_UUID, indexUUID);
     }
   }
   // TODO: should we use the one that is all or none?
   Mutation[] batch = mutations.toArray(new Mutation[mutations.size()]);
   region.batchMutate(batch);
 }
Example #6
0
 /**
  * Applies a single mutation to the region as a one-element batch, with write-to-WAL switched on
  * for that mutation.
  *
  * @param mut mutation to apply
  * @param lockId row lock id paired with the mutation in the batch
  * @throws IOException if the batch call fails
  */
 private void mutateRow(Mutation mut, Integer lockId) throws IOException {
   mut.setWriteToWAL(true);
   // Generic arrays cannot be created directly, hence the raw array and the suppression.
   @SuppressWarnings("unchecked")
   Pair<Mutation, Integer>[] singleBatch = new Pair[] {new Pair<Mutation, Integer>(mut, lockId)};
   region.batchMutate(singleBatch);
 }
Example #7
0
 /**
  * Removes from a mutation every KeyValue of the given family whose qualifier equals the given
  * qualifier. Does nothing when the mutation has no entry for the family.
  *
  * @param m mutation to edit in place
  * @param family column family to look in
  * @param qualifier qualifier of the KeyValues to remove
  */
 private void removeIfPresent(Mutation m, byte[] family, byte[] qualifier) {
   List<KeyValue> familyKvs = m.getFamilyMap().get(family);
   if (familyKvs == null) {
     return;
   }
   // Remove through the iterator so the list can be modified while it is traversed.
   for (Iterator<KeyValue> it = familyKvs.iterator(); it.hasNext(); ) {
     if (Bytes.compareTo(it.next().getQualifier(), qualifier) == 0) {
       it.remove();
     }
   }
 }
  /**
   * Reads the current state of the mutation's row from the local region, restricted to the given
   * columns.
   *
   * <p>The region scanner is always closed, including when reading the row fails.
   *
   * @param m mutation whose row key identifies the row to read
   * @param columns column references the scan is built from
   * @return a {@link Result} wrapping the KeyValues returned by the scan
   * @throws IOException if opening, reading or closing the scanner fails
   */
  @Override
  public Result getCurrentRowState(Mutation m, Collection<? extends ColumnReference> columns)
      throws IOException {
    byte[] row = m.getRow();
    // need to use a scan here so we can get raw state, which Get doesn't provide.
    Scan s = IndexManagementUtil.newLocalStateScan(Collections.singletonList(columns));
    s.setStartRow(row);
    s.setStopRow(row);
    HRegion region = this.env.getRegion();
    RegionScanner scanner = region.getScanner(s);
    try {
      List<KeyValue> kvs = new ArrayList<KeyValue>(1);
      boolean more = scanner.next(kvs);
      assert !more
          : "Got more than one result when scanning" + " a single row in the primary table!";
      return new Result(kvs);
    } finally {
      // Release the scanner even if next() throws or the assertion fails.
      scanner.close();
    }
  }
Example #9
0
  /**
   * Sends all pending mutations to their HBase tables, one table entry at a time, and removes each
   * entry from this state once all of its batches have been written.
   *
   * <p>For a data table that has index maintainers, every mutation is tagged with an index UUID
   * and either carries the index metadata itself or refers to the index metadata cache sent to the
   * servers. When the cache is used, a batch that fails with {@code INDEX_METADATA_NOT_FOUND} is
   * retried once after the table's region cache has been cleared.
   *
   * <p>The trace scope opened for the commit is closed whether the commit succeeds or fails.
   *
   * @throws SQLException if a batch cannot be written or a table cannot be closed; a write failure
   *     is reported as a {@link CommitException} built from this state and a new state holding the
   *     entries committed so far
   */
  @SuppressWarnings("deprecation")
  public void commit() throws SQLException {
    int i = 0;
    byte[] tenantId = connection.getTenantId() == null ? null : connection.getTenantId().getBytes();
    long[] serverTimeStamps = validate();
    Iterator<Map.Entry<TableRef, Map<ImmutableBytesPtr, Map<PColumn, byte[]>>>> iterator =
        this.mutations.entrySet().iterator();
    List<Map.Entry<TableRef, Map<ImmutableBytesPtr, Map<PColumn, byte[]>>>> committedList =
        Lists.newArrayListWithCapacity(this.mutations.size());

    // add tracing for this operation
    TraceScope trace = Tracing.startNewSpan(connection, "Committing mutations to tables");
    try {
      Span span = trace.getSpan();
      while (iterator.hasNext()) {
        Map.Entry<TableRef, Map<ImmutableBytesPtr, Map<PColumn, byte[]>>> entry =
            iterator.next();
        Map<ImmutableBytesPtr, Map<PColumn, byte[]>> valuesMap = entry.getValue();
        TableRef tableRef = entry.getKey();
        PTable table = tableRef.getTable();
        table.getIndexMaintainers(tempPtr);
        boolean hasIndexMaintainers = tempPtr.getLength() > 0;
        // Only the first batch produced for an entry is treated as the data table's batch.
        boolean isDataTable = true;
        long serverTimestamp = serverTimeStamps[i++];
        Iterator<Pair<byte[], List<Mutation>>> mutationsIterator =
            addRowMutations(tableRef, valuesMap, serverTimestamp, false);
        while (mutationsIterator.hasNext()) {
          Pair<byte[], List<Mutation>> pair = mutationsIterator.next();
          byte[] htableName = pair.getFirst();
          List<Mutation> mutations = pair.getSecond();

          // create a span per target table
          // TODO maybe we can be smarter about the table name to string here?
          Span child =
              Tracing.child(
                  span, "Writing mutation batch for table: " + Bytes.toString(htableName));

          int retryCount = 0;
          boolean shouldRetry = false;
          do {
            ServerCache cache = null;
            if (hasIndexMaintainers && isDataTable) {
              byte[] attribValue = null;
              byte[] uuidValue;
              if (IndexMetaDataCacheClient.useIndexMetadataCache(
                  connection, mutations, tempPtr.getLength())) {
                IndexMetaDataCacheClient client =
                    new IndexMetaDataCacheClient(connection, tableRef);
                cache = client.addIndexMetadataCache(mutations, tempPtr);
                child.addTimelineAnnotation("Updated index metadata cache");
                uuidValue = cache.getId();
                // If we haven't retried yet, retry for this case only, as it's possible that
                // a split will occur after we send the index metadata cache to all known
                // region servers.
                shouldRetry = true;
              } else {
                attribValue = ByteUtil.copyKeyBytesIfNecessary(tempPtr);
                uuidValue = ServerCacheClient.generateId();
              }
              // Either set the UUID to be able to access the index metadata from the cache
              // or set the index metadata directly on the Mutation
              for (Mutation mutation : mutations) {
                if (tenantId != null) {
                  mutation.setAttribute(PhoenixRuntime.TENANT_ID_ATTRIB, tenantId);
                }
                mutation.setAttribute(PhoenixIndexCodec.INDEX_UUID, uuidValue);
                if (attribValue != null) {
                  mutation.setAttribute(PhoenixIndexCodec.INDEX_MD, attribValue);
                }
              }
            }

            SQLException sqlE = null;
            HTableInterface hTable = connection.getQueryServices().getTable(htableName);
            try {
              if (logger.isDebugEnabled()) {
                logMutationSize(hTable, mutations);
              }
              long startTime = System.currentTimeMillis();
              child.addTimelineAnnotation("Attempt " + retryCount);
              hTable.batch(mutations);
              child.stop();
              shouldRetry = false;
              if (logger.isDebugEnabled()) {
                logger.debug(
                    "Total time for batch call of  "
                        + mutations.size()
                        + " mutations into "
                        + table.getName().getString()
                        + ": "
                        + (System.currentTimeMillis() - startTime)
                        + " ms");
              }
              committedList.add(entry);
            } catch (Exception e) {
              SQLException inferredE = ServerUtil.parseServerExceptionOrNull(e);
              if (inferredE != null) {
                if (shouldRetry
                    && retryCount == 0
                    && inferredE.getErrorCode()
                        == SQLExceptionCode.INDEX_METADATA_NOT_FOUND.getErrorCode()) {
                  // Swallow this exception once, as it's possible that we split after sending
                  // the index metadata and one of the region servers doesn't have it. This will
                  // cause it to have it the next go around.
                  // If it fails again, we don't retry.
                  String msg =
                      "Swallowing exception and retrying after clearing meta cache on connection. "
                          + inferredE;
                  logger.warn(msg);
                  connection.getQueryServices().clearTableRegionCache(htableName);

                  // add a new child span as this one failed
                  child.addTimelineAnnotation(msg);
                  child.stop();
                  child = Tracing.child(span, "Failed batch, attempting retry");

                  // Jumps to the loop condition; the finally block below still runs first.
                  continue;
                }
                e = inferredE;
              }
              // Throw to client with both what was committed so far and what is left to be
              // committed. That way, client can either undo what was done or try again with
              // what was not done.
              sqlE =
                  new CommitException(
                      e,
                      this,
                      new MutationState(
                          committedList, this.sizeOffset, this.maxSize, this.connection));
            } finally {
              try {
                hTable.close();
              } catch (IOException e) {
                // Chain the close failure onto an earlier failure rather than losing either.
                if (sqlE != null) {
                  sqlE.setNextException(ServerUtil.parseServerException(e));
                } else {
                  sqlE = ServerUtil.parseServerException(e);
                }
              } finally {
                try {
                  if (cache != null) {
                    cache.close();
                  }
                } finally {
                  if (sqlE != null) {
                    throw sqlE;
                  }
                }
              }
            }
          } while (shouldRetry && retryCount++ < 1);
          isDataTable = false;
        }
        numRows -= entry.getValue().size();
        iterator.remove(); // Remove batches as we process them
      }
    } finally {
      // Close the trace scope on failure as well, so a failed commit does not leave it open.
      trace.close();
    }
    assert (numRows == 0);
    assert (this.mutations.isEmpty());
  }
 /**
  * Test helper that applies a {@link Mutation} to the values stored for the current row by
  * passing each column family's list of edits to {@code addUpdate}.
  *
  * @param pendingUpdate update to apply
  */
 public void addUpdateForTesting(Mutation pendingUpdate) {
   for (List<KeyValue> familyEdits : pendingUpdate.getFamilyMap().values()) {
     addUpdate(familyEdits);
   }
 }
Example #11
0
  /**
   * Prepare the regions and region files, taking the split up to and including the point of no
   * return (PONR).
   *
   * @param server Hosting server instance. Can be null when testing (won't try and update in zk if
   *     a null server)
   * @param services Used to online/offline regions.
   * @param user user handed to the coprocessor pre-split hooks
   * @throws IOException If thrown, transaction failed. Call {@link #rollback(Server,
   *     RegionServerServices)}
   * @return Regions created
   */
  @VisibleForTesting
  PairOfSameType<Region> createDaughters(
      final Server server, final RegionServerServices services, User user) throws IOException {
    LOG.info("Starting split of region " + this.parent);
    if ((server != null && server.isStopped()) || (services != null && services.isStopping())) {
      throw new IOException("Server is stopped or stopping");
    }
    assert !this.parent.lock.writeLock().isHeldByCurrentThread()
        : "Unsafe to hold write lock while performing RPCs";

    transition(SplitTransactionPhase.BEFORE_PRE_SPLIT_HOOK);

    // Coprocessor callback
    if (this.parent.getCoprocessorHost() != null) {
      // TODO: Remove one of these
      parent.getCoprocessorHost().preSplit(user);
      parent.getCoprocessorHost().preSplit(splitrow, user);
    }

    transition(SplitTransactionPhase.AFTER_PRE_SPLIT_HOOK);

    // If true, no cluster to write meta edits to or to update znodes in.
    boolean testing =
        server == null || server.getConfiguration().getBoolean("hbase.testing.nocluster", false);
    // Outside of testing, the configured file split timeout replaces the current value.
    if (!testing) {
      this.fileSplitTimeout =
          server
              .getConfiguration()
              .getLong("hbase.regionserver.fileSplitTimeout", this.fileSplitTimeout);
    }

    PairOfSameType<Region> daughterRegions = stepsBeforePONR(server, services, testing);

    // Mutations collected from the coprocessor hook. In this method they are only validated.
    final List<Mutation> metaEntries = new ArrayList<Mutation>();
    if (this.parent.getCoprocessorHost() != null) {
      if (parent.getCoprocessorHost().preSplitBeforePONR(splitrow, metaEntries, user)) {
        throw new IOException(
            "Coprocessor bypassing region "
                + parent.getRegionInfo().getRegionNameAsString()
                + " split.");
      }
      // Every mutation supplied by the coprocessor must have a row key that parses as a region
      // name.
      try {
        for (Mutation p : metaEntries) {
          HRegionInfo.parseRegionName(p.getRow());
        }
      } catch (IOException e) {
        LOG.error(
            "Row key of mutation from coprocessor is not parsable as region name."
                + "Mutations from coprocessor should only be for hbase:meta table.",
            e);
        throw e;
      }
    }

    // This is the point of no return.  Adding subsequent edits to .META. as we
    // do below when we do the daughter opens adding each to .META. can fail in
    // various interesting ways the most interesting of which is a timeout
    // BUT the edits all go through (See HBASE-3872).  IF we reach the PONR
    // then subsequent failures need to crash out this regionserver; the
    // server shutdown processing should be able to fix-up the incomplete split.
    // The offlined parent will have the daughters as extra columns.  If
    // we leave the daughter regions in place and do not remove them when we
    // crash out, then they will have their references to the parent in place
    // still and the server shutdown fixup of .META. will point to these
    // regions.
    // We should add the PONR JournalEntry before offlineParentInMeta, so even if
    // offlineParentInMeta times out, this will cause the regionserver to exit, and then
    // the master ServerShutdownHandler will fix the daughters & avoid data loss. (See
    // HBase-4562).

    transition(SplitTransactionPhase.PONR);

    // Edit parent in meta.  Offlines parent region and adds splita and splitb
    // as an atomic update. See HBASE-7721. This update to META is what
    // determines whether the region is split or not in case of failures.
    // If it is successful, master will roll-forward, if not, master will rollback
    // and assign the parent region.
    if (services != null
        && !services.reportRegionStateTransition(
            TransitionCode.SPLIT_PONR, parent.getRegionInfo(), hri_a, hri_b)) {
      // Passed PONR, let SSH clean it up
      throw new IOException(
          "Failed to notify master that split passed PONR: "
              + parent.getRegionInfo().getRegionNameAsString());
    }
    return daughterRegions;
  }
  /**
   * Prepares the merged region and its region files, carrying the merge up to and including the
   * point of no return (PONR).
   *
   * @param server Hosting server instance. Can be null when testing
   * @param services Used to online/offline regions.
   * @return merged region
   * @throws IOException If thrown, transaction failed. Call {@link #rollback(Server,
   *     RegionServerServices)}
   */
  HRegion createMergedRegion(final Server server, final RegionServerServices services)
      throws IOException {
    LOG.info(
        "Starting merge of " + region_a + " and " + region_b.getRegionNameAsString()
            + ", forcible=" + forcible);
    if (server != null && server.isStopped()) {
      throw new IOException("Server is stopped or stopping");
    }
    if (services != null && services.isStopping()) {
      throw new IOException("Server is stopped or stopping");
    }

    // A coprocessor may veto the merge before any work is done.
    if (rsCoprocessorHost != null && rsCoprocessorHost.preMerge(this.region_a, this.region_b)) {
      throw new IOException(
          "Coprocessor bypassing regions " + this.region_a + " " + this.region_b + " merge.");
    }

    // If true, no cluster to write meta edits to or to use coordination.
    boolean testing =
        server == null || server.getConfiguration().getBoolean("hbase.testing.nocluster", false);

    HRegion mergedRegion = stepsBeforePONR(server, services, testing);

    @MetaMutationAnnotation List<Mutation> metaEntries = new ArrayList<Mutation>();
    if (rsCoprocessorHost != null) {
      boolean bypass =
          rsCoprocessorHost.preMergeCommit(this.region_a, this.region_b, metaEntries);
      if (bypass) {
        throw new IOException(
            "Coprocessor bypassing regions " + this.region_a + " " + this.region_b + " merge.");
      }
      // Each row key supplied by the coprocessor has to parse as a region name.
      try {
        for (Mutation metaEntry : metaEntries) {
          HRegionInfo.parseRegionName(metaEntry.getRow());
        }
      } catch (IOException parseFailure) {
        LOG.error(
            "Row key of mutation from coprocessor is not parsable as region name."
                + "Mutations from coprocessor should only be for hbase:meta table.",
            parseFailure);
        throw parseFailure;
      }
    }

    // This is the point of no return. Similar with SplitTransaction.
    // IF we reach the PONR then subsequent failures need to crash out this
    // regionserver
    this.journal.add(JournalEntry.PONR);

    // Add merged region and delete region_a and region_b
    // as an atomic update. See HBASE-7721. This update to hbase:meta is what
    // determines whether the region is merged or not in case of failures.
    // If it is successful, master will roll-forward, if not, master will
    // rollback
    if (!testing && useCoordinationForAssignment) {
      if (!metaEntries.isEmpty()) {
        // The coprocessor supplied extra meta mutations: write them with the merge.
        mergeRegionsAndPutMetaEntries(
            server.getConnection(),
            mergedRegion.getRegionInfo(),
            region_a.getRegionInfo(),
            region_b.getRegionInfo(),
            server.getServerName(),
            metaEntries,
            region_a.getTableDesc().getRegionReplication());
      } else {
        MetaTableAccessor.mergeRegions(
            server.getConnection(),
            mergedRegion.getRegionInfo(),
            region_a.getRegionInfo(),
            region_b.getRegionInfo(),
            server.getServerName(),
            region_a.getTableDesc().getRegionReplication());
      }
    } else if (services != null
        && !useCoordinationForAssignment
        && !services.reportRegionStateTransition(
            TransitionCode.MERGE_PONR,
            mergedRegionInfo,
            region_a.getRegionInfo(),
            region_b.getRegionInfo())) {
      // Passed PONR, let SSH clean it up
      throw new IOException(
          "Failed to notify master that merge passed PONR: "
              + region_a.getRegionInfo().getRegionNameAsString()
              + " and "
              + region_b.getRegionInfo().getRegionNameAsString());
    }
    return mergedRegion;
  }
  /**
   * Override the preAppend for checkAndPut and checkAndDelete, as we need the ability to a) set the
   * TimeRange for the Get being done and b) return something back to the client to indicate
   * success/failure
   *
   * <p>The operation to perform is read from the first byte of the {@code OPERATION_ATTRIB}
   * attribute of the Append. When that attribute is absent this hook returns null without doing
   * anything else. Otherwise the hook bypasses the regular append, takes a lock on the row, checks
   * whether the targeted cell exists within the time range, and applies a Put or Delete built for
   * the operation.
   *
   * @param e observer context; used to bypass the default append and to reach the region
   * @param append the Append carrying the operation attribute, optional timestamp attribute and the
   *     cells to write
   * @return null when no operation attribute is set; otherwise a Result holding a single KeyValue
   *     that reports an error code, a value/timestamp mismatch, or success
   * @throws IOException raised through {@code ServerUtil.throwIOException} for any Throwable thrown
   *     while the operation is being processed
   */
  @SuppressWarnings("deprecation")
  @Override
  public Result preAppend(
      final ObserverContext<RegionCoprocessorEnvironment> e, final Append append)
      throws IOException {
    byte[] opBuf = append.getAttribute(OPERATION_ATTRIB);
    if (opBuf == null) {
      // No operation attribute on this Append: nothing for this hook to do.
      return null;
    }
    // The first byte of the attribute is used as an index into the Op enum values.
    Op op = Op.values()[opBuf[0]];

    // Default to LATEST_TIMESTAMP unless the client supplied a timestamp attribute.
    long clientTimestamp = HConstants.LATEST_TIMESTAMP;
    byte[] clientTimestampBuf = append.getAttribute(MAX_TIMERANGE_ATTRIB);
    if (clientTimestampBuf != null) {
      clientTimestamp = Bytes.toLong(clientTimestampBuf);
    }
    boolean hadClientTimestamp = (clientTimestamp != HConstants.LATEST_TIMESTAMP);
    if (hadClientTimestamp) {
      // Prevent race condition of creating two sequences at the same timestamp
      // by looking for a sequence at or after the timestamp at which it'll be
      // created.
      if (op == Op.CREATE_SEQUENCE) {
        clientTimestamp++;
      }
    } else {
      // No client timestamp: use the current time, and keep its byte form for stamping the
      // KeyValues further down.
      clientTimestamp = EnvironmentEdgeManager.currentTimeMillis();
      clientTimestampBuf = Bytes.toBytes(clientTimestamp);
    }

    RegionCoprocessorEnvironment env = e.getEnvironment();
    // We need to set this to prevent region.append from being called
    e.bypass();
    e.complete();
    HRegion region = env.getRegion();
    byte[] row = append.getRow();
    region.startRegionOperation();
    try {
      // Lock the row for the existence check and the mutation; released in the finally below.
      Integer lid = region.getLock(null, row, true);
      try {
        // Only the first KeyValue of the first family in the Append is used to pick the column
        // that is checked for existence.
        KeyValue keyValue = append.getFamilyMap().values().iterator().next().iterator().next();
        byte[] family = keyValue.getFamily();
        byte[] qualifier = keyValue.getQualifier();

        Get get = new Get(row);
        get.setTimeRange(MetaDataProtocol.MIN_TABLE_TIMESTAMP, clientTimestamp);
        get.addColumn(family, qualifier);
        Result result = region.get(get);
        if (result.isEmpty()) {
          // Nothing found in the time range: dropping or resetting is an error.
          if (op == Op.DROP_SEQUENCE || op == Op.RESET_SEQUENCE) {
            return getErrorResult(
                row, clientTimestamp, SQLExceptionCode.SEQUENCE_UNDEFINED.getErrorCode());
          }
        } else {
          // Something already exists in the time range: creating is an error.
          if (op == Op.CREATE_SEQUENCE) {
            return getErrorResult(
                row, clientTimestamp, SQLExceptionCode.SEQUENCE_ALREADY_EXIST.getErrorCode());
          }
        }
        Mutation m = null;
        switch (op) {
          case RESET_SEQUENCE:
            KeyValue currentValueKV = result.raw()[0];
            // Value the client expects, taken from the CURRENT_VALUE_ATTRIB attribute.
            long expectedValue =
                PDataType.LONG
                    .getCodec()
                    .decodeLong(append.getAttribute(CURRENT_VALUE_ATTRIB), 0, null);
            // Value currently stored in the first KeyValue returned by the Get.
            long value =
                PDataType.LONG
                    .getCodec()
                    .decodeLong(currentValueKV.getBuffer(), currentValueKV.getValueOffset(), null);
            // Timestamp should match exactly, or we may have the wrong sequence
            if (expectedValue != value || currentValueKV.getTimestamp() != clientTimestamp) {
              // Mismatch: return a KeyValue with an empty value and the stored timestamp
              // instead of applying the reset.
              return new Result(
                  Collections.singletonList(
                      KeyValueUtil.newKeyValue(
                          row,
                          PhoenixDatabaseMetaData.SEQUENCE_FAMILY_BYTES,
                          QueryConstants.EMPTY_COLUMN_BYTES,
                          currentValueKV.getTimestamp(),
                          ByteUtil.EMPTY_BYTE_ARRAY)));
            }
            m = new Put(row, currentValueKV.getTimestamp());
            m.getFamilyMap().putAll(append.getFamilyMap());
            break;
          case DROP_SEQUENCE:
            m = new Delete(row, clientTimestamp, null);
            break;
          case CREATE_SEQUENCE:
            m = new Put(row, clientTimestamp);
            m.getFamilyMap().putAll(append.getFamilyMap());
            break;
        }
        // NOTE(review): m stays null if Op has a value other than the three handled above,
        // which would fail below — confirm the enum has no other values.
        if (!hadClientTimestamp) {
          // The client gave no timestamp, so stamp the KeyValues with the time chosen above.
          for (List<KeyValue> kvs : m.getFamilyMap().values()) {
            for (KeyValue kv : kvs) {
              kv.updateLatestStamp(clientTimestampBuf);
            }
          }
        }
        // Apply the mutation as a one-element batch paired with the row lock taken above.
        @SuppressWarnings("unchecked")
        Pair<Mutation, Integer>[] mutations = new Pair[1];
        mutations[0] = new Pair<Mutation, Integer>(m, lid);
        region.batchMutate(mutations);
        long serverTimestamp = MetaDataUtil.getClientTimeStamp(m);
        // Return result with single KeyValue. The only piece of information
        // the client cares about is the timestamp, which is the timestamp of
        // when the mutation was actually performed.
        return new Result(
            Collections.singletonList(
                KeyValueUtil.newKeyValue(
                    row,
                    PhoenixDatabaseMetaData.SEQUENCE_FAMILY_BYTES,
                    QueryConstants.EMPTY_COLUMN_BYTES,
                    serverTimestamp,
                    SUCCESS_VALUE)));
      } finally {
        region.releaseRowLock(lid);
      }
    } catch (Throwable t) {
      // NOTE(review): the message says "Increment" although this is the append hook — confirm
      // whether that wording is intended.
      ServerUtil.throwIOException("Increment of sequence " + Bytes.toStringBinary(row), t);
      return null; // Impossible
    } finally {
      region.closeRegionOperation();
    }
  }