private static void logMutationSize(HTableInterface htable, List<Mutation> mutations) {
  long byteSize = 0;
  int keyValueCount = 0;
  for (Mutation mutation : mutations) {
    if (mutation.getFamilyCellMap() != null) { // Not a Delete of the row
      for (Entry<byte[], List<Cell>> entry : mutation.getFamilyCellMap().entrySet()) {
        if (entry.getValue() != null) {
          for (Cell kv : entry.getValue()) {
            byteSize += CellUtil.estimatedSizeOf(kv);
            keyValueCount++;
          }
        }
      }
    }
  }
  logger.debug("Sending " + mutations.size() + " mutations for "
      + Bytes.toString(htable.getTableName()) + " with " + keyValueCount
      + " key values of total size " + byteSize + " bytes");
}
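// Hedged sketch (illustration only, not part of the original source): the same per-cell
// accounting applied to a single Put, e.g. to check one mutation against a client-side batch
// size limit before queueing it. The 'put' variable here is a hypothetical placeholder.
long putSize = 0;
for (List<Cell> cells : put.getFamilyCellMap().values()) {
  for (Cell cell : cells) {
    putSize += CellUtil.estimatedSizeOf(cell);
  }
}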
/**
 * Helper method for a {@link KeyValueBuilder} that catches an IOException from a {@link Put} when
 * adding a {@link KeyValue} generated by the KeyValueBuilder.
 *
 * @throws RuntimeException if there is an IOException thrown from the underlying {@link Put}
 */
@SuppressWarnings("javadoc")
public static void addQuietly(Mutation m, KeyValueBuilder builder, KeyValue kv) {
  byte[] family = CellUtil.cloneFamily(kv);
  List<Cell> list = m.getFamilyCellMap().get(family);
  if (list == null) {
    list = new ArrayList<Cell>();
    m.getFamilyCellMap().put(family, list);
  }
  list.add(kv);
}
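// Hedged usage sketch (not part of the original source): attach a manually built KeyValue to a
// Put through addQuietly. The row/family/qualifier/value literals are hypothetical placeholders,
// and GenericKeyValueBuilder.INSTANCE is assumed to be the builder this codebase already provides.
KeyValueBuilder builder = GenericKeyValueBuilder.INSTANCE;
Put put = new Put(Bytes.toBytes("row1"));
KeyValue kv = new KeyValue(Bytes.toBytes("row1"), Bytes.toBytes("0"), Bytes.toBytes("COL"),
    System.currentTimeMillis(), Bytes.toBytes("value"));
addQuietly(put, builder, kv); // appends the cell to the Put's family cell map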
public static ScanRanges newScanRanges(List<Mutation> mutations) throws SQLException {
  List<KeyRange> keys = Lists.newArrayListWithExpectedSize(mutations.size());
  for (Mutation m : mutations) {
    keys.add(PVarbinary.INSTANCE.getKeyRange(m.getRow()));
  }
  ScanRanges keyRanges = ScanRanges.create(SchemaUtil.VAR_BINARY_SCHEMA,
      Collections.singletonList(keys), ScanUtil.SINGLE_COLUMN_SLOT_SPAN);
  return keyRanges;
}
private static byte[] getTableName(List<Mutation> tableMetaData, byte[] physicalTableName) {
  if (physicalTableName != null) {
    return physicalTableName;
  }
  byte[][] rowKeyMetadata = new byte[3][];
  Mutation m = MetaDataUtil.getTableHeaderRow(tableMetaData);
  byte[] key = m.getRow();
  SchemaUtil.getVarChars(key, rowKeyMetadata);
  byte[] schemaBytes = rowKeyMetadata[PhoenixDatabaseMetaData.SCHEMA_NAME_INDEX];
  byte[] tableBytes = rowKeyMetadata[PhoenixDatabaseMetaData.TABLE_NAME_INDEX];
  return SchemaUtil.getTableNameAsBytes(schemaBytes, tableBytes);
}
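// Illustrative sketch (an assumption about the key layout, not from the original source): the
// header row key split above is expected to be a sequence of null-separated VARCHARs, roughly
// tenantId \x00 schemaName \x00 tableName, which is why three slots are reserved in rowKeyMetadata.
// The *Bytes variables below are hypothetical placeholders.
byte[] key = ByteUtil.concat(tenantIdBytes, QueryConstants.SEPARATOR_BYTE_ARRAY,
    schemaNameBytes, QueryConstants.SEPARATOR_BYTE_ARRAY, tableNameBytes);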
private static void commitBatch(HRegion region, List<Mutation> mutations, byte[] indexUUID)
    throws IOException {
  if (indexUUID != null) {
    for (Mutation m : mutations) {
      m.setAttribute(PhoenixIndexCodec.INDEX_UUID, indexUUID);
    }
  }
  @SuppressWarnings("unchecked")
  Mutation[] mutationArray = new Mutation[mutations.size()];
  // TODO: should we use the one that is all or none?
  region.batchMutate(mutations.toArray(mutationArray));
}
private void mutateRow(Mutation mut, Integer lockId) throws IOException {
  @SuppressWarnings("unchecked")
  Pair<Mutation, Integer>[] pair = new Pair[1];
  mut.setWriteToWAL(true);
  pair[0] = new Pair<Mutation, Integer>(mut, lockId);
  region.batchMutate(pair);
}
private void removeIfPresent(Mutation m, byte[] family, byte[] qualifier) {
  Map<byte[], List<KeyValue>> familyMap = m.getFamilyMap();
  List<KeyValue> kvs = familyMap.get(family);
  if (kvs != null) {
    Iterator<KeyValue> iterator = kvs.iterator();
    while (iterator.hasNext()) {
      KeyValue kv = iterator.next();
      if (Bytes.compareTo(kv.getQualifier(), qualifier) == 0) {
        iterator.remove();
      }
    }
  }
}
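// Hedged usage sketch (assumed caller, not from the original source): strip a single qualifier
// from a metadata Put before it is applied, leaving the other columns untouched. The family and
// qualifier byte arrays here are hypothetical placeholders.
Put put = new Put(Bytes.toBytes("row1"));
put.add(Bytes.toBytes("0"), Bytes.toBytes("COLUMN_COUNT"), Bytes.toBytes(10));
put.add(Bytes.toBytes("0"), Bytes.toBytes("DATA_TYPE"), Bytes.toBytes(4));
removeIfPresent(put, Bytes.toBytes("0"), Bytes.toBytes("DATA_TYPE"));
// put now carries only the COLUMN_COUNT cell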
@Override
public Result getCurrentRowState(Mutation m, Collection<? extends ColumnReference> columns)
    throws IOException {
  byte[] row = m.getRow();
  // Need to use a scan here so we can get raw state, which a Get doesn't provide.
  Scan s = IndexManagementUtil.newLocalStateScan(Collections.singletonList(columns));
  s.setStartRow(row);
  s.setStopRow(row);
  HRegion region = this.env.getRegion();
  RegionScanner scanner = region.getScanner(s);
  List<KeyValue> kvs = new ArrayList<KeyValue>(1);
  try {
    boolean more = scanner.next(kvs);
    assert !more : "Got more than one result when scanning a single row in the primary table!";
  } finally {
    // Close the scanner even if the scan fails, so the region resources are released.
    scanner.close();
  }
  return new Result(kvs);
}
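// Hedged sketch (an assumption about what IndexManagementUtil.newLocalStateScan configures, not a
// definitive description): a raw, all-versions scan restricted to the referenced column families,
// so delete markers and older cells of the row remain visible to the index maintainer.
Scan s = new Scan();
s.setRaw(true);      // include delete markers and raw cells
s.setMaxVersions();  // all versions, not just the latest
for (ColumnReference ref : columns) {
  s.addFamily(ref.getFamily());
}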
@SuppressWarnings("deprecation")
public void commit() throws SQLException {
  int i = 0;
  byte[] tenantId = connection.getTenantId() == null ? null : connection.getTenantId().getBytes();
  long[] serverTimeStamps = validate();
  Iterator<Map.Entry<TableRef, Map<ImmutableBytesPtr, Map<PColumn, byte[]>>>> iterator =
      this.mutations.entrySet().iterator();
  List<Map.Entry<TableRef, Map<ImmutableBytesPtr, Map<PColumn, byte[]>>>> committedList =
      Lists.newArrayListWithCapacity(this.mutations.size());

  // Add tracing for this operation
  TraceScope trace = Tracing.startNewSpan(connection, "Committing mutations to tables");
  Span span = trace.getSpan();
  while (iterator.hasNext()) {
    Map.Entry<TableRef, Map<ImmutableBytesPtr, Map<PColumn, byte[]>>> entry = iterator.next();
    Map<ImmutableBytesPtr, Map<PColumn, byte[]>> valuesMap = entry.getValue();
    TableRef tableRef = entry.getKey();
    PTable table = tableRef.getTable();
    table.getIndexMaintainers(tempPtr);
    boolean hasIndexMaintainers = tempPtr.getLength() > 0;
    boolean isDataTable = true;
    long serverTimestamp = serverTimeStamps[i++];
    Iterator<Pair<byte[], List<Mutation>>> mutationsIterator =
        addRowMutations(tableRef, valuesMap, serverTimestamp, false);
    while (mutationsIterator.hasNext()) {
      Pair<byte[], List<Mutation>> pair = mutationsIterator.next();
      byte[] htableName = pair.getFirst();
      List<Mutation> mutations = pair.getSecond();

      // Create a span per target table
      // TODO: maybe we can be smarter about the table name to string here?
      Span child = Tracing.child(span,
          "Writing mutation batch for table: " + Bytes.toString(htableName));

      int retryCount = 0;
      boolean shouldRetry = false;
      do {
        ServerCache cache = null;
        if (hasIndexMaintainers && isDataTable) {
          byte[] attribValue = null;
          byte[] uuidValue;
          if (IndexMetaDataCacheClient.useIndexMetadataCache(connection, mutations,
              tempPtr.getLength())) {
            IndexMetaDataCacheClient client = new IndexMetaDataCacheClient(connection, tableRef);
            cache = client.addIndexMetadataCache(mutations, tempPtr);
            child.addTimelineAnnotation("Updated index metadata cache");
            uuidValue = cache.getId();
            // If we haven't retried yet, retry for this case only, as it's possible that
            // a split will occur after we send the index metadata cache to all known
            // region servers.
            shouldRetry = true;
          } else {
            attribValue = ByteUtil.copyKeyBytesIfNecessary(tempPtr);
            uuidValue = ServerCacheClient.generateId();
          }
          // Either set the UUID to be able to access the index metadata from the cache
          // or set the index metadata directly on the Mutation
          for (Mutation mutation : mutations) {
            if (tenantId != null) {
              mutation.setAttribute(PhoenixRuntime.TENANT_ID_ATTRIB, tenantId);
            }
            mutation.setAttribute(PhoenixIndexCodec.INDEX_UUID, uuidValue);
            if (attribValue != null) {
              mutation.setAttribute(PhoenixIndexCodec.INDEX_MD, attribValue);
            }
          }
        }

        SQLException sqlE = null;
        HTableInterface hTable = connection.getQueryServices().getTable(htableName);
        try {
          if (logger.isDebugEnabled()) logMutationSize(hTable, mutations);
          long startTime = System.currentTimeMillis();
          child.addTimelineAnnotation("Attempt " + retryCount);
          hTable.batch(mutations);
          child.stop();
          shouldRetry = false;
          if (logger.isDebugEnabled())
            logger.debug("Total time for batch call of " + mutations.size() + " mutations into "
                + table.getName().getString() + ": "
                + (System.currentTimeMillis() - startTime) + " ms");
          committedList.add(entry);
        } catch (Exception e) {
          SQLException inferredE = ServerUtil.parseServerExceptionOrNull(e);
          if (inferredE != null) {
            if (shouldRetry && retryCount == 0 && inferredE.getErrorCode()
                == SQLExceptionCode.INDEX_METADATA_NOT_FOUND.getErrorCode()) {
              // Swallow this exception once, as it's possible that we split after sending the
              // index metadata and one of the region servers doesn't have it. This will cause it
              // to have it the next go around. If it fails again, we don't retry.
              String msg = "Swallowing exception and retrying after clearing meta cache on connection. "
                  + inferredE;
              logger.warn(msg);
              connection.getQueryServices().clearTableRegionCache(htableName);

              // Add a new child span as this one failed
              child.addTimelineAnnotation(msg);
              child.stop();
              child = Tracing.child(span, "Failed batch, attempting retry");
              continue;
            }
            e = inferredE;
          }
          // Throw to client with both what was committed so far and what is left to be committed.
          // That way, the client can either undo what was done or try again with what was not done.
          sqlE = new CommitException(e, this,
              new MutationState(committedList, this.sizeOffset, this.maxSize, this.connection));
        } finally {
          try {
            hTable.close();
          } catch (IOException e) {
            if (sqlE != null) {
              sqlE.setNextException(ServerUtil.parseServerException(e));
            } else {
              sqlE = ServerUtil.parseServerException(e);
            }
          } finally {
            try {
              if (cache != null) {
                cache.close();
              }
            } finally {
              if (sqlE != null) {
                throw sqlE;
              }
            }
          }
        }
      } while (shouldRetry && retryCount++ < 1);
      isDataTable = false;
    }
    numRows -= entry.getValue().size();
    iterator.remove(); // Remove batches as we process them
  }
  trace.close();
  assert (numRows == 0);
  assert (this.mutations.isEmpty());
}
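// Hedged illustration (not from the source): the retry contract of the do/while above, reduced to
// a skeleton. A batch that pushed the index metadata cache to the region servers is retried at
// most once, and only when the server reports the index metadata as missing (e.g. after a region
// split); any other failure, or a second failure, surfaces to the caller. sendBatch() and
// isIndexMetadataMissing() are hypothetical stand-ins.
private void sendWithSingleRetry(boolean usedIndexMetadataCache) throws SQLException {
  int retryCount = 0;
  boolean shouldRetry;
  do {
    shouldRetry = usedIndexMetadataCache; // only this path qualifies for a retry
    try {
      sendBatch();
      shouldRetry = false; // success: fall out of the loop
    } catch (SQLException e) {
      if (!(shouldRetry && retryCount == 0 && isIndexMetadataMissing(e))) {
        throw e; // second failure, or a different error
      }
      // otherwise loop once more for the single retry
    }
  } while (shouldRetry && retryCount++ < 1);
}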
/**
 * Helper to add a {@link Mutation} to the values stored for the current row
 *
 * @param pendingUpdate update to apply
 */
public void addUpdateForTesting(Mutation pendingUpdate) {
  for (Map.Entry<byte[], List<KeyValue>> e : pendingUpdate.getFamilyMap().entrySet()) {
    List<KeyValue> edits = e.getValue();
    addUpdate(edits);
  }
}
/**
 * Prepare the regions and region files.
 *
 * @param server Hosting server instance. Can be null when testing (won't try and update in zk if
 *     a null server)
 * @param services Used to online/offline regions.
 * @param user
 * @throws IOException If thrown, transaction failed. Call {@link #rollback(Server,
 *     RegionServerServices)}
 * @return Regions created
 */
@VisibleForTesting
PairOfSameType<Region> createDaughters(
    final Server server, final RegionServerServices services, User user) throws IOException {
  LOG.info("Starting split of region " + this.parent);
  if ((server != null && server.isStopped()) || (services != null && services.isStopping())) {
    throw new IOException("Server is stopped or stopping");
  }
  assert !this.parent.lock.writeLock().isHeldByCurrentThread()
      : "Unsafe to hold write lock while performing RPCs";

  transition(SplitTransactionPhase.BEFORE_PRE_SPLIT_HOOK);

  // Coprocessor callback
  if (this.parent.getCoprocessorHost() != null) {
    // TODO: Remove one of these
    parent.getCoprocessorHost().preSplit(user);
    parent.getCoprocessorHost().preSplit(splitrow, user);
  }

  transition(SplitTransactionPhase.AFTER_PRE_SPLIT_HOOK);

  // If true, no cluster to write meta edits to or to update znodes in.
  boolean testing = server == null
      ? true
      : server.getConfiguration().getBoolean("hbase.testing.nocluster", false);
  this.fileSplitTimeout = testing
      ? this.fileSplitTimeout
      : server.getConfiguration().getLong("hbase.regionserver.fileSplitTimeout",
          this.fileSplitTimeout);

  PairOfSameType<Region> daughterRegions = stepsBeforePONR(server, services, testing);

  final List<Mutation> metaEntries = new ArrayList<Mutation>();
  boolean ret = false;
  if (this.parent.getCoprocessorHost() != null) {
    ret = parent.getCoprocessorHost().preSplitBeforePONR(splitrow, metaEntries, user);
    if (ret) {
      throw new IOException("Coprocessor bypassing region "
          + parent.getRegionInfo().getRegionNameAsString() + " split.");
    }
    try {
      for (Mutation p : metaEntries) {
        HRegionInfo.parseRegionName(p.getRow());
      }
    } catch (IOException e) {
      LOG.error("Row key of mutation from coprocessor is not parsable as a region name. "
          + "Mutations from coprocessors should only be for the hbase:meta table.");
      throw e;
    }
  }

  // This is the point of no return. Adding subsequent edits to .META. as we
  // do below when we do the daughter opens adding each to .META. can fail in
  // various interesting ways the most interesting of which is a timeout
  // BUT the edits all go through (See HBASE-3872). IF we reach the PONR
  // then subsequent failures need to crash out this regionserver; the
  // server shutdown processing should be able to fix-up the incomplete split.
  // The offlined parent will have the daughters as extra columns. If
  // we leave the daughter regions in place and do not remove them when we
  // crash out, then they will have their references to the parent in place
  // still and the server shutdown fixup of .META. will point to these
  // regions.
  // We should add the PONR JournalEntry before offlineParentInMeta, so even if
  // offlineParentInMeta times out, this will cause the regionserver to exit, and the
  // master ServerShutdownHandler will then fix up the daughters and avoid data loss.
  // (See HBASE-4562.)
  transition(SplitTransactionPhase.PONR);

  // Edit the parent in meta: offline the parent region and add splita and splitb
  // as an atomic update. See HBASE-7721. This update to META is what determines
  // whether the region is considered split or not in case of failures.
  // If it is successful, the master will roll forward; if not, the master will roll back
  // and assign the parent region.
  if (services != null
      && !services.reportRegionStateTransition(
          TransitionCode.SPLIT_PONR, parent.getRegionInfo(), hri_a, hri_b)) {
    // Passed PONR, let SSH clean it up
    throw new IOException("Failed to notify master that split passed PONR: "
        + parent.getRegionInfo().getRegionNameAsString());
  }
  return daughterRegions;
}
/**
 * Prepare the merged region and region files.
 *
 * @param server Hosting server instance. Can be null when testing
 * @param services Used to online/offline regions.
 * @return merged region
 * @throws IOException If thrown, transaction failed. Call {@link #rollback(Server,
 *     RegionServerServices)}
 */
HRegion createMergedRegion(final Server server, final RegionServerServices services)
    throws IOException {
  LOG.info("Starting merge of " + region_a + " and " + region_b.getRegionNameAsString()
      + ", forcible=" + forcible);
  if ((server != null && server.isStopped()) || (services != null && services.isStopping())) {
    throw new IOException("Server is stopped or stopping");
  }

  if (rsCoprocessorHost != null) {
    if (rsCoprocessorHost.preMerge(this.region_a, this.region_b)) {
      throw new IOException("Coprocessor bypassing regions " + this.region_a + " "
          + this.region_b + " merge.");
    }
  }

  // If true, no cluster to write meta edits to or to use coordination.
  boolean testing = server == null
      ? true
      : server.getConfiguration().getBoolean("hbase.testing.nocluster", false);

  HRegion mergedRegion = stepsBeforePONR(server, services, testing);

  @MetaMutationAnnotation
  List<Mutation> metaEntries = new ArrayList<Mutation>();
  if (rsCoprocessorHost != null) {
    if (rsCoprocessorHost.preMergeCommit(this.region_a, this.region_b, metaEntries)) {
      throw new IOException("Coprocessor bypassing regions " + this.region_a + " "
          + this.region_b + " merge.");
    }
    try {
      for (Mutation p : metaEntries) {
        HRegionInfo.parseRegionName(p.getRow());
      }
    } catch (IOException e) {
      LOG.error("Row key of mutation from coprocessor is not parsable as a region name. "
          + "Mutations from coprocessors should only be for the hbase:meta table.", e);
      throw e;
    }
  }

  // This is the point of no return. Similar to SplitTransaction: if we reach the PONR,
  // then subsequent failures need to crash out this regionserver.
  this.journal.add(JournalEntry.PONR);

  // Add the merged region and delete region_a and region_b as an atomic update.
  // See HBASE-7721. This update to hbase:meta is what determines whether the region is
  // considered merged or not in case of failures. If it is successful, the master will
  // roll forward; if not, the master will roll back.
  if (!testing && useCoordinationForAssignment) {
    if (metaEntries.isEmpty()) {
      MetaTableAccessor.mergeRegions(
          server.getConnection(),
          mergedRegion.getRegionInfo(),
          region_a.getRegionInfo(),
          region_b.getRegionInfo(),
          server.getServerName(),
          region_a.getTableDesc().getRegionReplication());
    } else {
      mergeRegionsAndPutMetaEntries(
          server.getConnection(),
          mergedRegion.getRegionInfo(),
          region_a.getRegionInfo(),
          region_b.getRegionInfo(),
          server.getServerName(),
          metaEntries,
          region_a.getTableDesc().getRegionReplication());
    }
  } else if (services != null && !useCoordinationForAssignment) {
    if (!services.reportRegionStateTransition(
        TransitionCode.MERGE_PONR,
        mergedRegionInfo,
        region_a.getRegionInfo(),
        region_b.getRegionInfo())) {
      // Passed PONR, let SSH clean it up
      throw new IOException("Failed to notify master that merge passed PONR: "
          + region_a.getRegionInfo().getRegionNameAsString() + " and "
          + region_b.getRegionInfo().getRegionNameAsString());
    }
  }
  return mergedRegion;
}
/**
 * Override the preAppend for checkAndPut and checkAndDelete, as we need the ability to a) set the
 * TimeRange for the Get being done and b) return something back to the client to indicate
 * success/failure
 */
@SuppressWarnings("deprecation")
@Override
public Result preAppend(final ObserverContext<RegionCoprocessorEnvironment> e, final Append append)
    throws IOException {
  byte[] opBuf = append.getAttribute(OPERATION_ATTRIB);
  if (opBuf == null) {
    return null;
  }
  Op op = Op.values()[opBuf[0]];

  long clientTimestamp = HConstants.LATEST_TIMESTAMP;
  byte[] clientTimestampBuf = append.getAttribute(MAX_TIMERANGE_ATTRIB);
  if (clientTimestampBuf != null) {
    clientTimestamp = Bytes.toLong(clientTimestampBuf);
  }
  boolean hadClientTimestamp = (clientTimestamp != HConstants.LATEST_TIMESTAMP);
  if (hadClientTimestamp) {
    // Prevent race condition of creating two sequences at the same timestamp
    // by looking for a sequence at or after the timestamp at which it'll be
    // created.
    if (op == Op.CREATE_SEQUENCE) {
      clientTimestamp++;
    }
  } else {
    clientTimestamp = EnvironmentEdgeManager.currentTimeMillis();
    clientTimestampBuf = Bytes.toBytes(clientTimestamp);
  }

  RegionCoprocessorEnvironment env = e.getEnvironment();
  // We need to set this to prevent region.append from being called
  e.bypass();
  e.complete();
  HRegion region = env.getRegion();
  byte[] row = append.getRow();
  region.startRegionOperation();
  try {
    Integer lid = region.getLock(null, row, true);
    try {
      KeyValue keyValue = append.getFamilyMap().values().iterator().next().iterator().next();
      byte[] family = keyValue.getFamily();
      byte[] qualifier = keyValue.getQualifier();

      Get get = new Get(row);
      get.setTimeRange(MetaDataProtocol.MIN_TABLE_TIMESTAMP, clientTimestamp);
      get.addColumn(family, qualifier);
      Result result = region.get(get);
      if (result.isEmpty()) {
        if (op == Op.DROP_SEQUENCE || op == Op.RESET_SEQUENCE) {
          return getErrorResult(row, clientTimestamp,
              SQLExceptionCode.SEQUENCE_UNDEFINED.getErrorCode());
        }
      } else {
        if (op == Op.CREATE_SEQUENCE) {
          return getErrorResult(row, clientTimestamp,
              SQLExceptionCode.SEQUENCE_ALREADY_EXIST.getErrorCode());
        }
      }

      Mutation m = null;
      switch (op) {
        case RESET_SEQUENCE:
          KeyValue currentValueKV = result.raw()[0];
          long expectedValue = PDataType.LONG.getCodec()
              .decodeLong(append.getAttribute(CURRENT_VALUE_ATTRIB), 0, null);
          long value = PDataType.LONG.getCodec()
              .decodeLong(currentValueKV.getBuffer(), currentValueKV.getValueOffset(), null);
          // Timestamp should match exactly, or we may have the wrong sequence
          if (expectedValue != value || currentValueKV.getTimestamp() != clientTimestamp) {
            return new Result(Collections.singletonList(KeyValueUtil.newKeyValue(
                row,
                PhoenixDatabaseMetaData.SEQUENCE_FAMILY_BYTES,
                QueryConstants.EMPTY_COLUMN_BYTES,
                currentValueKV.getTimestamp(),
                ByteUtil.EMPTY_BYTE_ARRAY)));
          }
          m = new Put(row, currentValueKV.getTimestamp());
          m.getFamilyMap().putAll(append.getFamilyMap());
          break;
        case DROP_SEQUENCE:
          m = new Delete(row, clientTimestamp, null);
          break;
        case CREATE_SEQUENCE:
          m = new Put(row, clientTimestamp);
          m.getFamilyMap().putAll(append.getFamilyMap());
          break;
      }
      if (!hadClientTimestamp) {
        for (List<KeyValue> kvs : m.getFamilyMap().values()) {
          for (KeyValue kv : kvs) {
            kv.updateLatestStamp(clientTimestampBuf);
          }
        }
      }
      @SuppressWarnings("unchecked")
      Pair<Mutation, Integer>[] mutations = new Pair[1];
      mutations[0] = new Pair<Mutation, Integer>(m, lid);
      region.batchMutate(mutations);
      long serverTimestamp = MetaDataUtil.getClientTimeStamp(m);
      // Return a result with a single KeyValue. The only piece of information the client cares
      // about is the timestamp, which is the timestamp of when the mutation was actually
      // performed (useful in the case of ...).
      return new Result(Collections.singletonList(KeyValueUtil.newKeyValue(
          row,
          PhoenixDatabaseMetaData.SEQUENCE_FAMILY_BYTES,
          QueryConstants.EMPTY_COLUMN_BYTES,
          serverTimestamp,
          SUCCESS_VALUE)));
    } finally {
      region.releaseRowLock(lid);
    }
  } catch (Throwable t) {
    ServerUtil.throwIOException("Increment of sequence " + Bytes.toStringBinary(row), t);
    return null; // Impossible
  } finally {
    region.closeRegionOperation();
  }
}
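// Hedged client-side sketch (an assumption, not from the source): how a caller might construct
// the Append that drives this coprocessor hook. OPERATION_ATTRIB and MAX_TIMERANGE_ATTRIB are the
// attribute names read above; sequenceRowKey, clientTimestamp, and htable are hypothetical
// placeholders, and the Op ordinal encoding mirrors the Op.values()[opBuf[0]] lookup above.
Append append = new Append(sequenceRowKey);
append.add(PhoenixDatabaseMetaData.SEQUENCE_FAMILY_BYTES, QueryConstants.EMPTY_COLUMN_BYTES,
    ByteUtil.EMPTY_BYTE_ARRAY);
append.setAttribute(OPERATION_ATTRIB, new byte[] { (byte) Op.CREATE_SEQUENCE.ordinal() });
append.setAttribute(MAX_TIMERANGE_ATTRIB, Bytes.toBytes(clientTimestamp));
Result result = htable.append(append); // the single returned KeyValue carries the server timestamp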