/** * Updates metadata when a worker periodically heartbeats with the master. * * @param workerId the worker id * @param usedBytesOnTiers a mapping from tier alias to the used bytes * @param removedBlockIds a list of block ids removed from this worker * @param addedBlocksOnTiers a mapping from tier alias to the added blocks * @return an optional command for the worker to execute */ public Command workerHeartbeat( long workerId, Map<String, Long> usedBytesOnTiers, List<Long> removedBlockIds, Map<String, List<Long>> addedBlocksOnTiers) { MasterWorkerInfo worker = mWorkers.getFirstByField(ID_INDEX, workerId); if (worker == null) { LOG.warn("Could not find worker id: {} for heartbeat.", workerId); return new Command(CommandType.Register, new ArrayList<Long>()); } synchronized (worker) { // Technically, 'worker' should be confirmed to still be in the data structure. Lost worker // detection can remove it. However, we are intentionally ignoring this race, since the worker // will just re-register regardless. processWorkerRemovedBlocks(worker, removedBlockIds); processWorkerAddedBlocks(worker, addedBlocksOnTiers); worker.updateUsedBytes(usedBytesOnTiers); worker.updateLastUpdatedTimeMs(); List<Long> toRemoveBlocks = worker.getToRemoveBlocks(); if (toRemoveBlocks.isEmpty()) { return new Command(CommandType.Nothing, new ArrayList<Long>()); } return new Command(CommandType.Free, toRemoveBlocks); } }
// TODO(binfan): check the logic is correct or not when commitBlock is a retry
/**
 * Records that a block has been committed on a worker.
 *
 * <p>While holding the worker's monitor and then the block's monitor, this method creates the
 * block metadata if it does not exist yet (or fills in the length if it was previously
 * {@code Constants.UNKNOWN_SIZE}), appends a journal entry in either of those two cases, adds the
 * worker as a location of the block, removes the block from the lost-block set, and updates the
 * worker's block list, used bytes for the tier and last-updated time. After both monitors are
 * released it calls {@code waitForJournalFlush} with the counter obtained from the journal append.
 *
 * @param workerId the id of the worker committing the block
 * @param usedBytesOnTier the used-bytes value recorded on the worker for {@code tierAlias}
 * @param tierAlias the alias of the tier on which the block is committed
 * @param blockId the id of the committed block
 * @param length the length of the committed block
 * @throws NoWorkerException if no worker with the given id is found in {@code mWorkers}
 */
public void commitBlock(
    long workerId, long usedBytesOnTier, String tierAlias, long blockId, long length)
    throws NoWorkerException {
  LOG.debug(
      "Commit block from workerId: {}, usedBytesOnTier: {}, blockId: {}, length: {}",
      workerId, usedBytesOnTier, blockId, length);

  // Stays at INVALID_FLUSH_COUNTER unless a journal entry is appended below.
  // NOTE(review): presumably waitForJournalFlush treats that value as "nothing to wait for" --
  // confirm against its implementation.
  long counter = AsyncJournalWriter.INVALID_FLUSH_COUNTER;

  MasterWorkerInfo worker = mWorkers.getFirstByField(ID_INDEX, workerId);
  // TODO(peis): Check lost workers as well.
  if (worker == null) {
    throw new NoWorkerException(ExceptionMessage.NO_WORKER_FOUND.getMessage(workerId));
  }

  // Lock the worker metadata first. The block metadata is always locked second, inside this
  // monitor.
  synchronized (worker) {
    // Loop until block metadata is successfully locked.
    for (; ; ) {
      boolean newBlock = false;
      MasterBlockInfo block = mBlocks.get(blockId);
      if (block == null) {
        // The block metadata doesn't exist yet. Create a candidate; it only becomes visible to
        // other threads once putIfAbsent below succeeds.
        block = new MasterBlockInfo(blockId, length);
        newBlock = true;
      }

      // Lock the block metadata.
      synchronized (block) {
        boolean writeJournal = false;
        if (newBlock) {
          if (mBlocks.putIfAbsent(blockId, block) != null) {
            // Another thread already inserted the metadata for this block, so start loop over.
            // The continue leaves this synchronized block (releasing the candidate's monitor) and
            // the next iteration re-reads the metadata from mBlocks.
            continue;
          }
          // Successfully added the new block metadata. Append a journal entry for the new
          // metadata.
          writeJournal = true;
        } else if (block.getLength() != length && block.getLength() == Constants.UNKNOWN_SIZE) {
          // The block size was previously unknown. Update the block size with the committed
          // size, and append a journal entry.
          block.updateLength(length);
          writeJournal = true;
        }
        if (writeJournal) {
          BlockInfoEntry blockInfo =
              BlockInfoEntry.newBuilder().setBlockId(blockId).setLength(length).build();
          // Keep the returned counter so the flush can be awaited after the locks are released.
          counter = appendJournalEntry(JournalEntry.newBuilder().setBlockInfo(blockInfo).build());
        }
        // At this point, both the worker and the block metadata are locked.
        // Update the block metadata with the new worker location.
        block.addWorker(workerId, tierAlias);
        // This worker has this block, so it is no longer lost.
        mLostBlocks.remove(blockId);

        // Update the worker information for this new block.
        // TODO(binfan): when retry commitBlock on master is expected, make sure metrics are not
        // double counted.
        worker.addBlock(blockId);
        worker.updateUsedBytes(tierAlias, usedBytesOnTier);
        worker.updateLastUpdatedTimeMs();
      }
      // The block was processed under both locks; leave the retry loop.
      break;
    }
  }

  // Called outside both monitors, so no lock is held during this call.
  waitForJournalFlush(counter);
}