/** * Creates an <tt>ArrayBlockingQueue</tt> with the given (fixed) capacity and the specified access * policy. * * @param capacity the capacity of this queue * @param fair if <tt>true</tt> then queue accesses for threads blocked on insertion or removal are processed in FIFO order; if <tt>false</tt> the access order is unspecified. * @throws IllegalArgumentException if <tt>capacity</tt> is less than 1 */ public ArrayBlockingQueue(int capacity, boolean fair) { if (capacity <= 0) throw new IllegalArgumentException(); this.items = (E[]) new Object[capacity]; lock = new ReentrantLock(fair); notEmpty = lock.newCondition(); notFull = lock.newCondition(); }
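The two conditions created by this constructor drive the blocking put/take protocol. Below is a minimal self-contained sketch of that pattern, not the JDK source; the class and field names are illustrative.

import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;

class BoundedBuffer<E> {
    private final Object[] items;
    private final ReentrantLock lock;
    private final Condition notEmpty;
    private final Condition notFull;
    private int putIndex, takeIndex, count;

    BoundedBuffer(int capacity, boolean fair) {
        items = new Object[capacity];
        lock = new ReentrantLock(fair);   // fair => blocked threads acquire in FIFO order
        notEmpty = lock.newCondition();
        notFull = lock.newCondition();
    }

    public void put(E e) throws InterruptedException {
        lock.lockInterruptibly();
        try {
            while (count == items.length) notFull.await();   // wait while full
            items[putIndex] = e;
            putIndex = (putIndex + 1) % items.length;
            count++;
            notEmpty.signal();                               // wake one consumer
        } finally {
            lock.unlock();
        }
    }

    @SuppressWarnings("unchecked")
    public E take() throws InterruptedException {
        lock.lockInterruptibly();
        try {
            while (count == 0) notEmpty.await();             // wait while empty
            E e = (E) items[takeIndex];
            items[takeIndex] = null;
            takeIndex = (takeIndex + 1) % items.length;
            count--;
            notFull.signal();                                // wake one producer
            return e;
        } finally {
            lock.unlock();
        }
    }
}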
private void init() { hasFreeThread = lock.newCondition(); hasTask = lock.newCondition(); pool = new LinkedList<WorkThread>(); monitor = new QueueExecuteThread(); freer = new FreeCleanThread(); killer = new KillCleanThread(); }
@SuppressWarnings("unchecked") public PriorityBlockingDeque(SortedSet<? extends E> c) { this.lock = new ReentrantLock(); this.notEmpty = lock.newCondition(); this.comparator = (Comparator<? super E>) c.comparator(); addAll(c); }
/** * Method to create a new TE LSP initiated in this node. * * @param destinationId IP Address of the destination of the LSP * @param bw Bandwidth requested * @param bidirectional bidirectional * @param OFcode * @param lspID * @throws LSPCreationException */ public long addnewLSP( Inet4Address destinationId, float bw, boolean bidirectional, int OFcode, int lspID) throws LSPCreationException { log.info("Adding New LSP to " + destinationId); // FIXME: review this // add a structure --> RequestedLSPinformation --> technology-dependent // add a field with the LSP state and keep it updated LSPTE lsp = new LSPTE( lspID, localIP, destinationId, bidirectional, OFcode, bw, PathStateParameters.creatingLPS); LSPList.put(new LSPKey(localIP, lsp.getIdLSP()), lsp); ReentrantLock lock = new ReentrantLock(); Condition lspEstablished = lock.newCondition(); // log.info("Putting into lock list with ID: "+lsp.getIdLSP()); lockList.put(lsp.getIdLSP(), lock); conditionList.put(lsp.getIdLSP(), lspEstablished); /*log.info("Size lockList : "+lockList.size()); log.info("Size conditionList : "+conditionList.size());*/ timeIni = System.nanoTime(); log.info("Start to establish path: " + System.nanoTime()); try { startLSP(lsp); } catch (LSPCreationException e) { log.info("Start LSP Error!"); throw e; } return lsp.getIdLSP(); }
public GifDecoder(GifAction gifaction) { isDestroy = false; f = 1; A = new byte[256]; B = 0; C = 0; D = 0; E = false; F = 0; O = new ArrayBlockingQueue(15); P = new ReentrantLock(); Q = P.newCondition(); R = P.newCondition(); S = 0; T = false; U = new ArrayList(M); V = 0; W = false; X = null; Y = null; Z = false; aa = 0; ab = null; ac = 0; ad = null; ae = null; af = new int[256]; X = gifaction; }
/** * Manages the election of which asynchronous saga event processor is responsible for creating a new * Saga instance, when necessary. * * @author Allard Buijze * @since 2.0 */ class AsyncSagaCreationElector { private static final Logger logger = LoggerFactory.getLogger(AsyncSagaCreationElector.class); private final ReentrantLock votingLock = new ReentrantLock(); private final Condition allVotesCast = votingLock.newCondition(); // guarded by "votingLock" private int castVotes = 0; private volatile boolean invocationDetected = false; /** * Forces the current thread to wait for the voting to complete if it is responsible for creating * the Saga. As soon as an invocation has been recorded, the waiting thread is released. * * @param didInvocation indicates whether the current processor found a Saga to process * @param totalVotesExpected The total number of processors expected to cast a vote * @param isSagaOwner Indicates whether the current processor "owns" the to-be-created saga * instance. * @return <code>true</code> if the current processor should create the new instance, <code>false * </code> otherwise. */ public boolean waitForSagaCreationVote( final boolean didInvocation, final int totalVotesExpected, final boolean isSagaOwner) { votingLock.lock(); try { invocationDetected = invocationDetected || didInvocation; castVotes++; while (isSagaOwner && !invocationDetected && castVotes < totalVotesExpected) { try { allVotesCast.await(); } catch (InterruptedException e) { // interrupting this process is not supported. logger.warn( "This thread has been interrupted, but the interruption has " + "been ignored to prevent loss of information."); } } if (isSagaOwner) { return !invocationDetected; } allVotesCast.signalAll(); } finally { votingLock.unlock(); } return false; } /** Clears the voting counts for a new round. */ public void clear() { votingLock.lock(); try { castVotes = 0; invocationDetected = false; } finally { votingLock.unlock(); } } }
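A minimal usage sketch (the setup is illustrative, not from the Axon codebase): three processors cast votes, none detected an invocation, so only the designated owner is told to create the Saga.

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

public class ElectorDemo {
    public static void main(String[] args) {
        final AsyncSagaCreationElector elector = new AsyncSagaCreationElector();
        ExecutorService pool = Executors.newFixedThreadPool(3);
        for (int i = 0; i < 3; i++) {
            final boolean isOwner = (i == 0);
            pool.execute(new Runnable() {
                @Override public void run() {
                    // didInvocation=false for everyone => the owner wins the election
                    boolean create = elector.waitForSagaCreationVote(false, 3, isOwner);
                    if (create) System.out.println("this processor creates the Saga");
                }
            });
        }
        pool.shutdown();
    }
}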
/** * Constructor of the class. Initialize all the objects * * @param maxSize The size of the buffer */ public Buffer(final int maxSize) { this.maxSize = maxSize; buffer = new LinkedList<>(); lock = new ReentrantLock(); lines = lock.newCondition(); space = lock.newCondition(); pendingLines = true; }
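The lines/space condition pair suggests a bounded producer/consumer buffer. One hedged completion follows; the method names, the String element type, and the end-of-input handling via pendingLines are all assumptions.

// Sketch only: assumes buffer is a Queue<String> and pendingLines is cleared
// by the producer when no more input will arrive.
public void insert(String line) throws InterruptedException {
    lock.lock();
    try {
        while (buffer.size() == maxSize) space.await();   // wait for free space
        buffer.offer(line);
        lines.signalAll();                                // a new line is available
    } finally {
        lock.unlock();
    }
}

public String fetch() throws InterruptedException {
    lock.lock();
    try {
        // wait while empty but more lines are still expected
        while (buffer.isEmpty() && pendingLines) lines.await();
        String line = buffer.poll();                      // null once fully drained
        space.signalAll();                                // room freed for producers
        return line;
    } finally {
        lock.unlock();
    }
}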
public SimpleSemaphore(int permits, boolean fair) { // TODO - you fill in here to initialize the SimpleSemaphore, // making sure to allow both fair and non-fair Semaphore // semantics. this.permits = permits; reentrantLock = new ReentrantLock(fair); condition = reentrantLock.newCondition(); }
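One hedged way to finish the exercise, using the fields initialized above; the acquire/release names mirror java.util.concurrent.Semaphore but the bodies are a sketch, not a reference solution.

public void acquire() throws InterruptedException {
    reentrantLock.lockInterruptibly();
    try {
        while (permits <= 0) condition.await();   // block until a permit frees up
        permits--;
    } finally {
        reentrantLock.unlock();
    }
}

public void release() {
    reentrantLock.lock();
    try {
        permits++;
        condition.signal();                       // wake one blocked acquirer
    } finally {
        reentrantLock.unlock();
    }
}

Fairness falls out of constructing the lock with the fair flag: a signalled waiter re-queues on the lock, and a fair lock hands itself to waiting threads in FIFO order.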
public PriorityBlockingDeque(int initialCapacity, Comparator<? super E> comparator) { // Note: This restriction of at least one is not actually needed, // but continues for 1.5 compatibility if (initialCapacity < 1) throw new IllegalArgumentException(); this.lock = new ReentrantLock(); this.notEmpty = lock.newCondition(); this.comparator = comparator; this.deque = new Object[initialCapacity]; }
private class SpiceArrayAdapterUnderTest extends SpiceArrayAdapter<DataUnderTest> { private ReentrantLock reentrantLock = new ReentrantLock(); private Condition loadBitmapHasBeenCalledCondition = reentrantLock.newCondition(); private boolean loadBitmapHasBeenCalled = false; public SpiceArrayAdapterUnderTest( Context context, BitmapSpiceManager spiceManagerBinary, List<DataUnderTest> data) { super(context, spiceManagerBinary, data); } @Override public BitmapRequest createRequest( DataUnderTest data, int imageIndex, int reqWidth, int reqHeight) { return new BitmapRequest( mockWebServer.getUrl("/" + data.getImageUrl()).toString(), reqWidth, reqHeight, cacheFile); } // ---------------------------------------------------- // ----- Block Test thread until drawable is refreshed. // ---------------------------------------------------- @Override protected void loadBitmapAsynchronously( DataUnderTest octo, ImageView thumbImageView, String tempThumbnailImageFileName) { super.loadBitmapAsynchronously(octo, thumbImageView, tempThumbnailImageFileName); reentrantLock.lock(); try { loadBitmapHasBeenCalled = true; loadBitmapHasBeenCalledCondition.signal(); } finally { reentrantLock.unlock(); } } public void await(long millisecond) throws InterruptedException { reentrantLock.lock(); try { loadBitmapHasBeenCalledCondition.await(millisecond, TimeUnit.MILLISECONDS); } finally { reentrantLock.unlock(); } } public boolean isLoadBitmapHasBeenCalled() { return loadBitmapHasBeenCalled; } @Override public SpiceListItemView<DataUnderTest> createView(Context context, ViewGroup parent) { return new ListItemViewStub(getContext()); } }
class PausableThreadPoolExecutor extends ThreadPoolExecutor implements ExecutorRemoteControllerService { public PausableThreadPoolExecutor( int corePoolSize, int maximumPoolSize, long keepAliveTime, TimeUnit unit, BlockingQueue<Runnable> workQueue, ThreadFactory threadFactory, RejectedExecutionHandler handler) { super(corePoolSize, maximumPoolSize, keepAliveTime, unit, workQueue, threadFactory, handler); } private boolean isPaused; private ReentrantLock pauseLock = new ReentrantLock(); private Condition unpaused = pauseLock.newCondition(); @Override protected void beforeExecute(Thread t, Runnable r) { super.beforeExecute(t, r); pauseLock.lock(); try { while (isPaused) unpaused.await(); } catch (InterruptedException ie) { t.interrupt(); } finally { pauseLock.unlock(); } } @Override public void pause() { pauseLock.lock(); try { isPaused = true; } finally { pauseLock.unlock(); } } @Override public void resume() { pauseLock.lock(); try { isPaused = false; unpaused.signalAll(); } finally { pauseLock.unlock(); } } }
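A minimal usage sketch (constructor arguments chosen only for illustration). Note that pause() only holds back tasks that have not yet entered beforeExecute; a task already running completes normally.

PausableThreadPoolExecutor executor = new PausableThreadPoolExecutor(
    1, 1, 0L, TimeUnit.MILLISECONDS,
    new LinkedBlockingQueue<Runnable>(),
    Executors.defaultThreadFactory(),
    new ThreadPoolExecutor.AbortPolicy());
executor.pause();                                  // workers will block in beforeExecute
executor.execute(new Runnable() {
    @Override public void run() { System.out.println("ran after resume"); }
});
executor.resume();                                 // signalAll releases the workers
executor.shutdown();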
/** * Constructor. * * @param thread The MoSync thread. */ public MoSyncCameraController(MoSyncThread thread) { mMoSyncThread = thread; lock = new ReentrantLock(); mPreview = null; condition = lock.newCondition(); dataReady = false; userWidths = new ArrayList<Integer>(); userHeights = new ArrayList<Integer>(); mCameraParametersList = new ArrayList<Camera.Parameters>(); mNumCameras = numberOfCameras(); initilizeCameras(); rawMode = false; mCurrentCameraIndex = 0; }
public class ManualFlag extends Flag { private final ReentrantLock lock = new ReentrantLock(); private final Condition condition = lock.newCondition(); private boolean set; public ManualFlag(boolean state) { set = state; } public ManualFlag() { this(false); } @Override public void set() { lock.lock(); try { set = true; condition.signalAll(); } finally { lock.unlock(); } } public void reset() { lock.lock(); try { set = false; } finally { lock.unlock(); } } @Override public void await() { lock.lock(); try { while (!set) { try { condition.await(); } catch (InterruptedException ex) { // interruption is not supported; keep waiting while still holding the lock } } } finally { lock.unlock(); } } }
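A short usage sketch: one thread parks in await() until another calls set(). If set() wins the race, the waiter observes the flag and never parks.

public static void main(String[] args) throws InterruptedException {
    final ManualFlag flag = new ManualFlag();
    Thread waiter = new Thread(new Runnable() {
        @Override public void run() {
            flag.await();                          // parks until set() is called
            System.out.println("flag was set");
        }
    });
    waiter.start();
    flag.set();                                    // releases the waiter
    waiter.join();
}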
/** * Transfer the local file to the hypervisor's importdir * * @param from Local file name * @throws OccpException */ public void stageFile(String from) throws OccpException { boolean transferSuccess = false; /* * Only VBox requires transfer to the host; Esxi allows uploads */ if (this.hv.getClass() == OccpVBoxHV.class) { try { completionLock.lock(); if (!completed_conditions.containsKey(from)) { completed_conditions.put(from, completionLock.newCondition()); } else { while (!completed_transfers.containsKey(from)) { try { completed_conditions.get(from).await(); } catch (InterruptedException e) { throw new VMOperationFailedException( hv.getName(), vm.getName(), ErrorCode.TRANSFER_TO, "Transfer interrupted", e); } } // If it has been transferred, say we did it, otherwise try again if (completed_transfers.get(from) == true) { return; } } } finally { completionLock.unlock(); } boolean hasPath = (from.lastIndexOf('/') >= 0); String to = from; if (hasPath) { to = from.substring(from.lastIndexOf('/') + 1); } try { this.hv.transferFileToVM(this.vm, from, "/mnt/" + OccpAdmin.scenarioName + "/" + to, false); transferSuccess = true; } finally { completionLock.lock(); try { completed_transfers.put(from, transferSuccess); completed_conditions.get(from).signal(); } finally { completionLock.unlock(); } } } }
/** Producer/consumer demo coordinating a shared list with a ReentrantLock and Condition. */ public class TestArrayBlockingQueueforList { public ReentrantLock lock = new ReentrantLock(); public Condition condition = lock.newCondition(); public List<Integer> list = new ArrayList<Integer>(); boolean isEmpty = true; public static void main(String[] args) { TestArrayBlockingQueueforList test = new TestArrayBlockingQueueforList(); Resource res = new Resource(); new Thread(new ProducerThread(test.getCondition(), test.getLock(), res)).start(); new Thread(new ConsumerThread(test.getCondition(), test.getLock(), res)).start(); } public ReentrantLock getLock() { return lock; } public void setLock(ReentrantLock lock) { this.lock = lock; } public Condition getCondition() { return condition; } public void setCondition(Condition condition) { this.condition = condition; } public List<Integer> getList() { return list; } public void setList(List<Integer> list) { this.list = list; } public boolean isEmpty() { return isEmpty; } public void setEmpty(boolean isEmpty) { this.isEmpty = isEmpty; } }
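The Resource, ProducerThread, and ConsumerThread types used by main are not shown; below is a minimal sketch consistent with the constructor signatures above (all details assumed). The produced flag guards the await loop so a signal delivered before the consumer waits is not lost.

class Resource { boolean produced = false; }       // shared state, guarded by the lock

class ProducerThread implements Runnable {
    private final Condition condition; private final ReentrantLock lock; private final Resource res;
    ProducerThread(Condition condition, ReentrantLock lock, Resource res) {
        this.condition = condition; this.lock = lock; this.res = res;
    }
    @Override public void run() {
        lock.lock();
        try {
            res.produced = true;                   // produce
            condition.signal();                    // wake the consumer
        } finally { lock.unlock(); }
    }
}

class ConsumerThread implements Runnable {
    private final Condition condition; private final ReentrantLock lock; private final Resource res;
    ConsumerThread(Condition condition, ReentrantLock lock, Resource res) {
        this.condition = condition; this.lock = lock; this.res = res;
    }
    @Override public void run() {
        lock.lock();
        try {
            while (!res.produced) condition.await();   // predicate guards against spurious wakeups
            // consume res here
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        } finally { lock.unlock(); }
    }
}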
/** * Constructs the consumer which will read from the given destination and is a child of the given * context. * * @param destination the destination that this consumer will read from * @param hazelcastMQContext the parent context of this consumer */ DefaultHazelcastMQConsumer(String destination, DefaultHazelcastMQContext hazelcastMQContext) { super(); this.destination = destination; this.receiveLock = new ReentrantLock(); this.receiveCondition = receiveLock.newCondition(); this.closed = false; this.active = false; this.hazelcastMQContext = hazelcastMQContext; this.config = hazelcastMQContext.getHazelcastMQInstance().getConfig(); HazelcastInstance hazelcast = this.hazelcastMQContext.getHazelcastMQInstance().getConfig().getHazelcastInstance(); IdGenerator idGenerator = hazelcast.getIdGenerator("hazelcastmqconsumer"); this.id = "hazelcastmqconsumer-" + String.valueOf(idGenerator.newId()); }
public class MyService { private ReentrantLock lock = new ReentrantLock(); private Condition condition = lock.newCondition(); public void waitMethod() { lock.lock(); try { System.out.println("A"); condition.await(); System.out.println("B"); } catch (InterruptedException e) { e.printStackTrace(); } finally { lock.unlock(); System.out.println("Lock released!"); } } }
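The class shows only the waiting side; a matching signalling method would look like this (sketch, name illustrative):

public void signalMethod() {
    lock.lock();
    try {
        condition.signal();                        // lets waitMethod proceed past await()
    } finally {
        lock.unlock();
    }
}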
private static final class Notifier { private final ReentrantLock lock = new ReentrantLock(); private final Condition condition = lock.newCondition(); private volatile TimeValue timeout; public Notifier(TimeValue timeout) { assert timeout != null; this.timeout = timeout; } public void await() { lock.lock(); try { condition.await(timeout.millis(), TimeUnit.MILLISECONDS); } catch (InterruptedException e) { // we intentionally do not want to restore the interruption flag, we're about to shutdown // anyway } finally { lock.unlock(); } } public void setTimeout(TimeValue timeout) { assert timeout != null; this.timeout = timeout; doNotify(); } public TimeValue getTimeout() { return timeout; } public void doNotify() { lock.lock(); try { condition.signalAll(); } finally { lock.unlock(); } } }
public class BrokenOrderingReentrantLock implements Runnable { private final ReentrantLock lock1 = new ReentrantLock(); private final ReentrantLock lock2 = new ReentrantLock(); private final Condition condition = lock1.newCondition(); public static void main(String[] args) throws InterruptedException { BrokenOrderingReentrantLock runnable = new BrokenOrderingReentrantLock(); Thread thread1 = new Thread(runnable, "thread1"); Thread thread2 = new Thread(runnable, "thread2"); thread1.start(); Thread.sleep(500); thread2.start(); } @Override public void run() { try { String threadName = Thread.currentThread().getName(); lock1.lock(); try { System.out.println(threadName + " has lock1"); lock2.lock(); try { System.out.println(threadName + " has lock2"); lock1.lock(); try { System.out.println(threadName + " reenters lock1"); condition.await(1, TimeUnit.SECONDS); } finally { lock1.unlock(); } } finally { lock2.unlock(); } } finally { lock1.unlock(); } } catch (InterruptedException e) { e.printStackTrace(); } } }
@SuppressWarnings("unchecked") public PriorityBlockingDeque(Collection<? extends E> c) { this.lock = new ReentrantLock(); this.notEmpty = lock.newCondition(); if (c instanceof SortedSet<?>) { SortedSet<? extends E> ss = (SortedSet<? extends E>) c; this.comparator = (Comparator<? super E>) ss.comparator(); addAll(ss); } else if (c instanceof PriorityDeque<?>) { PriorityDeque<? extends E> pq = (PriorityDeque<? extends E>) c; this.comparator = (Comparator<? super E>) pq.comparator(); initFromPriorityDeque(pq); } else if (c instanceof PriorityBlockingDeque<?>) { PriorityBlockingDeque<? extends E> pq = (PriorityBlockingDeque<? extends E>) c; this.comparator = (Comparator<? super E>) pq.comparator(); initFromPriorityBlockingDeque(pq); } else { this.comparator = null; addAll(c); } }
ConsumerImpl( URI uri, String submitQName, String statusQName, String statusTName, String heartbeatTName, String commandTName, IEventConnectorService service, IEventService eservice) throws EventException { super(uri, submitQName, statusQName, statusTName, commandTName, service, eservice); this.lock = new ReentrantLock(); this.paused = lock.newCondition(); durable = true; consumerId = UUID.randomUUID(); name = "Consumer " + consumerId; // This will hopefully be changed to something meaningful... this.processes = new Hashtable<>(7); // Synch! this.heartbeatTopicName = heartbeatTName; connect(); }
public class MyService { private ReentrantLock lock = new ReentrantLock(); private Condition condition = lock.newCondition(); private boolean hasValue = false; public void set() { lock.lock(); try { while (hasValue) { condition.await(); } System.out.println("print ★"); hasValue = true; condition.signal(); } catch (InterruptedException e) { e.printStackTrace(); } finally { lock.unlock(); } } public void get() { lock.lock(); try { while (!hasValue) { condition.await(); } System.out.println("print ☆"); hasValue = false; condition.signal(); } catch (InterruptedException e) { e.printStackTrace(); } finally { lock.unlock(); } } }
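Driven from two threads, set/get alternate strictly, printing ★ and ☆ in turn; a minimal driver (thread setup is illustrative):

public static void main(String[] args) {
    final MyService service = new MyService();
    new Thread(new Runnable() {
        @Override public void run() { for (int i = 0; i < 5; i++) service.set(); }
    }).start();
    new Thread(new Runnable() {
        @Override public void run() { for (int i = 0; i < 5; i++) service.get(); }
    }).start();
}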
class SuspendableThreadPoolExecutor extends ThreadPoolExecutor { private boolean available = false; private ReentrantLock suspendLock = new ReentrantLock(); private Condition availableCondition = suspendLock.newCondition(); public SuspendableThreadPoolExecutor(ThreadFactory threadFactory) { super(1, 1, 0L, TimeUnit.MILLISECONDS, new LinkedBlockingQueue<Runnable>(), threadFactory); } @Override protected void beforeExecute(Thread thread, Runnable task) { super.beforeExecute(thread, task); suspendLock.lock(); try { while (!available) { availableCondition.await(); } } catch (InterruptedException interruptedException) { thread.interrupt(); } finally { suspendLock.unlock(); } } public void setAvailable(boolean available) { suspendLock.lock(); try { this.available = available; if (available) { availableCondition.signalAll(); } } finally { suspendLock.unlock(); } } }
// This only knows how to deal with a single srcIndex for a given targetIndex. // In case the src task generates multiple outputs for the same target Index // (multiple src-indices), modifications will be required. public class ShuffleManager implements FetcherCallback { private static final Log LOG = LogFactory.getLog(ShuffleManager.class); private final TezInputContext inputContext; private final int numInputs; private final FetchedInputAllocator inputManager; private final ListeningExecutorService fetcherExecutor; private final ListeningExecutorService schedulerExecutor; private final RunShuffleCallable schedulerCallable = new RunShuffleCallable(); private final BlockingQueue<FetchedInput> completedInputs; private final AtomicBoolean inputReadyNotificationSent = new AtomicBoolean(false); private final Set<InputIdentifier> completedInputSet; private final ConcurrentMap<String, InputHost> knownSrcHosts; private final BlockingQueue<InputHost> pendingHosts; private final Set<InputAttemptIdentifier> obsoletedInputs; private Set<Fetcher> runningFetchers; private final AtomicInteger numCompletedInputs = new AtomicInteger(0); private final long startTime; private long lastProgressTime; // Required to be held when manipulating pendingHosts private final ReentrantLock lock = new ReentrantLock(); private final Condition wakeLoop = lock.newCondition(); private final int numFetchers; // Parameters required by Fetchers private final SecretKey shuffleSecret; private final CompressionCodec codec; private final int ifileBufferSize; private final boolean ifileReadAhead; private final int ifileReadAheadLength; private final String srcNameTrimmed; private final AtomicBoolean isShutdown = new AtomicBoolean(false); private final TezCounter shuffledInputsCounter; private final TezCounter failedShufflesCounter; private final TezCounter bytesShuffledCounter; private final TezCounter decompressedDataSizeCounter; private final TezCounter bytesShuffledToDiskCounter; private final TezCounter bytesShuffledToMemCounter; private volatile Throwable shuffleError; private final HttpConnectionParams httpConnectionParams; // TODO More counters - FetchErrors, speed? 
public ShuffleManager( TezInputContext inputContext, Configuration conf, int numInputs, int bufferSize, boolean ifileReadAheadEnabled, int ifileReadAheadLength, CompressionCodec codec, FetchedInputAllocator inputAllocator) throws IOException { this.inputContext = inputContext; this.numInputs = numInputs; this.shuffledInputsCounter = inputContext.getCounters().findCounter(TaskCounter.NUM_SHUFFLED_INPUTS); this.failedShufflesCounter = inputContext.getCounters().findCounter(TaskCounter.NUM_FAILED_SHUFFLE_INPUTS); this.bytesShuffledCounter = inputContext.getCounters().findCounter(TaskCounter.SHUFFLE_BYTES); this.decompressedDataSizeCounter = inputContext.getCounters().findCounter(TaskCounter.SHUFFLE_BYTES_DECOMPRESSED); this.bytesShuffledToDiskCounter = inputContext.getCounters().findCounter(TaskCounter.SHUFFLE_BYTES_TO_DISK); this.bytesShuffledToMemCounter = inputContext.getCounters().findCounter(TaskCounter.SHUFFLE_BYTES_TO_MEM); this.ifileBufferSize = bufferSize; this.ifileReadAhead = ifileReadAheadEnabled; this.ifileReadAheadLength = ifileReadAheadLength; this.codec = codec; this.inputManager = inputAllocator; this.srcNameTrimmed = TezUtils.cleanVertexName(inputContext.getSourceVertexName()); completedInputSet = Collections.newSetFromMap(new ConcurrentHashMap<InputIdentifier, Boolean>(numInputs)); completedInputs = new LinkedBlockingQueue<FetchedInput>(numInputs); knownSrcHosts = new ConcurrentHashMap<String, InputHost>(); pendingHosts = new LinkedBlockingQueue<InputHost>(); obsoletedInputs = Collections.newSetFromMap(new ConcurrentHashMap<InputAttemptIdentifier, Boolean>()); runningFetchers = Collections.newSetFromMap(new ConcurrentHashMap<Fetcher, Boolean>()); int maxConfiguredFetchers = conf.getInt( TezJobConfig.TEZ_RUNTIME_SHUFFLE_PARALLEL_COPIES, TezJobConfig.TEZ_RUNTIME_SHUFFLE_PARALLEL_COPIES_DEFAULT); this.numFetchers = Math.min(maxConfiguredFetchers, numInputs); ExecutorService fetcherRawExecutor = Executors.newFixedThreadPool( numFetchers, new ThreadFactoryBuilder() .setDaemon(true) .setNameFormat("Fetcher [" + srcNameTrimmed + "] #%d") .build()); this.fetcherExecutor = MoreExecutors.listeningDecorator(fetcherRawExecutor); ExecutorService schedulerRawExecutor = Executors.newFixedThreadPool( 1, new ThreadFactoryBuilder() .setDaemon(true) .setNameFormat("ShuffleRunner [" + srcNameTrimmed + "]") .build()); this.schedulerExecutor = MoreExecutors.listeningDecorator(schedulerRawExecutor); this.startTime = System.currentTimeMillis(); this.lastProgressTime = startTime; this.shuffleSecret = ShuffleUtils.getJobTokenSecretFromTokenBytes( inputContext.getServiceConsumerMetaData( TezConfiguration.TEZ_SHUFFLE_HANDLER_SERVICE_ID)); httpConnectionParams = ShuffleUtils.constructHttpShuffleConnectionParams(conf); LOG.info( this.getClass().getSimpleName() + " : numInputs=" + numInputs + ", compressionCodec=" + (codec == null ? "NoCompressionCodec" : codec.getClass().getName()) + ", numFetchers=" + numFetchers + ", ifileBufferSize=" + ifileBufferSize + ", ifileReadAheadEnabled=" + ifileReadAhead + ", ifileReadAheadLength=" + ifileReadAheadLength + ", " + httpConnectionParams.toString()); } public void run() throws IOException { Preconditions.checkState(inputManager != null, "InputManager must be configured"); ListenableFuture<Void> runShuffleFuture = schedulerExecutor.submit(schedulerCallable); Futures.addCallback(runShuffleFuture, new SchedulerFutureCallback()); // Shutdown this executor once this task, and the callback complete. 
schedulerExecutor.shutdown(); } private class RunShuffleCallable implements Callable<Void> { @Override public Void call() throws Exception { while (!isShutdown.get() && numCompletedInputs.get() < numInputs) { lock.lock(); try { if (runningFetchers.size() >= numFetchers || pendingHosts.isEmpty()) { if (numCompletedInputs.get() < numInputs) { wakeLoop.await(); } } } finally { lock.unlock(); } if (shuffleError != null) { // InputContext has already been informed of a fatal error. Relying on // tez to kill the task. break; } if (LOG.isDebugEnabled()) { LOG.debug("NumCompletedInputs: " + numCompletedInputs); } if (numCompletedInputs.get() < numInputs && !isShutdown.get()) { lock.lock(); try { int maxFetchersToRun = numFetchers - runningFetchers.size(); int count = 0; while (pendingHosts.peek() != null && !isShutdown.get()) { InputHost inputHost = null; try { inputHost = pendingHosts.take(); } catch (InterruptedException e) { if (isShutdown.get()) { LOG.info( "Interrupted and hasBeenShutdown, Breaking out of ShuffleScheduler Loop"); break; } else { throw e; } } if (LOG.isDebugEnabled()) { LOG.debug("Processing pending host: " + inputHost.toDetailedString()); } if (inputHost.getNumPendingInputs() > 0 && !isShutdown.get()) { LOG.info("Scheduling fetch for inputHost: " + inputHost.getIdentifier()); Fetcher fetcher = constructFetcherForHost(inputHost); runningFetchers.add(fetcher); if (isShutdown.get()) { LOG.info("hasBeenShutdown, Breaking out of ShuffleScheduler Loop"); } ListenableFuture<FetchResult> future = fetcherExecutor.submit(fetcher); Futures.addCallback(future, new FetchFutureCallback(fetcher)); if (++count >= maxFetchersToRun) { break; } } else { if (LOG.isDebugEnabled()) { LOG.debug( "Skipping host: " + inputHost.getIdentifier() + " since it has no inputs to process"); } } } } finally { lock.unlock(); } } } LOG.info( "Shutting down FetchScheduler, Was Interrupted: " + Thread.currentThread().isInterrupted()); // TODO NEWTEZ Maybe clean up inputs. if (!fetcherExecutor.isShutdown()) { fetcherExecutor.shutdownNow(); } return null; } } private Fetcher constructFetcherForHost(InputHost inputHost) { FetcherBuilder fetcherBuilder = new FetcherBuilder( ShuffleManager.this, httpConnectionParams, inputManager, inputContext.getApplicationId(), shuffleSecret, srcNameTrimmed); if (codec != null) { fetcherBuilder.setCompressionParameters(codec); } fetcherBuilder.setIFileParams(ifileReadAhead, ifileReadAheadLength); // Remove obsolete inputs from the list being given to the fetcher. Also // remove from the obsolete list. List<InputAttemptIdentifier> pendingInputsForHost = inputHost.clearAndGetPendingInputs(); for (Iterator<InputAttemptIdentifier> inputIter = pendingInputsForHost.iterator(); inputIter.hasNext(); ) { InputAttemptIdentifier input = inputIter.next(); // Avoid adding attempts which have already completed. 
if (completedInputSet.contains(input.getInputIdentifier())) { inputIter.remove(); continue; } // Avoid adding attempts which have been marked as OBSOLETE if (obsoletedInputs.contains(input)) { inputIter.remove(); } } // TODO NEWTEZ Maybe limit the number of inputs being given to a single // fetcher, especially in the case where #hosts < #fetchers fetcherBuilder.assignWork( inputHost.getHost(), inputHost.getPort(), inputHost.getSrcPhysicalIndex(), pendingInputsForHost); LOG.info( "Created Fetcher for host: " + inputHost.getHost() + ", with inputs: " + pendingInputsForHost); return fetcherBuilder.build(); } /////////////////// Methods for InputEventHandler public void addKnownInput( String hostName, int port, InputAttemptIdentifier srcAttemptIdentifier, int srcPhysicalIndex) { String identifier = InputHost.createIdentifier(hostName, port); InputHost host = knownSrcHosts.get(identifier); if (host == null) { host = new InputHost(hostName, port, inputContext.getApplicationId(), srcPhysicalIndex); assert identifier.equals(host.getIdentifier()); InputHost old = knownSrcHosts.putIfAbsent(identifier, host); if (old != null) { host = old; } } if (LOG.isDebugEnabled()) { LOG.debug("Adding input: " + srcAttemptIdentifier + ", to host: " + host); } host.addKnownInput(srcAttemptIdentifier); lock.lock(); try { boolean added = pendingHosts.offer(host); if (!added) { String errorMessage = "Unable to add host: " + host.getIdentifier() + " to pending queue"; LOG.error(errorMessage); throw new TezUncheckedException(errorMessage); } wakeLoop.signal(); } finally { lock.unlock(); } } public void addCompletedInputWithNoData(InputAttemptIdentifier srcAttemptIdentifier) { InputIdentifier inputIdentifier = srcAttemptIdentifier.getInputIdentifier(); LOG.info("No input data exists for SrcTask: " + inputIdentifier + ". Marking as complete."); if (!completedInputSet.contains(inputIdentifier)) { synchronized (completedInputSet) { if (!completedInputSet.contains(inputIdentifier)) { registerCompletedInput(new NullFetchedInput(srcAttemptIdentifier)); } } } // Awake the loop to check for termination. lock.lock(); try { wakeLoop.signal(); } finally { lock.unlock(); } } public void addCompletedInputWithData( InputAttemptIdentifier srcAttemptIdentifier, FetchedInput fetchedInput) throws IOException { InputIdentifier inputIdentifier = srcAttemptIdentifier.getInputIdentifier(); LOG.info("Received Data via Event: " + srcAttemptIdentifier + " to " + fetchedInput.getType()); // Count irrespective of whether this is a copy of an already fetched input lock.lock(); try { lastProgressTime = System.currentTimeMillis(); } finally { lock.unlock(); } boolean committed = false; if (!completedInputSet.contains(inputIdentifier)) { synchronized (completedInputSet) { if (!completedInputSet.contains(inputIdentifier)) { fetchedInput.commit(); committed = true; registerCompletedInput(fetchedInput); } } } if (!committed) { fetchedInput.abort(); // If this fails, the fetcher may attempt another // abort. } else { lock.lock(); try { // Signal the wakeLoop to check for termination. wakeLoop.signal(); } finally { lock.unlock(); } } } public synchronized void obsoleteKnownInput(InputAttemptIdentifier srcAttemptIdentifier) { obsoletedInputs.add(srcAttemptIdentifier); // TODO NEWTEZ Maybe inform the fetcher about this. For now, this is used during the initial // fetch list construction. 
} /////////////////// End of Methods for InputEventHandler /////////////////// Methods from FetcherCallbackHandler @Override public void fetchSucceeded( String host, InputAttemptIdentifier srcAttemptIdentifier, FetchedInput fetchedInput, long fetchedBytes, long decompressedLength, long copyDuration) throws IOException { InputIdentifier inputIdentifier = srcAttemptIdentifier.getInputIdentifier(); LOG.info( "Completed fetch for attempt: " + srcAttemptIdentifier + " to " + fetchedInput.getType()); // Count irrespective of whether this is a copy of an already fetched input lock.lock(); try { lastProgressTime = System.currentTimeMillis(); } finally { lock.unlock(); } boolean committed = false; if (!completedInputSet.contains(inputIdentifier)) { synchronized (completedInputSet) { if (!completedInputSet.contains(inputIdentifier)) { fetchedInput.commit(); committed = true; // Processing counters for completed and commit fetches only. Need // additional counters for excessive fetches - which primarily comes // in after speculation or retries. shuffledInputsCounter.increment(1); bytesShuffledCounter.increment(fetchedBytes); if (fetchedInput.getType() == Type.MEMORY) { bytesShuffledToMemCounter.increment(fetchedBytes); } else { bytesShuffledToDiskCounter.increment(fetchedBytes); } decompressedDataSizeCounter.increment(decompressedLength); registerCompletedInput(fetchedInput); } } } if (!committed) { fetchedInput.abort(); // If this fails, the fetcher may attempt another abort. } else { lock.lock(); try { // Signal the wakeLoop to check for termination. wakeLoop.signal(); } finally { lock.unlock(); } } // TODO NEWTEZ Maybe inform fetchers, in case they have an alternate attempt of the same task in // their queue. } @Override public void fetchFailed( String host, InputAttemptIdentifier srcAttemptIdentifier, boolean connectFailed) { // TODO NEWTEZ. Implement logic to report fetch failures after a threshold. // For now, reporting immediately. LOG.info( "Fetch failed for src: " + srcAttemptIdentifier + "InputIdentifier: " + srcAttemptIdentifier + ", connectFailed: " + connectFailed); failedShufflesCounter.increment(1); if (srcAttemptIdentifier == null) { String message = "Received fetchFailure for an unknown src (null)"; LOG.fatal(message); inputContext.fatalError(null, message); } else { InputReadErrorEvent readError = new InputReadErrorEvent( "Fetch failure while fetching from " + TezRuntimeUtils.getTaskAttemptIdentifier( inputContext.getSourceVertexName(), srcAttemptIdentifier.getInputIdentifier().getInputIndex(), srcAttemptIdentifier.getAttemptNumber()), srcAttemptIdentifier.getInputIdentifier().getInputIndex(), srcAttemptIdentifier.getAttemptNumber()); List<Event> failedEvents = Lists.newArrayListWithCapacity(1); failedEvents.add(readError); inputContext.sendEvents(failedEvents); } } /////////////////// End of Methods from FetcherCallbackHandler public void shutdown() throws InterruptedException { if (!isShutdown.getAndSet(true)) { // Shut down any pending fetchers LOG.info( "Shutting down pending fetchers on source" + srcNameTrimmed + ": " + runningFetchers.size()); lock.lock(); try { wakeLoop.signal(); // signal the fetch-scheduler for (Fetcher fetcher : runningFetchers) { fetcher.shutdown(); // This could be parallelized. 
} } finally { lock.unlock(); } if (this.schedulerExecutor != null && !this.schedulerExecutor.isShutdown()) { this.schedulerExecutor.shutdownNow(); } if (this.fetcherExecutor != null && !this.fetcherExecutor.isShutdown()) { this.fetcherExecutor.shutdownNow(); // Interrupts all running fetchers. } } // All threads are shutdown. It is safe to shutdown SSL factory if (httpConnectionParams.isSSLShuffleEnabled()) { HttpConnection.cleanupSSLFactory(); } } private void registerCompletedInput(FetchedInput fetchedInput) { lock.lock(); try { completedInputSet.add(fetchedInput.getInputAttemptIdentifier().getInputIdentifier()); completedInputs.add(fetchedInput); if (!inputReadyNotificationSent.getAndSet(true)) { // TODO Should eventually be controlled by Inputs which are processing the data. inputContext.inputIsReady(); } int numComplete = numCompletedInputs.incrementAndGet(); if (numComplete == numInputs) { LOG.info("All inputs fetched for input vertex : " + inputContext.getSourceVertexName()); } } finally { lock.unlock(); } } /////////////////// Methods for walking the available inputs /** @return true if there is another input ready for consumption. */ public boolean newInputAvailable() { FetchedInput head = completedInputs.peek(); if (head == null || head instanceof NullFetchedInput) { return false; } else { return true; } } /** @return true if all of the required inputs have been fetched. */ public boolean allInputsFetched() { lock.lock(); try { return numCompletedInputs.get() == numInputs; } finally { lock.unlock(); } } /** * @return the next available input, or null if there are no available inputs. This method will * block if there are currently no available inputs, but more may become available. */ public FetchedInput getNextInput() throws InterruptedException { FetchedInput input = null; do { // Check for no additional inputs lock.lock(); try { input = completedInputs.peek(); if (input == null && allInputsFetched()) { break; } } finally { lock.unlock(); } input = completedInputs.take(); // block } while (input instanceof NullFetchedInput); return input; } /////////////////// End of methods for walking the available inputs /** * Fake input that is added to the completed input list in case an input does not have any data. */ private class NullFetchedInput extends FetchedInput { public NullFetchedInput(InputAttemptIdentifier inputAttemptIdentifier) { super(Type.MEMORY, -1, -1, inputAttemptIdentifier, null); } @Override public OutputStream getOutputStream() throws IOException { throw new UnsupportedOperationException("Not supported for NullFetchedInput"); } @Override public InputStream getInputStream() throws IOException { throw new UnsupportedOperationException("Not supported for NullFetchedInput"); } @Override public void commit() throws IOException { throw new UnsupportedOperationException("Not supported for NullFetchedInput"); } @Override public void abort() throws IOException { throw new UnsupportedOperationException("Not supported for NullFetchedInput"); } @Override public void free() { throw new UnsupportedOperationException("Not supported for NullFetchedInput"); } } private class SchedulerFutureCallback implements FutureCallback<Void> { @Override public void onSuccess(Void result) { LOG.info("Scheduler thread completed"); } @Override public void onFailure(Throwable t) { if (isShutdown.get()) { LOG.info("Already shutdown. 
Ignoring error: " + t); } else { LOG.error("Scheduler failed with error: ", t); inputContext.fatalError(t, "Shuffle Scheduler Failed"); } } } private class FetchFutureCallback implements FutureCallback<FetchResult> { private final Fetcher fetcher; public FetchFutureCallback(Fetcher fetcher) { this.fetcher = fetcher; } private void doBookKeepingForFetcherComplete() { lock.lock(); try { runningFetchers.remove(fetcher); wakeLoop.signal(); } finally { lock.unlock(); } } @Override public void onSuccess(FetchResult result) { fetcher.shutdown(); if (isShutdown.get()) { LOG.info("Already shutdown. Ignoring event from fetcher"); } else { Iterable<InputAttemptIdentifier> pendingInputs = result.getPendingInputs(); if (pendingInputs != null && pendingInputs.iterator().hasNext()) { InputHost inputHost = knownSrcHosts.get(InputHost.createIdentifier(result.getHost(), result.getPort())); assert inputHost != null; for (InputAttemptIdentifier input : pendingInputs) { inputHost.addKnownInput(input); } pendingHosts.add(inputHost); } doBookKeepingForFetcherComplete(); } } @Override public void onFailure(Throwable t) { // Unsuccessful - the fetcher may not have shutdown correctly. Try shutting it down. fetcher.shutdown(); if (isShutdown.get()) { LOG.info("Already shutdown. Ignoring error from fetcher: " + t); } else { LOG.error("Fetcher failed with error: ", t); shuffleError = t; inputContext.fatalError(t, "Fetch failed"); doBookKeepingForFetcherComplete(); } } } }
/** @author tolgam */ public class Optimizer extends Observable implements Runnable { /** Population size */ private static final int POPULATION_SIZE = 5; /** Maximum generation to wait before finding an optima */ private static final int MAXIMUM_GENERATION = 40; /** Logger */ protected final Logger logger = LoggerFactory.getLogger(Optimizer.class); /** Population */ protected final SortedSet<Solution> population = new TreeSet<Solution>(); /** Hall of fame to put all the results found */ private final Set<Triple> blackListedTriples = new HashSet<Triple>(); /** Mutation operator used to generate new populations */ private final Generate generateOp; /** Evaluation operator to evaluate all the candidates */ private final Evaluate evaluateOp; /** Counter for statistics about the number of evaluations */ private int evaluationsCounter = 0; private final Request request; /** Activity control */ private boolean isPaused = false; private boolean isTerminated = false; private ReentrantLock pauseLock = new ReentrantLock(); private Condition unpaused = pauseLock.newCondition(); // Generation counter private int generation = 0; private DataLayer datalayer; /** * Optimizer * * @param datalayer * @param request * @param executor */ public Optimizer( final DataLayer datalayer, final Request request, final ExecutorService executor) { // Save a pointer to the request and the datalayer this.request = request; this.datalayer = datalayer; // Create the operators this.generateOp = new Generate(datalayer, request); this.evaluateOp = new Evaluate(request, blackListedTriples, executor); } /* * (non-Javadoc) * * @see java.lang.Runnable#run() */ public void run() { // Do not run something terminated if (isTerminated()) return; logger.info("Run optimizer"); generation = 0; while (!isTerminated()) { pauseLock.lock(); try { while (isPaused) unpaused.await(); if (isTerminated) return; } catch (InterruptedException ie) { // Finish return; } finally { pauseLock.unlock(); } // // Initialise the population with a dummy individual // if (population.isEmpty()) { Solution solution = new Solution(); for (Node_Variable variable : request.variables()) solution.add(new Binding(variable, Node.NULL)); population.add(solution); } // Increment the generation counter ++generation; // // Generate a new set of offspring and copy the parents into it // first // // logger.info("Generate"); Set<Solution> newPopulation = new HashSet<Solution>(); newPopulation.addAll(population); // Add the parents generateOp.createPopulation(population, newPopulation); // // Evaluate all of them // // logger.info("Evaluate " + newPopulation.size()); // Counts the number of different solutions evaluationsCounter += newPopulation.size() - population.size(); evaluateOp.evaluatePopulation(newPopulation); /* * String buffer = "Fitnesses "; for (Solution s : newPopulation) * buffer += s.getFitness() + " "; logger.info(buffer); */ // Provide feed back to the generation operator generateOp.updateProviderRewards(newPopulation); // // Get rid of the previous population and insert the kids // // logger.info("Cut"); population.clear(); population.addAll(newPopulation); while (population.size() > POPULATION_SIZE) population.remove(population.first()); // // Track for optimality // double topFitness = population.last().getFitness(); for (Solution s : population) { // Increment age if (s.getFitness() != topFitness) s.resetAge(); s.incrementAge(); // Check optimality s.setOptimal(false); if (s.getAge() >= MAXIMUM_GENERATION && s.getFitness() > 0) s.setOptimal(true); if 
(s.getFitness() == 1.0d) s.setOptimal(true); // If the solution is optimal add its (valid!) triples to the // black // list if (s.isOptimal()) { synchronized (blackListedTriples) { blackListedTriples.addAll(request.getTripleSet(s)); } } // Print solution // logger.info(s.toString()); } logger.info("Generation " + generation + ", best fitness=" + topFitness); for (Solution s : population) logger.info(s.toString()); // // Notify observers that a loop has been done // setChanged(); notifyObservers(population); // for (Solution s : population) // if (s.isOptimal()) // this.terminate(); // // Wait a bit for the data layer // datalayer.waitForLatencyBuffer(); // // Remove all optimum individuals from the population // List<Solution> toRemove = new ArrayList<Solution>(); for (Solution s : population) if (s.isOptimal()) toRemove.add(s); population.removeAll(toRemove); } } /** Stop the execution of the optimizer */ public void terminate() { logger.info("Terminate optimizer"); pauseLock.lock(); try { // Set the status to true isTerminated = true; } finally { pauseLock.unlock(); } } /** @return true if the optimizer is stopped */ public boolean isTerminated() { boolean res; pauseLock.lock(); try { res = isTerminated; } finally { pauseLock.unlock(); } return res; } /** Pause the algorithm */ public void pause() { logger.info("Pause optimizer " + this); pauseLock.lock(); try { isPaused = true; } finally { pauseLock.unlock(); } } /** @return true if the search algorithm is paused */ public boolean isPaused() { boolean res; pauseLock.lock(); try { res = isPaused; } finally { pauseLock.unlock(); } return res; } /** Continue the execution */ public void resume() { logger.info("Resume optimizer " + this); pauseLock.lock(); try { isPaused = false; unpaused.signalAll(); } finally { pauseLock.unlock(); } } /** @return the evaluations counter */ public int getEvaluationsCounter() { return evaluationsCounter; } /** @return the generations counter */ public int getGenerationsCounter() { return generation; } }
@SuppressWarnings({"unchecked", "rawtypes"}) public class DefaultSorter extends ExternalSorter implements IndexedSortable { private static final Log LOG = LogFactory.getLog(DefaultSorter.class); // TODO NEWTEZ Progress reporting to Tez framework. (making progress vs %complete) /** The size of each record in the index file for the map-outputs. */ public static final int MAP_OUTPUT_INDEX_RECORD_LENGTH = 24; private static final int APPROX_HEADER_LENGTH = 150; // k/v accounting private final IntBuffer kvmeta; // metadata overlay on backing store int kvstart; // marks origin of spill metadata int kvend; // marks end of spill metadata int kvindex; // marks end of fully serialized records int equator; // marks origin of meta/serialization int bufstart; // marks beginning of spill int bufend; // marks beginning of collectable int bufmark; // marks end of record int bufindex; // marks end of collected int bufvoid; // marks the point where we should stop // reading at the end of the buffer private final byte[] kvbuffer; // main output buffer private final byte[] b0 = new byte[0]; protected static final int VALSTART = 0; // val offset in acct protected static final int KEYSTART = 1; // key offset in acct protected static final int PARTITION = 2; // partition offset in acct protected static final int VALLEN = 3; // length of value protected static final int NMETA = 4; // num meta ints protected static final int METASIZE = NMETA * 4; // size in bytes // spill accounting final int maxRec; final int softLimit; boolean spillInProgress; int bufferRemaining; volatile Throwable sortSpillException = null; int numSpills = 0; final int minSpillsForCombine; final ReentrantLock spillLock = new ReentrantLock(); final Condition spillDone = spillLock.newCondition(); final Condition spillReady = spillLock.newCondition(); final BlockingBuffer bb = new BlockingBuffer(); volatile boolean spillThreadRunning = false; final SpillThread spillThread = new SpillThread(); final ArrayList<TezSpillRecord> indexCacheList = new ArrayList<TezSpillRecord>(); private final int indexCacheMemoryLimit; private int totalIndexCacheMemory; public DefaultSorter( OutputContext outputContext, Configuration conf, int numOutputs, long initialMemoryAvailable) throws IOException { super(outputContext, conf, numOutputs, initialMemoryAvailable); // sanity checks final float spillper = this.conf.getFloat( TezRuntimeConfiguration.TEZ_RUNTIME_SORT_SPILL_PERCENT, TezRuntimeConfiguration.TEZ_RUNTIME_SORT_SPILL_PERCENT_DEFAULT); final int sortmb = this.availableMemoryMb; if (spillper > (float) 1.0 || spillper <= (float) 0.0) { throw new IOException( "Invalid \"" + TezRuntimeConfiguration.TEZ_RUNTIME_SORT_SPILL_PERCENT + "\": " + spillper); } if ((sortmb & 0x7FF) != sortmb) { throw new IOException( "Invalid \"" + TezRuntimeConfiguration.TEZ_RUNTIME_IO_SORT_MB + "\": " + sortmb); } indexCacheMemoryLimit = this.conf.getInt( TezRuntimeConfiguration.TEZ_RUNTIME_INDEX_CACHE_MEMORY_LIMIT_BYTES, TezRuntimeConfiguration.TEZ_RUNTIME_INDEX_CACHE_MEMORY_LIMIT_BYTES_DEFAULT); // buffers and accounting int maxMemUsage = sortmb << 20; maxMemUsage -= maxMemUsage % METASIZE; kvbuffer = new byte[maxMemUsage]; bufvoid = kvbuffer.length; kvmeta = ByteBuffer.wrap(kvbuffer).order(ByteOrder.nativeOrder()).asIntBuffer(); setEquator(0); bufstart = bufend = bufindex = equator; kvstart = kvend = kvindex; maxRec = kvmeta.capacity() / NMETA; softLimit = (int) (kvbuffer.length * spillper); bufferRemaining = softLimit; if (LOG.isInfoEnabled()) { 
LOG.info(TezRuntimeConfiguration.TEZ_RUNTIME_IO_SORT_MB + ": " + sortmb); LOG.info("soft limit at " + softLimit); LOG.info("bufstart = " + bufstart + "; bufvoid = " + bufvoid); LOG.info("kvstart = " + kvstart + "; length = " + maxRec); } // k/v serialization valSerializer.open(bb); keySerializer.open(bb); spillInProgress = false; minSpillsForCombine = this.conf.getInt(TezRuntimeConfiguration.TEZ_RUNTIME_COMBINE_MIN_SPILLS, 3); spillThread.setDaemon(true); spillThread.setName( "SpillThread [" + TezUtilsInternal.cleanVertexName(outputContext.getDestinationVertexName() + "]")); spillLock.lock(); try { spillThread.start(); while (!spillThreadRunning) { spillDone.await(); } } catch (InterruptedException e) { throw new IOException("Spill thread failed to initialize", e); } finally { spillLock.unlock(); } if (sortSpillException != null) { throw new IOException("Spill thread failed to initialize", sortSpillException); } } @Override public void write(Object key, Object value) throws IOException { collect(key, value, partitioner.getPartition(key, value, partitions)); } /** * Serialize the key, value to intermediate storage. When this method returns, kvindex must refer * to sufficient unused storage to store one METADATA. */ synchronized void collect(Object key, Object value, final int partition) throws IOException { if (key.getClass() != keyClass) { throw new IOException( "Type mismatch in key from map: expected " + keyClass.getName() + ", received " + key.getClass().getName()); } if (value.getClass() != valClass) { throw new IOException( "Type mismatch in value from map: expected " + valClass.getName() + ", received " + value.getClass().getName()); } if (partition < 0 || partition >= partitions) { throw new IOException( "Illegal partition for " + key + " (" + partition + ")" + ", TotalPartitions: " + partitions); } checkSpillException(); bufferRemaining -= METASIZE; if (bufferRemaining <= 0) { // start spill if the thread is not running and the soft limit has been // reached spillLock.lock(); try { do { if (!spillInProgress) { final int kvbidx = 4 * kvindex; final int kvbend = 4 * kvend; // serialized, unspilled bytes always lie between kvindex and // bufindex, crossing the equator. 
Note that any void space // created by a reset must be included in "used" bytes final int bUsed = distanceTo(kvbidx, bufindex); final boolean bufsoftlimit = bUsed >= softLimit; if ((kvbend + METASIZE) % kvbuffer.length != equator - (equator % METASIZE)) { // spill finished, reclaim space resetSpill(); bufferRemaining = Math.min(distanceTo(bufindex, kvbidx) - 2 * METASIZE, softLimit - bUsed) - METASIZE; continue; } else if (bufsoftlimit && kvindex != kvend) { // spill records, if any collected; check latter, as it may // be possible for metadata alignment to hit spill pcnt startSpill(); final int avgRec = (int) (mapOutputByteCounter.getValue() / mapOutputRecordCounter.getValue()); // leave at least half the split buffer for serialization data // ensure that kvindex >= bufindex final int distkvi = distanceTo(bufindex, kvbidx); final int newPos = (bufindex + Math.max( 2 * METASIZE - 1, Math.min(distkvi / 2, distkvi / (METASIZE + avgRec) * METASIZE))) % kvbuffer.length; setEquator(newPos); bufmark = bufindex = newPos; final int serBound = 4 * kvend; // bytes remaining before the lock must be held and limits // checked is the minimum of three arcs: the metadata space, the // serialization space, and the soft limit bufferRemaining = Math.min( // metadata max distanceTo(bufend, newPos), Math.min( // serialization max distanceTo(newPos, serBound), // soft limit softLimit)) - 2 * METASIZE; } } } while (false); } finally { spillLock.unlock(); } } try { // serialize key bytes into buffer int keystart = bufindex; keySerializer.serialize(key); if (bufindex < keystart) { // wrapped the key; must make contiguous bb.shiftBufferedKey(); keystart = 0; } // serialize value bytes into buffer final int valstart = bufindex; valSerializer.serialize(value); // It's possible for records to have zero length, i.e. the serializer // will perform no writes. To ensure that the boundary conditions are // checked and that the kvindex invariant is maintained, perform a // zero-length write into the buffer. The logic monitoring this could be // moved into collect, but this is cleaner and inexpensive. For now, it // is acceptable. bb.write(b0, 0, 0); // the record must be marked after the preceding write, as the metadata // for this record are not yet written int valend = bb.markRecord(); mapOutputRecordCounter.increment(1); mapOutputByteCounter.increment(distanceTo(keystart, valend, bufvoid)); // write accounting info kvmeta.put(kvindex + PARTITION, partition); kvmeta.put(kvindex + KEYSTART, keystart); kvmeta.put(kvindex + VALSTART, valstart); kvmeta.put(kvindex + VALLEN, distanceTo(valstart, valend)); // advance kvindex kvindex = (int) (((long) kvindex - NMETA + kvmeta.capacity()) % kvmeta.capacity()); } catch (MapBufferTooSmallException e) { LOG.info("Record too large for in-memory buffer: " + e.getMessage()); spillSingleRecord(key, value, partition); mapOutputRecordCounter.increment(1); return; } } /** * Set the point from which meta and serialization data expand. The meta indices are aligned with * the buffer, so metadata never spans the ends of the circular buffer. 
*/ private void setEquator(int pos) { equator = pos; // set index prior to first entry, aligned at meta boundary final int aligned = pos - (pos % METASIZE); // Cast one of the operands to long to avoid integer overflow kvindex = (int) (((long) aligned - METASIZE + kvbuffer.length) % kvbuffer.length) / 4; if (LOG.isInfoEnabled()) { LOG.info("(EQUATOR) " + pos + " kvi " + kvindex + "(" + (kvindex * 4) + ")"); } } /** * The spill is complete, so set the buffer and meta indices to be equal to the new equator to * free space for continuing collection. Note that when kvindex == kvend == kvstart, the buffer is * empty. */ private void resetSpill() { final int e = equator; bufstart = bufend = e; final int aligned = e - (e % METASIZE); // set start/end to point to first meta record // Cast one of the operands to long to avoid integer overflow kvstart = kvend = (int) (((long) aligned - METASIZE + kvbuffer.length) % kvbuffer.length) / 4; if (LOG.isInfoEnabled()) { LOG.info( "(RESET) equator " + e + " kv " + kvstart + "(" + (kvstart * 4) + ")" + " kvi " + kvindex + "(" + (kvindex * 4) + ")"); } } /** * Compute the distance in bytes between two indices in the serialization buffer. * * @see #distanceTo(int,int,int) */ final int distanceTo(final int i, final int j) { return distanceTo(i, j, kvbuffer.length); } /** Compute the distance between two indices in the circular buffer given the max distance. */ int distanceTo(final int i, final int j, final int mod) { return i <= j ? j - i : mod - i + j; } /** For the given meta position, return the offset into the int-sized kvmeta buffer. */ int offsetFor(int metapos) { return (metapos % maxRec) * NMETA; } /** * Compare logical range, st i, j MOD offset capacity. Compare by partition, then by key. * * @see IndexedSortable#compare */ public int compare(final int mi, final int mj) { final int kvi = offsetFor(mi); final int kvj = offsetFor(mj); final int kvip = kvmeta.get(kvi + PARTITION); final int kvjp = kvmeta.get(kvj + PARTITION); // sort by partition if (kvip != kvjp) { return kvip - kvjp; } // sort by key return comparator.compare( kvbuffer, kvmeta.get(kvi + KEYSTART), kvmeta.get(kvi + VALSTART) - kvmeta.get(kvi + KEYSTART), kvbuffer, kvmeta.get(kvj + KEYSTART), kvmeta.get(kvj + VALSTART) - kvmeta.get(kvj + KEYSTART)); } final byte META_BUFFER_TMP[] = new byte[METASIZE]; /** * Swap metadata for items i,j * * @see IndexedSortable#swap */ public void swap(final int mi, final int mj) { int iOff = (mi % maxRec) * METASIZE; int jOff = (mj % maxRec) * METASIZE; System.arraycopy(kvbuffer, iOff, META_BUFFER_TMP, 0, METASIZE); System.arraycopy(kvbuffer, jOff, kvbuffer, iOff, METASIZE); System.arraycopy(META_BUFFER_TMP, 0, kvbuffer, jOff, METASIZE); } /** Inner class managing the spill of serialized records to disk. */ protected class BlockingBuffer extends DataOutputStream { public BlockingBuffer() { super(new Buffer()); } /** * Mark end of record. Note that this is required if the buffer is to cut the spill in the * proper place. */ public int markRecord() { bufmark = bufindex; return bufindex; } /** * Set position from last mark to end of writable buffer, then rewrite the data between last * mark and kvindex. This handles a special case where the key wraps around the buffer. If the * key is to be passed to a RawComparator, then it must be contiguous in the buffer. This * recopies the data in the buffer back into itself, but starting at the beginning of the * buffer. Note that this method should <b>only</b> be called immediately after detecting this * condition. 
To call it at any other time is undefined and would likely result in data loss or * corruption. * * @see #markRecord() */ protected void shiftBufferedKey() throws IOException { // spillLock unnecessary; both kvend and kvindex are current int headbytelen = bufvoid - bufmark; bufvoid = bufmark; final int kvbidx = 4 * kvindex; final int kvbend = 4 * kvend; final int avail = Math.min(distanceTo(0, kvbidx), distanceTo(0, kvbend)); if (bufindex + headbytelen < avail) { System.arraycopy(kvbuffer, 0, kvbuffer, headbytelen, bufindex); System.arraycopy(kvbuffer, bufvoid, kvbuffer, 0, headbytelen); bufindex += headbytelen; bufferRemaining -= kvbuffer.length - bufvoid; } else { byte[] keytmp = new byte[bufindex]; System.arraycopy(kvbuffer, 0, keytmp, 0, bufindex); bufindex = 0; out.write(kvbuffer, bufmark, headbytelen); out.write(keytmp); } } } public class Buffer extends OutputStream { private final byte[] scratch = new byte[1]; @Override public void write(int v) throws IOException { scratch[0] = (byte) v; write(scratch, 0, 1); } /** * Attempt to write a sequence of bytes to the collection buffer. This method will block if the * spill thread is running and it cannot write. * * @throws MapBufferTooSmallException if record is too large to deserialize into the collection * buffer. */ @Override public void write(byte b[], int off, int len) throws IOException { // must always verify the invariant that at least METASIZE bytes are // available beyond kvindex, even when len == 0 bufferRemaining -= len; if (bufferRemaining <= 0) { // writing these bytes could exhaust available buffer space or fill // the buffer to soft limit. check if spill or blocking are necessary boolean blockwrite = false; spillLock.lock(); try { do { checkSpillException(); final int kvbidx = 4 * kvindex; final int kvbend = 4 * kvend; // ser distance to key index final int distkvi = distanceTo(bufindex, kvbidx); // ser distance to spill end index final int distkve = distanceTo(bufindex, kvbend); // if kvindex is closer than kvend, then a spill is neither in // progress nor complete and reset since the lock was held. The // write should block only if there is insufficient space to // complete the current write, write the metadata for this record, // and write the metadata for the next record. If kvend is closer, // then the write should block if there is too little space for // either the metadata or the current write. Note that collect // ensures its metadata requirement with a zero-length write blockwrite = distkvi <= distkve ? distkvi <= len + 2 * METASIZE : distkve <= len || distanceTo(bufend, kvbidx) < 2 * METASIZE; if (!spillInProgress) { if (blockwrite) { if ((kvbend + METASIZE) % kvbuffer.length != equator - (equator % METASIZE)) { // spill finished, reclaim space // need to use meta exclusively; zero-len rec & 100% spill // pcnt would fail resetSpill(); // resetSpill doesn't move bufindex, kvindex bufferRemaining = Math.min(distkvi - 2 * METASIZE, softLimit - distanceTo(kvbidx, bufindex)) - len; continue; } // we have records we can spill; only spill if blocked if (kvindex != kvend) { startSpill(); // Blocked on this write, waiting for the spill just // initiated to finish. Instead of repositioning the marker // and copying the partial record, we set the record start // to be the new equator setEquator(bufmark); } else { // We have no buffered records, and this record is too large // to write into kvbuffer. 
We must spill it directly from // collect final int size = distanceTo(bufstart, bufindex) + len; setEquator(0); bufstart = bufend = bufindex = equator; kvstart = kvend = kvindex; bufvoid = kvbuffer.length; throw new MapBufferTooSmallException(size + " bytes"); } } } if (blockwrite) { // wait for spill try { while (spillInProgress) { spillDone.await(); } } catch (InterruptedException e) { throw new IOException("Buffer interrupted while waiting for the writer", e); } } } while (blockwrite); } finally { spillLock.unlock(); } } // here, we know that we have sufficient space to write if (bufindex + len > bufvoid) { final int gaplen = bufvoid - bufindex; System.arraycopy(b, off, kvbuffer, bufindex, gaplen); len -= gaplen; off += gaplen; bufindex = 0; } System.arraycopy(b, off, kvbuffer, bufindex, len); bufindex += len; } } @Override public void flush() throws IOException { LOG.info("Starting flush of map output"); spillLock.lock(); try { while (spillInProgress) { spillDone.await(); } checkSpillException(); final int kvbend = 4 * kvend; if ((kvbend + METASIZE) % kvbuffer.length != equator - (equator % METASIZE)) { // spill finished resetSpill(); } if (kvindex != kvend) { kvend = (kvindex + NMETA) % kvmeta.capacity(); bufend = bufmark; if (LOG.isInfoEnabled()) { LOG.info("Sorting & Spilling map output"); LOG.info("bufstart = " + bufstart + "; bufend = " + bufmark + "; bufvoid = " + bufvoid); LOG.info( "kvstart = " + kvstart + "(" + (kvstart * 4) + "); kvend = " + kvend + "(" + (kvend * 4) + "); length = " + (distanceTo(kvend, kvstart, kvmeta.capacity()) + 1) + "/" + maxRec); } sortAndSpill(); } } catch (InterruptedException e) { throw new IOException("Interrupted while waiting for the writer", e); } finally { spillLock.unlock(); } assert !spillLock.isHeldByCurrentThread(); // shut down spill thread and wait for it to exit. Since the preceding // ensures that it is finished with its work (and sortAndSpill did not // throw), we elect to use an interrupt instead of setting a flag. // Spilling simultaneously from this thread while the spill thread // finishes its work might be both a useful way to extend this and also // sufficient motivation for the latter approach. 
try { spillThread.interrupt(); spillThread.join(); } catch (InterruptedException e) { throw new IOException("Spill failed", e); } // release sort buffer before the merge // FIXME // kvbuffer = null; mergeParts(); Path outputPath = mapOutputFile.getOutputFile(); fileOutputByteCounter.increment(rfs.getFileStatus(outputPath).getLen()); } @Override public void close() throws IOException {} protected class SpillThread extends Thread { @Override public void run() { spillLock.lock(); spillThreadRunning = true; try { while (true) { spillDone.signal(); while (!spillInProgress) { spillReady.await(); } try { spillLock.unlock(); sortAndSpill(); } catch (Throwable t) { LOG.warn("Got an exception in sortAndSpill", t); sortSpillException = t; } finally { spillLock.lock(); if (bufend < bufstart) { bufvoid = kvbuffer.length; } kvstart = kvend; bufstart = bufend; spillInProgress = false; } } } catch (InterruptedException e) { Thread.currentThread().interrupt(); } finally { spillLock.unlock(); spillThreadRunning = false; } } } private void checkSpillException() throws IOException { final Throwable lspillException = sortSpillException; if (lspillException != null) { if (lspillException instanceof Error) { final String logMsg = "Task " + outputContext.getUniqueIdentifier() + " failed : " + ExceptionUtils.getStackTrace(lspillException); outputContext.fatalError(lspillException, logMsg); } throw new IOException("Spill failed", lspillException); } } private void startSpill() { assert !spillInProgress; kvend = (kvindex + NMETA) % kvmeta.capacity(); bufend = bufmark; spillInProgress = true; if (LOG.isInfoEnabled()) { LOG.info("Spilling map output"); LOG.info("bufstart = " + bufstart + "; bufend = " + bufmark + "; bufvoid = " + bufvoid); LOG.info( "kvstart = " + kvstart + "(" + (kvstart * 4) + "); kvend = " + kvend + "(" + (kvend * 4) + "); length = " + (distanceTo(kvend, kvstart, kvmeta.capacity()) + 1) + "/" + maxRec); } spillReady.signal(); } int getMetaStart() { return kvend / NMETA; } int getMetaEnd() { return 1 + // kvend is a valid record (kvstart >= kvend ? kvstart : kvmeta.capacity() + kvstart) / NMETA; } protected void sortAndSpill() throws IOException, InterruptedException { final int mstart = getMetaStart(); final int mend = getMetaEnd(); sorter.sort(this, mstart, mend, nullProgressable); spill(mstart, mend); } protected void spill(int mstart, int mend) throws IOException, InterruptedException { // approximate the length of the output file to be the length of the // buffer + header lengths for the partitions final long size = (bufend >= bufstart ? 
bufend - bufstart : (bufvoid - bufend) + bufstart) + partitions * APPROX_HEADER_LENGTH; FSDataOutputStream out = null; try { // create spill file final TezSpillRecord spillRec = new TezSpillRecord(partitions); final Path filename = mapOutputFile.getSpillFileForWrite(numSpills, size); out = rfs.create(filename); int spindex = mstart; final InMemValBytes value = createInMemValBytes(); for (int i = 0; i < partitions; ++i) { IFile.Writer writer = null; try { long segmentStart = out.getPos(); writer = new Writer(conf, out, keyClass, valClass, codec, spilledRecordsCounter, null); if (combiner == null) { // spill directly DataInputBuffer key = new DataInputBuffer(); while (spindex < mend && kvmeta.get(offsetFor(spindex) + PARTITION) == i) { final int kvoff = offsetFor(spindex); int keystart = kvmeta.get(kvoff + KEYSTART); int valstart = kvmeta.get(kvoff + VALSTART); key.reset(kvbuffer, keystart, valstart - keystart); getVBytesForOffset(kvoff, value); writer.append(key, value); ++spindex; } } else { int spstart = spindex; while (spindex < mend && kvmeta.get(offsetFor(spindex) + PARTITION) == i) { ++spindex; } // Note: we would like to avoid the combiner if we've fewer // than some threshold of records for a partition if (spstart != spindex) { TezRawKeyValueIterator kvIter = new MRResultIterator(spstart, spindex); if (LOG.isDebugEnabled()) { LOG.debug("Running combine processor"); } runCombineProcessor(kvIter, writer); } } // close the writer writer.close(); if (numSpills > 0) { additionalSpillBytesWritten.increment(writer.getCompressedLength()); numAdditionalSpills.increment(1); // Reset the value will be set during the final merge. outputBytesWithOverheadCounter.setValue(0); } else { // Set this up for the first write only. Subsequent ones will be handled in the final // merge. outputBytesWithOverheadCounter.increment(writer.getRawLength()); } // record offsets final TezIndexRecord rec = new TezIndexRecord(segmentStart, writer.getRawLength(), writer.getCompressedLength()); spillRec.putIndex(rec, i); writer = null; } finally { if (null != writer) writer.close(); } } if (totalIndexCacheMemory >= indexCacheMemoryLimit) { // create spill index file Path indexFilename = mapOutputFile.getSpillIndexFileForWrite( numSpills, partitions * MAP_OUTPUT_INDEX_RECORD_LENGTH); spillRec.writeToFile(indexFilename, conf); } else { indexCacheList.add(spillRec); totalIndexCacheMemory += spillRec.size() * MAP_OUTPUT_INDEX_RECORD_LENGTH; } LOG.info("Finished spill " + numSpills); ++numSpills; } finally { if (out != null) out.close(); } } /** * Handles the degenerate case where serialization fails to fit in the in-memory buffer, so we * must spill the record from collect directly to a spill file. Consider this "losing". */ private void spillSingleRecord(final Object key, final Object value, int partition) throws IOException { long size = kvbuffer.length + partitions * APPROX_HEADER_LENGTH; FSDataOutputStream out = null; try { // create spill file final TezSpillRecord spillRec = new TezSpillRecord(partitions); final Path filename = mapOutputFile.getSpillFileForWrite(numSpills, size); out = rfs.create(filename); // we don't run the combiner for a single record for (int i = 0; i < partitions; ++i) { IFile.Writer writer = null; try { long segmentStart = out.getPos(); // Create a new codec, don't care! 
writer = new IFile.Writer(conf, out, keyClass, valClass, codec, spilledRecordsCounter, null); if (i == partition) { final long recordStart = out.getPos(); writer.append(key, value); // Note that our map byte count will not be accurate with // compression mapOutputByteCounter.increment(out.getPos() - recordStart); } writer.close(); if (numSpills > 0) { additionalSpillBytesWritten.increment(writer.getCompressedLength()); numAdditionalSpills.increment(1); outputBytesWithOverheadCounter.setValue(0); } else { // Set this up for the first write only. Subsequent ones will be handled in the final // merge. outputBytesWithOverheadCounter.increment(writer.getRawLength()); } // record offsets TezIndexRecord rec = new TezIndexRecord(segmentStart, writer.getRawLength(), writer.getCompressedLength()); spillRec.putIndex(rec, i); writer = null; } catch (IOException e) { if (null != writer) writer.close(); throw e; } } if (totalIndexCacheMemory >= indexCacheMemoryLimit) { // create spill index file Path indexFilename = mapOutputFile.getSpillIndexFileForWrite( numSpills, partitions * MAP_OUTPUT_INDEX_RECORD_LENGTH); spillRec.writeToFile(indexFilename, conf); } else { indexCacheList.add(spillRec); totalIndexCacheMemory += spillRec.size() * MAP_OUTPUT_INDEX_RECORD_LENGTH; } ++numSpills; } finally { if (out != null) out.close(); } } protected int getInMemVBytesLength(int kvoff) { // get the keystart for the next serialized value to be the end // of this value. If this is the last value in the buffer, use bufend final int vallen = kvmeta.get(kvoff + VALLEN); assert vallen >= 0; return vallen; } /** * Given an offset, populate vbytes with the associated set of deserialized value bytes. Should * only be called during a spill. */ int getVBytesForOffset(int kvoff, InMemValBytes vbytes) { int vallen = getInMemVBytesLength(kvoff); vbytes.reset(kvbuffer, kvmeta.get(kvoff + VALSTART), vallen); return vallen; } /** Inner class wrapping valuebytes, used for appendRaw. 
*/ static class InMemValBytes extends DataInputBuffer { private byte[] buffer; private int start; private int length; private final int bufvoid; public InMemValBytes(int bufvoid) { this.bufvoid = bufvoid; } public void reset(byte[] buffer, int start, int length) { this.buffer = buffer; this.start = start; this.length = length; if (start + length > bufvoid) { this.buffer = new byte[this.length]; final int taillen = bufvoid - start; System.arraycopy(buffer, start, this.buffer, 0, taillen); System.arraycopy(buffer, 0, this.buffer, taillen, length - taillen); this.start = 0; } super.reset(this.buffer, this.start, this.length); } } InMemValBytes createInMemValBytes() { return new InMemValBytes(bufvoid); } protected class MRResultIterator implements TezRawKeyValueIterator { private final DataInputBuffer keybuf = new DataInputBuffer(); private final InMemValBytes vbytes = createInMemValBytes(); private final int end; private int current; public MRResultIterator(int start, int end) { this.end = end; current = start - 1; } public boolean next() throws IOException { return ++current < end; } public DataInputBuffer getKey() throws IOException { final int kvoff = offsetFor(current); keybuf.reset( kvbuffer, kvmeta.get(kvoff + KEYSTART), kvmeta.get(kvoff + VALSTART) - kvmeta.get(kvoff + KEYSTART)); return keybuf; } public DataInputBuffer getValue() throws IOException { getVBytesForOffset(offsetFor(current), vbytes); return vbytes; } public Progress getProgress() { return null; } public void close() {} } private void mergeParts() throws IOException { // get the approximate size of the final output/index files long finalOutFileSize = 0; long finalIndexFileSize = 0; final Path[] filename = new Path[numSpills]; final String taskIdentifier = outputContext.getUniqueIdentifier(); for (int i = 0; i < numSpills; i++) { filename[i] = mapOutputFile.getSpillFile(i); finalOutFileSize += rfs.getFileStatus(filename[i]).getLen(); } if (numSpills == 1) { // the spill is the final output sameVolRename(filename[0], mapOutputFile.getOutputFileForWriteInVolume(filename[0])); if (indexCacheList.size() == 0) { sameVolRename( mapOutputFile.getSpillIndexFile(0), mapOutputFile.getOutputIndexFileForWriteInVolume(filename[0])); } else { indexCacheList .get(0) .writeToFile(mapOutputFile.getOutputIndexFileForWriteInVolume(filename[0]), conf); } return; } // read in paged indices for (int i = indexCacheList.size(); i < numSpills; ++i) { Path indexFileName = mapOutputFile.getSpillIndexFile(i); indexCacheList.add(new TezSpillRecord(indexFileName, conf)); } // make correction in the length to include the sequence file header // lengths for each partition finalOutFileSize += partitions * APPROX_HEADER_LENGTH; finalIndexFileSize = partitions * MAP_OUTPUT_INDEX_RECORD_LENGTH; Path finalOutputFile = mapOutputFile.getOutputFileForWrite(finalOutFileSize); Path finalIndexFile = mapOutputFile.getOutputIndexFileForWrite(finalIndexFileSize); // The output stream for the final single output file FSDataOutputStream finalOut = rfs.create(finalOutputFile, true, 4096); if (numSpills == 0) { // TODO Change event generation to say there is no data rather than generating a dummy file // create dummy files TezSpillRecord sr = new TezSpillRecord(partitions); try { for (int i = 0; i < partitions; i++) { long segmentStart = finalOut.getPos(); Writer writer = new Writer(conf, finalOut, keyClass, valClass, codec, null, null); writer.close(); TezIndexRecord rec = new TezIndexRecord(segmentStart, writer.getRawLength(), writer.getCompressedLength()); // Covers 
the case of multiple spills. outputBytesWithOverheadCounter.increment(writer.getRawLength()); sr.putIndex(rec, i); } sr.writeToFile(finalIndexFile, conf); } finally { finalOut.close(); } return; } else { final TezSpillRecord spillRec = new TezSpillRecord(partitions); for (int parts = 0; parts < partitions; parts++) { // create the segments to be merged List<Segment> segmentList = new ArrayList<Segment>(numSpills); for (int i = 0; i < numSpills; i++) { TezIndexRecord indexRecord = indexCacheList.get(i).getIndex(parts); Segment s = new Segment( conf, rfs, filename[i], indexRecord.getStartOffset(), indexRecord.getPartLength(), codec, ifileReadAhead, ifileReadAheadLength, ifileBufferSize, true); segmentList.add(i, s); if (LOG.isDebugEnabled()) { LOG.debug( "TaskIdentifier=" + taskIdentifier + " Partition=" + parts + "Spill =" + i + "(" + indexRecord.getStartOffset() + "," + indexRecord.getRawLength() + ", " + indexRecord.getPartLength() + ")"); } } int mergeFactor = this.conf.getInt( TezRuntimeConfiguration.TEZ_RUNTIME_IO_SORT_FACTOR, TezRuntimeConfiguration.TEZ_RUNTIME_IO_SORT_FACTOR_DEFAULT); // sort the segments only if there are intermediate merges boolean sortSegments = segmentList.size() > mergeFactor; // merge TezRawKeyValueIterator kvIter = TezMerger.merge( conf, rfs, keyClass, valClass, codec, segmentList, mergeFactor, new Path(taskIdentifier), (RawComparator) ConfigUtils.getIntermediateOutputKeyComparator(conf), nullProgressable, sortSegments, true, null, spilledRecordsCounter, additionalSpillBytesRead, null); // Not using any Progress in TezMerger. Should just work. // write merged output to disk long segmentStart = finalOut.getPos(); Writer writer = new Writer(conf, finalOut, keyClass, valClass, codec, spilledRecordsCounter, null); if (combiner == null || numSpills < minSpillsForCombine) { TezMerger.writeFile( kvIter, writer, nullProgressable, TezRuntimeConfiguration.TEZ_RUNTIME_RECORDS_BEFORE_PROGRESS_DEFAULT); } else { runCombineProcessor(kvIter, writer); } writer.close(); // record offsets final TezIndexRecord rec = new TezIndexRecord(segmentStart, writer.getRawLength(), writer.getCompressedLength()); spillRec.putIndex(rec, parts); } spillRec.writeToFile(finalIndexFile, conf); finalOut.close(); for (int i = 0; i < numSpills; i++) { rfs.delete(filename[i], true); } } } }
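/*
 * A minimal standalone sketch (hypothetical demo class, not part of the sorter above) of the two
 * pieces of index arithmetic the collector relies on: distanceTo() measures forward distance on a
 * circular buffer, and setEquator() derives kvindex from a METASIZE-aligned position, widening to
 * long so the intermediate subtraction cannot overflow int.
 */
public class CircularIndexMathDemo {

  static final int METASIZE = 16; // one metadata record: 4 ints (NMETA) of 4 bytes each

  /** Forward distance from i to j on a circular buffer of capacity mod. */
  static int distanceTo(int i, int j, int mod) {
    return i <= j ? j - i : mod - i + j;
  }

  /** kvindex for a given equator position, mirroring the computation in setEquator(). */
  static int kvindexFor(int pos, int bufferLength) {
    final int aligned = pos - (pos % METASIZE);
    // the long cast keeps (aligned - METASIZE + bufferLength) from overflowing
    return (int) (((long) aligned - METASIZE + bufferLength) % bufferLength) / 4;
  }

  public static void main(String[] args) {
    final int mod = 100;
    System.out.println(distanceTo(10, 30, mod)); // 20: no wrap, plain subtraction
    System.out.println(distanceTo(90, 10, mod)); // 20: wraps around the end of the buffer
    // the metadata slot sits one aligned METASIZE block below the equator
    System.out.println(kvindexFor(48, 1 << 20)); // (48 - 16) / 4 = 8
  }
}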
// todo make thread safe and concurrent public class DbImpl implements DB { private final Options options; private final File databaseDir; private final TableCache tableCache; private final DbLock dbLock; private final VersionSet versions; private final AtomicBoolean shuttingDown = new AtomicBoolean(); private final ReentrantLock mutex = new ReentrantLock(); private final Condition backgroundCondition = mutex.newCondition(); private final List<Long> pendingOutputs = newArrayList(); // todo private LogWriter log; private MemTable memTable; private MemTable immutableMemTable; private final InternalKeyComparator internalKeyComparator; private volatile Throwable backgroundException; private ExecutorService compactionExecutor; private Future<?> backgroundCompaction; private ManualCompaction manualCompaction; public DbImpl(Options options, File databaseDir) throws IOException { Preconditions.checkNotNull(options, "options is null"); Preconditions.checkNotNull(databaseDir, "databaseDir is null"); this.options = options; if (this.options.compressionType() == CompressionType.ZLIB && !Zlib.available()) { // Disable zlib if it's not available. this.options.compressionType(CompressionType.NONE); } if (this.options.compressionType() == CompressionType.SNAPPY && !Snappy.available()) { // Disable snappy if it's not available. this.options.compressionType(CompressionType.NONE); } this.databaseDir = databaseDir; // use custom comparator if set DBComparator comparator = options.comparator(); UserComparator userComparator; if (comparator != null) { userComparator = new CustomUserComparator(comparator); } else { userComparator = new BytewiseComparator(); } internalKeyComparator = new InternalKeyComparator(userComparator); memTable = new MemTable(internalKeyComparator); immutableMemTable = null; ThreadFactory compactionThreadFactory = new ThreadFactoryBuilder() .setNameFormat("leveldb-compaction-%s") .setUncaughtExceptionHandler( new UncaughtExceptionHandler() { @Override public void uncaughtException(Thread t, Throwable e) { // todo need a real UncaughtExceptionHandler System.out.printf("%s%n", t); e.printStackTrace(); } }) .build(); compactionExecutor = Executors.newSingleThreadExecutor(compactionThreadFactory); // Reserve ten files or so for other uses and give the rest to TableCache.
int tableCacheSize = options.maxOpenFiles() - 10; tableCache = new TableCache( databaseDir, tableCacheSize, new InternalUserComparator(internalKeyComparator), options.verifyChecksums()); // create the version set // create the database dir if it does not already exist databaseDir.mkdirs(); Preconditions.checkArgument( databaseDir.exists(), "Database directory '%s' does not exist and could not be created", databaseDir); Preconditions.checkArgument( databaseDir.isDirectory(), "Database directory '%s' is not a directory", databaseDir); mutex.lock(); try { // lock the database dir dbLock = new DbLock(new File(databaseDir, Filename.lockFileName())); // verify the "current" file File currentFile = new File(databaseDir, Filename.currentFileName()); if (!currentFile.canRead()) { Preconditions.checkArgument( options.createIfMissing(), "Database '%s' does not exist and the create if missing option is disabled", databaseDir); } else { Preconditions.checkArgument( !options.errorIfExists(), "Database '%s' exists and the error if exists option is enabled", databaseDir); } versions = new VersionSet(databaseDir, tableCache, internalKeyComparator); // load (and recover) current version versions.recover(); // Recover from all newer log files than the ones named in the // descriptor (new log files may have been added by the previous // incarnation without registering them in the descriptor). // // Note that PrevLogNumber() is no longer used, but we pay // attention to it in case we are recovering a database // produced by an older version of leveldb. long minLogNumber = versions.getLogNumber(); long previousLogNumber = versions.getPrevLogNumber(); List<File> filenames = Filename.listFiles(databaseDir); List<Long> logs = Lists.newArrayList(); for (File filename : filenames) { FileInfo fileInfo = Filename.parseFileName(filename); if (fileInfo != null && fileInfo.getFileType() == FileType.LOG && ((fileInfo.getFileNumber() >= minLogNumber) || (fileInfo.getFileNumber() == previousLogNumber))) { logs.add(fileInfo.getFileNumber()); } } // Recover in the order in which the logs were generated VersionEdit edit = new VersionEdit(); Collections.sort(logs); for (Long fileNumber : logs) { long maxSequence = recoverLogFile(fileNumber, edit); if (versions.getLastSequence() < maxSequence) { versions.setLastSequence(maxSequence); } } // open transaction log long logFileNumber = versions.getNextFileNumber(); this.log = Logs.createLogWriter( new File(databaseDir, Filename.logFileName(logFileNumber)), logFileNumber); edit.setLogNumber(log.getFileNumber()); // apply recovered edits versions.logAndApply(edit); // cleanup unused files deleteObsoleteFiles(); // schedule compactions maybeScheduleCompaction(); } finally { mutex.unlock(); } } public void close() { if (shuttingDown.getAndSet(true)) { return; } mutex.lock(); try { while (backgroundCompaction != null) { backgroundCondition.awaitUninterruptibly(); } } finally { mutex.unlock(); } compactionExecutor.shutdown(); try { compactionExecutor.awaitTermination(1, TimeUnit.DAYS); } catch (InterruptedException e) { Thread.currentThread().interrupt(); } try { versions.destroy(); } catch (IOException ignored) { } try { log.close(); } catch (IOException ignored) { } tableCache.close(); dbLock.release(); } @Override public String getProperty(String name) { checkBackgroundException(); return null; } private void deleteObsoleteFiles() { Preconditions.checkState(mutex.isHeldByCurrentThread()); // Make a set of all of the live files List<Long> live = newArrayList(this.pendingOutputs); for 
(FileMetaData fileMetaData : versions.getLiveFiles()) { live.add(fileMetaData.getNumber()); } for (File file : Filename.listFiles(databaseDir)) { FileInfo fileInfo = Filename.parseFileName(file); if (fileInfo == null) continue; long number = fileInfo.getFileNumber(); boolean keep = true; switch (fileInfo.getFileType()) { case LOG: keep = ((number >= versions.getLogNumber()) || (number == versions.getPrevLogNumber())); break; case DESCRIPTOR: // Keep my manifest file, and any newer incarnations' // (in case there is a race that allows other incarnations) keep = (number >= versions.getManifestFileNumber()); break; case TABLE: keep = live.contains(number); break; case TEMP: // Any temp files that are currently being written to must // be recorded in pending_outputs_, which is inserted into "live" keep = live.contains(number); break; case CURRENT: case DB_LOCK: case INFO_LOG: keep = true; break; } if (!keep) { if (fileInfo.getFileType() == FileType.TABLE) { tableCache.evict(number); } // todo info logging system needed // Log(options_.info_log, "Delete type=%d #%lld\n", // int(type), // static_cast < unsigned long long>(number)); file.delete(); } } } public void flushMemTable() { mutex.lock(); try { // force compaction makeRoomForWrite(true); // todo bg_error code while (immutableMemTable != null) { backgroundCondition.awaitUninterruptibly(); } } finally { mutex.unlock(); } } public void compactRange(int level, Slice start, Slice end) { Preconditions.checkArgument(level >= 0, "level is negative"); Preconditions.checkArgument( level + 1 < NUM_LEVELS, "level is greater than or equal to %s", NUM_LEVELS); Preconditions.checkNotNull(start, "start is null"); Preconditions.checkNotNull(end, "end is null"); mutex.lock(); try { while (this.manualCompaction != null) { backgroundCondition.awaitUninterruptibly(); } ManualCompaction manualCompaction = new ManualCompaction(level, start, end); this.manualCompaction = manualCompaction; maybeScheduleCompaction(); while (this.manualCompaction == manualCompaction) { backgroundCondition.awaitUninterruptibly(); } } finally { mutex.unlock(); } } private void maybeScheduleCompaction() { Preconditions.checkState(mutex.isHeldByCurrentThread()); if (backgroundCompaction != null) { // Already scheduled } else if (shuttingDown.get()) { // DB is being shutdown; no more background compactions } else if (immutableMemTable == null && manualCompaction == null && !versions.needsCompaction()) { // No work to be done } else { backgroundCompaction = compactionExecutor.submit( new Callable<Void>() { @Override public Void call() throws Exception { try { backgroundCall(); } catch (DatabaseShutdownException ignored) { } catch (Throwable e) { backgroundException = e; } return null; } }); } } public void checkBackgroundException() { Throwable e = backgroundException; if (e != null) { throw new BackgroundProcessingException(e); } } private void backgroundCall() throws IOException { mutex.lock(); try { if (backgroundCompaction == null) { return; } try { if (!shuttingDown.get()) { backgroundCompaction(); } } finally { backgroundCompaction = null; } } finally { try { // Previous compaction may have produced too many files in a level, // so reschedule another compaction if needed. 
maybeScheduleCompaction(); } finally { try { backgroundCondition.signalAll(); } finally { mutex.unlock(); } } } } private void backgroundCompaction() throws IOException { Preconditions.checkState(mutex.isHeldByCurrentThread()); compactMemTableInternal(); Compaction compaction; if (manualCompaction != null) { compaction = versions.compactRange( manualCompaction.level, new InternalKey(manualCompaction.begin, MAX_SEQUENCE_NUMBER, ValueType.VALUE), new InternalKey(manualCompaction.end, 0, ValueType.DELETION)); } else { compaction = versions.pickCompaction(); } if (compaction == null) { // no compaction } else if (manualCompaction == null && compaction.isTrivialMove()) { // Move file to next level Preconditions.checkState(compaction.getLevelInputs().size() == 1); FileMetaData fileMetaData = compaction.getLevelInputs().get(0); compaction.getEdit().deleteFile(compaction.getLevel(), fileMetaData.getNumber()); compaction.getEdit().addFile(compaction.getLevel() + 1, fileMetaData); versions.logAndApply(compaction.getEdit()); // log } else { CompactionState compactionState = new CompactionState(compaction); doCompactionWork(compactionState); cleanupCompaction(compactionState); } // manual compaction complete if (manualCompaction != null) { manualCompaction = null; } } private void cleanupCompaction(CompactionState compactionState) { Preconditions.checkState(mutex.isHeldByCurrentThread()); if (compactionState.builder != null) { compactionState.builder.abandon(); } else { Preconditions.checkArgument(compactionState.outfile == null); } for (FileMetaData output : compactionState.outputs) { pendingOutputs.remove(output.getNumber()); } } private long recoverLogFile(long fileNumber, VersionEdit edit) throws IOException { Preconditions.checkState(mutex.isHeldByCurrentThread()); File file = new File(databaseDir, Filename.logFileName(fileNumber)); FileChannel channel = new FileInputStream(file).getChannel(); try { LogMonitor logMonitor = LogMonitors.logMonitor(); LogReader logReader = new LogReader(channel, logMonitor, true, 0); // Log(options_.info_log, "Recovering log #%llu", (unsigned long long) log_number); // Read all the records and add to a memtable long maxSequence = 0; MemTable memTable = null; for (Slice record = logReader.readRecord(); record != null; record = logReader.readRecord()) { SliceInput sliceInput = record.input(); // read header if (sliceInput.available() < 12) { logMonitor.corruption(sliceInput.available(), "log record too small"); continue; } long sequenceBegin = sliceInput.readLong(); int updateSize = sliceInput.readInt(); // read entries WriteBatchImpl writeBatch = readWriteBatch(sliceInput, updateSize); // apply entries to memTable if (memTable == null) { memTable = new MemTable(internalKeyComparator); } writeBatch.forEach(new InsertIntoHandler(memTable, sequenceBegin)); // update the maxSequence long lastSequence = sequenceBegin + updateSize - 1; if (lastSequence > maxSequence) { maxSequence = lastSequence; } // flush mem table if necessary if (memTable.approximateMemoryUsage() > options.writeBufferSize()) { writeLevel0Table(memTable, edit, null); memTable = null; } } // flush mem table if (memTable != null && !memTable.isEmpty()) { writeLevel0Table(memTable, edit, null); } return maxSequence; } finally { channel.close(); } } @Override public byte[] get(byte[] key) throws DBException { return get(key, new ReadOptions()); } @Override public byte[] get(byte[] key, ReadOptions options) throws DBException { checkBackgroundException(); LookupKey lookupKey; mutex.lock(); try { SnapshotImpl 
snapshot = getSnapshot(options); lookupKey = new LookupKey(Slices.wrappedBuffer(key), snapshot.getLastSequence()); // First look in the memtable, then in the immutable memtable (if any). LookupResult lookupResult = memTable.get(lookupKey); if (lookupResult != null) { Slice value = lookupResult.getValue(); if (value == null) { return null; } return value.getBytes(); } if (immutableMemTable != null) { lookupResult = immutableMemTable.get(lookupKey); if (lookupResult != null) { Slice value = lookupResult.getValue(); if (value == null) { return null; } return value.getBytes(); } } } finally { mutex.unlock(); } // Not in memTables; try live files in level order LookupResult lookupResult = versions.get(lookupKey); // schedule compaction if necessary mutex.lock(); try { if (versions.needsCompaction()) { maybeScheduleCompaction(); } } finally { mutex.unlock(); } if (lookupResult != null) { Slice value = lookupResult.getValue(); if (value != null) { return value.getBytes(); } } return null; } @Override public void put(byte[] key, byte[] value) throws DBException { put(key, value, new WriteOptions()); } @Override public Snapshot put(byte[] key, byte[] value, WriteOptions options) throws DBException { return writeInternal(new WriteBatchImpl().put(key, value), options); } @Override public void delete(byte[] key) throws DBException { writeInternal(new WriteBatchImpl().delete(key), new WriteOptions()); } @Override public Snapshot delete(byte[] key, WriteOptions options) throws DBException { return writeInternal(new WriteBatchImpl().delete(key), options); } @Override public void write(WriteBatch updates) throws DBException { writeInternal((WriteBatchImpl) updates, new WriteOptions()); } @Override public Snapshot write(WriteBatch updates, WriteOptions options) throws DBException { return writeInternal((WriteBatchImpl) updates, options); } public Snapshot writeInternal(WriteBatchImpl updates, WriteOptions options) throws DBException { checkBackgroundException(); mutex.lock(); try { long sequenceEnd; if (updates.size() != 0) { makeRoomForWrite(false); // Get sequence numbers for this change set final long sequenceBegin = versions.getLastSequence() + 1; sequenceEnd = sequenceBegin + updates.size() - 1; // Reserve this sequence in the version set versions.setLastSequence(sequenceEnd); // Log write Slice record = writeWriteBatch(updates, sequenceBegin); try { log.addRecord(record, options.sync()); } catch (IOException e) { throw Throwables.propagate(e); } // Update memtable updates.forEach(new InsertIntoHandler(memTable, sequenceBegin)); } else { sequenceEnd = versions.getLastSequence(); } if (options.snapshot()) { return new SnapshotImpl(versions.getCurrent(), sequenceEnd); } else { return null; } } finally { mutex.unlock(); } } @Override public WriteBatch createWriteBatch() { checkBackgroundException(); return new WriteBatchImpl(); } @Override public SeekingIteratorAdapter iterator() { return iterator(new ReadOptions()); } public SeekingIteratorAdapter iterator(ReadOptions options) { checkBackgroundException(); mutex.lock(); try { DbIterator rawIterator = internalIterator(); // filter any entries not visible in our snapshot SnapshotImpl snapshot = getSnapshot(options); SnapshotSeekingIterator snapshotIterator = new SnapshotSeekingIterator( rawIterator, snapshot, internalKeyComparator.getUserComparator()); return new SeekingIteratorAdapter(snapshotIterator); } finally { mutex.unlock(); } } SeekingIterable<InternalKey, Slice> internalIterable() { return new SeekingIterable<InternalKey, Slice>() { @Override 
public DbIterator iterator() { return internalIterator(); } }; } DbIterator internalIterator() { mutex.lock(); try { // merge together the memTable, immutableMemTable, and tables in version set MemTableIterator iterator = null; if (immutableMemTable != null) { iterator = immutableMemTable.iterator(); } Version current = versions.getCurrent(); return new DbIterator( memTable.iterator(), iterator, current.getLevel0Files(), current.getLevelIterators(), internalKeyComparator); } finally { mutex.unlock(); } } public Snapshot getSnapshot() { checkBackgroundException(); mutex.lock(); try { return new SnapshotImpl(versions.getCurrent(), versions.getLastSequence()); } finally { mutex.unlock(); } } private SnapshotImpl getSnapshot(ReadOptions options) { SnapshotImpl snapshot; if (options.snapshot() != null) { snapshot = (SnapshotImpl) options.snapshot(); } else { snapshot = new SnapshotImpl(versions.getCurrent(), versions.getLastSequence()); snapshot.close(); // To avoid holding the snapshot active.. } return snapshot; } private void makeRoomForWrite(boolean force) { Preconditions.checkState(mutex.isHeldByCurrentThread()); boolean allowDelay = !force; while (true) { // todo background processing system need work // if (!bg_error_.ok()) { // // Yield previous error // s = bg_error_; // break; // } else if (allowDelay && versions.numberOfFilesInLevel(0) > L0_SLOWDOWN_WRITES_TRIGGER) { // We are getting close to hitting a hard limit on the number of // L0 files. Rather than delaying a single write by several // seconds when we hit the hard limit, start delaying each // individual write by 1ms to reduce latency variance. Also, // this delay hands over some CPU to the compaction thread in // case it is sharing the same core as the writer. try { mutex.unlock(); Thread.sleep(1); } catch (InterruptedException e) { Thread.currentThread().interrupt(); throw new RuntimeException(e); } finally { mutex.lock(); } // Do not delay a single write more than once allowDelay = false; } else if (!force && memTable.approximateMemoryUsage() <= options.writeBufferSize()) { // There is room in current memtable break; } else if (immutableMemTable != null) { // We have filled up the current memtable, but the previous // one is still being compacted, so we wait. backgroundCondition.awaitUninterruptibly(); } else if (versions.numberOfFilesInLevel(0) >= L0_STOP_WRITES_TRIGGER) { // There are too many level-0 files. 
// Log(options_.info_log, "waiting...\n"); backgroundCondition.awaitUninterruptibly(); } else { // Attempt to switch to a new memtable and trigger compaction of old Preconditions.checkState(versions.getPrevLogNumber() == 0); // close the existing log try { log.close(); } catch (IOException e) { throw new RuntimeException("Unable to close log file " + log.getFile(), e); } // open a new log long logNumber = versions.getNextFileNumber(); try { this.log = Logs.createLogWriter( new File(databaseDir, Filename.logFileName(logNumber)), logNumber); } catch (IOException e) { throw new RuntimeException( "Unable to open new log file " + new File(databaseDir, Filename.logFileName(logNumber)).getAbsoluteFile(), e); } // create a new mem table immutableMemTable = memTable; memTable = new MemTable(internalKeyComparator); // Do not force another compaction; there is space available force = false; maybeScheduleCompaction(); } } } public void compactMemTable() throws IOException { mutex.lock(); try { compactMemTableInternal(); } finally { mutex.unlock(); } } private void compactMemTableInternal() throws IOException { Preconditions.checkState(mutex.isHeldByCurrentThread()); if (immutableMemTable == null) { return; } try { // Save the contents of the memtable as a new Table VersionEdit edit = new VersionEdit(); Version base = versions.getCurrent(); writeLevel0Table(immutableMemTable, edit, base); if (shuttingDown.get()) { throw new DatabaseShutdownException("Database shutdown during memtable compaction"); } // Replace immutable memtable with the generated Table edit.setPreviousLogNumber(0); edit.setLogNumber(log.getFileNumber()); // Earlier logs no longer needed versions.logAndApply(edit); immutableMemTable = null; deleteObsoleteFiles(); } finally { backgroundCondition.signalAll(); } } private void writeLevel0Table(MemTable mem, VersionEdit edit, Version base) throws IOException { Preconditions.checkState(mutex.isHeldByCurrentThread()); // skip empty mem table if (mem.isEmpty()) { return; } // write the memtable to a new sstable long fileNumber = versions.getNextFileNumber(); pendingOutputs.add(fileNumber); mutex.unlock(); FileMetaData meta; try { meta = buildTable(mem, fileNumber); } finally { mutex.lock(); } pendingOutputs.remove(fileNumber); // Note that if file size is zero, the file has been deleted and // should not be added to the manifest.
int level = 0; if (meta != null && meta.getFileSize() > 0) { Slice minUserKey = meta.getSmallest().getUserKey(); Slice maxUserKey = meta.getLargest().getUserKey(); if (base != null) { level = base.pickLevelForMemTableOutput(minUserKey, maxUserKey); } edit.addFile(level, meta); } } private FileMetaData buildTable(SeekingIterable<InternalKey, Slice> data, long fileNumber) throws IOException { File file = new File(databaseDir, Filename.tableFileName(fileNumber)); try { InternalKey smallest = null; InternalKey largest = null; FileChannel channel = new FileOutputStream(file).getChannel(); try { TableBuilder tableBuilder = new TableBuilder(options, channel, new InternalUserComparator(internalKeyComparator)); for (Entry<InternalKey, Slice> entry : data) { // update keys InternalKey key = entry.getKey(); if (smallest == null) { smallest = key; } largest = key; tableBuilder.add(key.encode(), entry.getValue()); } tableBuilder.finish(); } finally { try { channel.force(true); } finally { channel.close(); } } if (smallest == null) { return null; } FileMetaData fileMetaData = new FileMetaData(fileNumber, file.length(), smallest, largest); // verify table can be opened tableCache.newIterator(fileMetaData); pendingOutputs.remove(fileNumber); return fileMetaData; } catch (IOException e) { file.delete(); throw e; } } private void doCompactionWork(CompactionState compactionState) throws IOException { Preconditions.checkState(mutex.isHeldByCurrentThread()); Preconditions.checkArgument( versions.numberOfBytesInLevel(compactionState.getCompaction().getLevel()) > 0); Preconditions.checkArgument(compactionState.builder == null); Preconditions.checkArgument(compactionState.outfile == null); // todo track snapshots compactionState.smallestSnapshot = versions.getLastSequence(); // Release mutex while we're actually doing the compaction work mutex.unlock(); try { MergingIterator iterator = versions.makeInputIterator(compactionState.compaction); Slice currentUserKey = null; boolean hasCurrentUserKey = false; long lastSequenceForKey = MAX_SEQUENCE_NUMBER; while (iterator.hasNext() && !shuttingDown.get()) { // always give priority to compacting the current mem table mutex.lock(); try { compactMemTableInternal(); } finally { mutex.unlock(); } InternalKey key = iterator.peek().getKey(); if (compactionState.compaction.shouldStopBefore(key) && compactionState.builder != null) { finishCompactionOutputFile(compactionState); } // Handle key/value, add to state, etc. 
boolean drop = false; // todo if key doesn't parse (it is corrupted), if (false /*!ParseInternalKey(key, &ikey)*/) { // do not hide error keys currentUserKey = null; hasCurrentUserKey = false; lastSequenceForKey = MAX_SEQUENCE_NUMBER; } else { if (!hasCurrentUserKey || internalKeyComparator.getUserComparator().compare(key.getUserKey(), currentUserKey) != 0) { // First occurrence of this user key currentUserKey = key.getUserKey(); hasCurrentUserKey = true; lastSequenceForKey = MAX_SEQUENCE_NUMBER; } if (lastSequenceForKey <= compactionState.smallestSnapshot) { // Hidden by a newer entry for same user key drop = true; // (A) } else if (key.getValueType() == ValueType.DELETION && key.getSequenceNumber() <= compactionState.smallestSnapshot && compactionState.compaction.isBaseLevelForKey(key.getUserKey())) { // For this user key: // (1) there is no data in higher levels // (2) data in lower levels will have larger sequence numbers // (3) data in layers that are being compacted here and have // smaller sequence numbers will be dropped in the next // few iterations of this loop (by rule (A) above). // Therefore this deletion marker is obsolete and can be dropped. drop = true; } lastSequenceForKey = key.getSequenceNumber(); } if (!drop) { // Open output file if necessary if (compactionState.builder == null) { openCompactionOutputFile(compactionState); } if (compactionState.builder.getEntryCount() == 0) { compactionState.currentSmallest = key; } compactionState.currentLargest = key; compactionState.builder.add(key.encode(), iterator.peek().getValue()); // Close output file if it is big enough if (compactionState.builder.getFileSize() >= compactionState.compaction.getMaxOutputFileSize()) { finishCompactionOutputFile(compactionState); } } iterator.next(); } if (shuttingDown.get()) { throw new DatabaseShutdownException("DB shutdown during compaction"); } if (compactionState.builder != null) { finishCompactionOutputFile(compactionState); } } finally { mutex.lock(); } // todo port CompactionStats code installCompactionResults(compactionState); } private void openCompactionOutputFile(CompactionState compactionState) throws FileNotFoundException { Preconditions.checkNotNull(compactionState, "compactionState is null"); Preconditions.checkArgument( compactionState.builder == null, "compactionState builder is not null"); mutex.lock(); try { long fileNumber = versions.getNextFileNumber(); pendingOutputs.add(fileNumber); compactionState.currentFileNumber = fileNumber; compactionState.currentFileSize = 0; compactionState.currentSmallest = null; compactionState.currentLargest = null; File file = new File(databaseDir, Filename.tableFileName(fileNumber)); compactionState.outfile = new FileOutputStream(file).getChannel(); compactionState.builder = new TableBuilder( options, compactionState.outfile, new InternalUserComparator(internalKeyComparator)); } finally { mutex.unlock(); } } private void finishCompactionOutputFile(CompactionState compactionState) throws IOException { Preconditions.checkNotNull(compactionState, "compactionState is null"); Preconditions.checkArgument(compactionState.outfile != null); Preconditions.checkArgument(compactionState.builder != null); long outputNumber = compactionState.currentFileNumber; Preconditions.checkArgument(outputNumber != 0); long currentEntries = compactionState.builder.getEntryCount(); compactionState.builder.finish(); long currentBytes = compactionState.builder.getFileSize(); compactionState.currentFileSize = currentBytes; compactionState.totalBytes += currentBytes;
FileMetaData currentFileMetaData = new FileMetaData( compactionState.currentFileNumber, compactionState.currentFileSize, compactionState.currentSmallest, compactionState.currentLargest); compactionState.outputs.add(currentFileMetaData); compactionState.builder = null; compactionState.outfile.force(true); compactionState.outfile.close(); compactionState.outfile = null; if (currentEntries > 0) { // Verify that the table is usable tableCache.newIterator(outputNumber); } } private void installCompactionResults(CompactionState compact) throws IOException { Preconditions.checkState(mutex.isHeldByCurrentThread()); // Add compaction outputs compact.compaction.addInputDeletions(compact.compaction.getEdit()); int level = compact.compaction.getLevel(); for (FileMetaData output : compact.outputs) { compact.compaction.getEdit().addFile(level + 1, output); pendingOutputs.remove(output.getNumber()); } try { versions.logAndApply(compact.compaction.getEdit()); deleteObsoleteFiles(); } catch (IOException e) { // Compaction failed for some reason. Simply discard the work and try again later. // Discard any files we may have created during this failed compaction for (FileMetaData output : compact.outputs) { File file = new File(databaseDir, Filename.tableFileName(output.getNumber())); file.delete(); } compact.outputs.clear(); } } int numberOfFilesInLevel(int level) { return versions.getCurrent().numberOfFilesInLevel(level); } @Override public long[] getApproximateSizes(Range... ranges) { Preconditions.checkNotNull(ranges, "ranges is null"); long[] sizes = new long[ranges.length]; for (int i = 0; i < ranges.length; i++) { Range range = ranges[i]; sizes[i] = getApproximateSizes(range); } return sizes; } public long getApproximateSizes(Range range) { Version v = versions.getCurrent(); InternalKey startKey = new InternalKey( Slices.wrappedBuffer(range.start()), SequenceNumber.MAX_SEQUENCE_NUMBER, ValueType.VALUE); InternalKey limitKey = new InternalKey( Slices.wrappedBuffer(range.limit()), SequenceNumber.MAX_SEQUENCE_NUMBER, ValueType.VALUE); long startOffset = v.getApproximateOffsetOf(startKey); long limitOffset = v.getApproximateOffsetOf(limitKey); return (limitOffset >= startOffset ? 
limitOffset - startOffset : 0); } public long getMaxNextLevelOverlappingBytes() { return versions.getMaxNextLevelOverlappingBytes(); } private static class CompactionState { private final Compaction compaction; private final List<FileMetaData> outputs = newArrayList(); private long smallestSnapshot; // State kept for output being generated private FileChannel outfile; private TableBuilder builder; // Current file being generated private long currentFileNumber; private long currentFileSize; private InternalKey currentSmallest; private InternalKey currentLargest; private long totalBytes; private CompactionState(Compaction compaction) { this.compaction = compaction; } public Compaction getCompaction() { return compaction; } } private static class ManualCompaction { private final int level; private final Slice begin; private final Slice end; private ManualCompaction(int level, Slice begin, Slice end) { this.level = level; this.begin = begin; this.end = end; } } private WriteBatchImpl readWriteBatch(SliceInput record, int updateSize) throws IOException { WriteBatchImpl writeBatch = new WriteBatchImpl(); int entries = 0; while (record.isReadable()) { entries++; ValueType valueType = ValueType.getValueTypeByPersistentId(record.readByte()); if (valueType == VALUE) { Slice key = readLengthPrefixedBytes(record); Slice value = readLengthPrefixedBytes(record); writeBatch.put(key, value); } else if (valueType == DELETION) { Slice key = readLengthPrefixedBytes(record); writeBatch.delete(key); } else { throw new IllegalStateException("Unexpected value type " + valueType); } } if (entries != updateSize) { throw new IOException( String.format( "Expected %d entries in log record but found %s entries", updateSize, entries)); } return writeBatch; } private Slice writeWriteBatch(WriteBatchImpl updates, long sequenceBegin) { Slice record = Slices.allocate(SIZE_OF_LONG + SIZE_OF_INT + updates.getApproximateSize()); final SliceOutput sliceOutput = record.output(); sliceOutput.writeLong(sequenceBegin); sliceOutput.writeInt(updates.size()); updates.forEach( new Handler() { @Override public void put(Slice key, Slice value) { sliceOutput.writeByte(VALUE.getPersistentId()); writeLengthPrefixedBytes(sliceOutput, key); writeLengthPrefixedBytes(sliceOutput, value); } @Override public void delete(Slice key) { sliceOutput.writeByte(DELETION.getPersistentId()); writeLengthPrefixedBytes(sliceOutput, key); } }); return record.slice(0, sliceOutput.size()); } private static class InsertIntoHandler implements Handler { private long sequence; private final MemTable memTable; public InsertIntoHandler(MemTable memTable, long sequenceBegin) { this.memTable = memTable; this.sequence = sequenceBegin; } @Override public void put(Slice key, Slice value) { memTable.add(sequence++, VALUE, key, value); } @Override public void delete(Slice key) { memTable.add(sequence++, DELETION, key, Slices.EMPTY_SLICE); } } public static class DatabaseShutdownException extends DBException { public DatabaseShutdownException() {} public DatabaseShutdownException(String message) { super(message); } } public static class BackgroundProcessingException extends DBException { public BackgroundProcessingException(Throwable cause) { super(cause); } } private Object suspensionMutex = new Object(); private int suspensionCounter = 0; @Override public void suspendCompactions() throws InterruptedException { compactionExecutor.execute( new Runnable() { @Override public void run() { try { synchronized (suspensionMutex) { suspensionCounter++; suspensionMutex.notifyAll(); 
while (suspensionCounter > 0 && !compactionExecutor.isShutdown()) { suspensionMutex.wait(500); } } } catch (InterruptedException e) { // restore the interrupt status instead of swallowing it Thread.currentThread().interrupt(); } } }); synchronized (suspensionMutex) { while (suspensionCounter < 1) { suspensionMutex.wait(); } } } @Override public void resumeCompactions() { synchronized (suspensionMutex) { suspensionCounter--; suspensionMutex.notifyAll(); } } @Override public void compactRange(byte[] begin, byte[] end) throws DBException { throw new UnsupportedOperationException("Not yet implemented"); } }
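/*
 * A brief usage sketch for the DbImpl above. The directory path is illustrative, and the fluent
 * createIfMissing(true) setter on Options is an assumption inferred from the checkArgument in the
 * constructor; everything else uses only methods defined in this class. Note that the public entry
 * points call checkBackgroundException() first, so failures on the compaction thread resurface on
 * the caller's thread as a BackgroundProcessingException.
 */
import java.io.File;
import java.nio.charset.StandardCharsets;

public class DbImplUsageSketch {
  public static void main(String[] args) throws Exception {
    Options options = new Options().createIfMissing(true); // assumed fluent setter
    DbImpl db = new DbImpl(options, new File("/tmp/example-db")); // illustrative path
    try {
      db.put("key".getBytes(StandardCharsets.UTF_8), "value".getBytes(StandardCharsets.UTF_8));
      byte[] value = db.get("key".getBytes(StandardCharsets.UTF_8));
      System.out.println(new String(value, StandardCharsets.UTF_8)); // prints "value"
    } finally {
      // close() waits for any in-flight background compaction before releasing the db lock
      db.close();
    }
  }
}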
/** * Modified copy of LinkedBlockingQueue. Node elements, which are based on two pointers (item * and next), currently go to garbage rather than being recycled; offer reports, via its boolean * return value, whether the queue was already non-empty when the element was inserted (false * means the insert signalled a waiting taker). * * @author oifa yulian */ public class ConcurrentCyclicFIFO<E> { static class Node<E> { volatile E item; Node<E> next; Node(E x) { item = x; } } /** Current number of elements */ private final AtomicInteger count = new AtomicInteger(0); /** Head of linked list */ private transient Node<E> head; /** Tail of linked list */ private transient Node<E> last; /** Lock held by take, poll, etc */ private final ReentrantLock takeLock = new ReentrantLock(); /** Wait queue for waiting takes */ private final Condition notEmpty = takeLock.newCondition(); /** Lock held by put, offer, etc */ private final ReentrantLock putLock = new ReentrantLock(); /** * Signals a waiting take. Called only from put/offer (which do not otherwise ordinarily lock * takeLock.) */ private void signalNotEmpty() { final ReentrantLock takeLock = this.takeLock; takeLock.lock(); try { notEmpty.signal(); } finally { takeLock.unlock(); } } /** * Creates a node and links it at the end of the queue. * * @param x the item */ private void insert(Node<E> x) { last = last.next = x; } /** * Removes a node from the head of the queue. * * @return the node */ private Node<E> extract() { Node<E> current = head; head = head.next; current.item = head.item; head.item = null; return current; } public ConcurrentCyclicFIFO() { last = head = new Node<E>(null); } public int size() { return count.get(); } public boolean offer(E e) { if (e == null) throw new NullPointerException(); final AtomicInteger count = this.count; boolean shouldSignal = false; final ReentrantLock putLock = this.putLock; putLock.lock(); try { insert(new Node<E>(e)); shouldSignal = (count.getAndIncrement() == 0); } finally { putLock.unlock(); } if (shouldSignal) signalNotEmpty(); return !shouldSignal; } public E take() throws InterruptedException { Node<E> x; final AtomicInteger count = this.count; final ReentrantLock takeLock = this.takeLock; takeLock.lockInterruptibly(); try { try { while (count.get() == 0) notEmpty.await(); } catch (InterruptedException ie) { notEmpty.signal(); // propagate to a non-interrupted thread throw ie; } x = extract(); if (count.getAndDecrement() > 1) notEmpty.signal(); } finally { takeLock.unlock(); } E result = x.item; // clear references to help GC x.item = null; x.next = null; return result; } public E poll() { final AtomicInteger count = this.count; if (count.get() == 0) return null; Node<E> x = null; final ReentrantLock takeLock = this.takeLock; takeLock.lock(); try { if (count.get() > 0) { x = extract(); if (count.getAndDecrement() > 1) notEmpty.signal(); } } finally { takeLock.unlock(); } if (x != null) { E result = x.item; // clear references to help GC x.item = null; x.next = null; return result; } return null; } public void clear() { putLock.lock(); takeLock.lock(); try { head.next = null; assert head.item == null; last = head; count.set(0); } finally { takeLock.unlock(); putLock.unlock(); } } }
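/*
 * A short producer/consumer demo for the ConcurrentCyclicFIFO above (hypothetical demo class).
 * The queue is unbounded, so offer() never blocks; its return value is false exactly when the
 * queue was empty at insert time and a waiting taker had to be signalled.
 */
public class ConcurrentCyclicFIFODemo {
  public static void main(String[] args) throws InterruptedException {
    final ConcurrentCyclicFIFO<String> fifo = new ConcurrentCyclicFIFO<String>();

    Thread consumer = new Thread(new Runnable() {
      public void run() {
        try {
          // take() parks on the notEmpty condition until an element is offered
          System.out.println("took: " + fifo.take());
          System.out.println("took: " + fifo.take());
        } catch (InterruptedException e) {
          Thread.currentThread().interrupt();
        }
      }
    });
    consumer.start();

    System.out.println(fifo.offer("first"));  // false: queue was empty, taker signalled
    System.out.println(fifo.offer("second")); // true unless the consumer already drained it
    consumer.join();
  }
}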
public OTEServerDiscoveryImpl() { lock = new ReentrantLock(); condition = lock.newCondition(); store = new OTEServerStoreImpl(lock, condition); notification = new OteServerNotification(store); }
/** * Basic test of the ability to add to a buffer with a fixed capacity queue and to drain the * elements from the queue including tests of the non-blocking aspects of the API. * * @throws TimeoutException * @throws ExecutionException * @throws InterruptedException */ public void test_blockingBuffer() throws InterruptedException, ExecutionException, TimeoutException { final Integer e0 = new Integer(0); final Integer e1 = new Integer(1); final Integer e2 = new Integer(2); final int queueCapacity = 3; final BlockingQueue<Integer[]> queue = new ArrayBlockingQueue<Integer[]>(queueCapacity); final int chunkSize = 4; final long chunkTimeout = 1000; final TimeUnit chunkTimeoutUnit = TimeUnit.MILLISECONDS; /* * The test timeout is just a smidge longer than the chunk timeout. * * Note: use Long.MAX_VALUE iff debugging. */ // final long testTimeout = Long.MAX_VALUE; final long testTimeout = chunkTimeout + 20; final boolean ordered = false; final BlockingBuffer<Integer[]> buffer = new BlockingBuffer<Integer[]>(queue, chunkSize, chunkTimeout, chunkTimeoutUnit, ordered); // buffer is empty. assertTrue(buffer.isOpen()); assertTrue(buffer.isEmpty()); assertEquals("chunkCount", 0L, buffer.getChunksAddedCount()); assertEquals("elementCount", 0L, buffer.getElementsAddedCount()); final IAsynchronousIterator<Integer[]> itr = buffer.iterator(); // nothing available from the iterator (non-blocking test). assertFalse(itr.hasNext(1, TimeUnit.NANOSECONDS)); assertNull(itr.next(1, TimeUnit.NANOSECONDS)); // add an element to the buffer - should not block. buffer.add(new Integer[] {e0}); // should be one element and one chunk accepted by the buffer. assertTrue(buffer.isOpen()); assertFalse(buffer.isEmpty()); assertEquals("chunkCount", 1L, buffer.getChunksAddedCount()); assertEquals("elementCount", 1L, buffer.getElementsAddedCount()); // something should be available now (non-blocking). assertTrue(itr.hasNext(1, TimeUnit.NANOSECONDS)); // something should be available now (blocking). assertTrue(itr.hasNext()); // add another element to the buffer - should not block. buffer.add(new Integer[] {e1}); // should be two elements and two chunks accepted into the buffer assertTrue(buffer.isOpen()); assertFalse(buffer.isEmpty()); assertEquals("chunkCount", 2L, buffer.getChunksAddedCount()); assertEquals("elementCount", 2L, buffer.getElementsAddedCount()); final ReentrantLock lock = new ReentrantLock(); final Condition cond = lock.newCondition(); final AtomicBoolean proceedFlag = new AtomicBoolean(false); // future of task writing a 3rd element on the buffer. final Future<?> producerFuture = service.submit( new Callable<Void>() { public Void call() throws Exception { lock.lockInterruptibly(); try { if (!proceedFlag.get()) { cond.await(); } /* * add another element - should block until we take an * element using the iterator. */ buffer.add(new Integer[] {e2}); /* * itr.hasNext() will block until the buffer is closed. */ buffer.close(); } finally { lock.unlock(); } // done. return null; } }); // future of task draining the buffer. final Future<?> consumerFuture = service.submit( new Callable<Void>() { public Void call() throws Exception { try { lock.lockInterruptibly(); try { assertTrue(itr.hasNext()); // take the first chunk - two elements. if (log.isInfoEnabled()) log.info("Awaiting first chunk"); assertSameArray(new Integer[] {e0, e1}, itr.next(50, TimeUnit.MILLISECONDS)); if (log.isInfoEnabled()) log.info("Have first chunk"); /* * Verify that we obtained the first chunk before the * buffer was closed. 
Otherwise next() blocked * attempting to compile a full chunk until the producer * timeout, at which point the producer closed the * buffer and next() noticed the closed buffer and * returned. */ assertTrue(buffer.isOpen()); assertFalse("buffer was closed.", itr.isExhausted()); /* * Verify that nothing is available from the iterator * (non-blocking test). */ assertFalse(itr.hasNext(1, TimeUnit.NANOSECONDS)); assertNull(itr.next(1, TimeUnit.NANOSECONDS)); // Signal the producer that it should continue. proceedFlag.set(true); cond.signal(); } finally { lock.unlock(); } // should block until we close the buffer. assertTrue(itr.hasNext()); // last chunk assertSameArray(new Integer[] {e2}, itr.next()); // should be immediately false. assertFalse(itr.hasNext(1, TimeUnit.NANOSECONDS)); // should be immediately null. assertNull(itr.next(1, TimeUnit.NANOSECONDS)); // The synchronous API should also report an exhausted // itr. assertFalse(itr.hasNext()); try { itr.next(); fail("Expecting: " + NoSuchElementException.class); } catch (NoSuchElementException ex) { if (log.isInfoEnabled()) log.info("Ignoring expected exception: " + ex); } return null; } catch (Throwable t) { log.error("Consumer failed or blocked: " + t, t); throw new Exception(t); } } }); // wait a little bit for the producer future. producerFuture.get(testTimeout, chunkTimeoutUnit); // wait a little bit for the consumer future. consumerFuture.get(testTimeout, chunkTimeoutUnit); }
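/*
 * The producer/consumer coordination in the test above is an instance of a small reusable
 * pattern: a guard flag checked under the same lock as the Condition, so a signal sent before the
 * waiter reaches await() is never lost. A minimal sketch of just that handshake follows
 * (illustrative names; the test uses a plain if, but a while loop is the safer general form,
 * since it also tolerates spurious wakeups).
 */
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;

public class GuardedHandshakeDemo {
  public static void main(String[] args) throws InterruptedException {
    final ReentrantLock lock = new ReentrantLock();
    final Condition cond = lock.newCondition();
    final AtomicBoolean proceedFlag = new AtomicBoolean(false);

    Thread waiter = new Thread(new Runnable() {
      public void run() {
        lock.lock();
        try {
          while (!proceedFlag.get()) { // re-check the guard on every wakeup
            cond.awaitUninterruptibly();
          }
          System.out.println("waiter released");
        } finally {
          lock.unlock();
        }
      }
    });
    waiter.start();

    lock.lock();
    try {
      proceedFlag.set(true); // set the guard before signalling, under the same lock
      cond.signal();
    } finally {
      lock.unlock();
    }
    waiter.join();
  }
}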