Example No. 1
 /**
  * Creates an <tt>ArrayBlockingQueue</tt> with the given (fixed) capacity and the specified access
  * policy.
  *
  * @param capacity the capacity of this queue
  * @param fair if <tt>true</tt> then queue accesses for threads blocked on insertion or removal
  *     are processed in FIFO order; if <tt>false</tt> the access order is unspecified.
  * @throws IllegalArgumentException if <tt>capacity</tt> is less than 1
  */
 public ArrayBlockingQueue(int capacity, boolean fair) {
   if (capacity <= 0) throw new IllegalArgumentException();
   this.items = (E[]) new Object[capacity];
   lock = new ReentrantLock(fair);
   notEmpty = lock.newCondition();
   notFull = lock.newCondition();
 }
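A brief usage sketch of the fairness flag (the capacity and element values are illustrative, not taken from the example above): a fair queue releases blocked producers and consumers in FIFO order, while the default policy leaves the order unspecified.

import java.util.concurrent.ArrayBlockingQueue;

public class FairnessDemo {
  public static void main(String[] args) throws InterruptedException {
    // Fair queue: threads blocked in put()/take() are released in FIFO order.
    ArrayBlockingQueue<String> fairQueue = new ArrayBlockingQueue<>(10, true);
    // Default (non-fair) queue: access order for blocked threads is unspecified.
    ArrayBlockingQueue<String> defaultQueue = new ArrayBlockingQueue<>(10);

    fairQueue.put("first");
    defaultQueue.put("second");
    System.out.println(fairQueue.take() + " / " + defaultQueue.take());
  }
}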
Example No. 2
 private void init() {
   hasFreeThread = lock.newCondition();
   hasTask = lock.newCondition();
   pool = new LinkedList<WorkThread>();
   monitor = new QueueExecuteThread();
   freer = new FreeCleanThread();
   killer = new KillCleanThread();
 }
 @SuppressWarnings("unchecked")
 public PriorityBlockingDeque(SortedSet<? extends E> c) {
   this.lock = new ReentrantLock();
   this.notEmpty = lock.newCondition();
   this.comparator = (Comparator<? super E>) c.comparator();
   addAll(c);
 }
 /**
  * Method to create a new TE LSP initiated in this node
  *
  * @param destinationId IP address of the destination of the LSP
  * @param bw Bandwidth requested
  * @param bidirectional whether the LSP is bidirectional
  * @param OFcode Objective Function (OF) code requested for the LSP
  * @param lspID identifier of the LSP
  * @throws LSPCreationException if the LSP cannot be established
  */
 public long addnewLSP(
     Inet4Address destinationId, float bw, boolean bidirectional, int OFcode, int lspID)
     throws LSPCreationException {
   log.info("Adding New LSP to " + destinationId);
    // FIXME: review this
    // add a structure --> RequestedLSPinformation --> dependent on each technology
    // add a field with the LSP state and keep it updated
   LSPTE lsp =
       new LSPTE(
           lspID,
           localIP,
           destinationId,
           bidirectional,
           OFcode,
           bw,
           PathStateParameters.creatingLPS);
   LSPList.put(new LSPKey(localIP, lsp.getIdLSP()), lsp);
   ReentrantLock lock = new ReentrantLock();
   Condition lspEstablished = lock.newCondition();
   // log.info("Metemos en Lock list con ID: "+lsp.getIdLSP());
   lockList.put(lsp.getIdLSP(), lock);
   conditionList.put(lsp.getIdLSP(), lspEstablished);
   /*log.info("Size lockList : "+lockList.size());
   log.info("Size conditionList : "+conditionList.size());*/
   timeIni = System.nanoTime();
   log.info("Start to establish path: " + System.nanoTime());
   try {
     startLSP(lsp);
   } catch (LSPCreationException e) {
     log.info("Start LSP Error!");
     throw e;
   }
   return lsp.getIdLSP();
 }
Example No. 5
 public GifDecoder(GifAction gifaction)
 {
     isDestroy = false;
     f = 1;
     A = new byte[256];
     B = 0;
     C = 0;
     D = 0;
     E = false;
     F = 0;
     O = new ArrayBlockingQueue(15);
     P = new ReentrantLock();
     Q = P.newCondition();
     R = P.newCondition();
     S = 0;
     T = false;
     U = new ArrayList(M);
     V = 0;
     W = false;
     X = null;
     Y = null;
     Z = false;
     aa = 0;
     ab = null;
     ac = 0;
     ad = null;
     ae = null;
     af = new int[256];
     X = gifaction;
 }
/**
 * Manages the election of which asynchronous saga event processor is responsible for creating a new
 * Saga instance, when necessary.
 *
 * @author Allard Buijze
 * @since 2.0
 */
class AsyncSagaCreationElector {

  private static final Logger logger = LoggerFactory.getLogger(AsyncSagaCreationElector.class);

  private final ReentrantLock votingLock = new ReentrantLock();
  private final Condition allVotesCast = votingLock.newCondition();

  // guarded by "votingLock"
  private int castVotes = 0;
  private volatile boolean invocationDetected = false;

  /**
   * Forces the current thread to wait for the voting to complete if it is responsible for creating
   * the Saga. As soon as an invocation has been recorded, the waiting thread is released.
   *
   * @param didInvocation indicates whether the current processor found a Saga to process
   * @param totalVotesExpected The total number of processors expected to cast a vote
   * @param isSagaOwner Indicates whether the current processor "owns" the to-be-created saga
   *     instance.
   * @return <code>true</code> if the current processor should create the new instance, <code>false
   *     </code> otherwise.
   */
  public boolean waitForSagaCreationVote(
      final boolean didInvocation, final int totalVotesExpected, final boolean isSagaOwner) {
    votingLock.lock();
    try {
      invocationDetected = invocationDetected || didInvocation;
      castVotes++;
      while (isSagaOwner && !invocationDetected && castVotes < totalVotesExpected) {
        try {
          allVotesCast.await();
        } catch (InterruptedException e) {
          // interrupting this process is not supported.
          logger.warn(
              "This thread has been interrupted, but the interruption has "
                  + "been ignored to prevent loss of information.");
        }
      }
      if (isSagaOwner) {
        return !invocationDetected;
      }
      allVotesCast.signalAll();
    } finally {
      votingLock.unlock();
    }
    return false;
  }

  /** Clears the voting counts for a new round. */
  public void clear() {
    votingLock.lock();
    try {
      castVotes = 0;
      invocationDetected = false;
    } finally {
      votingLock.unlock();
    }
  }
}
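A hedged usage sketch of the voting protocol (the processor count, thread names, and ownership assignment are assumptions): every processor casts its vote, and only the saga owner creates the instance, and only when no vote reported an existing invocation.

public class ElectionDemo {
  public static void main(String[] args) {
    AsyncSagaCreationElector elector = new AsyncSagaCreationElector();
    final int totalProcessors = 3;
    for (int i = 0; i < totalProcessors; i++) {
      final boolean isOwner = (i == 0); // assume processor 0 owns the to-be-created Saga
      new Thread(() -> {
        boolean invokedExistingSaga = false; // this processor found no Saga to process
        boolean shouldCreate =
            elector.waitForSagaCreationVote(invokedExistingSaga, totalProcessors, isOwner);
        if (shouldCreate) {
          System.out.println(Thread.currentThread().getName() + " creates the new Saga instance");
        }
      }, "saga-processor-" + i).start();
    }
  }
}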
Example No. 7
 /**
  * Constructor of the class. Initializes all the objects.
  *
  * @param maxSize The size of the buffer
  */
 public Buffer(final int maxSize) {
   this.maxSize = maxSize;
   buffer = new LinkedList<>();
   lock = new ReentrantLock();
   lines = lock.newCondition();
   space = lock.newCondition();
   pendingLines = true;
 }
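The Buffer's producer/consumer methods are not shown; the following is a plausible sketch consistent with the fields above (the method names insert/get and the String element type are assumptions), blocking on space while the buffer is full and on lines while it is empty.

public void insert(String line) {
  lock.lock();
  try {
    while (buffer.size() == maxSize) {
      space.await();              // wait until a consumer frees a slot
    }
    buffer.offer(line);
    lines.signalAll();            // wake consumers waiting for data
  } catch (InterruptedException e) {
    Thread.currentThread().interrupt();
  } finally {
    lock.unlock();
  }
}

public String get() {
  lock.lock();
  try {
    while (buffer.isEmpty() && pendingLines) {
      lines.await();              // wait until a producer adds a line
    }
    String line = buffer.poll();  // may be null once no more lines are pending
    space.signalAll();            // wake producers waiting for free space
    return line;
  } catch (InterruptedException e) {
    Thread.currentThread().interrupt();
    return null;
  } finally {
    lock.unlock();
  }
}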
Example No. 8
 public SimpleSemaphore(int permits, boolean fair) {
   // Initialize the SimpleSemaphore, supporting both fair and non-fair
   // semaphore semantics via the ReentrantLock fairness flag.
   this.permits = permits;
   reentrantLock = new ReentrantLock(fair);
   condition = reentrantLock.newCondition();
 }
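A hedged sketch of the matching acquire()/release() methods (their exact names and the permit-count type are assumptions), using the condition to block while no permits remain.

public void acquire() throws InterruptedException {
  reentrantLock.lock();
  try {
    while (permits <= 0) {
      condition.await();          // block until a permit is released
    }
    permits--;
  } finally {
    reentrantLock.unlock();
  }
}

public void release() {
  reentrantLock.lock();
  try {
    permits++;
    condition.signal();           // exactly one permit became available, wake one waiter
  } finally {
    reentrantLock.unlock();
  }
}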
 public PriorityBlockingDeque(int initialCapacity, Comparator<? super E> comparator) {
   // Note: This restriction of at least one is not actually needed,
   // but continues for 1.5 compatibility
   if (initialCapacity < 1) throw new IllegalArgumentException();
   this.lock = new ReentrantLock();
   this.notEmpty = lock.newCondition();
   this.comparator = comparator;
   this.deque = new Object[initialCapacity];
 }
Example No. 10
  private class SpiceArrayAdapterUnderTest extends SpiceArrayAdapter<DataUnderTest> {

    private ReentrantLock reentrantLock = new ReentrantLock();
    private Condition loadBitmapHasBeenCalledCondition = reentrantLock.newCondition();
    private boolean loadBitmapHasBeenCalled = false;

    public SpiceArrayAdapterUnderTest(
        Context context, BitmapSpiceManager spiceManagerBinary, List<DataUnderTest> data) {
      super(context, spiceManagerBinary, data);
    }

    @Override
    public BitmapRequest createRequest(
        DataUnderTest data, int imageIndex, int reqWidth, int reqHeight) {
      return new BitmapRequest(
          mockWebServer.getUrl("/" + data.getImageUrl()).toString(),
          reqWidth,
          reqHeight,
          cacheFile);
    }

    // ----------------------------------------------------
    // ----- Block Test thread until drawable is refreshed.
    // ----------------------------------------------------

    @Override
    protected void loadBitmapAsynchronously(
        DataUnderTest octo, ImageView thumbImageView, String tempThumbnailImageFileName) {
      super.loadBitmapAsynchronously(octo, thumbImageView, tempThumbnailImageFileName);
      reentrantLock.lock();
      try {
        loadBitmapHasBeenCalled = true;
        loadBitmapHasBeenCalledCondition.signal();
      } finally {
        reentrantLock.unlock();
      }
    }

    public void await(long millisecond) throws InterruptedException {
      reentrantLock.lock();
      try {
        loadBitmapHasBeenCalledCondition.await(millisecond, TimeUnit.MILLISECONDS);
      } finally {
        reentrantLock.unlock();
      }
    }

    public boolean isLoadBitmapHasBeenCalled() {
      return loadBitmapHasBeenCalled;
    }

    @Override
    public SpiceListItemView<DataUnderTest> createView(Context context, ViewGroup parent) {
      return new ListItemViewStub(getContext());
    }
  }
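A sketch of how a test might use these helpers (the fixture variables context, spiceManagerBinary, data, parentView, and the 2-second timeout are assumptions): trigger view creation, block until loadBitmapAsynchronously signals the condition, then assert.

@Test
public void shouldLoadBitmapAsynchronously() throws InterruptedException {
  SpiceArrayAdapterUnderTest adapter =
      new SpiceArrayAdapterUnderTest(context, spiceManagerBinary, data);
  adapter.getView(0, null, parentView);        // indirectly triggers loadBitmapAsynchronously(...)
  adapter.await(2000);                         // returns once signal() fires, or after 2 s
  assertTrue(adapter.isLoadBitmapHasBeenCalled());
}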
class PausableThreadPoolExecutor extends ThreadPoolExecutor
    implements ExecutorRemoteControllerService {
  public PausableThreadPoolExecutor(
      int corePoolSize,
      int maximumPoolSize,
      long keepAliveTime,
      TimeUnit unit,
      BlockingQueue<Runnable> workQueue,
      ThreadFactory threadFactory,
      RejectedExecutionHandler handler) {
    super(corePoolSize, maximumPoolSize, keepAliveTime, unit, workQueue, threadFactory, handler);
  }

  private boolean isPaused;
  private ReentrantLock pauseLock = new ReentrantLock();
  private Condition unpaused = pauseLock.newCondition();

  @Override
  protected void beforeExecute(Thread t, Runnable r) {
    super.beforeExecute(t, r);
    pauseLock.lock();
    try {
      while (isPaused) unpaused.await();
    } catch (InterruptedException ie) {
      t.interrupt();
    } finally {
      pauseLock.unlock();
    }
  }

  @Override
  public void pause() {
    pauseLock.lock();
    try {
      isPaused = true;
    } finally {
      pauseLock.unlock();
    }
  }

  @Override
  public void resume() {
    pauseLock.lock();
    try {
      isPaused = false;
      unpaused.signalAll();
    } finally {
      pauseLock.unlock();
    }
  }
}
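A short usage sketch (pool sizes, queue, rejection policy, and the submitted task are assumptions): pause() makes worker threads wait in beforeExecute(), and resume() releases them via signalAll().

public class PauseDemo {
  public static void main(String[] args) {
    PausableThreadPoolExecutor executor =
        new PausableThreadPoolExecutor(
            2, 4, 60L, TimeUnit.SECONDS,
            new LinkedBlockingQueue<Runnable>(),
            Executors.defaultThreadFactory(),
            new ThreadPoolExecutor.AbortPolicy());

    executor.pause();             // newly dispatched tasks block in beforeExecute()
    executor.execute(() -> System.out.println("runs only after resume()"));
    executor.resume();            // signalAll() lets the blocked workers proceed
    executor.shutdown();
  }
}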
Example No. 12
 /**
  * Constructor.
  *
  * @param thread The MoSync thread.
  */
 public MoSyncCameraController(MoSyncThread thread) {
   mMoSyncThread = thread;
   lock = new ReentrantLock();
   mPreview = null;
   condition = lock.newCondition();
   dataReady = false;
   userWidths = new ArrayList<Integer>();
   userHeights = new ArrayList<Integer>();
   mCameraParametersList = new ArrayList<Camera.Parameters>();
   mNumCameras = numberOfCameras();
   initilizeCameras();
   rawMode = false;
   mCurrentCameraIndex = 0;
 }
Example No. 13
public class ManualFlag extends Flag {
  private final ReentrantLock lock = new ReentrantLock();
  private final Condition condition = lock.newCondition();
  private boolean set;

  public ManualFlag(boolean state) {
    set = state;
  }

  public ManualFlag() {
    this(false);
  }

  @Override
  public void set() {
    lock.lock();
    try {
      set = true;
      condition.signalAll();
    } finally {
      lock.unlock();
    }
  }

  public void reset() {
    lock.lock();
    try {
      set = false;
    } finally {
      lock.unlock();
    }
  }

  @Override
  public void await() {
    lock.lock();
    try {
      while (!set) {
        try {
          condition.await();
        } catch (InterruptedException ex) {
          // Interruption is not supported; keep waiting until the flag is set.
        }
      }
    } finally {
      lock.unlock();
    }
  }
}
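A small usage sketch (the sleep stands in for arbitrary preparatory work): one thread blocks on the flag until another sets it.

public class ManualFlagDemo {
  public static void main(String[] args) throws InterruptedException {
    ManualFlag startSignal = new ManualFlag();
    Thread worker = new Thread(() -> {
      startSignal.await();                      // blocks until set() is called
      System.out.println("worker released");
    });
    worker.start();

    Thread.sleep(100);                          // stands in for preparatory work
    startSignal.set();                          // wakes every thread waiting in await()
  }
}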
Example No. 14
  /**
   * Transfer the local file to the hypervisor's importdir
   *
   * @param from Local file name
   * @throws OccpException
   */
  public void stageFile(String from) throws OccpException {
    boolean transferSuccess = false;
    /*
     * Only VBox requires transfer to the host; Esxi allows uploads
     */
    if (this.hv.getClass() == OccpVBoxHV.class) {
      completionLock.lock();
      try {
        if (!completed_conditions.containsKey(from)) {
          completed_conditions.put(from, completionLock.newCondition());
        } else {
          while (!completed_transfers.containsKey(from)) {
            try {
              completed_conditions.get(from).await();
            } catch (InterruptedException e) {
              throw new VMOperationFailedException(
                  hv.getName(), vm.getName(), ErrorCode.TRANSFER_TO, "Transfer interrupted", e);
            }
          }
          // If it has been transferred, say we did it, otherwise try again
          if (completed_transfers.get(from) == true) {
            return;
          }
        }
      } finally {
        completionLock.unlock();
      }

      boolean hasPath = (from.lastIndexOf('/') >= 0);
      String to = from;
      if (hasPath) {
        to = from.substring(from.lastIndexOf('/') + 1);
      }
      try {
        this.hv.transferFileToVM(this.vm, from, "/mnt/" + OccpAdmin.scenarioName + "/" + to, false);
        transferSuccess = true;
      } finally {
        completionLock.lock();
        try {
          completed_transfers.put(from, transferSuccess);
          completed_conditions.get(from).signal();
        } finally {
          completionLock.unlock();
        }
      }
    }
  }
/** Producer/consumer test driver that shares a ReentrantLock and Condition between the threads. */
public class TestArrayBlockingQueueforList {
  public ReentrantLock lock = new ReentrantLock();
  public Condition condition = lock.newCondition();
  public List<Integer> list = new ArrayList<Integer>();
  boolean isEmpty = true;

  public static void main(String[] args) {
    TestArrayBlockingQueueforList test = new TestArrayBlockingQueueforList();
    Resource res = new Resource();
    new Thread(new ProducerThread(test.getCondition(), test.getLock(), res)).start();
    new Thread(new ConsumerThread(test.getCondition(), test.getLock(), res)).start();
  }

  public ReentrantLock getLock() {
    return lock;
  }

  public void setLock(ReentrantLock lock) {
    this.lock = lock;
  }

  public Condition getCondition() {
    return condition;
  }

  public void setCondition(Condition condition) {
    this.condition = condition;
  }

  public List<Integer> getList() {
    return list;
  }

  public void setList(List<Integer> list) {
    this.list = list;
  }

  public boolean isEmpty() {
    return isEmpty;
  }

  public void setEmpty(boolean isEmpty) {
    this.isEmpty = isEmpty;
  }
}
  /**
   * Constructs the consumer which will read from the given destination and is a child of the given
   * context.
   *
   * @param destination the destination that this consumer will read from
   * @param hazelcastMQContext the parent context of this consumer
   */
  DefaultHazelcastMQConsumer(String destination, DefaultHazelcastMQContext hazelcastMQContext) {
    super();

    this.destination = destination;
    this.receiveLock = new ReentrantLock();
    this.receiveCondition = receiveLock.newCondition();
    this.closed = false;
    this.active = false;

    this.hazelcastMQContext = hazelcastMQContext;
    this.config = hazelcastMQContext.getHazelcastMQInstance().getConfig();

    HazelcastInstance hazelcast =
        this.hazelcastMQContext.getHazelcastMQInstance().getConfig().getHazelcastInstance();

    IdGenerator idGenerator = hazelcast.getIdGenerator("hazelcastmqconsumer");
    this.id = "hazelcastmqconsumer-" + String.valueOf(idGenerator.newId());
  }
Example No. 17
public class MyService {
  private ReentrantLock lock = new ReentrantLock();
  private Condition condition = lock.newCondition();

  public void waitMethod() {
    lock.lock();
    try {
      System.out.println("A");
      condition.await();
      System.out.println("B");
    } catch (InterruptedException e) {
      e.printStackTrace();
    } finally {
      lock.unlock();
      System.out.println("Lock released!");
    }
  }
}
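Only the waiting side is shown; a hypothetical companion method (not part of the original class) that another thread would call to release it:

public void signalMethod() {
  lock.lock();
  try {
    System.out.println("signalling the waiting thread");
    condition.signal();           // wakes the thread blocked in waitMethod()
  } finally {
    lock.unlock();
  }
}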
Example No. 18
  private static final class Notifier {

    private final ReentrantLock lock = new ReentrantLock();
    private final Condition condition = lock.newCondition();
    private volatile TimeValue timeout;

    public Notifier(TimeValue timeout) {
      assert timeout != null;
      this.timeout = timeout;
    }

    public void await() {
      lock.lock();
      try {
        condition.await(timeout.millis(), TimeUnit.MILLISECONDS);
      } catch (InterruptedException e) {
        // we intentionally do not want to restore the interruption flag, we're about to shutdown
        // anyway
      } finally {
        lock.unlock();
      }
    }

    public void setTimeout(TimeValue timeout) {
      assert timeout != null;
      this.timeout = timeout;
      doNotify();
    }

    public TimeValue getTimeout() {
      return timeout;
    }

    public void doNotify() {
      lock.lock();
      try {
        condition.signalAll();
      } finally {
        lock.unlock();
      }
    }
  }
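A sketch of how the Notifier might drive a polling loop (the TimeValue factory calls and the loop body are assumptions): the poller sleeps up to the current timeout each iteration, and setTimeout() wakes it immediately so the new interval takes effect without waiting out the old one.

Notifier notifier = new Notifier(TimeValue.timeValueSeconds(30));

Thread poller = new Thread(() -> {
  while (!Thread.currentThread().isInterrupted()) {
    // ... perform one round of work ...
    notifier.await();             // sleep up to the currently configured timeout
  }
});
poller.start();

// Shorten the interval; doNotify() inside setTimeout() wakes the poller right away.
notifier.setTimeout(TimeValue.timeValueSeconds(5));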
public class BrokenOrderingReentrantLock implements Runnable {
  private final ReentrantLock lock1 = new ReentrantLock();
  private final ReentrantLock lock2 = new ReentrantLock();
  private final Condition condition = lock1.newCondition();

  public static void main(String[] args) throws InterruptedException {
    BrokenOrderingReentrantLock runnable = new BrokenOrderingReentrantLock();
    Thread thread1 = new Thread(runnable, "thread1");
    Thread thread2 = new Thread(runnable, "thread2");
    thread1.start();
    Thread.sleep(500);
    thread2.start();
  }

  @Override
  public void run() {
    try {
      String threadName = Thread.currentThread().getName();
      lock1.lock();
      try {
        System.out.println(threadName + " has lock1");
        lock2.lock();
        try {
          System.out.println(threadName + " has lock2");
          lock1.lock();
          try {
            System.out.println(threadName + " reenters lock1");
            condition.await(1, TimeUnit.SECONDS);
          } finally {
            lock1.unlock();
          }
        } finally {
          lock2.unlock();
        }
      } finally {
        lock1.unlock();
      }
    } catch (InterruptedException e) {
      e.printStackTrace();
    }
  }
}
 @SuppressWarnings("unchecked")
 public PriorityBlockingDeque(Collection<? extends E> c) {
   this.lock = new ReentrantLock();
   this.notEmpty = lock.newCondition();
   if (c instanceof SortedSet<?>) {
     SortedSet<? extends E> ss = (SortedSet<? extends E>) c;
     this.comparator = (Comparator<? super E>) ss.comparator();
     addAll(ss);
   } else if (c instanceof PriorityDeque<?>) {
     PriorityDeque<? extends E> pq = (PriorityDeque<? extends E>) c;
     this.comparator = (Comparator<? super E>) pq.comparator();
     initFromPriorityDeque(pq);
   } else if (c instanceof PriorityBlockingDeque<?>) {
     PriorityBlockingDeque<? extends E> pq = (PriorityBlockingDeque<? extends E>) c;
     this.comparator = (Comparator<? super E>) pq.comparator();
     initFromPriorityBlockingDeque(pq);
   } else {
     this.comparator = null;
     addAll(c);
   }
 }
  ConsumerImpl(
      URI uri,
      String submitQName,
      String statusQName,
      String statusTName,
      String heartbeatTName,
      String commandTName,
      IEventConnectorService service,
      IEventService eservice)
      throws EventException {

    super(uri, submitQName, statusQName, statusTName, commandTName, service, eservice);
    this.lock = new ReentrantLock();
    this.paused = lock.newCondition();

    durable = true;
    consumerId = UUID.randomUUID();
    name = "Consumer " + consumerId; // This will hopefully be changed to something meaningful...
    this.processes = new Hashtable<>(7); // Synch!
    this.heartbeatTopicName = heartbeatTName;
    connect();
  }
Example No. 22
public class MyService {

  private ReentrantLock lock = new ReentrantLock();
  private Condition condition = lock.newCondition();
  private boolean hasValue = false;

  public void set() {
    lock.lock();
    try {
      while (hasValue == true) {
        condition.await();
      }
      System.out.println("printing ★");
      hasValue = true;
      condition.signal();
    } catch (InterruptedException e) {
      e.printStackTrace();
    } finally {
      lock.unlock();
    }
  }

  public void get() {
    lock.lock();
    try {
      while (hasValue == false) {
        condition.await();
      }
      System.out.println("printing ☆");
      hasValue = false;
      condition.signal();
    } catch (InterruptedException e) {
      e.printStackTrace();
    } finally {
      lock.unlock();
    }
  }
}
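A minimal driver (thread and iteration counts are assumptions) that alternates the two methods so the ★ and ☆ lines interleave strictly:

public class Run {
  public static void main(String[] args) {
    MyService service = new MyService();
    new Thread(() -> { for (int i = 0; i < 5; i++) service.set(); }, "producer").start();
    new Thread(() -> { for (int i = 0; i < 5; i++) service.get(); }, "consumer").start();
  }
}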
class SuspendableThreadPoolExecutor extends ThreadPoolExecutor {

  private boolean available = false;
  private ReentrantLock suspendLock = new ReentrantLock();
  private Condition availableCondition = suspendLock.newCondition();

  public SuspendableThreadPoolExecutor(ThreadFactory threadFactory) {
    super(1, 1, 0L, TimeUnit.MILLISECONDS, new LinkedBlockingQueue<Runnable>(), threadFactory);
  }

  @Override
  protected void beforeExecute(Thread thread, Runnable task) {
    super.beforeExecute(thread, task);
    suspendLock.lock();
    try {
      while (!available) {
        availableCondition.await();
      }
    } catch (InterruptedException interruptedException) {
      thread.interrupt();
    } finally {
      suspendLock.unlock();
    }
  }

  public void setAvailable(boolean available) {
    suspendLock.lock();
    try {
      this.available = available;
      if (available) {
        availableCondition.signalAll();
      }
    } finally {
      suspendLock.unlock();
    }
  }
}
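A short usage sketch (the thread factory and the queued task are assumptions): the executor starts with available == false, so queued work only begins once setAvailable(true) is called.

SuspendableThreadPoolExecutor executor =
    new SuspendableThreadPoolExecutor(Executors.defaultThreadFactory());

executor.execute(() -> System.out.println("runs once the executor is made available"));
// ... finish whatever initialization must happen first ...
executor.setAvailable(true);      // signalAll() releases the worker blocked in beforeExecute()
executor.shutdown();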
Example No. 24
// This only knows how to deal with a single srcIndex for a given targetIndex.
// In case the src task generates multiple outputs for the same target Index
// (multiple src-indices), modifications will be required.
public class ShuffleManager implements FetcherCallback {

  private static final Log LOG = LogFactory.getLog(ShuffleManager.class);

  private final TezInputContext inputContext;
  private final int numInputs;

  private final FetchedInputAllocator inputManager;

  private final ListeningExecutorService fetcherExecutor;

  private final ListeningExecutorService schedulerExecutor;
  private final RunShuffleCallable schedulerCallable = new RunShuffleCallable();

  private final BlockingQueue<FetchedInput> completedInputs;
  private final AtomicBoolean inputReadyNotificationSent = new AtomicBoolean(false);
  private final Set<InputIdentifier> completedInputSet;
  private final ConcurrentMap<String, InputHost> knownSrcHosts;
  private final BlockingQueue<InputHost> pendingHosts;
  private final Set<InputAttemptIdentifier> obsoletedInputs;
  private Set<Fetcher> runningFetchers;

  private final AtomicInteger numCompletedInputs = new AtomicInteger(0);

  private final long startTime;
  private long lastProgressTime;

  // Required to be held when manipulating pendingHosts
  private final ReentrantLock lock = new ReentrantLock();
  private final Condition wakeLoop = lock.newCondition();

  private final int numFetchers;

  // Parameters required by Fetchers
  private final SecretKey shuffleSecret;
  private final CompressionCodec codec;

  private final int ifileBufferSize;
  private final boolean ifileReadAhead;
  private final int ifileReadAheadLength;

  private final String srcNameTrimmed;

  private final AtomicBoolean isShutdown = new AtomicBoolean(false);

  private final TezCounter shuffledInputsCounter;
  private final TezCounter failedShufflesCounter;
  private final TezCounter bytesShuffledCounter;
  private final TezCounter decompressedDataSizeCounter;
  private final TezCounter bytesShuffledToDiskCounter;
  private final TezCounter bytesShuffledToMemCounter;

  private volatile Throwable shuffleError;
  private final HttpConnectionParams httpConnectionParams;

  // TODO More counters - FetchErrors, speed?

  public ShuffleManager(
      TezInputContext inputContext,
      Configuration conf,
      int numInputs,
      int bufferSize,
      boolean ifileReadAheadEnabled,
      int ifileReadAheadLength,
      CompressionCodec codec,
      FetchedInputAllocator inputAllocator)
      throws IOException {
    this.inputContext = inputContext;
    this.numInputs = numInputs;

    this.shuffledInputsCounter =
        inputContext.getCounters().findCounter(TaskCounter.NUM_SHUFFLED_INPUTS);
    this.failedShufflesCounter =
        inputContext.getCounters().findCounter(TaskCounter.NUM_FAILED_SHUFFLE_INPUTS);
    this.bytesShuffledCounter = inputContext.getCounters().findCounter(TaskCounter.SHUFFLE_BYTES);
    this.decompressedDataSizeCounter =
        inputContext.getCounters().findCounter(TaskCounter.SHUFFLE_BYTES_DECOMPRESSED);
    this.bytesShuffledToDiskCounter =
        inputContext.getCounters().findCounter(TaskCounter.SHUFFLE_BYTES_TO_DISK);
    this.bytesShuffledToMemCounter =
        inputContext.getCounters().findCounter(TaskCounter.SHUFFLE_BYTES_TO_MEM);

    this.ifileBufferSize = bufferSize;
    this.ifileReadAhead = ifileReadAheadEnabled;
    this.ifileReadAheadLength = ifileReadAheadLength;
    this.codec = codec;
    this.inputManager = inputAllocator;

    this.srcNameTrimmed = TezUtils.cleanVertexName(inputContext.getSourceVertexName());

    completedInputSet =
        Collections.newSetFromMap(new ConcurrentHashMap<InputIdentifier, Boolean>(numInputs));
    completedInputs = new LinkedBlockingQueue<FetchedInput>(numInputs);
    knownSrcHosts = new ConcurrentHashMap<String, InputHost>();
    pendingHosts = new LinkedBlockingQueue<InputHost>();
    obsoletedInputs =
        Collections.newSetFromMap(new ConcurrentHashMap<InputAttemptIdentifier, Boolean>());
    runningFetchers = Collections.newSetFromMap(new ConcurrentHashMap<Fetcher, Boolean>());

    int maxConfiguredFetchers =
        conf.getInt(
            TezJobConfig.TEZ_RUNTIME_SHUFFLE_PARALLEL_COPIES,
            TezJobConfig.TEZ_RUNTIME_SHUFFLE_PARALLEL_COPIES_DEFAULT);

    this.numFetchers = Math.min(maxConfiguredFetchers, numInputs);

    ExecutorService fetcherRawExecutor =
        Executors.newFixedThreadPool(
            numFetchers,
            new ThreadFactoryBuilder()
                .setDaemon(true)
                .setNameFormat("Fetcher [" + srcNameTrimmed + "] #%d")
                .build());
    this.fetcherExecutor = MoreExecutors.listeningDecorator(fetcherRawExecutor);

    ExecutorService schedulerRawExecutor =
        Executors.newFixedThreadPool(
            1,
            new ThreadFactoryBuilder()
                .setDaemon(true)
                .setNameFormat("ShuffleRunner [" + srcNameTrimmed + "]")
                .build());
    this.schedulerExecutor = MoreExecutors.listeningDecorator(schedulerRawExecutor);

    this.startTime = System.currentTimeMillis();
    this.lastProgressTime = startTime;

    this.shuffleSecret =
        ShuffleUtils.getJobTokenSecretFromTokenBytes(
            inputContext.getServiceConsumerMetaData(
                TezConfiguration.TEZ_SHUFFLE_HANDLER_SERVICE_ID));
    httpConnectionParams = ShuffleUtils.constructHttpShuffleConnectionParams(conf);
    LOG.info(
        this.getClass().getSimpleName()
            + " : numInputs="
            + numInputs
            + ", compressionCodec="
            + (codec == null ? "NoCompressionCodec" : codec.getClass().getName())
            + ", numFetchers="
            + numFetchers
            + ", ifileBufferSize="
            + ifileBufferSize
            + ", ifileReadAheadEnabled="
            + ifileReadAhead
            + ", ifileReadAheadLength="
            + ifileReadAheadLength
            + ", "
            + httpConnectionParams.toString());
  }

  public void run() throws IOException {
    Preconditions.checkState(inputManager != null, "InputManager must be configured");

    ListenableFuture<Void> runShuffleFuture = schedulerExecutor.submit(schedulerCallable);
    Futures.addCallback(runShuffleFuture, new SchedulerFutureCallback());
    // Shutdown this executor once this task, and the callback complete.
    schedulerExecutor.shutdown();
  }

  private class RunShuffleCallable implements Callable<Void> {

    @Override
    public Void call() throws Exception {
      while (!isShutdown.get() && numCompletedInputs.get() < numInputs) {
        lock.lock();
        try {
          if (runningFetchers.size() >= numFetchers || pendingHosts.isEmpty()) {
            if (numCompletedInputs.get() < numInputs) {
              wakeLoop.await();
            }
          }
        } finally {
          lock.unlock();
        }

        if (shuffleError != null) {
          // InputContext has already been informed of a fatal error. Relying on
          // tez to kill the task.
          break;
        }

        if (LOG.isDebugEnabled()) {
          LOG.debug("NumCompletedInputs: " + numCompletedInputs);
        }
        if (numCompletedInputs.get() < numInputs && !isShutdown.get()) {
          lock.lock();
          try {
            int maxFetchersToRun = numFetchers - runningFetchers.size();
            int count = 0;
            while (pendingHosts.peek() != null && !isShutdown.get()) {
              InputHost inputHost = null;
              try {
                inputHost = pendingHosts.take();
              } catch (InterruptedException e) {
                if (isShutdown.get()) {
                  LOG.info(
                      "Interrupted and hasBeenShutdown, Breaking out of ShuffleScheduler Loop");
                  break;
                } else {
                  throw e;
                }
              }
              if (LOG.isDebugEnabled()) {
                LOG.debug("Processing pending host: " + inputHost.toDetailedString());
              }
              if (inputHost.getNumPendingInputs() > 0 && !isShutdown.get()) {
                LOG.info("Scheduling fetch for inputHost: " + inputHost.getIdentifier());
                Fetcher fetcher = constructFetcherForHost(inputHost);
                runningFetchers.add(fetcher);
                if (isShutdown.get()) {
                  LOG.info("hasBeenShutdown, Breaking out of ShuffleScheduler Loop");
                }
                ListenableFuture<FetchResult> future = fetcherExecutor.submit(fetcher);
                Futures.addCallback(future, new FetchFutureCallback(fetcher));
                if (++count >= maxFetchersToRun) {
                  break;
                }
              } else {
                if (LOG.isDebugEnabled()) {
                  LOG.debug(
                      "Skipping host: "
                          + inputHost.getIdentifier()
                          + " since it has no inputs to process");
                }
              }
            }
          } finally {
            lock.unlock();
          }
        }
      }
      LOG.info(
          "Shutting down FetchScheduler, Was Interrupted: "
              + Thread.currentThread().isInterrupted());
      // TODO NEWTEZ Maybe clean up inputs.
      if (!fetcherExecutor.isShutdown()) {
        fetcherExecutor.shutdownNow();
      }
      return null;
    }
  }

  private Fetcher constructFetcherForHost(InputHost inputHost) {
    FetcherBuilder fetcherBuilder =
        new FetcherBuilder(
            ShuffleManager.this,
            httpConnectionParams,
            inputManager,
            inputContext.getApplicationId(),
            shuffleSecret,
            srcNameTrimmed);
    if (codec != null) {
      fetcherBuilder.setCompressionParameters(codec);
    }
    fetcherBuilder.setIFileParams(ifileReadAhead, ifileReadAheadLength);

    // Remove obsolete inputs from the list being given to the fetcher. Also
    // remove from the obsolete list.
    List<InputAttemptIdentifier> pendingInputsForHost = inputHost.clearAndGetPendingInputs();
    for (Iterator<InputAttemptIdentifier> inputIter = pendingInputsForHost.iterator();
        inputIter.hasNext(); ) {
      InputAttemptIdentifier input = inputIter.next();
      // Avoid adding attempts which have already completed.
      if (completedInputSet.contains(input.getInputIdentifier())) {
        inputIter.remove();
        continue;
      }
      // Avoid adding attempts which have been marked as OBSOLETE
      if (obsoletedInputs.contains(input)) {
        inputIter.remove();
      }
    }
    // TODO NEWTEZ Maybe limit the number of inputs being given to a single
    // fetcher, especially in the case where #hosts < #fetchers
    fetcherBuilder.assignWork(
        inputHost.getHost(),
        inputHost.getPort(),
        inputHost.getSrcPhysicalIndex(),
        pendingInputsForHost);
    LOG.info(
        "Created Fetcher for host: "
            + inputHost.getHost()
            + ", with inputs: "
            + pendingInputsForHost);
    return fetcherBuilder.build();
  }

  /////////////////// Methods for InputEventHandler

  public void addKnownInput(
      String hostName,
      int port,
      InputAttemptIdentifier srcAttemptIdentifier,
      int srcPhysicalIndex) {
    String identifier = InputHost.createIdentifier(hostName, port);
    InputHost host = knownSrcHosts.get(identifier);
    if (host == null) {
      host = new InputHost(hostName, port, inputContext.getApplicationId(), srcPhysicalIndex);
      assert identifier.equals(host.getIdentifier());
      InputHost old = knownSrcHosts.putIfAbsent(identifier, host);
      if (old != null) {
        host = old;
      }
    }
    if (LOG.isDebugEnabled()) {
      LOG.debug("Adding input: " + srcAttemptIdentifier + ", to host: " + host);
    }
    host.addKnownInput(srcAttemptIdentifier);
    lock.lock();
    try {
      boolean added = pendingHosts.offer(host);
      if (!added) {
        String errorMessage = "Unable to add host: " + host.getIdentifier() + " to pending queue";
        LOG.error(errorMessage);
        throw new TezUncheckedException(errorMessage);
      }
      wakeLoop.signal();
    } finally {
      lock.unlock();
    }
  }

  public void addCompletedInputWithNoData(InputAttemptIdentifier srcAttemptIdentifier) {
    InputIdentifier inputIdentifier = srcAttemptIdentifier.getInputIdentifier();
    LOG.info("No input data exists for SrcTask: " + inputIdentifier + ". Marking as complete.");

    if (!completedInputSet.contains(inputIdentifier)) {
      synchronized (completedInputSet) {
        if (!completedInputSet.contains(inputIdentifier)) {
          registerCompletedInput(new NullFetchedInput(srcAttemptIdentifier));
        }
      }
    }

    // Awake the loop to check for termination.
    lock.lock();
    try {
      wakeLoop.signal();
    } finally {
      lock.unlock();
    }
  }

  public void addCompletedInputWithData(
      InputAttemptIdentifier srcAttemptIdentifier, FetchedInput fetchedInput) throws IOException {
    InputIdentifier inputIdentifier = srcAttemptIdentifier.getInputIdentifier();

    LOG.info("Received Data via Event: " + srcAttemptIdentifier + " to " + fetchedInput.getType());
    // Count irrespective of whether this is a copy of an already fetched input
    lock.lock();
    try {
      lastProgressTime = System.currentTimeMillis();
    } finally {
      lock.unlock();
    }

    boolean committed = false;
    if (!completedInputSet.contains(inputIdentifier)) {
      synchronized (completedInputSet) {
        if (!completedInputSet.contains(inputIdentifier)) {
          fetchedInput.commit();
          committed = true;
          registerCompletedInput(fetchedInput);
        }
      }
    }
    if (!committed) {
      fetchedInput.abort(); // If this fails, the fetcher may attempt another
      // abort.
    } else {
      lock.lock();
      try {
        // Signal the wakeLoop to check for termination.
        wakeLoop.signal();
      } finally {
        lock.unlock();
      }
    }
  }

  public synchronized void obsoleteKnownInput(InputAttemptIdentifier srcAttemptIdentifier) {
    obsoletedInputs.add(srcAttemptIdentifier);
    // TODO NEWTEZ Maybe inform the fetcher about this. For now, this is used during the initial
    // fetch list construction.
  }

  /////////////////// End of Methods for InputEventHandler
  /////////////////// Methods from FetcherCallbackHandler

  @Override
  public void fetchSucceeded(
      String host,
      InputAttemptIdentifier srcAttemptIdentifier,
      FetchedInput fetchedInput,
      long fetchedBytes,
      long decompressedLength,
      long copyDuration)
      throws IOException {
    InputIdentifier inputIdentifier = srcAttemptIdentifier.getInputIdentifier();

    LOG.info(
        "Completed fetch for attempt: " + srcAttemptIdentifier + " to " + fetchedInput.getType());

    // Count irrespective of whether this is a copy of an already fetched input
    lock.lock();
    try {
      lastProgressTime = System.currentTimeMillis();
    } finally {
      lock.unlock();
    }

    boolean committed = false;
    if (!completedInputSet.contains(inputIdentifier)) {
      synchronized (completedInputSet) {
        if (!completedInputSet.contains(inputIdentifier)) {
          fetchedInput.commit();
          committed = true;

          // Processing counters for completed and commit fetches only. Need
          // additional counters for excessive fetches - which primarily comes
          // in after speculation or retries.
          shuffledInputsCounter.increment(1);
          bytesShuffledCounter.increment(fetchedBytes);
          if (fetchedInput.getType() == Type.MEMORY) {
            bytesShuffledToMemCounter.increment(fetchedBytes);
          } else {
            bytesShuffledToDiskCounter.increment(fetchedBytes);
          }
          decompressedDataSizeCounter.increment(decompressedLength);

          registerCompletedInput(fetchedInput);
        }
      }
    }
    if (!committed) {
      fetchedInput.abort(); // If this fails, the fetcher may attempt another abort.
    } else {
      lock.lock();
      try {
        // Signal the wakeLoop to check for termination.
        wakeLoop.signal();
      } finally {
        lock.unlock();
      }
    }
    // TODO NEWTEZ Maybe inform fetchers, in case they have an alternate attempt of the same task in
    // their queue.
  }

  @Override
  public void fetchFailed(
      String host, InputAttemptIdentifier srcAttemptIdentifier, boolean connectFailed) {
    // TODO NEWTEZ. Implement logic to report fetch failures after a threshold.
    // For now, reporting immediately.
    LOG.info(
        "Fetch failed for src: "
            + srcAttemptIdentifier
            + "InputIdentifier: "
            + srcAttemptIdentifier
            + ", connectFailed: "
            + connectFailed);
    failedShufflesCounter.increment(1);
    if (srcAttemptIdentifier == null) {
      String message = "Received fetchFailure for an unknown src (null)";
      LOG.fatal(message);
      inputContext.fatalError(null, message);
    } else {
      InputReadErrorEvent readError =
          new InputReadErrorEvent(
              "Fetch failure while fetching from "
                  + TezRuntimeUtils.getTaskAttemptIdentifier(
                      inputContext.getSourceVertexName(),
                      srcAttemptIdentifier.getInputIdentifier().getInputIndex(),
                      srcAttemptIdentifier.getAttemptNumber()),
              srcAttemptIdentifier.getInputIdentifier().getInputIndex(),
              srcAttemptIdentifier.getAttemptNumber());

      List<Event> failedEvents = Lists.newArrayListWithCapacity(1);
      failedEvents.add(readError);
      inputContext.sendEvents(failedEvents);
    }
  }
  /////////////////// End of Methods from FetcherCallbackHandler

  public void shutdown() throws InterruptedException {
    if (!isShutdown.getAndSet(true)) {
      // Shut down any pending fetchers
      LOG.info(
          "Shutting down pending fetchers on source"
              + srcNameTrimmed
              + ": "
              + runningFetchers.size());
      lock.lock();
      try {
        wakeLoop.signal(); // signal the fetch-scheduler
        for (Fetcher fetcher : runningFetchers) {
          fetcher.shutdown(); // This could be parallelized.
        }
      } finally {
        lock.unlock();
      }

      if (this.schedulerExecutor != null && !this.schedulerExecutor.isShutdown()) {
        this.schedulerExecutor.shutdownNow();
      }
      if (this.fetcherExecutor != null && !this.fetcherExecutor.isShutdown()) {
        this.fetcherExecutor.shutdownNow(); // Interrupts all running fetchers.
      }
    }
    // All threads are shutdown.  It is safe to shutdown SSL factory
    if (httpConnectionParams.isSSLShuffleEnabled()) {
      HttpConnection.cleanupSSLFactory();
    }
  }

  private void registerCompletedInput(FetchedInput fetchedInput) {
    lock.lock();
    try {
      completedInputSet.add(fetchedInput.getInputAttemptIdentifier().getInputIdentifier());
      completedInputs.add(fetchedInput);
      if (!inputReadyNotificationSent.getAndSet(true)) {
        // TODO Should eventually be controlled by Inputs which are processing the data.
        inputContext.inputIsReady();
      }
      int numComplete = numCompletedInputs.incrementAndGet();
      if (numComplete == numInputs) {
        LOG.info("All inputs fetched for input vertex : " + inputContext.getSourceVertexName());
      }
    } finally {
      lock.unlock();
    }
  }

  /////////////////// Methods for walking the available inputs

  /** @return true if there is another input ready for consumption. */
  public boolean newInputAvailable() {
    FetchedInput head = completedInputs.peek();
    if (head == null || head instanceof NullFetchedInput) {
      return false;
    } else {
      return true;
    }
  }

  /** @return true if all of the required inputs have been fetched. */
  public boolean allInputsFetched() {
    lock.lock();
    try {
      return numCompletedInputs.get() == numInputs;
    } finally {
      lock.unlock();
    }
  }

  /**
   * @return the next available input, or null if there are no available inputs. This method will
   *     block if there are currently no available inputs, but more may become available.
   */
  public FetchedInput getNextInput() throws InterruptedException {
    FetchedInput input = null;
    do {
      // Check for no additional inputs
      lock.lock();
      try {
        input = completedInputs.peek();
        if (input == null && allInputsFetched()) {
          break;
        }
      } finally {
        lock.unlock();
      }
      input = completedInputs.take(); // block
    } while (input instanceof NullFetchedInput);
    return input;
  }
  /////////////////// End of methods for walking the available inputs

  /**
   * Fake input that is added to the completed input list in case an input does not have any data.
   */
  private class NullFetchedInput extends FetchedInput {

    public NullFetchedInput(InputAttemptIdentifier inputAttemptIdentifier) {
      super(Type.MEMORY, -1, -1, inputAttemptIdentifier, null);
    }

    @Override
    public OutputStream getOutputStream() throws IOException {
      throw new UnsupportedOperationException("Not supported for NullFetchedInput");
    }

    @Override
    public InputStream getInputStream() throws IOException {
      throw new UnsupportedOperationException("Not supported for NullFetchedInput");
    }

    @Override
    public void commit() throws IOException {
      throw new UnsupportedOperationException("Not supported for NullFetchedInput");
    }

    @Override
    public void abort() throws IOException {
      throw new UnsupportedOperationException("Not supported for NullFetchedInput");
    }

    @Override
    public void free() {
      throw new UnsupportedOperationException("Not supported for NullFetchedInput");
    }
  }

  private class SchedulerFutureCallback implements FutureCallback<Void> {

    @Override
    public void onSuccess(Void result) {
      LOG.info("Scheduler thread completed");
    }

    @Override
    public void onFailure(Throwable t) {
      if (isShutdown.get()) {
        LOG.info("Already shutdown. Ignoring error: " + t);
      } else {
        LOG.error("Scheduler failed with error: ", t);
        inputContext.fatalError(t, "Shuffle Scheduler Failed");
      }
    }
  }

  private class FetchFutureCallback implements FutureCallback<FetchResult> {

    private final Fetcher fetcher;

    public FetchFutureCallback(Fetcher fetcher) {
      this.fetcher = fetcher;
    }

    private void doBookKeepingForFetcherComplete() {
      lock.lock();
      try {
        runningFetchers.remove(fetcher);
        wakeLoop.signal();
      } finally {
        lock.unlock();
      }
    }

    @Override
    public void onSuccess(FetchResult result) {
      fetcher.shutdown();
      if (isShutdown.get()) {
        LOG.info("Already shutdown. Ignoring event from fetcher");
      } else {
        Iterable<InputAttemptIdentifier> pendingInputs = result.getPendingInputs();
        if (pendingInputs != null && pendingInputs.iterator().hasNext()) {
          InputHost inputHost =
              knownSrcHosts.get(InputHost.createIdentifier(result.getHost(), result.getPort()));
          assert inputHost != null;
          for (InputAttemptIdentifier input : pendingInputs) {
            inputHost.addKnownInput(input);
          }
          pendingHosts.add(inputHost);
        }
        doBookKeepingForFetcherComplete();
      }
    }

    @Override
    public void onFailure(Throwable t) {
      // Unsuccessful - the fetcher may not have shutdown correctly. Try shutting it down.
      fetcher.shutdown();
      if (isShutdown.get()) {
        LOG.info("Already shutdown. Ignoring error from fetcher: " + t);
      } else {
        LOG.error("Fetcher failed with error: ", t);
        shuffleError = t;
        inputContext.fatalError(t, "Fetch failed");
        doBookKeepingForFetcherComplete();
      }
    }
  }
}
Example No. 25
/** @author tolgam */
public class Optimizer extends Observable implements Runnable {
  /** Population size */
  private static final int POPULATION_SIZE = 5;

  /** Maximum generation to wait before finding an optima */
  private static final int MAXIMUM_GENERATION = 40;

  /** Logger */
  protected final Logger logger = LoggerFactory.getLogger(Optimizer.class);

  /** Population */
  protected final SortedSet<Solution> population = new TreeSet<Solution>();

  /** Hall of fame to put all the results found */
  private final Set<Triple> blackListedTriples = new HashSet<Triple>();

  /** Mutation operator used to generate new populations */
  private final Generate generateOp;

  /** Evaluation operator to evaluate all the candidates */
  private final Evaluate evaluateOp;

  /** Counter for statistics about the number of evaluations */
  private int evaluationsCounter = 0;

  private final Request request;

  /** Activity control */
  private boolean isPaused = false;

  private boolean isTerminated = false;
  private ReentrantLock pauseLock = new ReentrantLock();
  private Condition unpaused = pauseLock.newCondition();

  // Generation counter
  private int generation = 0;

  private DataLayer datalayer;

  /**
   * Optimizer
   *
   * @param datalayer the data layer used by the generation operator
   * @param request the request to optimize
   * @param executor the executor service used by the evaluation operator
   */
  public Optimizer(
      final DataLayer datalayer, final Request request, final ExecutorService executor) {
    // Save a pointer to the request and the datalayer
    this.request = request;
    this.datalayer = datalayer;

    // Create the operators
    this.generateOp = new Generate(datalayer, request);
    this.evaluateOp = new Evaluate(request, blackListedTriples, executor);
  }

  /*
   * (non-Javadoc)
   *
   * @see java.lang.Runnable#run()
   */
  public void run() {
    // Do not run something terminated
    if (isTerminated()) return;

    logger.info("Run optimizer");
    generation = 0;
    while (!isTerminated()) {
      pauseLock.lock();
      try {
        while (isPaused) unpaused.await();
        if (isTerminated) return;
      } catch (InterruptedException ie) {
        // Finish
        return;
      } finally {
        pauseLock.unlock();
      }

      //
      // Initialise the population with a dummy individual
      //
      if (population.isEmpty()) {
        Solution solution = new Solution();
        for (Node_Variable variable : request.variables())
          solution.add(new Binding(variable, Node.NULL));
        population.add(solution);
      }

      // Increment the generation counter
      ++generation;

      //
      // Generate a new set of offspring and copy the parents into it
      // first
      //
      // logger.info("Generate");
      Set<Solution> newPopulation = new HashSet<Solution>();
      newPopulation.addAll(population); // Add the parents
      generateOp.createPopulation(population, newPopulation);

      //
      // Evaluate all of them
      //
      // logger.info("Evaluate " + newPopulation.size());
      // Counts the number of different solutions
      evaluationsCounter += newPopulation.size() - population.size();
      evaluateOp.evaluatePopulation(newPopulation);

      /*
       * String buffer = "Fitnesses "; for (Solution s : newPopulation)
       * buffer += s.getFitness() + " "; logger.info(buffer);
       */

      // Provide feed back to the generation operator
      generateOp.updateProviderRewards(newPopulation);

      //
      // Get rid of the previous population and insert the kids
      //
      // logger.info("Cut");
      population.clear();
      population.addAll(newPopulation);
      while (population.size() > POPULATION_SIZE) population.remove(population.first());

      //
      // Track for optimality
      //
      double topFitness = population.last().getFitness();
      for (Solution s : population) {
        // Increment age
        if (s.getFitness() != topFitness) s.resetAge();
        s.incrementAge();

        // Check optimality
        s.setOptimal(false);
        if (s.getAge() >= MAXIMUM_GENERATION && s.getFitness() > 0) s.setOptimal(true);
        if (s.getFitness() == 1.0d) s.setOptimal(true);

        // If the solution is optimal add its (valid!) triples to the
        // black
        // list
        if (s.isOptimal()) {
          synchronized (blackListedTriples) {
            blackListedTriples.addAll(request.getTripleSet(s));
          }
        }

        // Print solution
        // logger.info(s.toString());
      }

      logger.info("Generation " + generation + ", best fitness=" + topFitness);
      for (Solution s : population) logger.info(s.toString());

      //
      // Notify observers that a loop has been done
      //
      setChanged();
      notifyObservers(population);

      // for (Solution s : population)
      //	if (s.isOptimal())
      //		this.terminate();

      //
      // Wait a bit for the data layer
      //
      datalayer.waitForLatencyBuffer();

      //
      // Remove all optimum individuals from the population
      //
      List<Solution> toRemove = new ArrayList<Solution>();
      for (Solution s : population) if (s.isOptimal()) toRemove.add(s);
      population.removeAll(toRemove);
    }
  }

  /** Stop the execution of the optimizer */
  public void terminate() {
    logger.info("Terminate optimizer");
    pauseLock.lock();
    try {
      // Set the status to true
      isTerminated = true;
    } finally {
      pauseLock.unlock();
    }
  }

  /** @return true if the optimizer is stopped */
  public boolean isTerminated() {
    boolean res;
    pauseLock.lock();
    try {
      res = isTerminated;
    } finally {
      pauseLock.unlock();
    }
    return res;
  }

  /** Pause the algorithm */
  public void pause() {
    logger.info("Pause optimizer " + this);
    pauseLock.lock();
    try {
      isPaused = true;
    } finally {
      pauseLock.unlock();
    }
  }

  /** @return true if the search algorithm is paused */
  public boolean isPaused() {
    boolean res;
    pauseLock.lock();
    try {
      res = isPaused;
    } finally {
      pauseLock.unlock();
    }
    return res;
  }

  /** Continue the execution */
  public void resume() {
    logger.info("Resume optimizer " + this);
    pauseLock.lock();
    try {
      isPaused = false;
      unpaused.signalAll();
    } finally {
      pauseLock.unlock();
    }
  }

  /** @return the evaluations counter */
  public int getEvaluationsCounter() {
    return evaluationsCounter;
  }

  /** @return the generations counter */
  public int getGenerationsCounter() {
    return generation;
  }
}
Example No. 26
@SuppressWarnings({"unchecked", "rawtypes"})
public class DefaultSorter extends ExternalSorter implements IndexedSortable {

  private static final Log LOG = LogFactory.getLog(DefaultSorter.class);

  // TODO NEWTEZ Progress reporting to Tez framework. (making progress vs %complete)

  /** The size of each record in the index file for the map-outputs. */
  public static final int MAP_OUTPUT_INDEX_RECORD_LENGTH = 24;

  private static final int APPROX_HEADER_LENGTH = 150;

  // k/v accounting
  private final IntBuffer kvmeta; // metadata overlay on backing store
  int kvstart; // marks origin of spill metadata
  int kvend; // marks end of spill metadata
  int kvindex; // marks end of fully serialized records

  int equator; // marks origin of meta/serialization
  int bufstart; // marks beginning of spill
  int bufend; // marks beginning of collectable
  int bufmark; // marks end of record
  int bufindex; // marks end of collected
  int bufvoid; // marks the point where we should stop
  // reading at the end of the buffer

  private final byte[] kvbuffer; // main output buffer
  private final byte[] b0 = new byte[0];

  protected static final int VALSTART = 0; // val offset in acct
  protected static final int KEYSTART = 1; // key offset in acct
  protected static final int PARTITION = 2; // partition offset in acct
  protected static final int VALLEN = 3; // length of value
  protected static final int NMETA = 4; // num meta ints
  protected static final int METASIZE = NMETA * 4; // size in bytes

  // spill accounting
  final int maxRec;
  final int softLimit;
  boolean spillInProgress;
  int bufferRemaining;
  volatile Throwable sortSpillException = null;

  int numSpills = 0;
  final int minSpillsForCombine;
  final ReentrantLock spillLock = new ReentrantLock();
  final Condition spillDone = spillLock.newCondition();
  final Condition spillReady = spillLock.newCondition();
  final BlockingBuffer bb = new BlockingBuffer();
  volatile boolean spillThreadRunning = false;
  final SpillThread spillThread = new SpillThread();

  final ArrayList<TezSpillRecord> indexCacheList = new ArrayList<TezSpillRecord>();
  private final int indexCacheMemoryLimit;
  private int totalIndexCacheMemory;

  public DefaultSorter(
      OutputContext outputContext, Configuration conf, int numOutputs, long initialMemoryAvailable)
      throws IOException {
    super(outputContext, conf, numOutputs, initialMemoryAvailable);
    // sanity checks
    final float spillper =
        this.conf.getFloat(
            TezRuntimeConfiguration.TEZ_RUNTIME_SORT_SPILL_PERCENT,
            TezRuntimeConfiguration.TEZ_RUNTIME_SORT_SPILL_PERCENT_DEFAULT);
    final int sortmb = this.availableMemoryMb;
    if (spillper > (float) 1.0 || spillper <= (float) 0.0) {
      throw new IOException(
          "Invalid \""
              + TezRuntimeConfiguration.TEZ_RUNTIME_SORT_SPILL_PERCENT
              + "\": "
              + spillper);
    }
    if ((sortmb & 0x7FF) != sortmb) {
      throw new IOException(
          "Invalid \"" + TezRuntimeConfiguration.TEZ_RUNTIME_IO_SORT_MB + "\": " + sortmb);
    }

    indexCacheMemoryLimit =
        this.conf.getInt(
            TezRuntimeConfiguration.TEZ_RUNTIME_INDEX_CACHE_MEMORY_LIMIT_BYTES,
            TezRuntimeConfiguration.TEZ_RUNTIME_INDEX_CACHE_MEMORY_LIMIT_BYTES_DEFAULT);

    // buffers and accounting
    int maxMemUsage = sortmb << 20;
    maxMemUsage -= maxMemUsage % METASIZE;
    kvbuffer = new byte[maxMemUsage];
    bufvoid = kvbuffer.length;
    kvmeta = ByteBuffer.wrap(kvbuffer).order(ByteOrder.nativeOrder()).asIntBuffer();
    setEquator(0);
    bufstart = bufend = bufindex = equator;
    kvstart = kvend = kvindex;

    maxRec = kvmeta.capacity() / NMETA;
    softLimit = (int) (kvbuffer.length * spillper);
    bufferRemaining = softLimit;
    if (LOG.isInfoEnabled()) {
      LOG.info(TezRuntimeConfiguration.TEZ_RUNTIME_IO_SORT_MB + ": " + sortmb);
      LOG.info("soft limit at " + softLimit);
      LOG.info("bufstart = " + bufstart + "; bufvoid = " + bufvoid);
      LOG.info("kvstart = " + kvstart + "; length = " + maxRec);
    }

    // k/v serialization
    valSerializer.open(bb);
    keySerializer.open(bb);

    spillInProgress = false;
    minSpillsForCombine =
        this.conf.getInt(TezRuntimeConfiguration.TEZ_RUNTIME_COMBINE_MIN_SPILLS, 3);
    spillThread.setDaemon(true);
    spillThread.setName(
        "SpillThread ["
            + TezUtilsInternal.cleanVertexName(outputContext.getDestinationVertexName() + "]"));
    spillLock.lock();
    try {
      spillThread.start();
      while (!spillThreadRunning) {
        spillDone.await();
      }
    } catch (InterruptedException e) {
      throw new IOException("Spill thread failed to initialize", e);
    } finally {
      spillLock.unlock();
    }
    if (sortSpillException != null) {
      throw new IOException("Spill thread failed to initialize", sortSpillException);
    }
  }

  @Override
  public void write(Object key, Object value) throws IOException {
    collect(key, value, partitioner.getPartition(key, value, partitions));
  }

  /**
   * Serialize the key, value to intermediate storage. When this method returns, kvindex must refer
   * to sufficient unused storage to store one METADATA.
   */
  synchronized void collect(Object key, Object value, final int partition) throws IOException {

    if (key.getClass() != keyClass) {
      throw new IOException(
          "Type mismatch in key from map: expected "
              + keyClass.getName()
              + ", received "
              + key.getClass().getName());
    }
    if (value.getClass() != valClass) {
      throw new IOException(
          "Type mismatch in value from map: expected "
              + valClass.getName()
              + ", received "
              + value.getClass().getName());
    }
    if (partition < 0 || partition >= partitions) {
      throw new IOException(
          "Illegal partition for "
              + key
              + " ("
              + partition
              + ")"
              + ", TotalPartitions: "
              + partitions);
    }
    checkSpillException();
    bufferRemaining -= METASIZE;
    if (bufferRemaining <= 0) {
      // start spill if the thread is not running and the soft limit has been
      // reached
      spillLock.lock();
      try {
        do {
          if (!spillInProgress) {
            final int kvbidx = 4 * kvindex;
            final int kvbend = 4 * kvend;
            // serialized, unspilled bytes always lie between kvindex and
            // bufindex, crossing the equator. Note that any void space
            // created by a reset must be included in "used" bytes
            final int bUsed = distanceTo(kvbidx, bufindex);
            final boolean bufsoftlimit = bUsed >= softLimit;
            if ((kvbend + METASIZE) % kvbuffer.length != equator - (equator % METASIZE)) {
              // spill finished, reclaim space
              resetSpill();
              bufferRemaining =
                  Math.min(distanceTo(bufindex, kvbidx) - 2 * METASIZE, softLimit - bUsed)
                      - METASIZE;
              continue;
            } else if (bufsoftlimit && kvindex != kvend) {
              // spill records, if any collected; check latter, as it may
              // be possible for metadata alignment to hit spill pcnt
              startSpill();
              final int avgRec =
                  (int) (mapOutputByteCounter.getValue() / mapOutputRecordCounter.getValue());
              // leave at least half the split buffer for serialization data
              // ensure that kvindex >= bufindex
              final int distkvi = distanceTo(bufindex, kvbidx);
              final int newPos =
                  (bufindex
                          + Math.max(
                              2 * METASIZE - 1,
                              Math.min(distkvi / 2, distkvi / (METASIZE + avgRec) * METASIZE)))
                      % kvbuffer.length;
              setEquator(newPos);
              bufmark = bufindex = newPos;
              final int serBound = 4 * kvend;
              // bytes remaining before the lock must be held and limits
              // checked is the minimum of three arcs: the metadata space, the
              // serialization space, and the soft limit
              bufferRemaining =
                  Math.min(
                          // metadata max
                          distanceTo(bufend, newPos),
                          Math.min(
                              // serialization max
                              distanceTo(newPos, serBound),
                              // soft limit
                              softLimit))
                      - 2 * METASIZE;
            }
          }
        } while (false);
      } finally {
        spillLock.unlock();
      }
    }

    try {
      // serialize key bytes into buffer
      int keystart = bufindex;
      keySerializer.serialize(key);
      if (bufindex < keystart) {
        // wrapped the key; must make contiguous
        bb.shiftBufferedKey();
        keystart = 0;
      }
      // serialize value bytes into buffer
      final int valstart = bufindex;
      valSerializer.serialize(value);
      // It's possible for records to have zero length, i.e. the serializer
      // will perform no writes. To ensure that the boundary conditions are
      // checked and that the kvindex invariant is maintained, perform a
      // zero-length write into the buffer. The logic monitoring this could be
      // moved into collect, but this is cleaner and inexpensive. For now, it
      // is acceptable.
      bb.write(b0, 0, 0);

      // the record must be marked after the preceding write, as the metadata
      // for this record are not yet written
      int valend = bb.markRecord();

      mapOutputRecordCounter.increment(1);
      mapOutputByteCounter.increment(distanceTo(keystart, valend, bufvoid));

      // write accounting info
      kvmeta.put(kvindex + PARTITION, partition);
      kvmeta.put(kvindex + KEYSTART, keystart);
      kvmeta.put(kvindex + VALSTART, valstart);
      kvmeta.put(kvindex + VALLEN, distanceTo(valstart, valend));
      // advance kvindex
      kvindex = (int) (((long) kvindex - NMETA + kvmeta.capacity()) % kvmeta.capacity());
    } catch (MapBufferTooSmallException e) {
      LOG.info("Record too large for in-memory buffer: " + e.getMessage());
      spillSingleRecord(key, value, partition);
      mapOutputRecordCounter.increment(1);
      return;
    }
  }

  /**
   * Set the point from which meta and serialization data expand. The meta indices are aligned with
   * the buffer, so metadata never spans the ends of the circular buffer.
   */
  private void setEquator(int pos) {
    equator = pos;
    // set index prior to first entry, aligned at meta boundary
    final int aligned = pos - (pos % METASIZE);
    // Cast one of the operands to long to avoid integer overflow
    kvindex = (int) (((long) aligned - METASIZE + kvbuffer.length) % kvbuffer.length) / 4;
    if (LOG.isInfoEnabled()) {
      LOG.info("(EQUATOR) " + pos + " kvi " + kvindex + "(" + (kvindex * 4) + ")");
    }
  }

  /**
   * The spill is complete, so set the buffer and meta indices to be equal to the new equator to
   * free space for continuing collection. Note that when kvindex == kvend == kvstart, the buffer is
   * empty.
   */
  private void resetSpill() {
    final int e = equator;
    bufstart = bufend = e;
    final int aligned = e - (e % METASIZE);
    // set start/end to point to first meta record
    // Cast one of the operands to long to avoid integer overflow
    kvstart = kvend = (int) (((long) aligned - METASIZE + kvbuffer.length) % kvbuffer.length) / 4;
    if (LOG.isInfoEnabled()) {
      LOG.info(
          "(RESET) equator "
              + e
              + " kv "
              + kvstart
              + "("
              + (kvstart * 4)
              + ")"
              + " kvi "
              + kvindex
              + "("
              + (kvindex * 4)
              + ")");
    }
  }

  /**
   * Compute the distance in bytes between two indices in the serialization buffer.
   *
   * @see #distanceTo(int,int,int)
   */
  final int distanceTo(final int i, final int j) {
    return distanceTo(i, j, kvbuffer.length);
  }

  /** Compute the distance between two indices in the circular buffer given the max distance. */
  int distanceTo(final int i, final int j, final int mod) {
    return i <= j ? j - i : mod - i + j;
  }

  /** For the given meta position, return the offset into the int-sized kvmeta buffer. */
  int offsetFor(int metapos) {
    return (metapos % maxRec) * NMETA;
  }

  /**
   * Compare logical range, such that i, j MOD offset capacity. Compare by partition, then by key.
   *
   * @see IndexedSortable#compare
   */
  public int compare(final int mi, final int mj) {
    final int kvi = offsetFor(mi);
    final int kvj = offsetFor(mj);
    final int kvip = kvmeta.get(kvi + PARTITION);
    final int kvjp = kvmeta.get(kvj + PARTITION);
    // sort by partition
    if (kvip != kvjp) {
      return kvip - kvjp;
    }
    // sort by key
    return comparator.compare(
        kvbuffer,
        kvmeta.get(kvi + KEYSTART),
        kvmeta.get(kvi + VALSTART) - kvmeta.get(kvi + KEYSTART),
        kvbuffer,
        kvmeta.get(kvj + KEYSTART),
        kvmeta.get(kvj + VALSTART) - kvmeta.get(kvj + KEYSTART));
  }

  final byte[] META_BUFFER_TMP = new byte[METASIZE];

  /**
   * Swap metadata for items i,j
   *
   * @see IndexedSortable#swap
   */
  public void swap(final int mi, final int mj) {
    int iOff = (mi % maxRec) * METASIZE;
    int jOff = (mj % maxRec) * METASIZE;
    System.arraycopy(kvbuffer, iOff, META_BUFFER_TMP, 0, METASIZE);
    System.arraycopy(kvbuffer, jOff, kvbuffer, iOff, METASIZE);
    System.arraycopy(META_BUFFER_TMP, 0, kvbuffer, jOff, METASIZE);
  }

  /** Inner class managing the spill of serialized records to disk. */
  protected class BlockingBuffer extends DataOutputStream {

    public BlockingBuffer() {
      super(new Buffer());
    }

    /**
     * Mark end of record. Note that this is required if the buffer is to cut the spill in the
     * proper place.
     */
    public int markRecord() {
      bufmark = bufindex;
      return bufindex;
    }

    /**
     * Set position from last mark to end of writable buffer, then rewrite the data between last
     * mark and kvindex. This handles a special case where the key wraps around the buffer. If the
     * key is to be passed to a RawComparator, then it must be contiguous in the buffer. This
     * recopies the data in the buffer back into itself, but starting at the beginning of the
     * buffer. Note that this method should <b>only</b> be called immediately after detecting this
     * condition. To call it at any other time is undefined and would likely result in data loss or
     * corruption.
     *
     * @see #markRecord()
     */
    protected void shiftBufferedKey() throws IOException {
      // spillLock unnecessary; both kvend and kvindex are current
      int headbytelen = bufvoid - bufmark;
      bufvoid = bufmark;
      final int kvbidx = 4 * kvindex;
      final int kvbend = 4 * kvend;
      final int avail = Math.min(distanceTo(0, kvbidx), distanceTo(0, kvbend));
      if (bufindex + headbytelen < avail) {
        System.arraycopy(kvbuffer, 0, kvbuffer, headbytelen, bufindex);
        System.arraycopy(kvbuffer, bufvoid, kvbuffer, 0, headbytelen);
        bufindex += headbytelen;
        bufferRemaining -= kvbuffer.length - bufvoid;
      } else {
        byte[] keytmp = new byte[bufindex];
        System.arraycopy(kvbuffer, 0, keytmp, 0, bufindex);
        bufindex = 0;
        out.write(kvbuffer, bufmark, headbytelen);
        out.write(keytmp);
      }
    }
  }

  public class Buffer extends OutputStream {
    private final byte[] scratch = new byte[1];

    @Override
    public void write(int v) throws IOException {
      scratch[0] = (byte) v;
      write(scratch, 0, 1);
    }

    /**
     * Attempt to write a sequence of bytes to the collection buffer. This method will block if the
     * spill thread is running and it cannot write.
     *
     * @throws MapBufferTooSmallException if record is too large to deserialize into the collection
     *     buffer.
     */
    @Override
    public void write(byte[] b, int off, int len) throws IOException {
      // must always verify the invariant that at least METASIZE bytes are
      // available beyond kvindex, even when len == 0
      bufferRemaining -= len;
      if (bufferRemaining <= 0) {
        // writing these bytes could exhaust available buffer space or fill
        // the buffer to soft limit. check if spill or blocking are necessary
        boolean blockwrite = false;
        spillLock.lock();
        try {
          do {
            checkSpillException();

            final int kvbidx = 4 * kvindex;
            final int kvbend = 4 * kvend;
            // ser distance to key index
            final int distkvi = distanceTo(bufindex, kvbidx);
            // ser distance to spill end index
            final int distkve = distanceTo(bufindex, kvbend);

            // if kvindex is closer than kvend, then a spill is neither in
            // progress nor complete and reset since the lock was held. The
            // write should block only if there is insufficient space to
            // complete the current write, write the metadata for this record,
            // and write the metadata for the next record. If kvend is closer,
            // then the write should block if there is too little space for
            // either the metadata or the current write. Note that collect
            // ensures its metadata requirement with a zero-length write
            blockwrite =
                distkvi <= distkve
                    ? distkvi <= len + 2 * METASIZE
                    : distkve <= len || distanceTo(bufend, kvbidx) < 2 * METASIZE;

            if (!spillInProgress) {
              if (blockwrite) {
                if ((kvbend + METASIZE) % kvbuffer.length != equator - (equator % METASIZE)) {
                  // spill finished, reclaim space
                  // need to use meta exclusively; zero-len rec & 100% spill
                  // pcnt would fail
                  resetSpill(); // resetSpill doesn't move bufindex, kvindex
                  bufferRemaining =
                      Math.min(distkvi - 2 * METASIZE, softLimit - distanceTo(kvbidx, bufindex))
                          - len;
                  continue;
                }
                // we have records we can spill; only spill if blocked
                if (kvindex != kvend) {
                  startSpill();
                  // Blocked on this write, waiting for the spill just
                  // initiated to finish. Instead of repositioning the marker
                  // and copying the partial record, we set the record start
                  // to be the new equator
                  setEquator(bufmark);
                } else {
                  // We have no buffered records, and this record is too large
                  // to write into kvbuffer. We must spill it directly from
                  // collect
                  final int size = distanceTo(bufstart, bufindex) + len;
                  setEquator(0);
                  bufstart = bufend = bufindex = equator;
                  kvstart = kvend = kvindex;
                  bufvoid = kvbuffer.length;
                  throw new MapBufferTooSmallException(size + " bytes");
                }
              }
            }

            if (blockwrite) {
              // wait for spill
              try {
                while (spillInProgress) {
                  spillDone.await();
                }
              } catch (InterruptedException e) {
                throw new IOException("Buffer interrupted while waiting for the writer", e);
              }
            }
          } while (blockwrite);
        } finally {
          spillLock.unlock();
        }
      }
      // here, we know that we have sufficient space to write
      if (bufindex + len > bufvoid) {
        final int gaplen = bufvoid - bufindex;
        System.arraycopy(b, off, kvbuffer, bufindex, gaplen);
        len -= gaplen;
        off += gaplen;
        bufindex = 0;
      }
      System.arraycopy(b, off, kvbuffer, bufindex, len);
      bufindex += len;
    }
  }

  @Override
  public void flush() throws IOException {
    LOG.info("Starting flush of map output");
    spillLock.lock();
    try {
      while (spillInProgress) {
        spillDone.await();
      }
      checkSpillException();

      final int kvbend = 4 * kvend;
      if ((kvbend + METASIZE) % kvbuffer.length != equator - (equator % METASIZE)) {
        // spill finished
        resetSpill();
      }
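      // If any records were collected since the last spill, run one final
      // sort-and-spill before the spill thread is shut down.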
      if (kvindex != kvend) {
        kvend = (kvindex + NMETA) % kvmeta.capacity();
        bufend = bufmark;
        if (LOG.isInfoEnabled()) {
          LOG.info("Sorting & Spilling map output");
          LOG.info("bufstart = " + bufstart + "; bufend = " + bufmark + "; bufvoid = " + bufvoid);
          LOG.info(
              "kvstart = "
                  + kvstart
                  + "("
                  + (kvstart * 4)
                  + "); kvend = "
                  + kvend
                  + "("
                  + (kvend * 4)
                  + "); length = "
                  + (distanceTo(kvend, kvstart, kvmeta.capacity()) + 1)
                  + "/"
                  + maxRec);
        }
        sortAndSpill();
      }
    } catch (InterruptedException e) {
      throw new IOException("Interrupted while waiting for the writer", e);
    } finally {
      spillLock.unlock();
    }
    assert !spillLock.isHeldByCurrentThread();
    // shut down spill thread and wait for it to exit. Since the preceding
    // ensures that it is finished with its work (and sortAndSpill did not
    // throw), we elect to use an interrupt instead of setting a flag.
    // Spilling simultaneously from this thread while the spill thread
    // finishes its work might be both a useful way to extend this and also
    // sufficient motivation for the latter approach.
    try {
      spillThread.interrupt();
      spillThread.join();
    } catch (InterruptedException e) {
      throw new IOException("Spill failed", e);
    }
    // release sort buffer before the merge
    // FIXME
    // kvbuffer = null;
    mergeParts();
    Path outputPath = mapOutputFile.getOutputFile();
    fileOutputByteCounter.increment(rfs.getFileStatus(outputPath).getLen());
  }

  @Override
  public void close() throws IOException {}

  protected class SpillThread extends Thread {

    @Override
    public void run() {
      spillLock.lock();
      spillThreadRunning = true;
      try {
        while (true) {
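          // Signal any thread waiting on spillDone (constructor startup, flush, or a
          // blocked writer), then sleep on spillReady until startSpill() requests work.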
          spillDone.signal();
          while (!spillInProgress) {
            spillReady.await();
          }
          try {
            spillLock.unlock();
            sortAndSpill();
          } catch (Throwable t) {
            LOG.warn("Got an exception in sortAndSpill", t);
            sortSpillException = t;
          } finally {
            spillLock.lock();
            if (bufend < bufstart) {
              bufvoid = kvbuffer.length;
            }
            kvstart = kvend;
            bufstart = bufend;
            spillInProgress = false;
          }
        }
      } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
      } finally {
        spillLock.unlock();
        spillThreadRunning = false;
      }
    }
  }

  private void checkSpillException() throws IOException {
    final Throwable lspillException = sortSpillException;
    if (lspillException != null) {
      if (lspillException instanceof Error) {
        final String logMsg =
            "Task "
                + outputContext.getUniqueIdentifier()
                + " failed : "
                + ExceptionUtils.getStackTrace(lspillException);
        outputContext.fatalError(lspillException, logMsg);
      }
      throw new IOException("Spill failed", lspillException);
    }
  }

  private void startSpill() {
    assert !spillInProgress;
    kvend = (kvindex + NMETA) % kvmeta.capacity();
    bufend = bufmark;
    spillInProgress = true;
    if (LOG.isInfoEnabled()) {
      LOG.info("Spilling map output");
      LOG.info("bufstart = " + bufstart + "; bufend = " + bufmark + "; bufvoid = " + bufvoid);
      LOG.info(
          "kvstart = "
              + kvstart
              + "("
              + (kvstart * 4)
              + "); kvend = "
              + kvend
              + "("
              + (kvend * 4)
              + "); length = "
              + (distanceTo(kvend, kvstart, kvmeta.capacity()) + 1)
              + "/"
              + maxRec);
    }
    spillReady.signal();
  }

  int getMetaStart() {
    return kvend / NMETA;
  }

  int getMetaEnd() {
    return 1
        + // kvend is a valid record
        (kvstart >= kvend ? kvstart : kvmeta.capacity() + kvstart) / NMETA;
  }

  protected void sortAndSpill() throws IOException, InterruptedException {
    final int mstart = getMetaStart();
    final int mend = getMetaEnd();
    sorter.sort(this, mstart, mend, nullProgressable);
    spill(mstart, mend);
  }

  protected void spill(int mstart, int mend) throws IOException, InterruptedException {

    // approximate the length of the output file to be the length of the
    // buffer + header lengths for the partitions
    final long size =
        (bufend >= bufstart ? bufend - bufstart : (bufvoid - bufend) + bufstart)
            + partitions * APPROX_HEADER_LENGTH;
    FSDataOutputStream out = null;
    try {
      // create spill file
      final TezSpillRecord spillRec = new TezSpillRecord(partitions);
      final Path filename = mapOutputFile.getSpillFileForWrite(numSpills, size);
      out = rfs.create(filename);

      int spindex = mstart;
      final InMemValBytes value = createInMemValBytes();
      for (int i = 0; i < partitions; ++i) {
        IFile.Writer writer = null;
        try {
          long segmentStart = out.getPos();
          writer = new Writer(conf, out, keyClass, valClass, codec, spilledRecordsCounter, null);
          if (combiner == null) {
            // spill directly
            DataInputBuffer key = new DataInputBuffer();
            while (spindex < mend && kvmeta.get(offsetFor(spindex) + PARTITION) == i) {
              final int kvoff = offsetFor(spindex);
              int keystart = kvmeta.get(kvoff + KEYSTART);
              int valstart = kvmeta.get(kvoff + VALSTART);
              key.reset(kvbuffer, keystart, valstart - keystart);
              getVBytesForOffset(kvoff, value);
              writer.append(key, value);
              ++spindex;
            }
          } else {
            int spstart = spindex;
            while (spindex < mend && kvmeta.get(offsetFor(spindex) + PARTITION) == i) {
              ++spindex;
            }
            // Note: we would like to avoid the combiner if we've fewer
            // than some threshold of records for a partition
            if (spstart != spindex) {
              TezRawKeyValueIterator kvIter = new MRResultIterator(spstart, spindex);
              if (LOG.isDebugEnabled()) {
                LOG.debug("Running combine processor");
              }
              runCombineProcessor(kvIter, writer);
            }
          }

          // close the writer
          writer.close();
          if (numSpills > 0) {
            additionalSpillBytesWritten.increment(writer.getCompressedLength());
            numAdditionalSpills.increment(1);
            // Reset the value; it will be set during the final merge.
            outputBytesWithOverheadCounter.setValue(0);
          } else {
            // Set this up for the first write only. Subsequent ones will be handled in the final
            // merge.
            outputBytesWithOverheadCounter.increment(writer.getRawLength());
          }
          // record offsets
          final TezIndexRecord rec =
              new TezIndexRecord(segmentStart, writer.getRawLength(), writer.getCompressedLength());
          spillRec.putIndex(rec, i);

          writer = null;
        } finally {
          if (null != writer) writer.close();
        }
      }

      if (totalIndexCacheMemory >= indexCacheMemoryLimit) {
        // create spill index file
        Path indexFilename =
            mapOutputFile.getSpillIndexFileForWrite(
                numSpills, partitions * MAP_OUTPUT_INDEX_RECORD_LENGTH);
        spillRec.writeToFile(indexFilename, conf);
      } else {
        indexCacheList.add(spillRec);
        totalIndexCacheMemory += spillRec.size() * MAP_OUTPUT_INDEX_RECORD_LENGTH;
      }
      LOG.info("Finished spill " + numSpills);
      ++numSpills;
    } finally {
      if (out != null) out.close();
    }
  }

  /**
   * Handles the degenerate case where serialization fails to fit in the in-memory buffer, so we
   * must spill the record from collect directly to a spill file. Consider this "losing".
   */
  private void spillSingleRecord(final Object key, final Object value, int partition)
      throws IOException {
    long size = kvbuffer.length + partitions * APPROX_HEADER_LENGTH;
    FSDataOutputStream out = null;
    try {
      // create spill file
      final TezSpillRecord spillRec = new TezSpillRecord(partitions);
      final Path filename = mapOutputFile.getSpillFileForWrite(numSpills, size);
      out = rfs.create(filename);

      // we don't run the combiner for a single record
      for (int i = 0; i < partitions; ++i) {
        IFile.Writer writer = null;
        try {
          long segmentStart = out.getPos();
          // Create a new codec, don't care!
          writer =
              new IFile.Writer(conf, out, keyClass, valClass, codec, spilledRecordsCounter, null);

          if (i == partition) {
            final long recordStart = out.getPos();
            writer.append(key, value);
            // Note that our map byte count will not be accurate with
            // compression
            mapOutputByteCounter.increment(out.getPos() - recordStart);
          }
          writer.close();

          if (numSpills > 0) {
            additionalSpillBytesWritten.increment(writer.getCompressedLength());
            numAdditionalSpills.increment(1);
            outputBytesWithOverheadCounter.setValue(0);
          } else {
            // Set this up for the first write only. Subsequent ones will be handled in the final
            // merge.
            outputBytesWithOverheadCounter.increment(writer.getRawLength());
          }

          // record offsets
          TezIndexRecord rec =
              new TezIndexRecord(segmentStart, writer.getRawLength(), writer.getCompressedLength());
          spillRec.putIndex(rec, i);

          writer = null;
        } catch (IOException e) {
          if (null != writer) writer.close();
          throw e;
        }
      }
      if (totalIndexCacheMemory >= indexCacheMemoryLimit) {
        // create spill index file
        Path indexFilename =
            mapOutputFile.getSpillIndexFileForWrite(
                numSpills, partitions * MAP_OUTPUT_INDEX_RECORD_LENGTH);
        spillRec.writeToFile(indexFilename, conf);
      } else {
        indexCacheList.add(spillRec);
        totalIndexCacheMemory += spillRec.size() * MAP_OUTPUT_INDEX_RECORD_LENGTH;
      }
      ++numSpills;
    } finally {
      if (out != null) out.close();
    }
  }

  protected int getInMemVBytesLength(int kvoff) {
    // the length of the serialized value is stored directly in the metadata entry
    final int vallen = kvmeta.get(kvoff + VALLEN);
    assert vallen >= 0;
    return vallen;
  }

  /**
   * Given an offset, populate vbytes with the associated set of serialized value bytes. Should
   * only be called during a spill.
   */
  int getVBytesForOffset(int kvoff, InMemValBytes vbytes) {
    int vallen = getInMemVBytesLength(kvoff);
    vbytes.reset(kvbuffer, kvmeta.get(kvoff + VALSTART), vallen);
    return vallen;
  }

  /** Inner class wrapping valuebytes, used for appendRaw. */
  static class InMemValBytes extends DataInputBuffer {
    private byte[] buffer;
    private int start;
    private int length;
    private final int bufvoid;

    public InMemValBytes(int bufvoid) {
      this.bufvoid = bufvoid;
    }

    public void reset(byte[] buffer, int start, int length) {
      this.buffer = buffer;
      this.start = start;
      this.length = length;

      if (start + length > bufvoid) {
        this.buffer = new byte[this.length];
        final int taillen = bufvoid - start;
        System.arraycopy(buffer, start, this.buffer, 0, taillen);
        System.arraycopy(buffer, 0, this.buffer, taillen, length - taillen);
        this.start = 0;
      }

      super.reset(this.buffer, this.start, this.length);
    }
  }

  InMemValBytes createInMemValBytes() {
    return new InMemValBytes(bufvoid);
  }

  protected class MRResultIterator implements TezRawKeyValueIterator {
    private final DataInputBuffer keybuf = new DataInputBuffer();
    private final InMemValBytes vbytes = createInMemValBytes();
    private final int end;
    private int current;

    public MRResultIterator(int start, int end) {
      this.end = end;
      current = start - 1;
    }

    public boolean next() throws IOException {
      return ++current < end;
    }

    public DataInputBuffer getKey() throws IOException {
      final int kvoff = offsetFor(current);
      keybuf.reset(
          kvbuffer,
          kvmeta.get(kvoff + KEYSTART),
          kvmeta.get(kvoff + VALSTART) - kvmeta.get(kvoff + KEYSTART));
      return keybuf;
    }

    public DataInputBuffer getValue() throws IOException {
      getVBytesForOffset(offsetFor(current), vbytes);
      return vbytes;
    }

    public Progress getProgress() {
      return null;
    }

    public void close() {}
  }

  private void mergeParts() throws IOException {
    // get the approximate size of the final output/index files
    long finalOutFileSize = 0;
    long finalIndexFileSize = 0;
    final Path[] filename = new Path[numSpills];
    final String taskIdentifier = outputContext.getUniqueIdentifier();

    for (int i = 0; i < numSpills; i++) {
      filename[i] = mapOutputFile.getSpillFile(i);
      finalOutFileSize += rfs.getFileStatus(filename[i]).getLen();
    }
    if (numSpills == 1) { // the spill is the final output
      sameVolRename(filename[0], mapOutputFile.getOutputFileForWriteInVolume(filename[0]));
      if (indexCacheList.size() == 0) {
        sameVolRename(
            mapOutputFile.getSpillIndexFile(0),
            mapOutputFile.getOutputIndexFileForWriteInVolume(filename[0]));
      } else {
        indexCacheList
            .get(0)
            .writeToFile(mapOutputFile.getOutputIndexFileForWriteInVolume(filename[0]), conf);
      }
      return;
    }

    // read in paged indices
    for (int i = indexCacheList.size(); i < numSpills; ++i) {
      Path indexFileName = mapOutputFile.getSpillIndexFile(i);
      indexCacheList.add(new TezSpillRecord(indexFileName, conf));
    }

    // make correction in the length to include the sequence file header
    // lengths for each partition
    finalOutFileSize += partitions * APPROX_HEADER_LENGTH;
    finalIndexFileSize = partitions * MAP_OUTPUT_INDEX_RECORD_LENGTH;
    Path finalOutputFile = mapOutputFile.getOutputFileForWrite(finalOutFileSize);
    Path finalIndexFile = mapOutputFile.getOutputIndexFileForWrite(finalIndexFileSize);

    // The output stream for the final single output file
    FSDataOutputStream finalOut = rfs.create(finalOutputFile, true, 4096);

    if (numSpills == 0) {
      // TODO Change event generation to say there is no data rather than generating a dummy file
      // create dummy files

      TezSpillRecord sr = new TezSpillRecord(partitions);
      try {
        for (int i = 0; i < partitions; i++) {
          long segmentStart = finalOut.getPos();
          Writer writer = new Writer(conf, finalOut, keyClass, valClass, codec, null, null);
          writer.close();

          TezIndexRecord rec =
              new TezIndexRecord(segmentStart, writer.getRawLength(), writer.getCompressedLength());
          // Covers the case of multiple spills.
          outputBytesWithOverheadCounter.increment(writer.getRawLength());
          sr.putIndex(rec, i);
        }
        sr.writeToFile(finalIndexFile, conf);
      } finally {
        finalOut.close();
      }
      return;
    } else {
      final TezSpillRecord spillRec = new TezSpillRecord(partitions);
      for (int parts = 0; parts < partitions; parts++) {
        // create the segments to be merged
        List<Segment> segmentList = new ArrayList<Segment>(numSpills);
        for (int i = 0; i < numSpills; i++) {
          TezIndexRecord indexRecord = indexCacheList.get(i).getIndex(parts);

          Segment s =
              new Segment(
                  conf,
                  rfs,
                  filename[i],
                  indexRecord.getStartOffset(),
                  indexRecord.getPartLength(),
                  codec,
                  ifileReadAhead,
                  ifileReadAheadLength,
                  ifileBufferSize,
                  true);
          segmentList.add(i, s);

          if (LOG.isDebugEnabled()) {
            LOG.debug(
                "TaskIdentifier="
                    + taskIdentifier
                    + " Partition="
                    + parts
                    + "Spill ="
                    + i
                    + "("
                    + indexRecord.getStartOffset()
                    + ","
                    + indexRecord.getRawLength()
                    + ", "
                    + indexRecord.getPartLength()
                    + ")");
          }
        }

        int mergeFactor =
            this.conf.getInt(
                TezRuntimeConfiguration.TEZ_RUNTIME_IO_SORT_FACTOR,
                TezRuntimeConfiguration.TEZ_RUNTIME_IO_SORT_FACTOR_DEFAULT);
        // sort the segments only if there are intermediate merges
        boolean sortSegments = segmentList.size() > mergeFactor;
        // merge
        TezRawKeyValueIterator kvIter =
            TezMerger.merge(
                conf,
                rfs,
                keyClass,
                valClass,
                codec,
                segmentList,
                mergeFactor,
                new Path(taskIdentifier),
                (RawComparator) ConfigUtils.getIntermediateOutputKeyComparator(conf),
                nullProgressable,
                sortSegments,
                true,
                null,
                spilledRecordsCounter,
                additionalSpillBytesRead,
                null); // Not using any Progress in TezMerger. Should just work.

        // write merged output to disk
        long segmentStart = finalOut.getPos();
        Writer writer =
            new Writer(conf, finalOut, keyClass, valClass, codec, spilledRecordsCounter, null);
        if (combiner == null || numSpills < minSpillsForCombine) {
          TezMerger.writeFile(
              kvIter,
              writer,
              nullProgressable,
              TezRuntimeConfiguration.TEZ_RUNTIME_RECORDS_BEFORE_PROGRESS_DEFAULT);
        } else {
          runCombineProcessor(kvIter, writer);
        }
        writer.close();

        // record offsets
        final TezIndexRecord rec =
            new TezIndexRecord(segmentStart, writer.getRawLength(), writer.getCompressedLength());
        spillRec.putIndex(rec, parts);
      }
      spillRec.writeToFile(finalIndexFile, conf);
      finalOut.close();
      for (int i = 0; i < numSpills; i++) {
        rfs.delete(filename[i], true);
      }
    }
  }
}
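
The example above reduces to a single ReentrantLock guarding two Conditions: the collecting thread signals spillReady when a spill should start, and the spill thread signals spillDone when the spill has finished (or, at startup, when the thread is running). The following is a minimal, self-contained sketch of that handshake; the class and method names (Spiller, requestSpillAndWait, doSpill) are illustrative and are not part of the Tez code.

import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;

// Minimal sketch of the writer/spill-thread handshake shown above (illustrative names).
class Spiller {
  private final ReentrantLock lock = new ReentrantLock();
  private final Condition spillReady = lock.newCondition(); // writer -> spill thread
  private final Condition spillDone = lock.newCondition(); // spill thread -> writer
  private boolean spillInProgress = false;

  private final Thread spillThread =
      new Thread(
          () -> {
            lock.lock();
            try {
              while (!Thread.currentThread().isInterrupted()) {
                while (!spillInProgress) {
                  spillReady.await(); // sleep until a spill is requested
                }
                lock.unlock();
                try {
                  doSpill(); // expensive work happens outside the lock
                } finally {
                  lock.lock();
                  spillInProgress = false;
                  spillDone.signalAll(); // wake any writer blocked on the spill
                }
              }
            } catch (InterruptedException e) {
              Thread.currentThread().interrupt();
            } finally {
              lock.unlock();
            }
          },
          "spill-thread");

  void start() {
    spillThread.setDaemon(true);
    spillThread.start();
  }

  // Called by the writer when the in-memory buffer crosses its soft limit.
  void requestSpillAndWait() throws InterruptedException {
    lock.lock();
    try {
      spillInProgress = true;
      spillReady.signal(); // hand the buffer over to the spill thread
      while (spillInProgress) {
        spillDone.await(); // block until the spill has completed
      }
    } finally {
      lock.unlock();
    }
  }

  private void doSpill() {
    // stand-in for sortAndSpill(): sort the collected records and write them out
  }
}

requestSpillAndWait mirrors the blocking path in Buffer.write above, while the lambda plays the role of SpillThread.run; the boolean guarded by the lock is what makes a signal sent before the waiter arrives harmless.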
Ejemplo n.º 27
0
// todo make thread safe and concurrent
public class DbImpl implements DB {
  private final Options options;
  private final File databaseDir;
  private final TableCache tableCache;
  private final DbLock dbLock;
  private final VersionSet versions;

  private final AtomicBoolean shuttingDown = new AtomicBoolean();
  private final ReentrantLock mutex = new ReentrantLock();
  private final Condition backgroundCondition = mutex.newCondition();

  private final List<Long> pendingOutputs = newArrayList(); // todo

  private LogWriter log;

  private MemTable memTable;
  private MemTable immutableMemTable;

  private final InternalKeyComparator internalKeyComparator;

  private volatile Throwable backgroundException;
  private ExecutorService compactionExecutor;
  private Future<?> backgroundCompaction;

  private ManualCompaction manualCompaction;

  public DbImpl(Options options, File databaseDir) throws IOException {
    Preconditions.checkNotNull(options, "options is null");
    Preconditions.checkNotNull(databaseDir, "databaseDir is null");
    this.options = options;

    if (this.options.compressionType() == CompressionType.ZLIB && !Zlib.available()) {
      // Zlib is unavailable; fall back to no compression.
      this.options.compressionType(CompressionType.NONE);
    }
    if (this.options.compressionType() == CompressionType.SNAPPY && !Snappy.available()) {
      // Disable snappy if it's not available.
      this.options.compressionType(CompressionType.NONE);
    }

    this.databaseDir = databaseDir;

    // use custom comparator if set
    DBComparator comparator = options.comparator();
    UserComparator userComparator;
    if (comparator != null) {
      userComparator = new CustomUserComparator(comparator);
    } else {
      userComparator = new BytewiseComparator();
    }
    internalKeyComparator = new InternalKeyComparator(userComparator);
    memTable = new MemTable(internalKeyComparator);
    immutableMemTable = null;

    ThreadFactory compactionThreadFactory =
        new ThreadFactoryBuilder()
            .setNameFormat("leveldb-compaction-%s")
            .setUncaughtExceptionHandler(
                new UncaughtExceptionHandler() {
                  @Override
                  public void uncaughtException(Thread t, Throwable e) {
                    // todo need a real UncaughtExceptionHandler
                    System.out.printf("%s%n", t);
                    e.printStackTrace();
                  }
                })
            .build();
    compactionExecutor = Executors.newSingleThreadExecutor(compactionThreadFactory);

    // Reserve ten files or so for other uses and give the rest to TableCache.
    int tableCacheSize = options.maxOpenFiles() - 10;
    tableCache =
        new TableCache(
            databaseDir,
            tableCacheSize,
            new InternalUserComparator(internalKeyComparator),
            options.verifyChecksums());

    // create the version set

    // create the database dir if it does not already exist
    databaseDir.mkdirs();
    Preconditions.checkArgument(
        databaseDir.exists(),
        "Database directory '%s' does not exist and could not be created",
        databaseDir);
    Preconditions.checkArgument(
        databaseDir.isDirectory(), "Database directory '%s' is not a directory", databaseDir);

    mutex.lock();
    try {
      // lock the database dir
      dbLock = new DbLock(new File(databaseDir, Filename.lockFileName()));

      // verify the "current" file
      File currentFile = new File(databaseDir, Filename.currentFileName());
      if (!currentFile.canRead()) {
        Preconditions.checkArgument(
            options.createIfMissing(),
            "Database '%s' does not exist and the create if missing option is disabled",
            databaseDir);
      } else {
        Preconditions.checkArgument(
            !options.errorIfExists(),
            "Database '%s' exists and the error if exists option is enabled",
            databaseDir);
      }

      versions = new VersionSet(databaseDir, tableCache, internalKeyComparator);

      // load (and recover) the current version
      versions.recover();

      // Recover from all newer log files than the ones named in the
      // descriptor (new log files may have been added by the previous
      // incarnation without registering them in the descriptor).
      //
      // Note that PrevLogNumber() is no longer used, but we pay
      // attention to it in case we are recovering a database
      // produced by an older version of leveldb.
      long minLogNumber = versions.getLogNumber();
      long previousLogNumber = versions.getPrevLogNumber();
      List<File> filenames = Filename.listFiles(databaseDir);

      List<Long> logs = Lists.newArrayList();
      for (File filename : filenames) {
        FileInfo fileInfo = Filename.parseFileName(filename);

        if (fileInfo != null
            && fileInfo.getFileType() == FileType.LOG
            && ((fileInfo.getFileNumber() >= minLogNumber)
                || (fileInfo.getFileNumber() == previousLogNumber))) {
          logs.add(fileInfo.getFileNumber());
        }
      }

      // Recover in the order in which the logs were generated
      VersionEdit edit = new VersionEdit();
      Collections.sort(logs);
      for (Long fileNumber : logs) {
        long maxSequence = recoverLogFile(fileNumber, edit);
        if (versions.getLastSequence() < maxSequence) {
          versions.setLastSequence(maxSequence);
        }
      }

      // open transaction log
      long logFileNumber = versions.getNextFileNumber();
      this.log =
          Logs.createLogWriter(
              new File(databaseDir, Filename.logFileName(logFileNumber)), logFileNumber);
      edit.setLogNumber(log.getFileNumber());

      // apply recovered edits
      versions.logAndApply(edit);

      // cleanup unused files
      deleteObsoleteFiles();

      // schedule compactions
      maybeScheduleCompaction();
    } finally {
      mutex.unlock();
    }
  }

  public void close() {
    if (shuttingDown.getAndSet(true)) {
      return;
    }

    mutex.lock();
    try {
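      // backgroundCall() clears backgroundCompaction and signals backgroundCondition,
      // so this wait returns once any in-flight compaction has finished.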
      while (backgroundCompaction != null) {
        backgroundCondition.awaitUninterruptibly();
      }
    } finally {
      mutex.unlock();
    }

    compactionExecutor.shutdown();
    try {
      compactionExecutor.awaitTermination(1, TimeUnit.DAYS);
    } catch (InterruptedException e) {
      Thread.currentThread().interrupt();
    }
    try {
      versions.destroy();
    } catch (IOException ignored) {
    }
    try {
      log.close();
    } catch (IOException ignored) {
    }
    tableCache.close();
    dbLock.release();
  }

  @Override
  public String getProperty(String name) {
    checkBackgroundException();
    return null;
  }

  private void deleteObsoleteFiles() {
    Preconditions.checkState(mutex.isHeldByCurrentThread());

    // Make a set of all of the live files
    List<Long> live = newArrayList(this.pendingOutputs);
    for (FileMetaData fileMetaData : versions.getLiveFiles()) {
      live.add(fileMetaData.getNumber());
    }

    for (File file : Filename.listFiles(databaseDir)) {
      FileInfo fileInfo = Filename.parseFileName(file);
      if (fileInfo == null) continue;
      long number = fileInfo.getFileNumber();
      boolean keep = true;
      switch (fileInfo.getFileType()) {
        case LOG:
          keep = ((number >= versions.getLogNumber()) || (number == versions.getPrevLogNumber()));
          break;
        case DESCRIPTOR:
          // Keep my manifest file, and any newer incarnations'
          // (in case there is a race that allows other incarnations)
          keep = (number >= versions.getManifestFileNumber());
          break;
        case TABLE:
          keep = live.contains(number);
          break;
        case TEMP:
          // Any temp files that are currently being written to must
          // be recorded in pending_outputs_, which is inserted into "live"
          keep = live.contains(number);
          break;
        case CURRENT:
        case DB_LOCK:
        case INFO_LOG:
          keep = true;
          break;
      }

      if (!keep) {
        if (fileInfo.getFileType() == FileType.TABLE) {
          tableCache.evict(number);
        }
        // todo info logging system needed
        //                Log(options_.info_log, "Delete type=%d #%lld\n",
        //                int(type),
        //                        static_cast < unsigned long long>(number));
        file.delete();
      }
    }
  }

  public void flushMemTable() {
    mutex.lock();
    try {
      // force compaction
      makeRoomForWrite(true);

      // todo bg_error code
      while (immutableMemTable != null) {
        backgroundCondition.awaitUninterruptibly();
      }

    } finally {
      mutex.unlock();
    }
  }

  public void compactRange(int level, Slice start, Slice end) {
    Preconditions.checkArgument(level >= 0, "level is negative");
    Preconditions.checkArgument(
        level + 1 < NUM_LEVELS, "level is greater than or equal to %s", NUM_LEVELS);
    Preconditions.checkNotNull(start, "start is null");
    Preconditions.checkNotNull(end, "end is null");

    mutex.lock();
    try {
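      // Only one manual compaction may be pending at a time: wait for the slot to free
      // up, publish this request, then wait until the background thread has consumed it
      // (backgroundCompaction() nulls manualCompaction when done).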
      while (this.manualCompaction != null) {
        backgroundCondition.awaitUninterruptibly();
      }
      ManualCompaction manualCompaction = new ManualCompaction(level, start, end);
      this.manualCompaction = manualCompaction;

      maybeScheduleCompaction();

      while (this.manualCompaction == manualCompaction) {
        backgroundCondition.awaitUninterruptibly();
      }
    } finally {
      mutex.unlock();
    }
  }

  private void maybeScheduleCompaction() {
    Preconditions.checkState(mutex.isHeldByCurrentThread());

    if (backgroundCompaction != null) {
      // Already scheduled
    } else if (shuttingDown.get()) {
      // DB is being shutdown; no more background compactions
    } else if (immutableMemTable == null
        && manualCompaction == null
        && !versions.needsCompaction()) {
      // No work to be done
    } else {
      backgroundCompaction =
          compactionExecutor.submit(
              new Callable<Void>() {
                @Override
                public Void call() throws Exception {
                  try {
                    backgroundCall();
                  } catch (DatabaseShutdownException ignored) {
                  } catch (Throwable e) {
                    backgroundException = e;
                  }
                  return null;
                }
              });
    }
  }

  public void checkBackgroundException() {
    Throwable e = backgroundException;
    if (e != null) {
      throw new BackgroundProcessingException(e);
    }
  }

  private void backgroundCall() throws IOException {
    mutex.lock();
    try {
      if (backgroundCompaction == null) {
        return;
      }

      try {
        if (!shuttingDown.get()) {
          backgroundCompaction();
        }
      } finally {
        backgroundCompaction = null;
      }
    } finally {
      try {
        // Previous compaction may have produced too many files in a level,
        // so reschedule another compaction if needed.
        maybeScheduleCompaction();
      } finally {
        try {
          backgroundCondition.signalAll();
        } finally {
          mutex.unlock();
        }
      }
    }
  }

  private void backgroundCompaction() throws IOException {
    Preconditions.checkState(mutex.isHeldByCurrentThread());

    compactMemTableInternal();

    Compaction compaction;
    if (manualCompaction != null) {
      compaction =
          versions.compactRange(
              manualCompaction.level,
              new InternalKey(manualCompaction.begin, MAX_SEQUENCE_NUMBER, ValueType.VALUE),
              new InternalKey(manualCompaction.end, 0, ValueType.DELETION));
    } else {
      compaction = versions.pickCompaction();
    }

    if (compaction == null) {
      // no compaction
    } else if (manualCompaction == null && compaction.isTrivialMove()) {
      // Move file to next level
      Preconditions.checkState(compaction.getLevelInputs().size() == 1);
      FileMetaData fileMetaData = compaction.getLevelInputs().get(0);
      compaction.getEdit().deleteFile(compaction.getLevel(), fileMetaData.getNumber());
      compaction.getEdit().addFile(compaction.getLevel() + 1, fileMetaData);
      versions.logAndApply(compaction.getEdit());
      // log
    } else {
      CompactionState compactionState = new CompactionState(compaction);
      doCompactionWork(compactionState);
      cleanupCompaction(compactionState);
    }

    // manual compaction complete
    if (manualCompaction != null) {
      manualCompaction = null;
    }
  }

  private void cleanupCompaction(CompactionState compactionState) {
    Preconditions.checkState(mutex.isHeldByCurrentThread());

    if (compactionState.builder != null) {
      compactionState.builder.abandon();
    } else {
      Preconditions.checkArgument(compactionState.outfile == null);
    }

    for (FileMetaData output : compactionState.outputs) {
      pendingOutputs.remove(output.getNumber());
    }
  }

  private long recoverLogFile(long fileNumber, VersionEdit edit) throws IOException {
    Preconditions.checkState(mutex.isHeldByCurrentThread());
    File file = new File(databaseDir, Filename.logFileName(fileNumber));
    FileChannel channel = new FileInputStream(file).getChannel();
    try {
      LogMonitor logMonitor = LogMonitors.logMonitor();
      LogReader logReader = new LogReader(channel, logMonitor, true, 0);

      // Log(options_.info_log, "Recovering log #%llu", (unsigned long long) log_number);

      // Read all the records and add to a memtable
      long maxSequence = 0;
      MemTable memTable = null;
      for (Slice record = logReader.readRecord(); record != null; record = logReader.readRecord()) {
        SliceInput sliceInput = record.input();
        // read header
        if (sliceInput.available() < 12) {
          logMonitor.corruption(sliceInput.available(), "log record too small");
          continue;
        }
        long sequenceBegin = sliceInput.readLong();
        int updateSize = sliceInput.readInt();

        // read entries
        WriteBatchImpl writeBatch = readWriteBatch(sliceInput, updateSize);

        // apply entries to memTable
        if (memTable == null) {
          memTable = new MemTable(internalKeyComparator);
        }
        writeBatch.forEach(new InsertIntoHandler(memTable, sequenceBegin));

        // update the maxSequence
        long lastSequence = sequenceBegin + updateSize - 1;
        if (lastSequence > maxSequence) {
          maxSequence = lastSequence;
        }

        // flush mem table if necessary
        if (memTable.approximateMemoryUsage() > options.writeBufferSize()) {
          writeLevel0Table(memTable, edit, null);
          memTable = null;
        }
      }

      // flush mem table
      if (memTable != null && !memTable.isEmpty()) {
        writeLevel0Table(memTable, edit, null);
      }

      return maxSequence;
    } finally {
      channel.close();
    }
  }

  @Override
  public byte[] get(byte[] key) throws DBException {
    return get(key, new ReadOptions());
  }

  @Override
  public byte[] get(byte[] key, ReadOptions options) throws DBException {
    checkBackgroundException();
    LookupKey lookupKey;
    mutex.lock();
    try {
      SnapshotImpl snapshot = getSnapshot(options);
      lookupKey = new LookupKey(Slices.wrappedBuffer(key), snapshot.getLastSequence());

      // First look in the memtable, then in the immutable memtable (if any).
      LookupResult lookupResult = memTable.get(lookupKey);
      if (lookupResult != null) {
        Slice value = lookupResult.getValue();
        if (value == null) {
          return null;
        }
        return value.getBytes();
      }
      if (immutableMemTable != null) {
        lookupResult = immutableMemTable.get(lookupKey);
        if (lookupResult != null) {
          Slice value = lookupResult.getValue();
          if (value == null) {
            return null;
          }
          return value.getBytes();
        }
      }
    } finally {
      mutex.unlock();
    }

    // Not in memTables; try live files in level order
    LookupResult lookupResult = versions.get(lookupKey);

    // schedule compaction if necessary
    mutex.lock();
    try {
      if (versions.needsCompaction()) {
        maybeScheduleCompaction();
      }
    } finally {
      mutex.unlock();
    }

    if (lookupResult != null) {
      Slice value = lookupResult.getValue();
      if (value != null) {
        return value.getBytes();
      }
    }
    return null;
  }

  @Override
  public void put(byte[] key, byte[] value) throws DBException {
    put(key, value, new WriteOptions());
  }

  @Override
  public Snapshot put(byte[] key, byte[] value, WriteOptions options) throws DBException {
    return writeInternal(new WriteBatchImpl().put(key, value), options);
  }

  @Override
  public void delete(byte[] key) throws DBException {
    writeInternal(new WriteBatchImpl().delete(key), new WriteOptions());
  }

  @Override
  public Snapshot delete(byte[] key, WriteOptions options) throws DBException {
    return writeInternal(new WriteBatchImpl().delete(key), options);
  }

  @Override
  public void write(WriteBatch updates) throws DBException {
    writeInternal((WriteBatchImpl) updates, new WriteOptions());
  }

  @Override
  public Snapshot write(WriteBatch updates, WriteOptions options) throws DBException {
    return writeInternal((WriteBatchImpl) updates, options);
  }

  public Snapshot writeInternal(WriteBatchImpl updates, WriteOptions options) throws DBException {
    checkBackgroundException();
    mutex.lock();
    try {
      long sequenceEnd;
      if (updates.size() != 0) {
        makeRoomForWrite(false);

        // Get sequence numbers for this change set
        final long sequenceBegin = versions.getLastSequence() + 1;
        sequenceEnd = sequenceBegin + updates.size() - 1;

        // Reserve this sequence in the version set
        versions.setLastSequence(sequenceEnd);

        // Log write
        Slice record = writeWriteBatch(updates, sequenceBegin);
        try {
          log.addRecord(record, options.sync());
        } catch (IOException e) {
          throw Throwables.propagate(e);
        }

        // Update memtable
        updates.forEach(new InsertIntoHandler(memTable, sequenceBegin));
      } else {
        sequenceEnd = versions.getLastSequence();
      }

      if (options.snapshot()) {
        return new SnapshotImpl(versions.getCurrent(), sequenceEnd);
      } else {
        return null;
      }
    } finally {
      mutex.unlock();
    }
  }

  @Override
  public WriteBatch createWriteBatch() {
    checkBackgroundException();
    return new WriteBatchImpl();
  }

  @Override
  public SeekingIteratorAdapter iterator() {
    return iterator(new ReadOptions());
  }

  public SeekingIteratorAdapter iterator(ReadOptions options) {
    checkBackgroundException();
    mutex.lock();
    try {
      DbIterator rawIterator = internalIterator();

      // filter any entries not visible in our snapshot
      SnapshotImpl snapshot = getSnapshot(options);
      SnapshotSeekingIterator snapshotIterator =
          new SnapshotSeekingIterator(
              rawIterator, snapshot, internalKeyComparator.getUserComparator());
      return new SeekingIteratorAdapter(snapshotIterator);
    } finally {
      mutex.unlock();
    }
  }

  SeekingIterable<InternalKey, Slice> internalIterable() {
    return new SeekingIterable<InternalKey, Slice>() {
      @Override
      public DbIterator iterator() {
        return internalIterator();
      }
    };
  }

  DbIterator internalIterator() {
    mutex.lock();
    try {
      // merge together the memTable, immutableMemTable, and tables in version set
      MemTableIterator iterator = null;
      if (immutableMemTable != null) {
        iterator = immutableMemTable.iterator();
      }
      Version current = versions.getCurrent();
      return new DbIterator(
          memTable.iterator(),
          iterator,
          current.getLevel0Files(),
          current.getLevelIterators(),
          internalKeyComparator);
    } finally {
      mutex.unlock();
    }
  }

  public Snapshot getSnapshot() {
    checkBackgroundException();
    mutex.lock();
    try {
      return new SnapshotImpl(versions.getCurrent(), versions.getLastSequence());
    } finally {
      mutex.unlock();
    }
  }

  private SnapshotImpl getSnapshot(ReadOptions options) {
    SnapshotImpl snapshot;
    if (options.snapshot() != null) {
      snapshot = (SnapshotImpl) options.snapshot();
    } else {
      snapshot = new SnapshotImpl(versions.getCurrent(), versions.getLastSequence());
      snapshot.close(); // To avoid holding the snapshot active.
    }
    return snapshot;
  }

  private void makeRoomForWrite(boolean force) {
    Preconditions.checkState(mutex.isHeldByCurrentThread());

    boolean allowDelay = !force;

    while (true) {
      // todo background processing system need work
      //            if (!bg_error_.ok()) {
      //              // Yield previous error
      //              s = bg_error_;
      //              break;
      //            } else
      if (allowDelay && versions.numberOfFilesInLevel(0) > L0_SLOWDOWN_WRITES_TRIGGER) {
        // We are getting close to hitting a hard limit on the number of
        // L0 files.  Rather than delaying a single write by several
        // seconds when we hit the hard limit, start delaying each
        // individual write by 1ms to reduce latency variance.  Also,
        // this delay hands over some CPU to the compaction thread in
        // case it is sharing the same core as the writer.
        try {
          mutex.unlock();
          Thread.sleep(1);
        } catch (InterruptedException e) {
          Thread.currentThread().interrupt();
          throw new RuntimeException(e);
        } finally {
          mutex.lock();
        }

        // Do not delay a single write more than once
        allowDelay = false;
      } else if (!force && memTable.approximateMemoryUsage() <= options.writeBufferSize()) {
        // There is room in current memtable
        break;
      } else if (immutableMemTable != null) {
        // We have filled up the current memtable, but the previous
        // one is still being compacted, so we wait.
        backgroundCondition.awaitUninterruptibly();
      } else if (versions.numberOfFilesInLevel(0) >= L0_STOP_WRITES_TRIGGER) {
        // There are too many level-0 files.
        //                Log(options_.info_log, "waiting...\n");
        backgroundCondition.awaitUninterruptibly();
      } else {
        // Attempt to switch to a new memtable and trigger compaction of old
        Preconditions.checkState(versions.getPrevLogNumber() == 0);

        // close the existing log
        try {
          log.close();
        } catch (IOException e) {
          throw new RuntimeException("Unable to close log file " + log.getFile(), e);
        }

        // open a new log
        long logNumber = versions.getNextFileNumber();
        try {
          this.log =
              Logs.createLogWriter(
                  new File(databaseDir, Filename.logFileName(logNumber)), logNumber);
        } catch (IOException e) {
          throw new RuntimeException(
              "Unable to open new log file "
                  + new File(databaseDir, Filename.logFileName(logNumber)).getAbsoluteFile(),
              e);
        }

        // create a new mem table
        immutableMemTable = memTable;
        memTable = new MemTable(internalKeyComparator);

        // Do not force another compaction since there is space available
        force = false;

        maybeScheduleCompaction();
      }
    }
  }

  public void compactMemTable() throws IOException {
    mutex.lock();
    try {
      compactMemTableInternal();
    } finally {
      mutex.unlock();
    }
  }

  private void compactMemTableInternal() throws IOException {
    Preconditions.checkState(mutex.isHeldByCurrentThread());
    if (immutableMemTable == null) {
      return;
    }

    try {
      // Save the contents of the memtable as a new Table
      VersionEdit edit = new VersionEdit();
      Version base = versions.getCurrent();
      writeLevel0Table(immutableMemTable, edit, base);

      if (shuttingDown.get()) {
        throw new DatabaseShutdownException("Database shutdown during memtable compaction");
      }

      // Replace immutable memtable with the generated Table
      edit.setPreviousLogNumber(0);
      edit.setLogNumber(log.getFileNumber()); // Earlier logs no longer needed
      versions.logAndApply(edit);

      immutableMemTable = null;

      deleteObsoleteFiles();
    } finally {
      backgroundCondition.signalAll();
    }
  }

  private void writeLevel0Table(MemTable mem, VersionEdit edit, Version base) throws IOException {
    Preconditions.checkState(mutex.isHeldByCurrentThread());

    // skip empty mem table
    if (mem.isEmpty()) {
      return;
    }

    // write the memtable to a new sstable
    long fileNumber = versions.getNextFileNumber();
    pendingOutputs.add(fileNumber);
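    // Release the mutex while buildTable does the slow file I/O; pendingOutputs keeps
    // deleteObsoleteFiles from removing the in-progress table file in the meantime.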
    mutex.unlock();
    FileMetaData meta;
    try {
      meta = buildTable(mem, fileNumber);
    } finally {
      mutex.lock();
    }
    pendingOutputs.remove(fileNumber);

    // Note that if file size is zero, the file has been deleted and
    // should not be added to the manifest.
    int level = 0;
    if (meta != null && meta.getFileSize() > 0) {
      Slice minUserKey = meta.getSmallest().getUserKey();
      Slice maxUserKey = meta.getLargest().getUserKey();
      if (base != null) {
        level = base.pickLevelForMemTableOutput(minUserKey, maxUserKey);
      }
      edit.addFile(level, meta);
    }
  }

  private FileMetaData buildTable(SeekingIterable<InternalKey, Slice> data, long fileNumber)
      throws IOException {
    File file = new File(databaseDir, Filename.tableFileName(fileNumber));
    try {
      InternalKey smallest = null;
      InternalKey largest = null;
      FileChannel channel = new FileOutputStream(file).getChannel();
      try {
        TableBuilder tableBuilder =
            new TableBuilder(options, channel, new InternalUserComparator(internalKeyComparator));

        for (Entry<InternalKey, Slice> entry : data) {
          // update keys
          InternalKey key = entry.getKey();
          if (smallest == null) {
            smallest = key;
          }
          largest = key;

          tableBuilder.add(key.encode(), entry.getValue());
        }

        tableBuilder.finish();
      } finally {
        try {
          channel.force(true);
        } finally {
          channel.close();
        }
      }

      if (smallest == null) {
        return null;
      }
      FileMetaData fileMetaData = new FileMetaData(fileNumber, file.length(), smallest, largest);

      // verify table can be opened
      tableCache.newIterator(fileMetaData);

      pendingOutputs.remove(fileNumber);

      return fileMetaData;

    } catch (IOException e) {
      file.delete();
      throw e;
    }
  }

  private void doCompactionWork(CompactionState compactionState) throws IOException {
    Preconditions.checkState(mutex.isHeldByCurrentThread());
    Preconditions.checkArgument(
        versions.numberOfBytesInLevel(compactionState.getCompaction().getLevel()) > 0);
    Preconditions.checkArgument(compactionState.builder == null);
    Preconditions.checkArgument(compactionState.outfile == null);

    // todo track snapshots
    compactionState.smallestSnapshot = versions.getLastSequence();

    // Release mutex while we're actually doing the compaction work
    mutex.unlock();
    try {
      MergingIterator iterator = versions.makeInputIterator(compactionState.compaction);

      Slice currentUserKey = null;
      boolean hasCurrentUserKey = false;

      long lastSequenceForKey = MAX_SEQUENCE_NUMBER;
      while (iterator.hasNext() && !shuttingDown.get()) {
        // always give priority to compacting the current mem table
        mutex.lock();
        try {
          compactMemTableInternal();
        } finally {
          mutex.unlock();
        }

        InternalKey key = iterator.peek().getKey();
        if (compactionState.compaction.shouldStopBefore(key) && compactionState.builder != null) {
          finishCompactionOutputFile(compactionState);
        }

        // Handle key/value, add to state, etc.
        boolean drop = false;
        // todo if key doesn't parse (it is corrupted),
        if (false /*!ParseInternalKey(key, &ikey)*/) {
          // do not hide error keys
          currentUserKey = null;
          hasCurrentUserKey = false;
          lastSequenceForKey = MAX_SEQUENCE_NUMBER;
        } else {
          if (!hasCurrentUserKey
              || internalKeyComparator.getUserComparator().compare(key.getUserKey(), currentUserKey)
                  != 0) {
            // First occurrence of this user key
            currentUserKey = key.getUserKey();
            hasCurrentUserKey = true;
            lastSequenceForKey = MAX_SEQUENCE_NUMBER;
          }

          if (lastSequenceForKey <= compactionState.smallestSnapshot) {
            // Hidden by a newer entry for the same user key
            drop = true; // (A)
          } else if (key.getValueType() == ValueType.DELETION
              && key.getSequenceNumber() <= compactionState.smallestSnapshot
              && compactionState.compaction.isBaseLevelForKey(key.getUserKey())) {

            // For this user key:
            // (1) there is no data in higher levels
            // (2) data in lower levels will have larger sequence numbers
            // (3) data in layers that are being compacted here and have
            //     smaller sequence numbers will be dropped in the next
            //     few iterations of this loop (by rule (A) above).
            // Therefore this deletion marker is obsolete and can be dropped.
            drop = true;
          }

          lastSequenceForKey = key.getSequenceNumber();
        }

        if (!drop) {
          // Open output file if necessary
          if (compactionState.builder == null) {
            openCompactionOutputFile(compactionState);
          }
          if (compactionState.builder.getEntryCount() == 0) {
            compactionState.currentSmallest = key;
          }
          compactionState.currentLargest = key;
          compactionState.builder.add(key.encode(), iterator.peek().getValue());

          // Close output file if it is big enough
          if (compactionState.builder.getFileSize()
              >= compactionState.compaction.getMaxOutputFileSize()) {
            finishCompactionOutputFile(compactionState);
          }
        }
        iterator.next();
      }

      if (shuttingDown.get()) {
        throw new DatabaseShutdownException("DB shutdown during compaction");
      }
      if (compactionState.builder != null) {
        finishCompactionOutputFile(compactionState);
      }
    } finally {
      mutex.lock();
    }

    // todo port CompactionStats code

    installCompactionResults(compactionState);
  }

  private void openCompactionOutputFile(CompactionState compactionState)
      throws FileNotFoundException {
    Preconditions.checkNotNull(compactionState, "compactionState is null");
    Preconditions.checkArgument(
        compactionState.builder == null, "compactionState builder is not null");

    mutex.lock();
    try {
      long fileNumber = versions.getNextFileNumber();
      pendingOutputs.add(fileNumber);
      compactionState.currentFileNumber = fileNumber;
      compactionState.currentFileSize = 0;
      compactionState.currentSmallest = null;
      compactionState.currentLargest = null;

      File file = new File(databaseDir, Filename.tableFileName(fileNumber));
      compactionState.outfile = new FileOutputStream(file).getChannel();
      compactionState.builder =
          new TableBuilder(
              options, compactionState.outfile, new InternalUserComparator(internalKeyComparator));
    } finally {
      mutex.unlock();
    }
  }

  private void finishCompactionOutputFile(CompactionState compactionState) throws IOException {
    Preconditions.checkNotNull(compactionState, "compactionState is null");
    Preconditions.checkArgument(compactionState.outfile != null);
    Preconditions.checkArgument(compactionState.builder != null);

    long outputNumber = compactionState.currentFileNumber;
    Preconditions.checkArgument(outputNumber != 0);

    long currentEntries = compactionState.builder.getEntryCount();
    compactionState.builder.finish();

    long currentBytes = compactionState.builder.getFileSize();
    compactionState.currentFileSize = currentBytes;
    compactionState.totalBytes += currentBytes;

    FileMetaData currentFileMetaData =
        new FileMetaData(
            compactionState.currentFileNumber,
            compactionState.currentFileSize,
            compactionState.currentSmallest,
            compactionState.currentLargest);
    compactionState.outputs.add(currentFileMetaData);

    compactionState.builder = null;

    compactionState.outfile.force(true);
    compactionState.outfile.close();
    compactionState.outfile = null;

    if (currentEntries > 0) {
      // Verify that the table is usable
      tableCache.newIterator(outputNumber);
    }
  }

  private void installCompactionResults(CompactionState compact) throws IOException {
    Preconditions.checkState(mutex.isHeldByCurrentThread());

    // Add compaction outputs
    compact.compaction.addInputDeletions(compact.compaction.getEdit());
    int level = compact.compaction.getLevel();
    for (FileMetaData output : compact.outputs) {
      compact.compaction.getEdit().addFile(level + 1, output);
      pendingOutputs.remove(output.getNumber());
    }

    try {
      versions.logAndApply(compact.compaction.getEdit());
      deleteObsoleteFiles();
    } catch (IOException e) {
      // Compaction failed for some reason.  Simply discard the work and try again later.

      // Discard any files we may have created during this failed compaction
      for (FileMetaData output : compact.outputs) {
        File file = new File(databaseDir, Filename.tableFileName(output.getNumber()));
        file.delete();
      }
      compact.outputs.clear();
    }
  }

  int numberOfFilesInLevel(int level) {
    return versions.getCurrent().numberOfFilesInLevel(level);
  }

  @Override
  public long[] getApproximateSizes(Range... ranges) {
    Preconditions.checkNotNull(ranges, "ranges is null");
    long[] sizes = new long[ranges.length];
    for (int i = 0; i < ranges.length; i++) {
      Range range = ranges[i];
      sizes[i] = getApproximateSizes(range);
    }
    return sizes;
  }

  public long getApproximateSizes(Range range) {
    Version v = versions.getCurrent();

    InternalKey startKey =
        new InternalKey(
            Slices.wrappedBuffer(range.start()),
            SequenceNumber.MAX_SEQUENCE_NUMBER,
            ValueType.VALUE);
    InternalKey limitKey =
        new InternalKey(
            Slices.wrappedBuffer(range.limit()),
            SequenceNumber.MAX_SEQUENCE_NUMBER,
            ValueType.VALUE);
    long startOffset = v.getApproximateOffsetOf(startKey);
    long limitOffset = v.getApproximateOffsetOf(limitKey);

    return (limitOffset >= startOffset ? limitOffset - startOffset : 0);
  }

  public long getMaxNextLevelOverlappingBytes() {
    return versions.getMaxNextLevelOverlappingBytes();
  }

  private static class CompactionState {
    private final Compaction compaction;

    private final List<FileMetaData> outputs = newArrayList();

    private long smallestSnapshot;

    // State kept for output being generated
    private FileChannel outfile;
    private TableBuilder builder;

    // Current file being generated
    private long currentFileNumber;
    private long currentFileSize;
    private InternalKey currentSmallest;
    private InternalKey currentLargest;

    private long totalBytes;

    private CompactionState(Compaction compaction) {
      this.compaction = compaction;
    }

    public Compaction getCompaction() {
      return compaction;
    }
  }

  private static class ManualCompaction {
    private final int level;
    private final Slice begin;
    private final Slice end;

    private ManualCompaction(int level, Slice begin, Slice end) {
      this.level = level;
      this.begin = begin;
      this.end = end;
    }
  }

  private WriteBatchImpl readWriteBatch(SliceInput record, int updateSize) throws IOException {
    WriteBatchImpl writeBatch = new WriteBatchImpl();
    int entries = 0;
    while (record.isReadable()) {
      entries++;
      ValueType valueType = ValueType.getValueTypeByPersistentId(record.readByte());
      if (valueType == VALUE) {
        Slice key = readLengthPrefixedBytes(record);
        Slice value = readLengthPrefixedBytes(record);
        writeBatch.put(key, value);
      } else if (valueType == DELETION) {
        Slice key = readLengthPrefixedBytes(record);
        writeBatch.delete(key);
      } else {
        throw new IllegalStateException("Unexpected value type " + valueType);
      }
    }

    if (entries != updateSize) {
      throw new IOException(
          String.format(
              "Expected %d entries in log record but found %s entries", updateSize, entries));
    }

    return writeBatch;
  }

  private Slice writeWriteBatch(WriteBatchImpl updates, long sequenceBegin) {
    Slice record = Slices.allocate(SIZE_OF_LONG + SIZE_OF_INT + updates.getApproximateSize());
    final SliceOutput sliceOutput = record.output();
    sliceOutput.writeLong(sequenceBegin);
    sliceOutput.writeInt(updates.size());
    updates.forEach(
        new Handler() {
          @Override
          public void put(Slice key, Slice value) {
            sliceOutput.writeByte(VALUE.getPersistentId());
            writeLengthPrefixedBytes(sliceOutput, key);
            writeLengthPrefixedBytes(sliceOutput, value);
          }

          @Override
          public void delete(Slice key) {
            sliceOutput.writeByte(DELETION.getPersistentId());
            writeLengthPrefixedBytes(sliceOutput, key);
          }
        });
    return record.slice(0, sliceOutput.size());
  }

  private static class InsertIntoHandler implements Handler {
    private long sequence;
    private final MemTable memTable;

    public InsertIntoHandler(MemTable memTable, long sequenceBegin) {
      this.memTable = memTable;
      this.sequence = sequenceBegin;
    }

    @Override
    public void put(Slice key, Slice value) {
      memTable.add(sequence++, VALUE, key, value);
    }

    @Override
    public void delete(Slice key) {
      memTable.add(sequence++, DELETION, key, Slices.EMPTY_SLICE);
    }
  }

  public static class DatabaseShutdownException extends DBException {
    public DatabaseShutdownException() {}

    public DatabaseShutdownException(String message) {
      super(message);
    }
  }

  public static class BackgroundProcessingException extends DBException {
    public BackgroundProcessingException(Throwable cause) {
      super(cause);
    }
  }

  private Object suspensionMutex = new Object();
  private int suspensionCounter = 0;

  @Override
  public void suspendCompactions() throws InterruptedException {
    compactionExecutor.execute(
        new Runnable() {
          @Override
          public void run() {
            try {
              synchronized (suspensionMutex) {
                suspensionCounter++;
                suspensionMutex.notifyAll();
                while (suspensionCounter > 0 && !compactionExecutor.isShutdown()) {
                  suspensionMutex.wait(500);
                }
              }
            } catch (InterruptedException e) {
              // Restore the interrupt status so the executor can observe shutdown.
              Thread.currentThread().interrupt();
            }
          }
        });
    synchronized (suspensionMutex) {
      while (suspensionCounter < 1) {
        suspensionMutex.wait();
      }
    }
  }

  @Override
  public void resumeCompactions() {
    synchronized (suspensionMutex) {
      suspensionCounter--;
      suspensionMutex.notifyAll();
    }
  }

  @Override
  public void compactRange(byte[] begin, byte[] end) throws DBException {
    throw new UnsupportedOperationException("Not yet implemented");
  }
}
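The implementation above exposes the usual leveldb-style surface: batched writes, snapshot-scoped iterators, approximate range sizes, and compaction control. A minimal usage sketch follows, assuming an org.iq80.leveldb-style DB handle; the Iq80DBFactory, Options, and helper names are assumptions drawn from that public API rather than from the listing above.

import java.io.File;

import org.iq80.leveldb.DB;
import org.iq80.leveldb.DBIterator;
import org.iq80.leveldb.Options;
import org.iq80.leveldb.Range;
import org.iq80.leveldb.ReadOptions;
import org.iq80.leveldb.Snapshot;
import org.iq80.leveldb.WriteBatch;
import org.iq80.leveldb.impl.Iq80DBFactory;

public class DbUsageSketch {
  public static void main(String[] args) throws Exception {
    Options options = new Options().createIfMissing(true);
    try (DB db = Iq80DBFactory.factory.open(new File("example-db"), options)) {
      // Batched updates are applied atomically through a single log record.
      WriteBatch batch = db.createWriteBatch();
      try {
        batch.put(Iq80DBFactory.bytes("k1"), Iq80DBFactory.bytes("v1"));
        batch.delete(Iq80DBFactory.bytes("k0"));
        db.write(batch);
      } finally {
        batch.close();
      }

      // A snapshot pins the last sequence number; reads through it ignore later writes.
      Snapshot snapshot = db.getSnapshot();
      try {
        ReadOptions readOptions = new ReadOptions().snapshot(snapshot);
        try (DBIterator it = db.iterator(readOptions)) {
          for (it.seekToFirst(); it.hasNext(); it.next()) {
            System.out.println(Iq80DBFactory.asString(it.peekNext().getKey()));
          }
        }
      } finally {
        snapshot.close();
      }

      // Approximate on-disk size of a key range, computed from version file offsets.
      long[] sizes =
          db.getApproximateSizes(new Range(Iq80DBFactory.bytes("a"), Iq80DBFactory.bytes("z")));
      System.out.println("approximate bytes: " + sizes[0]);
    }
  }
}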
/**
 * Modified copy of a linked blocking queue. Node elements (each holding two pointers, item and
 * next) currently go to garbage rather than being pooled, and {@code offer} reports a boolean
 * signal value to the caller.
 *
 * @author oifa yulian
 */
public class ConcurrentCyclicFIFO<E> {
  static class Node<E> {
    volatile E item;

    Node<E> next;

    Node(E x) {
      item = x;
    }
  }

  /** Current number of elements */
  private final AtomicInteger count = new AtomicInteger(0);

  /** Head of linked list */
  private transient Node<E> head;

  /** Tail of linked list */
  private transient Node<E> last;

  /** Lock held by take, poll, etc */
  private final ReentrantLock takeLock = new ReentrantLock();

  /** Wait queue for waiting takes */
  private final Condition notEmpty = takeLock.newCondition();

  /** Lock held by put, offer, etc */
  private final ReentrantLock putLock = new ReentrantLock();

  /**
   * Signals a waiting take. Called only from put/offer (which do not otherwise ordinarily lock
   * takeLock.)
   */
  private void signalNotEmpty() {
    final ReentrantLock takeLock = this.takeLock;
    takeLock.lock();
    try {
      notEmpty.signal();
    } finally {
      takeLock.unlock();
    }
  }

  /**
   * Creates a node and links it at end of queue.
   *
   * @param x the item
   */
  private void insert(Node<E> x) {
    last = last.next = x;
  }

  /**
   * Removes a node from the head of the queue.
   *
   * @return the node
   */
  private Node<E> extract() {
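    // Recycle the dummy head: the node that held the item becomes the new dummy,
    // and its item is handed out through the old dummy node ("current").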
    Node<E> current = head;
    head = head.next;

    current.item = head.item;
    head.item = null;

    return current;
  }

  public ConcurrentCyclicFIFO() {
    last = head = new Node<E>(null);
  }

  public int size() {
    return count.get();
  }

  public boolean offer(E e) {
    if (e == null) throw new NullPointerException();

    final AtomicInteger count = this.count;

    boolean shouldSignal = false;
    final ReentrantLock putLock = this.putLock;
    putLock.lock();
    try {
      insert(new Node<E>(e));
      shouldSignal = (count.getAndIncrement() == 0);
    } finally {
      putLock.unlock();
    }

    if (shouldSignal) signalNotEmpty();

    // True means the queue already held elements; false means it was empty and a
    // waiting taker has just been signalled.
    return !shouldSignal;
  }

  public E take() throws InterruptedException {
    Node<E> x;
    final AtomicInteger count = this.count;
    final ReentrantLock takeLock = this.takeLock;
    takeLock.lockInterruptibly();
    try {
      try {
        while (count.get() == 0) notEmpty.await();
      } catch (InterruptedException ie) {
        notEmpty.signal(); // propagate to a non-interrupted thread
        throw ie;
      }

      x = extract();
      if (count.getAndDecrement() > 1) notEmpty.signal();
    } finally {
      takeLock.unlock();
    }

    E result = x.item;

    // clear references to help garbage collection
    x.item = null;
    x.next = null;

    return result;
  }

  public E poll() {
    final AtomicInteger count = this.count;
    if (count.get() == 0) return null;

    Node<E> x = null;
    final ReentrantLock takeLock = this.takeLock;
    takeLock.lock();

    try {
      if (count.get() > 0) {
        x = extract();
        if (count.getAndDecrement() > 1) notEmpty.signal();
      }
    } finally {
      takeLock.unlock();
    }

    if (x != null) {
      E result = x.item;

      // clear references to help garbage collection
      x.item = null;
      x.next = null;

      return result;
    }

    return null;
  }

  public void clear() {
    putLock.lock();
    takeLock.lock();

    try {
      head.next = null;
      assert head.item == null;
      last = head;
      count.set(0);
    } finally {
      takeLock.unlock();
      putLock.unlock();
    }
  }
}
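A minimal producer/consumer sketch for the queue above: the worker thread blocks in take() until the producer offers work, and offer()'s return value reports whether a backlog was already present. The class and task names below are illustrative only.

public class ConcurrentCyclicFIFOExample {
  public static void main(String[] args) throws InterruptedException {
    final ConcurrentCyclicFIFO<String> queue = new ConcurrentCyclicFIFO<String>();

    // Consumer blocks in take() until the producer offers an element.
    Thread consumer =
        new Thread(
            new Runnable() {
              @Override
              public void run() {
                try {
                  while (!Thread.currentThread().isInterrupted()) {
                    String task = queue.take();
                    System.out.println("processed " + task);
                  }
                } catch (InterruptedException expected) {
                  // shutdown signal
                }
              }
            });
    consumer.start();

    for (int i = 0; i < 5; i++) {
      // offer() returns false when the queue was empty and a waiting taker was signalled.
      boolean hadBacklog = queue.offer("task-" + i);
      System.out.println("offered task-" + i + ", backlog already present: " + hadBacklog);
    }

    Thread.sleep(200); // let the consumer drain
    consumer.interrupt();
    consumer.join();
  }
}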
  public OTEServerDiscoveryImpl() {
    lock = new ReentrantLock();
    condition = lock.newCondition();
    store = new OTEServerStoreImpl(lock, condition);
    notification = new OteServerNotification(store);
  }
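The constructor above shares one ReentrantLock and Condition between the server store and its notification handler, so a caller can wait on the condition until the store is populated. Below is a minimal sketch of that hand-off pattern under assumed names (ServerStore, waitForServer); the real OTEServerStoreImpl API is not shown here and may differ.

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;

// Hypothetical store illustrating the shared lock/condition hand-off.
class ServerStore<T> {
  private final ReentrantLock lock;
  private final Condition condition;
  private final List<T> servers = new ArrayList<T>();

  ServerStore(ReentrantLock lock, Condition condition) {
    this.lock = lock;
    this.condition = condition;
  }

  // Called by the notification side when a server is discovered.
  void add(T server) {
    lock.lock();
    try {
      servers.add(server);
      condition.signalAll();
    } finally {
      lock.unlock();
    }
  }

  // Called by clients; blocks until at least one server is known or the timeout elapses.
  T waitForServer(long timeout, TimeUnit unit) throws InterruptedException {
    long nanos = unit.toNanos(timeout);
    lock.lock();
    try {
      while (servers.isEmpty()) {
        if (nanos <= 0) {
          return null;
        }
        nanos = condition.awaitNanos(nanos);
      }
      return servers.get(0);
    } finally {
      lock.unlock();
    }
  }
}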
  /**
   * Basic test of the ability to add to a buffer backed by a fixed-capacity queue and to drain
   * elements from that queue, including tests of the non-blocking aspects of the API.
   *
   * @throws TimeoutException
   * @throws ExecutionException
   * @throws InterruptedException
   */
  public void test_blockingBuffer()
      throws InterruptedException, ExecutionException, TimeoutException {

    final Integer e0 = new Integer(0);
    final Integer e1 = new Integer(1);
    final Integer e2 = new Integer(2);

    final int queueCapacity = 3;
    final BlockingQueue<Integer[]> queue = new ArrayBlockingQueue<Integer[]>(queueCapacity);
    final int chunkSize = 4;
    final long chunkTimeout = 1000;
    final TimeUnit chunkTimeoutUnit = TimeUnit.MILLISECONDS;
    /*
     * The test timeout is just a smidge longer than the chunk timeout.
     *
     * Note: use Long.MAX_VALUE only when debugging.
     */
    //      final long testTimeout = Long.MAX_VALUE;
    final long testTimeout = chunkTimeout + 20;
    final boolean ordered = false;

    final BlockingBuffer<Integer[]> buffer =
        new BlockingBuffer<Integer[]>(queue, chunkSize, chunkTimeout, chunkTimeoutUnit, ordered);

    // buffer is empty.
    assertTrue(buffer.isOpen());
    assertTrue(buffer.isEmpty());
    assertEquals("chunkCount", 0L, buffer.getChunksAddedCount());
    assertEquals("elementCount", 0L, buffer.getElementsAddedCount());

    final IAsynchronousIterator<Integer[]> itr = buffer.iterator();

    // nothing available from the iterator (non-blocking test).
    assertFalse(itr.hasNext(1, TimeUnit.NANOSECONDS));
    assertNull(itr.next(1, TimeUnit.NANOSECONDS));

    // add an element to the buffer - should not block.
    buffer.add(new Integer[] {e0});

    // should be one element and one chunk accepted by the buffer.
    assertTrue(buffer.isOpen());
    assertFalse(buffer.isEmpty());
    assertEquals("chunkCount", 1L, buffer.getChunksAddedCount());
    assertEquals("elementCount", 1L, buffer.getElementsAddedCount());

    // something should be available now (non-blocking).
    assertTrue(itr.hasNext(1, TimeUnit.NANOSECONDS));

    // something should be available now (blocking).
    assertTrue(itr.hasNext());

    // add another element to the buffer - should not block.
    buffer.add(new Integer[] {e1});

    // should be two elements and two chunks accepted into the buffer
    assertTrue(buffer.isOpen());
    assertFalse(buffer.isEmpty());
    assertEquals("chunkCount", 2L, buffer.getChunksAddedCount());
    assertEquals("elementCount", 2L, buffer.getElementsAddedCount());

    final ReentrantLock lock = new ReentrantLock();
    final Condition cond = lock.newCondition();
    final AtomicBoolean proceedFlag = new AtomicBoolean(false);

    // future of task writing a 3rd element on the buffer.
    final Future<?> producerFuture =
        service.submit(
            new Callable<Void>() {
              public Void call() throws Exception {

                lock.lockInterruptibly();
                try {
                  if (!proceedFlag.get()) {
                    cond.await();
                  }
                  /*
                   * add another element - should block until we take an
                   * element using the iterator.
                   */
                  buffer.add(new Integer[] {e2});

                  /*
                   * itr.hasNext() will block until the buffer is closed.
                   */
                  buffer.close();
                } finally {
                  lock.unlock();
                }
                // done.
                return null;
              }
            });

    // future of task draining the buffer.
    final Future<?> consumerFuture =
        service.submit(
            new Callable<Void>() {
              public Void call() throws Exception {

                try {
                  lock.lockInterruptibly();
                  try {

                    assertTrue(itr.hasNext());

                    // take the first chunk - two elements.
                    if (log.isInfoEnabled()) log.info("Awaiting first chunk");
                    assertSameArray(new Integer[] {e0, e1}, itr.next(50, TimeUnit.MILLISECONDS));
                    if (log.isInfoEnabled()) log.info("Have first chunk");

                    /*
                     * Verify that we obtained the first chunk before the
                     * buffer was closed. Otherwise next() blocked
                     * attempting to compile a full chunk until the producer
                     * timeout, at which point the producer closed the
                     * buffer and next() noticed the closed buffer and
                     * returned.
                     */
                    assertTrue(buffer.isOpen());
                    assertFalse("buffer was closed.", itr.isExhausted());

                    /*
                     * Verify that nothing is available from the iterator
                     * (non-blocking test).
                     */
                    assertFalse(itr.hasNext(1, TimeUnit.NANOSECONDS));
                    assertNull(itr.next(1, TimeUnit.NANOSECONDS));

                    // Signal the producer that it should continue.
                    proceedFlag.set(true);
                    cond.signal();

                  } finally {

                    lock.unlock();
                  }

                  // should block until we close the buffer.
                  assertTrue(itr.hasNext());

                  // last chunk
                  assertSameArray(new Integer[] {e2}, itr.next());

                  // should be immediately false.
                  assertFalse(itr.hasNext(1, TimeUnit.NANOSECONDS));
                  // should be immediately null.
                  assertNull(itr.next(1, TimeUnit.NANOSECONDS));

                  // The synchronous API should also report an exhausted
                  // itr.
                  assertFalse(itr.hasNext());
                  try {
                    itr.next();
                    fail("Expecting: " + NoSuchElementException.class);
                  } catch (NoSuchElementException ex) {
                    if (log.isInfoEnabled()) log.info("Ignoring expected exception: " + ex);
                  }

                  return null;

                } catch (Throwable t) {
                  log.error("Consumer failed or blocked: " + t, t);
                  throw new Exception(t);
                }
              }
            });

    // wait a little bit for the producer future.
    producerFuture.get(testTimeout, chunkTimeoutUnit);

    // wait a little bit for the consumer future.
    consumerFuture.get(testTimeout, chunkTimeoutUnit);
  }