Exemple #1
0
/**
 * Implementation of total order protocol using a sequencer. Consult <a
 * href="https://github.com/belaban/JGroups/blob/master/doc/design/SEQUENCER.txt">SEQUENCER.txt</a>
 * for details
 *
 * @author Bela Ban
 */
@MBean(description = "Implementation of total order protocol using a sequencer")
public class SEQUENCER extends Protocol {
  /** Address of this member; set when Event.SET_LOCAL_ADDRESS travels down the stack */
  protected Address local_addr;

  /** Current coordinator (the sequencer); updated by flush() once a coordinator change is seen */
  protected volatile Address coord;

  /** Most recent view accepted by handleViewChange() */
  protected volatile View view;

  /** True when local_addr equals coord; recomputed in flush() */
  protected volatile boolean is_coord = false;

  /**
   * Source of seqnos for messages sent by this member (incremented in down()). Only used to weed
   * out duplicates, not to establish ordering
   */
  protected final AtomicLong seqno = new AtomicLong(0);

  /**
   * Maintains messages forwarded to the coord for which no ack has been received yet. Needs to be
   * sorted so we resend them in the right order
   */
  protected final NavigableMap<Long, Message> forward_table = new ConcurrentSkipListMap<>();

  /** Held while forwarding in ack-mode and while flushing; also guards send_cond */
  protected final Lock send_lock = new ReentrantLock();

  /** Awaited by block() while flushing is in progress; signalled when flushing ends */
  protected final Condition send_cond = send_lock.newCondition();

  /**
   * When ack_mode is set, we need to wait for an ack for each forwarded message until we can send
   * the next one
   */
  protected volatile boolean ack_mode = true;

  /** Set when we block all sending threads to resend all messages from forward_table */
  protected volatile boolean flushing = false;

  /** Set in start() and cleared in stop(); loops that wait or resend check it to terminate */
  protected volatile boolean running = true;

  /** Keeps track of the threads sending messages */
  protected final AtomicInteger in_flight_sends = new AtomicInteger(0);

  // Maintains received seqnos, so we can weed out dupes
  protected final ConcurrentMap<Address, BoundedHashMap<Long, Long>> delivery_table =
      Util.createConcurrentMap();

  /** Background thread running flush(); started by startFlusher(), stopped by stopFlusher() */
  protected volatile Flusher flusher;

  /** Used for each resent message to wait until the message has been received */
  protected final Promise<Long> ack_promise = new Promise<>();

  @Property(description = "Size of the set to store received seqnos (for duplicate checking)")
  protected int delivery_table_max_size = 2000;

  @Property(
      description =
          "Number of acks needed before going from ack-mode to normal mode. "
              + "0 disables this, which means that ack-mode is always on")
  protected int threshold = 10;

  /** Acks counted in deliver() while in ack-mode; compared against threshold, reset in flush() */
  protected int num_acks = 0;

  // Statistics counters; incremented in forward(), broadcast(), up() and deliver() respectively
  protected long forwarded_msgs = 0;
  protected long bcast_msgs = 0;
  protected long received_forwards = 0;
  protected long received_bcasts = 0;
  protected long delivered_bcasts = 0;

  /** Returns whether the local address matched the coordinator when it was last recomputed. */
  @ManagedAttribute
  public boolean isCoordinator() {
    return this.is_coord;
  }

  /** Returns the coordinator address currently used as forwarding target (may be null). */
  public Address getCoordinator() {
    return this.coord;
  }

  /** Returns the address of this member (null until SET_LOCAL_ADDRESS has been seen). */
  public Address getLocalAddress() {
    return this.local_addr;
  }

  /** Returns the value of the forwarded-messages counter. */
  @ManagedAttribute
  public long getForwarded() {
    return this.forwarded_msgs;
  }

  /** Returns the value of the broadcast-messages counter. */
  @ManagedAttribute
  public long getBroadcast() {
    return this.bcast_msgs;
  }

  /** Returns the number of FORWARD/FLUSH requests accepted for broadcasting. */
  @ManagedAttribute
  public long getReceivedForwards() {
    return this.received_forwards;
  }

  /** Returns the number of BCAST/WRAPPED_BCAST messages received. */
  @ManagedAttribute
  public long getReceivedBroadcasts() {
    return this.received_bcasts;
  }

  /** Returns the current number of entries in the forward table. */
  @ManagedAttribute(description = "Number of messages in the forward-table")
  public int getForwardTableSize() {
    final int pending = forward_table.size();
    return pending;
  }

  /** Sets the number of acks required before leaving ack-mode. */
  public void setThreshold(int new_threshold) {
    threshold = new_threshold;
  }

  /** Sets the capacity used for newly created per-sender duplicate-detection sets. */
  public void setDeliveryTableMaxSize(int size) {
    this.delivery_table_max_size = size;
  }

  /** Zeroes all message counters kept by this protocol. */
  @ManagedOperation
  public void resetStats() {
    delivered_bcasts = 0L;
    received_bcasts = 0L;
    received_forwards = 0L;
    bcast_msgs = 0L;
    forwarded_msgs = 0L;
  }

  /**
   * Returns the superclass statistics augmented with this protocol's counters.
   *
   * @return the map obtained from {@code super.dumpStats()} with five additional entries
   */
  @ManagedOperation
  public Map<String, Object> dumpStats() {
    final Map<String, Object> stats = super.dumpStats();
    stats.put("forwarded", forwarded_msgs);
    stats.put("broadcast", bcast_msgs);
    stats.put("received_forwards", received_forwards);
    stats.put("received_bcasts", received_bcasts);
    stats.put("delivered_bcasts", delivered_bcasts);
    return stats;
  }

  /** Returns the string form of {@link #dumpStats()}. */
  @ManagedOperation
  public String printStats() {
    final Map<String, Object> stats = dumpStats();
    return stats.toString();
  }

  /**
   * Starts the protocol: delegates to the superclass first, then marks the protocol as running and
   * (re-)enters ack-mode.
   *
   * @throws Exception if {@code super.start()} throws
   */
  public void start() throws Exception {
    super.start();
    running = true;
    ack_mode = true;
  }

  /**
   * Stops the protocol. {@code running} is cleared first so that loops guarded by it terminate;
   * then blocked senders are woken up, the flusher thread is stopped and the superclass is
   * stopped.
   */
  public void stop() {
    running = false;
    unblockAll();
    stopFlusher();
    super.stop();
  }

  /**
   * Handles events travelling down the stack.
   *
   * <p>Multicast messages (null destination) that carry neither NO_TOTAL_ORDER nor OOB are tagged
   * with a {@link SequencerHeader} and handed to {@link #forwardToCoord(long, Message)} instead of
   * being passed down directly. All other messages, and all other event types, are passed to the
   * protocol below; view-related events and SET_LOCAL_ADDRESS are inspected on the way.
   *
   * @param evt the event to process
   * @return {@code null} for messages taken over by the sequencer, otherwise whatever the protocol
   *     below returns
   */
  public Object down(Event evt) {
    switch (evt.getType()) {
      case Event.MSG:
        Message msg = (Message) evt.getArg();
        // unicasts, NO_TOTAL_ORDER and OOB messages bypass the sequencer
        if (msg.getDest() != null
            || msg.isFlagSet(Message.Flag.NO_TOTAL_ORDER)
            || msg.isFlagSet(Message.Flag.OOB)) break;

        if (msg.getSrc() == null) msg.setSrc(local_addr);

        // senders are parked while the forward table is being flushed to a new coordinator
        if (flushing) block();

        // A seqno is not used to establish ordering, but only to weed out duplicates; next_seqno
        // doesn't need to increase monotonically, but only to be unique
        // (https://issues.jboss.org/browse/JGRP-1461) !
        long next_seqno = seqno.incrementAndGet();
        in_flight_sends.incrementAndGet();
        try {
          SequencerHeader hdr =
              new SequencerHeader(
                  is_coord ? SequencerHeader.BCAST : SequencerHeader.WRAPPED_BCAST, next_seqno);
          msg.putHeader(this.id, hdr);
          // log the seqno assigned to *this* message; the shared counter may already have been
          // advanced by another sender thread
          if (log.isTraceEnabled())
            log.trace(
                "["
                    + local_addr
                    + "]: forwarding "
                    + local_addr
                    + "::"
                    + next_seqno
                    + " to coord "
                    + coord);

          // We always forward messages to the coordinator, even if we're the coordinator. Having
          // the coord send its messages directly led to starvation of messages from other members.
          // MPerf perf went up from 20MB/sec/node to 50MB/sec/node with this change !
          forwardToCoord(next_seqno, msg);
        } catch (Exception ex) {
          log.error(Util.getMessage("FailedSendingMessage"), ex);
        } finally {
          in_flight_sends.decrementAndGet();
        }
        return null; // don't pass down

      case Event.VIEW_CHANGE:
        handleViewChange((View) evt.getArg());
        break;

      case Event.TMP_VIEW:
        handleTmpView((View) evt.getArg());
        break;

      case Event.SET_LOCAL_ADDRESS:
        local_addr = (Address) evt.getArg();
        break;
    }
    return down_prot.down(evt);
  }

  /**
   * Handles events travelling up the stack.
   *
   * <p>Messages without a {@link SequencerHeader}, and messages flagged NO_TOTAL_ORDER or OOB, are
   * passed up unchanged. Messages with a header are consumed here: FORWARD/FLUSH requests are
   * re-broadcast (coordinator only), BCAST and WRAPPED_BCAST messages are delivered through
   * {@link #deliver} / {@link #unwrapAndDeliver}.
   *
   * @param evt the event to process
   * @return {@code null} for messages consumed by the sequencer, otherwise the result of the
   *     protocol above
   */
  public Object up(Event evt) {
    Message msg;
    SequencerHeader hdr;

    switch (evt.getType()) {
      case Event.MSG:
        msg = (Message) evt.getArg();
        if (msg.isFlagSet(Message.Flag.NO_TOTAL_ORDER) || msg.isFlagSet(Message.Flag.OOB)) break;
        hdr = (SequencerHeader) msg.getHeader(this.id);
        if (hdr == null) break; // pass up

        switch (hdr.type) {
          case SequencerHeader.FORWARD:
          case SequencerHeader.FLUSH:
            // only the coordinator may turn a forwarded message into a broadcast
            if (!is_coord) {
              if (log.isErrorEnabled())
                log.error(
                    local_addr + ": non-coord; dropping FORWARD request from " + msg.getSrc());
              return null;
            }
            Address sender = msg.getSrc();
            // requests from senders outside the current view are rejected
            if (view != null && !view.containsMember(sender)) {
              if (log.isErrorEnabled())
                log.error(
                    local_addr
                        + ": dropping FORWARD request from non-member "
                        + sender
                        + "; view="
                        + view);
              return null;
            }

            // the last argument marks the broadcast as a resend when the request was a FLUSH
            broadcast(
                msg,
                true,
                msg.getSrc(),
                hdr.seqno,
                hdr.type == SequencerHeader.FLUSH); // do copy the message
            received_forwards++;
            break;

          case SequencerHeader.BCAST:
            deliver(msg, evt, hdr);
            received_bcasts++;
            break;

          case SequencerHeader.WRAPPED_BCAST:
            unwrapAndDeliver(
                msg, hdr.flush_ack); // unwrap the original message (in the payload) and deliver it
            received_bcasts++;
            break;
        }
        return null;

      case Event.VIEW_CHANGE:
        // the view is passed up before it is processed locally
        Object retval = up_prot.up(evt);
        handleViewChange((View) evt.getArg());
        return retval;

      case Event.TMP_VIEW:
        handleTmpView((View) evt.getArg());
        break;
    }

    return up_prot.up(evt);
  }

  /**
   * Processes a batch: every message that is subject to total ordering is removed from the batch
   * and handled individually through {@link #up(Event)}; whatever remains is passed up as a batch.
   *
   * @param batch the incoming message batch
   */
  public void up(MessageBatch batch) {
    for (Message candidate : batch) {
      boolean handled_here =
          !candidate.isFlagSet(Message.Flag.NO_TOTAL_ORDER)
              && !candidate.isFlagSet(Message.Flag.OOB)
              && candidate.getHeader(id) != null;
      if (handled_here) {
        batch.remove(candidate);

        // simplistic implementation: one event per message
        try {
          up(new Event(Event.MSG, candidate));
        } catch (Throwable failure) {
          log.error(Util.getMessage("FailedPassingUpMessage"), failure);
        }
      }
    }

    if (batch.isEmpty()) return;
    up_prot.up(batch);
  }

  /* --------------------------------- Private Methods ----------------------------------- */

  /**
   * Installs {@code v} if it compares greater than the current view, prunes the delivery table to
   * the new membership and, if the first member differs from the current coordinator, restarts the
   * flusher towards that member.
   *
   * @param v the view to process
   */
  protected void handleViewChange(View v) {
    final List<Address> members = v.getMembers();
    if (members.isEmpty()) return;

    // keep the installed view unless the new one compares greater
    if (view != null && view.compareTo(v) >= 0) return;
    view = v;

    delivery_table.keySet().retainAll(members);

    final Address current = coord;
    final Address candidate = members.get(0);
    if (candidate == null || Objects.equals(current, candidate)) return;

    stopFlusher();
    // needs to be done in the background, to prevent blocking if down() would block
    startFlusher(candidate);
  }

  /**
   * Switches to {@code new_coord} and resends everything in the forward table. Runs on the flusher
   * thread.
   *
   * <p>First waits (polling every 100 ms) until no sender is in flight, then, holding
   * {@code send_lock}, updates {@code coord}/{@code is_coord} and flushes the forward table. The
   * {@code finally} block always ends flushing, returns to ack-mode and wakes up blocked senders.
   *
   * @param new_coord the coordinator to switch to
   * @throws InterruptedException if the thread is interrupted while sleeping or acquiring the lock
   */
  protected void flush(final Address new_coord) throws InterruptedException {
    // wait until all threads currently sending messages have returned (new threads after
    // flushing=true) will block
    // flushing is set to true in startFlusher()
    while (flushing && running) {
      if (in_flight_sends.get() == 0) break;
      Thread.sleep(100);
    }

    send_lock.lockInterruptibly();
    try {
      if (log.isTraceEnabled())
        log.trace(local_addr + ": coord changed from " + coord + " to " + new_coord);
      coord = new_coord;
      is_coord = Objects.equals(local_addr, coord);
      flushMessagesInForwardTable();
    } finally {
      if (log.isTraceEnabled()) log.trace(local_addr + ": flushing completed");
      flushing = false;
      ack_mode = true; // go to ack-mode after flushing
      num_acks = 0;
      send_cond.signalAll(); // releases threads parked in block()
      send_lock.unlock();
    }
  }

  /**
   * If we're becoming coordinator, a TMP_VIEW has to be handled as an immediate change of view.
   * See JGRP-1452.
   *
   * @param v the temporary view
   */
  private void handleTmpView(View v) {
    List<Address> members = v.getMembers();
    if (members.isEmpty()) return;

    Address first = members.get(0);
    // nothing to do when the first member already is our coordinator
    if (first.equals(coord)) return;
    if (local_addr != null && local_addr.equals(first)) handleViewChange(v);
  }

  /**
   * Sends all messages currently in forward_table to the new coordinator (changing the dest field).
   * This needs to be done, so the underlying reliable unicast protocol (e.g. UNICAST) adds these
   * messages to its retransmission mechanism<br>
   * Note that we need to resend the messages in order of their seqnos ! We also need to prevent
   * other message from being inserted until we're done, that's why there's synchronization.<br>
   * Access to the forward_table doesn't need to be synchronized as there won't be any insertions
   * during flushing (all down-threads are blocked)
   *
   * <p>If this member is the coordinator, every pending message is broadcast once as a
   * WRAPPED_BCAST. Otherwise each pending message is sent to the coordinator as a FLUSH request
   * and resent every 500 ms until it is acked or has disappeared from the table. A message that
   * cannot be marshalled is logged and, in the non-coordinator case, removed from the table, as
   * retrying it would only spin.
   */
  protected void flushMessagesInForwardTable() {
    if (is_coord) {
      for (Map.Entry<Long, Message> entry : forward_table.entrySet()) {
        Long key = entry.getKey();
        Message msg = entry.getValue();
        Buffer buf;
        try {
          buf = Util.streamableToBuffer(msg);
        } catch (Exception e) {
          log.error(Util.getMessage("FlushingBroadcastingFailed"), e);
          continue;
        }

        SequencerHeader hdr = new SequencerHeader(SequencerHeader.WRAPPED_BCAST, key);
        Message forward_msg = new Message(null, buf).putHeader(this.id, hdr);
        if (log.isTraceEnabled())
          log.trace(local_addr + ": flushing (broadcasting) " + local_addr + "::" + key);
        down_prot.down(new Event(Event.MSG, forward_msg));
      }
      return;
    }

    // for forwarded messages, we need to receive the forwarded message from the coordinator, to
    // prevent this case:
    // - V1={A,B,C}
    // - A crashes
    // - C installs V2={B,C}
    // - C forwards messages 3 and 4 to B (the new coord)
    // - B drops 3 because its view is still V1
    // - B installs V2
    // - B receives message 4 and broadcasts it
    // ==> C's message 4 is delivered *before* message 3 !
    // ==> By resending 3 until it is received, then resending 4 until it is received, we make sure
    // this won't happen
    // (see https://issues.jboss.org/browse/JGRP-1449)
    while (flushing && running && !forward_table.isEmpty()) {
      Map.Entry<Long, Message> entry = forward_table.firstEntry();
      // deliver() removes entries from another thread, so the table may have been emptied between
      // the isEmpty() check above and firstEntry()
      if (entry == null) break;
      final Long key = entry.getKey();
      Message msg = entry.getValue();
      Buffer buf;

      try {
        buf = Util.streamableToBuffer(msg);
      } catch (Exception e) {
        log.error(Util.getMessage("FlushingBroadcastingFailed"), e);
        // the entry would be picked again by firstEntry() and fail again: drop it instead of
        // spinning on it for as long as flushing lasts
        forward_table.remove(key);
        continue;
      }

      // resend the same message until it is acked or no longer pending
      while (flushing && running && !forward_table.isEmpty()) {
        SequencerHeader hdr = new SequencerHeader(SequencerHeader.FLUSH, key);
        Message forward_msg =
            new Message(coord, buf).putHeader(this.id, hdr).setFlag(Message.Flag.DONT_BUNDLE);
        if (log.isTraceEnabled())
          log.trace(
              local_addr
                  + ": flushing (forwarding) "
                  + local_addr
                  + "::"
                  + key
                  + " to coord "
                  + coord);
        ack_promise.reset();
        down_prot.down(new Event(Event.MSG, forward_msg));
        Long ack = ack_promise.getResult(500);
        if ((Objects.equals(ack, key)) || !forward_table.containsKey(key)) break;
      }
    }
  }

  /**
   * Forwards {@code msg} to the coordinator according to the current mode.
   *
   * <ul>
   *   <li>coordinator: forward directly, without recording the message in the forward table
   *   <li>stopped or flushing: only record the message in the forward table
   *   <li>normal mode: record the message and forward it once
   *   <li>ack-mode: under {@code send_lock}, record the message and resend it as a FLUSH request
   *       every 500 ms until it is acked or no longer pending
   * </ul>
   *
   * @param seqno the seqno assigned to the message
   * @param msg the message to forward
   */
  protected void forwardToCoord(long seqno, Message msg) {
    if (is_coord) {
      forward(msg, seqno, false);
      return;
    }

    if (!running || flushing) {
      forward_table.put(seqno, msg);
      return;
    }

    if (!ack_mode) {
      forward_table.put(seqno, msg);
      forward(msg, seqno, false);
      return;
    }

    send_lock.lock();
    try {
      forward_table.put(seqno, msg);
      while (running && !flushing) {
        ack_promise.reset();
        forward(msg, seqno, true);
        // state may have changed while sending; don't wait for an ack in that case
        if (!ack_mode || !running || flushing) break;
        Long ack = ack_promise.getResult(500);
        if ((Objects.equals(ack, seqno)) || !forward_table.containsKey(seqno)) break;
      }
    } finally {
      send_lock.unlock();
    }
  }

  /**
   * Marshals {@code msg} into the payload of a new unicast message addressed to the current
   * coordinator and sends it down the stack. Does nothing if no coordinator is known yet.
   * Failures are logged, not propagated.
   *
   * @param msg the original message to wrap
   * @param seqno the seqno placed into the header of the wrapping message
   * @param flush if true the request is sent as FLUSH, otherwise as FORWARD
   */
  protected void forward(final Message msg, long seqno, boolean flush) {
    Address target = coord;
    if (target == null) return;
    byte type = flush ? SequencerHeader.FLUSH : SequencerHeader.FORWARD;
    try {
      SequencerHeader hdr = new SequencerHeader(type, seqno);
      Message forward_msg =
          new Message(target, Util.streamableToBuffer(msg)).putHeader(this.id, hdr);
      down_prot.down(new Event(Event.MSG, forward_msg));
      forwarded_msgs++;
    } catch (Exception ex) {
      // report the coordinator we tried to reach; the wrapped message itself has no destination
      log.error(Util.getMessage("FailedForwardingMessageTo") + target, ex);
    }
  }

  /**
   * Sends a message to all members.
   *
   * @param msg the message to broadcast, or the message whose buffer is to be wrapped
   * @param copy if true, a new WRAPPED_BCAST message is created around msg's buffer; if false, msg
   *     is sent as is
   * @param original_sender the original sender, used for trace logging only
   * @param seqno the seqno placed into the new header (when copying) and logged
   * @param resend if true (and copy is true) the broadcast is marked as flush ack and not bundled
   */
  protected void broadcast(
      final Message msg, boolean copy, Address original_sender, long seqno, boolean resend) {
    final Message outgoing;

    if (copy) {
      SequencerHeader wrapped_hdr = new SequencerHeader(SequencerHeader.WRAPPED_BCAST, seqno);
      outgoing =
          new Message(null, msg.getRawBuffer(), msg.getOffset(), msg.getLength())
              .putHeader(this.id, wrapped_hdr);
      if (resend) {
        wrapped_hdr.flush_ack = true;
        outgoing.setFlag(Message.Flag.DONT_BUNDLE);
      }
    } else {
      // no need to add a header, message already has one
      outgoing = msg;
    }

    if (log.isTraceEnabled()) {
      log.trace(local_addr + ": broadcasting " + original_sender + "::" + seqno);
    }

    Event bcast_evt = new Event(Event.MSG, outgoing);
    down_prot.down(bcast_evt);
    bcast_msgs++;
  }

  /**
   * Reads the original message out of the payload of {@code msg} and hands it to {@link #deliver}
   * (which drops it if it was delivered before). Any failure is logged.
   *
   * @param msg the wrapping message whose buffer holds the marshalled original message
   * @param flush_ack if true, the header of the unwrapped message is marked as flush ack
   */
  protected void unwrapAndDeliver(final Message msg, boolean flush_ack) {
    try {
      Message inner =
          Util.streamableFromBuffer(
              Message.class, msg.getRawBuffer(), msg.getOffset(), msg.getLength());
      SequencerHeader inner_hdr = (SequencerHeader) inner.getHeader(this.id);
      if (flush_ack) {
        inner_hdr.flush_ack = true;
      }
      Event inner_evt = new Event(Event.MSG, inner);
      deliver(inner, inner_evt, inner_hdr);
    } catch (Exception failure) {
      log.error(Util.getMessage("FailureUnmarshallingBuffer"), failure);
    }
  }

  /**
   * Passes a broadcast message up the stack unless it is a duplicate.
   *
   * <p>For messages sent by this member, the entry is first removed from the forward table; if the
   * header is marked as flush ack, the waiting sender is released through {@code ack_promise} and
   * the ack counter may switch the protocol out of ack-mode.
   *
   * @param msg the message to deliver
   * @param evt the event passed up when the message is delivered
   * @param hdr the sequencer header of msg
   */
  protected void deliver(Message msg, Event evt, SequencerHeader hdr) {
    Address sender = msg.getSrc();
    if (sender == null) {
      if (log.isErrorEnabled())
        log.error(local_addr + ": sender is null, cannot deliver " + "::" + hdr.getSeqno());
      return;
    }
    long msg_seqno = hdr.getSeqno();
    if (sender.equals(local_addr)) {
      // our own message came back: it no longer needs to be resent
      forward_table.remove(msg_seqno);
      if (hdr.flush_ack) {
        ack_promise.setResult(msg_seqno);
        // leave ack-mode after 'threshold' acks (threshold == 0 keeps ack-mode on)
        if (ack_mode && !flushing && threshold > 0 && ++num_acks >= threshold) {
          ack_mode = false;
          num_acks = 0;
        }
      }
    }
    if (!canDeliver(sender, msg_seqno)) {
      if (log.isWarnEnabled())
        log.warn(local_addr + ": dropped duplicate message " + sender + "::" + msg_seqno);
      return;
    }
    if (log.isTraceEnabled()) log.trace(local_addr + ": delivering " + sender + "::" + msg_seqno);
    up_prot.up(evt);
    delivered_bcasts++;
  }

  /**
   * Checks if seqno has already been received from sender. This weeds out duplicates. Note that
   * this method is never called concurrently for the same sender, as the sender in NAKACK will
   * always be the coordinator.
   *
   * <p>The per-sender set is created on first use. It is registered with {@code putIfAbsent}, so
   * a set registered by another thread in the meantime is reused rather than replaced.
   *
   * @param sender the original sender of the message
   * @param seqno the seqno of the message
   * @return the result of adding seqno to the sender's set; {@link #deliver} treats {@code false}
   *     as "duplicate"
   */
  protected boolean canDeliver(Address sender, long seqno) {
    BoundedHashMap<Long, Long> seqno_set = delivery_table.get(sender);
    if (seqno_set == null) {
      seqno_set = new BoundedHashMap<>(delivery_table_max_size);
      // putIfAbsent (not put): put() would overwrite a set inserted concurrently and the seqno
      // would then be recorded in a set that is no longer in the table
      BoundedHashMap<Long, Long> existing = delivery_table.putIfAbsent(sender, seqno_set);
      if (existing != null) seqno_set = existing;
    }
    return seqno_set.add(seqno, seqno);
  }

  /**
   * Parks the calling thread on {@code send_cond} for as long as flushing is in progress and the
   * protocol is running. Interrupts do not end the wait.
   */
  protected void block() {
    send_lock.lock();
    try {
      for (; ; ) {
        if (!flushing || !running) break;
        try {
          send_cond.await();
        } catch (InterruptedException ignored) {
          // keep waiting; the loop condition decides when to leave
        }
      }
    } finally {
      send_lock.unlock();
    }
  }

  /**
   * Ends flushing and releases every waiting thread: senders parked in block() are signalled and
   * a thread waiting on {@code ack_promise} is released with a null result.
   */
  protected void unblockAll() {
    flushing = false; // cleared before signalling so that woken threads leave their wait loop
    send_lock.lock();
    try {
      send_cond.signalAll();
      ack_promise.setResult(null);
    } finally {
      send_lock.unlock();
    }
  }

  /**
   * Starts a flusher thread for {@code new_coord} unless one is still alive.
   *
   * @param new_coord the coordinator the flusher should switch to
   */
  protected synchronized void startFlusher(final Address new_coord) {
    if (flusher != null && flusher.isAlive()) return;

    if (log.isTraceEnabled()) {
      log.trace(local_addr + ": flushing started");
    }
    // causes subsequent message sends (broadcasts and forwards) to block
    // (https://issues.jboss.org/browse/JGRP-1495)
    flushing = true;

    flusher = new Flusher(new_coord);
    flusher.setName("Flusher");
    flusher.start();
  }

  /**
   * Ends flushing and waits until the flusher thread (if any) has terminated. The thread is
   * interrupted and a pending ack wait is released on every round; an interrupt of the calling
   * thread does not end the wait.
   */
  protected void stopFlusher() {
    flushing = false;
    final Thread active = flusher;
    if (active == null) return;

    while (active.isAlive()) {
      active.interrupt();
      ack_promise.setResult(null);
      try {
        active.join();
      } catch (InterruptedException ignored) {
        // retry until the flusher is gone
      }
    }
  }

  /* ----------------------------- End of Private Methods -------------------------------- */

  /** Background thread that runs {@link #flush(Address)} for a given coordinator. */
  protected class Flusher extends Thread {
    protected final Address new_coord;

    /**
     * @param new_coord the coordinator passed to flush() when the thread runs
     */
    public Flusher(Address new_coord) {
      this.new_coord = new_coord;
    }

    /** Runs the flush; an interrupt simply ends the thread. */
    public void run() {
      try {
        flush(this.new_coord);
      } catch (InterruptedException ignored) {
        // interrupted by stopFlusher(): nothing left to do
      }
    }
  }

  /**
   * Header attached to every message handled by the sequencer: a type, the sender-assigned seqno
   * and a flag marking the message as acknowledgement of a FLUSH request.
   */
  public static class SequencerHeader extends Header {
    protected static final byte FORWARD = 1;
    protected static final byte FLUSH = 2;
    protected static final byte BCAST = 3;
    protected static final byte WRAPPED_BCAST = 4;

    protected byte type = -1;
    protected long seqno = -1;
    protected boolean flush_ack;

    /** Creates a header with type and seqno both set to -1. */
    public SequencerHeader() {}

    /**
     * @param type one of FORWARD, FLUSH, BCAST or WRAPPED_BCAST
     */
    public SequencerHeader(byte type) {
      this.type = type;
    }

    /**
     * @param type one of FORWARD, FLUSH, BCAST or WRAPPED_BCAST
     * @param seqno the seqno assigned by the sender
     */
    public SequencerHeader(byte type, long seqno) {
      this.type = type;
      this.seqno = seqno;
    }

    /** Returns the seqno carried by this header. */
    public long getSeqno() {
      return this.seqno;
    }

    /** Returns the type name, followed by the seqno (if non-negative) and the flush-ack marker. */
    public String toString() {
      StringBuilder text = new StringBuilder(64);
      text.append(printType());
      if (seqno >= 0) {
        text.append(" seqno=").append(seqno);
      }
      if (flush_ack) {
        text.append(" (flush_ack)");
      }
      return text.toString();
    }

    /** Returns the symbolic name of the type, or "n/a" for an unknown value. */
    protected final String printType() {
      final String name;
      switch (type) {
        case FORWARD:
          name = "FORWARD";
          break;
        case FLUSH:
          name = "FLUSH";
          break;
        case BCAST:
          name = "BCAST";
          break;
        case WRAPPED_BCAST:
          name = "WRAPPED_BCAST";
          break;
        default:
          name = "n/a";
          break;
      }
      return name;
    }

    /** Writes type, seqno and flush_ack, in that order. */
    public void writeTo(DataOutput out) throws Exception {
      out.writeByte(this.type);
      Bits.writeLong(this.seqno, out);
      out.writeBoolean(this.flush_ack);
    }

    /** Reads the fields in the order in which writeTo() wrote them. */
    public void readFrom(DataInput in) throws Exception {
      this.type = in.readByte();
      this.seqno = Bits.readLong(in);
      this.flush_ack = in.readBoolean();
    }

    /** Returns the marshalled size: type + seqno + flush_ack. */
    public int size() {
      int type_size = Global.BYTE_SIZE;
      int seqno_size = Bits.size(seqno);
      int flush_ack_size = Global.BYTE_SIZE;
      return type_size + seqno_size + flush_ack_size;
    }
  }
}
Exemple #2
0
/**
 * Reliable unicast layer. Uses acknowledgement scheme similar to TCP to provide lossless
 * transmission of unicast messages (for reliable multicast see NAKACK layer). When a message is
 * sent to a peer for the first time, we add the pair <peer_addr, Entry> to the hashtable (peer
 * address is the key). All messages sent to that peer will be added to
 * hashtable.peer_addr.sent_msgs. When we receive a message from a peer for the first time, another
 * entry will be created and added to the hashtable (unless already existing). Msgs will then be
 * added to hashtable.peer_addr.received_msgs.
 *
 * <p>This layer is used to reliably transmit point-to-point messages, that is, either messages sent
 * to a single receiver (vs. messages multicast to a group) or for example replies to a multicast
 * message. The sender uses an <code>AckSenderWindow</code> which retransmits messages for which it
 * hasn't received an ACK, the receiver uses <code>AckReceiverWindow</code> which keeps track of the
 * lowest seqno received so far, and keeps messages in order.
 *
 * <p>Messages in both AckSenderWindows and AckReceiverWindows will be removed. A message will be
 * removed from AckSenderWindow when an ACK has been received for it and messages will be removed
 * from AckReceiverWindow whenever a message is received: the new message is added and then we try
 * to remove as many messages as possible (until we stop at a gap, or there are no more messages).
 *
 * @author Bela Ban
 */
@MBean(description = "Reliable unicast layer")
public class UNICAST extends Protocol implements AgeOutCache.Handler<Address> {
  public static final long DEFAULT_FIRST_SEQNO = Global.DEFAULT_FIRST_UNICAST_SEQNO;

  /* ------------------------------------------ Properties  ------------------------------------------ */

  @Deprecated
  protected int[] timeout = {
    400, 800, 1600, 3200
  }; // for AckSenderWindow: max time to wait for missing acks

  @Property(
      description =
          "Max number of messages to be removed from a retransmit window. This property might "
              + "get removed anytime, so don't use it !")
  protected int max_msg_batch_size = 500;

  @Property(
      description =
          "Time (in milliseconds) after which an idle incoming or outgoing connection is closed. The "
              + "connection will get re-established when used again. 0 disables connection reaping")
  protected long conn_expiry_timeout = 0;

  @Deprecated
  @Property(
      description =
          "Size (in bytes) of a Segment in the segments table. Only for experts, do not use !",
      deprecatedMessage = "not used anymore")
  protected int segment_capacity = 1000;

  @Property(
      description = "Number of rows of the matrix in the retransmission table (only for experts)",
      writable = false)
  protected int xmit_table_num_rows = 100;

  @Property(
      description =
          "Number of elements of a row of the matrix in the retransmission table (only for experts). "
              + "The capacity of the matrix is xmit_table_num_rows * xmit_table_msgs_per_row",
      writable = false)
  protected int xmit_table_msgs_per_row = 1000;

  @Property(
      description = "Resize factor of the matrix in the retransmission table (only for experts)",
      writable = false)
  protected double xmit_table_resize_factor = 1.2;

  @Property(
      description =
          "Number of milliseconds after which the matrix in the retransmission table "
              + "is compacted (only for experts)",
      writable = false)
  protected long xmit_table_max_compaction_time = 10 * 60 * 1000;

  // @Property(description="Max time (in ms) after which a connection to a non-member is closed")
  protected long max_retransmit_time = 60 * 1000L;

  @Property(
      description = "Interval (in milliseconds) at which messages in the send windows are resent")
  protected long xmit_interval = 2000;

  /* --------------------------------------------- JMX  ---------------------------------------------- */

  protected long num_msgs_sent = 0, num_msgs_received = 0;
  protected long num_acks_sent = 0, num_acks_received = 0, num_xmits = 0;

  /* --------------------------------------------- Fields ------------------------------------------------ */

  protected final ConcurrentMap<Address, SenderEntry> send_table = Util.createConcurrentMap();
  protected final ConcurrentMap<Address, ReceiverEntry> recv_table = Util.createConcurrentMap();

  protected final ReentrantLock recv_table_lock = new ReentrantLock();

  /** RetransmitTask running every xmit_interval ms */
  protected Future<?> xmit_task;

  protected volatile List<Address> members = new ArrayList<Address>(11);

  protected Address local_addr = null;

  protected TimeScheduler timer = null; // used for retransmissions (passed to AckSenderWindow)

  protected volatile boolean running = false;

  protected short last_conn_id = 0;

  protected AgeOutCache<Address> cache = null;

  protected Future<?> connection_reaper; // closes idle connections

  /** Returns the (deprecated) timeout array itself, not a copy. */
  public int[] getTimeout() {
    return this.timeout;
  }

  /**
   * Replaces the timeout array; a null argument is ignored.
   *
   * @param val the new timeouts
   */
  @Deprecated
  @Property(
      name = "timeout",
      converter = PropertyConverters.IntegerArray.class,
      deprecatedMessage = "not used anymore")
  public void setTimeout(int[] val) {
    if (val == null) return;
    timeout = val;
  }

  /**
   * Sets the max message batch size; values below 1 are ignored.
   *
   * @param size the new batch size
   */
  public void setMaxMessageBatchSize(int size) {
    if (size < 1) return;
    max_msg_batch_size = size;
  }

  /** Returns the local address as a string, or "null" if it has not been set. */
  @ManagedAttribute
  public String getLocalAddress() {
    if (local_addr == null) {
      return "null";
    }
    return local_addr.toString();
  }

  /** Returns the string form of the current member list. */
  @ManagedAttribute
  public String getMembers() {
    final List<Address> current = members;
    return current.toString();
  }

  /** Returns true if a connection reaper task exists and has not completed. */
  @ManagedAttribute(description = "Whether the ConnectionReaper task is running")
  public boolean isConnectionReaperRunning() {
    if (connection_reaper == null) {
      return false;
    }
    return !connection_reaper.isDone();
  }

  /** Returns the number of entries in the send table. */
  @ManagedAttribute(description = "Returns the number of outgoing (send) connections")
  public int getNumSendConnections() {
    final int outgoing = send_table.size();
    return outgoing;
  }

  /** Returns the number of entries in the receive table. */
  @ManagedAttribute(description = "Returns the number of incoming (receive) connections")
  public int getNumReceiveConnections() {
    final int incoming = recv_table.size();
    return incoming;
  }

  /** Returns the sum of receive and send connections. */
  @ManagedAttribute(
      description =
          "Returns the total number of outgoing (send) and incoming (receive) connections")
  public int getNumConnections() {
    final int incoming = getNumReceiveConnections();
    final int outgoing = getNumSendConnections();
    return incoming + outgoing;
  }

  /**
   * Lists all send and receive connections, one "address: entry" line per connection. A section
   * is omitted when its table is empty.
   *
   * @return the formatted listing (empty if there are no connections)
   */
  @ManagedOperation
  public String printConnections() {
    final StringBuilder out = new StringBuilder();

    if (!send_table.isEmpty()) {
      out.append("\nsend connections:\n");
      for (Map.Entry<Address, SenderEntry> conn : send_table.entrySet()) {
        out.append(conn.getKey());
        out.append(": ");
        out.append(conn.getValue());
        out.append("\n");
      }
    }

    if (!recv_table.isEmpty()) {
      out.append("\nreceive connections:\n");
      for (Map.Entry<Address, ReceiverEntry> conn : recv_table.entrySet()) {
        out.append(conn.getKey());
        out.append(": ");
        out.append(conn.getValue());
        out.append("\n");
      }
    }

    return out.toString();
  }

  /** Returns the value of the {@code num_msgs_sent} counter. */
  @ManagedAttribute
  public long getNumMessagesSent() {
    return this.num_msgs_sent;
  }

  /** Returns the value of the {@code num_msgs_received} counter. */
  @ManagedAttribute
  public long getNumMessagesReceived() {
    return this.num_msgs_received;
  }

  /** Returns the value of the {@code num_acks_sent} counter. */
  @ManagedAttribute
  public long getNumAcksSent() {
    return this.num_acks_sent;
  }

  /** Returns the value of the {@code num_acks_received} counter. */
  @ManagedAttribute
  public long getNumAcksReceived() {
    return this.num_acks_received;
  }

  /** Returns the value of the {@code num_xmits} counter. */
  @ManagedAttribute
  public long getNumXmits() {
    return this.num_xmits;
  }

  /** Returns the configured max retransmit time in milliseconds. */
  public long getMaxRetransmitTime() {
    return this.max_retransmit_time;
  }

  /**
   * Stores the new max retransmit time and, if an age-out cache exists and the value is positive,
   * applies it as the cache timeout.
   *
   * @param max_retransmit_time the new value in milliseconds
   */
  @Property(
      description =
          "Max number of milliseconds we try to retransmit a message to any given member. After that, "
              + "the connection is removed. Any new connection to that member will start with seqno #1 again. 0 disables this")
  public void setMaxRetransmitTime(long max_retransmit_time) {
    this.max_retransmit_time = max_retransmit_time;
    if (cache == null || max_retransmit_time <= 0) return;
    cache.setTimeout(max_retransmit_time);
  }

  /** Returns true if a retransmit task exists and has not completed. */
  @ManagedAttribute(description = "Is the retransmit task running")
  public boolean isXmitTaskRunning() {
    if (xmit_task == null) {
      return false;
    }
    return !xmit_task.isDone();
  }

  /** Returns the size of the age-out cache, or 0 if there is none. */
  @ManagedAttribute
  public int getAgeOutCacheSize() {
    if (cache == null) {
      return 0;
    }
    return cache.size();
  }

  /** Returns the string form of the age-out cache, or "n/a" if there is none. */
  @ManagedOperation
  public String printAgeOutCache() {
    if (cache == null) {
      return "n/a";
    }
    return cache.toString();
  }

  /** Returns the age-out cache (may be null). */
  public AgeOutCache<Address> getAgeOutCache() {
    return this.cache;
  }

  /** Used for testing only */
  public boolean hasSendConnectionTo(Address dest) {
    final boolean present = send_table.containsKey(dest);
    return present;
  }

  /** The number of messages in all Entry.sent_msgs tables (haven't received an ACK yet) */
  @ManagedAttribute
  public int getNumUnackedMessages() {
    int num = 0;
    for (SenderEntry entry : send_table.values()) {
      if (entry.sent_msgs != null) num += entry.sent_msgs.size();
    }
    return num;
  }

  @ManagedAttribute
  public int getNumberOfMessagesInReceiveWindows() {
    int num = 0;
    for (ReceiverEntry entry : recv_table.values()) {
      if (entry.received_msgs != null) num += entry.received_msgs.size();
    }
    return num;
  }

  @ManagedAttribute(description = "Total number of undelivered messages in all receive windows")
  public long getXmitTableUndeliveredMessages() {
    long retval = 0;
    for (ReceiverEntry entry : recv_table.values()) {
      if (entry.received_msgs != null) retval += entry.received_msgs.size();
    }
    return retval;
  }

  @ManagedAttribute(description = "Total number of missing messages in all receive windows")
  public long getXmitTableMissingMessages() {
    long retval = 0;
    for (ReceiverEntry entry : recv_table.values()) {
      if (entry.received_msgs != null) retval += entry.received_msgs.getNumMissing();
    }
    return retval;
  }

  @ManagedAttribute(description = "Number of compactions in all (receive and send) windows")
  public int getXmitTableNumCompactions() {
    int retval = 0;
    for (ReceiverEntry entry : recv_table.values()) {
      if (entry.received_msgs != null) retval += entry.received_msgs.getNumCompactions();
    }
    for (SenderEntry entry : send_table.values()) {
      if (entry.sent_msgs != null) retval += entry.sent_msgs.getNumCompactions();
    }
    return retval;
  }

  /**
   * Adds up the move counters of all receive tables and all send tables.
   *
   * @return the combined {@code getNumMoves()} value of both kinds of tables
   */
  @ManagedAttribute(description = "Number of moves in all (receive and send) windows")
  public int getXmitTableNumMoves() {
    int total = 0;
    for (ReceiverEntry receiver_entry : recv_table.values()) {
      if (receiver_entry.received_msgs == null) {
        continue;
      }
      total += receiver_entry.received_msgs.getNumMoves();
    }
    for (SenderEntry sender_entry : send_table.values()) {
      if (sender_entry.sent_msgs == null) {
        continue;
      }
      total += sender_entry.sent_msgs.getNumMoves();
    }
    return total;
  }

  /**
   * Adds up the resize counters of all receive tables and all send tables.
   *
   * @return the combined {@code getNumResizes()} value of both kinds of tables
   */
  @ManagedAttribute(description = "Number of resizes in all (receive and send) windows")
  public int getXmitTableNumResizes() {
    int total = 0;
    for (ReceiverEntry receiver_entry : recv_table.values()) {
      if (receiver_entry.received_msgs == null) {
        continue;
      }
      total += receiver_entry.received_msgs.getNumResizes();
    }
    for (SenderEntry sender_entry : send_table.values()) {
      if (sender_entry.sent_msgs == null) {
        continue;
      }
      total += sender_entry.sent_msgs.getNumResizes();
    }
    return total;
  }

  /**
   * Adds up the purge counters of all receive tables and all send tables.
   *
   * @return the combined {@code getNumPurges()} value of both kinds of tables
   */
  @ManagedAttribute(description = "Number of purges in all (receive and send) windows")
  public int getXmitTableNumPurges() {
    int total = 0;
    for (ReceiverEntry receiver_entry : recv_table.values()) {
      if (receiver_entry.received_msgs == null) {
        continue;
      }
      total += receiver_entry.received_msgs.getNumPurges();
    }
    for (SenderEntry sender_entry : send_table.values()) {
      if (sender_entry.sent_msgs == null) {
        continue;
      }
      total += sender_entry.sent_msgs.getNumPurges();
    }
    return total;
  }

  /**
   * Renders one line per peer in {@code recv_table}, preceded by a header line with the local
   * address.
   *
   * @return {@code "<local_addr>:\n"} followed by {@code "<peer>: <table>\n"} for each entry
   */
  @ManagedOperation(description = "Prints the contents of the receive windows for all members")
  public String printReceiveWindowMessages() {
    StringBuilder out = new StringBuilder();
    out.append(local_addr).append(":\n");
    for (Map.Entry<Address, ReceiverEntry> mapping : recv_table.entrySet()) {
      out.append(mapping.getKey())
          .append(": ")
          .append(mapping.getValue().received_msgs.toString())
          .append('\n');
    }
    return out.toString();
  }

  /**
   * Renders one line per peer in {@code send_table}, preceded by a header line with the local
   * address.
   *
   * @return {@code "<local_addr>:\n"} followed by {@code "<peer>: <table>\n"} for each entry
   */
  @ManagedOperation(description = "Prints the contents of the send windows for all members")
  public String printSendWindowMessages() {
    StringBuilder out = new StringBuilder();
    out.append(local_addr).append(":\n");
    for (Map.Entry<Address, SenderEntry> mapping : send_table.entrySet()) {
      out.append(mapping.getKey())
          .append(": ")
          .append(mapping.getValue().sent_msgs.toString())
          .append('\n');
    }
    return out.toString();
  }

  /** Sets the message, ack and retransmission counters of this protocol back to zero. */
  public void resetStats() {
    num_acks_received = 0;
    num_acks_sent = 0;
    num_msgs_received = 0;
    num_msgs_sent = 0;
    num_xmits = 0;
  }

  /**
   * Extends the statistics map produced by the superclass with the number of unacked messages and
   * the number of messages in the receive windows.
   *
   * @return the map returned by {@code super.dumpStats()}, with the two additional keys {@code
   *     num_unacked_msgs} and {@code num_msgs_in_recv_windows}
   */
  @Override
  public Map<String, Object> dumpStats() {
    final Map<String, Object> stats = super.dumpStats();
    int unacked = getNumUnackedMessages();
    stats.put("num_unacked_msgs", unacked);
    int in_recv_windows = getNumberOfMessagesInReceiveWindows();
    stats.put("num_msgs_in_recv_windows", in_recv_windows);
    return stats;
  }

  /**
   * Starts the protocol: fetches the timer from the transport, optionally creates the age-out cache
   * (when {@code max_retransmit_time > 0}), marks the protocol as running, optionally starts the
   * connection reaper (when {@code conn_expiry_timeout > 0}) and starts the retransmit task.
   *
   * @throws Exception if the transport does not provide a timer
   */
  public void start() throws Exception {
    timer = getTransport().getTimer();
    if (timer == null) {
      throw new Exception("timer is null");
    }

    if (max_retransmit_time > 0) {
      cache = new AgeOutCache<Address>(timer, max_retransmit_time, this);
    }

    running = true;

    if (conn_expiry_timeout > 0) {
      startConnectionReaper();
    }
    startRetransmitTask();
  }

  /**
   * Stops the protocol: clears the {@code running} flag, stops the retransmit task and the
   * connection reaper, and finally drops all send and receive connections.
   */
  public void stop() {
    running = false; // cleared first; down(), sendAck() and the send retry loop check this flag
    stopRetransmitTask();
    stopConnectionReaper();
    removeAllConnections();
  }

  /**
   * Handles an event travelling up the stack. Only {@code MSG} events that carry a unicast message
   * (non-null destination) without the {@code NO_RELIABILITY} flag and with a {@link UnicastHeader}
   * are consumed here; everything else is handed to the protocol above.
   *
   * @param evt the event received from the protocol below
   * @return {@code null} if the message was consumed by this protocol, otherwise the result of
   *     passing the event up
   */
  public Object up(Event evt) {
    if (evt.getType() != Event.MSG) {
      return up_prot.up(evt);
    }

    Message msg = (Message) evt.getArg();
    // only unicast messages which want reliability are handled here
    boolean not_ours = msg.getDest() == null || msg.isFlagSet(Message.Flag.NO_RELIABILITY);
    if (not_ours) {
      return up_prot.up(evt);
    }

    UnicastHeader hdr = (UnicastHeader) msg.getHeader(this.id);
    if (hdr == null) {
      return up_prot.up(evt);
    }

    Address sender = msg.getSrc();
    if (hdr.type == UnicastHeader.DATA) {
      // regular message
      handleDataReceived(sender, hdr.seqno, hdr.conn_id, hdr.first, msg, evt);
    } else {
      handleUpEvent(sender, hdr);
    }
    return null;
  }

  /**
   * Dispatches a non-DATA unicast header to its handler: ACKs go to {@link #handleAckReceived} and
   * SEND_FIRST_SEQNO requests go to {@link #handleResendingOfFirstMessage}. Unknown types are
   * logged as errors.
   *
   * @param sender the address the message came from
   * @param hdr the unicast header of the message
   * @throws IllegalStateException if the header is of type DATA
   */
  protected void handleUpEvent(Address sender, UnicastHeader hdr) {
    final byte type = hdr.type;
    if (type == UnicastHeader.DATA) {
      throw new IllegalStateException(
          "header of type DATA is not supposed to be handled by this method");
    }

    if (type == UnicastHeader.ACK) {
      // ACK for a previously sent message
      handleAckReceived(sender, hdr.seqno, hdr.conn_id);
    } else if (type == UnicastHeader.SEND_FIRST_SEQNO) {
      handleResendingOfFirstMessage(sender, hdr.seqno);
    } else {
      log.error("UnicastHeader type " + type + " not known !");
    }
  }

  /**
   * Handles a message batch travelling up the stack. Batches without a destination are passed up
   * unchanged. Otherwise every message that carries a {@link UnicastHeader} (and is not flagged
   * {@code NO_RELIABILITY}) is removed from the batch: non-DATA messages are handled immediately,
   * DATA messages are grouped by connection id and handed to {@link #handleBatchReceived}. Whatever
   * remains in the batch afterwards is passed up.
   *
   * @param batch the batch received from the protocol below
   */
  public void up(MessageBatch batch) {
    if (batch.dest() == null) {
      // not a unicast batch
      up_prot.up(batch);
      return;
    }

    final int capacity = batch.size();
    // DATA messages, keyed by conn-id
    Map<Short, List<Message>> by_conn_id = new TreeMap<Short, List<Message>>();

    for (Message msg : batch) {
      if (msg == null || msg.isFlagSet(Message.Flag.NO_RELIABILITY)) {
        continue;
      }
      UnicastHeader hdr = (UnicastHeader) msg.getHeader(id);
      if (hdr == null) {
        continue;
      }

      // the message is ours: take it out of the batch so it is not passed up the stack
      batch.remove(msg);

      if (hdr.type == UnicastHeader.DATA) {
        List<Message> bucket = by_conn_id.get(hdr.conn_id);
        if (bucket == null) {
          bucket = new ArrayList<Message>(capacity);
          by_conn_id.put(hdr.conn_id, bucket);
        }
        bucket.add(msg);
        continue;
      }

      try {
        handleUpEvent(msg.getSrc(), hdr);
      } catch (Throwable t) {
        // an exception must not terminate the processing of the rest of the batch
        log.error(local_addr + ": failed handling event", t);
      }
    }

    if (!by_conn_id.isEmpty()) {
      handleBatchReceived(batch.sender(), by_conn_id);
    }
    if (!batch.isEmpty()) {
      up_prot.up(batch);
    }
  }

  /**
   * Handles an event travelling down the stack.
   *
   * <ul>
   *   <li>{@code MSG}: for unicast messages without the {@code NO_RELIABILITY} flag, looks up (or
   *       creates) the {@link SenderEntry} for the destination, assigns the next seqno, adds a DATA
   *       {@link UnicastHeader}, stores the message in the entry's {@code sent_msgs} table and
   *       passes the event down. If the protocol is not running, the message is discarded and
   *       {@code null} is returned. All other messages are passed down untouched.
   *   <li>{@code VIEW_CHANGE}: updates {@code members} and removes the connections to all peers
   *       that are present in the send or receive table but not in the new view.
   *   <li>{@code SET_LOCAL_ADDRESS}: records the local address.
   * </ul>
   *
   * @param evt the event received from the protocol above
   * @return the result of passing the event to the protocol below, or {@code null} if a message was
   *     discarded because the protocol is not running
   */
  public Object down(Event evt) {
    switch (evt.getType()) {
      case Event.MSG: // Add UnicastHeader, add to the send table and pass down
        Message msg = (Message) evt.getArg();
        Address dst = msg.getDest();

        /* only handle unicast messages */
        if (dst == null || msg.isFlagSet(Message.Flag.NO_RELIABILITY)) break;

        if (!running) {
          if (log.isTraceEnabled())
            log.trace("discarded message as start() has not yet been called, message: " + msg);
          return null;
        }

        // Look up the sender entry for dst; create it if absent. putIfAbsent() decides the winner
        // if several threads create an entry for the same destination concurrently
        SenderEntry entry = send_table.get(dst);
        if (entry == null) {
          entry = new SenderEntry(getNewConnectionId());
          SenderEntry existing = send_table.putIfAbsent(dst, entry);
          if (existing != null) entry = existing;
          else {
            if (log.isTraceEnabled())
              log.trace(
                  local_addr
                      + ": created sender window for "
                      + dst
                      + " (conn-id="
                      + entry.send_conn_id
                      + ")");
            // destinations that are not members are tracked in the age-out cache (if enabled)
            if (cache != null && !members.contains(dst)) cache.add(dst);
          }
        }

        short send_conn_id = entry.send_conn_id;
        long seqno = entry.sent_msgs_seqno.getAndIncrement();
        // Retry adding the message to the send table until it succeeds or the protocol is stopped.
        // The pause between attempts starts at 10 ms and doubles up to a maximum of 5000 ms
        long sleep = 10;
        do {
          try {
            msg.putHeader(
                this.id,
                UnicastHeader.createDataHeader(seqno, send_conn_id, seqno == DEFAULT_FIRST_SEQNO));
            entry.sent_msgs.add(seqno, msg); // add *including* UnicastHeader, adds to retransmitter
            if (conn_expiry_timeout > 0) entry.update();
            break;
          } catch (Throwable t) {
            if (!running) break;
            Util.sleep(sleep);
            sleep = Math.min(5000, sleep * 2);
          }
        } while (running);

        if (log.isTraceEnabled()) {
          StringBuilder sb = new StringBuilder();
          sb.append(local_addr)
              .append(" --> DATA(")
              .append(dst)
              .append(": #")
              .append(seqno)
              .append(", conn_id=")
              .append(send_conn_id);
          if (seqno == DEFAULT_FIRST_SEQNO) sb.append(", first");
          sb.append(')');
          log.trace(sb);
        }

        num_msgs_sent++;
        return down_prot.down(evt);

      case Event.VIEW_CHANGE: // remove connections to peers that are not members anymore !
        View view = (View) evt.getArg();
        List<Address> new_members = view.getMembers();
        // candidates: every peer we currently have a send or receive connection to
        Set<Address> non_members = new HashSet<Address>(send_table.keySet());
        non_members.addAll(recv_table.keySet());

        members = new_members;
        non_members.removeAll(new_members);
        if (cache != null) cache.removeAll(new_members);

        if (!non_members.isEmpty()) {
          if (log.isTraceEnabled()) log.trace("removing non members " + non_members);
          for (Address non_mbr : non_members) removeConnection(non_mbr);
        }
        break;

      case Event.SET_LOCAL_ADDRESS:
        local_addr = (Address) evt.getArg();
        break;
    }

    return down_prot.down(evt); // Pass on to the layer below us
  }

  /**
   * Removes both the send and the receive connection for the given member by delegating to {@link
   * #removeSendConnection} and {@link #removeReceiveConnection}. This method is public only so it
   * can be invoked by unit testing, but should not otherwise be used !
   *
   * @param mbr the member whose connections are to be removed
   */
  public void removeConnection(Address mbr) {
    removeSendConnection(mbr);
    removeReceiveConnection(mbr);
  }

  /**
   * Removes the entry for the given member from {@code send_table}.
   *
   * @param mbr the member whose send connection is to be removed
   */
  public void removeSendConnection(Address mbr) {
    send_table.remove(mbr);
  }

  /**
   * Removes the entry for the given member from {@code recv_table}.
   *
   * @param mbr the member whose receive connection is to be removed
   */
  public void removeReceiveConnection(Address mbr) {
    recv_table.remove(mbr);
  }

  /**
   * Clears both the send table and the receive table. Besides being exposed as a managed operation
   * for testing, this is also invoked by {@code stop()}. It should not otherwise be used !
   */
  @ManagedOperation(
      description = "Trashes all connections to other nodes. This is only used for testing")
  public void removeAllConnections() {
    send_table.clear();
    recv_table.clear();
  }

  /**
   * Resends a message for which no ACK has been received yet: the message is passed down the stack
   * again and the retransmission counter is incremented.
   *
   * @param msg the message to be sent again
   */
  public void retransmit(Message msg) {
    if (log.isTraceEnabled()) {
      UnicastHeader hdr = (UnicastHeader) msg.getHeader(id);
      long seqno = -1;
      if (hdr != null) {
        seqno = hdr.seqno;
      }
      log.trace(local_addr + " --> XMIT(" + msg.getDest() + ": #" + seqno + ')');
    }

    Event resend = new Event(Event.MSG, msg);
    down_prot.down(resend);
    num_xmits++;
  }

  /**
   * Callback invoked when a cached address has expired: removes the send and receive connections
   * for that address. A {@code null} key is ignored.
   *
   * @param key the address whose connections have expired
   */
  public void expired(Address key) {
    if (key == null) {
      return;
    }
    if (log.isDebugEnabled()) {
      log.debug("removing connection to " + key + " because it expired");
    }
    removeConnection(key);
  }

  /**
   * Handles a single DATA message received from <code>sender</code>. The receiver entry for the
   * sender is obtained via {@link #getReceiverEntry}; if that returns null, the message is dropped.
   * Otherwise the message is added to the entry's receive table. OOB messages that were newly added
   * are passed up immediately. Then, if no other thread is currently delivering messages from this
   * table, all removable messages are delivered and an ACK for the highest delivered seqno is sent.
   *
   * @param sender the sender of the message
   * @param seqno the seqno from the message's header
   * @param conn_id the connection id from the message's header
   * @param first true if the header marks this message as the first of a connection
   * @param msg the message
   * @param evt the event carrying the message (passed up as-is for OOB messages)
   */
  protected void handleDataReceived(
      Address sender, long seqno, short conn_id, boolean first, Message msg, Event evt) {
    if (log.isTraceEnabled()) {
      StringBuilder sb = new StringBuilder();
      sb.append(local_addr).append(" <-- DATA(").append(sender).append(": #").append(seqno);
      if (conn_id != 0) sb.append(", conn_id=").append(conn_id);
      if (first) sb.append(", first");
      sb.append(')');
      log.trace(sb);
    }

    ReceiverEntry entry = getReceiverEntry(sender, seqno, first, conn_id);
    if (entry == null) return; // no usable entry: the message is dropped
    if (conn_expiry_timeout > 0) entry.update();
    Table<Message> win = entry.received_msgs;
    boolean added = win.add(seqno, msg); // win is guaranteed to be non-null if we get here
    num_msgs_received++;

    // An OOB message is passed up immediately. Later, when remove() is called, we discard it. This
    // affects ordering !
    // http://jira.jboss.com/jira/browse/JGRP-377
    if (msg.isFlagSet(Message.Flag.OOB) && added) {
      try {
        up_prot.up(evt);
      } catch (Throwable t) {
        log.error("couldn't deliver OOB message " + msg, t);
      }
    }

    // Only the thread which manages to flip 'processing' from false to true delivers messages;
    // all other threads return here
    final AtomicBoolean processing = win.getProcessing();
    if (!processing.compareAndSet(false, true)) {
      return;
    }

    // try to remove (from the receive table) as many messages as possible and pass them up

    // Prevents concurrent passing up of messages by different threads
    // (http://jira.jboss.com/jira/browse/JGRP-198);
    // this is all the more important once we have a concurrent stack
    // (http://jira.jboss.com/jira/browse/JGRP-181),
    // where lots of threads can come up to this point concurrently, but only 1 is allowed to pass
    // at a time
    // We *can* deliver messages from *different* senders concurrently, e.g. reception of P1, Q1,
    // P2, Q2 can result in
    // delivery of P1, Q1, Q2, P2: FIFO (implemented by UNICAST) says messages need to be delivered
    // only in the
    // order in which they were sent by their senders
    removeAndDeliver(processing, win, sender);
    sendAck(sender, win.getHighestDelivered(), conn_id);
  }

  /**
   * Handles the DATA messages of a batch received from <code>sender</code>. For every connection id
   * in the map, each message is added to the receive table of the matching receiver entry (messages
   * for which {@link #getReceiverEntry} returns null are skipped). An ACK is sent immediately for a
   * newly added first message of a connection, and newly added OOB messages are passed up right
   * away. After all messages have been added, the receive table currently registered for the sender
   * is drained (unless another thread is already doing so) and an ACK is sent.
   *
   * @param sender the sender of the batch
   * @param map the DATA messages of the batch, keyed by connection id
   */
  protected void handleBatchReceived(Address sender, Map<Short, List<Message>> map) {
    for (Map.Entry<Short, List<Message>> element : map.entrySet()) {
      final List<Message> msg_list = element.getValue();
      if (log.isTraceEnabled()) {
        StringBuilder sb = new StringBuilder();
        sb.append(local_addr)
            .append(" <-- DATA(")
            .append(sender)
            .append(": " + printMessageList(msg_list))
            .append(')');
        log.trace(sb);
      }

      short conn_id = element.getKey();
      ReceiverEntry entry = null;
      for (Message msg : msg_list) {
        UnicastHeader hdr = (UnicastHeader) msg.getHeader(id);
        entry = getReceiverEntry(sender, hdr.seqno, hdr.first, conn_id);
        if (entry == null) continue; // no usable entry: this message is skipped
        Table<Message> win = entry.received_msgs;
        boolean msg_added =
            win.add(hdr.seqno, msg); // win is guaranteed to be non-null if we get here
        num_msgs_received++;

        if (hdr.first && msg_added)
          sendAck(
              sender, hdr.seqno,
              conn_id); // send an ack immediately when we received the first message of a conn

        // An OOB message is passed up immediately. Later, when remove() is called, we discard it.
        // This affects ordering !
        // http://jira.jboss.com/jira/browse/JGRP-377
        if (msg.isFlagSet(Message.Flag.OOB) && msg_added) {
          try {
            up_prot.up(new Event(Event.MSG, msg));
          } catch (Throwable t) {
            log.error("couldn't deliver OOB message " + msg, t);
          }
        }
      }
      // 'entry' is the result of the lookup for the last message of this list (may be null)
      if (entry != null && conn_expiry_timeout > 0) entry.update();
    }

    // Deliver from the table which is registered for the sender *now*; only the thread which
    // flips 'processing' from false to true delivers
    ReceiverEntry entry = recv_table.get(sender);
    Table<Message> win = entry != null ? entry.received_msgs : null;
    if (win != null) {
      final AtomicBoolean processing = win.getProcessing();
      if (processing.compareAndSet(false, true)) {
        removeAndDeliver(processing, win, sender);
        sendAck(sender, win.getHighestDeliverable(), entry.recv_conn_id);
      }
    }
  }

  /**
   * Try to remove as many messages as possible from the table and pass them up. Prevents concurrent
   * passing up of messages by different threads (http://jira.jboss.com/jira/browse/JGRP-198); lots
   * of threads can come up to this point concurrently, but only 1 is allowed to pass at a time. We
   * *can* deliver messages from *different* senders concurrently, e.g. reception of P1, Q1, P2, Q2
   * can result in delivery of P1, Q1, Q2, P2: FIFO (implemented by UNICAST) says messages need to
   * be delivered in the order in which they were sent
   *
   * <p>Messages are removed in chunks of at most {@code max_msg_batch_size} and passed up as a
   * {@link MessageBatch}, after OOB messages have been removed from the batch. The loop ends when
   * {@code removeMany()} returns null.
   *
   * @param processing the processing flag of {@code win}; callers in this class set it to true
   *     before invoking this method
   * @param win the receive table to remove messages from
   * @param sender the sender whose messages are delivered
   * @return the value of {@code retval}, which is never incremented here, so this is always 0
   */
  protected int removeAndDeliver(
      final AtomicBoolean processing, Table<Message> win, Address sender) {
    int retval = 0;
    boolean released_processing = false;
    try {
      while (true) {
        List<Message> list = win.removeMany(processing, true, max_msg_batch_size);
        if (list == null) {
          // nothing left to remove; presumably removeMany() has reset 'processing' in this case
          // (see the comment in the finally clause) - confirm against Table.removeMany()
          released_processing = true;
          return retval;
        }

        MessageBatch batch = new MessageBatch(local_addr, sender, null, false, list);
        for (Message msg_to_deliver : batch) {
          // discard OOB msg: it has already been delivered
          // (http://jira.jboss.com/jira/browse/JGRP-377)
          if (msg_to_deliver.isFlagSet(Message.Flag.OOB)) batch.remove(msg_to_deliver);
        }

        try {
          if (log.isTraceEnabled()) {
            Message first = batch.first(), last = batch.last();
            StringBuilder sb = new StringBuilder(local_addr + ": delivering");
            if (first != null && last != null) {
              UnicastHeader hdr1 = (UnicastHeader) first.getHeader(id),
                  hdr2 = (UnicastHeader) last.getHeader(id);
              sb.append(" #").append(hdr1.seqno).append(" - #").append(hdr2.seqno);
            }
            sb.append(" (" + batch.size()).append(" messages)");
            log.trace(sb);
          }
          up_prot.up(batch);
        } catch (Throwable t) {
          // a failed delivery is logged; the loop continues with the next chunk
          log.error("failed to deliver batch " + batch, t);
        }
      }
    } finally {
      // processing is normally reset by win.removeMany(processing, ...) above and not here ! This
      // code is just a 2nd line of defense should there be an exception before removeMany() has
      // reset processing
      if (!released_processing) processing.set(false);
    }
  }

  /**
   * Returns the receiver entry to be used for a message from <code>sender</code> with the given
   * connection id, or null if the message has to be dropped.
   *
   * <ul>
   *   <li>If an entry with a matching connection id exists, it is returned (checked first without
   *       and then again with {@code recv_table_lock} held).
   *   <li>If <code>first</code> is true and there is no entry, a new one is created starting at
   *       <code>seqno</code>; if there is an entry with a different connection id, it is replaced
   *       by a new one.
   *   <li>If <code>first</code> is false and there is no entry with a matching connection id, the
   *       sender is asked to resend its first message (SEND_FIRST_SEQNO) and null is returned.
   * </ul>
   *
   * The lock is released exactly once, in the finally clause, and the SEND_FIRST_SEQNO request is
   * sent only after the lock has been released, so no message is sent while holding the lock.
   *
   * @param sender the sender of the message
   * @param seqno the seqno of the message
   * @param first true if the message is marked as the first message of a connection
   * @param conn_id the connection id of the message
   * @return the receiver entry, or null if the message needs to be dropped
   */
  protected ReceiverEntry getReceiverEntry(
      Address sender, long seqno, boolean first, short conn_id) {
    // fast path, without acquiring the lock
    ReceiverEntry entry = recv_table.get(sender);
    if (entry != null && entry.recv_conn_id == conn_id) return entry;

    recv_table_lock.lock();
    try {
      entry = recv_table.get(sender);
      if (first) {
        if (entry == null) {
          entry = getOrCreateReceiverEntry(sender, seqno, conn_id);
        } else if (conn_id != entry.recv_conn_id) {
          if (log.isTraceEnabled())
            log.trace(
                local_addr
                    + ": conn_id="
                    + conn_id
                    + " != "
                    + entry.recv_conn_id
                    + "; resetting receiver window");

          recv_table.remove(sender);
          entry = getOrCreateReceiverEntry(sender, seqno, conn_id);
        }
        return entry;
      }

      // not the first message: only an existing entry with a matching conn_id can be used
      if (entry != null && entry.recv_conn_id == conn_id) return entry;
    } finally {
      recv_table_lock.unlock();
    }

    // We get here only for a non-first message without a matching entry, and the lock has already
    // been released: ask the sender for its first message and drop this one
    sendRequestForFirstSeqno(sender, seqno);
    return null;
  }

  /**
   * Creates a receiver entry (with a table whose offset is <code>seqno - 1</code>) and registers it
   * for <code>sender</code> unless an entry is already registered.
   *
   * @param sender the peer the entry is for
   * @param seqno the seqno of the first message expected from the peer
   * @param conn_id the connection id of the new entry
   * @return the entry that was already registered for the sender, or else the newly created one
   */
  protected ReceiverEntry getOrCreateReceiverEntry(Address sender, long seqno, short conn_id) {
    ReceiverEntry fresh =
        new ReceiverEntry(
            new Table<Message>(
                xmit_table_num_rows,
                xmit_table_msgs_per_row,
                seqno - 1,
                xmit_table_resize_factor,
                xmit_table_max_compaction_time),
            conn_id);

    ReceiverEntry registered = recv_table.putIfAbsent(sender, fresh);
    if (registered != null) {
      // somebody else was faster: use the existing entry
      return registered;
    }

    if (log.isTraceEnabled()) {
      log.trace(
          local_addr
              + ": created receiver window for "
              + sender
              + " at seqno=#"
              + seqno
              + " for conn-id="
              + conn_id);
    }
    return fresh;
  }

  /**
   * Handles an ACK received from <code>sender</code>: all messages with a seqno lower than or equal
   * to <code>seqno</code> are purged from the send table of that peer. ACKs for an unknown peer are
   * ignored, and ACKs whose connection id differs from ours are discarded.
   *
   * @param sender the peer which sent the ACK
   * @param seqno the acknowledged seqno
   * @param conn_id the connection id carried by the ACK
   */
  protected void handleAckReceived(Address sender, long seqno, short conn_id) {
    if (log.isTraceEnabled()) {
      log.trace(
          local_addr + " <-- ACK(" + sender + ": #" + seqno + ", conn-id=" + conn_id + ')');
    }

    SenderEntry entry = send_table.get(sender);
    if (entry == null) {
      return;
    }

    if (entry.send_conn_id != conn_id) {
      if (log.isTraceEnabled()) {
        log.trace(
            local_addr
                + ": my conn_id ("
                + entry.send_conn_id
                + ") != received conn_id ("
                + conn_id
                + "); discarding ACK");
      }
      return;
    }

    Table<Message> win = entry.sent_msgs;
    if (win == null) {
      return;
    }
    win.purge(seqno, true); // removes all messages <= seqno (forced purge)
    num_acks_received++;
  }

  /**
   * Handles a SEND_FIRST_SEQNO request: resends the non-null messages in the range <code>
   * [lowest .. seqno]</code> of the send table for <code>sender</code>. The first message that is
   * resent is sent as a copy whose header has the {@code first} flag set, so the receiver learns
   * our connection id.
   *
   * @param sender the peer which requested the first message
   * @param seqno the upper bound (inclusive) of the range to resend
   */
  protected void handleResendingOfFirstMessage(Address sender, long seqno) {
    if (log.isTraceEnabled()) {
      log.trace(local_addr + " <-- SEND_FIRST_SEQNO(" + sender + "," + seqno + ")");
    }

    SenderEntry entry = send_table.get(sender);
    Table<Message> win = null;
    if (entry != null) {
      win = entry.sent_msgs;
    }
    if (win == null) {
      if (log.isWarnEnabled()) {
        log.warn(local_addr + ": sender window for " + sender + " not found");
      }
      return;
    }

    boolean first_pending = true;
    for (long i = win.getLow() + 1; i <= seqno; i++) {
      Message stored = win.get(i);
      if (stored == null) {
        continue;
      }

      Message to_send = stored;
      if (first_pending) {
        first_pending = false;
        // The header must not be modified in place, as this would change the original message in
        // the sender retransmission table (https://jira.jboss.org/jira/browse/JGRP-965).
        // Therefore the message is copied, and a copy of the header (with 'first' set) is put
        // into the copied message
        to_send = stored.copy();
        UnicastHeader hdr = (UnicastHeader) to_send.getHeader(this.id);
        UnicastHeader marked = hdr.copy();
        marked.first = true;
        to_send.putHeader(this.id, marked);
      }
      down_prot.down(new Event(Event.MSG, to_send));
    }
  }

  /**
   * Schedules a {@link RetransmitTask} on the timer with a fixed delay of {@code xmit_interval}
   * milliseconds, unless a task has already been scheduled and is not done yet.
   */
  protected void startRetransmitTask() {
    boolean already_scheduled = xmit_task != null && !xmit_task.isDone();
    if (already_scheduled) {
      return;
    }
    xmit_task =
        timer.scheduleWithFixedDelay(new RetransmitTask(), 0, xmit_interval, TimeUnit.MILLISECONDS);
  }

  /** Cancels the retransmit task (interrupting it if it is running) and forgets about it. */
  protected void stopRetransmitTask() {
    if (xmit_task == null) {
      return;
    }
    xmit_task.cancel(true);
    xmit_task = null;
  }

  /**
   * Sends an ACK for <code>seqno</code> to <code>dst</code>. Nothing is sent if the protocol is not
   * running. A failure to send the ACK is logged and not propagated to the caller.
   *
   * @param dst the peer the ACK is sent to
   * @param seqno the seqno to be acknowledged
   * @param conn_id the connection id to be placed into the ACK header
   */
  protected void sendAck(Address dst, long seqno, short conn_id) {
    // if we are disconnected, then don't send any acks which throw exceptions on shutdown
    if (!running) {
      return;
    }

    Message ack = new Message(dst).setFlag(Message.Flag.INTERNAL);
    ack.putHeader(this.id, UnicastHeader.createAckHeader(seqno, conn_id));

    if (log.isTraceEnabled()) {
      log.trace(local_addr + " --> ACK(" + dst + ": #" + seqno + ')');
    }

    try {
      down_prot.down(new Event(Event.MSG, ack));
      num_acks_sent++;
    } catch (Throwable t) {
      log.error("failed sending ACK(" + seqno + ") to " + dst, t);
    }
  }

  /**
   * Schedules a {@link ConnectionReaper} on the timer, with {@code conn_expiry_timeout}
   * milliseconds as both initial delay and delay between runs, unless a reaper has already been
   * scheduled and is not done yet.
   */
  protected synchronized void startConnectionReaper() {
    boolean already_scheduled = connection_reaper != null && !connection_reaper.isDone();
    if (already_scheduled) {
      return;
    }
    connection_reaper =
        timer.scheduleWithFixedDelay(
            new ConnectionReaper(), conn_expiry_timeout, conn_expiry_timeout, TimeUnit.MILLISECONDS);
  }

  /** Cancels the connection reaper, if one has been scheduled, without interrupting it. */
  protected synchronized void stopConnectionReaper() {
    if (connection_reaper == null) {
      return;
    }
    connection_reaper.cancel(false);
  }

  /**
   * Hands out the current value of {@code last_conn_id} and advances the counter. The counter is
   * reset to 0 when it has reached {@code Short.MAX_VALUE} or is negative.
   *
   * @return the connection id to be used for a new connection
   */
  protected synchronized short getNewConnectionId() {
    final short assigned = last_conn_id;
    boolean wrap_around = last_conn_id >= Short.MAX_VALUE || last_conn_id < 0;
    if (wrap_around) {
      last_conn_id = 0;
    } else {
      last_conn_id++;
    }
    return assigned;
  }

  /**
   * Sends a SEND_FIRST_SEQNO request (flagged OOB and INTERNAL) to <code>dest</code>, asking it to
   * resend its first message.
   *
   * @param dest the peer the request is sent to
   * @param seqno_received the seqno of the message which triggered the request
   */
  protected void sendRequestForFirstSeqno(Address dest, long seqno_received) {
    Message request = new Message(dest).setFlag(Message.Flag.OOB, Message.Flag.INTERNAL);
    request.putHeader(this.id, UnicastHeader.createSendFirstSeqnoHeader(seqno_received));

    if (log.isTraceEnabled()) {
      log.trace(local_addr + " --> SEND_FIRST_SEQNO(" + dest + "," + seqno_received + ")");
    }

    Event evt = new Event(Event.MSG, request);
    down_prot.down(evt);
  }

  /**
   * Walks over the send table and then over the receive table and removes every connection whose
   * age is at least {@code conn_expiry_timeout} milliseconds.
   */
  @ManagedOperation(
      description = "Closes connections that have been idle for more than conn_expiry_timeout ms")
  public void reapIdleConnections() {
    // expired connections in send_table
    for (Map.Entry<Address, SenderEntry> mapping : send_table.entrySet()) {
      long idle_ms = mapping.getValue().age();
      if (idle_ms < conn_expiry_timeout) {
        continue;
      }
      Address peer = mapping.getKey();
      removeSendConnection(peer);
      if (log.isDebugEnabled()) {
        log.debug(
            local_addr
                + ": removed expired connection for "
                + peer
                + " ("
                + idle_ms
                + " ms old) from send_table");
      }
    }

    // expired connections in recv_table
    for (Map.Entry<Address, ReceiverEntry> mapping : recv_table.entrySet()) {
      long idle_ms = mapping.getValue().age();
      if (idle_ms < conn_expiry_timeout) {
        continue;
      }
      Address peer = mapping.getKey();
      removeReceiveConnection(peer);
      if (log.isDebugEnabled()) {
        log.debug(
            local_addr
                + ": removed expired connection for "
                + peer
                + " ("
                + idle_ms
                + " ms old) from recv_table");
      }
    }
  }

  /**
   * Renders the seqno range of a message list as {@code "#<first> - #<last>"}, using the seqnos of
   * the unicast headers of the first and the last message. For a list with a single message, that
   * message is used for both ends. Messages without a unicast header contribute nothing, and an
   * empty list yields an empty string.
   *
   * @param list the messages to be described
   * @return the textual seqno range
   */
  protected String printMessageList(List<Message> list) {
    final int size = list.size();
    Message first = null;
    Message last = null;
    if (size > 0) {
      first = list.get(0);
      last = size > 1 ? list.get(size - 1) : first;
    }

    StringBuilder sb = new StringBuilder();
    if (first != null) {
      UnicastHeader first_hdr = (UnicastHeader) first.getHeader(id);
      if (first_hdr != null) {
        sb.append('#').append(first_hdr.seqno);
      }
    }
    if (last != null) {
      UnicastHeader last_hdr = (UnicastHeader) last.getHeader(id);
      if (last_hdr != null) {
        sb.append(" - #").append(last_hdr.seqno);
      }
    }
    return sb.toString();
  }

  /**
   * Header added to unicast messages by this protocol. Depending on the type, the following fields
   * are serialized (in this order):
   *
   * <pre>
   * | DATA             | seqno | conn_id | first |
   * | ACK              | seqno | conn_id |
   * | SEND_FIRST_SEQNO | seqno |
   * </pre>
   */
  public static class UnicastHeader extends Header {
    public static final byte DATA = 0;
    public static final byte ACK = 1;
    public static final byte SEND_FIRST_SEQNO = 2;

    byte type;
    long seqno; // DATA, ACK and SEND_FIRST_SEQNO
    short conn_id; // DATA and ACK
    boolean first; // DATA

    /** Used for externalization only. */
    public UnicastHeader() {}

    /** Creates a header of type DATA. */
    public static UnicastHeader createDataHeader(long seqno, short conn_id, boolean first) {
      return new UnicastHeader(DATA, seqno, conn_id, first);
    }

    /** Creates a header of type ACK; the {@code first} flag is not set. */
    public static UnicastHeader createAckHeader(long seqno, short conn_id) {
      return new UnicastHeader(ACK, seqno, conn_id, false);
    }

    /** Creates a header of type SEND_FIRST_SEQNO carrying the seqno that was received. */
    public static UnicastHeader createSendFirstSeqnoHeader(long seqno_received) {
      return new UnicastHeader(SEND_FIRST_SEQNO, seqno_received);
    }

    /** Creates a header with the given type and seqno; conn_id is 0 and first is false. */
    protected UnicastHeader(byte type, long seqno) {
      this(type, seqno, (short) 0, false);
    }

    protected UnicastHeader(byte type, long seqno, short conn_id, boolean first) {
      this.type = type;
      this.seqno = seqno;
      this.conn_id = conn_id;
      this.first = first;
    }

    public long getSeqno() {
      return seqno;
    }

    /** Returns the type and seqno, plus the conn_id if it is not 0 and "first" if that is set. */
    public String toString() {
      String text = type2Str(type) + ", seqno=" + seqno;
      if (conn_id != 0) {
        text += ", conn_id=" + conn_id;
      }
      if (first) {
        text += ", first";
      }
      return text;
    }

    /** Returns the name of the given header type, or {@code "<unknown>"}. */
    public static String type2Str(byte t) {
      if (t == DATA) {
        return "DATA";
      }
      if (t == ACK) {
        return "ACK";
      }
      if (t == SEND_FIRST_SEQNO) {
        return "SEND_FIRST_SEQNO";
      }
      return "<unknown>";
    }

    /** Returns the serialized size: the type byte plus the fields written for that type. */
    public final int size() {
      int total = Global.BYTE_SIZE; // type
      if (type == DATA) {
        // seqno + conn_id + first
        total += Bits.size(seqno) + Global.SHORT_SIZE + Global.BYTE_SIZE;
      } else if (type == ACK) {
        // seqno + conn_id
        total += Bits.size(seqno) + Global.SHORT_SIZE;
      } else if (type == SEND_FIRST_SEQNO) {
        total += Bits.size(seqno);
      }
      return total;
    }

    /** Returns a new header with the same type, seqno, conn_id and first flag. */
    public UnicastHeader copy() {
      return new UnicastHeader(type, seqno, conn_id, first);
    }

    /** Writes the type byte followed by the fields of that type; see the class description. */
    public void writeTo(DataOutput out) throws Exception {
      out.writeByte(type);
      boolean known_type = type == DATA || type == ACK || type == SEND_FIRST_SEQNO;
      if (!known_type) {
        return; // nothing but the type is written for unknown types
      }

      Bits.writeLong(seqno, out); // all known types carry the seqno
      if (type == SEND_FIRST_SEQNO) {
        return;
      }

      out.writeShort(conn_id); // DATA and ACK
      if (type == DATA) {
        out.writeBoolean(first);
      }
    }

    /** Reads the type byte followed by the fields of that type; see the class description. */
    public void readFrom(DataInput in) throws Exception {
      type = in.readByte();
      boolean known_type = type == DATA || type == ACK || type == SEND_FIRST_SEQNO;
      if (!known_type) {
        return; // nothing but the type is read for unknown types
      }

      seqno = Bits.readLong(in); // all known types carry the seqno
      if (type == SEND_FIRST_SEQNO) {
        return;
      }

      conn_id = in.readShort(); // DATA and ACK
      if (type == DATA) {
        first = in.readBoolean();
      }
    }
  }

  /**
   * State kept per destination: the table of sent messages, the seqno counter, the connection id
   * and the time of the last update.
   */
  protected final class SenderEntry {
    // stores (and retransmits) msgs sent by us to a certain peer
    final Table<Message> sent_msgs;
    // seqno for msgs sent by us
    final AtomicLong sent_msgs_seqno = new AtomicLong(DEFAULT_FIRST_SEQNO);
    final short send_conn_id;
    protected final AtomicLong timestamp = new AtomicLong(0);
    final Lock lock = new ReentrantLock();

    /** Creates an entry with an empty send table (offset 0) and the current time as timestamp. */
    public SenderEntry(short send_conn_id) {
      this.send_conn_id = send_conn_id;
      this.sent_msgs =
          new Table<Message>(
              xmit_table_num_rows,
              xmit_table_msgs_per_row,
              0,
              xmit_table_resize_factor,
              xmit_table_max_compaction_time);
      update();
    }

    /** Sets the timestamp to the current time. */
    void update() {
      long now = System.currentTimeMillis();
      timestamp.set(now);
    }

    /** Returns the number of milliseconds since the last call to {@link #update()}. */
    long age() {
      long now = System.currentTimeMillis();
      return now - timestamp.get();
    }

    public String toString() {
      String table_part = sent_msgs != null ? sent_msgs + ", " : "";
      return table_part + "send_conn_id=" + send_conn_id + " (" + age() + " ms old)";
    }
  }

  /**
   * State kept per sender: the table of received messages, the connection id and the time of the
   * last update.
   */
  protected static final class ReceiverEntry {
    // stores all msgs rcvd by a certain peer in seqno-order
    protected final Table<Message> received_msgs;
    protected final short recv_conn_id;
    protected final AtomicLong timestamp = new AtomicLong(0);

    /** Creates an entry for the given table and connection id, stamped with the current time. */
    public ReceiverEntry(Table<Message> received_msgs, short recv_conn_id) {
      this.received_msgs = received_msgs;
      this.recv_conn_id = recv_conn_id;
      update();
    }

    /** Sets the timestamp to the current time. */
    void update() {
      long now = System.currentTimeMillis();
      timestamp.set(now);
    }

    /** Returns the number of milliseconds since the last call to {@link #update()}. */
    long age() {
      long now = System.currentTimeMillis();
      return now - timestamp.get();
    }

    public String toString() {
      String table_part = received_msgs != null ? received_msgs + ", " : "";
      return table_part + "recv_conn_id=" + recv_conn_id + " (" + age() + " ms old)";
    }
  }

  /** Task which removes idle connections by invoking {@code reapIdleConnections()}. */
  protected class ConnectionReaper implements Runnable {
    public void run() {
      reapIdleConnections();
    }

    public String toString() {
      StringBuilder sb = new StringBuilder(UNICAST.class.getSimpleName());
      sb.append(": ConnectionReaper (interval=").append(conn_expiry_timeout).append(" ms)");
      return sb.toString();
    }
  }

  /**
   * Retransmitter task which looks at all the retransmit (send) tables and re-sends the messages
   * between the highest delivered seqno (exclusive) and the highest received seqno (inclusive) of
   * each non-empty table, i.e. messages for which we haven't received an ack yet. It is scheduled
   * every xmit_interval ms by {@code startRetransmitTask()}.
   */
  protected class RetransmitTask implements Runnable {

    public void run() {
      for (SenderEntry sender_entry : send_table.values()) {
        if (sender_entry == null) {
          continue;
        }
        Table<Message> table = sender_entry.sent_msgs;
        if (table == null || table.isEmpty()) {
          continue;
        }

        long from = table.getHighestDelivered() + 1;
        long to = table.getHighestReceived();
        List<Message> unacked = table.get(from, to);
        if (unacked == null) {
          continue;
        }
        for (Message msg : unacked) {
          retransmit(msg);
        }
      }
    }

    public String toString() {
      StringBuilder sb = new StringBuilder(UNICAST.class.getSimpleName());
      sb.append(": RetransmitTask (interval=").append(xmit_interval).append(" ms)");
      return sb.toString();
    }
  }
}
Exemple #3
0
/**
 * Reliable unicast layer. Implemented with negative acks. Every sender keeps its messages in an
 * AckSenderWindow. A receiver stores incoming messages in a NakReceiverWindow, and asks the sender
 * for retransmission if a gap is detected. Every now and then (stable_interval), a timer task sends
 * a STABLE message to all senders, including the highest received and delivered seqnos. A sender
 * purges messages lower than highest delivered and asks the STABLE sender for messages it might
 * have missed (smaller than highest received). A STABLE message can also be sent when a receiver
 * has received more than max_bytes from a given sender.
 *
 * <p>The advantage of this protocol over {@link org.jgroups.protocols.UNICAST} is that it doesn't
 * send acks for every message. Instead, it sends 'acks' after receiving max_bytes and/ or
 * periodically (stable_interval).
 *
 * @author Bela Ban
 */
@MBean(description = "Reliable unicast layer")
public class UNICAST2 extends Protocol implements AgeOutCache.Handler<Address> {
  public static final long DEFAULT_FIRST_SEQNO = Global.DEFAULT_FIRST_UNICAST_SEQNO;

  /* ------------------------------------------ Properties  ------------------------------------------ */
  @Deprecated
  protected int[] timeout = {
    400, 800, 1600, 3200
  }; // for NakSenderWindow: max time to wait for missing acks

  /**
   * The first value (in milliseconds) to use in the exponential backoff retransmission mechanism.
   * Only enabled if the value is > 0
   */
  @Deprecated
  @Property(
      description =
          "The first value (in milliseconds) to use in the exponential backoff. Enabled if greater than 0",
      deprecatedMessage = "Not used anymore")
  protected int exponential_backoff = 300;

  @Property(
      description =
          "Max number of messages to be removed from a NakReceiverWindow. This property might "
              + "get removed anytime, so don't use it !")
  protected int max_msg_batch_size = 500;

  @Property(description = "Max number of bytes before a stability message is sent to the sender")
  protected long max_bytes = 10000000;

  @Property(
      description =
          "Max number of milliseconds before a stability message is sent to the sender(s)")
  protected long stable_interval = 60000L;

  @Property(
      description =
          "Max number of STABLE messages sent for the same highest_received seqno. A value < 1 is invalid")
  protected int max_stable_msgs = 5;

  @Property(
      description = "Number of rows of the matrix in the retransmission table (only for experts)",
      writable = false)
  protected int xmit_table_num_rows = 100;

  @Property(
      description =
          "Number of elements of a row of the matrix in the retransmission table (only for experts). "
              + "The capacity of the matrix is xmit_table_num_rows * xmit_table_msgs_per_row",
      writable = false)
  protected int xmit_table_msgs_per_row = 2000;

  @Property(
      description = "Resize factor of the matrix in the retransmission table (only for experts)",
      writable = false)
  protected double xmit_table_resize_factor = 1.2;

  @Property(
      description =
          "Number of milliseconds after which the matrix in the retransmission table "
              + "is compacted (only for experts)",
      writable = false)
  protected long xmit_table_max_compaction_time = 10 * 60 * 1000;

  @Deprecated
  @Property(
      description =
          "If enabled, the removal of a message from the retransmission table causes an "
              + "automatic purge (only for experts)",
      writable = false,
      deprecatedMessage = "not used anymore")
  protected boolean xmit_table_automatic_purging = true;

  @Property(
      description =
          "Whether to use the old retransmitter which retransmits individual messages or the new one "
              + "which uses ranges of retransmitted messages. Default is true. Note that this property will be removed in 3.0; "
              + "it is only used to switch back to the old (and proven) retransmitter mechanism if issues occur")
  protected boolean use_range_based_retransmitter = true;

  @Property(
      description =
          "Time (in milliseconds) after which an idle incoming or outgoing connection is closed. The "
              + "connection will get re-established when used again. 0 disables connection reaping")
  protected long conn_expiry_timeout = 60000;

  @Property(
      description =
          "Interval (in milliseconds) at which missing messages (from all retransmit buffers) "
              + "are retransmitted")
  protected long xmit_interval = 1000;
  /* --------------------------------------------- JMX  ---------------------------------------------- */

  protected long num_msgs_sent = 0, num_msgs_received = 0;

  /* --------------------------------------------- Fields ------------------------------------------------ */

  protected final ConcurrentMap<Address, SenderEntry> send_table = Util.createConcurrentMap();
  protected final ConcurrentMap<Address, ReceiverEntry> recv_table = Util.createConcurrentMap();

  /** RetransmitTask running every xmit_interval ms */
  protected Future<?> xmit_task;

  protected final ReentrantLock recv_table_lock = new ReentrantLock();

  protected volatile List<Address> members = new ArrayList<Address>(11);

  protected Address local_addr = null;

  protected TimeScheduler timer = null; // used for retransmissions (passed to AckSenderWindow)

  protected volatile boolean running = false;

  protected short last_conn_id = 0;

  protected long max_retransmit_time = 60 * 1000L;

  protected AgeOutCache<Address> cache = null;

  protected Future<?> stable_task_future =
      null; // bcasts periodic STABLE message (added to timer below)

  protected Future<?> connection_reaper; // closes idle connections

  /** Returns the array currently stored in the deprecated {@code timeout} field (not a copy). */
  public int[] getTimeout() {
    return this.timeout;
  }

  /**
   * Replaces the deprecated {@code timeout} array. A {@code null} argument is ignored and the
   * current value is kept.
   *
   * @param val the new timeout values, or {@code null} to leave the existing ones untouched
   */
  @Deprecated
  @Property(
      name = "timeout",
      converter = PropertyConverters.IntegerArray.class,
      description = "list of timeouts",
      deprecatedMessage = "not used anymore")
  public void setTimeout(int[] val) {
    if (val == null) {
      return;
    }
    timeout = val;
  }

  /**
   * Sets {@code max_msg_batch_size}; values smaller than 1 are silently ignored.
   *
   * @param size the new batch size, only applied when {@code size >= 1}
   */
  public void setMaxMessageBatchSize(int size) {
    if (size < 1) {
      return;
    }
    max_msg_batch_size = size;
  }

  /** Returns the string form of {@code local_addr}, or the literal "null" if it is not yet set. */
  @ManagedAttribute
  public String getLocalAddress() {
    if (local_addr == null) {
      return "null";
    }
    return local_addr.toString();
  }

  /** Returns the string form of the current {@code members} list. */
  @ManagedAttribute
  public String getMembers() {
    List<Address> current = members;
    return current.toString();
  }

  /** Returns the number of entries in {@code send_table}. */
  @ManagedAttribute(description = "Returns the number of outgoing (send) connections")
  public int getNumSendConnections() {
    return this.send_table.size();
  }

  /** Returns the number of entries in {@code recv_table}. */
  @ManagedAttribute(description = "Returns the number of incoming (receive) connections")
  public int getNumReceiveConnections() {
    return this.recv_table.size();
  }

  /** Returns the sum of the receive-connection count and the send-connection count. */
  @ManagedAttribute(
      description =
          "Returns the total number of outgoing (send) and incoming (receive) connections")
  public int getNumConnections() {
    int incoming = getNumReceiveConnections();
    int outgoing = getNumSendConnections();
    return incoming + outgoing;
  }

  /**
   * Renders the send table followed by the receive table, one "key: value" line per entry. A
   * section (including its heading) is omitted when the corresponding table is empty.
   *
   * @return the rendered text, empty if there are no connections at all
   */
  @ManagedOperation
  public String printConnections() {
    StringBuilder out = new StringBuilder();

    if (!send_table.isEmpty()) {
      out.append("send connections:\n");
      for (Map.Entry<Address, SenderEntry> conn : send_table.entrySet()) {
        out.append(conn.getKey());
        out.append(": ");
        out.append(conn.getValue());
        out.append("\n");
      }
    }

    if (!recv_table.isEmpty()) {
      out.append("\nreceive connections:\n");
      for (Map.Entry<Address, ReceiverEntry> conn : recv_table.entrySet()) {
        out.append(conn.getKey());
        out.append(": ");
        out.append(conn.getValue());
        out.append("\n");
      }
    }

    return out.toString();
  }

  /** Returns true if a connection reaper future exists and has not completed yet. */
  @ManagedAttribute(description = "Whether the ConnectionReaper task is running")
  public boolean isConnectionReaperRunning() {
    if (connection_reaper == null) {
      return false;
    }
    return !connection_reaper.isDone();
  }

  /** Returns the current value of the {@code num_msgs_sent} counter. */
  @ManagedAttribute
  public long getNumMessagesSent() {
    return this.num_msgs_sent;
  }

  /** Returns the current value of the {@code num_msgs_received} counter. */
  @ManagedAttribute
  public long getNumMessagesReceived() {
    return this.num_msgs_received;
  }

  /** Sums {@code size()} over the receive windows of all receiver entries that have one. */
  @ManagedAttribute(description = "Total number of undelivered messages in all receive windows")
  public long getXmitTableUndeliveredMessages() {
    long total = 0;
    for (ReceiverEntry receiver : recv_table.values()) {
      Table<Message> window = receiver.received_msgs;
      if (window == null) {
        continue;
      }
      total += window.size();
    }
    return total;
  }

  /** Sums {@code getNumMissing()} over the receive windows of all receiver entries that have one. */
  @ManagedAttribute(description = "Total number of missing messages in all receive windows")
  public long getXmitTableMissingMessages() {
    long total = 0;
    for (ReceiverEntry receiver : recv_table.values()) {
      Table<Message> window = receiver.received_msgs;
      if (window == null) {
        continue;
      }
      total += window.getNumMissing();
    }
    return total;
  }

  /** Sums {@code getNumCompactions()} over all non-null receive windows, then all send windows. */
  @ManagedAttribute(description = "Number of compactions in all (receive and send) windows")
  public int getXmitTableNumCompactions() {
    int total = 0;
    for (ReceiverEntry receiver : recv_table.values()) {
      Table<Message> window = receiver.received_msgs;
      if (window == null) {
        continue;
      }
      total += window.getNumCompactions();
    }
    for (SenderEntry sender : send_table.values()) {
      Table<Message> window = sender.sent_msgs;
      if (window == null) {
        continue;
      }
      total += window.getNumCompactions();
    }
    return total;
  }

  /** Sums {@code getNumMoves()} over all non-null receive windows, then all send windows. */
  @ManagedAttribute(description = "Number of moves in all (receive and send) windows")
  public int getXmitTableNumMoves() {
    int total = 0;
    for (ReceiverEntry receiver : recv_table.values()) {
      Table<Message> window = receiver.received_msgs;
      if (window == null) {
        continue;
      }
      total += window.getNumMoves();
    }
    for (SenderEntry sender : send_table.values()) {
      Table<Message> window = sender.sent_msgs;
      if (window == null) {
        continue;
      }
      total += window.getNumMoves();
    }
    return total;
  }

  /** Sums {@code getNumResizes()} over all non-null receive windows, then all send windows. */
  @ManagedAttribute(description = "Number of resizes in all (receive and send) windows")
  public int getXmitTableNumResizes() {
    int total = 0;
    for (ReceiverEntry receiver : recv_table.values()) {
      Table<Message> window = receiver.received_msgs;
      if (window == null) {
        continue;
      }
      total += window.getNumResizes();
    }
    for (SenderEntry sender : send_table.values()) {
      Table<Message> window = sender.sent_msgs;
      if (window == null) {
        continue;
      }
      total += window.getNumResizes();
    }
    return total;
  }

  /** Sums {@code getNumPurges()} over all non-null receive windows, then all send windows. */
  @ManagedAttribute(description = "Number of purges in all (receive and send) windows")
  public int getXmitTableNumPurges() {
    int total = 0;
    for (ReceiverEntry receiver : recv_table.values()) {
      Table<Message> window = receiver.received_msgs;
      if (window == null) {
        continue;
      }
      total += window.getNumPurges();
    }
    for (SenderEntry sender : send_table.values()) {
      Table<Message> window = sender.sent_msgs;
      if (window == null) {
        continue;
      }
      total += window.getNumPurges();
    }
    return total;
  }

  /**
   * Renders the receive window of every entry in {@code recv_table}, one line per peer, prefixed
   * by a header line containing the local address.
   *
   * <p>The window is appended via {@link StringBuilder#append(Object)}, so an entry whose {@code
   * received_msgs} is {@code null} (a case the statistics getters of this class guard against)
   * is printed as "null" instead of raising a {@link NullPointerException}.
   *
   * @return the rendered text
   */
  @ManagedOperation(description = "Prints the contents of the receive windows for all members")
  public String printReceiveWindowMessages() {
    StringBuilder ret = new StringBuilder(local_addr + ":\n");
    for (Map.Entry<Address, ReceiverEntry> entry : recv_table.entrySet()) {
      Address addr = entry.getKey();
      Table<Message> buf = entry.getValue().received_msgs;
      // append(Object) is null-safe, unlike an explicit buf.toString()
      ret.append(addr).append(": ").append(buf).append('\n');
    }
    return ret.toString();
  }

  /**
   * Renders the send window of every entry in {@code send_table}, one line per peer, prefixed by
   * a header line containing the local address.
   *
   * <p>The window is appended via {@link StringBuilder#append(Object)}, so an entry whose {@code
   * sent_msgs} is {@code null} (a case the statistics getters of this class guard against) is
   * printed as "null" instead of raising a {@link NullPointerException}.
   *
   * @return the rendered text
   */
  @ManagedOperation(description = "Prints the contents of the send windows for all members")
  public String printSendWindowMessages() {
    StringBuilder ret = new StringBuilder(local_addr + ":\n");
    for (Map.Entry<Address, SenderEntry> entry : send_table.entrySet()) {
      Address addr = entry.getKey();
      Table<Message> buf = entry.getValue().sent_msgs;
      // append(Object) is null-safe, unlike an explicit buf.toString()
      ret.append(addr).append(": ").append(buf).append('\n');
    }
    return ret.toString();
  }

  @ManagedAttribute(description = "Number of retransmit requests received")
  protected final AtomicLong xmit_reqs_received = new AtomicLong(0);

  @ManagedAttribute(description = "Number of retransmit requests sent")
  protected final AtomicLong xmit_reqs_sent = new AtomicLong(0);

  @ManagedAttribute(description = "Number of retransmit responses sent")
  protected final AtomicLong xmit_rsps_sent = new AtomicLong(0);

  /** Returns true if a retransmit task future exists and has not completed yet. */
  @ManagedAttribute(description = "Is the retransmit task running")
  public boolean isXmitTaskRunning() {
    if (xmit_task == null) {
      return false;
    }
    return !xmit_task.isDone();
  }

  /** Returns the configured {@code max_retransmit_time} in milliseconds. */
  public long getMaxRetransmitTime() {
    return this.max_retransmit_time;
  }

  /**
   * Stores the new value and, if an age-out cache exists and the value is positive, forwards it
   * to the cache as its timeout.
   *
   * @param max_retransmit_time the new maximum retransmit time in milliseconds
   */
  @Property(
      description =
          "Max number of milliseconds we try to retransmit a message to any given member. After that, "
              + "the connection is removed. Any new connection to that member will start with seqno #1 again. 0 disables this")
  public void setMaxRetransmitTime(long max_retransmit_time) {
    this.max_retransmit_time = max_retransmit_time;
    if (cache == null || max_retransmit_time <= 0) {
      return;
    }
    cache.setTimeout(max_retransmit_time);
  }

  /** Returns the size of the age-out cache, or 0 when no cache has been created. */
  @ManagedAttribute
  public int getAgeOutCacheSize() {
    if (cache == null) {
      return 0;
    }
    return cache.size();
  }

  /** Returns the string form of the age-out cache, or "n/a" when no cache has been created. */
  @ManagedOperation
  public String printAgeOutCache() {
    if (cache == null) {
      return "n/a";
    }
    return cache.toString();
  }

  /** Returns the age-out cache instance; may be {@code null} if none has been created. */
  public AgeOutCache<Address> getAgeOutCache() {
    return this.cache;
  }

  /** Zeroes the sent/received message counters and the three retransmission counters. */
  public void resetStats() {
    num_msgs_received = 0;
    num_msgs_sent = 0;
    xmit_reqs_received.set(0);
    xmit_reqs_sent.set(0);
    xmit_rsps_sent.set(0);
  }

  /** Returns the timer currently used by this protocol; {@code null} until one has been set. */
  public TimeScheduler getTimer() {
    return this.timer;
  }

  /**
   * Only used for unit tests, don't use !
   *
   * @param timer the timer to store in this protocol's {@code timer} field; note that {@link
   *     #start()} overwrites it with the transport's timer
   */
  public void setTimer(TimeScheduler timer) {
    this.timer = timer;
  }

  /**
   * Runs the superclass initialization and then validates the configuration.
   *
   * @throws IllegalArgumentException if {@code max_stable_msgs < 1} or {@code max_bytes <= 0}
   * @throws Exception if thrown by {@code super.init()}
   */
  public void init() throws Exception {
    super.init();
    if (max_stable_msgs < 1) {
      String problem = "max_stable_msgs ( " + max_stable_msgs + ") must be > 0";
      throw new IllegalArgumentException(problem);
    }
    if (max_bytes <= 0) {
      throw new IllegalArgumentException("max_bytes has to be > 0");
    }
  }

  /**
   * Fetches the timer from the transport, creates the age-out cache (only if {@code
   * max_retransmit_time > 0}), marks the protocol as running and starts the periodic tasks. The
   * STABLE task and the connection reaper are only started when their interval/timeout is
   * positive; the retransmit task is always started.
   *
   * @throws Exception if the transport returns no timer
   */
  public void start() throws Exception {
    TimeScheduler transport_timer = getTransport().getTimer();
    timer = transport_timer;
    if (transport_timer == null) {
      throw new Exception("timer is null");
    }
    if (max_retransmit_time > 0) {
      cache = new AgeOutCache<Address>(timer, max_retransmit_time, this);
    }
    running = true;
    if (stable_interval > 0) {
      startStableTask();
    }
    if (conn_expiry_timeout > 0) {
      startConnectionReaper();
    }
    startRetransmitTask();
  }

  /**
   * Stops this protocol: clears the {@code running} flag, cancels the STABLE task, the connection
   * reaper and the retransmit task, and finally drops all connections via {@link
   * #removeAllConnections()}.
   */
  public void stop() {
    running = false; // makes down() discard further unicast messages
    stopStableTask();
    stopConnectionReaper();
    stopRetransmitTask();
    removeAllConnections();
  }

  /**
   * Handles events travelling up the stack. Unicast messages (non-null destination, {@code
   * NO_RELIABILITY} not set) which carry a {@code Unicast2Header} are consumed here and dispatched
   * on the header type; {@code null} is returned for them. Everything else (other event types,
   * multicasts, messages without our header) is passed to {@code up_prot} unchanged.
   *
   * @param evt the event received from the protocol below
   * @return {@code null} for messages consumed by this protocol, otherwise the result of {@code
   *     up_prot.up(evt)}
   */
  public Object up(Event evt) {
    Message msg;
    Address dst, src;
    Unicast2Header hdr;

    switch (evt.getType()) {
      case Event.MSG:
        msg = (Message) evt.getArg();
        dst = msg.getDest();

        if (dst == null || msg.isFlagSet(Message.NO_RELIABILITY)) // only handle unicast messages
        break; // pass up

        // changed from removeHeader(): we cannot remove the header because if we do loopback=true
        // at the transport level, we will not have the header on retransmit ! (bela Aug 22 2006)
        hdr = (Unicast2Header) msg.getHeader(this.id);
        if (hdr == null) break; // not one of our messages: pass up
        src = msg.getSrc();
        switch (hdr.type) {
          case Unicast2Header.DATA: // received regular message
            handleDataReceived(src, hdr.seqno, hdr.conn_id, hdr.first, msg, evt);
            return null; // we pass the deliverable message up in handleDataReceived()
          case Unicast2Header.XMIT_REQ: // peer asks us to retransmit the seqnos in the payload
            handleXmitRequest(src, (SeqnoList) msg.getObject());
            break;
          case Unicast2Header.SEND_FIRST_SEQNO: // peer asks us to resend starting at our first msg
            handleResendingOfFirstMessage(src, hdr.seqno);
            break;
          case Unicast2Header.STABLE: // peer reports its highest delivered / received seqnos
            stable(msg.getSrc(), hdr.conn_id, hdr.seqno, hdr.high_seqno);
            break;
          default:
            log.error("UnicastHeader type " + hdr.type + " not known !");
            break;
        }
        return null; // control messages are consumed here and never passed up
    }

    return up_prot.up(evt); // Pass up to the layer above us
  }

  /**
   * Handles events travelling down the stack.
   *
   * <ul>
   *   <li>{@code MSG}: for unicast messages without {@code NO_RELIABILITY}, looks up (or creates)
   *       the sender entry for the destination, assigns the next seqno, attaches a data header,
   *       stores the message in the send window and passes it down. Returns {@code null}. The
   *       message is discarded if the protocol is not running.
   *   <li>{@code VIEW_CHANGE}: replaces the member list and removes connections to addresses that
   *       are no longer members; the event is then passed down.
   *   <li>{@code SET_LOCAL_ADDRESS}: records the local address; the event is then passed down.
   * </ul>
   *
   * @param evt the event received from the protocol above
   * @return {@code null} for unicast messages handled here, otherwise the result of {@code
   *     down_prot.down(evt)}
   */
  public Object down(Event evt) {
    switch (evt.getType()) {
      case Event.MSG: // Add UnicastHeader, add to the send window and pass down
        Message msg = (Message) evt.getArg();
        Address dst = msg.getDest();

        /* only handle unicast messages */
        if (dst == null || msg.isFlagSet(Message.NO_RELIABILITY)) break;

        if (!running) {
          if (log.isTraceEnabled())
            log.trace("discarded message as start() has not yet been called, message: " + msg);
          return null;
        }

        SenderEntry entry = send_table.get(dst);
        if (entry == null) {
          entry = new SenderEntry(getNewConnectionId());
          // another thread may have created the entry concurrently: the first one wins
          SenderEntry existing = send_table.putIfAbsent(dst, entry);
          if (existing != null) entry = existing;
          else {
            if (log.isTraceEnabled())
              log.trace(
                  local_addr
                      + ": created connection to "
                      + dst
                      + " (conn_id="
                      + entry.send_conn_id
                      + ")");
            // connections to non-members are tracked in the age-out cache (see expired())
            if (cache != null && !members.contains(dst)) cache.add(dst);
          }
        }

        short send_conn_id = entry.send_conn_id;
        long seqno = entry.sent_msgs_seqno.getAndIncrement();
        long sleep = 10;
        // Retry adding to the send window until it succeeds or the protocol is stopped; the wait
        // between attempts doubles each time, capped at 5 seconds
        while (running) {
          try {
            msg.putHeader(
                this.id,
                Unicast2Header.createDataHeader(seqno, send_conn_id, seqno == DEFAULT_FIRST_SEQNO));
            entry.sent_msgs.add(seqno, msg); // add *including* UnicastHeader, adds to retransmitter
            if (conn_expiry_timeout > 0) entry.update();
            break;
          } catch (Throwable t) {
            if (!running) break;
            if (log.isWarnEnabled()) log.warn("failed sending message", t);
            Util.sleep(sleep);
            sleep = Math.min(5000, sleep * 2);
          }
        }

        if (log.isTraceEnabled()) {
          StringBuilder sb = new StringBuilder();
          sb.append(local_addr)
              .append(" --> DATA(")
              .append(dst)
              .append(": #")
              .append(seqno)
              .append(", conn_id=")
              .append(send_conn_id);
          if (seqno == DEFAULT_FIRST_SEQNO) sb.append(", first");
          sb.append(')');
          log.trace(sb);
        }

        try {
          down_prot.down(evt);
          num_msgs_sent++;
        } catch (Throwable t) {
          // failure is only logged here; the message was stored in the send window above
          log.warn("failed sending the message", t);
        }
        return null; // we already passed the msg down

      case Event.VIEW_CHANGE: // remove connections to peers that are not members anymore !
        View view = (View) evt.getArg();
        List<Address> new_members = view.getMembers();
        // start from every peer we have a connection to, then subtract the new membership
        Set<Address> non_members = new HashSet<Address>(send_table.keySet());
        non_members.addAll(recv_table.keySet());

        members = new_members;
        non_members.removeAll(new_members);
        if (cache != null) cache.removeAll(new_members);

        if (!non_members.isEmpty()) {
          if (log.isTraceEnabled()) log.trace("removing non members " + non_members);
          for (Address non_mbr : non_members) removeConnection(non_mbr);
        }
        break;

      case Event.SET_LOCAL_ADDRESS:
        local_addr = (Address) evt.getArg();
        break;
    }

    return down_prot.down(evt); // Pass on to the layer below us
  }

  /**
   * Processes a STABLE message from {@code sender}: purges the send window for that peer up to
   * {@code hd} and, if our window's highest seqno is greater than {@code hr}, re-sends the
   * messages in the range [hr .. window highest]. The message is ignored when there is no send
   * window for the peer or when {@code conn_id} does not match our connection ID.
   *
   * @param sender the member which sent the STABLE message
   * @param conn_id the connection ID carried by the STABLE message
   * @param hd Highest delivered seqno
   * @param hr Highest received seqno
   */
  protected void stable(Address sender, short conn_id, long hd, long hr) {
    SenderEntry entry = send_table.get(sender);
    Table<Message> window = entry == null ? null : entry.sent_msgs;
    if (window == null) {
      return;
    }

    if (log.isTraceEnabled()) {
      String line =
          local_addr + " <-- STABLE(" + sender + ": " + hd + "-" + hr + ", conn_id=" + conn_id + ")";
      log.trace(line);
    }

    if (entry.send_conn_id != conn_id) {
      log.warn(
          local_addr
              + ": my conn_id ("
              + entry.send_conn_id
              + ") != received conn_id ("
              + conn_id
              + "); discarding STABLE message !");
      return;
    }

    window.purge(hd, true);
    long highest_sent = window.getHighestReceived();
    if (highest_sent <= hr) {
      return; // the peer has already seen everything we hold
    }

    // NOTE(review): the resend starts at hr itself (not hr + 1) - confirm this is intended
    long current = hr;
    while (current <= highest_sent) {
      // destination is still the same (the member which sent the STABLE message)
      Message pending = window.get(current);
      if (pending != null) {
        down_prot.down(new Event(Event.MSG, pending));
      }
      current++;
    }
  }

  /**
   * Sends a STABLE message, carrying the receive window's digest, to every peer in {@code
   * recv_table} that has a receive window. For an unchanged highest seqno at most {@code
   * max_stable_msgs} STABLE messages are sent; the per-entry counter restarts at 1 whenever the
   * highest seqno changes.
   */
  @ManagedOperation(
      description =
          "Sends a STABLE message to all senders. This causes message purging and potential"
              + " retransmissions from senders")
  public void sendStableMessages() {
    for (Map.Entry<Address, ReceiverEntry> conn : recv_table.entrySet()) {
      Address target = conn.getKey();
      ReceiverEntry receiver = conn.getValue();
      Table<Message> window = receiver == null ? null : receiver.received_msgs;
      if (window == null) {
        continue;
      }

      long[] digest = window.getDigest();
      long hd = digest[0];
      long hr = digest[1];

      if (receiver.last_highest != hr) {
        // progress since the last STABLE: remember it and restart the counter
        receiver.last_highest = hr;
        receiver.num_stable_msgs = 1;
      } else if (receiver.num_stable_msgs >= max_stable_msgs) {
        continue; // limit reached for this highest seqno
      } else {
        receiver.num_stable_msgs++;
      }
      sendStableMessage(target, receiver.recv_conn_id, hd, hr);
    }
  }

  /**
   * Builds an OOB message with a STABLE header and passes it down the stack.
   *
   * @param dest the member the STABLE message is addressed to
   * @param conn_id the connection ID to put into the header
   * @param hd highest delivered seqno
   * @param hr highest received seqno
   */
  protected void sendStableMessage(Address dest, short conn_id, long hd, long hr) {
    Message stable_msg = new Message(dest, null, null);
    stable_msg.putHeader(this.id, Unicast2Header.createStableHeader(conn_id, hd, hr));
    stable_msg.setFlag(Message.OOB);

    if (log.isTraceEnabled()) {
      String line =
          local_addr + " --> STABLE(" + dest + ": " + hd + "-" + hr + ", conn_id=" + conn_id + ")";
      log.trace(line);
    }

    Event outgoing = new Event(Event.MSG, stable_msg);
    down_prot.down(outgoing);
  }

  /**
   * Schedules the periodic STABLE task on the timer (initial delay and period are both {@code
   * stable_interval} ms), unless a previously scheduled task is still active. The task calls
   * {@link #sendStableMessages()} and logs any failure.
   */
  protected void startStableTask() {
    if (stable_task_future != null && !stable_task_future.isDone()) {
      return;
    }

    Runnable sender =
        new Runnable() {
          public void run() {
            try {
              sendStableMessages();
            } catch (Throwable failure) {
              log.error("sending of STABLE messages failed", failure);
            }
          }
        };

    stable_task_future =
        timer.scheduleWithFixedDelay(
            sender, stable_interval, stable_interval, TimeUnit.MILLISECONDS);

    if (log.isTraceEnabled()) {
      log.trace("stable task started");
    }
  }

  /** Cancels the STABLE task (without interrupting it) and clears its future, if one exists. */
  protected void stopStableTask() {
    if (stable_task_future == null) {
      return;
    }
    stable_task_future.cancel(false);
    stable_task_future = null;
  }

  /**
   * Schedules a {@code ConnectionReaper} on the timer (initial delay and period are both {@code
   * conn_expiry_timeout} ms), unless a previously scheduled reaper is still active.
   */
  protected synchronized void startConnectionReaper() {
    if (connection_reaper != null && !connection_reaper.isDone()) {
      return;
    }
    ConnectionReaper reaper = new ConnectionReaper();
    connection_reaper =
        timer.scheduleWithFixedDelay(
            reaper, conn_expiry_timeout, conn_expiry_timeout, TimeUnit.MILLISECONDS);
  }

  /**
   * Cancels the connection reaper (without interrupting it), if one was scheduled. The future is
   * kept; a cancelled future reports {@code isDone()}, so the reaper can be started again.
   */
  protected synchronized void stopConnectionReaper() {
    if (connection_reaper == null) {
      return;
    }
    connection_reaper.cancel(false);
  }

  /**
   * Removes both the send and the receive connection to the given member, by delegating to
   * {@link #removeSendConnection(Address)} and {@link #removeReceiveConnection(Address)}. Nothing
   * is returned and no lock is acquired here. This method is public only so it can be invoked by
   * unit testing, but should not otherwise be used !
   *
   * @param mbr the member whose connections are removed
   */
  public void removeConnection(Address mbr) {
    removeSendConnection(mbr);
    removeReceiveConnection(mbr);
  }

  /**
   * Removes the entry for the given member from {@code send_table}, if present.
   *
   * @param mbr the member whose send connection is removed
   */
  public void removeSendConnection(Address mbr) {
    this.send_table.remove(mbr);
  }

  /**
   * Removes the entry for the given member from {@code recv_table}. If the entry had a receive
   * window, a final STABLE message with the window's highest delivered/received seqnos is sent
   * to the member before the entry is reset.
   *
   * @param mbr the member whose receive connection is removed
   */
  public void removeReceiveConnection(Address mbr) {
    ReceiverEntry removed = recv_table.remove(mbr);
    if (removed == null) {
      return;
    }
    Table<Message> window = removed.received_msgs;
    if (window != null) {
      short conn = removed.recv_conn_id;
      long hd = window.getHighestDelivered();
      long hr = window.getHighestReceived();
      sendStableMessage(mbr, conn, hd, hr);
    }
    removed.reset();
  }

  /**
   * Clears the send table, sends STABLE messages for the current receive windows, then resets
   * every receiver entry and clears the receive table. This method is public only so it can be
   * invoked by unit testing, but should not otherwise be used !
   */
  @ManagedOperation(
      description = "Trashes all connections to other nodes. This is only used for testing")
  public void removeAllConnections() {
    send_table.clear();
    sendStableMessages(); // must run before the receive table is emptied below
    for (ReceiverEntry receiver : recv_table.values()) {
      receiver.reset();
    }
    recv_table.clear();
  }

  /**
   * Sends an OOB XMIT_REQ message containing the given seqnos to {@code sender} and adds the
   * number of requested seqnos to {@code xmit_reqs_sent}.
   *
   * @param missing the seqnos to request
   * @param sender the member asked to retransmit
   */
  public void retransmit(SeqnoList missing, Address sender) {
    Message request = new Message(sender, null, missing);
    request.setFlag(Message.OOB);
    request.putHeader(this.id, Unicast2Header.createXmitReqHeader());

    if (log.isTraceEnabled()) {
      log.trace(local_addr + ": sending XMIT_REQ (" + missing + ") to " + sender);
    }

    down_prot.down(new Event(Event.MSG, request));
    xmit_reqs_sent.addAndGet(missing.size());
  }

  /**
   * Called by AgeOutCache to remove an expired connection. A {@code null} key is ignored.
   *
   * @param key the address whose connection expired
   */
  public void expired(Address key) {
    if (key == null) {
      return;
    }
    if (log.isDebugEnabled()) {
      log.debug("removing connection to " + key + " because it expired");
    }
    removeConnection(key);
  }

  /**
   * Handles a DATA message: obtains the receiver entry for {@code sender} via {@link
   * #getReceiverEntry(Address, long, boolean, short)} (the message is dropped if that returns
   * {@code null}), adds the message to the receive window, possibly sends a STABLE message, and
   * then delivers as many messages as can be removed from the window to {@code up_prot}.
   *
   * <p>OOB messages are passed up immediately after being added and skipped later when they are
   * removed from the window. Only one thread at a time removes and delivers messages of a given
   * window; this is arbitrated with the window's {@code processing} flag.
   *
   * @param sender the member which sent the message
   * @param seqno the sequence number from the header
   * @param conn_id the connection ID from the header
   * @param first whether the header marks this as the first message of the connection
   * @param msg the received message
   * @param evt the event which carried the message (passed up as-is for OOB messages)
   */
  protected void handleDataReceived(
      Address sender, long seqno, short conn_id, boolean first, Message msg, Event evt) {
    if (log.isTraceEnabled()) {
      StringBuilder sb = new StringBuilder();
      sb.append(local_addr).append(" <-- DATA(").append(sender).append(": #").append(seqno);
      if (conn_id != 0) sb.append(", conn_id=").append(conn_id);
      if (first) sb.append(", first");
      sb.append(')');
      log.trace(sb);
    }

    ReceiverEntry entry = getReceiverEntry(sender, seqno, first, conn_id);
    if (entry == null) return; // no usable connection: message is dropped
    if (conn_expiry_timeout > 0) entry.update();
    Table<Message> win = entry.received_msgs;
    boolean added = win.add(seqno, msg); // win is guaranteed to be non-null if we get here
    num_msgs_received++;

    if (added) {
      int len = msg.getLength();
      // incrementStable() decides whether enough bytes were received to trigger a STABLE message
      if (len > 0 && entry.incrementStable(len))
        sendStableMessage(
            sender, entry.recv_conn_id, win.getHighestDelivered(), win.getHighestReceived());
    }

    // An OOB message is passed up immediately. Later, when remove() is called, we discard it.
    // This affects ordering !
    // http://jira.jboss.com/jira/browse/JGRP-377
    if (msg.isFlagSet(Message.OOB) && added) {
      try {
        up_prot.up(evt);
      } catch (Throwable t) {
        log.error("couldn't deliver OOB message " + msg, t);
      }
    }

    final AtomicBoolean processing = win.getProcessing();
    if (!processing.compareAndSet(false, true)) {
      return; // another thread is already delivering messages from this window
    }

    // Try to remove as many messages as possible and pass them up.
    // Prevents concurrent passing up of messages by different threads
    // (http://jira.jboss.com/jira/browse/JGRP-198); this is all the more important once we have a
    // concurrent stack (http://jira.jboss.com/jira/browse/JGRP-181), where lots of threads can
    // come up to this point concurrently, but only 1 is allowed to pass at a time.
    // We *can* deliver messages from *different* senders concurrently, e.g. reception of P1, Q1,
    // P2, Q2 can result in delivery of P1, Q1, Q2, P2: FIFO (implemented by UNICAST) says
    // messages need to be delivered only in the order in which they were sent by their senders
    boolean released_processing = false;
    try {
      while (true) {
        List<Message> msgs =
            win.removeMany(processing, true, max_msg_batch_size); // remove my own messages
        if (msgs == null || msgs.isEmpty()) {
          released_processing = true;
          return;
        }

        for (Message m : msgs) {
          // discard OOB msg: it has already been delivered
          // (http://jira.jboss.com/jira/browse/JGRP-377)
          if (m.isFlagSet(Message.OOB)) continue;
          try {
            up_prot.up(new Event(Event.MSG, m));
          } catch (Throwable t) {
            log.error("couldn't deliver message " + m, t);
          }
        }
      }
    } finally {
      // processing is always set in win.removeMany(processing, ...) above and never here ! This
      // code is just a 2nd line of defense should there be an exception before removeMany() sets
      // processing
      if (!released_processing) processing.set(false);
    }
  }

  /**
   * Returns the receiver entry to be used for a DATA message from {@code sender}.
   *
   * <p>Fast path: an existing entry with a matching connection ID is returned without locking.
   * Otherwise, under {@code recv_table_lock}:
   *
   * <ul>
   *   <li>if {@code first} is true, a missing entry is created, and an entry with a different
   *       connection ID is replaced by a fresh one starting at {@code seqno};
   *   <li>if {@code first} is false and there is no entry with a matching connection ID, the
   *       message cannot be accepted: the sender is asked (outside the lock) to resend starting
   *       with its first message, and {@code null} is returned.
   * </ul>
   *
   * <p>The lock is released exactly once, in {@code finally}, so a caller that already holds it
   * reentrantly keeps its own hold.
   *
   * @param sender the member which sent the message
   * @param seqno the sequence number of the received message
   * @param first whether the message is flagged as the first of its connection
   * @param conn_id the connection ID of the received message
   * @return the entry to add the message to, or {@code null} if the message has to be dropped
   */
  protected ReceiverEntry getReceiverEntry(
      Address sender, long seqno, boolean first, short conn_id) {
    ReceiverEntry entry = recv_table.get(sender);
    if (entry != null && entry.recv_conn_id == conn_id) return entry;

    recv_table_lock.lock();
    try {
      entry = recv_table.get(sender); // re-read: the table may have changed before we locked
      if (first) {
        if (entry == null) return getOrCreateReceiverEntry(sender, seqno, conn_id);
        if (conn_id == entry.recv_conn_id) return entry;

        if (log.isTraceEnabled())
          log.trace(
              local_addr
                  + ": conn_id="
                  + conn_id
                  + " != "
                  + entry.recv_conn_id
                  + "; resetting receiver window");

        recv_table.remove(sender);
        return getOrCreateReceiverEntry(sender, seqno, conn_id);
      }
      if (entry != null && entry.recv_conn_id == conn_id) return entry;
    } finally {
      recv_table_lock.unlock();
    }

    // Not the first message and no matching connection: request the first seqno. This is done
    // after releasing the lock so that no message is sent while holding it.
    sendRequestForFirstSeqno(sender, seqno);
    return null;
  }

  /**
   * Creates a receiver entry whose table starts at {@code seqno - 1} and registers it for {@code
   * sender} unless an entry is already present, in which case the existing entry is returned.
   *
   * @param sender the member the entry belongs to
   * @param seqno the seqno of the first message expected in the new table
   * @param conn_id the connection ID stored in the new entry
   * @return the entry now associated with {@code sender}
   */
  protected ReceiverEntry getOrCreateReceiverEntry(Address sender, long seqno, short conn_id) {
    ReceiverEntry created =
        new ReceiverEntry(
            new Table<Message>(
                xmit_table_num_rows,
                xmit_table_msgs_per_row,
                seqno - 1,
                xmit_table_resize_factor,
                xmit_table_max_compaction_time),
            conn_id);

    ReceiverEntry present = recv_table.putIfAbsent(sender, created);
    if (present != null) {
      return present;
    }

    if (log.isTraceEnabled()) {
      String line =
          local_addr
              + ": created receiver window for "
              + sender
              + " at seqno=#"
              + seqno
              + " for conn-id="
              + conn_id;
      log.trace(line);
    }
    return created;
  }

  /**
   * Handles a retransmission request: every requested seqno that is still present in the send
   * window for {@code sender} is passed down again. Seqnos that are not found are skipped, with
   * a warning unless the requester is the local address.
   *
   * @param sender the member which requested the retransmission
   * @param missing the requested seqnos
   */
  protected void handleXmitRequest(Address sender, SeqnoList missing) {
    if (log.isTraceEnabled()) {
      StringBuilder line = new StringBuilder();
      line.append(local_addr).append(" <-- XMIT(").append(sender);
      line.append(": #").append(missing).append(')');
      log.trace(line);
    }

    SenderEntry entry = send_table.get(sender);
    xmit_reqs_received.addAndGet(missing.size());
    Table<Message> window = entry == null ? null : entry.sent_msgs;
    if (window == null) {
      return;
    }

    for (long seqno : missing) {
      Message stored = window.get(seqno);
      if (stored != null) {
        down_prot.down(new Event(Event.MSG, stored));
        xmit_rsps_sent.incrementAndGet();
        continue;
      }

      if (log.isWarnEnabled() && !local_addr.equals(sender)) {
        String warning =
            "(requester="
                + sender
                + ", local_addr="
                + this.local_addr
                + ") message "
                + sender
                + "::"
                + seqno
                + " not found in retransmission table of "
                + sender
                + ":\n"
                + window;
        log.warn(warning);
      }
    }
  }

  /**
   * Handles a SEND_FIRST_SEQNO request: re-sends the non-null messages of the send window in the
   * range [low + 1 .. seqno]. The first message found is sent as a copy whose header has {@code
   * first} set to true; all following ones are sent unchanged.
   *
   * @param sender the member which asked for the first seqno
   * @param seqno upper bound (inclusive) of the range to resend
   */
  protected void handleResendingOfFirstMessage(Address sender, long seqno) {
    if (log.isTraceEnabled()) {
      log.trace(local_addr + " <-- SEND_FIRST_SEQNO(" + sender + "," + seqno + ")");
    }

    SenderEntry entry = send_table.get(sender);
    Table<Message> window = entry == null ? null : entry.sent_msgs;
    if (window == null) {
      if (log.isErrorEnabled()) {
        log.error(local_addr + ": sender window for " + sender + " not found");
      }
      return;
    }

    boolean flagged_first = false;
    long current = window.getLow() + 1;
    while (current <= seqno) {
      Message stored = window.get(current++);
      if (stored == null) {
        continue;
      }
      if (flagged_first) {
        down_prot.down(new Event(Event.MSG, stored));
        continue;
      }

      flagged_first = true;
      // The header is replaced on a copy of the message, with a copy of the header, so that the
      // message kept in the sender retransmission window is not modified
      // (https://jira.jboss.org/jira/browse/JGRP-965)
      Message flagged = stored.copy();
      Unicast2Header original_hdr = (Unicast2Header) flagged.getHeader(this.id);
      Unicast2Header first_hdr = original_hdr.copy();
      first_hdr.first = true;
      flagged.putHeader(this.id, first_hdr);
      down_prot.down(new Event(Event.MSG, flagged));
    }
  }

  /**
   * Schedules a {@link RetransmitTask} to run with a fixed delay of {@code xmit_interval} ms
   * (starting immediately), unless a previously scheduled task exists and is not done yet.
   */
  protected void startRetransmitTask() {
    boolean still_scheduled = xmit_task != null && !xmit_task.isDone();
    if (still_scheduled) return;

    xmit_task =
        timer.scheduleWithFixedDelay(new RetransmitTask(), 0, xmit_interval, TimeUnit.MILLISECONDS);
  }

  /** Cancels the retransmit task (interrupting it if running) and forgets it; no-op if none. */
  protected void stopRetransmitTask() {
    if (xmit_task == null) return;

    xmit_task.cancel(true);
    xmit_task = null;
  }

  /**
   * Hands out the current connection id and advances the counter for the next caller. The counter
   * restarts at 0 once it has reached {@link Short#MAX_VALUE} (or is negative).
   *
   * @return the connection id that was current before the counter was advanced
   */
  protected synchronized short getNewConnectionId() {
    final short issued = last_conn_id;
    boolean wrap_around = last_conn_id < 0 || last_conn_id >= Short.MAX_VALUE;
    if (wrap_around) {
      last_conn_id = 0;
    } else {
      last_conn_id++;
    }
    return issued;
  }

  /**
   * Sends an OOB message with a SEND_FIRST_SEQNO header to {@code dest}.
   *
   * @param dest the member the request is sent to
   * @param seqno_received the seqno placed into the SEND_FIRST_SEQNO header
   */
  protected void sendRequestForFirstSeqno(Address dest, long seqno_received) {
    Unicast2Header request_hdr = Unicast2Header.createSendFirstSeqnoHeader(seqno_received);
    Message request = new Message(dest);
    request.setFlag(Message.OOB);
    request.putHeader(this.id, request_hdr);

    if (log.isTraceEnabled())
      log.trace(local_addr + " --> SEND_FIRST_SEQNO(" + dest + "," + seqno_received + ")");

    down_prot.down(new Event(Event.MSG, request));
  }

  /**
   * Removes all send and receive connections whose age is at least {@code conn_expiry_timeout} ms.
   * Does nothing when {@code conn_expiry_timeout} is not positive.
   */
  @ManagedOperation(
      description = "Closes connections that have been idle for more than conn_expiry_timeout ms")
  public void reapIdleConnections() {
    if (conn_expiry_timeout <= 0) return;

    // pass 1: expired entries of send_table
    for (Map.Entry<Address, SenderEntry> candidate : send_table.entrySet()) {
      long idle_ms = candidate.getValue().age();
      if (idle_ms < conn_expiry_timeout) continue;

      Address peer = candidate.getKey();
      removeSendConnection(peer);
      if (log.isDebugEnabled())
        log.debug(
            local_addr
                + ": removed expired connection for "
                + peer
                + " ("
                + idle_ms
                + " ms old) from send_table");
    }

    // pass 2: expired entries of recv_table
    for (Map.Entry<Address, ReceiverEntry> candidate : recv_table.entrySet()) {
      long idle_ms = candidate.getValue().age();
      if (idle_ms < conn_expiry_timeout) continue;

      Address peer = candidate.getKey();
      removeReceiveConnection(peer);
      if (log.isDebugEnabled())
        log.debug(
            local_addr
                + ": removed expired connection for "
                + peer
                + " ("
                + idle_ms
                + " ms old) from recv_table");
    }
  }

  /**
   * The following types and fields are serialized:
   *
   * <pre>
   * | DATA | seqno | conn_id | first |
   * | ACK  | seqno |
   * | SEND_FIRST_SEQNO | seqno |
   * </pre>
   */
  public static class Unicast2Header extends Header {
    public static final byte DATA = 0;
    public static final byte XMIT_REQ = 1;
    public static final byte SEND_FIRST_SEQNO = 2;
    public static final byte STABLE = 3;

    byte type;
    long seqno; // DATA and STABLE
    long high_seqno; // STABLE
    short conn_id; // DATA, STABLE
    boolean first; // DATA

    public Unicast2Header() {} // used for externalization

    public static Unicast2Header createDataHeader(long seqno, short conn_id, boolean first) {
      return new Unicast2Header(DATA, seqno, 0L, conn_id, first);
    }

    public static Unicast2Header createXmitReqHeader() {
      return new Unicast2Header(XMIT_REQ);
    }

    public static Unicast2Header createStableHeader(short conn_id, long low, long high) {
      if (low > high)
        throw new IllegalArgumentException("low (" + low + ") needs to be <= high (" + high + ")");
      Unicast2Header retval = new Unicast2Header(STABLE, low);
      retval.high_seqno = high;
      retval.conn_id = conn_id;
      return retval;
    }

    public static Unicast2Header createSendFirstSeqnoHeader(long seqno_received) {
      return new Unicast2Header(SEND_FIRST_SEQNO, seqno_received);
    }

    protected Unicast2Header(byte type) {
      this.type = type;
    }

    protected Unicast2Header(byte type, long seqno) {
      this.type = type;
      this.seqno = seqno;
    }

    protected Unicast2Header(byte type, long seqno, long high, short conn_id, boolean first) {
      this.type = type;
      this.seqno = seqno;
      this.high_seqno = high;
      this.conn_id = conn_id;
      this.first = first;
    }

    public byte getType() {
      return type;
    }

    public long getSeqno() {
      return seqno;
    }

    public long getHighSeqno() {
      return high_seqno;
    }

    public short getConnId() {
      return conn_id;
    }

    public boolean isFirst() {
      return first;
    }

    public String toString() {
      StringBuilder sb = new StringBuilder();
      sb.append(type2Str(type)).append(", seqno=").append(seqno);
      if (conn_id != 0) sb.append(", conn_id=").append(conn_id);
      if (first) sb.append(", first");
      return sb.toString();
    }

    public static String type2Str(byte t) {
      switch (t) {
        case DATA:
          return "DATA";
        case XMIT_REQ:
          return "XMIT_REQ";
        case SEND_FIRST_SEQNO:
          return "SEND_FIRST_SEQNO";
        case STABLE:
          return "STABLE";
        default:
          return "<unknown>";
      }
    }

    public final int size() {
      int retval = Global.BYTE_SIZE; // type
      switch (type) {
        case DATA:
          retval +=
              Util.size(seqno) // seqno
                  + Global.SHORT_SIZE // conn_id
                  + Global.BYTE_SIZE; // first
          break;
        case XMIT_REQ:
          break;
        case STABLE:
          retval += Util.size(seqno, high_seqno) + Global.SHORT_SIZE; // conn_id
          break;
        case SEND_FIRST_SEQNO:
          retval += Util.size(seqno);
          break;
      }
      return retval;
    }

    public Unicast2Header copy() {
      return new Unicast2Header(type, seqno, high_seqno, conn_id, first);
    }

    public void writeTo(DataOutput out) throws Exception {
      out.writeByte(type);
      switch (type) {
        case DATA:
          Util.writeLong(seqno, out);
          out.writeShort(conn_id);
          out.writeBoolean(first);
          break;
        case XMIT_REQ:
          break;
        case STABLE:
          Util.writeLongSequence(seqno, high_seqno, out);
          out.writeShort(conn_id);
          break;
        case SEND_FIRST_SEQNO:
          Util.writeLong(seqno, out);
          break;
      }
    }

    public void readFrom(DataInput in) throws Exception {
      type = in.readByte();
      switch (type) {
        case DATA:
          seqno = Util.readLong(in);
          conn_id = in.readShort();
          first = in.readBoolean();
          break;
        case XMIT_REQ:
          break;
        case STABLE:
          long[] seqnos = Util.readLongSequence(in);
          seqno = seqnos[0];
          high_seqno = seqnos[1];
          conn_id = in.readShort();
          break;
        case SEND_FIRST_SEQNO:
          seqno = Util.readLong(in);
          break;
      }
    }
  }

  /**
   * Per-peer sender state: the table of messages we sent to that peer, the seqno counter for those
   * messages, the connection id, and a timestamp used to compute the entry's age.
   */
  protected final class SenderEntry {
    /** Stores (and retransmits) the messages sent by us to a given peer */
    final Table<Message> sent_msgs;

    /** Seqno for messages sent by us */
    final AtomicLong sent_msgs_seqno = new AtomicLong(DEFAULT_FIRST_SEQNO);

    final short send_conn_id;

    /** Time (in ms) of the last call to {@link #update()} */
    protected final AtomicLong timestamp = new AtomicLong(0);

    public SenderEntry(short send_conn_id) {
      this.send_conn_id = send_conn_id;
      this.sent_msgs =
          new Table<Message>(
              xmit_table_num_rows,
              xmit_table_msgs_per_row,
              0,
              xmit_table_resize_factor,
              xmit_table_max_compaction_time);
      update();
    }

    /** Sets the timestamp to the current time. */
    void update() {
      long now = System.currentTimeMillis();
      timestamp.set(now);
    }

    /** Returns the number of ms elapsed since the timestamp was last set. */
    long age() {
      return System.currentTimeMillis() - timestamp.get();
    }

    public String toString() {
      StringBuilder out = new StringBuilder();
      if (sent_msgs != null) {
        out.append(sent_msgs).append(", ");
      }
      out.append("send_conn_id=").append(send_conn_id);
      out.append(" (").append(age()).append(" ms old)");
      return out.toString();
    }
  }

  /**
   * Per-peer receiver state: the table of messages received from that peer, the connection id,
   * the byte counter used by {@link #incrementStable(int)}, and a timestamp used to compute the
   * entry's age.
   */
  protected final class ReceiverEntry {
    /** Stores all messages received by a certain peer in seqno-order */
    protected final Table<Message> received_msgs;

    protected final short recv_conn_id;

    /** Bytes accumulated by incrementStable() since the counter was last reset */
    protected int received_bytes = 0;

    /** Time (in ms) of the last call to {@link #update()} */
    protected final AtomicLong timestamp = new AtomicLong(0);

    protected final Lock lock = new ReentrantLock();

    protected long last_highest = -1;
    protected int num_stable_msgs = 0;

    public ReceiverEntry(Table<Message> received_msgs, short recv_conn_id) {
      this.received_msgs = received_msgs;
      this.recv_conn_id = recv_conn_id;
      update();
    }

    /**
     * Accounts for {@code len} more bytes while holding {@link #lock}.
     *
     * @return true if adding {@code len} reached or exceeded {@code max_bytes} (the counter is then
     *     reset to 0), false otherwise (the counter is then increased by {@code len})
     */
    boolean incrementStable(int len) {
      lock.lock();
      try {
        boolean limit_reached = received_bytes + len >= max_bytes;
        received_bytes = limit_reached ? 0 : received_bytes + len;
        return limit_reached;
      } finally {
        lock.unlock();
      }
    }

    /** Resets the byte and stable-message counters and last_highest (lock is not acquired). */
    void reset() {
      num_stable_msgs = 0;
      last_highest = -1;
      received_bytes = 0;
    }

    /** Sets the timestamp to the current time. */
    void update() {
      long now = System.currentTimeMillis();
      timestamp.set(now);
    }

    /** Returns the number of ms elapsed since the timestamp was last set. */
    long age() {
      return System.currentTimeMillis() - timestamp.get();
    }

    public String toString() {
      StringBuilder out = new StringBuilder();
      if (received_msgs != null) {
        out.append(received_msgs).append(", ");
      }
      out.append("recv_conn_id=").append(recv_conn_id);
      out.append(" (").append(age()).append(" ms old)");
      return out.toString();
    }
  }

  /** Runnable wrapper around {@link #reapIdleConnections()}, so reaping can be run as a task. */
  protected class ConnectionReaper implements Runnable {
    /** Performs one reaping pass over the connection tables. */
    @Override
    public void run() {
      reapIdleConnections();
    }
  }

  /**
   * Retransmitter task, scheduled to run every xmit_interval ms. Each run walks over all receiver
   * tables and, for every member whose table reports missing messages, sends that member a
   * retransmit request listing the missing seqnos.
   */
  protected class RetransmitTask implements Runnable {

    public void run() {
      for (Map.Entry<Address, ReceiverEntry> entry : recv_table.entrySet()) {
        ReceiverEntry receiver = entry.getValue();
        if (receiver == null || receiver.received_msgs == null) continue;

        Table<Message> window = receiver.received_msgs;
        if (window.getNumMissing() <= 0) continue; // nothing missing from this member

        SeqnoList missing = window.getMissing();
        if (missing == null) continue;

        // the map key is the member the retransmit request is sent to
        retransmit(missing, entry.getKey());
      }
    }
  }
}
Exemple #4
0
  /**
   * Starts the merge protocol (only run by the merge leader). Essentially sends a MERGE_REQ to all
   * coordinators of all subgroups found. Each coord receives its digest and view and returns it.
   * The leader then computes the digest and view for the new group from the return values. Finally,
   * it sends this merged view/digest to all subgroup coordinators; each coordinator will install it
   * in their subgroup.
   */
  class MergeTask implements Runnable {
    /**
     * The thread running the merge, or null. Written under the monitor by start()/stop(), but also
     * reset to null by run() without the monitor, so readers must read it only once per check.
     */
    private Thread thread = null;

    /** List of all subpartition coordinators and their members */
    private final ConcurrentMap<Address, Collection<Address>> coords =
        Util.createConcurrentMap(8, 0.75f, 8);

    /**
     * Populates {@code coords} from the given views and starts the merge thread. Does nothing if a
     * merge thread is already running.
     *
     * @param views Guaranteed to be non-null and to have >= 2 members, or else this thread would
     *     not be started
     */
    public synchronized void start(Map<Address, View> views) {
      // read the field once: run() sets it to null without holding this monitor
      Thread current = thread;
      if (current != null && current.isAlive()) // the merge thread is already running
      return;

      this.coords.clear();

      // now remove all members which don't have us in their view, so RPCs won't block (e.g. FLUSH)
      // https://jira.jboss.org/browse/JGRP-1061
      sanitizeViews(views);

      // Add all different coordinators of the views into the hashmap and sets their members:
      Collection<Address> coordinators = Util.determineMergeCoords(views);
      for (Address coord : coordinators) {
        View view = views.get(coord);
        if (view != null) this.coords.put(coord, new ArrayList<Address>(view.getMembers()));
      }

      // For the merge participants which are not coordinator, we simply add them, and the
      // associated membership list consists only of themselves
      Collection<Address> merge_participants = Util.determineMergeParticipants(views);
      merge_participants.removeAll(coordinators);
      for (Address merge_participant : merge_participants) {
        Collection<Address> tmp = new ArrayList<Address>();
        tmp.add(merge_participant);
        coords.putIfAbsent(merge_participant, tmp);
      }

      thread = gms.getThreadFactory().newThread(this, "MergeTask");
      thread.setDaemon(true);
      thread.start();
    }

    /** Interrupts the merge thread if it is alive and forgets it. */
    public synchronized void stop() {
      // single read of the field, see start()
      Thread tmp = thread;
      if (tmp != null && tmp.isAlive()) tmp.interrupt();
      thread = null;
    }

    /** Returns true if a merge thread exists and is alive. */
    public synchronized boolean isRunning() {
      // single read of the field, see start()
      Thread tmp = thread;
      return tmp != null && tmp.isAlive();
    }

    /**
     * Runs one merge as leader: creates a merge id, executes {@link #_run}, and cancels the merge
     * if that throws. In all cases the flush is stopped (if FLUSH is in the stack) and the thread
     * reference is cleared.
     */
    @Override
    public void run() {
      // 1. Generate merge_id
      final MergeId new_merge_id = MergeId.create(gms.local_addr);
      final Collection<Address> coordsCopy = new ArrayList<Address>(coords.keySet());

      long start = System.currentTimeMillis();

      try {
        _run(new_merge_id, coordsCopy); // might remove members from coordsCopy
      } catch (Throwable ex) {
        if (log.isWarnEnabled()) log.warn(gms.local_addr + ": " + ex + ", merge is cancelled");
        sendMergeCancelledMessage(coordsCopy, new_merge_id);
        // the message above cancels the merge, too, but this is a 2nd line of defense
        cancelMerge(new_merge_id);
      } finally {
        /* 6. if flush is in stack stop the flush for entire cluster [JGRP-700] - FLUSH: flushing should span merge */
        if (gms.flushProtocolInStack) gms.stopFlush();
        thread = null;
      }
      long diff = System.currentTimeMillis() - start;
      if (log.isDebugEnabled())
        log.debug(gms.local_addr + ": merge " + new_merge_id + " took " + diff + " ms");
    }

    /**
     * Runs the merge protocol as a leader.
     *
     * @param new_merge_id the id of this merge
     * @param coordsCopy copy of the coordinators the merge was started with; members which did not
     *     respond are removed from it
     * @throws Exception if no merge responses were received, the leader itself is no longer part of
     *     the merge, or the merge data could not be consolidated
     */
    protected void _run(MergeId new_merge_id, final Collection<Address> coordsCopy)
        throws Exception {
      boolean success = setMergeId(null, new_merge_id);
      if (!success) {
        if (log.isWarnEnabled())
          log.warn("failed to set my own merge_id (" + merge_id + ") to " + new_merge_id);
        return;
      }

      if (log.isDebugEnabled())
        log.debug(
            gms.local_addr
                + ": merge task "
                + merge_id
                + " started with "
                + coords.keySet().size()
                + " coords");

      /* 2. Fetch the current Views/Digests from all subgroup coordinators */
      success = getMergeDataFromSubgroupCoordinators(coords, new_merge_id, gms.merge_timeout);
      List<Address> missing = null;
      if (!success) {
        missing = merge_rsps.getMissing();
        if (log.isDebugEnabled())
          log.debug(
              "merge leader "
                  + gms.local_addr
                  + " did not get responses from all "
                  + coords.keySet().size()
                  + " partition coordinators; missing responses from "
                  + missing.size()
                  + " members, removing them from the merge");
        merge_rsps.remove(missing);
      }

      /* 3. Remove null or rejected merge responses from merge_rsp and coords (so we'll send the new view
       * only to members who accepted the merge request) */
      if (missing != null && !missing.isEmpty()) {
        coords.keySet().removeAll(missing);
        coordsCopy.removeAll(missing);
      }

      removeRejectedMergeRequests(coords.keySet());
      if (merge_rsps.size() == 0)
        throw new Exception("did not get any merge responses from partition coordinators");

      // another member might have invoked a merge req on us before we got there...
      if (!coords.keySet().contains(gms.local_addr))
        throw new Exception("merge leader rejected merge request");

      /* 4. Combine all views and digests into 1 View/1 Digest */
      List<MergeData> merge_data = new ArrayList<MergeData>(merge_rsps.getResults().values());
      MergeData combined_merge_data = consolidateMergeData(merge_data);
      if (combined_merge_data == null) throw new Exception("could not consolidate merge");

      /* 5. Send the new View/Digest to all coordinators (including myself). On reception, they will
      install the digest and view in all of their subgroup members */
      if (log.isDebugEnabled())
        log.debug(
            gms.local_addr
                + ": installing merge view "
                + combined_merge_data.view.getViewId()
                + " ("
                + combined_merge_data.view.size()
                + " members) in "
                + coords.keySet().size()
                + " coords");
      sendMergeView(coords.keySet(), combined_merge_data, new_merge_id);
    }

    /**
     * Sends a MERGE_REQ to all coords and waits for their MergeData to be collected in merge_rsps.
     * Returns after all responses have been received, or timeout msecs have elapsed (whichever is
     * first).
     *
     * <p>This method does not modify coords; members which rejected the MERGE_REQ (e.g. because of
     * participation in a different merge) are removed afterwards by {@link
     * #removeRejectedMergeRequests}.
     *
     * @param coords A map of coordinator addresses and associated membership lists
     * @param new_merge_id The new merge id
     * @param timeout Max number of msecs to wait for the merge responses from the subgroup coords
     * @return true if merge_rsps reports that all responses were received, false otherwise
     */
    protected boolean getMergeDataFromSubgroupCoordinators(
        Map<Address, Collection<Address>> coords, MergeId new_merge_id, long timeout) {
      boolean gotAllResponses;
      long start = System.currentTimeMillis();
      merge_rsps.reset(coords.keySet());
      if (log.isTraceEnabled())
        log.trace(gms.local_addr + ": sending MERGE_REQ to " + coords.keySet());

      for (Map.Entry<Address, Collection<Address>> entry : coords.entrySet()) {
        Address coord = entry.getKey();
        Collection<Address> mbrs = entry.getValue();
        Message msg = new Message(coord).setFlag(Message.Flag.OOB, Message.Flag.INTERNAL);
        GMS.GmsHeader hdr = new GMS.GmsHeader(GMS.GmsHeader.MERGE_REQ, mbrs);
        hdr.mbr = gms.local_addr;
        hdr.merge_id = new_merge_id;
        msg.putHeader(gms.getId(), hdr);
        gms.getDownProtocol().down(new Event(Event.MSG, msg));
      }

      // wait until all expected responses have been received or the timeout elapsed
      merge_rsps.waitForAllResponses(timeout);
      gotAllResponses = merge_rsps.hasAllResponses();
      long stop = System.currentTimeMillis();
      if (log.isTraceEnabled())
        log.trace(
            gms.local_addr
                + ": collected "
                + merge_rsps.numberOfValidResponses()
                + " merge response(s) in "
                + (stop - start)
                + " ms");
      return gotAllResponses;
    }

    /**
     * Removes null and rejected merge responses from merge_rsps, and the corresponding members
     * from coords.
     *
     * <p>NOTE(review): earlier documentation stated that this method holds a lock on merge_rsps;
     * no locking is visible in this method - confirm against the callers.
     *
     * @param coords the collection of coordinators from which rejecting members are removed
     */
    private void removeRejectedMergeRequests(Collection<Address> coords) {
      int num_removed = 0;
      for (Iterator<Map.Entry<Address, MergeData>> it =
              merge_rsps.getResults().entrySet().iterator();
          it.hasNext(); ) {
        Map.Entry<Address, MergeData> entry = it.next();
        MergeData data = entry.getValue();
        if (data == null) {
          // no response recorded for this member: drop it like a rejection instead of failing
          // with a NullPointerException here (or later when the responses are consolidated)
          coords.remove(entry.getKey());
          it.remove();
          num_removed++;
          continue;
        }
        if (data.merge_rejected) {
          if (data.getSender() != null) coords.remove(data.getSender());
          it.remove();
          num_removed++;
        }
      }

      if (num_removed > 0) {
        if (log.isTraceEnabled())
          log.trace(gms.local_addr + ": removed " + num_removed + " rejected merge responses");
      }
    }

    /**
     * Merge all MergeData. All MergeData elements should be disjoint (both views and digests).
     * However, this method is prepared to resolve duplicate entries (for the same member).
     * Resolution strategy for views is to merge only 1 of the duplicate members. Resolution
     * strategy for digests is to take the higher seqnos for duplicate digests.
     *
     * <p>After merging all members into a Membership and subsequent sorting, the first member of
     * the sorted membership will be the new coordinator.
     *
     * @param merge_rsps A list of MergeData items. Elements with merge_rejected=true were removed
     *     before. Is guaranteed not to be null and to contain at least 1 member.
     * @return the consolidated view and digest, or null if no digest or no coordinator could be
     *     determined
     */
    private MergeData consolidateMergeData(List<MergeData> merge_rsps) {
      long logical_time = 0; // for new_vid
      List<View> subgroups =
          new ArrayList<View>(11); // contains a list of Views, each View is a subgroup
      Collection<Collection<Address>> sub_mbrships = new ArrayList<Collection<Address>>();

      for (MergeData tmp_data : merge_rsps) {
        View tmp_view = tmp_data.getView();
        if (tmp_view != null) {
          ViewId tmp_vid = tmp_view.getVid();
          if (tmp_vid != null) {
            // compute the new view id (max of all vids +1)
            logical_time = Math.max(logical_time, tmp_vid.getId());
          }
          // merge all membership lists into one (prevent duplicates)
          sub_mbrships.add(new ArrayList<Address>(tmp_view.getMembers()));
          subgroups.add(tmp_view.copy());
        }
      }

      // determine the new digest
      Digest new_digest = consolidateDigests(merge_rsps, merge_rsps.size());
      if (new_digest == null) return null;

      // remove all members from the new member list that are not in the digest
      Collection<Address> digest_mbrs = new_digest.getMembers();
      for (Collection<Address> coll : sub_mbrships) coll.retainAll(digest_mbrs);

      List<Address> merged_mbrs = gms.computeNewMembership(sub_mbrships);

      // the new coordinator is the first member of the consolidated & sorted membership list
      Address new_coord = merged_mbrs.isEmpty() ? null : merged_mbrs.get(0);
      if (new_coord == null) return null;

      // should be the highest view ID seen up to now plus 1
      ViewId new_vid = new ViewId(new_coord, logical_time + 1);

      // determine the new view
      MergeView new_view = new MergeView(new_vid, merged_mbrs, subgroups);

      if (log.isTraceEnabled())
        log.trace(
            gms.local_addr
                + ": consolidated view="
                + new_view
                + "\nconsolidated digest="
                + new_digest);
      return new MergeData(gms.local_addr, new_view, new_digest);
    }

    /**
     * Merge all digests into one. For each sender, the new value is max(highest_delivered),
     * max(highest_received). MergeData elements without a digest are skipped.
     *
     * @param merge_rsps the merge responses whose digests are merged
     * @param num_mbrs the initial capacity passed to the resulting digest
     * @return a copy of the merged digest
     */
    private Digest consolidateDigests(List<MergeData> merge_rsps, int num_mbrs) {
      MutableDigest retval = new MutableDigest(num_mbrs);

      for (MergeData data : merge_rsps) {
        Digest tmp_digest = data.getDigest();
        if (tmp_digest == null) continue;

        retval.merge(tmp_digest);
      }
      return retval.copy();
    }
  }