/**
 * Implementation of total order protocol using a sequencer. Consult <a
 * href="https://github.com/belaban/JGroups/blob/master/doc/design/SEQUENCER.txt">SEQUENCER.txt</a>
 * for details.
 *
 * <p>Overview (as implemented below): every sender forwards its multicast messages to the current
 * coordinator ({@link #forwardToCoord}); the coordinator re-broadcasts them, which imposes a single
 * total order on all members. On a coordinator change, pending (un-acked) messages in
 * {@link #forward_table} are re-sent to the new coordinator by a background {@link Flusher} thread
 * while regular senders are blocked.
 *
 * @author Bela Ban
 */
@MBean(description = "Implementation of total order protocol using a sequencer")
public class SEQUENCER extends Protocol {
  protected Address local_addr;
  // Current coordinator (first member of the view); all messages are forwarded to it
  protected volatile Address coord;
  protected volatile View view;
  protected volatile boolean is_coord = false;
  // Generator for per-sender seqnos; used only for duplicate detection, not for ordering (JGRP-1461)
  protected final AtomicLong seqno = new AtomicLong(0);

  /**
   * Maintains messages forwarded to the coord for which no ack has been received yet. Needs to be
   * sorted so we resend them in the right order
   */
  protected final NavigableMap<Long, Message> forward_table = new ConcurrentSkipListMap<>();

  // Guards sending during a flush; senders wait on send_cond while flushing is true (see block())
  protected final Lock send_lock = new ReentrantLock();
  protected final Condition send_cond = send_lock.newCondition();

  /**
   * When ack_mode is set, we need to wait for an ack for each forwarded message until we can send
   * the next one
   */
  protected volatile boolean ack_mode = true;

  /** Set when we block all sending threads to resend all messages from forward_table */
  protected volatile boolean flushing = false;

  protected volatile boolean running = true;

  /** Keeps track of the threads sending messages */
  protected final AtomicInteger in_flight_sends = new AtomicInteger(0);

  // Maintains received seqnos, so we can weed out dupes
  protected final ConcurrentMap<Address, BoundedHashMap<Long, Long>> delivery_table =
      Util.createConcurrentMap();

  // Background thread resending forward_table on a coordinator change; see startFlusher()/stopFlusher()
  protected volatile Flusher flusher;

  /** Used for each resent message to wait until the message has been received */
  protected final Promise<Long> ack_promise = new Promise<>();

  @Property(description = "Size of the set to store received seqnos (for duplicate checking)")
  protected int delivery_table_max_size = 2000;

  @Property(
      description =
          "Number of acks needed before going from ack-mode to normal mode. "
              + "0 disables this, which means that ack-mode is always on")
  protected int threshold = 10;

  // Acks counted while in ack-mode; reset when switching modes or after a flush
  protected int num_acks = 0;

  // Stats counters, exposed via the @ManagedAttribute getters below
  protected long forwarded_msgs = 0;
  protected long bcast_msgs = 0;
  protected long received_forwards = 0;
  protected long received_bcasts = 0;
  protected long delivered_bcasts = 0;

  @ManagedAttribute
  public boolean isCoordinator() {
    return is_coord;
  }

  public Address getCoordinator() {
    return coord;
  }

  public Address getLocalAddress() {
    return local_addr;
  }

  @ManagedAttribute
  public long getForwarded() {
    return forwarded_msgs;
  }

  @ManagedAttribute
  public long getBroadcast() {
    return bcast_msgs;
  }

  @ManagedAttribute
  public long getReceivedForwards() {
    return received_forwards;
  }

  @ManagedAttribute
  public long getReceivedBroadcasts() {
    return received_bcasts;
  }

  @ManagedAttribute(description = "Number of messages in the forward-table")
  public int getForwardTableSize() {
    return forward_table.size();
  }

  public void setThreshold(int new_threshold) {
    this.threshold = new_threshold;
  }

  public void setDeliveryTableMaxSize(int size) {
    delivery_table_max_size = size;
  }

  @ManagedOperation
  public void resetStats() {
    forwarded_msgs = bcast_msgs = received_forwards = received_bcasts = delivered_bcasts = 0L;
  }

  @ManagedOperation
  public Map<String, Object> dumpStats() {
    Map<String, Object> m = super.dumpStats();
    m.put("forwarded", forwarded_msgs);
    m.put("broadcast", bcast_msgs);
    m.put("received_forwards", received_forwards);
    m.put("received_bcasts", received_bcasts);
    m.put("delivered_bcasts", delivered_bcasts);
    return m;
  }

  @ManagedOperation
  public String printStats() {
    return dumpStats().toString();
  }

  public void start() throws Exception {
    super.start();
    running = true;
    ack_mode = true; // start conservatively: one outstanding forward at a time until 'threshold' acks
  }

  public void stop() {
    running = false;
    unblockAll(); // release any sender threads parked in block()
    stopFlusher();
    super.stop();
  }

  /**
   * Down-path: intercepts multicast messages (dest == null, not OOB / NO_TOTAL_ORDER), tags them
   * with a {@link SequencerHeader} and forwards them to the coordinator instead of passing them
   * straight down. View/address events are handled and then passed down as usual.
   */
  public Object down(Event evt) {
    switch (evt.getType()) {
      case Event.MSG:
        Message msg = (Message) evt.getArg();
        // Unicasts and messages explicitly excluded from total order bypass this protocol
        if (msg.getDest() != null
            || msg.isFlagSet(Message.Flag.NO_TOTAL_ORDER)
            || msg.isFlagSet(Message.Flag.OOB)) break;
        if (msg.getSrc() == null) msg.setSrc(local_addr);

        if (flushing) block(); // park this sender until the flush round completes

        // A seqno is not used to establish ordering, but only to weed out duplicates; next_seqno
        // doesn't need to increase monotonically, but only to be unique
        // (https://issues.jboss.org/browse/JGRP-1461) !
        long next_seqno = seqno.incrementAndGet();
        in_flight_sends.incrementAndGet(); // flush() waits for this to reach 0
        try {
          SequencerHeader hdr =
              new SequencerHeader(
                  is_coord ? SequencerHeader.BCAST : SequencerHeader.WRAPPED_BCAST, next_seqno);
          msg.putHeader(this.id, hdr);
          if (log.isTraceEnabled())
            log.trace(
                "[" + local_addr + "]: forwarding " + local_addr + "::" + seqno + " to coord "
                    + coord);

          // We always forward messages to the coordinator, even if we're the coordinator. Having
          // the coord send its messages directly led to starvation of messages from other members.
          // MPerf perf went up from 20MB/sec/node to 50MB/sec/node with this change !
          forwardToCoord(next_seqno, msg);
        } catch (Exception ex) {
          log.error(Util.getMessage("FailedSendingMessage"), ex);
        } finally {
          in_flight_sends.decrementAndGet();
        }
        return null; // don't pass down

      case Event.VIEW_CHANGE:
        handleViewChange((View) evt.getArg());
        break;

      case Event.TMP_VIEW:
        handleTmpView((View) evt.getArg());
        break;

      case Event.SET_LOCAL_ADDRESS:
        local_addr = (Address) evt.getArg();
        break;
    }
    return down_prot.down(evt);
  }

  /**
   * Up-path: on the coordinator, FORWARD/FLUSH requests from members are re-broadcast; on all
   * members, BCAST / WRAPPED_BCAST messages from the coordinator are delivered (after duplicate
   * filtering in {@link #deliver}).
   */
  public Object up(Event evt) {
    Message msg;
    SequencerHeader hdr;
    switch (evt.getType()) {
      case Event.MSG:
        msg = (Message) evt.getArg();
        if (msg.isFlagSet(Message.Flag.NO_TOTAL_ORDER) || msg.isFlagSet(Message.Flag.OOB)) break;
        hdr = (SequencerHeader) msg.getHeader(this.id);
        if (hdr == null) break; // pass up

        switch (hdr.type) {
          case SequencerHeader.FORWARD:
          case SequencerHeader.FLUSH:
            // Only the coordinator may turn a forwarded message into a broadcast
            if (!is_coord) {
              if (log.isErrorEnabled())
                log.error(
                    local_addr + ": non-coord; dropping FORWARD request from " + msg.getSrc());
              return null;
            }
            Address sender = msg.getSrc();
            if (view != null && !view.containsMember(sender)) {
              if (log.isErrorEnabled())
                log.error(
                    local_addr
                        + ": dropping FORWARD request from non-member "
                        + sender
                        + "; view="
                        + view);
              return null;
            }
            broadcast(
                msg,
                true,
                msg.getSrc(),
                hdr.seqno,
                hdr.type == SequencerHeader.FLUSH); // do copy the message
            received_forwards++;
            break;

          case SequencerHeader.BCAST:
            deliver(msg, evt, hdr);
            received_bcasts++;
            break;

          case SequencerHeader.WRAPPED_BCAST:
            unwrapAndDeliver(
                msg, hdr.flush_ack); // unwrap the original message (in the payload) and deliver it
            received_bcasts++;
            break;
        }
        return null;

      case Event.VIEW_CHANGE:
        // Pass the view up first, then react to the coordinator change ourselves
        Object retval = up_prot.up(evt);
        handleViewChange((View) evt.getArg());
        return retval;

      case Event.TMP_VIEW:
        handleTmpView((View) evt.getArg());
        break;
    }
    return up_prot.up(evt);
  }

  /**
   * Batch variant of the up-path: SEQUENCER messages are pulled out of the batch and fed through
   * {@link #up(Event)} one by one; the remainder of the batch (if any) is passed up unchanged.
   */
  public void up(MessageBatch batch) {
    for (Message msg : batch) {
      if (msg.isFlagSet(Message.Flag.NO_TOTAL_ORDER)
          || msg.isFlagSet(Message.Flag.OOB)
          || msg.getHeader(id) == null) continue;
      batch.remove(msg);

      // simplistic implementation
      try {
        up(new Event(Event.MSG, msg));
      } catch (Throwable t) {
        log.error(Util.getMessage("FailedPassingUpMessage"), t);
      }
    }

    if (!batch.isEmpty()) up_prot.up(batch);
  }

  /* --------------------------------- Private Methods ----------------------------------- */

  /**
   * Installs a new view (only if newer than the current one), prunes the delivery table to the
   * surviving members and, if the coordinator changed, kicks off a {@link Flusher} to re-send
   * pending messages to the new coordinator.
   */
  protected void handleViewChange(View v) {
    List<Address> mbrs = v.getMembers();
    if (mbrs.isEmpty()) return;

    if (view == null || view.compareTo(v) < 0) view = v;
    else return; // ignore out-of-date views

    delivery_table.keySet().retainAll(mbrs);

    // The coordinator is by convention the first member of the view
    Address existing_coord = coord, new_coord = mbrs.get(0);
    boolean coord_changed = !Objects.equals(existing_coord, new_coord);
    if (coord_changed && new_coord != null) {
      stopFlusher();
      startFlusher(
          new_coord); // needs to be done in the background, to prevent blocking if down() would
      // block
    }
  }

  /**
   * Runs on the {@link Flusher} thread: waits until in-flight sends have drained, switches the
   * coordinator, re-sends all pending messages ({@link #flushMessagesInForwardTable}), then wakes
   * up the blocked senders. Always re-enters ack-mode afterwards.
   */
  protected void flush(final Address new_coord) throws InterruptedException {
    // wait until all threads currently sending messages have returned; new threads (arriving after
    // flushing=true) will block in block(). flushing is set to true in startFlusher()
    while (flushing && running) {
      if (in_flight_sends.get() == 0) break;
      Thread.sleep(100);
    }

    send_lock.lockInterruptibly();
    try {
      if (log.isTraceEnabled())
        log.trace(local_addr + ": coord changed from " + coord + " to " + new_coord);
      coord = new_coord;
      is_coord = Objects.equals(local_addr, coord);
      flushMessagesInForwardTable();
    } finally {
      if (log.isTraceEnabled()) log.trace(local_addr + ": flushing completed");
      flushing = false;
      ack_mode = true; // go to ack-mode after flushing
      num_acks = 0;
      send_cond.signalAll(); // release senders parked in block()
      send_lock.unlock();
    }
  }

  // If we're becoming coordinator, we need to handle TMP_VIEW as
  // an immediate change of view. See JGRP-1452.
  private void handleTmpView(View v) {
    List<Address> mbrs = v.getMembers();
    if (mbrs.isEmpty()) return;

    Address new_coord = mbrs.get(0);
    if (!new_coord.equals(coord) && local_addr != null && local_addr.equals(new_coord))
      handleViewChange(v);
  }

  /**
   * Sends all messages currently in forward_table to the new coordinator (changing the dest field).
   * This needs to be done, so the underlying reliable unicast protocol (e.g. UNICAST) adds these
   * messages to its retransmission mechanism<br>
   * Note that we need to resend the messages in order of their seqnos ! We also need to prevent
   * other messages from being inserted until we're done, that's why there's synchronization.<br>
   * Access to the forward_table doesn't need to be synchronized as there won't be any insertions
   * during flushing (all down-threads are blocked)
   */
  protected void flushMessagesInForwardTable() {
    if (is_coord) {
      // We became the coordinator: wrap each pending message and broadcast it ourselves
      for (Map.Entry<Long, Message> entry : forward_table.entrySet()) {
        Long key = entry.getKey();
        Message msg = entry.getValue();
        Buffer buf;
        try {
          buf = Util.streamableToBuffer(msg);
        } catch (Exception e) {
          log.error(Util.getMessage("FlushingBroadcastingFailed"), e);
          continue;
        }

        SequencerHeader hdr = new SequencerHeader(SequencerHeader.WRAPPED_BCAST, key);
        Message forward_msg = new Message(null, buf).putHeader(this.id, hdr);
        if (log.isTraceEnabled())
          log.trace(local_addr + ": flushing (broadcasting) " + local_addr + "::" + key);
        down_prot.down(new Event(Event.MSG, forward_msg));
      }
      return;
    }

    // for forwarded messages, we need to receive the forwarded message from the coordinator, to
    // prevent this case:
    // - V1={A,B,C}
    // - A crashes
    // - C installs V2={B,C}
    // - C forwards messages 3 and 4 to B (the new coord)
    // - B drops 3 because its view is still V1
    // - B installs V2
    // - B receives message 4 and broadcasts it
    // ==> C's message 4 is delivered *before* message 3 !
    // ==> By resending 3 until it is received, then resending 4 until it is received, we make sure
    // this won't happen
    // (see https://issues.jboss.org/browse/JGRP-1449)
    while (flushing && running && !forward_table.isEmpty()) {
      Map.Entry<Long, Message> entry = forward_table.firstEntry();
      final Long key = entry.getKey();
      Message msg = entry.getValue();
      Buffer buf;
      try {
        buf = Util.streamableToBuffer(msg);
      } catch (Exception e) {
        // NOTE(review): 'continue' re-fetches the same firstEntry; a persistent marshalling
        // failure would spin here until flushing/running flips — confirm this is acceptable
        log.error(Util.getMessage("FlushingBroadcastingFailed"), e);
        continue;
      }

      // Resend this one message until its delivery is confirmed (ack_promise is completed in
      // deliver() when our own flushed message comes back, which also removes it from the table)
      while (flushing && running && !forward_table.isEmpty()) {
        SequencerHeader hdr = new SequencerHeader(SequencerHeader.FLUSH, key);
        Message forward_msg =
            new Message(coord, buf).putHeader(this.id, hdr).setFlag(Message.Flag.DONT_BUNDLE);
        if (log.isTraceEnabled())
          log.trace(
              local_addr + ": flushing (forwarding) " + local_addr + "::" + key + " to coord "
                  + coord);
        ack_promise.reset();
        down_prot.down(new Event(Event.MSG, forward_msg));
        Long ack = ack_promise.getResult(500); // wait up to 500 ms for the round-trip
        if ((Objects.equals(ack, key)) || !forward_table.containsKey(key)) break;
      }
    }
  }

  /**
   * Routes a message to the coordinator. In ack-mode the sender retries (under send_lock) until the
   * message's own broadcast comes back; otherwise it is forwarded fire-and-forget, with a copy kept
   * in forward_table for re-sending on a coordinator change.
   */
  protected void forwardToCoord(long seqno, Message msg) {
    if (is_coord) {
      forward(msg, seqno, false);
      return;
    }

    if (!running || flushing) {
      // Can't send now; park the message so the flusher (or a later send) picks it up
      forward_table.put(seqno, msg);
      return;
    }

    if (!ack_mode) {
      forward_table.put(seqno, msg);
      forward(msg, seqno, false);
      return;
    }

    send_lock.lock();
    try {
      forward_table.put(seqno, msg);
      while (running && !flushing) {
        ack_promise.reset();
        forward(msg, seqno, true); // flush=true requests a flush_ack from the coord
        if (!ack_mode || !running || flushing) break;
        Long ack = ack_promise.getResult(500);
        // Done when our seqno was acked, or the entry was already removed by deliver()
        if ((Objects.equals(ack, seqno)) || !forward_table.containsKey(seqno)) break;
      }
    } finally {
      send_lock.unlock();
    }
  }

  /**
   * Marshals {@code msg} and unicasts it to the current coordinator with a FORWARD (or FLUSH, when
   * {@code flush} is true) header. No-op if there is no coordinator yet.
   */
  protected void forward(final Message msg, long seqno, boolean flush) {
    Address target = coord;
    if (target == null) return;
    byte type = flush ? SequencerHeader.FLUSH : SequencerHeader.FORWARD;
    try {
      SequencerHeader hdr = new SequencerHeader(type, seqno);
      Message forward_msg =
          new Message(target, Util.streamableToBuffer(msg)).putHeader(this.id, hdr);
      down_prot.down(new Event(Event.MSG, forward_msg));
      forwarded_msgs++;
    } catch (Exception ex) {
      log.error(Util.getMessage("FailedForwardingMessageTo") + msg.getDest(), ex);
    }
  }

  /**
   * Coordinator-side: turns a forwarded message into a group broadcast. When {@code copy} is true
   * the payload is re-wrapped under a new WRAPPED_BCAST header; when {@code resend} is true the
   * header carries flush_ack so the original sender's flush loop can complete.
   */
  protected void broadcast(
      final Message msg, boolean copy, Address original_sender, long seqno, boolean resend) {
    Message bcast_msg = null;

    if (!copy) {
      bcast_msg = msg; // no need to add a header, message already has one
    } else {
      SequencerHeader new_hdr = new SequencerHeader(SequencerHeader.WRAPPED_BCAST, seqno);
      bcast_msg =
          new Message(null, msg.getRawBuffer(), msg.getOffset(), msg.getLength())
              .putHeader(this.id, new_hdr);
      if (resend) {
        new_hdr.flush_ack = true;
        bcast_msg.setFlag(Message.Flag.DONT_BUNDLE);
      }
    }

    if (log.isTraceEnabled())
      log.trace(local_addr + ": broadcasting " + original_sender + "::" + seqno);

    down_prot.down(new Event(Event.MSG, bcast_msg));
    bcast_msgs++;
  }

  /**
   * Unmarshal the original message (in the payload) and then pass it up (unless already delivered)
   *
   * @param msg the WRAPPED_BCAST message whose payload is the marshalled original message
   */
  protected void unwrapAndDeliver(final Message msg, boolean flush_ack) {
    try {
      Message msg_to_deliver =
          Util.streamableFromBuffer(
              Message.class, msg.getRawBuffer(), msg.getOffset(), msg.getLength());
      SequencerHeader hdr = (SequencerHeader) msg_to_deliver.getHeader(this.id);
      if (flush_ack) hdr.flush_ack = true; // propagate the ack flag from the outer header
      deliver(msg_to_deliver, new Event(Event.MSG, msg_to_deliver), hdr);
    } catch (Exception ex) {
      log.error(Util.getMessage("FailureUnmarshallingBuffer"), ex);
    }
  }

  /**
   * Delivers a broadcast to the application unless it is a duplicate ({@link #canDeliver}). If the
   * message is our own, its forward_table entry is removed and — when it carries flush_ack — the
   * waiting sender/flusher is released via ack_promise; enough acks switch ack-mode off.
   */
  protected void deliver(Message msg, Event evt, SequencerHeader hdr) {
    Address sender = msg.getSrc();
    if (sender == null) {
      if (log.isErrorEnabled())
        log.error(local_addr + ": sender is null, cannot deliver " + "::" + hdr.getSeqno());
      return;
    }
    long msg_seqno = hdr.getSeqno();
    if (sender.equals(local_addr)) {
      // Our own message came back: it is now safely ordered, no need to re-send it anymore
      forward_table.remove(msg_seqno);
      if (hdr.flush_ack) {
        ack_promise.setResult(msg_seqno);
        // Leave ack-mode after 'threshold' consecutive acks (0 keeps ack-mode permanently on)
        if (ack_mode && !flushing && threshold > 0 && ++num_acks >= threshold) {
          ack_mode = false;
          num_acks = 0;
        }
      }
    }
    if (!canDeliver(sender, msg_seqno)) {
      if (log.isWarnEnabled())
        log.warn(local_addr + ": dropped duplicate message " + sender + "::" + msg_seqno);
      return;
    }
    if (log.isTraceEnabled())
      log.trace(local_addr + ": delivering " + sender + "::" + msg_seqno);
    up_prot.up(evt);
    delivered_bcasts++;
  }

  /**
   * Checks if seqno has already been received from sender. This weeds out duplicates. Note that
   * this method is never called concurrently for the same sender, as the sender in NAKACK will
   * always be the coordinator.
   *
   * @return true if the seqno was not seen before (i.e. the message may be delivered)
   */
  protected boolean canDeliver(Address sender, long seqno) {
    BoundedHashMap<Long, Long> seqno_set = delivery_table.get(sender);
    if (seqno_set == null) {
      seqno_set = new BoundedHashMap<>(delivery_table_max_size);
      // put() (not putIfAbsent) is sufficient here given the no-concurrent-calls note above;
      // if a map was already present we keep using it
      BoundedHashMap<Long, Long> existing = delivery_table.put(sender, seqno_set);
      if (existing != null) seqno_set = existing;
    }
    return seqno_set.add(seqno, seqno);
  }

  /** Parks the calling sender thread until the current flush round ends (or the protocol stops). */
  protected void block() {
    send_lock.lock();
    try {
      while (flushing && running) {
        try {
          send_cond.await();
        } catch (InterruptedException e) {
          // interrupted: re-check flushing/running and keep waiting if still flushing
        }
      }
    } finally {
      send_lock.unlock();
    }
  }

  /** Aborts a flush: wakes all blocked senders and unblocks anyone waiting on ack_promise. */
  protected void unblockAll() {
    flushing = false;
    send_lock.lock();
    try {
      send_cond.signalAll();
      ack_promise.setResult(null);
    } finally {
      send_lock.unlock();
    }
  }

  /**
   * Starts the background flusher for the new coordinator (if not already running). Synchronized so
   * only one Flusher is ever created at a time.
   */
  protected synchronized void startFlusher(final Address new_coord) {
    if (flusher == null || !flusher.isAlive()) {
      if (log.isTraceEnabled()) log.trace(local_addr + ": flushing started");
      // causes subsequent message sends (broadcasts and forwards) to block
      // (https://issues.jboss.org/browse/JGRP-1495)
      flushing = true;
      flusher = new Flusher(new_coord);
      flusher.setName("Flusher");
      flusher.start();
    }
  }

  /** Stops a running flusher, repeatedly interrupting it and joining until it has terminated. */
  protected void stopFlusher() {
    flushing = false;
    Thread tmp = flusher;

    while (tmp != null && tmp.isAlive()) {
      tmp.interrupt();
      ack_promise.setResult(null); // unblock the flusher if it's waiting for an ack
      try {
        tmp.join();
      } catch (InterruptedException e) {
        // keep trying until the flusher thread is really gone
      }
    }
  }

  /* ----------------------------- End of Private Methods -------------------------------- */

  /** Background thread which runs {@link SEQUENCER#flush} for a new coordinator. */
  protected class Flusher extends Thread {
    protected final Address new_coord;

    public Flusher(Address new_coord) {
      this.new_coord = new_coord;
    }

    public void run() {
      try {
        flush(new_coord);
      } catch (InterruptedException e) {
        // stopped via stopFlusher(); nothing to do
      }
    }
  }

  /**
   * Protocol header carrying the message type (FORWARD/FLUSH/BCAST/WRAPPED_BCAST), the sender's
   * seqno (used for duplicate detection) and the flush_ack flag used during coordinator changes.
   */
  public static class SequencerHeader extends Header {
    // Member -> coord: please broadcast this message
    protected static final byte FORWARD = 1;
    // Same as FORWARD, but sent while flushing; the resulting broadcast carries flush_ack
    protected static final byte FLUSH = 2;
    // Broadcast sent by the coordinator itself (no wrapping needed)
    protected static final byte BCAST = 3;
    // Broadcast whose payload is the marshalled original message
    protected static final byte WRAPPED_BCAST = 4;

    protected byte type = -1;
    protected long seqno = -1;
    protected boolean flush_ack;

    public SequencerHeader() {}

    public SequencerHeader(byte type) {
      this.type = type;
    }

    public SequencerHeader(byte type, long seqno) {
      this(type);
      this.seqno = seqno;
    }

    public long getSeqno() {
      return seqno;
    }

    public String toString() {
      StringBuilder sb = new StringBuilder(64);
      sb.append(printType());
      if (seqno >= 0) sb.append(" seqno=" + seqno);
      if (flush_ack) sb.append(" (flush_ack)");
      return sb.toString();
    }

    protected final String printType() {
      switch (type) {
        case FORWARD:
          return "FORWARD";
        case FLUSH:
          return "FLUSH";
        case BCAST:
          return "BCAST";
        case WRAPPED_BCAST:
          return "WRAPPED_BCAST";
        default:
          return "n/a";
      }
    }

    public void writeTo(DataOutput out) throws Exception {
      out.writeByte(type);
      Bits.writeLong(seqno, out);
      out.writeBoolean(flush_ack);
    }

    public void readFrom(DataInput in) throws Exception {
      type = in.readByte();
      seqno = Bits.readLong(in);
      flush_ack = in.readBoolean();
    }

    public int size() {
      return Global.BYTE_SIZE + Bits.size(seqno) + Global.BYTE_SIZE; // type + seqno + flush_ack
    }
  }
}
/** * Reliable unicast layer. Uses acknowledgement scheme similar to TCP to provide lossless * transmission of unicast messages (for reliable multicast see NAKACK layer). When a message is * sent to a peer for the first time, we add the pair <peer_addr, Entry> to the hashtable (peer * address is the key). All messages sent to that peer will be added to * hashtable.peer_addr.sent_msgs. When we receive a message from a peer for the first time, another * entry will be created and added to the hashtable (unless already existing). Msgs will then be * added to hashtable.peer_addr.received_msgs. * * <p>This layer is used to reliably transmit point-to-point messages, that is, either messages sent * to a single receiver (vs. messages multicast to a group) or for example replies to a multicast * message. The sender uses an <code>AckSenderWindow</code> which retransmits messages for which it * hasn't received an ACK, the receiver uses <code>AckReceiverWindow</code> which keeps track of the * lowest seqno received so far, and keeps messages in order. * * <p>Messages in both AckSenderWindows and AckReceiverWindows will be removed. A message will be * removed from AckSenderWindow when an ACK has been received for it and messages will be removed * from AckReceiverWindow whenever a message is received: the new message is added and then we try * to remove as many messages as possible (until we stop at a gap, or there are no more messages). * * @author Bela Ban */ @MBean(description = "Reliable unicast layer") public class UNICAST extends Protocol implements AgeOutCache.Handler<Address> { public static final long DEFAULT_FIRST_SEQNO = Global.DEFAULT_FIRST_UNICAST_SEQNO; /* ------------------------------------------ Properties ------------------------------------------ */ @Deprecated protected int[] timeout = { 400, 800, 1600, 3200 }; // for AckSenderWindow: max time to wait for missing acks @Property( description = "Max number of messages to be removed from a retransmit window. 
This property might " + "get removed anytime, so don't use it !") protected int max_msg_batch_size = 500; @Property( description = "Time (in milliseconds) after which an idle incoming or outgoing connection is closed. The " + "connection will get re-established when used again. 0 disables connection reaping") protected long conn_expiry_timeout = 0; @Deprecated @Property( description = "Size (in bytes) of a Segment in the segments table. Only for experts, do not use !", deprecatedMessage = "not used anymore") protected int segment_capacity = 1000; @Property( description = "Number of rows of the matrix in the retransmission table (only for experts)", writable = false) protected int xmit_table_num_rows = 100; @Property( description = "Number of elements of a row of the matrix in the retransmission table (only for experts). " + "The capacity of the matrix is xmit_table_num_rows * xmit_table_msgs_per_row", writable = false) protected int xmit_table_msgs_per_row = 1000; @Property( description = "Resize factor of the matrix in the retransmission table (only for experts)", writable = false) protected double xmit_table_resize_factor = 1.2; @Property( description = "Number of milliseconds after which the matrix in the retransmission table " + "is compacted (only for experts)", writable = false) protected long xmit_table_max_compaction_time = 10 * 60 * 1000; // @Property(description="Max time (in ms) after which a connection to a non-member is closed") protected long max_retransmit_time = 60 * 1000L; @Property( description = "Interval (in milliseconds) at which messages in the send windows are resent") protected long xmit_interval = 2000; /* --------------------------------------------- JMX ---------------------------------------------- */ protected long num_msgs_sent = 0, num_msgs_received = 0; protected long num_acks_sent = 0, num_acks_received = 0, num_xmits = 0; /* --------------------------------------------- Fields ------------------------------------------------ */ 
protected final ConcurrentMap<Address, SenderEntry> send_table = Util.createConcurrentMap(); protected final ConcurrentMap<Address, ReceiverEntry> recv_table = Util.createConcurrentMap(); protected final ReentrantLock recv_table_lock = new ReentrantLock(); /** RetransmitTask running every xmit_interval ms */ protected Future<?> xmit_task; protected volatile List<Address> members = new ArrayList<Address>(11); protected Address local_addr = null; protected TimeScheduler timer = null; // used for retransmissions (passed to AckSenderWindow) protected volatile boolean running = false; protected short last_conn_id = 0; protected AgeOutCache<Address> cache = null; protected Future<?> connection_reaper; // closes idle connections public int[] getTimeout() { return timeout; } @Deprecated @Property( name = "timeout", converter = PropertyConverters.IntegerArray.class, deprecatedMessage = "not used anymore") public void setTimeout(int[] val) { if (val != null) timeout = val; } public void setMaxMessageBatchSize(int size) { if (size >= 1) max_msg_batch_size = size; } @ManagedAttribute public String getLocalAddress() { return local_addr != null ? 
local_addr.toString() : "null"; } @ManagedAttribute public String getMembers() { return members.toString(); } @ManagedAttribute(description = "Whether the ConnectionReaper task is running") public boolean isConnectionReaperRunning() { return connection_reaper != null && !connection_reaper.isDone(); } @ManagedAttribute(description = "Returns the number of outgoing (send) connections") public int getNumSendConnections() { return send_table.size(); } @ManagedAttribute(description = "Returns the number of incoming (receive) connections") public int getNumReceiveConnections() { return recv_table.size(); } @ManagedAttribute( description = "Returns the total number of outgoing (send) and incoming (receive) connections") public int getNumConnections() { return getNumReceiveConnections() + getNumSendConnections(); } @ManagedOperation public String printConnections() { StringBuilder sb = new StringBuilder(); if (!send_table.isEmpty()) { sb.append("\nsend connections:\n"); for (Map.Entry<Address, SenderEntry> entry : send_table.entrySet()) { sb.append(entry.getKey()).append(": ").append(entry.getValue()).append("\n"); } } if (!recv_table.isEmpty()) { sb.append("\nreceive connections:\n"); for (Map.Entry<Address, ReceiverEntry> entry : recv_table.entrySet()) { sb.append(entry.getKey()).append(": ").append(entry.getValue()).append("\n"); } } return sb.toString(); } @ManagedAttribute public long getNumMessagesSent() { return num_msgs_sent; } @ManagedAttribute public long getNumMessagesReceived() { return num_msgs_received; } @ManagedAttribute public long getNumAcksSent() { return num_acks_sent; } @ManagedAttribute public long getNumAcksReceived() { return num_acks_received; } @ManagedAttribute public long getNumXmits() { return num_xmits; } public long getMaxRetransmitTime() { return max_retransmit_time; } @Property( description = "Max number of milliseconds we try to retransmit a message to any given member. After that, " + "the connection is removed. 
Any new connection to that member will start with seqno #1 again. 0 disables this") public void setMaxRetransmitTime(long max_retransmit_time) { this.max_retransmit_time = max_retransmit_time; if (cache != null && max_retransmit_time > 0) cache.setTimeout(max_retransmit_time); } @ManagedAttribute(description = "Is the retransmit task running") public boolean isXmitTaskRunning() { return xmit_task != null && !xmit_task.isDone(); } @ManagedAttribute public int getAgeOutCacheSize() { return cache != null ? cache.size() : 0; } @ManagedOperation public String printAgeOutCache() { return cache != null ? cache.toString() : "n/a"; } public AgeOutCache<Address> getAgeOutCache() { return cache; } /** Used for testing only */ public boolean hasSendConnectionTo(Address dest) { return send_table.containsKey(dest); } /** The number of messages in all Entry.sent_msgs tables (haven't received an ACK yet) */ @ManagedAttribute public int getNumUnackedMessages() { int num = 0; for (SenderEntry entry : send_table.values()) { if (entry.sent_msgs != null) num += entry.sent_msgs.size(); } return num; } @ManagedAttribute public int getNumberOfMessagesInReceiveWindows() { int num = 0; for (ReceiverEntry entry : recv_table.values()) { if (entry.received_msgs != null) num += entry.received_msgs.size(); } return num; } @ManagedAttribute(description = "Total number of undelivered messages in all receive windows") public long getXmitTableUndeliveredMessages() { long retval = 0; for (ReceiverEntry entry : recv_table.values()) { if (entry.received_msgs != null) retval += entry.received_msgs.size(); } return retval; } @ManagedAttribute(description = "Total number of missing messages in all receive windows") public long getXmitTableMissingMessages() { long retval = 0; for (ReceiverEntry entry : recv_table.values()) { if (entry.received_msgs != null) retval += entry.received_msgs.getNumMissing(); } return retval; } @ManagedAttribute(description = "Number of compactions in all (receive and send) 
windows") public int getXmitTableNumCompactions() { int retval = 0; for (ReceiverEntry entry : recv_table.values()) { if (entry.received_msgs != null) retval += entry.received_msgs.getNumCompactions(); } for (SenderEntry entry : send_table.values()) { if (entry.sent_msgs != null) retval += entry.sent_msgs.getNumCompactions(); } return retval; } @ManagedAttribute(description = "Number of moves in all (receive and send) windows") public int getXmitTableNumMoves() { int retval = 0; for (ReceiverEntry entry : recv_table.values()) { if (entry.received_msgs != null) retval += entry.received_msgs.getNumMoves(); } for (SenderEntry entry : send_table.values()) { if (entry.sent_msgs != null) retval += entry.sent_msgs.getNumMoves(); } return retval; } @ManagedAttribute(description = "Number of resizes in all (receive and send) windows") public int getXmitTableNumResizes() { int retval = 0; for (ReceiverEntry entry : recv_table.values()) { if (entry.received_msgs != null) retval += entry.received_msgs.getNumResizes(); } for (SenderEntry entry : send_table.values()) { if (entry.sent_msgs != null) retval += entry.sent_msgs.getNumResizes(); } return retval; } @ManagedAttribute(description = "Number of purges in all (receive and send) windows") public int getXmitTableNumPurges() { int retval = 0; for (ReceiverEntry entry : recv_table.values()) { if (entry.received_msgs != null) retval += entry.received_msgs.getNumPurges(); } for (SenderEntry entry : send_table.values()) { if (entry.sent_msgs != null) retval += entry.sent_msgs.getNumPurges(); } return retval; } @ManagedOperation(description = "Prints the contents of the receive windows for all members") public String printReceiveWindowMessages() { StringBuilder ret = new StringBuilder(local_addr + ":\n"); for (Map.Entry<Address, ReceiverEntry> entry : recv_table.entrySet()) { Address addr = entry.getKey(); Table<Message> buf = entry.getValue().received_msgs; ret.append(addr).append(": ").append(buf.toString()).append('\n'); } 
return ret.toString(); } @ManagedOperation(description = "Prints the contents of the send windows for all members") public String printSendWindowMessages() { StringBuilder ret = new StringBuilder(local_addr + ":\n"); for (Map.Entry<Address, SenderEntry> entry : send_table.entrySet()) { Address addr = entry.getKey(); Table<Message> buf = entry.getValue().sent_msgs; ret.append(addr).append(": ").append(buf.toString()).append('\n'); } return ret.toString(); } public void resetStats() { num_msgs_sent = num_msgs_received = num_acks_sent = num_acks_received = 0; num_xmits = 0; } public Map<String, Object> dumpStats() { Map<String, Object> m = super.dumpStats(); m.put("num_unacked_msgs", getNumUnackedMessages()); m.put("num_msgs_in_recv_windows", getNumberOfMessagesInReceiveWindows()); return m; } public void start() throws Exception { timer = getTransport().getTimer(); if (timer == null) throw new Exception("timer is null"); if (max_retransmit_time > 0) cache = new AgeOutCache<Address>(timer, max_retransmit_time, this); running = true; if (conn_expiry_timeout > 0) startConnectionReaper(); startRetransmitTask(); } public void stop() { running = false; stopRetransmitTask(); stopConnectionReaper(); removeAllConnections(); } public Object up(Event evt) { switch (evt.getType()) { case Event.MSG: Message msg = (Message) evt.getArg(); if (msg.getDest() == null || msg.isFlagSet(Message.Flag.NO_RELIABILITY)) // only handle unicast messages break; // pass up UnicastHeader hdr = (UnicastHeader) msg.getHeader(this.id); if (hdr == null) break; Address sender = msg.getSrc(); switch (hdr.type) { case UnicastHeader.DATA: // received regular message handleDataReceived(sender, hdr.seqno, hdr.conn_id, hdr.first, msg, evt); break; default: handleUpEvent(sender, hdr); break; } return null; } return up_prot.up(evt); // Pass up to the layer above us } protected void handleUpEvent(Address sender, UnicastHeader hdr) { switch (hdr.type) { case UnicastHeader.DATA: // received regular message throw 
        new IllegalStateException(
            "header of type DATA is not supposed to be handled by this method");
      case UnicastHeader.ACK: // received ACK for previously sent message
        handleAckReceived(sender, hdr.seqno, hdr.conn_id);
        break;
      case UnicastHeader.SEND_FIRST_SEQNO:
        handleResendingOfFirstMessage(sender, hdr.seqno);
        break;
      default:
        log.error("UnicastHeader type " + hdr.type + " not known !");
        break;
    }
  }

  /**
   * Handles an incoming unicast batch: strips out all messages carrying our UnicastHeader,
   * dispatches non-DATA headers (ACK / SEND_FIRST_SEQNO) immediately, groups DATA messages by
   * conn-id and hands them to {@link #handleBatchReceived}. Messages without our header (or a
   * null-dest batch) are passed up untouched.
   *
   * @param batch the batch received from the transport; mutated in place (our messages removed)
   */
  public void up(MessageBatch batch) {
    if (batch.dest() == null) { // not a unicast batch
      up_prot.up(batch);
      return;
    }

    int size = batch.size();
    // map of messages, keyed by conn-id; TreeMap keeps conn-ids in ascending order
    Map<Short, List<Message>> msgs = new TreeMap<Short, List<Message>>();
    for (Message msg : batch) {
      if (msg == null || msg.isFlagSet(Message.Flag.NO_RELIABILITY)) continue;
      UnicastHeader hdr = (UnicastHeader) msg.getHeader(id);
      if (hdr == null) continue;
      batch.remove(msg); // remove the message from the batch, so it won't be passed up the stack

      if (hdr.type != UnicastHeader.DATA) {
        try {
          handleUpEvent(msg.getSrc(), hdr);
        } catch (Throwable t) { // we cannot let an exception terminate the processing of this batch
          log.error(local_addr + ": failed handling event", t);
        }
        continue;
      }

      List<Message> list = msgs.get(hdr.conn_id);
      if (list == null) msgs.put(hdr.conn_id, list = new ArrayList<Message>(size));
      list.add(msg);
    }

    if (!msgs.isEmpty()) handleBatchReceived(batch.sender(), msgs); // process msgs:
    if (!batch.isEmpty()) up_prot.up(batch);
  }

  /**
   * Intercepts outgoing unicast messages: assigns the next seqno for the destination's sender
   * window, adds the UnicastHeader, stores the message for retransmission and passes it down.
   * Also reacts to VIEW_CHANGE (closing connections to departed members) and SET_LOCAL_ADDRESS.
   *
   * @param evt the event travelling down the stack
   * @return result of passing the event further down, or null if discarded because not running
   */
  public Object down(Event evt) {
    switch (evt.getType()) {
      case Event.MSG: // Add UnicastHeader, add to AckSenderWindow and pass down
        Message msg = (Message) evt.getArg();
        Address dst = msg.getDest();

        /* only handle unicast messages */
        if (dst == null || msg.isFlagSet(Message.Flag.NO_RELIABILITY)) break;

        if (!running) {
          if (log.isTraceEnabled())
            log.trace("discarded message as start() has not yet been called, message: " + msg);
          return null;
        }

        SenderEntry entry = send_table.get(dst);
        if (entry == null) {
          entry = new SenderEntry(getNewConnectionId());
          // putIfAbsent resolves the race between two threads creating the entry concurrently
          SenderEntry existing = send_table.putIfAbsent(dst, entry);
          if (existing != null) entry = existing;
          else {
            if (log.isTraceEnabled())
              log.trace(
                  local_addr
                      + ": created sender window for "
                      + dst
                      + " (conn-id="
                      + entry.send_conn_id
                      + ")");
            if (cache != null && !members.contains(dst)) cache.add(dst);
          }
        }

        short send_conn_id = entry.send_conn_id;
        long seqno = entry.sent_msgs_seqno.getAndIncrement();
        long sleep = 10;
        // retry with exponential backoff (capped at 5s) until the message is stored or we stop
        do {
          try {
            msg.putHeader(
                this.id,
                UnicastHeader.createDataHeader(seqno, send_conn_id, seqno == DEFAULT_FIRST_SEQNO));
            entry.sent_msgs.add(seqno, msg); // add *including* UnicastHeader, adds to retransmitter
            if (conn_expiry_timeout > 0) entry.update();
            break;
          } catch (Throwable t) {
            if (!running) break;
            Util.sleep(sleep);
            sleep = Math.min(5000, sleep * 2);
          }
        } while (running);

        if (log.isTraceEnabled()) {
          StringBuilder sb = new StringBuilder();
          sb.append(local_addr)
              .append(" --> DATA(")
              .append(dst)
              .append(": #")
              .append(seqno)
              .append(", conn_id=")
              .append(send_conn_id);
          if (seqno == DEFAULT_FIRST_SEQNO) sb.append(", first");
          sb.append(')');
          log.trace(sb);
        }

        num_msgs_sent++;
        return down_prot.down(evt);

      case Event.VIEW_CHANGE: // remove connections to peers that are not members anymore !
        View view = (View) evt.getArg();
        List<Address> new_members = view.getMembers();
        Set<Address> non_members = new HashSet<Address>(send_table.keySet());
        non_members.addAll(recv_table.keySet());
        members = new_members;
        non_members.removeAll(new_members);
        if (cache != null) cache.removeAll(new_members);

        if (!non_members.isEmpty()) {
          if (log.isTraceEnabled()) log.trace("removing non members " + non_members);
          for (Address non_mbr : non_members) removeConnection(non_mbr);
        }
        break;

      case Event.SET_LOCAL_ADDRESS:
        local_addr = (Address) evt.getArg();
        break;
    }

    return down_prot.down(evt); // Pass on to the layer below us
  }

  /**
   * Removes and resets from connection table (which is already locked). Returns true if member was
   * found, otherwise false.
This method is public only so it can be invoked by unit testing, but * should not otherwise be used ! */ public void removeConnection(Address mbr) { removeSendConnection(mbr); removeReceiveConnection(mbr); } public void removeSendConnection(Address mbr) { send_table.remove(mbr); } public void removeReceiveConnection(Address mbr) { recv_table.remove(mbr); } /** * This method is public only so it can be invoked by unit testing, but should not otherwise be * used ! */ @ManagedOperation( description = "Trashes all connections to other nodes. This is only used for testing") public void removeAllConnections() { send_table.clear(); recv_table.clear(); } /** Called by AckSenderWindow to resend messages for which no ACK has been received yet */ public void retransmit(Message msg) { if (log.isTraceEnabled()) { UnicastHeader hdr = (UnicastHeader) msg.getHeader(id); long seqno = hdr != null ? hdr.seqno : -1; log.trace(local_addr + " --> XMIT(" + msg.getDest() + ": #" + seqno + ')'); } down_prot.down(new Event(Event.MSG, msg)); num_xmits++; } /** * Called by AgeOutCache, to removed expired connections * * @param key */ public void expired(Address key) { if (key != null) { if (log.isDebugEnabled()) log.debug("removing connection to " + key + " because it expired"); removeConnection(key); } } /** * Check whether the hashtable contains an entry e for <code>sender</code> (create if not). If * e.received_msgs is null and <code>first</code> is true: create a new AckReceiverWindow(seqno) * and add message. Set e.received_msgs to the new window. Else just add the message. 
   */
  protected void handleDataReceived(
      Address sender, long seqno, short conn_id, boolean first, Message msg, Event evt) {
    if (log.isTraceEnabled()) {
      StringBuilder sb = new StringBuilder();
      sb.append(local_addr).append(" <-- DATA(").append(sender).append(": #").append(seqno);
      if (conn_id != 0) sb.append(", conn_id=").append(conn_id);
      if (first) sb.append(", first");
      sb.append(')');
      log.trace(sb);
    }

    ReceiverEntry entry = getReceiverEntry(sender, seqno, first, conn_id);
    if (entry == null) return;
    if (conn_expiry_timeout > 0) entry.update();
    Table<Message> win = entry.received_msgs;
    boolean added = win.add(seqno, msg); // win is guaranteed to be non-null if we get here
    num_msgs_received++;

    // An OOB message is passed up immediately. Later, when remove() is called, we discard it. This
    // affects ordering !
    // http://jira.jboss.com/jira/browse/JGRP-377
    if (msg.isFlagSet(Message.Flag.OOB) && added) {
      try {
        up_prot.up(evt);
      } catch (Throwable t) {
        log.error("couldn't deliver OOB message " + msg, t);
      }
    }

    final AtomicBoolean processing = win.getProcessing();
    if (!processing.compareAndSet(false, true)) {
      return;
    }

    // try to remove (from the AckReceiverWindow) as many messages as possible as pass them up
    // Prevents concurrent passing up of messages by different threads
    // (http://jira.jboss.com/jira/browse/JGRP-198);
    // this is all the more important once we have a concurrent stack
    // (http://jira.jboss.com/jira/browse/JGRP-181),
    // where lots of threads can come up to this point concurrently, but only 1 is allowed to pass
    // at a time
    // We *can* deliver messages from *different* senders concurrently, e.g. reception of P1, Q1,
    // P2, Q2 can result in
    // delivery of P1, Q1, Q2, P2: FIFO (implemented by UNICAST) says messages need to be delivered
    // only in the
    // order in which they were sent by their senders
    removeAndDeliver(processing, win, sender);
    // NOTE(review): this path acks getHighestDelivered() while the batch path below acks
    // getHighestDeliverable() — confirm whether the asymmetry is intentional
    sendAck(sender, win.getHighestDelivered(), conn_id);
  }

  /**
   * Processes DATA messages grouped by conn-id (built in up(MessageBatch)): adds each message to
   * the sender's receive window, acks the first message of a connection immediately, passes OOB
   * messages up right away, then delivers as many in-order messages as possible and acks.
   *
   * @param sender the peer the batch came from
   * @param map messages keyed by conn-id, in ascending conn-id order
   */
  protected void handleBatchReceived(Address sender, Map<Short, List<Message>> map) {
    for (Map.Entry<Short, List<Message>> element : map.entrySet()) {
      final List<Message> msg_list = element.getValue();
      if (log.isTraceEnabled()) {
        StringBuilder sb = new StringBuilder();
        sb.append(local_addr)
            .append(" <-- DATA(")
            .append(sender)
            .append(": " + printMessageList(msg_list))
            .append(')');
        log.trace(sb);
      }

      short conn_id = element.getKey();
      ReceiverEntry entry = null;
      for (Message msg : msg_list) {
        UnicastHeader hdr = (UnicastHeader) msg.getHeader(id);
        entry = getReceiverEntry(sender, hdr.seqno, hdr.first, conn_id);
        if (entry == null) continue;
        Table<Message> win = entry.received_msgs;
        boolean msg_added =
            win.add(hdr.seqno, msg); // win is guaranteed to be non-null if we get here
        num_msgs_received++;

        if (hdr.first && msg_added)
          sendAck(
              sender,
              hdr.seqno,
              conn_id); // send an ack immediately when we received the first message of a conn

        // An OOB message is passed up immediately. Later, when remove() is called, we discard it.
        // This affects ordering !
        // http://jira.jboss.com/jira/browse/JGRP-377
        if (msg.isFlagSet(Message.Flag.OOB) && msg_added) {
          try {
            up_prot.up(new Event(Event.MSG, msg));
          } catch (Throwable t) {
            log.error("couldn't deliver OOB message " + msg, t);
          }
        }
      }
      if (entry != null && conn_expiry_timeout > 0) entry.update();
    }

    ReceiverEntry entry = recv_table.get(sender);
    Table<Message> win = entry != null ? entry.received_msgs : null;
    if (win != null) {
      final AtomicBoolean processing = win.getProcessing();
      if (processing.compareAndSet(false, true)) {
        removeAndDeliver(processing, win, sender);
        sendAck(sender, win.getHighestDeliverable(), entry.recv_conn_id);
      }
    }
  }

  /**
   * Try to remove as many messages as possible from the table as pass them up. Prevents concurrent
   * passing up of messages by different threads (http://jira.jboss.com/jira/browse/JGRP-198); lots
   * of threads can come up to this point concurrently, but only 1 is allowed to pass at a time. We
   * *can* deliver messages from *different* senders concurrently, e.g. reception of P1, Q1, P2, Q2
   * can result in delivery of P1, Q1, Q2, P2: FIFO (implemented by UNICAST) says messages need to
   * be delivered in the order in which they were sent
   */
  protected int removeAndDeliver(
      final AtomicBoolean processing, Table<Message> win, Address sender) {
    int retval = 0;
    boolean released_processing = false;
    try {
      while (true) {
        List<Message> list = win.removeMany(processing, true, max_msg_batch_size);
        if (list == null) {
          // removeMany() released 'processing' for us when it returns null
          released_processing = true;
          return retval;
        }

        MessageBatch batch = new MessageBatch(local_addr, sender, null, false, list);
        for (Message msg_to_deliver : batch) {
          // discard OOB msg: it has already been delivered
          // (http://jira.jboss.com/jira/browse/JGRP-377)
          if (msg_to_deliver.isFlagSet(Message.Flag.OOB)) batch.remove(msg_to_deliver);
        }

        try {
          if (log.isTraceEnabled()) {
            Message first = batch.first(), last = batch.last();
            StringBuilder sb = new StringBuilder(local_addr + ": delivering");
            if (first != null && last != null) {
              UnicastHeader hdr1 = (UnicastHeader) first.getHeader(id),
                  hdr2 = (UnicastHeader) last.getHeader(id);
              sb.append(" #").append(hdr1.seqno).append(" - #").append(hdr2.seqno);
            }
            sb.append(" (" + batch.size()).append(" messages)");
            log.trace(sb);
          }
          up_prot.up(batch);
        } catch (Throwable t) {
          log.error("failed to deliver batch " + batch, t);
        }
      }
    } finally {
      // processing is always set in
      // win.remove(processing) above and never here ! This code is just
      // a
      // 2nd line of defense should there be an exception before win.remove(processing) sets
      // processing
      if (!released_processing) processing.set(false);
    }
  }

  /**
   * Looks up (or, if {@code first} is set, creates) the receiver entry for {@code sender}. When
   * the message is not the first of a connection and no matching entry/conn-id exists, asks the
   * sender to resend its first message and returns null (the caller drops the message).
   *
   * @return the matching entry, or null if the message should be dropped
   */
  protected ReceiverEntry getReceiverEntry(
      Address sender, long seqno, boolean first, short conn_id) {
    ReceiverEntry entry = recv_table.get(sender);
    if (entry != null && entry.recv_conn_id == conn_id) return entry;

    recv_table_lock.lock();
    try {
      entry = recv_table.get(sender); // re-check under the lock
      if (first) {
        if (entry == null) {
          entry = getOrCreateReceiverEntry(sender, seqno, conn_id);
        } else { // entry != null && win != null
          if (conn_id != entry.recv_conn_id) {
            if (log.isTraceEnabled())
              log.trace(
                  local_addr
                      + ": conn_id="
                      + conn_id
                      + " != "
                      + entry.recv_conn_id
                      + "; resetting receiver window");
            recv_table.remove(sender);
            entry = getOrCreateReceiverEntry(sender, seqno, conn_id);
          } else {
            ;
          }
        }
      } else { // entry == null && win == null OR entry != null && win == null OR entry != null &&
        // win != null
        if (entry == null || entry.recv_conn_id != conn_id) {
          // unlock early: sendRequestForFirstSeqno() must not run while holding the lock
          recv_table_lock.unlock();
          sendRequestForFirstSeqno(sender, seqno); // drops the message and returns (see below)
          return null;
        }
      }
      return entry;
    } finally {
      // the lock may have been released early above, hence the held-check
      if (recv_table_lock.isHeldByCurrentThread()) recv_table_lock.unlock();
    }
  }

  /**
   * Creates a receiver entry (with a retransmission table starting at seqno-1) and installs it via
   * putIfAbsent; if another thread won the race, its entry is returned instead.
   */
  protected ReceiverEntry getOrCreateReceiverEntry(Address sender, long seqno, short conn_id) {
    Table<Message> table =
        new Table<Message>(
            xmit_table_num_rows,
            xmit_table_msgs_per_row,
            seqno - 1,
            xmit_table_resize_factor,
            xmit_table_max_compaction_time);
    ReceiverEntry entry = new ReceiverEntry(table, conn_id);
    ReceiverEntry entry2 = recv_table.putIfAbsent(sender, entry);
    if (entry2 != null) return entry2;
    if (log.isTraceEnabled())
      log.trace(
          local_addr
              + ": created receiver window for "
              + sender
              + " at seqno=#"
              + seqno
              + " for conn-id="
              + conn_id);
    return entry;
  }

  /**
   * Handles an ACK from {@code sender}: purges all messages &lt;= seqno from the matching sender
   * window. ACKs with a stale conn-id are discarded.
   */
  protected void handleAckReceived(Address sender, long seqno, short conn_id) {
    if (log.isTraceEnabled())
      log.trace(
          new StringBuilder()
              .append(local_addr)
              .append(" <-- ACK(")
              .append(sender)
              .append(": #")
              .append(seqno)
              .append(", conn-id=")
              .append(conn_id)
              .append(')'));
    SenderEntry entry = send_table.get(sender);

    if (entry != null && entry.send_conn_id != conn_id) {
      if (log.isTraceEnabled())
        log.trace(
            local_addr
                + ": my conn_id ("
                + entry.send_conn_id
                + ") != received conn_id ("
                + conn_id
                + "); discarding ACK");
      return;
    }

    Table<Message> win = entry != null ? entry.sent_msgs : null;
    if (win != null) {
      win.purge(seqno, true); // removes all messages <= seqno (forced purge)
      num_acks_received++;
    }
  }

  /**
   * We need to resend our first message with our conn_id
   *
   * @param sender
   * @param seqno Resend the non null messages in the range [lowest .. seqno]
   */
  protected void handleResendingOfFirstMessage(Address sender, long seqno) {
    if (log.isTraceEnabled())
      log.trace(local_addr + " <-- SEND_FIRST_SEQNO(" + sender + "," + seqno + ")");
    SenderEntry entry = send_table.get(sender);
    Table<Message> win = entry != null ? entry.sent_msgs : null;
    if (win == null) {
      if (log.isWarnEnabled())
        log.warn(local_addr + ": sender window for " + sender + " not found");
      return;
    }

    boolean first_sent = false;
    for (long i = win.getLow() + 1; i <= seqno; i++) {
      Message rsp = win.get(i);
      if (rsp == null) continue;
      if (first_sent) {
        down_prot.down(new Event(Event.MSG, rsp));
      } else {
        first_sent = true;
        // We need to copy the UnicastHeader and put it back into the message because Message.copy()
        // doesn't copy
        // the headers and therefore we'd modify the original message in the sender retransmission
        // window
        // (https://jira.jboss.org/jira/browse/JGRP-965)
        Message copy = rsp.copy();
        UnicastHeader hdr = (UnicastHeader) copy.getHeader(this.id);
        UnicastHeader newhdr = hdr.copy();
        newhdr.first = true;
        copy.putHeader(this.id, newhdr);
        down_prot.down(new Event(Event.MSG, copy));
      }
    }
  }

  /** Starts the periodic retransmission task unless one is already running. */
  protected void startRetransmitTask() {
    if (xmit_task == null || xmit_task.isDone())
      xmit_task =
          timer.scheduleWithFixedDelay(
              new RetransmitTask(), 0, xmit_interval, TimeUnit.MILLISECONDS);
  }

  protected void stopRetransmitTask() {
    if (xmit_task != null) {
      xmit_task.cancel(true);
      xmit_task = null;
    }
  }

  /** Sends an ACK for {@code seqno} on connection {@code conn_id} to {@code dst}. */
  protected void sendAck(Address dst, long seqno, short conn_id) {
    if (!running) // if we are disconnected, then don't send any acks which throw exceptions on
      // shutdown
      return;
    Message ack =
        new Message(dst)
            .setFlag(Message.Flag.INTERNAL)
            .putHeader(this.id, UnicastHeader.createAckHeader(seqno, conn_id));
    if (log.isTraceEnabled())
      log.trace(
          new StringBuilder()
              .append(local_addr)
              .append(" --> ACK(")
              .append(dst)
              .append(": #")
              .append(seqno)
              .append(')'));
    try {
      down_prot.down(new Event(Event.MSG, ack));
      num_acks_sent++;
    } catch (Throwable t) {
      log.error("failed sending ACK(" + seqno + ") to " + dst, t);
    }
  }

  protected synchronized void startConnectionReaper() {
    if (connection_reaper == null || connection_reaper.isDone())
      connection_reaper =
          timer.scheduleWithFixedDelay(
              new ConnectionReaper(),
              conn_expiry_timeout,
              conn_expiry_timeout,
              TimeUnit.MILLISECONDS);
  }

  protected synchronized void stopConnectionReaper() {
    if (connection_reaper != null) connection_reaper.cancel(false);
  }

  /**
   * Returns the current conn-id and advances it, wrapping back to 0 at Short.MAX_VALUE or on a
   * negative value. Synchronized: last_conn_id is shared mutable state.
   */
  protected synchronized short getNewConnectionId() {
    short retval = last_conn_id;
    if (last_conn_id >= Short.MAX_VALUE || last_conn_id < 0) last_conn_id = 0;
    else last_conn_id++;
    return retval;
  }

  /** Asks {@code dest} to resend its first message (we have no receiver window for it yet). */
  protected void sendRequestForFirstSeqno(Address dest, long seqno_received) {
    Message msg = new Message(dest).setFlag(Message.Flag.OOB, Message.Flag.INTERNAL);
    UnicastHeader hdr = UnicastHeader.createSendFirstSeqnoHeader(seqno_received);
    msg.putHeader(this.id, hdr);
    if (log.isTraceEnabled())
      log.trace(local_addr + " --> SEND_FIRST_SEQNO(" + dest + "," + seqno_received + ")");
    down_prot.down(new Event(Event.MSG, msg));
  }

  @ManagedOperation(
      description = "Closes connections that have been idle for more than conn_expiry_timeout ms")
  public void reapIdleConnections() {
    // remove expired connections from send_table
    for (Map.Entry<Address, SenderEntry> entry : send_table.entrySet()) {
      SenderEntry val = entry.getValue();
      long age = val.age();
      if (age >= conn_expiry_timeout) {
        removeSendConnection(entry.getKey());
        if (log.isDebugEnabled())
          log.debug(
              local_addr
                  + ": removed expired connection for "
                  + entry.getKey()
                  + " ("
                  + age
                  + " ms old) from send_table");
      }
    }

    // remove expired connections from recv_table
    for (Map.Entry<Address, ReceiverEntry> entry : recv_table.entrySet()) {
      ReceiverEntry val = entry.getValue();
      long age = val.age();
      if (age >= conn_expiry_timeout) {
        removeReceiveConnection(entry.getKey());
        if (log.isDebugEnabled())
          log.debug(
              local_addr
                  + ": removed expired connection for "
                  + entry.getKey()
                  + " ("
                  + age
                  + " ms old) from recv_table");
      }
    }
  }

  /** Renders the first and last seqno of a message list, e.g. "#5 - #12", for trace logging. */
  protected String printMessageList(List<Message> list) {
    StringBuilder sb = new StringBuilder();
    int size = list.size();
    Message first = size > 0 ? list.get(0) : null,
        second = size > 1 ?
  list.get(size - 1) : first; // continuation of printMessageList (header on preceding line)
    UnicastHeader hdr;
    if (first != null) {
      hdr = (UnicastHeader) first.getHeader(id);
      if (hdr != null) sb.append("#" + hdr.seqno);
    }
    if (second != null) {
      hdr = (UnicastHeader) second.getHeader(id);
      if (hdr != null) sb.append(" - #" + hdr.seqno);
    }
    return sb.toString();
  }

  /**
   * The following types and fields are serialized:
   *
   * <pre>
   * | DATA | seqno | conn_id | first |
   * | ACK | seqno |
   * | SEND_FIRST_SEQNO |
   * </pre>
   */
  public static class UnicastHeader extends Header {
    public static final byte DATA = 0;
    public static final byte ACK = 1;
    public static final byte SEND_FIRST_SEQNO = 2;

    byte type;
    long seqno; // DATA and ACK
    short conn_id; // DATA
    boolean first; // DATA

    public UnicastHeader() {} // used for externalization

    public static UnicastHeader createDataHeader(long seqno, short conn_id, boolean first) {
      return new UnicastHeader(DATA, seqno, conn_id, first);
    }

    public static UnicastHeader createAckHeader(long seqno, short conn_id) {
      return new UnicastHeader(ACK, seqno, conn_id, false);
    }

    public static UnicastHeader createSendFirstSeqnoHeader(long seqno_received) {
      return new UnicastHeader(SEND_FIRST_SEQNO, seqno_received);
    }

    protected UnicastHeader(byte type, long seqno) {
      this.type = type;
      this.seqno = seqno;
    }

    protected UnicastHeader(byte type, long seqno, short conn_id, boolean first) {
      this.type = type;
      this.seqno = seqno;
      this.conn_id = conn_id;
      this.first = first;
    }

    public long getSeqno() {
      return seqno;
    }

    public String toString() {
      StringBuilder sb = new StringBuilder();
      sb.append(type2Str(type)).append(", seqno=").append(seqno);
      if (conn_id != 0) sb.append(", conn_id=").append(conn_id);
      if (first) sb.append(", first");
      return sb.toString();
    }

    public static String type2Str(byte t) {
      switch (t) {
        case DATA:
          return "DATA";
        case ACK:
          return "ACK";
        case SEND_FIRST_SEQNO:
          return "SEND_FIRST_SEQNO";
        default:
          return "<unknown>";
      }
    }

    /** Serialized size in bytes; must stay consistent with writeTo()/readFrom(). */
    public final int size() {
      int retval = Global.BYTE_SIZE; // type
      switch (type) {
        case DATA:
          retval +=
              Bits.size(seqno) // seqno
                  + Global.SHORT_SIZE // conn_id
                  + Global.BYTE_SIZE; // first
          break;
        case ACK:
          retval += Bits.size(seqno) + Global.SHORT_SIZE; // conn_id
          break;
        case SEND_FIRST_SEQNO:
          retval += Bits.size(seqno);
          break;
      }
      return retval;
    }

    public UnicastHeader copy() {
      return new UnicastHeader(type, seqno, conn_id, first);
    }

    public void writeTo(DataOutput out) throws Exception {
      out.writeByte(type);
      switch (type) {
        case DATA:
          Bits.writeLong(seqno, out);
          out.writeShort(conn_id);
          out.writeBoolean(first);
          break;
        case ACK:
          Bits.writeLong(seqno, out);
          out.writeShort(conn_id);
          break;
        case SEND_FIRST_SEQNO:
          Bits.writeLong(seqno, out);
          break;
      }
    }

    public void readFrom(DataInput in) throws Exception {
      type = in.readByte();
      switch (type) {
        case DATA:
          seqno = Bits.readLong(in);
          conn_id = in.readShort();
          first = in.readBoolean();
          break;
        case ACK:
          seqno = Bits.readLong(in);
          conn_id = in.readShort();
          break;
        case SEND_FIRST_SEQNO:
          seqno = Bits.readLong(in);
          break;
      }
    }
  }

  protected final class SenderEntry { // stores (and retransmits) msgs sent by us to a certain peer
    final Table<Message> sent_msgs;
    final AtomicLong sent_msgs_seqno =
        new AtomicLong(DEFAULT_FIRST_SEQNO); // seqno for msgs sent by us
    final short send_conn_id;
    protected final AtomicLong timestamp = new AtomicLong(0); // last-used time, for reaping
    final Lock lock = new ReentrantLock();

    public SenderEntry(short send_conn_id) {
      this.send_conn_id = send_conn_id;
      this.sent_msgs =
          new Table<Message>(
              xmit_table_num_rows,
              xmit_table_msgs_per_row,
              0,
              xmit_table_resize_factor,
              xmit_table_max_compaction_time);
      update();
    }

    void update() {
      timestamp.set(System.currentTimeMillis());
    }

    long age() {
      return System.currentTimeMillis() - timestamp.longValue();
    }

    public String toString() {
      StringBuilder sb = new StringBuilder();
      if (sent_msgs != null) sb.append(sent_msgs).append(", ");
      sb.append("send_conn_id=" + send_conn_id).append(" (" + age() + " ms old)");
      return sb.toString();
    }
  }

  protected static final class ReceiverEntry {
    protected final Table<Message>
        received_msgs; // stores all msgs rcvd by a certain peer in seqno-order
    protected final short recv_conn_id;
    protected final AtomicLong timestamp = new AtomicLong(0); // last-used time, for reaping

    public ReceiverEntry(Table<Message> received_msgs, short recv_conn_id) {
      this.received_msgs = received_msgs;
      this.recv_conn_id = recv_conn_id;
      update();
    }

    void update() {
      timestamp.set(System.currentTimeMillis());
    }

    long age() {
      return System.currentTimeMillis() - timestamp.longValue();
    }

    public String toString() {
      StringBuilder sb = new StringBuilder();
      if (received_msgs != null) sb.append(received_msgs).append(", ");
      sb.append("recv_conn_id=" + recv_conn_id);
      sb.append(" (" + age() + " ms old)");
      return sb.toString();
    }
  }

  /** Periodic task (scheduled by startConnectionReaper()) that closes idle connections. */
  protected class ConnectionReaper implements Runnable {
    public void run() {
      reapIdleConnections();
    }

    public String toString() {
      return UNICAST.class.getSimpleName()
          + ": ConnectionReaper (interval="
          + conn_expiry_timeout
          + " ms)";
    }
  }

  /**
   * Retransmitter task which periodically (every xmit_interval ms) looks at all the retransmit
   * (send) tables and re-sends messages for which we haven't received an ack yet
   */
  protected class RetransmitTask implements Runnable {

    public void run() {
      for (SenderEntry val : send_table.values()) {
        Table<Message> buf = val != null ? val.sent_msgs : null;
        if (buf != null && !buf.isEmpty()) {
          long from = buf.getHighestDelivered() + 1, to = buf.getHighestReceived();
          List<Message> list = buf.get(from, to);
          if (list != null) {
            for (Message msg : list) retransmit(msg);
          }
        }
      }
    }

    public String toString() {
      return UNICAST.class.getSimpleName()
          + ": RetransmitTask (interval="
          + xmit_interval
          + " ms)";
    }
  }
}
/**
 * Reliable unicast layer. Implemented with negative acks. Every sender keeps its messages in an
 * AckSenderWindow. A receiver stores incoming messages in a NakReceiverWindow, and asks the sender
 * for retransmission if a gap is detected. Every now and then (stable_interval), a timer task sends
 * a STABLE message to all senders, including the highest received and delivered seqnos. A sender
 * purges messages lower than highest delivered and asks the STABLE sender for messages it might
 * have missed (smaller than highest received). A STABLE message can also be sent when a receiver
 * has received more than max_bytes from a given sender.
 *
 * <p>The advantage of this protocol over {@link org.jgroups.protocols.UNICAST} is that it doesn't
 * send acks for every message. Instead, it sends 'acks' after receiving max_bytes and/ or
 * periodically (stable_interval).
 *
 * @author Bela Ban
 */
@MBean(description = "Reliable unicast layer")
public class UNICAST2 extends Protocol implements AgeOutCache.Handler<Address> {
  public static final long DEFAULT_FIRST_SEQNO = Global.DEFAULT_FIRST_UNICAST_SEQNO;

  /* ------------------------------------------ Properties  ------------------------------------------ */

  @Deprecated
  protected int[] timeout = {
    400, 800, 1600, 3200
  }; // for NakSenderWindow: max time to wait for missing acks

  /**
   * The first value (in milliseconds) to use in the exponential backoff retransmission mechanism.
   * Only enabled if the value is > 0
   */
  @Deprecated
  @Property(
      description =
          "The first value (in milliseconds) to use in the exponential backoff. Enabled if greater than 0",
      deprecatedMessage = "Not used anymore")
  protected int exponential_backoff = 300;

  @Property(
      description =
          "Max number of messages to be removed from a NakReceiverWindow. This property might "
              + "get removed anytime, so don't use it !")
  protected int max_msg_batch_size = 500;

  @Property(description = "Max number of bytes before a stability message is sent to the sender")
  protected long max_bytes = 10000000;

  @Property(
      description =
          "Max number of milliseconds before a stability message is sent to the sender(s)")
  protected long stable_interval = 60000L;

  @Property(
      description =
          "Max number of STABLE messages sent for the same highest_received seqno. A value < 1 is invalid")
  protected int max_stable_msgs = 5;

  @Property(
      description = "Number of rows of the matrix in the retransmission table (only for experts)",
      writable = false)
  protected int xmit_table_num_rows = 100;

  @Property(
      description =
          "Number of elements of a row of the matrix in the retransmission table (only for experts). "
              + "The capacity of the matrix is xmit_table_num_rows * xmit_table_msgs_per_row",
      writable = false)
  protected int xmit_table_msgs_per_row = 2000;

  @Property(
      description = "Resize factor of the matrix in the retransmission table (only for experts)",
      writable = false)
  protected double xmit_table_resize_factor = 1.2;

  @Property(
      description =
          "Number of milliseconds after which the matrix in the retransmission table "
              + "is compacted (only for experts)",
      writable = false)
  protected long xmit_table_max_compaction_time = 10 * 60 * 1000;

  @Deprecated
  @Property(
      description =
          "If enabled, the removal of a message from the retransmission table causes an "
              + "automatic purge (only for experts)",
      writable = false,
      deprecatedMessage = "not used anymore")
  protected boolean xmit_table_automatic_purging = true;

  @Property(
      description =
          "Whether to use the old retransmitter which retransmits individual messages or the new one "
              + "which uses ranges of retransmitted messages. Default is true. Note that this property will be removed in 3.0; "
              + "it is only used to switch back to the old (and proven) retransmitter mechanism if issues occur")
  protected boolean use_range_based_retransmitter = true;

  @Property(
      description =
          "Time (in milliseconds) after which an idle incoming or outgoing connection is closed. The "
              + "connection will get re-established when used again. 0 disables connection reaping")
  protected long conn_expiry_timeout = 60000;

  @Property(
      description =
          "Interval (in milliseconds) at which missing messages (from all retransmit buffers) "
              + "are retransmitted")
  protected long xmit_interval = 1000;

  /* --------------------------------------------- JMX  ---------------------------------------------- */

  protected long num_msgs_sent = 0, num_msgs_received = 0;

  /* --------------------------------------------- Fields ------------------------------------------------ */

  // outgoing (send) and incoming (receive) connection state, one entry per peer
  protected final ConcurrentMap<Address, SenderEntry> send_table = Util.createConcurrentMap();
  protected final ConcurrentMap<Address, ReceiverEntry> recv_table = Util.createConcurrentMap();

  /** RetransmitTask running every xmit_interval ms */
  protected Future<?> xmit_task;

  protected final ReentrantLock recv_table_lock = new ReentrantLock();

  protected volatile List<Address> members = new ArrayList<Address>(11);

  protected Address local_addr = null;

  protected TimeScheduler timer = null; // used for retransmissions (passed to AckSenderWindow)

  protected volatile boolean running = false;

  protected short last_conn_id = 0;

  protected long max_retransmit_time = 60 * 1000L;

  protected AgeOutCache<Address> cache = null;

  protected Future<?> stable_task_future =
      null; // bcasts periodic STABLE message (added to timer below)

  protected Future<?> connection_reaper; // closes idle connections

  public int[] getTimeout() {
    return timeout;
  }

  @Deprecated
  @Property(
      name = "timeout",
      converter = PropertyConverters.IntegerArray.class,
      description = "list of timeouts",
      deprecatedMessage = "not used anymore")
  public
void setTimeout(int[] val) { if (val != null) timeout = val; } public void setMaxMessageBatchSize(int size) { if (size >= 1) max_msg_batch_size = size; } @ManagedAttribute public String getLocalAddress() { return local_addr != null ? local_addr.toString() : "null"; } @ManagedAttribute public String getMembers() { return members.toString(); } @ManagedAttribute(description = "Returns the number of outgoing (send) connections") public int getNumSendConnections() { return send_table.size(); } @ManagedAttribute(description = "Returns the number of incoming (receive) connections") public int getNumReceiveConnections() { return recv_table.size(); } @ManagedAttribute( description = "Returns the total number of outgoing (send) and incoming (receive) connections") public int getNumConnections() { return getNumReceiveConnections() + getNumSendConnections(); } @ManagedOperation public String printConnections() { StringBuilder sb = new StringBuilder(); if (!send_table.isEmpty()) { sb.append("send connections:\n"); for (Map.Entry<Address, SenderEntry> entry : send_table.entrySet()) { sb.append(entry.getKey()).append(": ").append(entry.getValue()).append("\n"); } } if (!recv_table.isEmpty()) { sb.append("\nreceive connections:\n"); for (Map.Entry<Address, ReceiverEntry> entry : recv_table.entrySet()) { sb.append(entry.getKey()).append(": ").append(entry.getValue()).append("\n"); } } return sb.toString(); } @ManagedAttribute(description = "Whether the ConnectionReaper task is running") public boolean isConnectionReaperRunning() { return connection_reaper != null && !connection_reaper.isDone(); } @ManagedAttribute public long getNumMessagesSent() { return num_msgs_sent; } @ManagedAttribute public long getNumMessagesReceived() { return num_msgs_received; } @ManagedAttribute(description = "Total number of undelivered messages in all receive windows") public long getXmitTableUndeliveredMessages() { long retval = 0; for (ReceiverEntry entry : recv_table.values()) { if 
(entry.received_msgs != null) retval += entry.received_msgs.size(); } return retval; } @ManagedAttribute(description = "Total number of missing messages in all receive windows") public long getXmitTableMissingMessages() { long retval = 0; for (ReceiverEntry entry : recv_table.values()) { if (entry.received_msgs != null) retval += entry.received_msgs.getNumMissing(); } return retval; } @ManagedAttribute(description = "Number of compactions in all (receive and send) windows") public int getXmitTableNumCompactions() { int retval = 0; for (ReceiverEntry entry : recv_table.values()) { if (entry.received_msgs != null) retval += entry.received_msgs.getNumCompactions(); } for (SenderEntry entry : send_table.values()) { if (entry.sent_msgs != null) retval += entry.sent_msgs.getNumCompactions(); } return retval; } @ManagedAttribute(description = "Number of moves in all (receive and send) windows") public int getXmitTableNumMoves() { int retval = 0; for (ReceiverEntry entry : recv_table.values()) { if (entry.received_msgs != null) retval += entry.received_msgs.getNumMoves(); } for (SenderEntry entry : send_table.values()) { if (entry.sent_msgs != null) retval += entry.sent_msgs.getNumMoves(); } return retval; } @ManagedAttribute(description = "Number of resizes in all (receive and send) windows") public int getXmitTableNumResizes() { int retval = 0; for (ReceiverEntry entry : recv_table.values()) { if (entry.received_msgs != null) retval += entry.received_msgs.getNumResizes(); } for (SenderEntry entry : send_table.values()) { if (entry.sent_msgs != null) retval += entry.sent_msgs.getNumResizes(); } return retval; } @ManagedAttribute(description = "Number of purges in all (receive and send) windows") public int getXmitTableNumPurges() { int retval = 0; for (ReceiverEntry entry : recv_table.values()) { if (entry.received_msgs != null) retval += entry.received_msgs.getNumPurges(); } for (SenderEntry entry : send_table.values()) { if (entry.sent_msgs != null) retval += 
entry.sent_msgs.getNumPurges(); } return retval; } @ManagedOperation(description = "Prints the contents of the receive windows for all members") public String printReceiveWindowMessages() { StringBuilder ret = new StringBuilder(local_addr + ":\n"); for (Map.Entry<Address, ReceiverEntry> entry : recv_table.entrySet()) { Address addr = entry.getKey(); Table<Message> buf = entry.getValue().received_msgs; ret.append(addr).append(": ").append(buf.toString()).append('\n'); } return ret.toString(); } @ManagedOperation(description = "Prints the contents of the send windows for all members") public String printSendWindowMessages() { StringBuilder ret = new StringBuilder(local_addr + ":\n"); for (Map.Entry<Address, SenderEntry> entry : send_table.entrySet()) { Address addr = entry.getKey(); Table<Message> buf = entry.getValue().sent_msgs; ret.append(addr).append(": ").append(buf.toString()).append('\n'); } return ret.toString(); } @ManagedAttribute(description = "Number of retransmit requests received") protected final AtomicLong xmit_reqs_received = new AtomicLong(0); @ManagedAttribute(description = "Number of retransmit requests sent") protected final AtomicLong xmit_reqs_sent = new AtomicLong(0); @ManagedAttribute(description = "Number of retransmit responses sent") protected final AtomicLong xmit_rsps_sent = new AtomicLong(0); @ManagedAttribute(description = "Is the retransmit task running") public boolean isXmitTaskRunning() { return xmit_task != null && !xmit_task.isDone(); } public long getMaxRetransmitTime() { return max_retransmit_time; } @Property( description = "Max number of milliseconds we try to retransmit a message to any given member. After that, " + "the connection is removed. Any new connection to that member will start with seqno #1 again. 
0 disables this") public void setMaxRetransmitTime(long max_retransmit_time) { this.max_retransmit_time = max_retransmit_time; if (cache != null && max_retransmit_time > 0) cache.setTimeout(max_retransmit_time); } @ManagedAttribute public int getAgeOutCacheSize() { return cache != null ? cache.size() : 0; } @ManagedOperation public String printAgeOutCache() { return cache != null ? cache.toString() : "n/a"; } public AgeOutCache<Address> getAgeOutCache() { return cache; } public void resetStats() { num_msgs_sent = num_msgs_received = 0; xmit_reqs_received.set(0); xmit_reqs_sent.set(0); xmit_rsps_sent.set(0); } public TimeScheduler getTimer() { return timer; } /** * Only used for unit tests, don't use ! * * @param timer */ public void setTimer(TimeScheduler timer) { this.timer = timer; } public void init() throws Exception { super.init(); if (max_stable_msgs < 1) throw new IllegalArgumentException("max_stable_msgs ( " + max_stable_msgs + ") must be > 0"); if (max_bytes <= 0) throw new IllegalArgumentException("max_bytes has to be > 0"); } public void start() throws Exception { timer = getTransport().getTimer(); if (timer == null) throw new Exception("timer is null"); if (max_retransmit_time > 0) cache = new AgeOutCache<Address>(timer, max_retransmit_time, this); running = true; if (stable_interval > 0) startStableTask(); if (conn_expiry_timeout > 0) startConnectionReaper(); startRetransmitTask(); } public void stop() { running = false; stopStableTask(); stopConnectionReaper(); stopRetransmitTask(); removeAllConnections(); } public Object up(Event evt) { Message msg; Address dst, src; Unicast2Header hdr; switch (evt.getType()) { case Event.MSG: msg = (Message) evt.getArg(); dst = msg.getDest(); if (dst == null || msg.isFlagSet(Message.NO_RELIABILITY)) // only handle unicast messages break; // pass up // changed from removeHeader(): we cannot remove the header because if we do loopback=true // at the // transport level, we will not have the header on retransmit ! 
(bela Aug 22 2006) hdr = (Unicast2Header) msg.getHeader(this.id); if (hdr == null) break; src = msg.getSrc(); switch (hdr.type) { case Unicast2Header.DATA: // received regular message handleDataReceived(src, hdr.seqno, hdr.conn_id, hdr.first, msg, evt); return null; // we pass the deliverable message up in handleDataReceived() case Unicast2Header.XMIT_REQ: // received ACK for previously sent message handleXmitRequest(src, (SeqnoList) msg.getObject()); break; case Unicast2Header.SEND_FIRST_SEQNO: handleResendingOfFirstMessage(src, hdr.seqno); break; case Unicast2Header.STABLE: stable(msg.getSrc(), hdr.conn_id, hdr.seqno, hdr.high_seqno); break; default: log.error("UnicastHeader type " + hdr.type + " not known !"); break; } return null; } return up_prot.up(evt); // Pass up to the layer above us } public Object down(Event evt) { switch (evt.getType()) { case Event.MSG: // Add UnicastHeader, add to AckSenderWindow and pass down Message msg = (Message) evt.getArg(); Address dst = msg.getDest(); /* only handle unicast messages */ if (dst == null || msg.isFlagSet(Message.NO_RELIABILITY)) break; if (!running) { if (log.isTraceEnabled()) log.trace("discarded message as start() has not yet been called, message: " + msg); return null; } SenderEntry entry = send_table.get(dst); if (entry == null) { entry = new SenderEntry(getNewConnectionId()); SenderEntry existing = send_table.putIfAbsent(dst, entry); if (existing != null) entry = existing; else { if (log.isTraceEnabled()) log.trace( local_addr + ": created connection to " + dst + " (conn_id=" + entry.send_conn_id + ")"); if (cache != null && !members.contains(dst)) cache.add(dst); } } short send_conn_id = entry.send_conn_id; long seqno = entry.sent_msgs_seqno.getAndIncrement(); long sleep = 10; while (running) { try { msg.putHeader( this.id, Unicast2Header.createDataHeader(seqno, send_conn_id, seqno == DEFAULT_FIRST_SEQNO)); entry.sent_msgs.add(seqno, msg); // add *including* UnicastHeader, adds to retransmitter if 
(conn_expiry_timeout > 0) entry.update(); break; } catch (Throwable t) { if (!running) break; if (log.isWarnEnabled()) log.warn("failed sending message", t); Util.sleep(sleep); sleep = Math.min(5000, sleep * 2); } } if (log.isTraceEnabled()) { StringBuilder sb = new StringBuilder(); sb.append(local_addr) .append(" --> DATA(") .append(dst) .append(": #") .append(seqno) .append(", conn_id=") .append(send_conn_id); if (seqno == DEFAULT_FIRST_SEQNO) sb.append(", first"); sb.append(')'); log.trace(sb); } try { down_prot.down(evt); num_msgs_sent++; } catch (Throwable t) { log.warn("failed sending the message", t); } return null; // we already passed the msg down case Event.VIEW_CHANGE: // remove connections to peers that are not members anymore ! View view = (View) evt.getArg(); List<Address> new_members = view.getMembers(); Set<Address> non_members = new HashSet<Address>(send_table.keySet()); non_members.addAll(recv_table.keySet()); members = new_members; non_members.removeAll(new_members); if (cache != null) cache.removeAll(new_members); if (!non_members.isEmpty()) { if (log.isTraceEnabled()) log.trace("removing non members " + non_members); for (Address non_mbr : non_members) removeConnection(non_mbr); } break; case Event.SET_LOCAL_ADDRESS: local_addr = (Address) evt.getArg(); break; } return down_prot.down(evt); // Pass on to the layer below us } /** * Purge all messages in window for local_addr, which are <= low. Check if the window's highest * received message is > high: if true, retransmit all messages from high - win.high to sender * * @param sender * @param hd Highest delivered seqno * @param hr Highest received seqno */ protected void stable(Address sender, short conn_id, long hd, long hr) { SenderEntry entry = send_table.get(sender); Table<Message> win = entry != null ? 
entry.sent_msgs : null; if (win == null) return; if (log.isTraceEnabled()) log.trace( new StringBuilder() .append(local_addr) .append(" <-- STABLE(") .append(sender) .append(": ") .append(hd) .append("-") .append(hr) .append(", conn_id=" + conn_id) + ")"); if (entry.send_conn_id != conn_id) { log.warn( local_addr + ": my conn_id (" + entry.send_conn_id + ") != received conn_id (" + conn_id + "); discarding STABLE message !"); return; } win.purge(hd, true); long win_hr = win.getHighestReceived(); if (win_hr > hr) { for (long seqno = hr; seqno <= win_hr; seqno++) { Message msg = win.get( seqno); // destination is still the same (the member which sent the STABLE message) if (msg != null) down_prot.down(new Event(Event.MSG, msg)); } } } @ManagedOperation( description = "Sends a STABLE message to all senders. This causes message purging and potential" + " retransmissions from senders") public void sendStableMessages() { for (Map.Entry<Address, ReceiverEntry> entry : recv_table.entrySet()) { Address dest = entry.getKey(); ReceiverEntry val = entry.getValue(); Table<Message> win = val != null ? 
val.received_msgs : null;
      if (win != null) {
        long[] tmp = win.getDigest();
        long low = tmp[0], high = tmp[1];
        // Throttle: if the highest received seqno hasn't advanced since last time, send at most
        // max_stable_msgs STABLE messages for that same seqno, then go quiet until it moves.
        if (val.last_highest == high) {
          if (val.num_stable_msgs >= max_stable_msgs) {
            continue;
          } else val.num_stable_msgs++;
        } else {
          val.last_highest = high;
          val.num_stable_msgs = 1;
        }
        sendStableMessage(dest, val.recv_conn_id, low, high);
      }
    }
  }

  // Sends a STABLE(hd, hr) message (OOB) to dest: hd = highest delivered, hr = highest received.
  // Per stable(), the receiver purges its send window up to hd and retransmits anything it has
  // above hr.
  protected void sendStableMessage(Address dest, short conn_id, long hd, long hr) {
    Message stable_msg = new Message(dest, null, null);
    Unicast2Header hdr = Unicast2Header.createStableHeader(conn_id, hd, hr);
    stable_msg.putHeader(this.id, hdr);
    stable_msg.setFlag(Message.OOB);
    if (log.isTraceEnabled()) {
      StringBuilder sb = new StringBuilder();
      sb.append(local_addr)
          .append(" --> STABLE(")
          .append(dest)
          .append(": ")
          .append(hd)
          .append("-")
          .append(hr)
          .append(", conn_id=")
          .append(conn_id)
          .append(")");
      log.trace(sb.toString());
    }
    down_prot.down(new Event(Event.MSG, stable_msg));
  }

  // Schedules the periodic STABLE-message task (every stable_interval ms); no-op if one is
  // already running.
  protected void startStableTask() {
    if (stable_task_future == null || stable_task_future.isDone()) {
      final Runnable stable_task =
          new Runnable() {
            public void run() {
              try {
                sendStableMessages();
              } catch (Throwable t) {
                log.error("sending of STABLE messages failed", t);
              }
            }
          };
      stable_task_future =
          timer.scheduleWithFixedDelay(
              stable_task, stable_interval, stable_interval, TimeUnit.MILLISECONDS);
      if (log.isTraceEnabled()) log.trace("stable task started");
    }
  }

  protected void stopStableTask() {
    if (stable_task_future != null) {
      stable_task_future.cancel(false);
      stable_task_future = null;
    }
  }

  protected synchronized void startConnectionReaper() {
    if (connection_reaper == null || connection_reaper.isDone())
      connection_reaper =
          timer.scheduleWithFixedDelay(
              new ConnectionReaper(),
              conn_expiry_timeout,
              conn_expiry_timeout,
              TimeUnit.MILLISECONDS);
  }

  protected synchronized void stopConnectionReaper() {
    if (connection_reaper != null) connection_reaper.cancel(false);
  }

  /**
   * Removes and resets from connection table (which is already locked). 
This method is public only
   * so it can be invoked by unit testing, but should not otherwise be used !
   */
  public void removeConnection(Address mbr) {
    // Drop both the sender-side and the receiver-side state kept for mbr.
    removeSendConnection(mbr);
    removeReceiveConnection(mbr);
  }

  public void removeSendConnection(Address mbr) {
    send_table.remove(mbr);
  }

  public void removeReceiveConnection(Address mbr) {
    ReceiverEntry entry2 = recv_table.remove(mbr);
    if (entry2 != null) {
      Table<Message> win = entry2.received_msgs;
      // Tell the peer what we delivered/received so far, so it can purge its send window
      // (and retransmit anything above our highest-received) before the connection goes away.
      if (win != null)
        sendStableMessage(
            mbr, entry2.recv_conn_id, win.getHighestDelivered(), win.getHighestReceived());
      entry2.reset();
    }
  }

  /**
   * This method is public only so it can be invoked by unit testing, but should not otherwise be
   * used !
   */
  @ManagedOperation(
      description = "Trashes all connections to other nodes. This is only used for testing")
  public void removeAllConnections() {
    send_table.clear();
    sendStableMessages();
    for (ReceiverEntry entry2 : recv_table.values()) entry2.reset();
    recv_table.clear();
  }

  // Sends an XMIT_REQ (OOB) to 'sender', asking it to retransmit the seqnos listed in 'missing'.
  // Invoked by the RetransmitTask when gaps are detected in a receive window.
  public void retransmit(SeqnoList missing, Address sender) {
    Unicast2Header hdr = Unicast2Header.createXmitReqHeader();
    Message retransmit_msg = new Message(sender, null, missing);
    retransmit_msg.setFlag(Message.OOB);
    if (log.isTraceEnabled())
      log.trace(local_addr + ": sending XMIT_REQ (" + missing + ") to " + sender);
    retransmit_msg.putHeader(this.id, hdr);
    down_prot.down(new Event(Event.MSG, retransmit_msg));
    xmit_reqs_sent.addAndGet(missing.size());
  }

  /**
   * Called by AgeOutCache to remove expired connections
   *
   * @param key the address whose connection expired; ignored if null
   */
  public void expired(Address key) {
    if (key != null) {
      if (log.isDebugEnabled()) log.debug("removing connection to " + key + " because it expired");
      removeConnection(key);
    }
  }

  /**
   * Check whether the hashmap contains an entry e for <code>sender</code> (create if not). If
   * e.received_msgs is null and <code>first</code> is true: create a new AckReceiverWindow(seqno)
   * and add message. Set e.received_msgs to the new window. 
Else just add the message. */ protected void handleDataReceived( Address sender, long seqno, short conn_id, boolean first, Message msg, Event evt) { if (log.isTraceEnabled()) { StringBuilder sb = new StringBuilder(); sb.append(local_addr).append(" <-- DATA(").append(sender).append(": #").append(seqno); if (conn_id != 0) sb.append(", conn_id=").append(conn_id); if (first) sb.append(", first"); sb.append(')'); log.trace(sb); } ReceiverEntry entry = getReceiverEntry(sender, seqno, first, conn_id); if (entry == null) return; if (conn_expiry_timeout > 0) entry.update(); Table<Message> win = entry.received_msgs; boolean added = win.add(seqno, msg); // win is guaranteed to be non-null if we get here num_msgs_received++; if (added) { int len = msg.getLength(); if (len > 0 && entry.incrementStable(len)) sendStableMessage( sender, entry.recv_conn_id, win.getHighestDelivered(), win.getHighestReceived()); } // An OOB message is passed up immediately. Later, when remove() is called, we discard it. This // affects ordering ! // http://jira.jboss.com/jira/browse/JGRP-377 if (msg.isFlagSet(Message.OOB) && added) { try { up_prot.up(evt); } catch (Throwable t) { log.error("couldn't deliver OOB message " + msg, t); } } final AtomicBoolean processing = win.getProcessing(); if (!processing.compareAndSet(false, true)) { return; } // Try to remove as many messages as possible and pass them up. // Prevents concurrent passing up of messages by different threads // (http://jira.jboss.com/jira/browse/JGRP-198); // this is all the more important once we have a concurrent stack // (http://jira.jboss.com/jira/browse/JGRP-181), // where lots of threads can come up to this point concurrently, but only 1 is allowed to pass // at a time // We *can* deliver messages from *different* senders concurrently, e.g. 
reception of P1, Q1, // P2, Q2 can result in // delivery of P1, Q1, Q2, P2: FIFO (implemented by UNICAST) says messages need to be delivered // only in the // order in which they were sent by their senders boolean released_processing = false; try { while (true) { List<Message> msgs = win.removeMany(processing, true, max_msg_batch_size); // remove my own messages if (msgs == null || msgs.isEmpty()) { released_processing = true; return; } for (Message m : msgs) { // discard OOB msg: it has already been delivered // (http://jira.jboss.com/jira/browse/JGRP-377) if (m.isFlagSet(Message.OOB)) continue; try { up_prot.up(new Event(Event.MSG, m)); } catch (Throwable t) { log.error("couldn't deliver message " + m, t); } } } } finally { // processing is always set in win.remove(processing) above and never here ! This code is just // a // 2nd line of defense should there be an exception before win.remove(processing) sets // processing if (!released_processing) processing.set(false); } } protected ReceiverEntry getReceiverEntry( Address sender, long seqno, boolean first, short conn_id) { ReceiverEntry entry = recv_table.get(sender); if (entry != null && entry.recv_conn_id == conn_id) return entry; recv_table_lock.lock(); try { entry = recv_table.get(sender); if (first) { if (entry == null) { entry = getOrCreateReceiverEntry(sender, seqno, conn_id); } else { // entry != null && win != null if (conn_id != entry.recv_conn_id) { if (log.isTraceEnabled()) log.trace( local_addr + ": conn_id=" + conn_id + " != " + entry.recv_conn_id + "; resetting receiver window"); recv_table.remove(sender); entry = getOrCreateReceiverEntry(sender, seqno, conn_id); } else {; } } } else { // entry == null && win == null OR entry != null && win == null OR entry != null && // win != null if (entry == null || entry.recv_conn_id != conn_id) { recv_table_lock.unlock(); sendRequestForFirstSeqno(sender, seqno); // drops the message and returns (see below) return null; } } return entry; } finally { if 
(recv_table_lock.isHeldByCurrentThread()) recv_table_lock.unlock(); } } protected ReceiverEntry getOrCreateReceiverEntry(Address sender, long seqno, short conn_id) { Table<Message> table = new Table<Message>( xmit_table_num_rows, xmit_table_msgs_per_row, seqno - 1, xmit_table_resize_factor, xmit_table_max_compaction_time); ReceiverEntry entry = new ReceiverEntry(table, conn_id); ReceiverEntry entry2 = recv_table.putIfAbsent(sender, entry); if (entry2 != null) return entry2; if (log.isTraceEnabled()) log.trace( local_addr + ": created receiver window for " + sender + " at seqno=#" + seqno + " for conn-id=" + conn_id); return entry; } protected void handleXmitRequest(Address sender, SeqnoList missing) { if (log.isTraceEnabled()) log.trace( new StringBuilder() .append(local_addr) .append(" <-- XMIT(") .append(sender) .append(": #") .append(missing) .append(')')); SenderEntry entry = send_table.get(sender); xmit_reqs_received.addAndGet(missing.size()); Table<Message> win = entry != null ? entry.sent_msgs : null; if (win != null) { for (long seqno : missing) { Message msg = win.get(seqno); if (msg == null) { if (log.isWarnEnabled() && !local_addr.equals(sender)) { StringBuilder sb = new StringBuilder(); sb.append("(requester=").append(sender).append(", local_addr=").append(this.local_addr); sb.append(") message ").append(sender).append("::").append(seqno); sb.append(" not found in retransmission table of ") .append(sender) .append(":\n") .append(win); log.warn(sb.toString()); } continue; } down_prot.down(new Event(Event.MSG, msg)); xmit_rsps_sent.incrementAndGet(); } } } /** * We need to resend our first message with our conn_id * * @param sender * @param seqno Resend the non null messages in the range [lowest .. 
seqno]
   */
  protected void handleResendingOfFirstMessage(Address sender, long seqno) {
    if (log.isTraceEnabled())
      log.trace(local_addr + " <-- SEND_FIRST_SEQNO(" + sender + "," + seqno + ")");
    SenderEntry entry = send_table.get(sender);
    Table<Message> win = entry != null ? entry.sent_msgs : null;
    if (win == null) {
      if (log.isErrorEnabled())
        log.error(local_addr + ": sender window for " + sender + " not found");
      return;
    }

    boolean first_sent = false;
    // Walk the send window from just above the lowest (purged) seqno up to the seqno the
    // receiver reported; the first non-null message is resent with its 'first' flag forced to
    // true so the receiver can (re)create its receive window.
    for (long i = win.getLow() + 1; i <= seqno; i++) {
      Message rsp = win.get(i);
      if (rsp == null) continue;
      if (first_sent) {
        down_prot.down(new Event(Event.MSG, rsp));
      } else {
        first_sent = true;
        // We need to copy the UnicastHeader and put it back into the message because
        // Message.copy() doesn't copy the headers and therefore we'd modify the original message
        // in the sender retransmission window (https://jira.jboss.org/jira/browse/JGRP-965)
        Message copy = rsp.copy();
        Unicast2Header hdr = (Unicast2Header) copy.getHeader(this.id);
        Unicast2Header newhdr = hdr.copy();
        newhdr.first = true;
        copy.putHeader(this.id, newhdr);
        down_prot.down(new Event(Event.MSG, copy));
      }
    }
  }

  // Starts the periodic retransmit-request task (every xmit_interval ms); no-op if running.
  protected void startRetransmitTask() {
    if (xmit_task == null || xmit_task.isDone())
      xmit_task =
          timer.scheduleWithFixedDelay(
              new RetransmitTask(), 0, xmit_interval, TimeUnit.MILLISECONDS);
  }

  protected void stopRetransmitTask() {
    if (xmit_task != null) {
      xmit_task.cancel(true);
      xmit_task = null;
    }
  }

  // Hands out the next connection id; wraps back to 0 instead of overflowing the short.
  protected synchronized short getNewConnectionId() {
    short retval = last_conn_id;
    if (last_conn_id >= Short.MAX_VALUE || last_conn_id < 0) last_conn_id = 0;
    else last_conn_id++;
    return retval;
  }

  // Asks 'dest' (OOB) to resend its first message, passing along the seqno we did receive.
  protected void sendRequestForFirstSeqno(Address dest, long seqno_received) {
    Message msg = new Message(dest);
    msg.setFlag(Message.OOB);
    Unicast2Header hdr = Unicast2Header.createSendFirstSeqnoHeader(seqno_received);
    msg.putHeader(this.id, hdr);
    if (log.isTraceEnabled())
      log.trace(local_addr + " --> SEND_FIRST_SEQNO(" + dest + "," + seqno_received + ")"); 
down_prot.down(new Event(Event.MSG, msg)); } @ManagedOperation( description = "Closes connections that have been idle for more than conn_expiry_timeout ms") public void reapIdleConnections() { if (conn_expiry_timeout <= 0) return; // remove expired connections from send_table for (Map.Entry<Address, SenderEntry> entry : send_table.entrySet()) { SenderEntry val = entry.getValue(); long age = val.age(); if (age >= conn_expiry_timeout) { removeSendConnection(entry.getKey()); if (log.isDebugEnabled()) log.debug( local_addr + ": removed expired connection for " + entry.getKey() + " (" + age + " ms old) from send_table"); } } // remove expired connections from recv_table for (Map.Entry<Address, ReceiverEntry> entry : recv_table.entrySet()) { ReceiverEntry val = entry.getValue(); long age = val.age(); if (age >= conn_expiry_timeout) { removeReceiveConnection(entry.getKey()); if (log.isDebugEnabled()) log.debug( local_addr + ": removed expired connection for " + entry.getKey() + " (" + age + " ms old) from recv_table"); } } } /** * The following types and fields are serialized: * * <pre> * | DATA | seqno | conn_id | first | * | ACK | seqno | * | SEND_FIRST_SEQNO | seqno | * </pre> */ public static class Unicast2Header extends Header { public static final byte DATA = 0; public static final byte XMIT_REQ = 1; public static final byte SEND_FIRST_SEQNO = 2; public static final byte STABLE = 3; byte type; long seqno; // DATA and STABLE long high_seqno; // STABLE short conn_id; // DATA, STABLE boolean first; // DATA public Unicast2Header() {} // used for externalization public static Unicast2Header createDataHeader(long seqno, short conn_id, boolean first) { return new Unicast2Header(DATA, seqno, 0L, conn_id, first); } public static Unicast2Header createXmitReqHeader() { return new Unicast2Header(XMIT_REQ); } public static Unicast2Header createStableHeader(short conn_id, long low, long high) { if (low > high) throw new IllegalArgumentException("low (" + low + ") needs to be <= 
high (" + high + ")"); Unicast2Header retval = new Unicast2Header(STABLE, low); retval.high_seqno = high; retval.conn_id = conn_id; return retval; } public static Unicast2Header createSendFirstSeqnoHeader(long seqno_received) { return new Unicast2Header(SEND_FIRST_SEQNO, seqno_received); } protected Unicast2Header(byte type) { this.type = type; } protected Unicast2Header(byte type, long seqno) { this.type = type; this.seqno = seqno; } protected Unicast2Header(byte type, long seqno, long high, short conn_id, boolean first) { this.type = type; this.seqno = seqno; this.high_seqno = high; this.conn_id = conn_id; this.first = first; } public byte getType() { return type; } public long getSeqno() { return seqno; } public long getHighSeqno() { return high_seqno; } public short getConnId() { return conn_id; } public boolean isFirst() { return first; } public String toString() { StringBuilder sb = new StringBuilder(); sb.append(type2Str(type)).append(", seqno=").append(seqno); if (conn_id != 0) sb.append(", conn_id=").append(conn_id); if (first) sb.append(", first"); return sb.toString(); } public static String type2Str(byte t) { switch (t) { case DATA: return "DATA"; case XMIT_REQ: return "XMIT_REQ"; case SEND_FIRST_SEQNO: return "SEND_FIRST_SEQNO"; case STABLE: return "STABLE"; default: return "<unknown>"; } } public final int size() { int retval = Global.BYTE_SIZE; // type switch (type) { case DATA: retval += Util.size(seqno) // seqno + Global.SHORT_SIZE // conn_id + Global.BYTE_SIZE; // first break; case XMIT_REQ: break; case STABLE: retval += Util.size(seqno, high_seqno) + Global.SHORT_SIZE; // conn_id break; case SEND_FIRST_SEQNO: retval += Util.size(seqno); break; } return retval; } public Unicast2Header copy() { return new Unicast2Header(type, seqno, high_seqno, conn_id, first); } public void writeTo(DataOutput out) throws Exception { out.writeByte(type); switch (type) { case DATA: Util.writeLong(seqno, out); out.writeShort(conn_id); out.writeBoolean(first); break; 
case XMIT_REQ: break; case STABLE: Util.writeLongSequence(seqno, high_seqno, out); out.writeShort(conn_id); break; case SEND_FIRST_SEQNO: Util.writeLong(seqno, out); break; } } public void readFrom(DataInput in) throws Exception { type = in.readByte(); switch (type) { case DATA: seqno = Util.readLong(in); conn_id = in.readShort(); first = in.readBoolean(); break; case XMIT_REQ: break; case STABLE: long[] seqnos = Util.readLongSequence(in); seqno = seqnos[0]; high_seqno = seqnos[1]; conn_id = in.readShort(); break; case SEND_FIRST_SEQNO: seqno = Util.readLong(in); break; } } } protected final class SenderEntry { // stores (and retransmits) msgs sent by us to a given peer final Table<Message> sent_msgs; final AtomicLong sent_msgs_seqno = new AtomicLong(DEFAULT_FIRST_SEQNO); // seqno for msgs sent by us final short send_conn_id; protected final AtomicLong timestamp = new AtomicLong(0); public SenderEntry(short send_conn_id) { this.send_conn_id = send_conn_id; this.sent_msgs = new Table<Message>( xmit_table_num_rows, xmit_table_msgs_per_row, 0, xmit_table_resize_factor, xmit_table_max_compaction_time); update(); } void update() { timestamp.set(System.currentTimeMillis()); } long age() { return System.currentTimeMillis() - timestamp.longValue(); } public String toString() { StringBuilder sb = new StringBuilder(); if (sent_msgs != null) sb.append(sent_msgs).append(", "); sb.append("send_conn_id=" + send_conn_id).append(" (" + age() + " ms old)"); return sb.toString(); } } protected final class ReceiverEntry { protected final Table<Message> received_msgs; // stores all msgs rcvd by a certain peer in seqno-order protected final short recv_conn_id; protected int received_bytes = 0; protected final AtomicLong timestamp = new AtomicLong(0); protected final Lock lock = new ReentrantLock(); protected long last_highest = -1; protected int num_stable_msgs = 0; public ReceiverEntry(Table<Message> received_msgs, short recv_conn_id) { this.received_msgs = received_msgs; 
this.recv_conn_id = recv_conn_id;
      update();
    }

    /** Adds len bytes, if max_bytes is exceeded, the value is reset and true returned, else false */
    boolean incrementStable(int len) {
      lock.lock();
      try {
        if (received_bytes + len >= max_bytes) {
          received_bytes = 0;
          return true;
        }
        received_bytes += len;
        return false;
      } finally {
        lock.unlock();
      }
    }

    // Clears the stability bookkeeping (byte count and STABLE-throttling state).
    void reset() {
      received_bytes = 0;
      last_highest = -1;
      num_stable_msgs = 0;
    }

    // Marks the connection as recently used (read by the connection reaper via age()).
    void update() {
      timestamp.set(System.currentTimeMillis());
    }

    long age() {
      return System.currentTimeMillis() - timestamp.longValue();
    }

    public String toString() {
      StringBuilder sb = new StringBuilder();
      if (received_msgs != null) sb.append(received_msgs).append(", ");
      sb.append("recv_conn_id=" + recv_conn_id);
      sb.append(" (" + age() + " ms old)");
      return sb.toString();
    }
  }

  // Periodic task (every conn_expiry_timeout ms) that closes idle connections.
  protected class ConnectionReaper implements Runnable {
    public void run() {
      reapIdleConnections();
    }
  }

  /**
   * Retransmitter task which periodically (every xmit_interval ms) looks at all the retransmit
   * tables and sends retransmit request to all members from which we have missing messages
   */
  protected class RetransmitTask implements Runnable {
    public void run() {
      for (Map.Entry<Address, ReceiverEntry> entry : recv_table.entrySet()) {
        Address target = entry.getKey(); // target to send retransmit requests to
        ReceiverEntry val = entry.getValue();
        Table<Message> buf = val != null ? val.received_msgs : null;
        if (buf != null && buf.getNumMissing() > 0) {
          SeqnoList missing = buf.getMissing();
          if (missing != null) retransmit(missing, target);
        }
      }
    }
  }
}
/**
 * Starts the merge protocol (only run by the merge leader). Essentially sends a MERGE_REQ to all
 * coordinators of all subgroups found. Each coord receives its digest and view and returns it.
 * The leader then computes the digest and view for the new group from the return values. Finally,
 * it sends this merged view/digest to all subgroup coordinators; each coordinator will install it
 * in their subgroup.
 */
class MergeTask implements Runnable {
  // The thread running the merge protocol; null or dead when no merge is in progress
  private Thread thread = null;

  /** List of all subpartition coordinators and their members */
  private final ConcurrentMap<Address, Collection<Address>> coords =
      Util.createConcurrentMap(8, 0.75f, 8);

  /**
   * Populates {@code coords} from the given views and starts the merge thread. A no-op if the
   * merge thread is already running (method is synchronized, so only one thread can start it).
   *
   * @param views Guaranteed to be non-null and to have >= 2 members, or else this thread would
   *     not be started
   */
  public synchronized void start(Map<Address, View> views) {
    if (thread != null && thread.isAlive()) // the merge thread is already running
      return;

    this.coords.clear();

    // now remove all members which don't have us in their view, so RPCs won't block (e.g. FLUSH)
    // https://jira.jboss.org/browse/JGRP-1061
    sanitizeViews(views);

    // Add all different coordinators of the views into the hashmap and sets their members:
    Collection<Address> coordinators = Util.determineMergeCoords(views);
    for (Address coord : coordinators) {
      View view = views.get(coord);
      if (view != null) this.coords.put(coord, new ArrayList<Address>(view.getMembers()));
    }

    // For the merge participants which are not coordinator, we simply add them, and the
    // associated membership list consists only of themselves
    Collection<Address> merge_participants = Util.determineMergeParticipants(views);
    merge_participants.removeAll(coordinators);
    for (Address merge_participant : merge_participants) {
      Collection<Address> tmp = new ArrayList<Address>();
      tmp.add(merge_participant);
      // putIfAbsent: never overwrite a coordinator entry already added above
      coords.putIfAbsent(merge_participant, tmp);
    }

    thread = gms.getThreadFactory().newThread(this, "MergeTask");
    thread.setDaemon(true);
    thread.start();
  }

  /** Interrupts a running merge thread (if any) and clears the thread reference */
  public synchronized void stop() {
    Thread tmp = thread;
    if (thread != null && thread.isAlive()) tmp.interrupt();
    thread = null;
  }

  /** @return true if the merge thread exists and is alive */
  public synchronized boolean isRunning() {
    return thread != null && thread.isAlive();
  }

  public void run() {
    // 1. Generate merge_id
    final MergeId new_merge_id = MergeId.create(gms.local_addr);
    // Snapshot of the coords' keys: _run() may remove members from this copy as they drop out
    final Collection<Address> coordsCopy = new ArrayList<Address>(coords.keySet());

    long start = System.currentTimeMillis();
    try {
      _run(new_merge_id, coordsCopy); // might remove members from coordsCopy
    } catch (Throwable ex) {
      if (log.isWarnEnabled()) log.warn(gms.local_addr + ": " + ex + ", merge is cancelled");
      sendMergeCancelledMessage(coordsCopy, new_merge_id);
      // the message above cancels the merge, too, but this is a 2nd line of defense
      cancelMerge(new_merge_id);
    } finally {
      /* 5. if flush is in stack stop the flush for entire cluster
         [JGRP-700] - FLUSH: flushing should span merge */
      if (gms.flushProtocolInStack) gms.stopFlush();
      thread = null;
    }
    long diff = System.currentTimeMillis() - start;
    if (log.isDebugEnabled())
      log.debug(gms.local_addr + ": merge " + new_merge_id + " took " + diff + " ms");
  }

  /** Runs the merge protocol as a leader */
  protected void _run(MergeId new_merge_id, final Collection<Address> coordsCopy)
      throws Exception {
    // Claim the merge id locally first; failure means another merge is already in progress here
    boolean success = setMergeId(null, new_merge_id);
    if (!success) {
      log.warn("failed to set my own merge_id (" + merge_id + ") to " + new_merge_id);
      return;
    }
    if (log.isDebugEnabled())
      log.debug(
          gms.local_addr
              + ": merge task "
              + merge_id
              + " started with "
              + coords.keySet().size()
              + " coords");

    /* 2. Fetch the current Views/Digests from all subgroup coordinators */
    success = getMergeDataFromSubgroupCoordinators(coords, new_merge_id, gms.merge_timeout);
    List<Address> missing = null;
    if (!success) {
      // Timed out: drop the non-responders from the merge instead of aborting
      missing = merge_rsps.getMissing();
      if (log.isDebugEnabled())
        log.debug(
            "merge leader "
                + gms.local_addr
                + " did not get responses from all "
                + coords.keySet().size()
                + " partition coordinators; missing responses from "
                + missing.size()
                + " members, removing them from the merge");
      merge_rsps.remove(missing);
    }

    /* 3. Remove null or rejected merge responses from merge_rsp and coords (so we'll send the
     * new view only to members who accepted the merge request) */
    if (missing != null && !missing.isEmpty()) {
      coords.keySet().removeAll(missing);
      coordsCopy.removeAll(missing);
    }

    removeRejectedMergeRequests(coords.keySet());
    if (merge_rsps.size() == 0)
      throw new Exception("did not get any merge responses from partition coordinators");

    // another member might have invoked a merge req on us before we got there...
    if (!coords.keySet().contains(gms.local_addr))
      throw new Exception("merge leader rejected merge request");

    /* 4. Combine all views and digests into 1 View/1 Digest */
    List<MergeData> merge_data = new ArrayList<MergeData>(merge_rsps.getResults().values());
    MergeData combined_merge_data = consolidateMergeData(merge_data);
    if (combined_merge_data == null) throw new Exception("could not consolidate merge");

    /* 5. Send the new View/Digest to all coordinators (including myself). On reception, they
     * will install the digest and view in all of their subgroup members */
    if (log.isDebugEnabled())
      log.debug(
          gms.local_addr
              + ": installing merge view "
              + combined_merge_data.view.getViewId()
              + " ("
              + combined_merge_data.view.size()
              + " members) in "
              + coords.keySet().size()
              + " coords");
    sendMergeView(coords.keySet(), combined_merge_data, new_merge_id);
  }

  /**
   * Sends a MERGE_REQ to all coords and populates a list of MergeData (in merge_rsps). Returns
   * after coords.size() responses have been received, or timeout msecs have elapsed (whichever
   * is first).
   *
   * <p>If a subgroup coordinator rejects the MERGE_REQ (e.g. because of participation in a
   * different merge), <em>that member will be removed from coords !</em>
   *
   * @param coords A map of coordinator addresses and associated membership lists
   * @param new_merge_id The new merge id
   * @param timeout Max number of msecs to wait for the merge responses from the subgroup coords
   * @return true if every coordinator responded before the timeout, false otherwise
   */
  protected boolean getMergeDataFromSubgroupCoordinators(
      Map<Address, Collection<Address>> coords, MergeId new_merge_id, long timeout) {
    boolean gotAllResponses;
    long start = System.currentTimeMillis();
    merge_rsps.reset(coords.keySet());
    if (log.isTraceEnabled())
      log.trace(gms.local_addr + ": sending MERGE_REQ to " + coords.keySet());
    for (Map.Entry<Address, Collection<Address>> entry : coords.entrySet()) {
      Address coord = entry.getKey();
      Collection<Address> mbrs = entry.getValue();
      // OOB/INTERNAL so the request bypasses ordering and isn't blocked by application traffic
      Message msg = new Message(coord).setFlag(Message.Flag.OOB, Message.Flag.INTERNAL);
      GMS.GmsHeader hdr = new GMS.GmsHeader(GMS.GmsHeader.MERGE_REQ, mbrs);
      hdr.mbr = gms.local_addr;
      hdr.merge_id = new_merge_id;
      msg.putHeader(gms.getId(), hdr);
      gms.getDownProtocol().down(new Event(Event.MSG, msg));
    }

    // wait until num_rsps_expected >= num_rsps or timeout elapsed
    merge_rsps.waitForAllResponses(timeout);
    gotAllResponses = merge_rsps.hasAllResponses();
    long stop = System.currentTimeMillis();
    if (log.isTraceEnabled())
      log.trace(
          gms.local_addr
              + ": collected "
              + merge_rsps.numberOfValidResponses()
              + " merge response(s) in "
              + (stop - start)
              + " ms");
    return gotAllResponses;
  }

  /**
   * Removes rejected merge requests from merge_rsps and coords. This method has a lock on
   * merge_rsps
   */
  private void removeRejectedMergeRequests(Collection<Address> coords) {
    int num_removed = 0;
    // Iterator.remove() is required here: we delete entries from merge_rsps while iterating
    for (Iterator<Map.Entry<Address, MergeData>> it =
            merge_rsps.getResults().entrySet().iterator();
        it.hasNext(); ) {
      Map.Entry<Address, MergeData> entry = it.next();
      MergeData data = entry.getValue();
      if (data.merge_rejected) {
        if (data.getSender() != null) coords.remove(data.getSender());
        it.remove();
        num_removed++;
      }
    }
    if (num_removed > 0) {
      if (log.isTraceEnabled())
        log.trace(gms.local_addr + ": removed " + num_removed + " rejected merge responses");
    }
  }

  /**
   * Merge all MergeData. All MergeData elements should be disjunct (both views and digests).
   * However, this method is prepared to resolve duplicate entries (for the same member).
   * Resolution strategy for views is to merge only 1 of the duplicate members. Resolution
   * strategy for digests is to take the higher seqnos for duplicate digests.
   *
   * <p>After merging all members into a Membership and subsequent sorting, the first member of
   * the sorted membership will be the new coordinator. This method has a lock on merge_rsps.
   *
   * @param merge_rsps A list of MergeData items. Elements with merge_rejected=true were removed
   *     before. Is guaranteed not to be null and to contain at least 1 member.
   * @return the consolidated MergeData (new MergeView + merged Digest), or null if the digests
   *     could not be consolidated or no member survived the digest intersection
   */
  private MergeData consolidateMergeData(List<MergeData> merge_rsps) {
    long logical_time = 0; // for new_vid
    // contains a list of Views, each View is a subgroup
    List<View> subgroups = new ArrayList<View>(11);
    Collection<Collection<Address>> sub_mbrships = new ArrayList<Collection<Address>>();
    for (MergeData tmp_data : merge_rsps) {
      View tmp_view = tmp_data.getView();
      if (tmp_view != null) {
        ViewId tmp_vid = tmp_view.getVid();
        if (tmp_vid != null) {
          // compute the new view id (max of all vids +1)
          logical_time = Math.max(logical_time, tmp_vid.getId());
        }
        // merge all membership lists into one (prevent duplicates)
        sub_mbrships.add(new ArrayList<Address>(tmp_view.getMembers()));
        subgroups.add(tmp_view.copy());
      }
    }

    // determine the new digest
    Digest new_digest = consolidateDigests(merge_rsps, merge_rsps.size());
    if (new_digest == null) return null;

    // remove all members from the new member list that are not in the digest
    Collection<Address> digest_mbrs = new_digest.getMembers();
    for (Collection<Address> coll : sub_mbrships) coll.retainAll(digest_mbrs);

    List<Address> merged_mbrs = gms.computeNewMembership(sub_mbrships);

    // the new coordinator is the first member of the consolidated & sorted membership list
    Address new_coord = merged_mbrs.isEmpty() ? null : merged_mbrs.get(0);
    if (new_coord == null) return null;

    // should be the highest view ID seen up to now plus 1
    ViewId new_vid = new ViewId(new_coord, logical_time + 1);

    // determine the new view
    MergeView new_view = new MergeView(new_vid, merged_mbrs, subgroups);
    if (log.isTraceEnabled())
      log.trace(
          gms.local_addr
              + ": consolidated view="
              + new_view
              + "\nconsolidated digest="
              + new_digest);
    return new MergeData(gms.local_addr, new_view, new_digest);
  }

  /**
   * Merge all digests into one. For each sender, the new value is max(highest_delivered),
   * max(highest_received). This method has a lock on merge_rsps
   *
   * @param merge_rsps the responses whose digests are merged (null digests are skipped)
   * @param num_mbrs initial capacity hint for the mutable digest
   * @return an immutable copy of the merged digest
   */
  private Digest consolidateDigests(List<MergeData> merge_rsps, int num_mbrs) {
    MutableDigest retval = new MutableDigest(num_mbrs);
    for (MergeData data : merge_rsps) {
      Digest tmp_digest = data.getDigest();
      if (tmp_digest == null) continue;
      retval.merge(tmp_digest);
    }
    return retval.copy();
  }
}