// TCP large RECEIVE of results. Note that 'this' is NOT the RPC object
// that is hoping to get the received object, nor is the current thread the
// RPC thread blocking for the object. The current thread is the TCP
// reader thread.
static void tcp_ack(final AutoBuffer ab) throws IOException {
  // Get the RPC we're waiting on
  int task = ab.getTask();
  RPC rpc = ab._h2o.taskGet(task);
  // Race with canceling a large RPC fetch: Task is already dead. Do not
  // bother reading from the TCP socket, just bail out & close socket.
  if (rpc == null || rpc._done) {
    ab.drainClose();
  } else {
    assert rpc._tasknum == task;
    assert !rpc._done;
    // Here we have the result, and we're on the correct Node but wrong
    // Thread. If we just return, the TCP reader thread will close the
    // remote, the remote will UDP ACK the RPC back, and back on the current
    // Node but in the correct Thread, we'd wake up and realize we received a
    // large result.
    try {
      rpc.response(ab);
    } catch (AutoBuffer.AutoBufferException e) {
      // If TCP fails, we will have done a short-read crushing the original
      // _dt object, and be unable to resend. This is fatal right now.
      // Really: an unimplemented feature; fix is to notice that a partial
      // TCP read means that the server (1) got our remote_exec request, (2)
      // has computed an answer and was trying to send it to us, (3) failed
      // sending via TCP hence the server knows it failed and will send again
      // without any further work from us. We need to disable all the resend
      // & retry logic, and wait for the server to re-send our result.
      // Meanwhile the _dt object is crushed with half-read crap, and cannot
      // be trusted except in the base fields.
      throw Log.throwErr(e._ioe);
    }
  }
  // ACKACK the remote, telling him "we got the answer"
  new AutoBuffer(ab._h2o, H2O.ACK_ACK_PRIORITY).putTask(UDP.udp.ackack.ordinal(), task).close();
}
// Do the remote execution in a F/J thread & send a reply packet.
// Caller must call 'tryComplete'.
private static AutoBuffer remexec(DTask dt, H2ONode client, int task, AutoBuffer abold) {
  abold.close(); // Closing the old guy, returning a new guy
  // Now compute on it!
  dt.invoke(client);
  // Send results back
  AutoBuffer ab = new AutoBuffer(client).putTask(UDP.udp.ack, task).put1(SERVER_UDP_SEND);
  dt.write(ab); // Write the DTask
  dt._repliedTcp = ab.hasTCP(); // Resends do not need to repeat TCP result
  // Install answer so retries get this very answer
  client.record_task_answer(task, dt);
  return ab;
}
// Assertion check that size is not changing between resends,
// i.e., resends sent identical data.
private boolean sz_check(AutoBuffer ab) {
  final int absize = ab.size();
  if (_size == 0) {
    _size = absize;
    return true;
  }
  return _size == absize;
}
// TCP large RECEIVE of results. Note that 'this' is NOT the RPC object
// that is hoping to get the received object, nor is the current thread the
// RPC thread blocking for the object. The current thread is the TCP
// reader thread.
static void tcp_ack(final AutoBuffer ab) {
  // Get the RPC we're waiting on
  int task = ab.getTask();
  RPC rpc = TASKS.get(task);
  // Race with canceling a large RPC fetch: Task is already dead. Do not
  // bother reading from the TCP socket, just bail out & close socket.
  if (rpc == null) {
    ab.drainClose();
  } else {
    assert rpc._tasknum == task;
    assert !rpc._done;
    // Here we have the result, and we're on the correct Node but wrong
    // Thread. If we just return, the TCP reader thread will close the
    // remote, the remote will UDP ACK the RPC back, and back on the current
    // Node but in the correct Thread, we'd wake up and realize we received a
    // large result.
    rpc.response(ab);
  }
  // ACKACK the remote, telling him "we got the answer"
  new AutoBuffer(ab._h2o).putTask(UDP.udp.ackack.ordinal(), task).close(true);
}
// Got a response UDP packet, or completed a large TCP answer-receive.
// Install it as The Answer packet and wake up anybody waiting on an answer.
protected void response(AutoBuffer ab) {
  assert _tasknum == ab.getTask();
  if (_done) { ab.close(); return; } // Ignore duplicate response packet
  int flag = ab.getFlag(); // Must read flag also, to advance ab
  if (flag == SERVER_TCP_SEND) { ab.close(); return; } // Ignore UDP packet for a TCP reply
  assert flag == SERVER_UDP_SEND;
  synchronized (this) { // Install the answer under lock
    if (_done) { ab.close(); return; } // Ignore duplicate response packet
    _dt.read(ab); // Read the answer (under lock?)
    ab.close();   // Also finish the read (under lock?)
    _dt.onAck();  // One time only execute (before sending ACKACK)
    _done = true;
    UDPTimeOutThread.PENDING.remove(this);
    TASKS.remove(_tasknum); // Flag as task-completed, even if the result is null
    notifyAll(); // And notify in any case
  }
}
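// A hedged summary of the request/response handshake the methods in this
// file implement, written as an illustrative enum. This type does not exist
// in H2O; the step ordering is an assumption read off the code above and
// below, not a documented protocol spec.
enum HandshakeStep {
  EXEC,   // client -> server: udp.exec (or exechi/execlo) + CLIENT_UDP_SEND + DTask; large ones go TCP
  ACK,    // server -> client: udp.ack/udp.fetchack + SERVER_UDP_SEND + result; large ones go TCP
  ACKACK; // client -> server: udp.ackack -- the server may now drop its recorded answer
}
// A timeout on either side re-fires the previous step until the next one lands.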
// Handle TCP traffic, from a client to this server asking for work to be
// done. This is called on the TCP reader thread, not a Fork/Join worker
// thread. We want to do the bulk TCP read in the TCP reader thread.
static void tcp_exec(final AutoBuffer ab) {
  final int ctrl = ab.getCtrl();
  final int task = ab.getTask();
  final int flag = ab.getFlag();
  assert flag == CLIENT_UDP_SEND; // Client sent a request to be executed?
  // Act "as if" called from the UDP packet code, by recording the task just
  // like the packet we will be receiving (eventually). The presence of this
  // packet is used to stop dup-actions on dup-sends. Racily inserted; keep
  // only the last one.
  DTask dt1 = ab._h2o.record_task(task);
  assert dt1 == null || dt1 instanceof NOPTask : "#" + task + " " + dt1.getClass();
  // For TCP, no repeats, so the 1st send is the only send (except for UDP
  // timeout retries).
  // Make a remote instance of this dude from the stream, but only if the
  // racing UDP packet did not already make one. Start the bulk TCP read.
  final DTask dt = ab.get(DTask.class);
  // Here I want to execute on this, but not block for completion in the
  // TCP reader thread. Jam the task on some F/J thread.
  UDP.udp.UDPS[ctrl].pool().execute(new CountedCompleter() {
    public void compute() {
      remexec(dt, ab._h2o, task, ab).close();
      tryComplete();
    }
    public boolean onExceptionalCompletion(Throwable ex, CountedCompleter caller) {
      ex.printStackTrace();
      return true;
    }
  });
  // All done for the TCP thread! Work continues in the FJ thread...
}
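// A self-contained sketch of the "hand off to F/J, don't block the reader
// thread" pattern used above, on a plain java.util.concurrent.ForkJoinPool.
// H2O's per-priority pools differ; this helper is hypothetical.
static void demoHandOff(java.util.concurrent.ForkJoinPool pool, Runnable work) {
  pool.execute(new java.util.concurrent.CountedCompleter<Void>() {
    @Override public void compute() { work.run(); tryComplete(); }
    @Override public boolean onExceptionalCompletion(Throwable ex,
                                                     java.util.concurrent.CountedCompleter<?> caller) {
      ex.printStackTrace(); // mirror the handler above: log, then allow default propagation
      return true;
    }
  });
}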
// Pretty-print bytes 1-15; byte 0 is the udp_type enum
public String print16(AutoBuffer ab) {
  int flag = ab.getFlag();
  String clazz = "";
  if (flag == CLIENT_UDP_SEND)
    clazz = new String(ab.getA1(Math.min(ab.get2(), ab.remaining())));
  String fs = "";
  switch (flag) {
    case SERVER_UDP_SEND: fs = "SERVER_UDP_SEND"; break;
    case SERVER_TCP_SEND: fs = "SERVER_TCP_SEND"; break;
    case CLIENT_UDP_SEND: fs = "CLIENT_UDP_SEND"; break;
    case CLIENT_TCP_SEND: fs = "CLIENT_TCP_SEND"; break;
  }
  return "task# " + ab.getTask() + " " + fs + " " + clazz;
}
// Make an initial RPC, or re-send a packet. Always called on 1st send; also
// called on a timeout.
public synchronized RPC<V> call() {
  // Keep a global record, for awhile
  TASKS.put(_tasknum, this);
  // We could be racing timeouts-vs-replies. Blow off the timeout if we have an answer.
  if (isDone()) {
    TASKS.remove(_tasknum);
    return this;
  }
  // Default strategy: (re)fire the packet and (re)start the timeout. We
  // "count" exactly 1 failure: just whether or not we ever shipped via TCP
  // once. After that we fearlessly (re)send UDP-sized packets until the
  // server replies.
  // Pack classloader/class & the instance data into the outgoing
  // AutoBuffer. If it fits in a single UDP packet, ship it. If not,
  // finish off the current AutoBuffer (which is now going TCP style), and
  // make a new UDP-sized packet. On a re-send of a TCP-sized hunk, just
  // send the basic UDP control packet.
  if (!_sentTcp) {
    // Ship the UDP packet with clazz name to execute
    // totally replace me with Michal's enums!!!
    UDP.udp fjq = _dt.isHighPriority() ? UDP.udp.exechi : UDP.udp.execlo;
    AutoBuffer ab = new AutoBuffer(_target).putTask(fjq, _tasknum);
    ab.put1(CLIENT_UDP_SEND).put(_dt).close();
    if (ab.hasTCP()) _sentTcp = true;
  }
  // Double retry until we exceed the existing age. This is the time to delay
  // until we try again. Note that we come here immediately on creation,
  // so the first doubling happens before anybody does any waiting. Also
  // note the generous 5sec cap: ping at least every 5 sec.
  _retry += (_retry < 5000) ? _retry : 5000;
  // Put self on the "TBD" list of tasks awaiting Timeout.
  // So: don't really 'forget', but remember me in a little bit.
  assert !UDPTimeOutThread.PENDING.contains(this);
  UDPTimeOutThread.PENDING.add(this);
  return this;
}
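// A minimal sketch of the delay schedule the "_retry +=" line above
// produces, assuming a hypothetical initial _retry of 25 ms (the real
// initial value is set elsewhere in this class): delays double until they
// pass 5 sec, then grow by a flat 5 sec per resend.
static void demoRetrySchedule() {
  int retry = 25; // hypothetical starting delay, in msec
  for (int i = 0; i < 12; i++) {
    System.out.println("resend " + i + ": wait " + retry + " ms");
    retry += (retry < 5000) ? retry : 5000;
  }
  // Prints 25, 50, 100, ..., 3200, 6400, 11400, 16400, ...
}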
private void sendAck() {
  // Send results back
  DTask dt, origDt = _dt; // _dt can go null the instant it is sent over the wire
  assert origDt != null;  // Freed after completion
  while ((dt = _dt) != null) { // Retry loop for broken TCP sends
    AutoBuffer ab = null;
    try {
      // Start the ACK with results back to the client. If the client is
      // asking for a class/id mapping (or any job running at FETCH_ACK
      // priority) then return a udp.fetchack byte instead of a udp.ack.
      // The receiver thread then knows to handle the mapping at the higher
      // priority.
      UDP.udp udp = dt.priority() == H2O.FETCH_ACK_PRIORITY ? UDP.udp.fetchack : UDP.udp.ack;
      ab = new AutoBuffer(_client, udp._prior).putTask(udp, _tsknum).put1(SERVER_UDP_SEND);
      assert ab.position() == 1 + 2 + 4 + 1;
      dt.write(ab); // Write the DTask - could be a very large write
      dt._repliedTcp = ab.hasTCP(); // Resends do not need to repeat TCP result
      ab.close(); // Then close; send the final byte
      _computedAndReplied = true; // After the final handshake, set the computed+replied bit
      break; // Break out of the retry loop
    } catch (AutoBuffer.AutoBufferException e) {
      if (!_client._heartbeat._client) // Report on servers only; clients allowed to be flaky
        Log.info("IOException during ACK, " + e._ioe.getMessage() + ", t#" + _tsknum
            + " AB=" + ab + ", waiting and retrying...");
      ab.drainClose();
      if (_client._heartbeat._client) // Dead client will not accept a TCP ACK response?
        this.CAS_DT(dt, null); // Cancel the ACK
      try { Thread.sleep(100); } catch (InterruptedException ignore) { }
    } catch (Exception e) { // Custom serializer just barfed?
      Log.err(e); // Log the custom serializer exception
      ab.drainClose();
    }
  } // end of the retry loop
  if (dt == null)
    Log.info("Remote task#" + _tsknum + " " + origDt.getClass() + " to " + _client
        + " has been cancelled by remote");
  else {
    if (dt instanceof MRTask && dt.logVerbose())
      Log.debug("Done remote task#" + _tsknum + " " + dt.getClass() + " to " + _client);
    _client.record_task_answer(this); // Set up for retrying the Ack & AckAck, if not canceled
  }
}
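// A hedged reading of the "ab.position() == 1 + 2 + 4 + 1" assert above.
// From the arithmetic alone, the ACK header appears to be laid out as the
// offsets below; this is an inference from the code, not a documented wire
// format, and the field meanings are assumptions.
static final int DEMO_CTRL_OFF = 0; // 1 byte: UDP.udp packet-type ordinal
static final int DEMO_PORT_OFF = 1; // 2 bytes: sender port (written by putTask?)
static final int DEMO_TASK_OFF = 3; // 4 bytes: task number
static final int DEMO_FLAG_OFF = 7; // 1 byte: SERVER_UDP_SEND / SERVER_TCP_SEND
static final int DEMO_HDR_SIZE = 8; // == 1 + 2 + 4 + 1; the DTask payload follows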
protected AutoBuffer response(AutoBuffer ab) {
  assert _tasknum == ab.getTask();
  if (_done) {
    if (!ab.hasTCP()) return ackack(ab, _tasknum); // Ignore duplicate response packet
    ab.drainClose();
  } else {
    int flag = ab.getFlag(); // Must read the flag also, to advance ab
    if (flag == SERVER_TCP_SEND) return ackack(ab, _tasknum); // Ignore UDP packet for a TCP reply
    assert flag == SERVER_UDP_SEND : "flag = " + flag;
    synchronized (this) { // Install the answer under lock
      if (_done) {
        if (!ab.hasTCP()) return ackack(ab, _tasknum); // Ignore duplicate response packet
        ab.drainClose();
      } else {
        // UDPTimeOutThread.PENDING.remove(_tasknum);
        _dt.read(ab); // Read the answer (under lock?)
        _size_rez = ab.size(); // Record received size
        ab.close(); // Also finish the read (under lock? even if canceled, since we need to drain TCP)
        if (!isCancelled()) // Can be canceled already (locally by MRTask while receiving the remote answer)
          _dt.onAck(); // One time only execute (before sending ACKACK)
        _done = true; // Only read one (of many) response packets
        ab._h2o.taskRemove(_tasknum); // Flag as task-completed, even if the result is null
        notifyAll(); // And notify in any case
      }
      if (!isCancelled()) // Can be canceled already
        doAllCompletions(); // Send all tasks needing completion to the work queues
    }
  }
  // AckAck back on a fresh AutoBuffer, since we actually closed the incoming one
  return new AutoBuffer(ab._h2o, H2O.ACK_ACK_PRIORITY).putTask(UDP.udp.ackack.ordinal(), _tasknum);
}
// Re-send strictly the ack, because we're missing an AckAck
final void resend_ack() {
  assert _computedAndReplied : "Found RPCCall not computed " + _tsknum;
  DTask dt = _dt;
  if (dt == null) return; // Received ACKACK already
  UDP.udp udp = dt.priority() == H2O.FETCH_ACK_PRIORITY ? UDP.udp.fetchack : UDP.udp.ack;
  AutoBuffer rab = new AutoBuffer(_client, dt.priority()).putTask(udp, _tsknum);
  boolean wasTCP = dt._repliedTcp;
  if (wasTCP) rab.put1(RPC.SERVER_TCP_SEND); // Original reply sent via TCP
  else {
    rab.put1(RPC.SERVER_UDP_SEND); // Original reply sent via UDP
    assert rab.position() == 1 + 2 + 4 + 1;
    dt.write(rab);
  }
  assert sz_check(rab) : "Resend of " + _dt.getClass() + " changes size from " + _size + " to " + rab.size();
  assert dt._repliedTcp == wasTCP;
  rab.close();
  dt._repliedTcp = wasTCP;
  // Double retry until we exceed existing age. This is the time to delay
  // until we try again. Note that we come here immediately on creation,
  // so the first doubling happens before anybody does any waiting. Also
  // note the generous 5sec cap: ping at least every 5 sec.
  _retry += (_retry < MAX_TIMEOUT) ? _retry : MAX_TIMEOUT;
}
AutoBuffer call(AutoBuffer ab) {
  return ab.getFlag() == CLIENT_UDP_SEND // UDP vs TCP send?
      ? remexec(ab.get(DTask.class), ab._h2o, ab.getTask(), ab)
      : ab; // Else all the work is being done in the TCP thread
}
// Got a response UDP packet, or completed a large TCP answer-receive.
// Install it as The Answer packet and wake up anybody waiting on an answer.
// On all paths, send an ACKACK back.
static AutoBuffer ackack(AutoBuffer ab, int tnum) {
  return ab.clearForWriting(H2O.ACK_ACK_PRIORITY).putTask(UDP.udp.ackack.ordinal(), tnum);
}
// Handle traffic, from a client to this server asking for work to be done.
// Called from either a F/J thread (generally with a UDP packet) or from the
// TCPReceiver thread.
static void remote_exec(AutoBuffer ab) {
  long lo = ab.get8(0), hi = ab._size >= 16 ? ab.get8(8) : 0;
  final int task = ab.getTask();
  final int flag = ab.getFlag();
  assert flag == CLIENT_UDP_SEND || flag == CLIENT_TCP_SEND; // Client-side send
  // Atomically record an instance of this task, one-time-only replacing a
  // null with an RPCCall, a placeholder while we work on a proper response -
  // and it serves to let us discard dup UDP requests.
  RPCCall old = ab._h2o.has_task(task);
  // This is a UDP packet requesting an answer back for a request sent via
  // TCP, but the UDP packet has arrived ahead of the TCP. Just drop the UDP
  // and wait for the TCP to appear.
  if (old == null && flag == CLIENT_TCP_SEND) {
    Log.warn("got tcp with existing task #, FROM " + ab._h2o.toString() + " AB: " /* + UDP.printx16(lo,hi)*/);
    // All the resends should be UDP only
    assert !ab.hasTCP() : "ERROR: got tcp with existing task #, FROM " + ab._h2o.toString() + " AB: " /* + UDP.printx16(lo,hi)*/;
    // DROP PACKET
  } else if (old == null) { // New task?
    RPCCall rpc;
    try {
      // Read the DTask Right Now. If we are the TCPReceiver thread, then we
      // are reading in that thread... and thus TCP reads are single-threaded.
      rpc = new RPCCall(ab.get(water.DTask.class), ab._h2o, task);
    } catch (AutoBuffer.AutoBufferException e) {
      // Here we assume it's a TCP fail on read - and ignore the remote_exec
      // request. The caller will send it again. NOTE: this case is
      // indistinguishable from a broken short-writer/long-reader bug, except
      // that we'll re-send endlessly and fail endlessly.
      Log.info("Network congestion OR short-writer/long-reader: TCP " + e._ioe.getMessage()
          + ", AB=" + ab + ", ignoring partial send");
      ab.drainClose();
      return;
    }
    RPCCall rpc2 = ab._h2o.record_task(rpc);
    if (rpc2 == null) { // Atomically insert (to avoid double-work)
      if (rpc._dt instanceof MRTask && rpc._dt.logVerbose())
        Log.debug("Start remote task#" + task + " " + rpc._dt.getClass() + " from " + ab._h2o);
      H2O.submitTask(rpc); // And execute!
    } else { // Else lost the task-insertion race
      if (ab.hasTCP()) ab.drainClose();
      // DROP PACKET
    }
  } else if (!old._computedAndReplied) {
    // This packet has not been fully computed. Hence it's still a
    // work-in-progress locally. We have no answer to reply with, but we do
    // not want to re-offer the packet for repeated work. Send back a NACK,
    // letting the client know we're Working On It.
    assert !ab.hasTCP() : "got tcp with existing task #, FROM " + ab._h2o.toString()
        + " AB: " + UDP.printx16(lo, hi) + ", position = " + ab._bb.position();
    ab.clearForWriting(udp.nack._prior).putTask(UDP.udp.nack.ordinal(), task);
    // DROP PACKET
  } else {
    // This is an old re-send of the same thing we've answered before.
    // Send back the same old answer ACK. If we sent via TCP before, then
    // we know the answer got there, so just send a control-ACK back. If we
    // sent via UDP, resend the whole answer.
    if (ab.hasTCP()) {
      Log.warn("got tcp with existing task #, FROM " + ab._h2o.toString() + " AB: " + UDP.printx16(lo, hi));
      // All the resends should be UDP only
      ab.drainClose();
    }
    if (old._dt != null) { // Not yet ACKACKed (_dt goes null once the ACKACK arrives)
      ++old._ackResendCnt;
      if (old._ackResendCnt % 10 == 0)
        Log.err("Possibly broken network, cannot send ack through, got " + old._ackResendCnt
            + " for task # " + old._tsknum + ", dt == null?" + (old._dt == null));
      old.resend_ack();
    }
  }
  ab.close();
}
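// A minimal sketch of the atomic one-time task insertion that record_task
// relies on, using a plain ConcurrentHashMap. The map and helper here are
// hypothetical; H2ONode's real bookkeeping also retains answers for
// re-ACKing and handles NACKs.
static final java.util.concurrent.ConcurrentHashMap<Integer, RPCCall> DEMO_WORK =
    new java.util.concurrent.ConcurrentHashMap<>();
static void demoRecordTask(int task, RPCCall rpc) {
  RPCCall prior = DEMO_WORK.putIfAbsent(task, rpc); // Returns null exactly once per task#
  if (prior == null) H2O.submitTask(rpc); // First arrival wins: execute
  // else: duplicate send -- drop it, NACK, or re-ACK depending on prior's state
}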
// Pretty-print bytes 1-15; byte 0 is the udp_type enum
@Override String print16(AutoBuffer ab) {
  int flag = ab.getFlag();
  String clazz = (flag == CLIENT_UDP_SEND) ? TypeMap.className(ab.getInt()) : "";
  return "task# " + ab.getTask() + " " + clazz + " " + COOKIES[flag - SERVER_UDP_SEND];
}
public synchronized RPC<V> call() {
  // Any Completer will not be carried over to the remote; add it to the RPC
  // call so completion is signaled after the remote comes back.
  CountedCompleter cc = _dt.getCompleter();
  if (cc != null) handleCompleter(cc);
  // If running on self, just submit to the queues & do it locally
  if (_target == H2O.SELF) return handleLocal();
  // Keep a global record, for awhile
  if (_target != null) _target.taskPut(_tasknum, this);
  try {
    if (_nack) return this; // Racing Nack rechecked under lock; no need to send retry
    // We could be racing timeouts-vs-replies. Blow off the timeout if we have an answer.
    if (isDone()) {
      if (_target != null) _target.taskRemove(_tasknum);
      return this;
    }
    // Default strategy: (re)fire the packet and (re)start the timeout. We
    // "count" exactly 1 failure: just whether or not we ever shipped via TCP
    // once. After that we fearlessly (re)send UDP-sized packets until the
    // server replies.
    // Pack classloader/class & the instance data into the outgoing
    // AutoBuffer. If it fits in a single UDP packet, ship it. If not,
    // finish off the current AutoBuffer (which is now going TCP style), and
    // make a new UDP-sized packet. On a re-send of a TCP-sized hunk, just
    // send the basic UDP control packet.
    if (!_sentTcp) {
      while (true) { // Retry loop for broken TCP sends
        AutoBuffer ab = new AutoBuffer(_target, _dt.priority());
        try {
          final boolean t;
          int offset = ab.position();
          ab.putTask(UDP.udp.exec, _tasknum).put1(CLIENT_UDP_SEND);
          ab.put(_dt);
          t = ab.hasTCP();
          assert sz_check(ab) : "Resend of " + _dt.getClass() + " changes size from " + _size
              + " to " + ab.size() + " for task#" + _tasknum;
          ab.close(); // Then close; send the final byte
          _sentTcp = t; // Set after close (and any other possible fail)
          break; // Break out of the retry loop
        } catch (AutoBuffer.AutoBufferException e) {
          Log.info("IOException during RPC call: " + e._ioe.getMessage() + ", AB=" + ab
              + ", for task#" + _tasknum + ", waiting and retrying...");
          ab.drainClose();
          try { Thread.sleep(500); } catch (InterruptedException ignore) { }
        }
      } // end of while(true)
    } else {
      // Else it was sent via TCP in a prior attempt, and we've timed out.
      // This means the caller's ACK/answer probably got dropped, and we need
      // him to resend it (or else the caller is still processing our
      // request). Send a UDP reminder - but with the CLIENT_TCP_SEND flag
      // instead of the UDP send, and no DTask (since it previously went via
      // TCP, there is no need to resend it).
      AutoBuffer ab = new AutoBuffer(_target, _dt.priority()).putTask(UDP.udp.exec, _tasknum);
      ab.put1(CLIENT_TCP_SEND).close();
    }
    // Double retry until we exceed the existing age. This is the time to delay
    // until we try again. Note that we come here immediately on creation,
    // so the first doubling happens before anybody does any waiting. Also
    // note the generous 5sec cap: ping at least every 5 sec.
    _retry += (_retry < MAX_TIMEOUT) ? _retry : MAX_TIMEOUT;
    // Put self on the "TBD" list of tasks awaiting Timeout.
    // So: don't really 'forget', but remember me in a little bit.
    // UDPTimeOutThread.PENDING.put(_tasknum, this);
    return this;
  } catch (Throwable t) {
    t.printStackTrace();
    throw Log.throwErr(t);
  }
}
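// A hedged usage sketch of the client side of this machinery: build the
// RPC, fire call(), and block on get() until response() wakes us via
// notifyAll(). The caller type is hypothetical, and the two-argument
// RPC(H2ONode, DTask) constructor and Future-style get() are assumed from
// the fields and methods used above (_target, _dt, isDone()); check the
// actual constructors before relying on this.
static <T extends DTask> T demoRemoteCall(H2ONode target, T work) {
  RPC<T> rpc = new RPC<>(target, work); // assumed constructor
  return rpc.call().get(); // call() (re)sends until ACKed; get() blocks for the answer
}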