/** * Kick off a new sub-procedure on the listener with the data stored in the passed znode. * * <p>Will attempt to create the same procedure multiple times if an procedure znode with the same * name is created. It is left up the coordinator to ensure this doesn't occur. * * @param path full path to the znode for the procedure to start */ private synchronized void startNewSubprocedure(String path) { LOG.debug("Found procedure znode: " + path); String opName = ZKUtil.getNodeName(path); // start watching for an abort notification for the procedure String abortZNode = zkController.getAbortZNode(opName); try { if (ZKUtil.watchAndCheckExists(zkController.getWatcher(), abortZNode)) { LOG.debug("Not starting:" + opName + " because we already have an abort notification."); return; } } catch (KeeperException e) { member.controllerConnectionFailure( "Failed to get the abort znode (" + abortZNode + ") for procedure :" + opName, e, opName); return; } // get the data for the procedure Subprocedure subproc = null; try { byte[] data = ZKUtil.getData(zkController.getWatcher(), path); if (!ProtobufUtil.isPBMagicPrefix(data)) { String msg = "Data in for starting procuedure " + opName + " is illegally formatted (no pb magic). " + "Killing the procedure: " + Bytes.toString(data); LOG.error(msg); throw new IllegalArgumentException(msg); } LOG.debug("start proc data length is " + data.length); data = Arrays.copyOfRange(data, ProtobufUtil.lengthOfPBMagic(), data.length); LOG.debug("Found data for znode:" + path); subproc = member.createSubprocedure(opName, data); member.submitSubprocedure(subproc); } catch (IllegalArgumentException iae) { LOG.error("Illegal argument exception", iae); sendMemberAborted(subproc, new ForeignException(getMemberName(), iae)); } catch (IllegalStateException ise) { LOG.error("Illegal state exception ", ise); sendMemberAborted(subproc, new ForeignException(getMemberName(), ise)); } catch (KeeperException e) { member.controllerConnectionFailure( "Failed to get data for new procedure:" + opName, e, opName); } catch (InterruptedException e) { member.controllerConnectionFailure( "Failed to get data for new procedure:" + opName, e, opName); Thread.currentThread().interrupt(); } }
/** * This attempts to create an acquired state znode for the procedure (snapshot name). * * <p>It then looks for the reached znode to trigger in-barrier execution. If not present we have * a watcher, if present then trigger the in-barrier action. */ @Override public void sendMemberAcquired(Subprocedure sub) throws IOException { String procName = sub.getName(); try { LOG.debug( "Member: '" + memberName + "' joining acquired barrier for procedure (" + procName + ") in zk"); String acquiredZNode = ZKUtil.joinZNode( ZKProcedureUtil.getAcquireBarrierNode(zkController, procName), memberName); ZKUtil.createAndFailSilent(zkController.getWatcher(), acquiredZNode); // watch for the complete node for this snapshot String reachedBarrier = zkController.getReachedBarrierNode(procName); LOG.debug("Watch for global barrier reached:" + reachedBarrier); if (ZKUtil.watchAndCheckExists(zkController.getWatcher(), reachedBarrier)) { receivedReachedGlobalBarrier(reachedBarrier); } } catch (KeeperException e) { member.controllerConnectionFailure( "Failed to acquire barrier for procedure: " + procName + " and member: " + memberName, e, procName); } }
private void waitForNewProcedures() { // watch for new procedues that we need to start subprocedures for LOG.debug("Looking for new procedures under znode:'" + zkController.getAcquiredBarrier() + "'"); List<String> runningProcedures = null; try { runningProcedures = ZKUtil.listChildrenAndWatchForNewChildren( zkController.getWatcher(), zkController.getAcquiredBarrier()); if (runningProcedures == null) { LOG.debug("No running procedures."); return; } } catch (KeeperException e) { member.controllerConnectionFailure( "General failure when watching for new procedures", e, null); } if (runningProcedures == null) { LOG.debug("No running procedures."); return; } for (String procName : runningProcedures) { // then read in the procedure information String path = ZKUtil.joinZNode(zkController.getAcquiredBarrier(), procName); startNewSubprocedure(path); } }
private void watchForAbortedProcedures() { LOG.debug("Checking for aborted procedures on node: '" + zkController.getAbortZnode() + "'"); try { // this is the list of the currently aborted procedues for (String node : ZKUtil.listChildrenAndWatchForNewChildren( zkController.getWatcher(), zkController.getAbortZnode())) { String abortNode = ZKUtil.joinZNode(zkController.getAbortZnode(), node); abort(abortNode); } } catch (KeeperException e) { member.controllerConnectionFailure( "Failed to list children for abort node:" + zkController.getAbortZnode(), e, null); } }
/** This acts as the ack for a completed procedure */ @Override public void sendMemberCompleted(Subprocedure sub, byte[] data) throws IOException { String procName = sub.getName(); LOG.debug( "Marking procedure '" + procName + "' completed for member '" + memberName + "' in zk"); String joinPath = ZKUtil.joinZNode(zkController.getReachedBarrierNode(procName), memberName); // ProtobufUtil.prependPBMagic does not take care of null if (data == null) { data = new byte[0]; } try { ZKUtil.createAndFailSilent( zkController.getWatcher(), joinPath, ProtobufUtil.prependPBMagic(data)); } catch (KeeperException e) { member.controllerConnectionFailure( "Failed to post zk node:" + joinPath + " to join procedure barrier.", e, procName); } }
/** * Pass along the found abort notification to the listener * * @param abortZNode full znode path to the failed procedure information */ protected void abort(String abortZNode) { LOG.debug("Aborting procedure member for znode " + abortZNode); String opName = ZKUtil.getNodeName(abortZNode); try { byte[] data = ZKUtil.getData(zkController.getWatcher(), abortZNode); // figure out the data we need to pass ForeignException ee; try { if (data == null || data.length == 0) { // ignore return; } else if (!ProtobufUtil.isPBMagicPrefix(data)) { String msg = "Illegally formatted data in abort node for proc " + opName + ". Killing the procedure."; LOG.error(msg); // we got a remote exception, but we can't describe it so just return exn from here ee = new ForeignException(getMemberName(), new IllegalArgumentException(msg)); } else { data = Arrays.copyOfRange(data, ProtobufUtil.lengthOfPBMagic(), data.length); ee = ForeignException.deserialize(data); } } catch (InvalidProtocolBufferException e) { LOG.warn( "Got an error notification for op:" + opName + " but we can't read the information. Killing the procedure."); // we got a remote exception, but we can't describe it so just return exn from here ee = new ForeignException(getMemberName(), e); } this.member.receiveAbortProcedure(opName, ee); } catch (KeeperException e) { member.controllerConnectionFailure( "Failed to get data for abort znode:" + abortZNode + zkController.getAbortZnode(), e, opName); } catch (InterruptedException e) { LOG.warn("abort already in progress", e); Thread.currentThread().interrupt(); } }
/** * This should be called by the member and should write a serialized root cause exception as to * the abort znode. */ @Override public void sendMemberAborted(Subprocedure sub, ForeignException ee) { if (sub == null) { LOG.error("Failed due to null subprocedure", ee); return; } String procName = sub.getName(); LOG.debug("Aborting procedure (" + procName + ") in zk"); String procAbortZNode = zkController.getAbortZNode(procName); try { String source = (ee.getSource() == null) ? memberName : ee.getSource(); byte[] errorInfo = ProtobufUtil.prependPBMagic(ForeignException.serialize(source, ee)); ZKUtil.createAndFailSilent(zkController.getWatcher(), procAbortZNode, errorInfo); LOG.debug("Finished creating abort znode:" + procAbortZNode); } catch (KeeperException e) { // possible that we get this error for the procedure if we already reset the zk state, but in // that case we should still get an error for that procedure anyways zkController.logZKTree(zkController.getBaseZnode()); member.controllerConnectionFailure( "Failed to post zk node:" + procAbortZNode + " to abort procedure", e, procName); } }