/** Process a (possibly converted) record. */ @SuppressWarnings("unchecked") private void processRecord( Object convertedRecord, ForkOperator forkOperator, RowLevelPolicyChecker rowChecker, RowLevelPolicyCheckResults rowResults, int branches) throws Exception { // Skip the record if quality checking fails if (!rowChecker.executePolicies(convertedRecord, rowResults)) { return; } List<Boolean> forkedRecords = forkOperator.forkDataRecord(this.taskState, convertedRecord); if (forkedRecords.size() != branches) { throw new ForkBranchMismatchException( String.format( "Number of forked data records [%d] is not equal to number of branches [%d]", forkedRecords.size(), branches)); } if (inMultipleBranches(forkedRecords) && !(convertedRecord instanceof Copyable)) { throw new CopyNotSupportedException(convertedRecord + " is not copyable"); } // If the record has been successfully put into the queues of every forks boolean allPutsSucceeded = false; // Use an array of primitive boolean type to avoid unnecessary boxing/unboxing boolean[] succeededPuts = new boolean[branches]; // Put the record into the record queue of each fork. A put may timeout and return a false, in // which // case the put needs to be retried in the next iteration along with other failed puts. This // goes on // until all puts succeed, at which point the task moves to the next record. while (!allPutsSucceeded) { allPutsSucceeded = true; for (int i = 0; i < branches; i++) { if (succeededPuts[i]) { continue; } if (this.forks.get(i).isPresent() && forkedRecords.get(i)) { boolean succeeded = this.forks .get(i) .get() .putRecord( convertedRecord instanceof Copyable ? ((Copyable) convertedRecord).copy() : convertedRecord); succeededPuts[i] = succeeded; if (!succeeded) { allPutsSucceeded = false; } } else { succeededPuts[i] = true; } } } }
@Override @SuppressWarnings("unchecked") public void run() { long startTime = System.currentTimeMillis(); this.taskState.setStartTime(startTime); this.taskState.setWorkingState(WorkUnitState.WorkingState.RUNNING); // Clear the list so it starts with a fresh list of forks for each run/retry this.forks.clear(); Closer closer = Closer.create(); Converter converter = null; InstrumentedExtractorBase extractor = null; RowLevelPolicyChecker rowChecker = null; try { extractor = closer.register( new InstrumentedExtractorDecorator(this.taskState, this.taskContext.getExtractor())); converter = closer.register(new MultiConverter(this.taskContext.getConverters())); // Get the fork operator. By default IdentityForkOperator is used with a single branch. ForkOperator forkOperator = closer.register(this.taskContext.getForkOperator()); forkOperator.init(this.taskState); int branches = forkOperator.getBranches(this.taskState); // Set fork.branches explicitly here so the rest task flow can pick it up this.taskState.setProp(ConfigurationKeys.FORK_BRANCHES_KEY, branches); // Extract, convert, and fork the source schema. Object schema = converter.convertSchema(extractor.getSchema(), this.taskState); List<Boolean> forkedSchemas = forkOperator.forkSchema(this.taskState, schema); if (forkedSchemas.size() != branches) { throw new ForkBranchMismatchException( String.format( "Number of forked schemas [%d] is not equal to number of branches [%d]", forkedSchemas.size(), branches)); } if (inMultipleBranches(forkedSchemas) && !(schema instanceof Copyable)) { throw new CopyNotSupportedException(schema + " is not copyable"); } // Create one fork for each forked branch for (int i = 0; i < branches; i++) { if (forkedSchemas.get(i)) { Fork fork = closer.register( new Fork( this.taskContext, schema instanceof Copyable ? ((Copyable) schema).copy() : schema, branches, i)); // Run the Fork this.forkCompletionService.submit(fork, fork); this.forks.add(Optional.of(fork)); } else { this.forks.add(Optional.<Fork>absent()); } } // Build the row-level quality checker rowChecker = closer.register(this.taskContext.getRowLevelPolicyChecker()); RowLevelPolicyCheckResults rowResults = new RowLevelPolicyCheckResults(); long recordsPulled = 0; Object record; // Extract, convert, and fork one source record at a time. while ((record = extractor.readRecord(null)) != null) { recordsPulled++; for (Object convertedRecord : converter.convertRecord(schema, record, this.taskState)) { processRecord(convertedRecord, forkOperator, rowChecker, rowResults, branches); } } LOG.info("Extracted " + recordsPulled + " data records"); LOG.info("Row quality checker finished with results: " + rowResults.getResults()); this.taskState.setProp(ConfigurationKeys.EXTRACTOR_ROWS_EXTRACTED, recordsPulled); this.taskState.setProp( ConfigurationKeys.EXTRACTOR_ROWS_EXPECTED, extractor.getExpectedRecordCount()); for (Optional<Fork> fork : this.forks) { if (fork.isPresent()) { // Tell the fork that the main branch is completed and no new incoming data records should // be expected fork.get().markParentTaskDone(); } } for (Optional<Fork> fork : this.forks) { if (fork.isPresent()) { try { this.forkCompletionService.take(); } catch (InterruptedException ie) { Thread.currentThread().interrupt(); } } } // Check if all forks succeeded boolean allForksSucceeded = true; for (Optional<Fork> fork : this.forks) { if (fork.isPresent()) { if (fork.get().isSucceeded()) { if (!fork.get().commit()) { allForksSucceeded = false; } } else { allForksSucceeded = false; } } } if (allForksSucceeded) { // Set the task state to SUCCESSFUL. The state is not set to COMMITTED // as the data publisher will do that upon successful data publishing. this.taskState.setWorkingState(WorkUnitState.WorkingState.SUCCESSFUL); } else { LOG.error(String.format("Not all forks of task %s succeeded", this.taskId)); this.taskState.setWorkingState(WorkUnitState.WorkingState.FAILED); } } catch (Throwable t) { failTask(t); } finally { addConstructsFinalStateToTaskState(extractor, converter, rowChecker); this.taskState.setProp(ConfigurationKeys.WRITER_RECORDS_WRITTEN, getRecordsWritten()); this.taskState.setProp(ConfigurationKeys.WRITER_BYTES_WRITTEN, getBytesWritten()); try { closer.close(); } catch (Throwable t) { LOG.error("Failed to close all open resources", t); } try { if (shouldPublishDataInTask()) { // If data should be published by the task, publish the data and set the task state to // COMMITTED. // Task data can only be published after all forks have been closed by closer.close(). publishTaskData(); this.taskState.setWorkingState(WorkUnitState.WorkingState.COMMITTED); } } catch (IOException ioe) { failTask(ioe); } finally { long endTime = System.currentTimeMillis(); this.taskState.setEndTime(endTime); this.taskState.setTaskDuration(endTime - startTime); this.taskStateTracker.onTaskCompletion(this); } } }