/** * Get latest water mark from previous work unit states. * * @param state Source state * @return latest water mark (high water mark) */ private long getLatestWatermarkFromMetadata(SourceState state) { LOG.debug("Get latest watermark from the previous run"); long latestWaterMark = ConfigurationKeys.DEFAULT_WATERMARK_VALUE; List<WorkUnitState> previousWorkUnitStates = Lists.newArrayList(state.getPreviousWorkUnitStates()); List<Long> previousWorkUnitStateHighWatermarks = Lists.newArrayList(); List<Long> previousWorkUnitLowWatermarks = Lists.newArrayList(); if (previousWorkUnitStates.isEmpty()) { LOG.info( "No previous work unit states found; Latest watermark - Default watermark: " + latestWaterMark); return latestWaterMark; } boolean hasFailedRun = false; boolean isCommitOnFullSuccess = false; boolean isDataProcessedInPreviousRun = false; JobCommitPolicy commitPolicy = JobCommitPolicy.forName( state.getProp( ConfigurationKeys.JOB_COMMIT_POLICY_KEY, ConfigurationKeys.DEFAULT_JOB_COMMIT_POLICY)); if (commitPolicy == JobCommitPolicy.COMMIT_ON_FULL_SUCCESS) { isCommitOnFullSuccess = true; } for (WorkUnitState workUnitState : previousWorkUnitStates) { long processedRecordCount = 0; LOG.info( "State of the previous task: " + workUnitState.getId() + ":" + workUnitState.getWorkingState()); if (workUnitState.getWorkingState() == WorkingState.FAILED || workUnitState.getWorkingState() == WorkingState.CANCELLED || workUnitState.getWorkingState() == WorkingState.RUNNING || workUnitState.getWorkingState() == WorkingState.PENDING) { hasFailedRun = true; } else { processedRecordCount = workUnitState.getPropAsLong(ConfigurationKeys.EXTRACTOR_ROWS_EXPECTED); if (processedRecordCount != 0) { isDataProcessedInPreviousRun = true; } } LOG.info( "Low watermark of the previous task: " + workUnitState.getId() + ":" + workUnitState.getWorkunit().getLowWaterMark()); LOG.info( "High watermark of the previous task: " + workUnitState.getId() + ":" + workUnitState.getHighWaterMark()); LOG.info("Record count of the previous task: " + processedRecordCount + "\n"); // Consider high water mark of the previous work unit, if it is // extracted any data if (processedRecordCount != 0) { previousWorkUnitStateHighWatermarks.add(workUnitState.getHighWaterMark()); } previousWorkUnitLowWatermarks.add(this.getLowWatermarkFromWorkUnit(workUnitState)); } // If commit policy is full and it has failed run, get latest water mark // as // minimum of low water marks from previous states. if (isCommitOnFullSuccess && hasFailedRun) { long previousLowWatermark = Collections.min(previousWorkUnitLowWatermarks); WorkUnitState previousState = previousWorkUnitStates.get(0); ExtractType extractType = ExtractType.valueOf( previousState .getProp(ConfigurationKeys.SOURCE_QUERYBASED_EXTRACT_TYPE) .toUpperCase()); // add backup seconds only for snapshot extracts but not for appends if (extractType == ExtractType.SNAPSHOT) { int backupSecs = previousState.getPropAsInt( ConfigurationKeys.SOURCE_QUERYBASED_LOW_WATERMARK_BACKUP_SECS, 0); String watermarkType = previousState.getProp(ConfigurationKeys.SOURCE_QUERYBASED_WATERMARK_TYPE); latestWaterMark = this.addBackedUpSeconds(previousLowWatermark, backupSecs, watermarkType); } else { latestWaterMark = previousLowWatermark; } LOG.info( "Previous job was COMMIT_ON_FULL_SUCCESS but it was failed; Latest watermark - " + "Min watermark from WorkUnits: " + latestWaterMark); } // If commit policy is full and there are no failed tasks or commit // policy is partial, // get latest water mark as maximum of high water marks from previous // tasks. else { if (isDataProcessedInPreviousRun) { latestWaterMark = Collections.max(previousWorkUnitStateHighWatermarks); LOG.info( "Previous run was successful. Latest watermark - Max watermark from WorkUnitStates: " + latestWaterMark); } else { latestWaterMark = Collections.min(previousWorkUnitLowWatermarks); LOG.info( "Previous run was successful but no data found. Latest watermark - Min watermark from WorkUnitStates: " + latestWaterMark); } } return latestWaterMark; }
protected void publishData( WorkUnitState workUnitState, int branchId, Set<Path> writerOutputPathsMoved) throws IOException { // Get a ParallelRunner instance for moving files in parallel ParallelRunner parallelRunner = this.getParallelRunner(this.fileSystemByBranches.get(branchId)); // The directory where the workUnitState wrote its output data. Path writerOutputDir = WriterUtils.getWriterOutputDir(workUnitState, this.numBranches, branchId); if (writerOutputPathsMoved.contains(writerOutputDir)) { // This writer output path has already been moved for another task of the same extract return; } if (!this.fileSystemByBranches.get(branchId).exists(writerOutputDir)) { LOG.warn( String.format( "Branch %d of WorkUnit %s produced no data", branchId, workUnitState.getId())); return; } // The directory where the final output directory for this job will be placed. // It is a combination of DATA_PUBLISHER_FINAL_DIR and WRITER_FILE_PATH. Path publisherOutputDir = getPublisherOutputDir(workUnitState, branchId); if (this.fileSystemByBranches.get(branchId).exists(publisherOutputDir)) { // The final output directory already exists, check if the job is configured to replace it. boolean replaceFinalOutputDir = this.getState() .getPropAsBoolean( ForkOperatorUtils.getPropertyNameForBranch( ConfigurationKeys.DATA_PUBLISHER_REPLACE_FINAL_DIR, this.numBranches, branchId)); // If the final output directory is not configured to be replaced, put new data to the // existing directory. if (!replaceFinalOutputDir) { addWriterOutputToExistingDir( writerOutputDir, publisherOutputDir, workUnitState, branchId, parallelRunner); writerOutputPathsMoved.add(writerOutputDir); return; } // Delete the final output directory if it is configured to be replaced this.fileSystemByBranches.get(branchId).delete(publisherOutputDir, true); } else { // Create the parent directory of the final output directory if it does not exist WriterUtils.mkdirsWithRecursivePermission( this.fileSystemByBranches.get(branchId), publisherOutputDir.getParent(), this.permissions.get(branchId)); } LOG.info(String.format("Moving %s to %s", writerOutputDir, publisherOutputDir)); parallelRunner.renamePath( writerOutputDir, publisherOutputDir, this.publisherFinalDirOwnerGroupsByBranches.get(branchId)); writerOutputPathsMoved.add(writerOutputDir); }