public TableLevelWatermarker(State state) {
    this.tableWatermarks = Maps.newHashMap();

    // Load previous watermarks in case of sourceState
    if (state instanceof SourceState) {
      SourceState sourceState = (SourceState) state;
      for (Map.Entry<String, Iterable<WorkUnitState>> datasetWorkUnitStates :
          sourceState.getPreviousWorkUnitStatesByDatasetUrns().entrySet()) {

        // Use the minimum of all previous watermarks for this dataset
        List<LongWatermark> previousWatermarks =
            FluentIterable.from(datasetWorkUnitStates.getValue())
                .filter(Predicates.not(PartitionLevelWatermarker.WATERMARK_WORKUNIT_PREDICATE))
                .transform(
                    new Function<WorkUnitState, LongWatermark>() {
                      @Override
                      public LongWatermark apply(WorkUnitState w) {
                        return w.getActualHighWatermark(LongWatermark.class);
                      }
                    })
                .toList();

        if (!previousWatermarks.isEmpty()) {
          this.tableWatermarks.put(
              datasetWorkUnitStates.getKey(), Collections.min(previousWatermarks));
        }
      }
      log.info("Loaded table watermarks from previous state " + this.tableWatermarks);
    }
  }
示例#2
0
 /**
  * Initialize the logger.
  *
  * @param state Source state
  */
 private void initLogger(SourceState state) {
   StringBuilder sb = new StringBuilder();
   sb.append("[");
   sb.append(StringUtils.stripToEmpty(state.getProp(ConfigurationKeys.SOURCE_QUERYBASED_SCHEMA)));
   sb.append("_");
   sb.append(StringUtils.stripToEmpty(state.getProp(ConfigurationKeys.SOURCE_ENTITY)));
   sb.append("]");
   MDC.put("sourceInfo", sb.toString());
 }
示例#3
0
  @Override
  public List<WorkUnit> getWorkunits(SourceState state) {
    initLogger(state);

    List<WorkUnit> workUnits = Lists.newArrayList();
    String nameSpaceName = state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY);
    String entityName = state.getProp(ConfigurationKeys.SOURCE_ENTITY);

    String extractTableName = state.getProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY);
    // If extract table name is not found then use the entity name
    if (StringUtils.isBlank(extractTableName)) {
      extractTableName =
          Utils.escapeSpecialCharacters(
              entityName, ConfigurationKeys.ESCAPE_CHARS_IN_TABLE_NAME, "_");
    }

    TableType tableType =
        TableType.valueOf(state.getProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY).toUpperCase());
    long previousWatermark = this.getLatestWatermarkFromMetadata(state);

    Map<Long, Long> sortedPartitions = Maps.newTreeMap();
    sortedPartitions.putAll(new Partitioner(state).getPartitions(previousWatermark));

    // Use extract table name to create extract
    SourceState partitionState = new SourceState();
    partitionState.addAll(state);
    Extract extract = partitionState.createExtract(tableType, nameSpaceName, extractTableName);

    // Setting current time for the full extract
    if (Boolean.valueOf(state.getProp(ConfigurationKeys.EXTRACT_IS_FULL_KEY))) {
      extract.setFullTrue(System.currentTimeMillis());
    }

    for (Entry<Long, Long> entry : sortedPartitions.entrySet()) {
      partitionState.setProp(ConfigurationKeys.WORK_UNIT_LOW_WATER_MARK_KEY, entry.getKey());
      partitionState.setProp(ConfigurationKeys.WORK_UNIT_HIGH_WATER_MARK_KEY, entry.getValue());
      workUnits.add(partitionState.createWorkUnit(extract));
    }

    LOG.info("Total number of work units for the current run: " + workUnits.size());

    List<WorkUnit> previousWorkUnits = this.getPreviousWorkUnitsForRetry(state);
    LOG.info("Total number of incomplete tasks from the previous run: " + previousWorkUnits.size());
    workUnits.addAll(previousWorkUnits);

    return workUnits;
  }
示例#4
0
  /**
   * Get latest water mark from previous work unit states.
   *
   * @param state Source state
   * @return latest water mark (high water mark)
   */
  private long getLatestWatermarkFromMetadata(SourceState state) {
    LOG.debug("Get latest watermark from the previous run");
    long latestWaterMark = ConfigurationKeys.DEFAULT_WATERMARK_VALUE;

    List<WorkUnitState> previousWorkUnitStates =
        Lists.newArrayList(state.getPreviousWorkUnitStates());
    List<Long> previousWorkUnitStateHighWatermarks = Lists.newArrayList();
    List<Long> previousWorkUnitLowWatermarks = Lists.newArrayList();

    if (previousWorkUnitStates.isEmpty()) {
      LOG.info(
          "No previous work unit states found; Latest watermark - Default watermark: "
              + latestWaterMark);
      return latestWaterMark;
    }

    boolean hasFailedRun = false;
    boolean isCommitOnFullSuccess = false;
    boolean isDataProcessedInPreviousRun = false;

    JobCommitPolicy commitPolicy =
        JobCommitPolicy.forName(
            state.getProp(
                ConfigurationKeys.JOB_COMMIT_POLICY_KEY,
                ConfigurationKeys.DEFAULT_JOB_COMMIT_POLICY));
    if (commitPolicy == JobCommitPolicy.COMMIT_ON_FULL_SUCCESS) {
      isCommitOnFullSuccess = true;
    }

    for (WorkUnitState workUnitState : previousWorkUnitStates) {
      long processedRecordCount = 0;
      LOG.info(
          "State of the previous task: "
              + workUnitState.getId()
              + ":"
              + workUnitState.getWorkingState());
      if (workUnitState.getWorkingState() == WorkingState.FAILED
          || workUnitState.getWorkingState() == WorkingState.CANCELLED
          || workUnitState.getWorkingState() == WorkingState.RUNNING
          || workUnitState.getWorkingState() == WorkingState.PENDING) {
        hasFailedRun = true;
      } else {
        processedRecordCount =
            workUnitState.getPropAsLong(ConfigurationKeys.EXTRACTOR_ROWS_EXPECTED);
        if (processedRecordCount != 0) {
          isDataProcessedInPreviousRun = true;
        }
      }

      LOG.info(
          "Low watermark of the previous task: "
              + workUnitState.getId()
              + ":"
              + workUnitState.getWorkunit().getLowWaterMark());
      LOG.info(
          "High watermark of the previous task: "
              + workUnitState.getId()
              + ":"
              + workUnitState.getHighWaterMark());
      LOG.info("Record count of the previous task: " + processedRecordCount + "\n");

      // Consider high water mark of the previous work unit, if it is
      // extracted any data
      if (processedRecordCount != 0) {
        previousWorkUnitStateHighWatermarks.add(workUnitState.getHighWaterMark());
      }

      previousWorkUnitLowWatermarks.add(this.getLowWatermarkFromWorkUnit(workUnitState));
    }

    // If commit policy is full and it has failed run, get latest water mark
    // as
    // minimum of low water marks from previous states.
    if (isCommitOnFullSuccess && hasFailedRun) {
      long previousLowWatermark = Collections.min(previousWorkUnitLowWatermarks);

      WorkUnitState previousState = previousWorkUnitStates.get(0);
      ExtractType extractType =
          ExtractType.valueOf(
              previousState
                  .getProp(ConfigurationKeys.SOURCE_QUERYBASED_EXTRACT_TYPE)
                  .toUpperCase());

      // add backup seconds only for snapshot extracts but not for appends
      if (extractType == ExtractType.SNAPSHOT) {
        int backupSecs =
            previousState.getPropAsInt(
                ConfigurationKeys.SOURCE_QUERYBASED_LOW_WATERMARK_BACKUP_SECS, 0);
        String watermarkType =
            previousState.getProp(ConfigurationKeys.SOURCE_QUERYBASED_WATERMARK_TYPE);
        latestWaterMark = this.addBackedUpSeconds(previousLowWatermark, backupSecs, watermarkType);
      } else {
        latestWaterMark = previousLowWatermark;
      }

      LOG.info(
          "Previous job was COMMIT_ON_FULL_SUCCESS but it was failed; Latest watermark - "
              + "Min watermark from WorkUnits: "
              + latestWaterMark);
    }

    // If commit policy is full and there are no failed tasks or commit
    // policy is partial,
    // get latest water mark as maximum of high water marks from previous
    // tasks.
    else {
      if (isDataProcessedInPreviousRun) {
        latestWaterMark = Collections.max(previousWorkUnitStateHighWatermarks);
        LOG.info(
            "Previous run was successful. Latest watermark - Max watermark from WorkUnitStates: "
                + latestWaterMark);
      } else {
        latestWaterMark = Collections.min(previousWorkUnitLowWatermarks);
        LOG.info(
            "Previous run was successful but no data found. Latest watermark - Min watermark from WorkUnitStates: "
                + latestWaterMark);
      }
    }

    return latestWaterMark;
  }