@BeforeClass
public void setUp() {
  WorkUnitState workUnitState = new WorkUnitState();
  workUnitState.setProp(ConfigurationKeys.JOB_ID_KEY, "Job-1");
  workUnitState.setProp(ConfigurationKeys.TASK_ID_KEY, "Task-1");
  this.taskState = new TaskState(workUnitState);
}
/**
 * Use the resource key (optional) and REST JSON entry as a template, filling in the template
 * using the Avro record as a reference.
 *
 * <p>e.g. REST JSON entry HOCON template:
 *   AccountId=${sf_account_id},Member_Id__c=${member_id}
 * Avro:
 *   {"sf_account_id":{"string":"0016000000UiCYHAA3"},"member_id":{"long":296458833}}
 * Converted JSON:
 *   {"AccountId":"0016000000UiCYHAA3","Member_Id__c":296458833}
 *
 * <p>As this is a template-based approach, it can produce a nested JSON structure even when the
 * Avro record is flat (or vice versa).
 *
 * <p>e.g. REST resource template:
 *   /sobject/account/memberId/${member_id}
 * Avro:
 *   {"sf_account_id":{"string":"0016000000UiCYHAA3"},"member_id":{"long":296458833}}
 * Converted resource:
 *   /sobject/account/memberId/296458833
 *
 * <p>The converted resource is used to form the endpoint, e.g.
 * http://www.server.com:9090/sobject/account/memberId/296458833
 *
 * <p>{@inheritDoc}
 *
 * @see gobblin.converter.Converter#convertRecord(java.lang.Object, java.lang.Object,
 *     gobblin.configuration.WorkUnitState)
 */
@Override
public Iterable<RestEntry<JsonObject>> convertRecord(
    Void outputSchema, GenericRecord inputRecord, WorkUnitState workUnit)
    throws DataConversionException {
  Config srcConfig =
      ConfigFactory.parseString(
          inputRecord.toString(), ConfigParseOptions.defaults().setSyntax(ConfigSyntax.JSON));

  String resourceKey = workUnit.getProp(CONVERTER_AVRO_REST_ENTRY_RESOURCE_KEY, "");
  if (!StringUtils.isEmpty(resourceKey)) {
    final String dummyKey = "DUMMY";
    Config tmpConfig =
        ConfigFactory.parseString(dummyKey + "=" + resourceKey).resolveWith(srcConfig);
    resourceKey = tmpConfig.getString(dummyKey);
  }

  String hoconInput = workUnit.getProp(CONVERTER_AVRO_REST_JSON_ENTRY_TEMPLATE);
  if (StringUtils.isEmpty(hoconInput)) {
    return new SingleRecordIterable<>(
        new RestEntry<>(resourceKey, parser.parse(inputRecord.toString()).getAsJsonObject()));
  }

  Config destConfig = ConfigFactory.parseString(hoconInput).resolveWith(srcConfig);
  JsonObject json =
      parser.parse(destConfig.root().render(ConfigRenderOptions.concise())).getAsJsonObject();
  return new SingleRecordIterable<>(new RestEntry<>(resourceKey, json));
}
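// A minimal standalone sketch of the Typesafe Config substitution the converter above relies on.
// The class name, record, and template literals are illustrative; note that the ${...} paths here
// drill into the Avro JSON type wrapper ("string"/"long"), which the converter's own templates may
// handle differently.
import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory;
import com.typesafe.config.ConfigParseOptions;
import com.typesafe.config.ConfigRenderOptions;
import com.typesafe.config.ConfigSyntax;

public class TemplateResolutionSketch {
  public static void main(String[] args) {
    // The JSON form of an Avro record supplies the substitution values.
    String avroJson =
        "{\"sf_account_id\":{\"string\":\"0016000000UiCYHAA3\"},\"member_id\":{\"long\":296458833}}";
    Config src =
        ConfigFactory.parseString(
            avroJson, ConfigParseOptions.defaults().setSyntax(ConfigSyntax.JSON));

    // HOCON template referencing fields of the record.
    String template = "AccountId=${sf_account_id.string},Member_Id__c=${member_id.long}";
    Config dest = ConfigFactory.parseString(template).resolveWith(src);

    // Prints {"AccountId":"0016000000UiCYHAA3","Member_Id__c":296458833} (key order may vary)
    System.out.println(dest.root().render(ConfigRenderOptions.concise()));
  }
}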
protected void addWriterOutputToExistingDir(
    Path writerOutputDir,
    Path publisherOutputDir,
    WorkUnitState workUnitState,
    int branchId,
    ParallelRunner parallelRunner)
    throws IOException {
  boolean preserveFileName =
      workUnitState.getPropAsBoolean(
          ForkOperatorUtils.getPropertyNameForBranch(
              ConfigurationKeys.SOURCE_FILEBASED_PRESERVE_FILE_NAME, this.numBranches, branchId),
          false);

  // Go through each file in writerOutputDir and move it into publisherOutputDir
  for (FileStatus status : this.fileSystemByBranches.get(branchId).listStatus(writerOutputDir)) {
    // Preserve the file name if configured, use the specified name otherwise
    Path finalOutputPath =
        preserveFileName
            ? new Path(
                publisherOutputDir,
                workUnitState.getProp(
                    ForkOperatorUtils.getPropertyNameForBranch(
                        ConfigurationKeys.DATA_PUBLISHER_FINAL_NAME, this.numBranches, branchId)))
            : new Path(publisherOutputDir, status.getPath().getName());

    LOG.info(String.format("Moving %s to %s", status.getPath(), finalOutputPath));
    parallelRunner.renamePath(status.getPath(), finalOutputPath, Optional.<String>absent());
  }
}
public TaskState(WorkUnitState workUnitState) {
  // Since getWorkunit() returns an immutable WorkUnit object,
  // the WorkUnit object in this object is also immutable.
  super(workUnitState.getWorkunit());
  addAll(workUnitState);
  this.jobId = workUnitState.getProp(ConfigurationKeys.JOB_ID_KEY);
  this.taskId = workUnitState.getProp(ConfigurationKeys.TASK_ID_KEY);
  this.setId(this.taskId);
}
public KafkaAvroExtractor(WorkUnitState state) {
  super(state);
  this.schemaRegistry =
      state.contains(KafkaSchemaRegistry.KAFKA_SCHEMA_REGISTRY_CLASS)
          ? Optional.of(KafkaSchemaRegistry.<K, Schema>get(state.getProperties()))
          : Optional.<KafkaSchemaRegistry<K, Schema>>absent();
  this.schema = getExtractorSchema();
  if (this.schema.isPresent()) {
    this.reader = Optional.of(new GenericDatumReader<Record>(this.schema.get()));
  } else {
    log.error(
        String.format(
            "Cannot find latest schema for topic %s. This topic will be skipped",
            this.topicName));
    this.reader = Optional.absent();
  }
}
@Override
public void publishData(Collection<? extends WorkUnitState> states) throws IOException {
  // We need a Set to collect unique writer output paths, as multiple tasks may belong to the
  // same extract. Tasks that belong to the same Extract will by default have the same output
  // directory.
  Set<Path> writerOutputPathsMoved = Sets.newHashSet();

  for (WorkUnitState workUnitState : states) {
    for (int branchId = 0; branchId < this.numBranches; branchId++) {
      publishData(workUnitState, branchId, writerOutputPathsMoved);
    }
    // Upon successfully committing the data to the final output directory, set the states of
    // successful tasks to COMMITTED, leaving the states of unsuccessful ones unchanged.
    // This matters for the COMMIT_ON_PARTIAL_SUCCESS policy.
    workUnitState.setWorkingState(WorkUnitState.WorkingState.COMMITTED);
  }
}
/** Test for {@link ForkOperatorUtils#getPropertyNameForBranch(WorkUnitState, String)}. */
@Test
public void testGetPropertyNameForBranchWithWorkUnitState() {
  WorkUnitState workUnitState = new WorkUnitState();
  workUnitState.setProp(PROPERTY_FOO, PATH_FOO);

  // Test that if the fork id key is not specified, the original property is preserved
  Assert.assertEquals(
      ForkOperatorUtils.getPropertyNameForBranch(workUnitState, PROPERTY_FOO), PROPERTY_FOO);

  // Test that if the fork id key is set to -1, the original property is preserved
  workUnitState.setProp(ConfigurationKeys.FORK_BRANCH_ID_KEY, -1);
  Assert.assertEquals(
      ForkOperatorUtils.getPropertyNameForBranch(workUnitState, PROPERTY_FOO), PROPERTY_FOO);

  // Test that if the fork id key is set to 0, the new property is properly created
  workUnitState.setProp(ConfigurationKeys.FORK_BRANCH_ID_KEY, 0);
  Assert.assertEquals(
      ForkOperatorUtils.getPropertyNameForBranch(workUnitState, PROPERTY_FOO),
      PROPERTY_FOO + ".0");
}
@Override
public void write(DataOutput out) throws IOException {
  Text text = new Text();
  text.set(this.jobId);
  text.write(out);
  text.set(this.taskId);
  text.write(out);
  out.writeLong(this.startTime);
  out.writeLong(this.endTime);
  out.writeLong(this.duration);
  super.write(out);
}
@Override
public void readFields(DataInput in) throws IOException {
  Text text = new Text();
  text.readFields(in);
  this.jobId = text.toString();
  text.readFields(in);
  this.taskId = text.toString();
  this.setId(this.taskId);
  this.startTime = in.readLong();
  this.endTime = in.readLong();
  this.duration = in.readLong();
  super.readFields(in);
}
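// A minimal round-trip sketch for the Writable pair above (illustrative; assumes TaskState keeps
// the no-arg constructor that Hadoop's Writable contract expects for deserialization).
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

static TaskState roundTrip(TaskState original) throws IOException {
  ByteArrayOutputStream bytes = new ByteArrayOutputStream();
  original.write(new DataOutputStream(bytes));

  TaskState copy = new TaskState(); // no-arg constructor assumed, per the Writable contract
  copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));
  return copy; // jobId, taskId, and the timing fields now match the original
}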
@Test
public void testWrite() throws Exception {
  String streamString = "testContents";

  FileStatus status = fs.getFileStatus(testTempPath);
  OwnerAndPermission ownerAndPermission =
      new OwnerAndPermission(
          status.getOwner(),
          status.getGroup(),
          new FsPermission(FsAction.ALL, FsAction.ALL, FsAction.ALL));
  CopyableFile cf = CopyableFileUtils.getTestCopyableFile(ownerAndPermission);

  CopyableDatasetMetadata metadata =
      new CopyableDatasetMetadata(new TestCopyableDataset(new Path("/source")));

  WorkUnitState state = new WorkUnitState();
  state.setProp(
      ConfigurationKeys.WRITER_STAGING_DIR, new Path(testTempPath, "staging").toString());
  state.setProp(ConfigurationKeys.WRITER_OUTPUT_DIR, new Path(testTempPath, "output").toString());
  state.setProp(ConfigurationKeys.WRITER_FILE_PATH, RandomStringUtils.randomAlphabetic(5));

  CopySource.serializeCopyEntity(state, cf);
  CopySource.serializeCopyableDataset(state, metadata);

  FileAwareInputStreamDataWriter dataWriter = new FileAwareInputStreamDataWriter(state, 1, 0);

  FileAwareInputStream fileAwareInputStream =
      new FileAwareInputStream(cf, StreamUtils.convertStream(IOUtils.toInputStream(streamString)));
  dataWriter.write(fileAwareInputStream);
  dataWriter.commit();

  Path writtenFilePath =
      new Path(
          new Path(
              state.getProp(ConfigurationKeys.WRITER_OUTPUT_DIR),
              cf.getDatasetAndPartition(metadata).identifier()),
          cf.getDestination());
  Assert.assertEquals(
      IOUtils.toString(new FileInputStream(writtenFilePath.toString())), streamString);
}
/**
 * Get the low water mark from the given work unit state.
 *
 * @param workUnitState work unit state
 * @return latest low water mark
 */
private long getLowWatermarkFromWorkUnit(WorkUnitState workUnitState) {
  String watermarkType =
      workUnitState.getProp(
          ConfigurationKeys.SOURCE_QUERYBASED_WATERMARK_TYPE,
          ConfigurationKeys.DEFAULT_WATERMARK_TYPE);
  long lowWaterMark = workUnitState.getWorkunit().getLowWaterMark();

  if (lowWaterMark == ConfigurationKeys.DEFAULT_WATERMARK_VALUE) {
    return lowWaterMark;
  }

  WatermarkType wmType = WatermarkType.valueOf(watermarkType.toUpperCase());
  int deltaNum = new WatermarkPredicate(wmType).getDeltaNumForNextWatermark();

  switch (wmType) {
    case SIMPLE:
      return lowWaterMark - deltaNum;
    default:
      Date lowWaterMarkDate = Utils.toDate(lowWaterMark, "yyyyMMddHHmmss");
      return Long.parseLong(
          Utils.dateToString(
              Utils.addSecondsToDate(lowWaterMarkDate, deltaNum * -1), "yyyyMMddHHmmss"));
  }
}
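// A worked illustration of the two branches above (values are made up; the format string matches
// the method). For a SIMPLE watermark with deltaNum = 1, the adjustment is plain subtraction:
// 100 - 1 = 99. For a date-based watermark the method steps back deltaNum seconds instead:
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;

static long stepBackOneSecond(long watermark) throws ParseException {
  SimpleDateFormat fmt = new SimpleDateFormat("yyyyMMddHHmmss");
  Date low = fmt.parse(String.valueOf(watermark)); // e.g. 20160101000000
  Date adjusted = new Date(low.getTime() - 1000L); // equivalent to addSecondsToDate(low, -1)
  return Long.parseLong(fmt.format(adjusted));     // 20151231235959
}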
/**
 * Get the latest water mark from the previous work unit states.
 *
 * @param state source state
 * @return latest water mark (high water mark)
 */
private long getLatestWatermarkFromMetadata(SourceState state) {
  LOG.debug("Get latest watermark from the previous run");
  long latestWaterMark = ConfigurationKeys.DEFAULT_WATERMARK_VALUE;

  List<WorkUnitState> previousWorkUnitStates =
      Lists.newArrayList(state.getPreviousWorkUnitStates());
  List<Long> previousWorkUnitStateHighWatermarks = Lists.newArrayList();
  List<Long> previousWorkUnitLowWatermarks = Lists.newArrayList();

  if (previousWorkUnitStates.isEmpty()) {
    LOG.info(
        "No previous work unit states found; Latest watermark - Default watermark: "
            + latestWaterMark);
    return latestWaterMark;
  }

  boolean hasFailedRun = false;
  boolean isCommitOnFullSuccess = false;
  boolean isDataProcessedInPreviousRun = false;

  JobCommitPolicy commitPolicy =
      JobCommitPolicy.forName(
          state.getProp(
              ConfigurationKeys.JOB_COMMIT_POLICY_KEY,
              ConfigurationKeys.DEFAULT_JOB_COMMIT_POLICY));
  if (commitPolicy == JobCommitPolicy.COMMIT_ON_FULL_SUCCESS) {
    isCommitOnFullSuccess = true;
  }

  for (WorkUnitState workUnitState : previousWorkUnitStates) {
    long processedRecordCount = 0;
    LOG.info(
        "State of the previous task: "
            + workUnitState.getId()
            + ":"
            + workUnitState.getWorkingState());
    if (workUnitState.getWorkingState() == WorkingState.FAILED
        || workUnitState.getWorkingState() == WorkingState.CANCELLED
        || workUnitState.getWorkingState() == WorkingState.RUNNING
        || workUnitState.getWorkingState() == WorkingState.PENDING) {
      hasFailedRun = true;
    } else {
      processedRecordCount =
          workUnitState.getPropAsLong(ConfigurationKeys.EXTRACTOR_ROWS_EXPECTED);
      if (processedRecordCount != 0) {
        isDataProcessedInPreviousRun = true;
      }
    }

    LOG.info(
        "Low watermark of the previous task: "
            + workUnitState.getId()
            + ":"
            + workUnitState.getWorkunit().getLowWaterMark());
    LOG.info(
        "High watermark of the previous task: "
            + workUnitState.getId()
            + ":"
            + workUnitState.getHighWaterMark());
    LOG.info("Record count of the previous task: " + processedRecordCount + "\n");

    // Consider the high water mark of the previous work unit only if it extracted any data
    if (processedRecordCount != 0) {
      previousWorkUnitStateHighWatermarks.add(workUnitState.getHighWaterMark());
    }

    previousWorkUnitLowWatermarks.add(this.getLowWatermarkFromWorkUnit(workUnitState));
  }

  // If the commit policy is full and there is a failed run, take the latest water mark as the
  // minimum of the low water marks from the previous states.
  if (isCommitOnFullSuccess && hasFailedRun) {
    long previousLowWatermark = Collections.min(previousWorkUnitLowWatermarks);

    WorkUnitState previousState = previousWorkUnitStates.get(0);
    ExtractType extractType =
        ExtractType.valueOf(
            previousState
                .getProp(ConfigurationKeys.SOURCE_QUERYBASED_EXTRACT_TYPE)
                .toUpperCase());

    // Add backup seconds only for snapshot extracts, not for appends
    if (extractType == ExtractType.SNAPSHOT) {
      int backupSecs =
          previousState.getPropAsInt(
              ConfigurationKeys.SOURCE_QUERYBASED_LOW_WATERMARK_BACKUP_SECS, 0);
      String watermarkType =
          previousState.getProp(ConfigurationKeys.SOURCE_QUERYBASED_WATERMARK_TYPE);
      latestWaterMark = this.addBackedUpSeconds(previousLowWatermark, backupSecs, watermarkType);
    } else {
      latestWaterMark = previousLowWatermark;
    }

    LOG.info(
        "Previous job was COMMIT_ON_FULL_SUCCESS but it was failed; Latest watermark - "
            + "Min watermark from WorkUnits: "
            + latestWaterMark);
  }
  // If the commit policy is full and there are no failed tasks, or the commit policy is partial,
  // take the latest water mark as the maximum of the high water marks from the previous tasks.
  else {
    if (isDataProcessedInPreviousRun) {
      latestWaterMark = Collections.max(previousWorkUnitStateHighWatermarks);
      LOG.info(
          "Previous run was successful. Latest watermark - Max watermark from WorkUnitStates: "
              + latestWaterMark);
    } else {
      latestWaterMark = Collections.min(previousWorkUnitLowWatermarks);
      LOG.info(
          "Previous run was successful but no data found. Latest watermark - Min watermark from WorkUnitStates: "
              + latestWaterMark);
    }
  }

  return latestWaterMark;
}
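// A condensed sketch of the selection policy implemented above (illustrative helper, not part of
// the class; it omits the snapshot backup-seconds adjustment):
static long selectLatestWatermark(
    boolean commitOnFullSuccess,
    boolean hasFailedRun,
    boolean dataProcessed,
    List<Long> lowWatermarks,
    List<Long> highWatermarks) {
  if (commitOnFullSuccess && hasFailedRun) {
    return Collections.min(lowWatermarks); // re-extract from the earliest uncommitted point
  }
  return dataProcessed ? Collections.max(highWatermarks) : Collections.min(lowWatermarks);
}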
protected void publishData(
    WorkUnitState workUnitState, int branchId, Set<Path> writerOutputPathsMoved)
    throws IOException {
  // Get a ParallelRunner instance for moving files in parallel
  ParallelRunner parallelRunner = this.getParallelRunner(this.fileSystemByBranches.get(branchId));

  // The directory where the workUnitState wrote its output data.
  Path writerOutputDir = WriterUtils.getWriterOutputDir(workUnitState, this.numBranches, branchId);

  if (writerOutputPathsMoved.contains(writerOutputDir)) {
    // This writer output path has already been moved for another task of the same extract
    return;
  }

  if (!this.fileSystemByBranches.get(branchId).exists(writerOutputDir)) {
    LOG.warn(
        String.format("Branch %d of WorkUnit %s produced no data", branchId, workUnitState.getId()));
    return;
  }

  // The final output directory for this job: a combination of DATA_PUBLISHER_FINAL_DIR
  // and WRITER_FILE_PATH.
  Path publisherOutputDir = getPublisherOutputDir(workUnitState, branchId);

  if (this.fileSystemByBranches.get(branchId).exists(publisherOutputDir)) {
    // The final output directory already exists; check if the job is configured to replace it.
    boolean replaceFinalOutputDir =
        this.getState()
            .getPropAsBoolean(
                ForkOperatorUtils.getPropertyNameForBranch(
                    ConfigurationKeys.DATA_PUBLISHER_REPLACE_FINAL_DIR,
                    this.numBranches,
                    branchId));

    // If the final output directory is not configured to be replaced, put the new data into the
    // existing directory.
    if (!replaceFinalOutputDir) {
      addWriterOutputToExistingDir(
          writerOutputDir, publisherOutputDir, workUnitState, branchId, parallelRunner);
      writerOutputPathsMoved.add(writerOutputDir);
      return;
    }

    // Delete the final output directory if it is configured to be replaced
    this.fileSystemByBranches.get(branchId).delete(publisherOutputDir, true);
  } else {
    // Create the parent directory of the final output directory if it does not exist
    WriterUtils.mkdirsWithRecursivePermission(
        this.fileSystemByBranches.get(branchId),
        publisherOutputDir.getParent(),
        this.permissions.get(branchId));
  }

  LOG.info(String.format("Moving %s to %s", writerOutputDir, publisherOutputDir));
  parallelRunner.renamePath(
      writerOutputDir,
      publisherOutputDir,
      this.publisherFinalDirOwnerGroupsByBranches.get(branchId));
  writerOutputPathsMoved.add(writerOutputDir);
}
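// A minimal sketch of the composition described in the comment above (illustrative helper only;
// the actual getPublisherOutputDir/WriterUtils logic also accounts for branches and defaults):
static Path publisherOutputDirSketch(WorkUnitState state) {
  String finalDir = state.getProp(ConfigurationKeys.DATA_PUBLISHER_FINAL_DIR);
  String filePath = state.getProp(ConfigurationKeys.WRITER_FILE_PATH);
  return new Path(finalDir, filePath);
}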
@Test
public void testCommit() throws IOException {
  String destinationExistingToken = "destination";
  String destinationAdditionalTokens = "path";
  String fileName = "file";

  // Assemble destination paths
  Path destination =
      new Path(
          new Path(new Path("/", destinationExistingToken), destinationAdditionalTokens),
          fileName);
  Path destinationWithoutLeadingSeparator =
      new Path(new Path(destinationExistingToken, destinationAdditionalTokens), fileName);

  // Create temp directory
  File tmpFile = Files.createTempDir();
  tmpFile.deleteOnExit();
  Path tmpPath = new Path(tmpFile.getAbsolutePath());

  // Create origin file
  Path originFile = new Path(tmpPath, fileName);
  this.fs.createNewFile(originFile);

  // Create staging dir
  Path stagingDir = new Path(tmpPath, "staging");
  this.fs.mkdirs(stagingDir);

  // Create output dir
  Path outputDir = new Path(tmpPath, "output");
  this.fs.mkdirs(outputDir);

  // Create copyable file
  FileStatus status = this.fs.getFileStatus(originFile);
  FsPermission readWrite =
      new FsPermission(FsAction.READ_WRITE, FsAction.READ_WRITE, FsAction.READ_WRITE);
  FsPermission dirReadWrite =
      new FsPermission(FsAction.ALL, FsAction.READ_WRITE, FsAction.READ_WRITE);
  OwnerAndPermission ownerAndPermission =
      new OwnerAndPermission(status.getOwner(), status.getGroup(), readWrite);
  List<OwnerAndPermission> ancestorOwnerAndPermissions = Lists.newArrayList();
  ancestorOwnerAndPermissions.add(ownerAndPermission);
  ancestorOwnerAndPermissions.add(ownerAndPermission);
  ancestorOwnerAndPermissions.add(ownerAndPermission);
  ancestorOwnerAndPermissions.add(ownerAndPermission);

  Properties properties = new Properties();
  properties.setProperty(ConfigurationKeys.DATA_PUBLISHER_FINAL_DIR, "/publisher");

  CopyableFile cf =
      CopyableFile.fromOriginAndDestination(
              this.fs,
              status,
              destination,
              CopyConfiguration.builder(FileSystem.getLocal(new Configuration()), properties)
                  .publishDir(new Path("/target"))
                  .preserve(PreserveAttributes.fromMnemonicString(""))
                  .build())
          .destinationOwnerAndPermission(ownerAndPermission)
          .ancestorsOwnerAndPermission(ancestorOwnerAndPermissions)
          .build();

  // Create work unit state
  WorkUnitState state = new WorkUnitState();
  state.setProp(ConfigurationKeys.WRITER_STAGING_DIR, stagingDir.toUri().getPath());
  state.setProp(ConfigurationKeys.WRITER_OUTPUT_DIR, outputDir.toUri().getPath());
  state.setProp(ConfigurationKeys.WRITER_FILE_PATH, RandomStringUtils.randomAlphabetic(5));
  CopyableDatasetMetadata metadata =
      new CopyableDatasetMetadata(new TestCopyableDataset(new Path("/source")));
  CopySource.serializeCopyEntity(state, cf);
  CopySource.serializeCopyableDataset(state, metadata);

  // Create writer
  FileAwareInputStreamDataWriter writer = new FileAwareInputStreamDataWriter(state, 1, 0);

  // Create output of writer.write
  Path writtenFile = writer.getStagingFilePath(cf);
  this.fs.mkdirs(writtenFile.getParent());
  this.fs.createNewFile(writtenFile);

  // Create existing directories in writer output
  Path outputRoot =
      FileAwareInputStreamDataWriter.getPartitionOutputRoot(
          outputDir, cf.getDatasetAndPartition(metadata));
  Path existingOutputPath = new Path(outputRoot, destinationExistingToken);
  this.fs.mkdirs(existingOutputPath);
  FileStatus fileStatus = this.fs.getFileStatus(existingOutputPath);
  FsPermission existingPathPermission = fileStatus.getPermission();

  // Check initial state of the relevant directories
  Assert.assertTrue(this.fs.exists(existingOutputPath));
  Assert.assertEquals(this.fs.listStatus(existingOutputPath).length, 0);

  writer.actualProcessedCopyableFile = Optional.of(cf);

  // Commit
  writer.commit();

  // Check state of relevant paths after commit
  Path expectedOutputPath = new Path(outputRoot, destinationWithoutLeadingSeparator);
  Assert.assertTrue(this.fs.exists(expectedOutputPath));
  fileStatus = this.fs.getFileStatus(expectedOutputPath);
  Assert.assertEquals(fileStatus.getOwner(), ownerAndPermission.getOwner());
  Assert.assertEquals(fileStatus.getGroup(), ownerAndPermission.getGroup());
  Assert.assertEquals(fileStatus.getPermission(), readWrite);

  // The parent should have its permissions set correctly
  fileStatus = this.fs.getFileStatus(expectedOutputPath.getParent());
  Assert.assertEquals(fileStatus.getPermission(), dirReadWrite);

  // Previously existing paths should not have their permissions changed
  fileStatus = this.fs.getFileStatus(existingOutputPath);
  Assert.assertEquals(fileStatus.getPermission(), existingPathPermission);

  Assert.assertFalse(this.fs.exists(writer.stagingDir));
}
@Override
public void setActualHighWatermark(WorkUnitState wus) {
  wus.setActualHighWatermark(wus.getWorkunit().getExpectedHighWatermark(LongWatermark.class));
}