@Override
public void failedBulkLoad(final byte[] family, final String srcPath) throws IOException {
  if (!FSHDFSUtils.isSameHdfs(conf, srcFs, fs)) {
    // files are copied so no need to move them back
    return;
  }
  Path p = new Path(srcPath);
  Path stageP = new Path(stagingDir, new Path(Bytes.toString(family), p.getName()));
  // In case of Replication for bulk load files, hfiles are not renamed by end point during
  // prepare stage, so no need of rename here again
  if (p.equals(stageP)) {
    LOG.debug(p.getName() + " is already available in source directory. Skipping rename.");
    return;
  }
  LOG.debug("Moving " + stageP + " back to " + p);
  if (!fs.rename(stageP, p)) {
    throw new IOException("Failed to move HFile: " + stageP + " to " + p);
  }
  // restore original permission
  if (origPermissions.containsKey(srcPath)) {
    fs.setPermission(p, origPermissions.get(srcPath));
  } else {
    LOG.warn("Can't find previous permission for path=" + srcPath);
  }
}
/**
 * This is the typical flow for using the DistributedCache classes.
 *
 * @throws IOException
 * @throws LoginException
 */
public void testManagerFlow() throws IOException, LoginException {
  if (!canRun()) {
    return;
  }

  // ****** Imitate JobClient code
  // Configures a task/job with both a regular file and a "classpath" file.
  Configuration subConf = new Configuration(conf);
  String userName = getJobOwnerName();
  subConf.set("user.name", userName);
  JobID jobid = new JobID("jt", 1);
  DistributedCache.addCacheFile(firstCacheFile.toUri(), subConf);
  DistributedCache.addFileToClassPath(secondCacheFile, subConf, FileSystem.get(subConf));
  TrackerDistributedCacheManager.determineTimestamps(subConf);
  TrackerDistributedCacheManager.determineCacheVisibilities(subConf);
  // ****** End of imitating JobClient code

  Path jobFile = new Path(TEST_ROOT_DIR, "job.xml");
  FileOutputStream os = new FileOutputStream(new File(jobFile.toString()));
  subConf.writeXml(os);
  os.close();

  // ****** Imitate TaskRunner code.
  TrackerDistributedCacheManager manager =
      new TrackerDistributedCacheManager(conf, taskController);
  TaskDistributedCacheManager handle = manager.newTaskDistributedCacheManager(jobid, subConf);
  assertNull(null, DistributedCache.getLocalCacheFiles(subConf));
  File workDir = new File(new Path(TEST_ROOT_DIR, "workdir").toString());
  handle.setupCache(subConf, TaskTracker.getPublicDistributedCacheDir(),
      TaskTracker.getPrivateDistributedCacheDir(userName));
  JobLocalizer.downloadPrivateCache(subConf);
  // DOESN'T ACTUALLY HAPPEN IN THE TaskRunner (THIS IS A TODO)
  // handle.setupPrivateCache(localDirAllocator, TaskTracker
  //     .getPrivateDistributedCacheDir(userName));
  // ****** End of imitating TaskRunner code

  Path[] localCacheFiles = DistributedCache.getLocalCacheFiles(subConf);
  assertNotNull(null, localCacheFiles);
  assertEquals(2, localCacheFiles.length);
  Path cachedFirstFile = localCacheFiles[0];
  Path cachedSecondFile = localCacheFiles[1];
  assertFileLengthEquals(firstCacheFile, cachedFirstFile);
  assertFalse("Paths should be different.", firstCacheFile.equals(cachedFirstFile));

  assertEquals(1, handle.getClassPaths().size());
  assertEquals(cachedSecondFile.toString(), handle.getClassPaths().get(0));

  checkFilePermissions(localCacheFiles);

  // Cleanup
  handle.release();
  manager.purgeCache();
  assertFalse(pathToFile(cachedFirstFile).exists());
}
@Test(groups = {"fast-unit"}) public void path_where_localFileIsPut_should_differForDifferentFiles() { File file1 = createFile(); File file2 = createFile(); assertTrue(!file1.getAbsolutePath().equals(file2.getAbsolutePath())); Path path1 = putter.getPathForFile(file1); Path path2 = putter.getPathForFile(file2); assertTrue(!path1.equals(path2)); }
@Test(groups = {"fast-unit"}) public void pathWhereAClassesFilesAreStored_should_differForDifferentClasses() { ClassA classA = new ClassA(); ClassB classB = new ClassB(); boolean isDifferentClassses = !classA.getClass().getName().equals(classB.getClass().getName()); assertTrue(isDifferentClassses); Path classAStoragePath = classA.getPathWhereFilesAreStored(); Path classBStoragePath = classB.getPathWhereFilesAreStored(); assertTrue(!classAStoragePath.equals(classBStoragePath)); }
public boolean equals(Object object) {
  if (object instanceof TableDesc) {
    TableDesc other = (TableDesc) object;

    boolean eq = tableName.equals(other.tableName);
    eq = eq && schema.equals(other.schema);
    eq = eq && meta.equals(other.meta);
    eq = eq && uri.equals(other.uri);
    eq = eq && TUtil.checkEquals(partitionMethodDesc, other.partitionMethodDesc);
    return eq && TUtil.checkEquals(stats, other.stats);
  }

  return false;
}
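// Companion sketch, not part of the original source: an equals() override should be paired with
// a consistent hashCode(). Assuming the same fields participate in equality as in the method
// above (an assumption; the real TableDesc may hash differently), a minimal version could be:
@Override
public int hashCode() {
  // Hash the same state that equals() compares, so equal objects hash equally.
  return java.util.Objects.hash(tableName, schema, meta, uri, partitionMethodDesc, stats);
}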
/**
 * Only read the columns that were requested in the constructor.<br>
 *
 * @param struct ColumnarStruct
 * @param path Path
 * @return Tuple
 * @throws IOException
 */
private Tuple readColumnarTuple(ColumnarStruct struct, Path path) throws IOException {
  int[] columnIndexes = getRequiredColumns();
  // the partition keys, if any, will already be in the UDFContext here.
  String[] partitionKeys = getPartitionKeys(null, null);

  // only re-compute the partition key values if the path has changed
  if (currentPath == null || !currentPath.equals(path)) {
    currentPathPartitionKeyMap = (partitionKeys == null)
        ? null
        : pathPartitionerHelper.getPathPartitionKeyValues(path.toString());
    currentPath = path;
  }

  // if partitionColumns is null, this value will stop the for loop
  // below from trying to add any partition columns that do not exist
  int partitionColumnStartIndex = Integer.MAX_VALUE;

  if (!(partitionColumns == null || partitionColumns.size() == 0)) {
    // partition columns are always appended to the schema fields.
    partitionColumnStartIndex = pigSchema.getFields().length;
  }

  // create a tuple with the previously determined size
  Tuple t = tupleFactory.newTuple(columnIndexes.length);

  // read in all columns
  for (int i = 0; i < columnIndexes.length; i++) {
    int columnIndex = columnIndexes[i];

    if (columnIndex < partitionColumnStartIndex) {
      Object obj = struct.getField(columnIndex);
      Object pigType = HiveRCSchemaUtil.extractPigTypeFromHiveType(obj);
      t.set(i, pigType);
    } else {
      // read the partition columns
      // will only be executed if partitionColumns is not null
      String key = partitionKeys[columnIndex - partitionColumnStartIndex];
      Object value = currentPathPartitionKeyMap.get(key);
      t.set(i, value);
    }
  }

  return t;
}
public void lsr(Path p, List<String> results) throws IOException {
  if (!this.fs.getFileStatus(p).isDirectory()) {
    results.add(p.toString());
  }
  Path qualifiedPath = this.fs.makeQualified(p);
  for (FileStatus status : this.fs.listStatus(p)) {
    if (status.isDirectory()) {
      // Fix for hadoop issue: https://issues.apache.org/jira/browse/HADOOP-12169
      if (!qualifiedPath.equals(status.getPath())) {
        lsr(status.getPath(), results);
      }
    } else {
      results.add(status.getPath().toString());
    }
  }
}
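// Hypothetical usage sketch, not from the original source (the directory name is illustrative):
// collect every file path under a directory into a flat list via the recursive lsr() above.
List<String> results = new ArrayList<String>();
lsr(new Path("/user/output"), results);
for (String file : results) {
  System.out.println(file);
}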
/**
 * Try to open the file from one of the available locations.
 *
 * @return FSDataInputStream stream of the opened file link
 * @throws IOException on unexpected error, or file not found.
 */
private FSDataInputStream tryOpen() throws IOException {
  for (Path path : fileLink.getLocations()) {
    if (path.equals(currentPath)) {
      continue;
    }
    try {
      in = fs.open(path, bufferSize);
      in.seek(pos);
      assert (in.getPos() == pos) : "Link unable to seek to the right position=" + pos;
      if (LOG.isTraceEnabled()) {
        if (currentPath == null) {
          LOG.trace("link open path=" + path);
        } else {
          LOG.trace("link switch from path=" + currentPath + " to path=" + path);
        }
      }
      currentPath = path;
      return (in);
    } catch (FileNotFoundException e) {
      // Try another file location
    }
  }
  throw new FileNotFoundException("Unable to open link: " + fileLink);
}
@Override
public String prepareBulkLoad(final byte[] family, final String srcPath) throws IOException {
  Path p = new Path(srcPath);
  Path stageP = new Path(stagingDir, new Path(Bytes.toString(family), p.getName()));

  // In case of Replication for bulk load files, hfiles are already copied in staging directory
  if (p.equals(stageP)) {
    LOG.debug(p.getName()
        + " is already available in staging directory. Skipping copy or rename.");
    return stageP.toString();
  }

  if (srcFs == null) {
    srcFs = FileSystem.get(p.toUri(), conf);
  }

  if (!isFile(p)) {
    throw new IOException("Path does not reference a file: " + p);
  }

  // Check to see if the source and target filesystems are the same
  if (!FSHDFSUtils.isSameHdfs(conf, srcFs, fs)) {
    LOG.debug("Bulk-load file " + srcPath + " is on different filesystem than "
        + "the destination filesystem. Copying file over to destination staging dir.");
    FileUtil.copy(srcFs, p, fs, stageP, false, conf);
  } else {
    LOG.debug("Moving " + p + " to " + stageP);
    FileStatus origFileStatus = fs.getFileStatus(p);
    origPermissions.put(srcPath, origFileStatus.getPermission());
    if (!fs.rename(p, stageP)) {
      throw new IOException("Failed to move HFile: " + p + " to " + stageP);
    }
  }
  fs.setPermission(stageP, PERM_ALL_ACCESS);
  return stageP.toString();
}
private synchronized void moveToDone() throws IOException {
  if (LOG.isDebugEnabled()) {
    LOG.debug("moveToDone: " + historyFile);
  }
  if (!isMovePending()) {
    // It was either deleted or is already in done. Either way do nothing
    if (LOG.isDebugEnabled()) {
      LOG.debug("Move no longer pending");
    }
    return;
  }
  try {
    long completeTime = jobIndexInfo.getFinishTime();
    if (completeTime == 0) {
      completeTime = System.currentTimeMillis();
    }
    JobId jobId = jobIndexInfo.getJobId();

    List<Path> paths = new ArrayList<Path>(2);
    if (historyFile == null) {
      LOG.info("No file for job-history with " + jobId + " found in cache!");
    } else {
      paths.add(historyFile);
    }

    if (confFile == null) {
      LOG.info("No file for jobConf with " + jobId + " found in cache!");
    } else {
      paths.add(confFile);
    }

    if (summaryFile == null) {
      LOG.info("No summary file for job: " + jobId);
    } else {
      String jobSummaryString = getJobSummary(intermediateDoneDirFc, summaryFile);
      SUMMARY_LOG.info(jobSummaryString);
      LOG.info("Deleting JobSummary file: [" + summaryFile + "]");
      intermediateDoneDirFc.delete(summaryFile, false);
      summaryFile = null;
    }

    Path targetDir = canonicalHistoryLogPath(jobId, completeTime);
    addDirectoryToSerialNumberIndex(targetDir);
    makeDoneSubdir(targetDir);
    if (historyFile != null) {
      Path toPath = doneDirFc.makeQualified(new Path(targetDir, historyFile.getName()));
      if (!toPath.equals(historyFile)) {
        moveToDoneNow(historyFile, toPath);
        historyFile = toPath;
      }
    }
    if (confFile != null) {
      Path toPath = doneDirFc.makeQualified(new Path(targetDir, confFile.getName()));
      if (!toPath.equals(confFile)) {
        moveToDoneNow(confFile, toPath);
        confFile = toPath;
      }
    }
    state = HistoryInfoState.IN_DONE;
  } catch (Throwable t) {
    LOG.error("Error while trying to move a job to done", t);
    this.state = HistoryInfoState.MOVE_FAILED;
  }
}
public void testFreshness() throws Exception {
  if (!canRun()) {
    return;
  }
  Configuration myConf = new Configuration(conf);
  myConf.set("fs.default.name", "refresh:///");
  myConf.setClass("fs.refresh.impl", FakeFileSystem.class, FileSystem.class);
  String userName = getJobOwnerName();

  TrackerDistributedCacheManager manager =
      new TrackerDistributedCacheManager(myConf, taskController);
  // ****** Imitate JobClient code
  // Configures a task/job with both a regular file and a "classpath" file.
  Configuration subConf = new Configuration(myConf);
  subConf.set("user.name", userName);
  DistributedCache.addCacheFile(firstCacheFile.toUri(), subConf);
  TrackerDistributedCacheManager.determineTimestamps(subConf);
  TrackerDistributedCacheManager.determineCacheVisibilities(subConf);
  // ****** End of imitating JobClient code

  // ****** Imitate TaskRunner code.
  TaskDistributedCacheManager handle =
      manager.newTaskDistributedCacheManager(new JobID("jt", 1), subConf);
  assertNull(null, DistributedCache.getLocalCacheFiles(subConf));
  File workDir = new File(new Path(TEST_ROOT_DIR, "workdir").toString());
  handle.setupCache(subConf, TaskTracker.getPublicDistributedCacheDir(),
      TaskTracker.getPrivateDistributedCacheDir(userName));
  // TODO this doesn't really happen in the TaskRunner
  // handle.setupPrivateCache(localDirAllocator, TaskTracker
  //     .getPrivateDistributedCacheDir(userName));
  // ****** End of imitating TaskRunner code

  Path[] localCacheFiles = DistributedCache.getLocalCacheFiles(subConf);
  assertNotNull(null, localCacheFiles);
  assertEquals(1, localCacheFiles.length);
  Path cachedFirstFile = localCacheFiles[0];
  assertFileLengthEquals(firstCacheFile, cachedFirstFile);
  assertFalse("Paths should be different.", firstCacheFile.equals(cachedFirstFile));
  // release
  handle.release();

  // change the file timestamp
  FileSystem fs = FileSystem.get(myConf);
  ((FakeFileSystem) fs).advanceClock(1);

  // running a task of the same job
  Throwable th = null;
  try {
    handle.setupCache(subConf, TaskTracker.getPublicDistributedCacheDir(),
        TaskTracker.getPrivateDistributedCacheDir(userName));
    // handle.setupPrivateCache(localDirAllocator, TaskTracker
    //     .getPrivateDistributedCacheDir(userName));
  } catch (IOException ie) {
    th = ie;
  }
  assertNotNull("Throwable is null", th);
  assertTrue("Exception message does not match",
      th.getMessage().contains("has changed on HDFS since job started"));
  // release
  handle.release();

  // submit another job
  Configuration subConf2 = new Configuration(myConf);
  subConf2.set("user.name", userName);
  DistributedCache.addCacheFile(firstCacheFile.toUri(), subConf2);
  TrackerDistributedCacheManager.determineTimestamps(subConf2);
  TrackerDistributedCacheManager.determineCacheVisibilities(subConf2);

  handle = manager.newTaskDistributedCacheManager(new JobID("jt", 2), subConf2);
  handle.setupCache(subConf2, TaskTracker.getPublicDistributedCacheDir(),
      TaskTracker.getPrivateDistributedCacheDir(userName));
  Path[] localCacheFiles2 = DistributedCache.getLocalCacheFiles(subConf2);
  assertNotNull(null, localCacheFiles2);
  assertEquals(1, localCacheFiles2.length);
  Path cachedFirstFile2 = localCacheFiles2[0];
  assertFileLengthEquals(firstCacheFile, cachedFirstFile2);
  assertFalse("Paths should be different.", firstCacheFile.equals(cachedFirstFile2));

  // assert that two localizations point to different paths
  assertFalse("two jobs with different timestamps did not localize"
      + " in different paths", cachedFirstFile.equals(cachedFirstFile2));
  // release
  handle.release();
}
/** test delete cache */
public void testDeleteCache() throws Exception {
  if (!canRun()) {
    return;
  }
  // This test needs mapred.local.dir to be a single directory
  // instead of four, because it assumes that both
  // firstcachefile and secondcachefile will be localized in the same directory
  // so that the second localization triggers deleteCache.
  // If mapred.local.dir is four directories, the second localization might not
  // trigger deleteCache if it is localized in a different directory.
  Configuration conf2 = new Configuration(conf);
  conf2.set("mapred.local.dir", ROOT_MAPRED_LOCAL_DIR.toString());
  conf2.setLong("local.cache.size", LOCAL_CACHE_LIMIT);
  refreshConf(conf2);
  TrackerDistributedCacheManager manager =
      new TrackerDistributedCacheManager(conf2, taskController);
  FileSystem localfs = FileSystem.getLocal(conf2);
  long now = System.currentTimeMillis();
  String userName = getJobOwnerName();
  conf2.set("user.name", userName);

  // We first test the size limit
  FileStatus stat = fs.getFileStatus(firstCacheFilePublic);
  CacheFile cfile1 = new CacheFile(firstCacheFilePublic.toUri(),
      CacheFile.FileType.REGULAR, true, stat.getModificationTime(), true);
  Path firstLocalCache = manager.getLocalCache(firstCacheFilePublic.toUri(), conf2,
      TaskTracker.getPrivateDistributedCacheDir(userName),
      fs.getFileStatus(firstCacheFilePublic), false,
      fs.getFileStatus(firstCacheFilePublic).getModificationTime(), true, cfile1);
  manager.releaseCache(cfile1.getStatus());
  // The code above localized a file of size 4K and then released the cache,
  // which allows the cache to be deleted once the size limit is exceeded.
  // The code below localizes another cache, which is designed to
  // sweep away the first cache.
  stat = fs.getFileStatus(secondCacheFilePublic);
  CacheFile cfile2 = new CacheFile(secondCacheFilePublic.toUri(),
      CacheFile.FileType.REGULAR, true, stat.getModificationTime(), true);
  assertTrue("DistributedCache currently doesn't have cached file",
      localfs.exists(firstLocalCache));
  Path secondLocalCache = manager.getLocalCache(secondCacheFilePublic.toUri(), conf2,
      TaskTracker.getPrivateDistributedCacheDir(userName),
      fs.getFileStatus(secondCacheFilePublic), false,
      fs.getFileStatus(secondCacheFilePublic).getModificationTime(), true, cfile2);
  assertFalse("DistributedCache failed deleting old"
      + " cache when the cache store is full.",
      localfs.exists(firstLocalCache));

  // find the root directory of distributed caches
  Path firstCursor = firstLocalCache;
  Path secondCursor = secondLocalCache;

  while (!firstCursor.equals(secondCursor)) {
    // Debug code, to see what these things look like
    System.err.println("cursors: " + firstCursor);
    System.err.println(" and " + secondCursor);

    firstCursor = firstCursor.getParent();
    secondCursor = secondCursor.getParent();
  }

  System.err.println("The final cursor is " + firstCursor);
  System.err.println("That directory ends up with "
      + localfs.listStatus(firstCursor).length + " subdirectories");
  Path cachesBase = firstCursor;

  assertFalse("DistributedCache did not delete the gensym'ed distcache "
      + "directory names when it deleted the files they contained "
      + "because they collectively exceeded the size limit.",
      localfs.listStatus(cachesBase).length > 1);

  conf2.setLong("local.cache.size", LOCAL_CACHE_LIMIT * 10);
  conf2.setLong("mapreduce.tasktracker.local.cache.numberdirectories",
      LOCAL_CACHE_SUBDIR_LIMIT);
  manager = new TrackerDistributedCacheManager(conf2, taskController);

  // Now we test the number of sub directories limit
  // Create the temporary cache files to be used in the tests.
  Path thirdCacheFile = new Path(TEST_ROOT_DIR, "thirdcachefile");
  Path fourthCacheFile = new Path(TEST_ROOT_DIR, "fourthcachefile");
  // Adding two more small files, so it triggers the number of sub directory
  // limit but does not trigger the file size limit.
  createPrivateTempFile(thirdCacheFile);
  createPrivateTempFile(fourthCacheFile);
  DistributedCache.setCacheFiles(new URI[] {thirdCacheFile.toUri()}, conf2);
  TrackerDistributedCacheManager.determineCacheVisibilities(conf2);
  TrackerDistributedCacheManager.determineTimestamps(conf2);
  stat = fs.getFileStatus(thirdCacheFile);
  CacheFile cfile3 = new CacheFile(thirdCacheFile.toUri(),
      CacheFile.FileType.REGULAR, false, stat.getModificationTime(), true);
  Path thirdLocalCache = manager.getLocalCache(thirdCacheFile.toUri(), conf2,
      TaskTracker.getPrivateDistributedCacheDir(userName),
      fs.getFileStatus(thirdCacheFile), false,
      fs.getFileStatus(thirdCacheFile).getModificationTime(), false, cfile3);
  DistributedCache.setLocalFiles(conf2, thirdLocalCache.toString());
  JobLocalizer.downloadPrivateCache(conf2);
  // Release the third cache so that it can be deleted while sweeping
  manager.releaseCache(cfile3.getStatus());
  // Getting the fourth cache will make the number of sub directories become
  // 3, which is greater than 2. So the released cache will be deleted.
  stat = fs.getFileStatus(fourthCacheFile);
  CacheFile cfile4 = new CacheFile(fourthCacheFile.toUri(),
      CacheFile.FileType.REGULAR, false, stat.getModificationTime(), true);
  assertTrue("DistributedCache currently doesn't have cached file",
      localfs.exists(thirdLocalCache));

  DistributedCache.setCacheFiles(new URI[] {fourthCacheFile.toUri()}, conf2);
  DistributedCache.setLocalFiles(conf2, thirdCacheFile.toUri().toString());
  TrackerDistributedCacheManager.determineCacheVisibilities(conf2);
  TrackerDistributedCacheManager.determineTimestamps(conf2);
  Path fourthLocalCache = manager.getLocalCache(fourthCacheFile.toUri(), conf2,
      TaskTracker.getPrivateDistributedCacheDir(userName),
      fs.getFileStatus(fourthCacheFile), false,
      fs.getFileStatus(fourthCacheFile).getModificationTime(), false, cfile4);
  assertFalse("DistributedCache failed deleting old"
      + " cache when the cache exceeds the number of sub directories limit.",
      localfs.exists(thirdLocalCache));

  assertFalse("DistributedCache did not delete the gensym'ed distcache "
      + "directory names when it deleted the files they contained "
      + "because there were too many.",
      localfs.listStatus(cachesBase).length > LOCAL_CACHE_SUBDIR_LIMIT);

  // Clean up the files created in this test
  new File(thirdCacheFile.toString()).delete();
  new File(fourthCacheFile.toString()).delete();
}
public static FileStatus containsPath(Path path, FileStatus[] dirList) throws IOException {
  for (int i = 0; i < dirList.length; i++) {
    if (path.equals(dirList[i].getPath())) {
      return dirList[i];
    }
  }
  return null;
}
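// Hypothetical usage sketch, not from the original source (the directory and file names are
// illustrative assumptions): look up a single entry in a directory listing via containsPath().
FileStatus[] listing = fs.listStatus(new Path("/user/data"));
FileStatus match = containsPath(new Path("/user/data/part-00000"), listing);
if (match != null) {
  System.out.println("Found " + match.getPath() + " with length " + match.getLen());
}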
/** raids test file */
private void raidTestFiles(Path raidPath, Path[] filePaths, boolean doHar)
    throws IOException, ClassNotFoundException {
  // create RaidNode
  raidConf = new Configuration(conf);
  raidConf.set(RaidNode.RAID_LOCATION_KEY, RAID_DIR);
  raidConf.setInt("raid.blockfix.interval", 1000);
  // the RaidNode does the raiding inline (instead of submitting to MR node)
  conf.set("raid.classname", "org.apache.hadoop.raid.LocalRaidNode");
  rnode = RaidNode.createRaidNode(null, raidConf);

  for (Path filePath : filePaths) {
    long waitStart = System.currentTimeMillis();
    boolean raided = false;

    Path parityFilePath = new Path(RAID_DIR, filePath.toString().substring(1));

    while (!raided) {
      try {
        FileStatus[] listPaths = dfs.listStatus(raidPath);
        if (listPaths != null) {
          if (doHar) {
            // case with HAR
            for (FileStatus f : listPaths) {
              if (f.getPath().toString().endsWith(".har")) {
                // check if the parity file is in the index
                final Path indexPath = new Path(f.getPath(), "_index");
                final FileStatus indexFileStatus = dfs.getFileStatus(indexPath);
                final HarIndex harIndex =
                    new HarIndex(dfs.open(indexPath), indexFileStatus.getLen());
                final HarIndex.IndexEntry indexEntry =
                    harIndex.findEntryByFileName(parityFilePath.toString());
                if (indexEntry != null) {
                  LOG.info("raid file " + parityFilePath.toString()
                      + " found in Har archive: " + f.getPath().toString()
                      + " ts=" + indexEntry.mtime);
                  raided = true;
                  break;
                }
              }
            }
          } else {
            // case without HAR
            for (FileStatus f : listPaths) {
              Path found = new Path(f.getPath().toUri().getPath());
              if (parityFilePath.equals(found)) {
                LOG.info("raid file found: " + f.getPath().toString());
                raided = true;
                break;
              }
            }
          }
        }
      } catch (FileNotFoundException ignore) {
      }
      if (!raided) {
        if (System.currentTimeMillis() > waitStart + 40000L) {
          LOG.error("parity file not created after 40s");
          throw new IOException("parity file not HARed after 40s");
        } else {
          try {
            Thread.sleep(1000);
          } catch (InterruptedException ignore) {
          }
        }
      }
    }
  }

  rnode.stop();
  rnode.join();
  rnode = null;
  LOG.info("test file raided");
}