private static void addFolder2( FileSystem fs, Path p, ArrayList<String> keys, ArrayList<String> failed) { try { if (fs == null) return; Futures futures = new Futures(); for (FileStatus file : fs.listStatus(p)) { Path pfs = file.getPath(); if (file.isDir()) { addFolder2(fs, pfs, keys, failed); } else { long size = file.getLen(); Key res; if (pfs.getName().endsWith(Extensions.JSON)) { throw H2O.unimpl(); } else if (pfs.getName().endsWith(Extensions.HEX)) { // Hex file? throw H2O.unimpl(); } else { Key k = null; keys.add((k = HdfsFileVec.make(file, futures)).toString()); Log.info("PersistHdfs: DKV.put(" + k + ")"); } } } } catch (Exception e) { Log.err(e); failed.add(p.toString()); } }
/** * Check whether the contents of src and dst are the same. * * <p>Return false if dstpath does not exist * * <p>If the files have different sizes, return false. * * <p>If the files have the same sizes, the file checksums will be compared. * * <p>When file checksum is not supported in any of file systems, two files are considered as the * same if they have the same size. */ private static boolean sameFile( FileSystem srcfs, FileStatus srcstatus, FileSystem dstfs, Path dstpath) throws IOException { FileStatus dststatus; try { dststatus = dstfs.getFileStatus(dstpath); } catch (FileNotFoundException fnfe) { return false; } // same length? if (srcstatus.getLen() != dststatus.getLen()) { return false; } // compare checksums try { final FileChecksum srccs = srcfs.getFileChecksum(srcstatus.getPath()); final FileChecksum dstcs = dstfs.getFileChecksum(dststatus.getPath()); // return true if checksum is not supported // (i.e. some of the checksums is null) return srccs == null || dstcs == null || srccs.equals(dstcs); } catch (FileNotFoundException fnfe) { return false; } }
/* * Fetch a file that is in a Hadoop file system. Return a local File. * Interruptible. */ private File hdfsFetch(Path fromPath, Reporter reporter) throws IOException, InterruptedException { UUID uniqueId = UUID.randomUUID(); File toFile = new File(tempDir, uniqueId.toString() + "/" + fromPath.getName()); File toDir = new File(toFile.getParent()); if (toDir.exists()) { FileUtils.deleteDirectory(toDir); } toDir.mkdirs(); Path toPath = new Path(toFile.getCanonicalPath()); FileSystem fS = fromPath.getFileSystem(hadoopConf); FileSystem tofS = FileSystem.getLocal(hadoopConf); Throttler throttler = new Throttler((double) bytesPerSecThrottle); try { for (FileStatus fStatus : fS.globStatus(fromPath)) { log.info("Copying " + fStatus.getPath() + " to " + toPath); long bytesSoFar = 0; FSDataInputStream iS = fS.open(fStatus.getPath()); FSDataOutputStream oS = tofS.create(toPath); byte[] buffer = new byte[downloadBufferSize]; int nRead; while ((nRead = iS.read(buffer, 0, buffer.length)) != -1) { // Needed to being able to be interrupted at any moment. if (Thread.interrupted()) { iS.close(); oS.close(); cleanDirNoExceptions(toDir); throw new InterruptedException(); } bytesSoFar += nRead; oS.write(buffer, 0, nRead); throttler.incrementAndThrottle(nRead); if (bytesSoFar >= bytesToReportProgress) { reporter.progress(bytesSoFar); bytesSoFar = 0l; } } if (reporter != null) { reporter.progress(bytesSoFar); } oS.close(); iS.close(); } return toDir; } catch (ClosedByInterruptException e) { // This can be thrown by the method read. cleanDirNoExceptions(toDir); throw new InterruptedIOException(); } }
private static void addFolder(FileSystem fs, Path p, JsonArray succeeded, JsonArray failed) { try { if (fs == null) return; for (FileStatus file : fs.listStatus(p)) { Path pfs = file.getPath(); if (file.isDir()) { addFolder(fs, pfs, succeeded, failed); } else { Key k = Key.make(pfs.toString()); long size = file.getLen(); Value val = null; if (pfs.getName().endsWith(Extensions.JSON)) { JsonParser parser = new JsonParser(); JsonObject json = parser.parse(new InputStreamReader(fs.open(pfs))).getAsJsonObject(); JsonElement v = json.get(Constants.VERSION); if (v == null) throw new RuntimeException("Missing version"); JsonElement type = json.get(Constants.TYPE); if (type == null) throw new RuntimeException("Missing type"); Class c = Class.forName(type.getAsString()); OldModel model = (OldModel) c.newInstance(); model.fromJson(json); } else if (pfs.getName().endsWith(Extensions.HEX)) { // Hex file? FSDataInputStream s = fs.open(pfs); int sz = (int) Math.min(1L << 20, size); // Read up to the 1st meg byte[] mem = MemoryManager.malloc1(sz); s.readFully(mem); // Convert to a ValueArray (hope it fits in 1Meg!) ValueArray ary = new ValueArray(k, 0).read(new AutoBuffer(mem)); val = new Value(k, ary, Value.HDFS); } else if (size >= 2 * ValueArray.CHUNK_SZ) { val = new Value( k, new ValueArray(k, size), Value.HDFS); // ValueArray byte wrapper over a large file } else { val = new Value(k, (int) size, Value.HDFS); // Plain Value val.setdsk(); } DKV.put(k, val); Log.info("PersistHdfs: DKV.put(" + k + ")"); JsonObject o = new JsonObject(); o.addProperty(Constants.KEY, k.toString()); o.addProperty(Constants.FILE, pfs.toString()); o.addProperty(Constants.VALUE_SIZE, file.getLen()); succeeded.add(o); } } } catch (Exception e) { Log.err(e); JsonObject o = new JsonObject(); o.addProperty(Constants.FILE, p.toString()); o.addProperty(Constants.ERROR, e.getMessage()); failed.add(o); } }
/** * Generate the list of files and make them into FileSplits. This needs to be copied to insert a * filter on acceptable data */ @Override public List<InputSplit> getSplits(JobContext job) throws IOException { long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job)); long maxSize = getMaxSplitSize(job); long desiredMappers = job.getConfiguration().getLong("org.systemsbiology.jxtandem.DesiredXMLInputMappers", 0); // generate splits List<InputSplit> splits = new ArrayList<InputSplit>(); List<FileStatus> fileStatuses = listStatus(job); boolean forceNumberMappers = fileStatuses.size() == 1; for (FileStatus file : fileStatuses) { Path path = file.getPath(); if (!isPathAcceptable(path)) // filter acceptable data continue; FileSystem fs = path.getFileSystem(job.getConfiguration()); long length = file.getLen(); BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length); if ((length != 0) && isSplitable(job, path)) { long blockSize = file.getBlockSize(); // use desired mappers to force more splits if (forceNumberMappers && desiredMappers > 0) maxSize = Math.min(maxSize, (length / desiredMappers)); long splitSize = computeSplitSize(blockSize, minSize, maxSize); long bytesRemaining = length; while (withinSlop(splitSize, bytesRemaining)) { int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining); splits.add( new FileSplit( path, length - bytesRemaining, splitSize, blkLocations[blkIndex].getHosts())); bytesRemaining -= splitSize; } if (bytesRemaining != 0) { splits.add( new FileSplit( path, length - bytesRemaining, bytesRemaining, blkLocations[blkLocations.length - 1].getHosts())); } } else if (length != 0) { splits.add(new FileSplit(path, 0, length, blkLocations[0].getHosts())); } else { // Create empty hosts array for zero length files splits.add(new FileSplit(path, 0, length, new String[0])); } } System.out.println("Total # of splits: " + splits.size()); // LOG.debug("Total # of splits: " + splits.size()); return splits; }
/** * Produce splits such that each is no greater than the quotient of the total size and the * number of splits requested. * * @param job The handle to the JobConf object * @param numSplits Number of splits requested */ public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException { int cnfiles = job.getInt(SRC_COUNT_LABEL, -1); long cbsize = job.getLong(TOTAL_SIZE_LABEL, -1); String srcfilelist = job.get(SRC_LIST_LABEL, ""); if (cnfiles < 0 || cbsize < 0 || "".equals(srcfilelist)) { throw new RuntimeException( "Invalid metadata: #files(" + cnfiles + ") total_size(" + cbsize + ") listuri(" + srcfilelist + ")"); } Path src = new Path(srcfilelist); FileSystem fs = src.getFileSystem(job); FileStatus srcst = fs.getFileStatus(src); ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits); LongWritable key = new LongWritable(); FilePair value = new FilePair(); final long targetsize = cbsize / numSplits; long pos = 0L; long last = 0L; long acc = 0L; long cbrem = srcst.getLen(); SequenceFile.Reader sl = null; try { sl = new SequenceFile.Reader(fs, src, job); for (; sl.next(key, value); last = sl.getPosition()) { // if adding this split would put this split past the target size, // cut the last split and put this next file in the next split. if (acc + key.get() > targetsize && acc != 0) { long splitsize = last - pos; splits.add(new FileSplit(src, pos, splitsize, (String[]) null)); cbrem -= splitsize; pos = last; acc = 0L; } acc += key.get(); } } finally { checkAndClose(sl); } if (cbrem != 0) { splits.add(new FileSplit(src, pos, cbrem, (String[]) null)); } return splits.toArray(new FileSplit[splits.size()]); }
private static Path checkDest(String srcName, FileSystem dstFS, Path dst, boolean overwrite) throws IOException { if (dstFS.exists(dst)) { FileStatus sdst = dstFS.getFileStatus(dst); if (sdst.isDir()) { if (null == srcName) { throw new IOException("Target " + dst + " is a directory"); } return checkDest(null, dstFS, new Path(dst, srcName), overwrite); } else if (!overwrite) { throw new IOException("Target " + dst + " already exists"); } } return dst; }
// Checks if the cache has already been localized and is fresh private static boolean ifExistsAndFresh( Configuration conf, FileSystem fs, URI cache, long confFileStamp, CacheStatus lcacheStatus, FileStatus fileStatus) throws IOException { // check for existence of the cache long dfsFileStamp; if (fileStatus != null) { dfsFileStamp = fileStatus.getModificationTime(); } else { dfsFileStamp = getTimestamp(conf, cache); } // ensure that the file on hdfs hasn't been modified since the job started if (dfsFileStamp != confFileStamp) { LOG.fatal("File: " + cache + " has changed on HDFS since job started"); throw new IOException("File: " + cache + " has changed on HDFS since job started"); } if (dfsFileStamp != lcacheStatus.mtime) { // needs refreshing return false; } return true; }
/** Added for back compatibility. */ public static Path getLocalCache( URI cache, Configuration conf, Path subdir, FileStatus fileStatus, boolean isArchive, long confFileStamp, Path currentWorkDir, boolean honorSymLinkConf, MRAsyncDiskService asyncDiskService, LocalDirAllocator lDirAllocator) throws IOException { return getLocalCache( cache, conf, subdir, fileStatus, isArchive, confFileStamp, fileStatus.getLen(), currentWorkDir, honorSymLinkConf, asyncDiskService, lDirAllocator); }
/** Implement the delete(Path, boolean) in checksum file system. */ public boolean delete(Path f, boolean recursive) throws IOException { FileStatus fstatus = null; try { fstatus = fs.getFileStatus(f); } catch (FileNotFoundException e) { return false; } if (fstatus.isDir()) { // this works since the crcs are in the same // directories and the files. so we just delete // everything in the underlying filesystem return fs.delete(f, recursive); } else { Path checkFile = getChecksumFile(f); if (fs.exists(checkFile)) { fs.delete(checkFile, true); } return fs.delete(f, true); } }
private FSDataOutputStream create(Path f, Reporter reporter, FileStatus srcstat) throws IOException { if (destFileSys.exists(f)) { destFileSys.delete(f, false); } if (!preserve_status) { return destFileSys.create(f, true, sizeBuf, reporter); } FsPermission permission = preseved.contains(FileAttribute.PERMISSION) ? srcstat.getPermission() : null; short replication = preseved.contains(FileAttribute.REPLICATION) ? srcstat.getReplication() : destFileSys.getDefaultReplication(); long blockSize = preseved.contains(FileAttribute.BLOCK_SIZE) ? srcstat.getBlockSize() : destFileSys.getDefaultBlockSize(); return destFileSys.create(f, permission, true, sizeBuf, replication, blockSize, reporter); }
/** * list subfiles * * @param hdfsPath !null path probably exists * @return !null list of enclused file names - emptu if file of not exists */ @Override public String[] ls(final String hdfsPath) { if (isRunningAsUser()) { return super.ls(hdfsPath); } final FileSystem fs = getDFS(); try { FileStatus[] statuses = fs.listStatus(new Path(hdfsPath)); if (statuses == null) return new String[0]; List<String> holder = new ArrayList<String>(); for (int i = 0; i < statuses.length; i++) { FileStatus statuse = statuses[i]; String s = statuse.getPath().getName(); holder.add(s); } String[] ret = new String[holder.size()]; holder.toArray(ret); return ret; } catch (IOException e) { throw new RuntimeException(e); } }
public static boolean copy( FileSystem srcFS, Path[] srcs, FileSystem dstFS, Path dst, boolean deleteSource, boolean overwrite, Configuration conf) throws IOException { boolean gotException = false; boolean returnVal = true; StringBuffer exceptions = new StringBuffer(); if (srcs.length == 1) return copy(srcFS, srcs[0], dstFS, dst, deleteSource, overwrite, conf); // Check if dest is directory if (!dstFS.exists(dst)) { throw new IOException("`" + dst + "': specified destination directory " + "doest not exist"); } else { FileStatus sdst = dstFS.getFileStatus(dst); if (!sdst.isDir()) throw new IOException( "copying multiple files, but last argument `" + dst + "' is not a directory"); } for (Path src : srcs) { try { if (!copy(srcFS, src, dstFS, dst, deleteSource, overwrite, conf)) returnVal = false; } catch (IOException e) { gotException = true; exceptions.append(e.getMessage()); exceptions.append("\n"); } } if (gotException) { throw new IOException(exceptions.toString()); } return returnVal; }
/** * The src file is under FS, and the dst is on the local disk. Copy it from FS control to the * local dst name. If src and dst are directories, the copyCrc parameter determines whether to * copy CRC files. */ public void copyToLocalFile(Path src, Path dst, boolean copyCrc) throws IOException { if (!fs.isDirectory(src)) { // source is a file fs.copyToLocalFile(src, dst); FileSystem localFs = getLocal(getConf()).getRawFileSystem(); if (localFs.isDirectory(dst)) { dst = new Path(dst, src.getName()); } dst = getChecksumFile(dst); if (localFs.exists(dst)) { // remove old local checksum file localFs.delete(dst, true); } Path checksumFile = getChecksumFile(src); if (copyCrc && fs.exists(checksumFile)) { // copy checksum file fs.copyToLocalFile(checksumFile, dst); } } else { FileStatus[] srcs = listStatus(src); for (FileStatus srcFile : srcs) { copyToLocalFile(srcFile.getPath(), new Path(dst, srcFile.getPath().getName()), copyCrc); } } }
public static void recursePath(Configuration conf, Path path, Job job) { try { FileSystem fs = path.getFileSystem(conf); FileStatus[] fstats = fs.listStatus(path); if (fstats != null) { for (FileStatus f : fstats) { Path p = f.getPath(); ; if (fs.isFile(p)) { // connection times out otherwise System.err.println("file:" + p.toString()); FileInputFormat.addInputPath(job, p); } else { System.err.println("dir:" + p.toString()); recursePath(conf, p, job); } } } } catch (IOException e) { // shouldn't be here throw new RuntimeException(e); } }
/** * Get the locally cached file or archive; it could either be previously cached (and valid) or * copy it from the {@link FileSystem} now. * * @param cache the cache to be localized, this should be specified as new * URI(hdfs://hostname:port/absolute_path_to_file#LINKNAME). If no schema or hostname:port is * provided the file is assumed to be in the filesystem being used in the Configuration * @param conf The Confguration file which contains the filesystem * @param baseDir The base cache Dir where you wnat to localize the files/archives * @param fileStatus The file status on the dfs. * @param isArchive if the cache is an archive or a file. In case it is an archive with a .zip or * .jar or .tar or .tgz or .tar.gz extension it will be unzipped/unjarred/untarred * automatically and the directory where the archive is unzipped/unjarred/untarred is returned * as the Path. In case of a file, the path to the file is returned * @param confFileStamp this is the hdfs file modification timestamp to verify that the file to be * cached hasn't changed since the job started * @param currentWorkDir this is the directory where you would want to create symlinks for the * locally cached files/archives * @return the path to directory where the archives are unjarred in case of archives, the path to * the file where the file is copied locally * @throws IOException */ public static Path getLocalCache( URI cache, Configuration conf, Path baseDir, FileStatus fileStatus, boolean isArchive, long confFileStamp, Path currentWorkDir, MRAsyncDiskService asyncDiskService) throws IOException { return getLocalCache( cache, conf, baseDir, fileStatus, isArchive, confFileStamp, fileStatus.getLen(), currentWorkDir, true, asyncDiskService, new LocalDirAllocator("mapred.local.dir")); }
/** * Initialize DFSCopyFileMapper specific job-configuration. * * @param conf : The dfs/mapred configuration. * @param jobConf : The handle to the jobConf object to be initialized. * @param args Arguments */ private static void setup(Configuration conf, JobConf jobConf, final Arguments args) throws IOException { jobConf.set(DST_DIR_LABEL, args.dst.toUri().toString()); // set boolean values final boolean update = args.flags.contains(Options.UPDATE); final boolean overwrite = !update && args.flags.contains(Options.OVERWRITE); jobConf.setBoolean(Options.UPDATE.propertyname, update); jobConf.setBoolean(Options.OVERWRITE.propertyname, overwrite); jobConf.setBoolean( Options.IGNORE_READ_FAILURES.propertyname, args.flags.contains(Options.IGNORE_READ_FAILURES)); jobConf.setBoolean( Options.PRESERVE_STATUS.propertyname, args.flags.contains(Options.PRESERVE_STATUS)); final String randomId = getRandomId(); JobClient jClient = new JobClient(jobConf); Path jobDirectory = new Path(jClient.getSystemDir(), NAME + "_" + randomId); jobConf.set(JOB_DIR_LABEL, jobDirectory.toString()); FileSystem dstfs = args.dst.getFileSystem(conf); boolean dstExists = dstfs.exists(args.dst); boolean dstIsDir = false; if (dstExists) { dstIsDir = dstfs.getFileStatus(args.dst).isDir(); } // default logPath Path logPath = args.log; if (logPath == null) { String filename = "_distcp_logs_" + randomId; if (!dstExists || !dstIsDir) { Path parent = args.dst.getParent(); if (!dstfs.exists(parent)) { dstfs.mkdirs(parent); } logPath = new Path(parent, filename); } else { logPath = new Path(args.dst, filename); } } FileOutputFormat.setOutputPath(jobConf, logPath); // create src list, dst list FileSystem jobfs = jobDirectory.getFileSystem(jobConf); Path srcfilelist = new Path(jobDirectory, "_distcp_src_files"); jobConf.set(SRC_LIST_LABEL, srcfilelist.toString()); SequenceFile.Writer src_writer = SequenceFile.createWriter( jobfs, jobConf, srcfilelist, LongWritable.class, FilePair.class, SequenceFile.CompressionType.NONE); Path dstfilelist = new Path(jobDirectory, "_distcp_dst_files"); SequenceFile.Writer dst_writer = SequenceFile.createWriter( jobfs, jobConf, dstfilelist, Text.class, Text.class, SequenceFile.CompressionType.NONE); Path dstdirlist = new Path(jobDirectory, "_distcp_dst_dirs"); jobConf.set(DST_DIR_LIST_LABEL, dstdirlist.toString()); SequenceFile.Writer dir_writer = SequenceFile.createWriter( jobfs, jobConf, dstdirlist, Text.class, FilePair.class, SequenceFile.CompressionType.NONE); // handle the case where the destination directory doesn't exist // and we've only a single src directory OR we're updating/overwriting // the contents of the destination directory. final boolean special = (args.srcs.size() == 1 && !dstExists) || update || overwrite; int srcCount = 0, cnsyncf = 0, dirsyn = 0; long fileCount = 0L, byteCount = 0L, cbsyncs = 0L; try { for (Iterator<Path> srcItr = args.srcs.iterator(); srcItr.hasNext(); ) { final Path src = srcItr.next(); FileSystem srcfs = src.getFileSystem(conf); FileStatus srcfilestat = srcfs.getFileStatus(src); Path root = special && srcfilestat.isDir() ? src : src.getParent(); if (srcfilestat.isDir()) { ++srcCount; } Stack<FileStatus> pathstack = new Stack<FileStatus>(); for (pathstack.push(srcfilestat); !pathstack.empty(); ) { FileStatus cur = pathstack.pop(); FileStatus[] children = srcfs.listStatus(cur.getPath()); for (int i = 0; i < children.length; i++) { boolean skipfile = false; final FileStatus child = children[i]; final String dst = makeRelative(root, child.getPath()); ++srcCount; if (child.isDir()) { pathstack.push(child); } else { // skip file if the src and the dst files are the same. skipfile = update && sameFile(srcfs, child, dstfs, new Path(args.dst, dst)); // skip file if it exceed file limit or size limit skipfile |= fileCount == args.filelimit || byteCount + child.getLen() > args.sizelimit; if (!skipfile) { ++fileCount; byteCount += child.getLen(); if (LOG.isTraceEnabled()) { LOG.trace("adding file " + child.getPath()); } ++cnsyncf; cbsyncs += child.getLen(); if (cnsyncf > SYNC_FILE_MAX || cbsyncs > BYTES_PER_MAP) { src_writer.sync(); dst_writer.sync(); cnsyncf = 0; cbsyncs = 0L; } } } if (!skipfile) { src_writer.append( new LongWritable(child.isDir() ? 0 : child.getLen()), new FilePair(child, dst)); } dst_writer.append(new Text(dst), new Text(child.getPath().toString())); } if (cur.isDir()) { String dst = makeRelative(root, cur.getPath()); dir_writer.append(new Text(dst), new FilePair(cur, dst)); if (++dirsyn > SYNC_FILE_MAX) { dirsyn = 0; dir_writer.sync(); } } } } } finally { checkAndClose(src_writer); checkAndClose(dst_writer); checkAndClose(dir_writer); } FileStatus dststatus = null; try { dststatus = dstfs.getFileStatus(args.dst); } catch (FileNotFoundException fnfe) { LOG.info(args.dst + " does not exist."); } // create dest path dir if copying > 1 file if (dststatus == null) { if (srcCount > 1 && !dstfs.mkdirs(args.dst)) { throw new IOException("Failed to create" + args.dst); } } final Path sorted = new Path(jobDirectory, "_distcp_sorted"); checkDuplication(jobfs, dstfilelist, sorted, conf); if (dststatus != null && args.flags.contains(Options.DELETE)) { deleteNonexisting(dstfs, dststatus, sorted, jobfs, jobDirectory, jobConf, conf); } Path tmpDir = new Path( (dstExists && !dstIsDir) || (!dstExists && srcCount == 1) ? args.dst.getParent() : args.dst, "_distcp_tmp_" + randomId); jobConf.set(TMP_DIR_LABEL, tmpDir.toUri().toString()); LOG.info("srcCount=" + srcCount); jobConf.setInt(SRC_COUNT_LABEL, srcCount); jobConf.setLong(TOTAL_SIZE_LABEL, byteCount); setMapCount(byteCount, jobConf); }
private static void updatePermissions( FileStatus src, FileStatus dst, EnumSet<FileAttribute> preseved, FileSystem destFileSys) throws IOException { String owner = null; String group = null; if (preseved.contains(FileAttribute.USER) && !src.getOwner().equals(dst.getOwner())) { owner = src.getOwner(); } if (preseved.contains(FileAttribute.GROUP) && !src.getGroup().equals(dst.getGroup())) { group = src.getGroup(); } if (owner != null || group != null) { destFileSys.setOwner(dst.getPath(), owner, group); } if (preseved.contains(FileAttribute.PERMISSION) && !src.getPermission().equals(dst.getPermission())) { destFileSys.setPermission(dst.getPath(), src.getPermission()); } }
/** * Copy a file to a destination. * * @param srcstat src path and metadata * @param dstpath dst path * @param reporter */ private void copy( FileStatus srcstat, Path relativedst, OutputCollector<WritableComparable<?>, Text> outc, Reporter reporter) throws IOException { Path absdst = new Path(destPath, relativedst); int totfiles = job.getInt(SRC_COUNT_LABEL, -1); assert totfiles >= 0 : "Invalid file count " + totfiles; // if a directory, ensure created even if empty if (srcstat.isDir()) { if (destFileSys.exists(absdst)) { if (!destFileSys.getFileStatus(absdst).isDir()) { throw new IOException("Failed to mkdirs: " + absdst + " is a file."); } } else if (!destFileSys.mkdirs(absdst)) { throw new IOException("Failed to mkdirs " + absdst); } // TODO: when modification times can be set, directories should be // emitted to reducers so they might be preserved. Also, mkdirs does // not currently return an error when the directory already exists; // if this changes, all directory work might as well be done in reduce return; } if (destFileSys.exists(absdst) && !overwrite && !needsUpdate(srcstat, destFileSys, absdst)) { outc.collect(null, new Text("SKIP: " + srcstat.getPath())); ++skipcount; reporter.incrCounter(Counter.SKIP, 1); updateStatus(reporter); return; } Path tmpfile = new Path(job.get(TMP_DIR_LABEL), relativedst); long cbcopied = 0L; FSDataInputStream in = null; FSDataOutputStream out = null; try { // open src file try { in = srcstat.getPath().getFileSystem(job).open(srcstat.getPath()); } catch (IOException e) { LOG.error("Failed to open src file " + srcstat.getPath() + ", ignore and return"); in = null; return; } reporter.incrCounter(Counter.BYTESEXPECTED, srcstat.getLen()); // open tmp file out = create(tmpfile, reporter, srcstat); // copy file for (int cbread; (cbread = in.read(buffer)) >= 0; ) { out.write(buffer, 0, cbread); cbcopied += cbread; reporter.setStatus( String.format("%.2f ", cbcopied * 100.0 / srcstat.getLen()) + absdst + " [ " + StringUtils.humanReadableInt(cbcopied) + " / " + StringUtils.humanReadableInt(srcstat.getLen()) + " ]"); } } finally { checkAndClose(in); checkAndClose(out); } if (cbcopied != srcstat.getLen()) { if (srcstat.getLen() == 0 && cbcopied > 0) { LOG.info("most likely see a WAL file corruption: " + srcstat.getPath()); } else { throw new IOException( "File size not matched: copied " + bytesString(cbcopied) + " to tmpfile (=" + tmpfile + ") but expected " + bytesString(srcstat.getLen()) + " from " + srcstat.getPath()); } } else { if (totfiles == 1) { // Copying a single file; use dst path provided by user as destination // rather than destination directory, if a file Path dstparent = absdst.getParent(); if (!(destFileSys.exists(dstparent) && destFileSys.getFileStatus(dstparent).isDir())) { absdst = dstparent; } } if (destFileSys.exists(absdst) && destFileSys.getFileStatus(absdst).isDir()) { throw new IOException(absdst + " is a directory"); } if (!destFileSys.mkdirs(absdst.getParent())) { throw new IOException("Failed to craete parent dir: " + absdst.getParent()); } rename(tmpfile, absdst); FileStatus dststat = destFileSys.getFileStatus(absdst); if (dststat.getLen() != srcstat.getLen()) { destFileSys.delete(absdst, false); throw new IOException( "File size not matched: copied " + bytesString(dststat.getLen()) + " to dst (=" + absdst + ") but expected " + bytesString(srcstat.getLen()) + " from " + srcstat.getPath()); } updatePermissions(srcstat, dststat); } // report at least once for each file ++copycount; reporter.incrCounter(Counter.BYTESCOPIED, cbcopied); reporter.incrCounter(Counter.COPY, 1); updateStatus(reporter); }
/** * Return true if dst should be replaced by src and the update flag is set. Right now, this * merely checks that the src and dst len are not equal. This should be improved on once * modification times, CRCs, etc. can be meaningful in this context. * * @throws IOException */ private boolean needsUpdate(FileStatus srcstatus, FileSystem dstfs, Path dstpath) throws IOException { return update && !sameFile(srcstatus.getPath().getFileSystem(job), srcstatus, dstfs, dstpath); }
/** * Method to move files from HDFS to local filesystem * * <p>localPath: Path on the machines filesystem fs:FileSystem object from HDFS pathList:List of * paths for files that might need to be backed up size:max size in bytes to be backed up * * <p>ReturnsDate of the last files backed up if reached size limit, else, zero */ public long backupFiles( String localPath, String preservePath, FileSystem fs, ArrayList<Path> pathList, long size) { Path fsPath; long tmpSize = 0; long tmpDate = 0; // Start iterating over all paths for (Path hdfsPath : pathList) { try { long nFileSize = fs.getContentSummary(hdfsPath).getLength(); tmpSize = tmpSize + nFileSize; if ((tmpSize <= size) || (size == 0)) { FileStatus stat = fs.getFileStatus(hdfsPath); System.err.println( "File " + hdfsPath.toUri().getPath() + " " + nFileSize + " bytes, " + "perms: " + stat.getOwner() + "/" + stat.getGroup() + ", " + stat.getPermission().toString()); tmpDate = stat.getModificationTime() / 1000; String sFsPath = localPath + hdfsPath.toUri().getPath(); fsPath = new Path(sFsPath); File f = new File(sFsPath); // COMMENTED OUT: until a few backup cycles run // and the mtime gets in fact set on all copied // files. // // ignore it if the file exists and has the same mtime // if (f.exists() && f.isFile() && f.lastModified() == stat.getModificationTime()) // { // System.out.println("no need to backup " + f.toString() + ", mtime matches hdfs"); // continue; // } if (false == m_bDryRun) { // check if we need to back up the local file // (not directory), if it already exists. if (f.exists() && f.isFile()) { // ignore files with substrings in the // no-preserve file if (true == doPreserveFile(sFsPath)) { // move it to the backup path String sNewPath = preservePath + hdfsPath.toUri().getPath(); File newFile = new File(sNewPath); // create directory structure for new file? if (false == newFile.getParentFile().exists()) { if (false == newFile.getParentFile().mkdirs()) { System.err.println("Failed to mkdirs " + newFile.getParentFile().toString()); System.exit(1); } } // rename existing file to new location if (false == f.renameTo(newFile)) { System.err.println( "Failed to renameTo " + f.toString() + " to " + newFile.toString()); System.exit(1); } System.out.println("preserved " + f.toString() + " into " + newFile.toString()); } else { System.out.println("skipped preservation of " + f.toString()); } } // copy from hdfs to local filesystem fs.copyToLocalFile(hdfsPath, fsPath); // set the mtime to match hdfs file f.setLastModified(stat.getModificationTime()); // compare checksums on both files compareChecksums(fs, hdfsPath, sFsPath); } // don't print the progress after every file -- go // by at least 1% increments long nPercentDone = (long) (100 * tmpSize / m_nTotalBytes); if (nPercentDone > m_nLastPercentBytesDone) { System.out.println( "progress: copied " + prettyPrintBytes(tmpSize) + ", " + nPercentDone + "% done" + ", tstamp=" + tmpDate); m_nLastPercentBytesDone = nPercentDone; } if (m_nSleepSeconds > 0) { try { Thread.sleep(1000 * m_nSleepSeconds); } catch (Exception e2) { // ignore } } } else { return tmpDate; } } catch (IOException e) { System.err.println("FATAL ERROR: Something wrong with the file"); System.err.println(e); System.out.println(tmpDate); System.exit(1); return 0; } } return 0; }
public void write(DataOutput out) throws IOException { input.write(out); Text.writeString(out, output); }
/** * Compare the checksums of the hdfs file as well as the local copied file. * * @author [email protected] * @date Fri Jan 27 06:06:00 2012 */ boolean compareChecksums(FileSystem fs, Path p, String sFsPath) { try { // get hdfs file info FileStatus stat = fs.getFileStatus(p); // get HDFS checksum FileChecksum ck = fs.getFileChecksum(p); String sCk, sCkShort; if (ck == null) { sCk = sCkShort = "<null>"; } else { sCk = ck.toString(); sCkShort = sCk.replaceAll("^.*:", ""); } // System.out.println(p.toUri().getPath() + " len=" + stat.getLen() // + " " + stat.getOwner() + "/" + stat.getGroup() // + " checksum=" + sCk); // find the local file File fLocal = new File(sFsPath); if (!fLocal.exists()) { System.out.println("CHECKSUM-ERROR: file does not exist: " + sFsPath); return false; } if (!fLocal.isFile()) { System.out.println("CHECKSUM-ERROR: path is not a file: " + sFsPath); return false; } if (stat.getLen() != fLocal.length()) { System.out.println( "CHECKSUM-ERROR: length mismatch: " + sFsPath + " hdfslen=" + stat.getLen() + " fslen=" + fLocal.length()); return false; } // get local fs checksum FileChecksum ckLocal = getLocalFileChecksum(sFsPath); if (ckLocal == null) { System.out.println("ERROR Failed to get checksum for local file " + sFsPath); return false; } // compare checksums as a string, after stripping the // algorithm name from the beginning String sCkLocal = ckLocal.toString(); String sCkLocalShort = sCkLocal.replaceAll("^.*:", ""); if (false == sCkShort.equals(sCkLocalShort)) { System.out.println( "CHECKSUM-ERROR: checksum mismatch: " + sFsPath + "\nhdfs = " + sCk + "\nlocal= " + sCkLocal); return false; } return true; } catch (IOException e) { System.out.println("CHECKSUM-ERROR: " + sFsPath + " exception " + e.toString()); } return false; }
public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException { if (args.length != 2) { System.out.println("Usage: FeatureMatching ID <inputName.jpeg/inputName.jpg>"); System.exit(1); } SimpleDateFormat sdf = new SimpleDateFormat("", Locale.US); sdf.applyPattern("yyyy-MM-dd_HH-mm-ss"); String time = sdf.format(new Date()); Job job = Job.getInstance(); ID = "/" + args[0]; String filename = args[1]; filename = filename.toLowerCase(); System.out.println("current filename:" + filename); // Detect illegal username (if the username dir doesn't exist) File userPath = new File(LOCAL_USER_DIR + ID); if (!userPath.exists()) { System.out.println("Unauthorized username!!!\nExiting......"); System.exit(1); } // Preprocess the input image.jpg from local dir: /local.../user/ID/input/image.jpg // Save the features to local dir: hdfs://.../user/ID/input/image.jpg extractQueryFeatures2HDFS(filename, job); // Add the feature file to the hdfs cache String featureFileName = filename.substring(0, filename.lastIndexOf(".")) + ".json"; // job.addCacheFile(new Path(HDFS_HOME + USER + ID + INPUT + "/" + // featureFileName).toUri()); job.getConfiguration() .set("featureFilePath", HDFS_HOME + USER + ID + INPUT + "/" + featureFileName); // Check the file type. Only support jpeg/jpg type images String type = filename.substring(args[1].lastIndexOf(".")); if (!(type.equals(".jpg") || type.equals(".jpeg"))) { System.out.println("Image type not supported!!!\nExiting"); System.exit(1); } // Input: hdfs://.../features/ // The feature dir is a location of all features extracted from the database String inputPathStr = HDFS_HOME + FEATURES; // Output: hdfs://.../user/ID/output/ String outputPathStr = HDFS_HOME + USER + ID + OUTPUT + "/" + time; job.setInputFormatClass(KeyValueTextInputFormat.class); // job.setOutputFormatClass(TextOutputFormat.class); // Get the lists of all feature files: /.../features/data/part-* FileSystem fs = FileSystem.get(job.getConfiguration()); FileStatus[] statuses = fs.listStatus(new Path(inputPathStr)); StringBuffer sb = new StringBuffer(); for (FileStatus fileStatus : statuses) { sb.append(fileStatus.getPath() + ","); } sb.deleteCharAt(sb.lastIndexOf(",")); job.setJarByClass(FeatureMatching.class); job.setMapperClass(FeatureMatchMapper.class); job.setReducerClass(FeatureMatchReducer.class); // only need one reducer to collect the result job.setNumReduceTasks(1); job.setMapOutputKeyClass(IntWritable.class); job.setMapOutputValueClass(Text.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(DoubleWritable.class); // Input a directory, so need the recursive input FileInputFormat.setInputDirRecursive(job, true); // Set the PathFilter, to omit _SUCCESS files // (This is not working correctly, as the PathFilter class is an interface rather than a class. // But the 2nd arg asks me to extend the PathFilter) // FileInputFormat.setInputPathFilter(job, MyPathFilter.class); // // FileInputFormat.setInputPaths(job, new Path(inputPathStr)); FileInputFormat.setInputPaths(job, sb.toString()); FileOutputFormat.setOutputPath(job, new Path(outputPathStr)); boolean success = job.waitForCompletion(true); return success ? 0 : 1; }
@Override public final void run() { try { // before preparing the job localize // all the archives TaskAttemptID taskid = t.getTaskID(); LocalDirAllocator lDirAlloc = new LocalDirAllocator("mapred.local.dir"); File jobCacheDir = null; if (conf.getJar() != null) { jobCacheDir = new File(new Path(conf.getJar()).getParent().toString()); } File workDir = new File( lDirAlloc .getLocalPathToRead( TaskTracker.getJobCacheSubdir() + Path.SEPARATOR + t.getJobID() + Path.SEPARATOR + t.getTaskID() + Path.SEPARATOR + MRConstants.WORKDIR, conf) .toString()); URI[] archives = DistributedCache.getCacheArchives(conf); URI[] files = DistributedCache.getCacheFiles(conf); FileStatus fileStatus; FileSystem fileSystem; Path localPath; String baseDir; if ((archives != null) || (files != null)) { if (archives != null) { String[] archivesTimestamps = DistributedCache.getArchiveTimestamps(conf); Path[] p = new Path[archives.length]; for (int i = 0; i < archives.length; i++) { fileSystem = FileSystem.get(archives[i], conf); fileStatus = fileSystem.getFileStatus(new Path(archives[i].getPath())); String cacheId = DistributedCache.makeRelative(archives[i], conf); String cachePath = TaskTracker.getCacheSubdir() + Path.SEPARATOR + cacheId; if (lDirAlloc.ifExists(cachePath, conf)) { localPath = lDirAlloc.getLocalPathToRead(cachePath, conf); } else { localPath = lDirAlloc.getLocalPathForWrite(cachePath, fileStatus.getLen(), conf); } baseDir = localPath.toString().replace(cacheId, ""); p[i] = DistributedCache.getLocalCache( archives[i], conf, new Path(baseDir), fileStatus, true, Long.parseLong(archivesTimestamps[i]), new Path(workDir.getAbsolutePath()), false); } DistributedCache.setLocalArchives(conf, stringifyPathArray(p)); } if ((files != null)) { String[] fileTimestamps = DistributedCache.getFileTimestamps(conf); Path[] p = new Path[files.length]; for (int i = 0; i < files.length; i++) { fileSystem = FileSystem.get(files[i], conf); fileStatus = fileSystem.getFileStatus(new Path(files[i].getPath())); String cacheId = DistributedCache.makeRelative(files[i], conf); String cachePath = TaskTracker.getCacheSubdir() + Path.SEPARATOR + cacheId; if (lDirAlloc.ifExists(cachePath, conf)) { localPath = lDirAlloc.getLocalPathToRead(cachePath, conf); } else { localPath = lDirAlloc.getLocalPathForWrite(cachePath, fileStatus.getLen(), conf); } baseDir = localPath.toString().replace(cacheId, ""); p[i] = DistributedCache.getLocalCache( files[i], conf, new Path(baseDir), fileStatus, false, Long.parseLong(fileTimestamps[i]), new Path(workDir.getAbsolutePath()), false); } DistributedCache.setLocalFiles(conf, stringifyPathArray(p)); } Path localTaskFile = new Path(t.getJobFile()); FileSystem localFs = FileSystem.getLocal(conf); localFs.delete(localTaskFile, true); OutputStream out = localFs.create(localTaskFile); try { conf.writeXml(out); } finally { out.close(); } } if (!prepare()) { return; } String sep = System.getProperty("path.separator"); StringBuffer classPath = new StringBuffer(); // start with same classpath as parent process classPath.append(System.getProperty("java.class.path")); classPath.append(sep); if (!workDir.mkdirs()) { if (!workDir.isDirectory()) { LOG.fatal("Mkdirs failed to create " + workDir.toString()); } } String jar = conf.getJar(); if (jar != null) { // if jar exists, it into workDir File[] libs = new File(jobCacheDir, "lib").listFiles(); if (libs != null) { for (int i = 0; i < libs.length; i++) { classPath.append(sep); // add libs from jar to classpath classPath.append(libs[i]); } } classPath.append(sep); classPath.append(new File(jobCacheDir, "classes")); classPath.append(sep); classPath.append(jobCacheDir); } // include the user specified classpath // archive paths Path[] archiveClasspaths = DistributedCache.getArchiveClassPaths(conf); if (archiveClasspaths != null && archives != null) { Path[] localArchives = DistributedCache.getLocalCacheArchives(conf); if (localArchives != null) { for (int i = 0; i < archives.length; i++) { for (int j = 0; j < archiveClasspaths.length; j++) { if (archives[i].getPath().equals(archiveClasspaths[j].toString())) { classPath.append(sep); classPath.append(localArchives[i].toString()); } } } } } // file paths Path[] fileClasspaths = DistributedCache.getFileClassPaths(conf); if (fileClasspaths != null && files != null) { Path[] localFiles = DistributedCache.getLocalCacheFiles(conf); if (localFiles != null) { for (int i = 0; i < files.length; i++) { for (int j = 0; j < fileClasspaths.length; j++) { if (files[i].getPath().equals(fileClasspaths[j].toString())) { classPath.append(sep); classPath.append(localFiles[i].toString()); } } } } } classPath.append(sep); classPath.append(workDir); // Build exec child jmv args. Vector<String> vargs = new Vector<String>(8); File jvm = // use same jvm as parent new File(new File(System.getProperty("java.home"), "bin"), "java"); vargs.add(jvm.toString()); // Add child (task) java-vm options. // // The following symbols if present in mapred.child.java.opts value are // replaced: // + @taskid@ is interpolated with value of TaskID. // Other occurrences of @ will not be altered. // // Example with multiple arguments and substitutions, showing // jvm GC logging, and start of a passwordless JVM JMX agent so can // connect with jconsole and the likes to watch child memory, threads // and get thread dumps. // // <property> // <name>mapred.child.java.opts</name> // <value>-verbose:gc -Xloggc:/tmp/@[email protected] \ // -Dcom.sun.management.jmxremote.authenticate=false \ // -Dcom.sun.management.jmxremote.ssl=false \ // </value> // </property> // String javaOpts = conf.get("mapred.child.java.opts", "-Xmx200m"); javaOpts = javaOpts.replace("@taskid@", taskid.toString()); String[] javaOptsSplit = javaOpts.split(" "); // Add java.library.path; necessary for loading native libraries. // // 1. To support native-hadoop library i.e. libhadoop.so, we add the // parent processes' java.library.path to the child. // 2. We also add the 'cwd' of the task to it's java.library.path to help // users distribute native libraries via the DistributedCache. // 3. The user can also specify extra paths to be added to the // java.library.path via mapred.child.java.opts. // String libraryPath = System.getProperty("java.library.path"); if (libraryPath == null) { libraryPath = workDir.getAbsolutePath(); } else { libraryPath += sep + workDir; } boolean hasUserLDPath = false; for (int i = 0; i < javaOptsSplit.length; i++) { if (javaOptsSplit[i].startsWith("-Djava.library.path=")) { javaOptsSplit[i] += sep + libraryPath; hasUserLDPath = true; break; } } if (!hasUserLDPath) { vargs.add("-Djava.library.path=" + libraryPath); } for (int i = 0; i < javaOptsSplit.length; i++) { vargs.add(javaOptsSplit[i]); } // add java.io.tmpdir given by mapred.child.tmp String tmp = conf.get("mapred.child.tmp", "./tmp"); Path tmpDir = new Path(tmp); // if temp directory path is not absolute // prepend it with workDir. if (!tmpDir.isAbsolute()) { tmpDir = new Path(workDir.toString(), tmp); } FileSystem localFs = FileSystem.getLocal(conf); if (!localFs.mkdirs(tmpDir) && !localFs.getFileStatus(tmpDir).isDir()) { throw new IOException("Mkdirs failed to create " + tmpDir.toString()); } vargs.add("-Djava.io.tmpdir=" + tmpDir.toString()); // Add classpath. vargs.add("-classpath"); vargs.add(classPath.toString()); // Setup the log4j prop long logSize = TaskLog.getTaskLogLength(conf); vargs.add( "-Dhadoop.log.dir=" + new File(System.getProperty("hadoop.log.dir")).getAbsolutePath()); vargs.add("-Dhadoop.root.logger=INFO,TLA"); vargs.add("-Dhadoop.tasklog.taskid=" + taskid); vargs.add("-Dhadoop.tasklog.totalLogFileSize=" + logSize); if (conf.getProfileEnabled()) { if (conf.getProfileTaskRange(t.isMapTask()).isIncluded(t.getPartition())) { File prof = TaskLog.getTaskLogFile(taskid, TaskLog.LogName.PROFILE); vargs.add(String.format(conf.getProfileParams(), prof.toString())); } } // Add main class and its arguments vargs.add(Child.class.getName()); // main of Child // pass umbilical address InetSocketAddress address = tracker.getTaskTrackerReportAddress(); vargs.add(address.getAddress().getHostAddress()); vargs.add(Integer.toString(address.getPort())); vargs.add(taskid.toString()); // pass task identifier String pidFile = null; if (tracker.isTaskMemoryManagerEnabled()) { pidFile = lDirAlloc .getLocalPathForWrite( (TaskTracker.getPidFilesSubdir() + Path.SEPARATOR + taskid), this.conf) .toString(); } // set memory limit using ulimit if feasible and necessary ... String[] ulimitCmd = Shell.getUlimitMemoryCommand(conf); List<String> setup = null; if (ulimitCmd != null) { setup = new ArrayList<String>(); for (String arg : ulimitCmd) { setup.add(arg); } } // Set up the redirection of the task's stdout and stderr streams File stdout = TaskLog.getTaskLogFile(taskid, TaskLog.LogName.STDOUT); File stderr = TaskLog.getTaskLogFile(taskid, TaskLog.LogName.STDERR); stdout.getParentFile().mkdirs(); tracker.getTaskTrackerInstrumentation().reportTaskLaunch(taskid, stdout, stderr); Map<String, String> env = new HashMap<String, String>(); StringBuffer ldLibraryPath = new StringBuffer(); ldLibraryPath.append(workDir.toString()); String oldLdLibraryPath = null; oldLdLibraryPath = System.getenv("LD_LIBRARY_PATH"); if (oldLdLibraryPath != null) { ldLibraryPath.append(sep); ldLibraryPath.append(oldLdLibraryPath); } env.put("LD_LIBRARY_PATH", ldLibraryPath.toString()); jvmManager.launchJvm( this, jvmManager.constructJvmEnv( setup, vargs, stdout, stderr, logSize, workDir, env, pidFile, conf)); synchronized (lock) { while (!done) { lock.wait(); } } tracker.getTaskTrackerInstrumentation().reportTaskEnd(t.getTaskID()); if (exitCodeSet) { if (!killed && exitCode != 0) { if (exitCode == 65) { tracker.getTaskTrackerInstrumentation().taskFailedPing(t.getTaskID()); } throw new IOException("Task process exit with nonzero status of " + exitCode + "."); } } } catch (FSError e) { LOG.fatal("FSError", e); try { tracker.fsError(t.getTaskID(), e.getMessage()); } catch (IOException ie) { LOG.fatal(t.getTaskID() + " reporting FSError", ie); } } catch (Throwable throwable) { LOG.warn(t.getTaskID() + " Child Error", throwable); ByteArrayOutputStream baos = new ByteArrayOutputStream(); throwable.printStackTrace(new PrintStream(baos)); try { tracker.reportDiagnosticInfo(t.getTaskID(), baos.toString()); } catch (IOException e) { LOG.warn(t.getTaskID() + " Reporting Diagnostics", e); } } finally { try { URI[] archives = DistributedCache.getCacheArchives(conf); URI[] files = DistributedCache.getCacheFiles(conf); if (archives != null) { for (int i = 0; i < archives.length; i++) { DistributedCache.releaseCache(archives[i], conf); } } if (files != null) { for (int i = 0; i < files.length; i++) { DistributedCache.releaseCache(files[i], conf); } } } catch (IOException ie) { LOG.warn("Error releasing caches : Cache files might not have been cleaned up"); } tracker.reportTaskFinished(t.getTaskID(), false); if (t.isMapTask()) { tracker.addFreeMapSlot(); } else { tracker.addFreeReduceSlot(); } } }
/** Delete the dst files/dirs which do not exist in src */ private static void deleteNonexisting( FileSystem dstfs, FileStatus dstroot, Path dstsorted, FileSystem jobfs, Path jobdir, JobConf jobconf, Configuration conf) throws IOException { if (!dstroot.isDir()) { throw new IOException( "dst must be a directory when option " + Options.DELETE.cmd + " is set, but dst (= " + dstroot.getPath() + ") is not a directory."); } // write dst lsr results final Path dstlsr = new Path(jobdir, "_distcp_dst_lsr"); final SequenceFile.Writer writer = SequenceFile.createWriter( jobfs, jobconf, dstlsr, Text.class, FileStatus.class, SequenceFile.CompressionType.NONE); try { // do lsr to get all file statuses in dstroot final Stack<FileStatus> lsrstack = new Stack<FileStatus>(); for (lsrstack.push(dstroot); !lsrstack.isEmpty(); ) { final FileStatus status = lsrstack.pop(); if (status.isDir()) { for (FileStatus child : dstfs.listStatus(status.getPath())) { String relative = makeRelative(dstroot.getPath(), child.getPath()); writer.append(new Text(relative), child); lsrstack.push(child); } } } } finally { checkAndClose(writer); } // sort lsr results final Path sortedlsr = new Path(jobdir, "_distcp_dst_lsr_sorted"); SequenceFile.Sorter sorter = new SequenceFile.Sorter( jobfs, new Text.Comparator(), Text.class, FileStatus.class, jobconf); sorter.sort(dstlsr, sortedlsr); // compare lsr list and dst list SequenceFile.Reader lsrin = null; SequenceFile.Reader dstin = null; try { lsrin = new SequenceFile.Reader(jobfs, sortedlsr, jobconf); dstin = new SequenceFile.Reader(jobfs, dstsorted, jobconf); // compare sorted lsr list and sorted dst list final Text lsrpath = new Text(); final FileStatus lsrstatus = new FileStatus(); final Text dstpath = new Text(); final Text dstfrom = new Text(); final FsShell shell = new FsShell(conf); final String[] shellargs = {"-rmr", null}; boolean hasnext = dstin.next(dstpath, dstfrom); for (; lsrin.next(lsrpath, lsrstatus); ) { int dst_cmp_lsr = dstpath.compareTo(lsrpath); for (; hasnext && dst_cmp_lsr < 0; ) { hasnext = dstin.next(dstpath, dstfrom); dst_cmp_lsr = dstpath.compareTo(lsrpath); } if (dst_cmp_lsr == 0) { // lsrpath exists in dst, skip it hasnext = dstin.next(dstpath, dstfrom); } else { // lsrpath does not exist, delete it String s = new Path(dstroot.getPath(), lsrpath.toString()).toString(); if (shellargs[1] == null || !isAncestorPath(shellargs[1], s)) { shellargs[1] = s; int r = 0; try { r = shell.run(shellargs); } catch (Exception e) { throw new IOException("Exception from shell.", e); } if (r != 0) { throw new IOException( "\"" + shellargs[0] + " " + shellargs[1] + "\" returns non-zero value " + r); } } } } } finally { checkAndClose(lsrin); checkAndClose(dstin); } }
public void readFields(DataInput in) throws IOException { input.readFields(in); output = Text.readString(in); }
@Test public void testListStatus() throws Exception { final String hPrefix = "test/hadoop"; final String[] dirs = { hPrefix + "/a", hPrefix + "/b", hPrefix + "/c", hPrefix + "/1", hPrefix + "/#@#@", hPrefix + "/&*#$#$@234" }; ArrayList<Path> testDirs = new ArrayList<Path>(); for (String d : dirs) { testDirs.add(qualifiedPath(d, fc2)); } Assert.assertFalse(exists(fc1, testDirs.get(0))); for (Path path : testDirs) { fc1.mkdir(path, FsPermission.getDefault(), true); } // test listStatus that returns an array of FileStatus FileStatus[] paths = fc1.util().listStatus(qualifiedPath("test", fc1)); Assert.assertEquals(1, paths.length); Assert.assertEquals(qualifiedPath(hPrefix, fc1), paths[0].getPath()); paths = fc1.util().listStatus(qualifiedPath(hPrefix, fc1)); Assert.assertEquals(6, paths.length); for (int i = 0; i < dirs.length; i++) { boolean found = false; for (int j = 0; j < paths.length; j++) { if (qualifiedPath(dirs[i], fc1).equals(paths[j].getPath())) { found = true; } } Assert.assertTrue(dirs[i] + " not found", found); } paths = fc1.util().listStatus(qualifiedPath(dirs[0], fc1)); Assert.assertEquals(0, paths.length); // test listStatus that returns an iterator of FileStatus RemoteIterator<FileStatus> pathsItor = fc1.listStatus(qualifiedPath("test", fc1)); Assert.assertEquals(qualifiedPath(hPrefix, fc1), pathsItor.next().getPath()); Assert.assertFalse(pathsItor.hasNext()); pathsItor = fc1.listStatus(qualifiedPath(hPrefix, fc1)); int dirLen = 0; for (; pathsItor.hasNext(); dirLen++) { boolean found = false; FileStatus stat = pathsItor.next(); for (int j = 0; j < dirs.length; j++) { if (qualifiedPath(dirs[j], fc1).equals(stat.getPath())) { found = true; break; } } Assert.assertTrue(stat.getPath() + " not found", found); } Assert.assertEquals(6, dirLen); pathsItor = fc1.listStatus(qualifiedPath(dirs[0], fc1)); Assert.assertFalse(pathsItor.hasNext()); }
/** * Method to go though the HDFS filesystem in a DFS to find all files * * <p>fs:FileSystem object from HDFS minDate: Oldest date for files to be backed up maxDate:Newest * date for files to be backed up p:Path in HDFS to look for files pathList:Will be filled with * all files in p hmTimestamps: hashmap of timestamps for later sorting */ public void checkDir( FileSystem fs, long minDate, long maxDate, Path p, ArrayList<Path> pathList, HashMap<Path, Long> hmTimestamps) { long tmpDate; FileStatus[] fStat; try { String sPath = p.toUri().getPath(); // If this is a directory if (fs.getFileStatus(p).isDir()) { // ignore certain directories if ("dfstmp".equals(p.getName()) || "tmp".equals(p.getName()) || "jobtracker".equals(p.getName()) || sPath.startsWith("/mapred") || "ops".equals(p.getName()) || p.getName().startsWith("_distcp_logs")) { return; } // dump the mkdir and chmod commands for this // directory -- skip root directory only { FileStatus stat = fs.getFileStatus(p); if (!sPath.equals("/")) { m_wrMkdirs.println("hadoop fs -mkdir " + sPath); } m_wrChmods.println( "hadoop fs -chown " + stat.getOwner() + ":" + stat.getGroup() + " " + sPath); Short sh = new Short(stat.getPermission().toShort()); m_wrChmods.println( "hadoop fs -chmod " + Long.toOctalString(sh.longValue()) + " " + sPath); } fStat = fs.listStatus(p); // Do a recursive call to all elements for (int i = 0; i < fStat.length; i++) { checkDir(fs, minDate, maxDate, fStat[i].getPath(), pathList, hmTimestamps); } } else { // If not a directory then we've found a file // ignore crc files if (p.getName().endsWith(".crc")) { return; } // ignore other files if (sPath.startsWith("/user/oozie/etl/workflows/")) { return; } // try to get the table name from the path. There are // various types of tables, from those replicated from // another database to regular hive tables to // partitioned hive tables. We use table names to // both exclude some from the backup, and for the rest // to dump out the schema and partition name. if (m_ignoreTables != null && m_ignoreTables.doIgnoreFile(sPath)) { m_nIgnoredTables++; if (m_nIgnoredTables < 5) { System.out.println("Skipping ignore-table file: " + sPath); } else if (m_nIgnoredTables == 5) { System.out.println("(...not showing other skipped tables...)"); } return; } FileStatus stat = fs.getFileStatus(p); tmpDate = stat.getModificationTime() / 1000; // store the chmods/chowns for all files m_wrChmods.println( "hadoop fs -chown " + stat.getOwner() + ":" + stat.getGroup() + " " + sPath); m_wrChmods.println("hadoop fs -chmod " + stat.getPermission().toShort() + " " + sPath); // check dates. is it too young? if (tmpDate < minDate) { return; } // is the file too recent? if (tmpDate > maxDate) { // System.out.println("file too recent: " + sPath); return; } // file timestamp is ok pathList.add(p); hmTimestamps.put(p, new Long(tmpDate)); // store info about total bytes neeed to backup m_nTotalBytes += fs.getContentSummary(p).getLength(); } } catch (IOException e) { System.err.println("ERROR: could not open " + p + ": " + e); // System.exit(1) ; } }