private static IntWritable deduceInputFile(JobConf job) { Path[] inputPaths = FileInputFormat.getInputPaths(job); Path inputFile = new Path(job.get("map.input.file")); // value == one for sort-input; value == two for sort-output return (inputFile.getParent().equals(inputPaths[0])) ? sortInput : sortOutput; }
private static void finalize( Configuration conf, JobConf jobconf, final Path destPath, String presevedAttributes) throws IOException { if (presevedAttributes == null) { return; } EnumSet<FileAttribute> preseved = FileAttribute.parse(presevedAttributes); if (!preseved.contains(FileAttribute.USER) && !preseved.contains(FileAttribute.GROUP) && !preseved.contains(FileAttribute.PERMISSION)) { return; } FileSystem dstfs = destPath.getFileSystem(conf); Path dstdirlist = new Path(jobconf.get(DST_DIR_LIST_LABEL)); SequenceFile.Reader in = null; try { in = new SequenceFile.Reader(dstdirlist.getFileSystem(jobconf), dstdirlist, jobconf); Text dsttext = new Text(); FilePair pair = new FilePair(); for (; in.next(dsttext, pair); ) { Path absdst = new Path(destPath, pair.output); updatePermissions(pair.input, dstfs.getFileStatus(absdst), preseved, dstfs); } } finally { checkAndClose(in); } }
/** Sanity check for srcPath */ private static void checkSrcPath(Configuration conf, List<Path> srcPaths) throws IOException { List<IOException> rslt = new ArrayList<IOException>(); for (Path p : srcPaths) { FileSystem fs = p.getFileSystem(conf); if (!fs.exists(p)) { rslt.add(new IOException("Input source " + p + " does not exist.")); } } if (!rslt.isEmpty()) { throw new InvalidInputException(rslt); } }
/** * Produce splits such that each is no greater than the quotient of the total size and the * number of splits requested. * * @param job The handle to the JobConf object * @param numSplits Number of splits requested */ public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException { int cnfiles = job.getInt(SRC_COUNT_LABEL, -1); long cbsize = job.getLong(TOTAL_SIZE_LABEL, -1); String srcfilelist = job.get(SRC_LIST_LABEL, ""); if (cnfiles < 0 || cbsize < 0 || "".equals(srcfilelist)) { throw new RuntimeException( "Invalid metadata: #files(" + cnfiles + ") total_size(" + cbsize + ") listuri(" + srcfilelist + ")"); } Path src = new Path(srcfilelist); FileSystem fs = src.getFileSystem(job); FileStatus srcst = fs.getFileStatus(src); ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits); LongWritable key = new LongWritable(); FilePair value = new FilePair(); final long targetsize = cbsize / numSplits; long pos = 0L; long last = 0L; long acc = 0L; long cbrem = srcst.getLen(); SequenceFile.Reader sl = null; try { sl = new SequenceFile.Reader(fs, src, job); for (; sl.next(key, value); last = sl.getPosition()) { // if adding this split would put this split past the target size, // cut the last split and put this next file in the next split. if (acc + key.get() > targetsize && acc != 0) { long splitsize = last - pos; splits.add(new FileSplit(src, pos, splitsize, (String[]) null)); cbrem -= splitsize; pos = last; acc = 0L; } acc += key.get(); } } finally { checkAndClose(sl); } if (cbrem != 0) { splits.add(new FileSplit(src, pos, cbrem, (String[]) null)); } return splits.toArray(new FileSplit[splits.size()]); }
/** * Make a path relative with respect to a root path. absPath is always assumed to descend from * root. Otherwise returned path is null. */ static String makeRelative(Path root, Path absPath) { if (!absPath.isAbsolute()) { throw new IllegalArgumentException("!absPath.isAbsolute(), absPath=" + absPath); } String p = absPath.toUri().getPath(); StringTokenizer pathTokens = new StringTokenizer(p, "/"); for (StringTokenizer rootTokens = new StringTokenizer(root.toUri().getPath(), "/"); rootTokens.hasMoreTokens(); ) { if (!rootTokens.nextToken().equals(pathTokens.nextToken())) { return null; } } StringBuilder sb = new StringBuilder(); for (; pathTokens.hasMoreTokens(); ) { sb.append(pathTokens.nextToken()); if (pathTokens.hasMoreTokens()) { sb.append(Path.SEPARATOR); } } return sb.length() == 0 ? "." : sb.toString(); }
private static List<Path> fetchFileList(Configuration conf, Path srcList) throws IOException { List<Path> result = new ArrayList<Path>(); FileSystem fs = srcList.getFileSystem(conf); BufferedReader input = null; try { input = new BufferedReader(new InputStreamReader(fs.open(srcList))); String line = input.readLine(); while (line != null) { result.add(new Path(line)); line = input.readLine(); } } finally { checkAndClose(input); } return result; }
/** * Mapper configuration. Extracts source and destination file system, as well as top-level paths * on source and destination directories. Gets the named file systems, to be used later in map. */ public void configure(JobConf job) { destPath = new Path(job.get(DST_DIR_LABEL, "/")); try { destFileSys = destPath.getFileSystem(job); } catch (IOException ex) { throw new RuntimeException("Unable to get the named file system.", ex); } sizeBuf = job.getInt("copy.buf.size", 128 * 1024); buffer = new byte[sizeBuf]; ignoreReadFailures = job.getBoolean(Options.IGNORE_READ_FAILURES.propertyname, false); preserve_status = job.getBoolean(Options.PRESERVE_STATUS.propertyname, false); if (preserve_status) { preseved = FileAttribute.parse(job.get(PRESERVE_STATUS_LABEL)); } update = job.getBoolean(Options.UPDATE.propertyname, false); overwrite = !update && job.getBoolean(Options.OVERWRITE.propertyname, false); this.job = job; }
public static void main(String[] args) { int testType = TEST_TYPE_READ; int bufferSize = DEFAULT_BUFFER_SIZE; int fileSize = 1; int nrFiles = 1; String resFileName = DEFAULT_RES_FILE_NAME; boolean isSequential = false; String version = "DFSCIOTest.0.0.1"; String usage = "Usage: DFSCIOTest -read | -write | -clean [-nrFiles N] [-fileSize MB] [-resFile resultFileName] [-bufferSize Bytes] "; System.out.println(version); if (args.length == 0) { System.err.println(usage); System.exit(-1); } for (int i = 0; i < args.length; i++) { // parse command line if (args[i].startsWith("-r")) { testType = TEST_TYPE_READ; } else if (args[i].startsWith("-w")) { testType = TEST_TYPE_WRITE; } else if (args[i].startsWith("-clean")) { testType = TEST_TYPE_CLEANUP; } else if (args[i].startsWith("-seq")) { isSequential = true; } else if (args[i].equals("-nrFiles")) { nrFiles = Integer.parseInt(args[++i]); } else if (args[i].equals("-fileSize")) { fileSize = Integer.parseInt(args[++i]); } else if (args[i].equals("-bufferSize")) { bufferSize = Integer.parseInt(args[++i]); } else if (args[i].equals("-resFile")) { resFileName = args[++i]; } } LOG.info("nrFiles = " + nrFiles); LOG.info("fileSize (MB) = " + fileSize); LOG.info("bufferSize = " + bufferSize); try { fsConfig.setInt("test.io.file.buffer.size", bufferSize); FileSystem fs = FileSystem.get(fsConfig); if (testType != TEST_TYPE_CLEANUP) { fs.delete(HDFS_TEST_DIR, true); if (!fs.mkdirs(HDFS_TEST_DIR)) { throw new IOException("Mkdirs failed to create " + HDFS_TEST_DIR.toString()); } // Copy the executables over to the remote filesystem String hadoopHome = System.getenv("HADOOP_PREFIX"); fs.copyFromLocalFile( new Path(hadoopHome + "/libhdfs/libhdfs.so." + HDFS_LIB_VERSION), HDFS_SHLIB); fs.copyFromLocalFile(new Path(hadoopHome + "/libhdfs/hdfs_read"), HDFS_READ); fs.copyFromLocalFile(new Path(hadoopHome + "/libhdfs/hdfs_write"), HDFS_WRITE); } if (isSequential) { long tStart = System.currentTimeMillis(); sequentialTest(fs, testType, fileSize, nrFiles); long execTime = System.currentTimeMillis() - tStart; String resultLine = "Seq Test exec time sec: " + (float) execTime / 1000; LOG.info(resultLine); return; } if (testType == TEST_TYPE_CLEANUP) { cleanup(fs); return; } createControlFile(fs, fileSize, nrFiles); long tStart = System.currentTimeMillis(); if (testType == TEST_TYPE_WRITE) writeTest(fs); if (testType == TEST_TYPE_READ) readTest(fs); long execTime = System.currentTimeMillis() - tStart; analyzeResult(fs, testType, execTime, resFileName); } catch (Exception e) { System.err.print(e.getLocalizedMessage()); System.exit(-1); } }
public Long doIO(Reporter reporter, String name, long totalSize) throws IOException { totalSize *= MEGA; // create instance of local filesystem FileSystem localFS = FileSystem.getLocal(fsConfig); try { // native runtime Runtime runTime = Runtime.getRuntime(); // copy the dso and executable from dfs synchronized (this) { localFS.delete(HDFS_TEST_DIR, true); if (!(localFS.mkdirs(HDFS_TEST_DIR))) { throw new IOException("Failed to create " + HDFS_TEST_DIR + " on local filesystem"); } } synchronized (this) { if (!localFS.exists(HDFS_SHLIB)) { if (!FileUtil.copy(fs, HDFS_SHLIB, localFS, HDFS_SHLIB, false, fsConfig)) { throw new IOException("Failed to copy " + HDFS_SHLIB + " to local filesystem"); } String chmodCmd = new String(CHMOD + " a+x " + HDFS_SHLIB); Process process = runTime.exec(chmodCmd); int exitStatus = process.waitFor(); if (exitStatus != 0) { throw new IOException(chmodCmd + ": Failed with exitStatus: " + exitStatus); } } } synchronized (this) { if (!localFS.exists(HDFS_READ)) { if (!FileUtil.copy(fs, HDFS_READ, localFS, HDFS_READ, false, fsConfig)) { throw new IOException("Failed to copy " + HDFS_READ + " to local filesystem"); } String chmodCmd = new String(CHMOD + " a+x " + HDFS_READ); Process process = runTime.exec(chmodCmd); int exitStatus = process.waitFor(); if (exitStatus != 0) { throw new IOException(chmodCmd + ": Failed with exitStatus: " + exitStatus); } } } // exec the C program Path inFile = new Path(DATA_DIR, name); String readCmd = new String(HDFS_READ + " " + inFile + " " + totalSize + " " + bufferSize); Process process = runTime.exec(readCmd, null, new File(HDFS_TEST_DIR.toString())); int exitStatus = process.waitFor(); if (exitStatus != 0) { throw new IOException(HDFS_READ + ": Failed with exitStatus: " + exitStatus); } } catch (InterruptedException interruptedException) { reporter.setStatus(interruptedException.toString()); } finally { localFS.close(); } return new Long(totalSize); }
/** * Initialize DFSCopyFileMapper specific job-configuration. * * @param conf : The dfs/mapred configuration. * @param jobConf : The handle to the jobConf object to be initialized. * @param args Arguments */ private static void setup(Configuration conf, JobConf jobConf, final Arguments args) throws IOException { jobConf.set(DST_DIR_LABEL, args.dst.toUri().toString()); // set boolean values final boolean update = args.flags.contains(Options.UPDATE); final boolean overwrite = !update && args.flags.contains(Options.OVERWRITE); jobConf.setBoolean(Options.UPDATE.propertyname, update); jobConf.setBoolean(Options.OVERWRITE.propertyname, overwrite); jobConf.setBoolean( Options.IGNORE_READ_FAILURES.propertyname, args.flags.contains(Options.IGNORE_READ_FAILURES)); jobConf.setBoolean( Options.PRESERVE_STATUS.propertyname, args.flags.contains(Options.PRESERVE_STATUS)); final String randomId = getRandomId(); JobClient jClient = new JobClient(jobConf); Path jobDirectory = new Path(jClient.getSystemDir(), NAME + "_" + randomId); jobConf.set(JOB_DIR_LABEL, jobDirectory.toString()); FileSystem dstfs = args.dst.getFileSystem(conf); boolean dstExists = dstfs.exists(args.dst); boolean dstIsDir = false; if (dstExists) { dstIsDir = dstfs.getFileStatus(args.dst).isDir(); } // default logPath Path logPath = args.log; if (logPath == null) { String filename = "_distcp_logs_" + randomId; if (!dstExists || !dstIsDir) { Path parent = args.dst.getParent(); if (!dstfs.exists(parent)) { dstfs.mkdirs(parent); } logPath = new Path(parent, filename); } else { logPath = new Path(args.dst, filename); } } FileOutputFormat.setOutputPath(jobConf, logPath); // create src list, dst list FileSystem jobfs = jobDirectory.getFileSystem(jobConf); Path srcfilelist = new Path(jobDirectory, "_distcp_src_files"); jobConf.set(SRC_LIST_LABEL, srcfilelist.toString()); SequenceFile.Writer src_writer = SequenceFile.createWriter( jobfs, jobConf, srcfilelist, LongWritable.class, FilePair.class, SequenceFile.CompressionType.NONE); Path dstfilelist = new Path(jobDirectory, "_distcp_dst_files"); SequenceFile.Writer dst_writer = SequenceFile.createWriter( jobfs, jobConf, dstfilelist, Text.class, Text.class, SequenceFile.CompressionType.NONE); Path dstdirlist = new Path(jobDirectory, "_distcp_dst_dirs"); jobConf.set(DST_DIR_LIST_LABEL, dstdirlist.toString()); SequenceFile.Writer dir_writer = SequenceFile.createWriter( jobfs, jobConf, dstdirlist, Text.class, FilePair.class, SequenceFile.CompressionType.NONE); // handle the case where the destination directory doesn't exist // and we've only a single src directory OR we're updating/overwriting // the contents of the destination directory. final boolean special = (args.srcs.size() == 1 && !dstExists) || update || overwrite; int srcCount = 0, cnsyncf = 0, dirsyn = 0; long fileCount = 0L, byteCount = 0L, cbsyncs = 0L; try { for (Iterator<Path> srcItr = args.srcs.iterator(); srcItr.hasNext(); ) { final Path src = srcItr.next(); FileSystem srcfs = src.getFileSystem(conf); FileStatus srcfilestat = srcfs.getFileStatus(src); Path root = special && srcfilestat.isDir() ? src : src.getParent(); if (srcfilestat.isDir()) { ++srcCount; } Stack<FileStatus> pathstack = new Stack<FileStatus>(); for (pathstack.push(srcfilestat); !pathstack.empty(); ) { FileStatus cur = pathstack.pop(); FileStatus[] children = srcfs.listStatus(cur.getPath()); for (int i = 0; i < children.length; i++) { boolean skipfile = false; final FileStatus child = children[i]; final String dst = makeRelative(root, child.getPath()); ++srcCount; if (child.isDir()) { pathstack.push(child); } else { // skip file if the src and the dst files are the same. skipfile = update && sameFile(srcfs, child, dstfs, new Path(args.dst, dst)); // skip file if it exceed file limit or size limit skipfile |= fileCount == args.filelimit || byteCount + child.getLen() > args.sizelimit; if (!skipfile) { ++fileCount; byteCount += child.getLen(); if (LOG.isTraceEnabled()) { LOG.trace("adding file " + child.getPath()); } ++cnsyncf; cbsyncs += child.getLen(); if (cnsyncf > SYNC_FILE_MAX || cbsyncs > BYTES_PER_MAP) { src_writer.sync(); dst_writer.sync(); cnsyncf = 0; cbsyncs = 0L; } } } if (!skipfile) { src_writer.append( new LongWritable(child.isDir() ? 0 : child.getLen()), new FilePair(child, dst)); } dst_writer.append(new Text(dst), new Text(child.getPath().toString())); } if (cur.isDir()) { String dst = makeRelative(root, cur.getPath()); dir_writer.append(new Text(dst), new FilePair(cur, dst)); if (++dirsyn > SYNC_FILE_MAX) { dirsyn = 0; dir_writer.sync(); } } } } } finally { checkAndClose(src_writer); checkAndClose(dst_writer); checkAndClose(dir_writer); } FileStatus dststatus = null; try { dststatus = dstfs.getFileStatus(args.dst); } catch (FileNotFoundException fnfe) { LOG.info(args.dst + " does not exist."); } // create dest path dir if copying > 1 file if (dststatus == null) { if (srcCount > 1 && !dstfs.mkdirs(args.dst)) { throw new IOException("Failed to create" + args.dst); } } final Path sorted = new Path(jobDirectory, "_distcp_sorted"); checkDuplication(jobfs, dstfilelist, sorted, conf); if (dststatus != null && args.flags.contains(Options.DELETE)) { deleteNonexisting(dstfs, dststatus, sorted, jobfs, jobDirectory, jobConf, conf); } Path tmpDir = new Path( (dstExists && !dstIsDir) || (!dstExists && srcCount == 1) ? args.dst.getParent() : args.dst, "_distcp_tmp_" + randomId); jobConf.set(TMP_DIR_LABEL, tmpDir.toUri().toString()); LOG.info("srcCount=" + srcCount); jobConf.setInt(SRC_COUNT_LABEL, srcCount); jobConf.setLong(TOTAL_SIZE_LABEL, byteCount); setMapCount(byteCount, jobConf); }
/** Fully delete dir */ static void fullyDelete(String dir, Configuration conf) throws IOException { if (dir != null) { Path tmp = new Path(dir); tmp.getFileSystem(conf).delete(tmp, true); } }
/** * Copy a file to a destination. * * @param srcstat src path and metadata * @param dstpath dst path * @param reporter */ private void copy( FileStatus srcstat, Path relativedst, OutputCollector<WritableComparable<?>, Text> outc, Reporter reporter) throws IOException { Path absdst = new Path(destPath, relativedst); int totfiles = job.getInt(SRC_COUNT_LABEL, -1); assert totfiles >= 0 : "Invalid file count " + totfiles; // if a directory, ensure created even if empty if (srcstat.isDir()) { if (destFileSys.exists(absdst)) { if (!destFileSys.getFileStatus(absdst).isDir()) { throw new IOException("Failed to mkdirs: " + absdst + " is a file."); } } else if (!destFileSys.mkdirs(absdst)) { throw new IOException("Failed to mkdirs " + absdst); } // TODO: when modification times can be set, directories should be // emitted to reducers so they might be preserved. Also, mkdirs does // not currently return an error when the directory already exists; // if this changes, all directory work might as well be done in reduce return; } if (destFileSys.exists(absdst) && !overwrite && !needsUpdate(srcstat, destFileSys, absdst)) { outc.collect(null, new Text("SKIP: " + srcstat.getPath())); ++skipcount; reporter.incrCounter(Counter.SKIP, 1); updateStatus(reporter); return; } Path tmpfile = new Path(job.get(TMP_DIR_LABEL), relativedst); long cbcopied = 0L; FSDataInputStream in = null; FSDataOutputStream out = null; try { // open src file try { in = srcstat.getPath().getFileSystem(job).open(srcstat.getPath()); } catch (IOException e) { LOG.error("Failed to open src file " + srcstat.getPath() + ", ignore and return"); in = null; return; } reporter.incrCounter(Counter.BYTESEXPECTED, srcstat.getLen()); // open tmp file out = create(tmpfile, reporter, srcstat); // copy file for (int cbread; (cbread = in.read(buffer)) >= 0; ) { out.write(buffer, 0, cbread); cbcopied += cbread; reporter.setStatus( String.format("%.2f ", cbcopied * 100.0 / srcstat.getLen()) + absdst + " [ " + StringUtils.humanReadableInt(cbcopied) + " / " + StringUtils.humanReadableInt(srcstat.getLen()) + " ]"); } } finally { checkAndClose(in); checkAndClose(out); } if (cbcopied != srcstat.getLen()) { if (srcstat.getLen() == 0 && cbcopied > 0) { LOG.info("most likely see a WAL file corruption: " + srcstat.getPath()); } else { throw new IOException( "File size not matched: copied " + bytesString(cbcopied) + " to tmpfile (=" + tmpfile + ") but expected " + bytesString(srcstat.getLen()) + " from " + srcstat.getPath()); } } else { if (totfiles == 1) { // Copying a single file; use dst path provided by user as destination // rather than destination directory, if a file Path dstparent = absdst.getParent(); if (!(destFileSys.exists(dstparent) && destFileSys.getFileStatus(dstparent).isDir())) { absdst = dstparent; } } if (destFileSys.exists(absdst) && destFileSys.getFileStatus(absdst).isDir()) { throw new IOException(absdst + " is a directory"); } if (!destFileSys.mkdirs(absdst.getParent())) { throw new IOException("Failed to craete parent dir: " + absdst.getParent()); } rename(tmpfile, absdst); FileStatus dststat = destFileSys.getFileStatus(absdst); if (dststat.getLen() != srcstat.getLen()) { destFileSys.delete(absdst, false); throw new IOException( "File size not matched: copied " + bytesString(dststat.getLen()) + " to dst (=" + absdst + ") but expected " + bytesString(srcstat.getLen()) + " from " + srcstat.getPath()); } updatePermissions(srcstat, dststat); } // report at least once for each file ++copycount; reporter.incrCounter(Counter.BYTESCOPIED, cbcopied); reporter.incrCounter(Counter.COPY, 1); updateStatus(reporter); }
public boolean accept(Path path) { return (path.getName().startsWith("part-")); }
static void checkRecords(Configuration defaults, Path sortInput, Path sortOutput) throws IOException { FileSystem inputfs = sortInput.getFileSystem(defaults); FileSystem outputfs = sortOutput.getFileSystem(defaults); FileSystem defaultfs = FileSystem.get(defaults); JobConf jobConf = new JobConf(defaults, RecordStatsChecker.class); jobConf.setJobName("sortvalidate-recordstats-checker"); int noSortReduceTasks = outputfs.listStatus(sortOutput, sortPathsFilter).length; jobConf.setInt("sortvalidate.sort.reduce.tasks", noSortReduceTasks); int noSortInputpaths = inputfs.listStatus(sortInput).length; jobConf.setInputFormat(NonSplitableSequenceFileInputFormat.class); jobConf.setOutputFormat(SequenceFileOutputFormat.class); jobConf.setOutputKeyClass(IntWritable.class); jobConf.setOutputValueClass(RecordStatsChecker.RecordStatsWritable.class); jobConf.setMapperClass(Map.class); jobConf.setCombinerClass(Reduce.class); jobConf.setReducerClass(Reduce.class); jobConf.setNumMapTasks(noSortReduceTasks); jobConf.setNumReduceTasks(1); FileInputFormat.setInputPaths(jobConf, sortInput); FileInputFormat.addInputPath(jobConf, sortOutput); Path outputPath = new Path("/tmp/sortvalidate/recordstatschecker"); if (defaultfs.exists(outputPath)) { defaultfs.delete(outputPath, true); } FileOutputFormat.setOutputPath(jobConf, outputPath); // Uncomment to run locally in a single process // job_conf.set("mapred.job.tracker", "local"); Path[] inputPaths = FileInputFormat.getInputPaths(jobConf); System.out.println( "\nSortValidator.RecordStatsChecker: Validate sort " + "from " + inputPaths[0] + " (" + noSortInputpaths + " files), " + inputPaths[1] + " (" + noSortReduceTasks + " files) into " + FileOutputFormat.getOutputPath(jobConf) + " with 1 reducer."); Date startTime = new Date(); System.out.println("Job started: " + startTime); JobClient.runJob(jobConf); Date end_time = new Date(); System.out.println("Job ended: " + end_time); System.out.println( "The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds."); // Check to ensure that the statistics of the // framework's sort-input and sort-output match SequenceFile.Reader stats = new SequenceFile.Reader(defaultfs, new Path(outputPath, "part-00000"), defaults); IntWritable k1 = new IntWritable(); IntWritable k2 = new IntWritable(); RecordStatsWritable v1 = new RecordStatsWritable(); RecordStatsWritable v2 = new RecordStatsWritable(); if (!stats.next(k1, v1)) { throw new IOException("Failed to read record #1 from reduce's output"); } if (!stats.next(k2, v2)) { throw new IOException("Failed to read record #2 from reduce's output"); } if ((v1.getBytes() != v2.getBytes()) || (v1.getRecords() != v2.getRecords()) || v1.getChecksum() != v2.getChecksum()) { throw new IOException( "(" + v1.getBytes() + ", " + v1.getRecords() + ", " + v1.getChecksum() + ") v/s (" + v2.getBytes() + ", " + v2.getRecords() + ", " + v2.getChecksum() + ")"); } }