/** * Method to go though the HDFS filesystem in a DFS to find all files * * <p>fs:FileSystem object from HDFS minDate: Oldest date for files to be backed up maxDate:Newest * date for files to be backed up p:Path in HDFS to look for files pathList:Will be filled with * all files in p hmTimestamps: hashmap of timestamps for later sorting */ public void checkDir( FileSystem fs, long minDate, long maxDate, Path p, ArrayList<Path> pathList, HashMap<Path, Long> hmTimestamps) { long tmpDate; FileStatus[] fStat; try { String sPath = p.toUri().getPath(); // If this is a directory if (fs.getFileStatus(p).isDir()) { // ignore certain directories if ("dfstmp".equals(p.getName()) || "tmp".equals(p.getName()) || "jobtracker".equals(p.getName()) || sPath.startsWith("/mapred") || "ops".equals(p.getName()) || p.getName().startsWith("_distcp_logs")) { return; } // dump the mkdir and chmod commands for this // directory -- skip root directory only { FileStatus stat = fs.getFileStatus(p); if (!sPath.equals("/")) { m_wrMkdirs.println("hadoop fs -mkdir " + sPath); } m_wrChmods.println( "hadoop fs -chown " + stat.getOwner() + ":" + stat.getGroup() + " " + sPath); Short sh = new Short(stat.getPermission().toShort()); m_wrChmods.println( "hadoop fs -chmod " + Long.toOctalString(sh.longValue()) + " " + sPath); } fStat = fs.listStatus(p); // Do a recursive call to all elements for (int i = 0; i < fStat.length; i++) { checkDir(fs, minDate, maxDate, fStat[i].getPath(), pathList, hmTimestamps); } } else { // If not a directory then we've found a file // ignore crc files if (p.getName().endsWith(".crc")) { return; } // ignore other files if (sPath.startsWith("/user/oozie/etl/workflows/")) { return; } // try to get the table name from the path. There are // various types of tables, from those replicated from // another database to regular hive tables to // partitioned hive tables. We use table names to // both exclude some from the backup, and for the rest // to dump out the schema and partition name. if (m_ignoreTables != null && m_ignoreTables.doIgnoreFile(sPath)) { m_nIgnoredTables++; if (m_nIgnoredTables < 5) { System.out.println("Skipping ignore-table file: " + sPath); } else if (m_nIgnoredTables == 5) { System.out.println("(...not showing other skipped tables...)"); } return; } FileStatus stat = fs.getFileStatus(p); tmpDate = stat.getModificationTime() / 1000; // store the chmods/chowns for all files m_wrChmods.println( "hadoop fs -chown " + stat.getOwner() + ":" + stat.getGroup() + " " + sPath); m_wrChmods.println("hadoop fs -chmod " + stat.getPermission().toShort() + " " + sPath); // check dates. is it too young? if (tmpDate < minDate) { return; } // is the file too recent? if (tmpDate > maxDate) { // System.out.println("file too recent: " + sPath); return; } // file timestamp is ok pathList.add(p); hmTimestamps.put(p, new Long(tmpDate)); // store info about total bytes neeed to backup m_nTotalBytes += fs.getContentSummary(p).getLength(); } } catch (IOException e) { System.err.println("ERROR: could not open " + p + ": " + e); // System.exit(1) ; } }
/** * Compare the checksums of the hdfs file as well as the local copied file. * * @author [email protected] * @date Fri Jan 27 06:06:00 2012 */ boolean compareChecksums(FileSystem fs, Path p, String sFsPath) { try { // get hdfs file info FileStatus stat = fs.getFileStatus(p); // get HDFS checksum FileChecksum ck = fs.getFileChecksum(p); String sCk, sCkShort; if (ck == null) { sCk = sCkShort = "<null>"; } else { sCk = ck.toString(); sCkShort = sCk.replaceAll("^.*:", ""); } // System.out.println(p.toUri().getPath() + " len=" + stat.getLen() // + " " + stat.getOwner() + "/" + stat.getGroup() // + " checksum=" + sCk); // find the local file File fLocal = new File(sFsPath); if (!fLocal.exists()) { System.out.println("CHECKSUM-ERROR: file does not exist: " + sFsPath); return false; } if (!fLocal.isFile()) { System.out.println("CHECKSUM-ERROR: path is not a file: " + sFsPath); return false; } if (stat.getLen() != fLocal.length()) { System.out.println( "CHECKSUM-ERROR: length mismatch: " + sFsPath + " hdfslen=" + stat.getLen() + " fslen=" + fLocal.length()); return false; } // get local fs checksum FileChecksum ckLocal = getLocalFileChecksum(sFsPath); if (ckLocal == null) { System.out.println("ERROR Failed to get checksum for local file " + sFsPath); return false; } // compare checksums as a string, after stripping the // algorithm name from the beginning String sCkLocal = ckLocal.toString(); String sCkLocalShort = sCkLocal.replaceAll("^.*:", ""); if (false == sCkShort.equals(sCkLocalShort)) { System.out.println( "CHECKSUM-ERROR: checksum mismatch: " + sFsPath + "\nhdfs = " + sCk + "\nlocal= " + sCkLocal); return false; } return true; } catch (IOException e) { System.out.println("CHECKSUM-ERROR: " + sFsPath + " exception " + e.toString()); } return false; }
/** * Method to move files from HDFS to local filesystem * * <p>localPath: Path on the machines filesystem fs:FileSystem object from HDFS pathList:List of * paths for files that might need to be backed up size:max size in bytes to be backed up * * <p>ReturnsDate of the last files backed up if reached size limit, else, zero */ public long backupFiles( String localPath, String preservePath, FileSystem fs, ArrayList<Path> pathList, long size) { Path fsPath; long tmpSize = 0; long tmpDate = 0; // Start iterating over all paths for (Path hdfsPath : pathList) { try { long nFileSize = fs.getContentSummary(hdfsPath).getLength(); tmpSize = tmpSize + nFileSize; if ((tmpSize <= size) || (size == 0)) { FileStatus stat = fs.getFileStatus(hdfsPath); System.err.println( "File " + hdfsPath.toUri().getPath() + " " + nFileSize + " bytes, " + "perms: " + stat.getOwner() + "/" + stat.getGroup() + ", " + stat.getPermission().toString()); tmpDate = stat.getModificationTime() / 1000; String sFsPath = localPath + hdfsPath.toUri().getPath(); fsPath = new Path(sFsPath); File f = new File(sFsPath); // COMMENTED OUT: until a few backup cycles run // and the mtime gets in fact set on all copied // files. // // ignore it if the file exists and has the same mtime // if (f.exists() && f.isFile() && f.lastModified() == stat.getModificationTime()) // { // System.out.println("no need to backup " + f.toString() + ", mtime matches hdfs"); // continue; // } if (false == m_bDryRun) { // check if we need to back up the local file // (not directory), if it already exists. if (f.exists() && f.isFile()) { // ignore files with substrings in the // no-preserve file if (true == doPreserveFile(sFsPath)) { // move it to the backup path String sNewPath = preservePath + hdfsPath.toUri().getPath(); File newFile = new File(sNewPath); // create directory structure for new file? if (false == newFile.getParentFile().exists()) { if (false == newFile.getParentFile().mkdirs()) { System.err.println("Failed to mkdirs " + newFile.getParentFile().toString()); System.exit(1); } } // rename existing file to new location if (false == f.renameTo(newFile)) { System.err.println( "Failed to renameTo " + f.toString() + " to " + newFile.toString()); System.exit(1); } System.out.println("preserved " + f.toString() + " into " + newFile.toString()); } else { System.out.println("skipped preservation of " + f.toString()); } } // copy from hdfs to local filesystem fs.copyToLocalFile(hdfsPath, fsPath); // set the mtime to match hdfs file f.setLastModified(stat.getModificationTime()); // compare checksums on both files compareChecksums(fs, hdfsPath, sFsPath); } // don't print the progress after every file -- go // by at least 1% increments long nPercentDone = (long) (100 * tmpSize / m_nTotalBytes); if (nPercentDone > m_nLastPercentBytesDone) { System.out.println( "progress: copied " + prettyPrintBytes(tmpSize) + ", " + nPercentDone + "% done" + ", tstamp=" + tmpDate); m_nLastPercentBytesDone = nPercentDone; } if (m_nSleepSeconds > 0) { try { Thread.sleep(1000 * m_nSleepSeconds); } catch (Exception e2) { // ignore } } } else { return tmpDate; } } catch (IOException e) { System.err.println("FATAL ERROR: Something wrong with the file"); System.err.println(e); System.out.println(tmpDate); System.exit(1); return 0; } } return 0; }