public static void main(String[] args) throws IOException {
  Path baseDir = null;
  String localPath = null;
  String preservePath = null;
  String sIgnoreTablesFilename = null;
  String sNoPreserveFilename = null;
  String sDateString = null;
  long size = 0;

  // UNIX dates for right now
  long now = new java.util.Date().getTime() / 1000;
  long maxDate = now;

  for (int i = 0; i < args.length; i++) {
    if (args[i].equals("--hdfs-path")) {
      baseDir = new Path(args[++i]);
      continue;
    }
    if (args[i].equals("--local-path")) {
      localPath = args[++i];
      continue;
    }
    if (args[i].equals("--preserve-path")) {
      preservePath = args[++i];
      continue;
    }
    if (args[i].equals("--no-preserve")) {
      sNoPreserveFilename = args[++i];
      continue;
    }
    if (args[i].equals("--ignore-tables")) {
      sIgnoreTablesFilename = args[++i];
      continue;
    }
    if (args[i].equals("--sleep")) {
      try {
        m_nSleepSeconds = Integer.parseInt(args[++i]);
      } catch (Exception e) {
        System.err.println("ERROR: " + e.toString() + "\n");
        usage();
      }
      continue;
    }
    if (args[i].equals("--dry-run")) {
      m_bDryRun = true;
      continue;
    }
    if (args[i].equals("--date")) {
      sDateString = args[++i];
      continue;
    }
    if (args[i].equals("--max-date")) {
      maxDate = Long.parseLong(args[++i]);
      continue;
    }
    if (args[i].equals("--max-bytes")) {
      size = Long.parseLong(args[++i]);
      continue;
    }

    System.err.println("ERROR: unknown arg " + args[i]);
    usage();
  }

  if (baseDir == null || localPath == null || preservePath == null || sDateString == null) {
    usage();
  }

  long minDate;

  if ("yesterday".equals(sDateString)) {
    // figure out yesterday's dates
    Calendar cal = Calendar.getInstance();
    // use add() rather than roll() so larger fields (month/year) are adjusted across boundaries
    cal.add(Calendar.DAY_OF_YEAR, -1);

    // yesterday midnight
    cal.set(Calendar.HOUR_OF_DAY, 0);
    cal.set(Calendar.MINUTE, 0);
    cal.set(Calendar.SECOND, 0);
    cal.set(Calendar.MILLISECOND, 0);
    minDate = cal.getTimeInMillis() / 1000;

    // yesterday end of day
    cal.set(Calendar.HOUR_OF_DAY, 23);
    cal.set(Calendar.MINUTE, 59);
    cal.set(Calendar.SECOND, 59);
    cal.set(Calendar.MILLISECOND, 999);
    maxDate = cal.getTimeInMillis() / 1000;
  } else if ("last-week".equals(sDateString)) {
    minDate = maxDate - (7 * 24 * 60 * 60);
  } else if ("last-day".equals(sDateString)) {
    minDate = maxDate - (24 * 60 * 60);
  } else {
    // UNIX date since epoch of last backup
    minDate = Long.parseLong(sDateString);
  }

  long tmpDate = 0;
  BackupHdfs bak = new BackupHdfs();

  // initialize the list of tables to ignore
  if (sIgnoreTablesFilename != null) {
    bak.initializeTablesToIgnore(sIgnoreTablesFilename);
  }

  // initialize list of files to not preserve
  if (sNoPreserveFilename != null) {
    bak.initializeNoPreserve(sNoPreserveFilename);
  }

  ArrayList<Path> pathList = new ArrayList<Path>(2000);
  HashMap<Path, Long> hmTimestamps = new HashMap<Path, Long>();

  Configuration conf = new Configuration();
  FileSystem fs = FileSystem.get(conf);

  // Only proceed if the HDFS path is a directory
  if (fs.getFileStatus(baseDir).isDir()) {
    Calendar cal = Calendar.getInstance();

    System.err.println("");
    cal.setTimeInMillis(minDate * 1000);
    System.err.println("min date = " + cal.getTime().toString());
    cal.setTimeInMillis(maxDate * 1000);
    System.err.println("max date = " + cal.getTime().toString());

    System.err.println("");
    System.err.println("Searching filesystem: " + baseDir.toUri().getPath());

    bak.checkDir(fs, minDate, maxDate, baseDir, pathList, hmTimestamps);

    System.err.println("");
    System.err.println("Skipped " + m_nIgnoredTables + " files due to ignored tables");

    System.err.println("");
    System.err.println("Number of files to backup = " + pathList.size());
    System.err.println("Total bytes to backup = " + prettyPrintBytes(m_nTotalBytes));

    System.err.println("");
    System.err.println("sorting list of files...");
    Collections.sort(pathList, new DateComparator(hmTimestamps));
    System.err.println("done");

    System.err.println("");
    System.err.println("starting backup...");

    tmpDate = bak.backupFiles(localPath, preservePath, fs, pathList, size);

    bak.closeFiles();

    System.err.println("");
    System.err.println("backup completed...");
  }

  if (tmpDate == 0) {
    // If the size limit was not reached, print out the date for right now
    System.out.println(maxDate);
  } else {
    // Print out date for last file backed up
    System.err.println("Size limit reached.");
    System.out.println(tmpDate);
  }

  System.exit(0);
}
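// Illustrative invocation (a sketch, not taken from the source): the flags are the ones parsed
// above, while the jar name and the local/preserve paths are made-up examples.
//
//   hadoop jar backup-tools.jar BackupHdfs \
//       --hdfs-path / \
//       --local-path /backup/current \
//       --preserve-path /backup/preserved \
//       --date last-day \
//       --max-bytes 500000000000 \
//       --sleep 1
//
// On success the tool prints a UNIX timestamp on stdout: maxDate when everything fit, or the
// mtime of the last file copied when --max-bytes was hit. That value can be fed back in as
// --date on the next run to continue incrementally.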
/**
 * Combine the status stored in the index and the underlying status.
 *
 * @param h status stored in the index
 * @param cache caching the underlying file statuses
 * @return the combined file status
 * @throws IOException
 */
private FileStatus toFileStatus(HarStatus h, Map<String, FileStatus> cache) throws IOException {
  FileStatus underlying = null;
  if (cache != null) {
    underlying = cache.get(h.partName);
  }
  if (underlying == null) {
    final Path p = h.isDir ? archivePath : new Path(archivePath, h.partName);
    underlying = fs.getFileStatus(p);
    if (cache != null) {
      cache.put(h.partName, underlying);
    }
  }

  long modTime = 0;
  int version = metadata.getVersion();
  if (version < 3) {
    modTime = underlying.getModificationTime();
  } else if (version == 3) {
    modTime = h.getModificationTime();
  }

  return new FileStatus(
      h.isDir() ? 0L : h.getLength(),
      h.isDir(),
      underlying.getReplication(),
      underlying.getBlockSize(),
      modTime,
      underlying.getAccessTime(),
      underlying.getPermission(),
      underlying.getOwner(),
      underlying.getGroup(),
      makeRelative(this.uri.getPath(), new Path(h.name)));
}
/**
 * Initialize a Har filesystem per har archive. The archive home directory is the top level
 * directory in the filesystem that contains the HAR archive. Be careful with this method: you do
 * not want to keep creating new FileSystem instances per call to path.getFileSystem(). The URI of
 * a Har filesystem is har://underlyingfsscheme-host:port/archivepath, or har:///archivepath,
 * which assumes the default underlying filesystem when none is specified.
 */
@Override
public void initialize(URI name, Configuration conf) throws IOException {
  // initialize the metadata cache, if needed
  initializeMetadataCache(conf);

  // decode the name
  URI underLyingURI = decodeHarURI(name, conf);

  // we got the right har Path -- now check if this is
  // truly a har filesystem
  Path harPath = archivePath(new Path(name.getScheme(), name.getAuthority(), name.getPath()));
  if (harPath == null) {
    throw new IOException("Invalid path for the Har Filesystem. " + name.toString());
  }
  if (fs == null) {
    fs = FileSystem.get(underLyingURI, conf);
  }
  uri = harPath.toUri();
  archivePath = new Path(uri.getPath());
  harAuth = getHarAuth(underLyingURI);

  // check for the underlying fs containing
  // the index file
  Path masterIndexPath = new Path(archivePath, "_masterindex");
  Path archiveIndexPath = new Path(archivePath, "_index");
  if (!fs.exists(masterIndexPath) || !fs.exists(archiveIndexPath)) {
    throw new IOException(
        "Invalid path for the Har Filesystem. " + "No index file in " + harPath);
  }

  metadata = harMetaCache.get(uri);
  if (metadata != null) {
    FileStatus mStat = fs.getFileStatus(masterIndexPath);
    FileStatus aStat = fs.getFileStatus(archiveIndexPath);
    if (mStat.getModificationTime() != metadata.getMasterIndexTimestamp()
        || aStat.getModificationTime() != metadata.getArchiveIndexTimestamp()) {
      // the archive has been overwritten since we last read it;
      // remove the entry from the metadata cache
      metadata = null;
      harMetaCache.remove(uri);
    }
  }
  if (metadata == null) {
    metadata = new HarMetaData(fs, masterIndexPath, archiveIndexPath);
    metadata.parseMetaData();
    harMetaCache.put(uri, metadata);
  }
}
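// A minimal usage sketch (not part of HarFileSystem; the authority and archive path below are
// made up). Opening a path under the har:// scheme described in the Javadoc above routes the
// request through initialize() and resolves it against the archive index. Per that Javadoc,
// callers should reuse the returned FileSystem rather than calling getFileSystem() repeatedly.
static FileStatus statInsideHar(Configuration conf) throws IOException {
  // har://<underlying-scheme>-<host>:<port>/<path-to-archive>/<path-inside-archive>
  Path inHar = new Path("har://hdfs-namenode:8020/user/zoo/foo.har/dir/file.txt");
  FileSystem harFs = inHar.getFileSystem(conf); // triggers HarFileSystem.initialize()
  return harFs.getFileStatus(inHar);            // answered from the HAR index
}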
public FileStatus getPartFileStatus(Path partPath) throws IOException {
  FileStatus status = partFileStatuses.get(partPath);
  if (status == null) {
    status = fs.getFileStatus(partPath);
    partFileStatuses.put(partPath, status);
  }
  return status;
}
/**
 * Launch application for the dag represented by this client.
 *
 * @throws YarnException
 * @throws IOException
 */
public void startApplication() throws YarnException, IOException {
  Class<?>[] defaultClasses;

  if (applicationType.equals(YARN_APPLICATION_TYPE)) {
    // TODO restrict the security check to only check if security is enabled for webservices.
    if (UserGroupInformation.isSecurityEnabled()) {
      defaultClasses = DATATORRENT_SECURITY_CLASSES;
    } else {
      defaultClasses = DATATORRENT_CLASSES;
    }
  } else {
    throw new IllegalStateException(applicationType + " is not a valid application type.");
  }

  LinkedHashSet<String> localJarFiles = findJars(dag, defaultClasses);

  if (resources != null) {
    localJarFiles.addAll(resources);
  }

  YarnClusterMetrics clusterMetrics = yarnClient.getYarnClusterMetrics();
  LOG.info(
      "Got Cluster metric info from ASM"
          + ", numNodeManagers="
          + clusterMetrics.getNumNodeManagers());

  // GetClusterNodesRequest clusterNodesReq = Records.newRecord(GetClusterNodesRequest.class);
  // GetClusterNodesResponse clusterNodesResp =
  //     rmClient.clientRM.getClusterNodes(clusterNodesReq);
  // LOG.info("Got Cluster node info from ASM");
  // for (NodeReport node : clusterNodesResp.getNodeReports()) {
  //   LOG.info("Got node report from ASM for"
  //       + ", nodeId=" + node.getNodeId()
  //       + ", nodeAddress" + node.getHttpAddress()
  //       + ", nodeRackName" + node.getRackName()
  //       + ", nodeNumContainers" + node.getNumContainers()
  //       + ", nodeHealthStatus" + node.getHealthReport());
  // }

  List<QueueUserACLInfo> listAclInfo = yarnClient.getQueueAclsInfo();
  for (QueueUserACLInfo aclInfo : listAclInfo) {
    for (QueueACL userAcl : aclInfo.getUserAcls()) {
      LOG.info(
          "User ACL Info for Queue"
              + ", queueName=" + aclInfo.getQueueName()
              + ", userAcl=" + userAcl.name());
    }
  }

  // Get a new application id
  YarnClientApplication newApp = yarnClient.createApplication();
  appId = newApp.getNewApplicationResponse().getApplicationId();

  // Dump out information about cluster capability as seen by the resource manager
  int maxMem = newApp.getNewApplicationResponse().getMaximumResourceCapability().getMemory();
  LOG.info("Max mem capability of resources in this cluster " + maxMem);
  int amMemory = dag.getMasterMemoryMB();
  if (amMemory > maxMem) {
    LOG.info(
        "AM memory specified above max threshold of cluster. Using max value."
            + ", specified=" + amMemory
            + ", max=" + maxMem);
    amMemory = maxMem;
  }

  if (dag.getAttributes().get(LogicalPlan.APPLICATION_ID) == null) {
    dag.setAttribute(LogicalPlan.APPLICATION_ID, appId.toString());
  }

  // Create launch context for app master
  LOG.info("Setting up application submission context for ASM");
  ApplicationSubmissionContext appContext =
      Records.newRecord(ApplicationSubmissionContext.class);

  // set the application id
  appContext.setApplicationId(appId);
  // set the application name
  appContext.setApplicationName(dag.getValue(LogicalPlan.APPLICATION_NAME));
  appContext.setApplicationType(this.applicationType);
  if (YARN_APPLICATION_TYPE.equals(this.applicationType)) {
    // appContext.setMaxAppAttempts(1); // no retries until Stram is HA
  }

  // Set up the container launch context for the application master
  ContainerLaunchContext amContainer = Records.newRecord(ContainerLaunchContext.class);

  // Setup security tokens
  // If security is enabled get ResourceManager and NameNode delegation tokens.
  // Set these tokens on the container so that they are sent as part of application submission.
  // This also sets them up for renewal by ResourceManager. The NameNode delegation rmToken
  // is also used by ResourceManager to fetch the jars from HDFS and set them up for the
  // application master launch.
  if (UserGroupInformation.isSecurityEnabled()) {
    Credentials credentials = new Credentials();
    String tokenRenewer = conf.get(YarnConfiguration.RM_PRINCIPAL);
    if (tokenRenewer == null || tokenRenewer.length() == 0) {
      throw new IOException("Can't get Master Kerberos principal for the RM to use as renewer");
    }

    // For now, only getting tokens for the default file-system.
    FileSystem fs = StramClientUtils.newFileSystemInstance(conf);
    try {
      final Token<?> tokens[] = fs.addDelegationTokens(tokenRenewer, credentials);
      if (tokens != null) {
        for (Token<?> token : tokens) {
          LOG.info("Got dt for " + fs.getUri() + "; " + token);
        }
      }
    } finally {
      fs.close();
    }

    addRMDelegationToken(tokenRenewer, credentials);

    DataOutputBuffer dob = new DataOutputBuffer();
    credentials.writeTokenStorageToStream(dob);
    ByteBuffer fsTokens = ByteBuffer.wrap(dob.getData(), 0, dob.getLength());
    amContainer.setTokens(fsTokens);
  }

  // set local resources for the application master
  // local files or archives as needed
  // In this scenario, the jar file for the application master is part of the local resources
  Map<String, LocalResource> localResources = new HashMap<String, LocalResource>();

  // copy required jar files to dfs, to be localized for containers
  FileSystem fs = StramClientUtils.newFileSystemInstance(conf);
  try {
    Path appsBasePath =
        new Path(StramClientUtils.getDTDFSRootDir(fs, conf), StramClientUtils.SUBDIR_APPS);
    Path appPath = new Path(appsBasePath, appId.toString());

    String libJarsCsv = copyFromLocal(fs, appPath, localJarFiles.toArray(new String[] {}));

    LOG.info("libjars: {}", libJarsCsv);
    dag.getAttributes().put(LogicalPlan.LIBRARY_JARS, libJarsCsv);
    LaunchContainerRunnable.addFilesToLocalResources(
        LocalResourceType.FILE, libJarsCsv, localResources, fs);

    if (archives != null) {
      String[] localFiles = archives.split(",");
      String archivesCsv = copyFromLocal(fs, appPath, localFiles);
      LOG.info("archives: {}", archivesCsv);
      dag.getAttributes().put(LogicalPlan.ARCHIVES, archivesCsv);
      LaunchContainerRunnable.addFilesToLocalResources(
          LocalResourceType.ARCHIVE, archivesCsv, localResources, fs);
    }

    if (files != null) {
      String[] localFiles = files.split(",");
      String filesCsv = copyFromLocal(fs, appPath, localFiles);
      LOG.info("files: {}", filesCsv);
      dag.getAttributes().put(LogicalPlan.FILES, filesCsv);
      LaunchContainerRunnable.addFilesToLocalResources(
          LocalResourceType.FILE, filesCsv, localResources, fs);
    }

    dag.getAttributes().put(LogicalPlan.APPLICATION_PATH, appPath.toString());

    if (dag.getAttributes().get(OperatorContext.STORAGE_AGENT) == null) {
      /* which would be the most likely case */
      Path checkpointPath = new Path(appPath, LogicalPlan.SUBDIR_CHECKPOINTS);
      // use conf client side to pickup any proxy settings from dt-site.xml
      dag.setAttribute(
          OperatorContext.STORAGE_AGENT, new FSStorageAgent(checkpointPath.toString(), conf));
    }

    if (dag.getAttributes().get(LogicalPlan.CONTAINER_OPTS_CONFIGURATOR) == null) {
      dag.setAttribute(
          LogicalPlan.CONTAINER_OPTS_CONFIGURATOR, new BasicContainerOptConfigurator());
    }

    // Set the log4j properties if needed
    if (!log4jPropFile.isEmpty()) {
      Path log4jSrc = new Path(log4jPropFile);
      Path log4jDst = new Path(appPath, "log4j.props");
      fs.copyFromLocalFile(false, true, log4jSrc, log4jDst);
      FileStatus log4jFileStatus = fs.getFileStatus(log4jDst);
      LocalResource log4jRsrc = Records.newRecord(LocalResource.class);
      log4jRsrc.setType(LocalResourceType.FILE);
      log4jRsrc.setVisibility(LocalResourceVisibility.APPLICATION);
      log4jRsrc.setResource(ConverterUtils.getYarnUrlFromURI(log4jDst.toUri()));
      log4jRsrc.setTimestamp(log4jFileStatus.getModificationTime());
      log4jRsrc.setSize(log4jFileStatus.getLen());
      localResources.put("log4j.properties", log4jRsrc);
    }

    if (originalAppId != null) {
      Path origAppPath = new Path(appsBasePath, this.originalAppId);
      LOG.info("Restart from {}", origAppPath);
      copyInitialState(origAppPath);
    }

    // push logical plan to DFS location
    Path cfgDst = new Path(appPath, LogicalPlan.SER_FILE_NAME);
    FSDataOutputStream outStream = fs.create(cfgDst, true);
    LogicalPlan.write(this.dag, outStream);
    outStream.close();

    Path launchConfigDst = new Path(appPath, LogicalPlan.LAUNCH_CONFIG_FILE_NAME);
    outStream = fs.create(launchConfigDst, true);
    conf.writeXml(outStream);
    outStream.close();

    FileStatus topologyFileStatus = fs.getFileStatus(cfgDst);
    LocalResource topologyRsrc = Records.newRecord(LocalResource.class);
    topologyRsrc.setType(LocalResourceType.FILE);
    topologyRsrc.setVisibility(LocalResourceVisibility.APPLICATION);
    topologyRsrc.setResource(ConverterUtils.getYarnUrlFromURI(cfgDst.toUri()));
    topologyRsrc.setTimestamp(topologyFileStatus.getModificationTime());
    topologyRsrc.setSize(topologyFileStatus.getLen());
    localResources.put(LogicalPlan.SER_FILE_NAME, topologyRsrc);

    // Set local resource info into app master container launch context
    amContainer.setLocalResources(localResources);

    // Set the necessary security tokens as needed
    // amContainer.setContainerTokens(containerToken);

    // Set the env variables to be setup in the env where the application master will be run
    LOG.info("Set the environment for the application master");
    Map<String, String> env = new HashMap<String, String>();

    // Add application jar(s) location to classpath
    // At some point we should not be required to add
    // the hadoop specific classpaths to the env.
    // It should be provided out of the box.
    // For now setting all required classpaths including
    // the classpath to "." for the application jar(s).
    // Including ${CLASSPATH} would duplicate the class path in the app master, removing it for now:
    // StringBuilder classPathEnv = new StringBuilder("${CLASSPATH}:./*");
    StringBuilder classPathEnv = new StringBuilder("./*");
    String classpath = conf.get(YarnConfiguration.YARN_APPLICATION_CLASSPATH);
    for (String c :
        StringUtils.isBlank(classpath)
            ? YarnConfiguration.DEFAULT_YARN_APPLICATION_CLASSPATH
            : classpath.split(",")) {
      if (c.equals("$HADOOP_CLIENT_CONF_DIR")) {
        // SPOI-2501
        continue;
      }
      classPathEnv.append(':');
      classPathEnv.append(c.trim());
    }
    env.put("CLASSPATH", classPathEnv.toString());
    // propagate to replace node managers user name (effective in non-secure mode)
    env.put("HADOOP_USER_NAME", UserGroupInformation.getLoginUser().getUserName());

    amContainer.setEnvironment(env);

    // Set the necessary command to execute the application master
    ArrayList<CharSequence> vargs = new ArrayList<CharSequence>(30);

    // Set java executable command
    LOG.info("Setting up app master command");
    vargs.add(javaCmd);
    if (dag.isDebug()) {
      vargs.add("-agentlib:jdwp=transport=dt_socket,server=y,suspend=n");
    }
    // Set Xmx based on am memory size
    // default heap size 75% of total memory
    vargs.add("-Xmx" + (amMemory * 3 / 4) + "m");
    vargs.add("-XX:+HeapDumpOnOutOfMemoryError");
    vargs.add("-XX:HeapDumpPath=/tmp/dt-heap-" + appId.getId() + ".bin");
    vargs.add("-Dhadoop.root.logger=" + (dag.isDebug() ? "DEBUG" : "INFO") + ",RFA");
    vargs.add("-Dhadoop.log.dir=" + ApplicationConstants.LOG_DIR_EXPANSION_VAR);
    vargs.add(String.format("-D%s=%s", StreamingContainer.PROP_APP_PATH, dag.assertAppPath()));
    if (dag.isDebug()) {
      vargs.add("-Dlog4j.debug=true");
    }

    String loggersLevel = conf.get(DTLoggerFactory.DT_LOGGERS_LEVEL);
    if (loggersLevel != null) {
      vargs.add(String.format("-D%s=%s", DTLoggerFactory.DT_LOGGERS_LEVEL, loggersLevel));
    }
    vargs.add(StreamingAppMaster.class.getName());
    vargs.add("1>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/AppMaster.stdout");
    vargs.add("2>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/AppMaster.stderr");

    // Get final command
    StringBuilder command = new StringBuilder(9 * vargs.size());
    for (CharSequence str : vargs) {
      command.append(str).append(" ");
    }

    LOG.info("Completed setting up app master command " + command.toString());
    List<String> commands = new ArrayList<String>();
    commands.add(command.toString());
    amContainer.setCommands(commands);

    // Set up resource type requirements
    // For now, only memory is supported so we set memory requirements
    Resource capability = Records.newRecord(Resource.class);
    capability.setMemory(amMemory);
    appContext.setResource(capability);

    // Service data is a binary blob that can be passed to the application
    // Not needed in this scenario
    // amContainer.setServiceData(serviceData);

    appContext.setAMContainerSpec(amContainer);

    // Set the priority for the application master
    Priority pri = Records.newRecord(Priority.class);
    pri.setPriority(amPriority);
    appContext.setPriority(pri);
    // Set the queue to which this application is to be submitted in the RM
    appContext.setQueue(queueName);

    // Submit the application to the applications manager
    // SubmitApplicationResponse submitResp = rmClient.submitApplication(appRequest);
    // Ignore the response as either a valid response object is returned on success
    // or an exception thrown to denote some form of a failure
    String specStr =
        Objects.toStringHelper("Submitting application: ")
            .add("name", appContext.getApplicationName())
            .add("queue", appContext.getQueue())
            .add("user", UserGroupInformation.getLoginUser())
            .add("resource", appContext.getResource())
            .toString();
    LOG.info(specStr);
    if (dag.isDebug()) {
      // LOG.info("Full submission context: " + appContext);
    }
    yarnClient.submitApplication(appContext);
  } finally {
    fs.close();
  }
}
private void parseMetaData() throws IOException {
  Text line = new Text();
  long read;
  FSDataInputStream in = null;
  LineReader lin = null;

  try {
    in = fs.open(masterIndexPath);
    FileStatus masterStat = fs.getFileStatus(masterIndexPath);
    masterIndexTimestamp = masterStat.getModificationTime();
    lin = new LineReader(in, getConf());
    read = lin.readLine(line);

    // the first line contains the version of the index file
    String versionLine = line.toString();
    String[] arr = versionLine.split(" ");
    version = Integer.parseInt(arr[0]);
    // make it always backwards-compatible
    if (this.version > HarFileSystem.VERSION) {
      throw new IOException(
          "Invalid version " + this.version + " expected " + HarFileSystem.VERSION);
    }

    // each line contains a hashcode range and the index file name
    String[] readStr;
    while (read < masterStat.getLen()) {
      int b = lin.readLine(line);
      read += b;
      readStr = line.toString().split(" ");
      int startHash = Integer.parseInt(readStr[0]);
      int endHash = Integer.parseInt(readStr[1]);
      stores.add(
          new Store(
              Long.parseLong(readStr[2]), Long.parseLong(readStr[3]), startHash, endHash));
      line.clear();
    }
  } catch (IOException ioe) {
    LOG.warn("Encountered exception ", ioe);
    throw ioe;
  } finally {
    IOUtils.cleanup(LOG, lin, in);
  }

  FSDataInputStream aIn = fs.open(archiveIndexPath);
  try {
    FileStatus archiveStat = fs.getFileStatus(archiveIndexPath);
    archiveIndexTimestamp = archiveStat.getModificationTime();
    LineReader aLin;

    // now start reading the real index file
    for (Store s : stores) {
      read = 0;
      aIn.seek(s.begin);
      aLin = new LineReader(aIn, getConf());
      while (read + s.begin < s.end) {
        int tmp = aLin.readLine(line);
        read += tmp;
        String lineFeed = line.toString();
        String[] parsed = lineFeed.split(" ");
        parsed[0] = decodeFileName(parsed[0]);
        archive.put(new Path(parsed[0]), new HarStatus(lineFeed));
        line.clear();
      }
    }
  } finally {
    IOUtils.cleanup(LOG, aIn);
  }
}
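// For orientation, the layout parseMetaData() above expects (field roles inferred from the
// parsing code; the concrete values are made up for illustration):
//
//   _masterindex:
//     3                                   <- first line: index format version
//     -1456729 891273 0 21432             <- startHash endHash beginOffset endOffset into _index
//     891274 2147483 21432 40960
//
//   _index: one space-separated HarStatus line per archived path, read within each recorded
//   [begin, end) byte range; the first field is the URL-encoded file name, decoded via
//   decodeFileName() before being used as the map key.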
/**
 * Returns the status of a given cache file on hdfs.
 *
 * @param conf configuration
 * @param cache cache file
 * @return FileStatus object of the file
 * @throws IOException
 */
public static FileStatus getFileStatus(Configuration conf, URI cache) throws IOException {
  FileSystem fileSystem = FileSystem.get(cache, conf);
  Path filePath = new Path(cache.getPath());
  return fileSystem.getFileStatus(filePath);
}
/**
 * Returns mtime of a given cache file on hdfs.
 *
 * @param conf configuration
 * @param cache cache file
 * @return mtime of a given cache file on hdfs
 * @throws IOException
 */
public static long getTimestamp(Configuration conf, URI cache) throws IOException {
  FileSystem fileSystem = FileSystem.get(cache, conf);
  Path filePath = new Path(cache.getPath());
  return fileSystem.getFileStatus(filePath).getModificationTime();
}
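// Hedged usage sketch (the helper name and the idea of a previously recorded mtime are
// illustrative, not from the source): a caller can use getTimestamp() above to detect whether a
// cache file on HDFS has changed since it was last localized.
static boolean cacheFileChanged(Configuration conf, URI cache, long recordedMtime)
    throws IOException {
  // compares the current HDFS modification time against the saved value
  return getTimestamp(conf, cache) != recordedMtime;
}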
/**
 * Compare the checksums of the hdfs file as well as the local copied file.
 *
 * @author [email protected]
 * @date Fri Jan 27 06:06:00 2012
 */
boolean compareChecksums(FileSystem fs, Path p, String sFsPath) {
  try {
    // get hdfs file info
    FileStatus stat = fs.getFileStatus(p);

    // get HDFS checksum
    FileChecksum ck = fs.getFileChecksum(p);
    String sCk, sCkShort;
    if (ck == null) {
      sCk = sCkShort = "<null>";
    } else {
      sCk = ck.toString();
      sCkShort = sCk.replaceAll("^.*:", "");
    }

    // System.out.println(p.toUri().getPath() + " len=" + stat.getLen()
    //     + " " + stat.getOwner() + "/" + stat.getGroup()
    //     + " checksum=" + sCk);

    // find the local file
    File fLocal = new File(sFsPath);
    if (!fLocal.exists()) {
      System.out.println("CHECKSUM-ERROR: file does not exist: " + sFsPath);
      return false;
    }
    if (!fLocal.isFile()) {
      System.out.println("CHECKSUM-ERROR: path is not a file: " + sFsPath);
      return false;
    }

    if (stat.getLen() != fLocal.length()) {
      System.out.println(
          "CHECKSUM-ERROR: length mismatch: "
              + sFsPath
              + " hdfslen="
              + stat.getLen()
              + " fslen="
              + fLocal.length());
      return false;
    }

    // get local fs checksum
    FileChecksum ckLocal = getLocalFileChecksum(sFsPath);
    if (ckLocal == null) {
      System.out.println("ERROR Failed to get checksum for local file " + sFsPath);
      return false;
    }

    // compare checksums as a string, after stripping the
    // algorithm name from the beginning
    String sCkLocal = ckLocal.toString();
    String sCkLocalShort = sCkLocal.replaceAll("^.*:", "");

    if (false == sCkShort.equals(sCkLocalShort)) {
      System.out.println(
          "CHECKSUM-ERROR: checksum mismatch: "
              + sFsPath
              + "\nhdfs = "
              + sCk
              + "\nlocal= "
              + sCkLocal);
      return false;
    }

    return true;
  } catch (IOException e) {
    System.out.println("CHECKSUM-ERROR: " + sFsPath + " exception " + e.toString());
  }

  return false;
}
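// Hedged helper sketch (the method name and its use are illustrative, not from the source): the
// same normalization that compareChecksums() applies twice above. FileChecksum.toString()
// typically renders as "<algorithm-name>:<hex digest>", and the two sides may label the
// algorithm differently even when the digests match, so only the part after the colon is compared.
static String stripAlgorithmPrefix(String checksumString) {
  // drop everything up to and including the last colon of the algorithm-name prefix
  return checksumString.replaceAll("^.*:", "");
}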
/**
 * Method to go through the HDFS filesystem in a DFS to find all files
 *
 * <p>fs: FileSystem object from HDFS
 * <p>minDate: Oldest date for files to be backed up
 * <p>maxDate: Newest date for files to be backed up
 * <p>p: Path in HDFS to look for files
 * <p>pathList: Will be filled with all files in p
 * <p>hmTimestamps: hashmap of timestamps for later sorting
 */
public void checkDir(
    FileSystem fs,
    long minDate,
    long maxDate,
    Path p,
    ArrayList<Path> pathList,
    HashMap<Path, Long> hmTimestamps) {
  long tmpDate;
  FileStatus[] fStat;

  try {
    String sPath = p.toUri().getPath();

    // If this is a directory
    if (fs.getFileStatus(p).isDir()) {
      // ignore certain directories
      if ("dfstmp".equals(p.getName())
          || "tmp".equals(p.getName())
          || "jobtracker".equals(p.getName())
          || sPath.startsWith("/mapred")
          || "ops".equals(p.getName())
          || p.getName().startsWith("_distcp_logs")) {
        return;
      }

      // dump the mkdir and chmod commands for this
      // directory -- skip root directory only
      {
        FileStatus stat = fs.getFileStatus(p);

        if (!sPath.equals("/")) {
          m_wrMkdirs.println("hadoop fs -mkdir " + sPath);
        }

        m_wrChmods.println(
            "hadoop fs -chown " + stat.getOwner() + ":" + stat.getGroup() + " " + sPath);

        Short sh = new Short(stat.getPermission().toShort());
        m_wrChmods.println(
            "hadoop fs -chmod " + Long.toOctalString(sh.longValue()) + " " + sPath);
      }

      fStat = fs.listStatus(p);

      // Do a recursive call to all elements
      for (int i = 0; i < fStat.length; i++) {
        checkDir(fs, minDate, maxDate, fStat[i].getPath(), pathList, hmTimestamps);
      }
    } else {
      // If not a directory then we've found a file

      // ignore crc files
      if (p.getName().endsWith(".crc")) {
        return;
      }

      // ignore other files
      if (sPath.startsWith("/user/oozie/etl/workflows/")) {
        return;
      }

      // try to get the table name from the path. There are
      // various types of tables, from those replicated from
      // another database to regular hive tables to
      // partitioned hive tables. We use table names to
      // both exclude some from the backup, and for the rest
      // to dump out the schema and partition name.
      if (m_ignoreTables != null && m_ignoreTables.doIgnoreFile(sPath)) {
        m_nIgnoredTables++;

        if (m_nIgnoredTables < 5) {
          System.out.println("Skipping ignore-table file: " + sPath);
        } else if (m_nIgnoredTables == 5) {
          System.out.println("(...not showing other skipped tables...)");
        }
        return;
      }

      FileStatus stat = fs.getFileStatus(p);

      tmpDate = stat.getModificationTime() / 1000;

      // store the chmods/chowns for all files
      m_wrChmods.println(
          "hadoop fs -chown " + stat.getOwner() + ":" + stat.getGroup() + " " + sPath);

      m_wrChmods.println("hadoop fs -chmod " + stat.getPermission().toShort() + " " + sPath);

      // check dates: is the file too old for this backup window?
      if (tmpDate < minDate) {
        return;
      }

      // is the file too recent?
      if (tmpDate > maxDate) {
        // System.out.println("file too recent: " + sPath);
        return;
      }

      // file timestamp is ok
      pathList.add(p);
      hmTimestamps.put(p, new Long(tmpDate));

      // store info about total bytes needed to backup
      m_nTotalBytes += fs.getContentSummary(p).getLength();
    }
  } catch (IOException e) {
    System.err.println("ERROR: could not open " + p + ": " + e);
    // System.exit(1);
  }
}
/**
 * Method to move files from HDFS to the local filesystem
 *
 * <p>localPath: Path on the local machine's filesystem
 * <p>preservePath: Path where existing local copies are moved before being overwritten
 * <p>fs: FileSystem object from HDFS
 * <p>pathList: List of paths for files that might need to be backed up
 * <p>size: max size in bytes to be backed up
 *
 * <p>Returns: the date of the last file backed up if the size limit was reached, else zero
 */
public long backupFiles(
    String localPath, String preservePath, FileSystem fs, ArrayList<Path> pathList, long size) {
  Path fsPath;
  long tmpSize = 0;
  long tmpDate = 0;

  // Start iterating over all paths
  for (Path hdfsPath : pathList) {
    try {
      long nFileSize = fs.getContentSummary(hdfsPath).getLength();
      tmpSize = tmpSize + nFileSize;

      if ((tmpSize <= size) || (size == 0)) {
        FileStatus stat = fs.getFileStatus(hdfsPath);

        System.err.println(
            "File "
                + hdfsPath.toUri().getPath()
                + " "
                + nFileSize
                + " bytes, "
                + "perms: "
                + stat.getOwner()
                + "/"
                + stat.getGroup()
                + ", "
                + stat.getPermission().toString());

        tmpDate = stat.getModificationTime() / 1000;

        String sFsPath = localPath + hdfsPath.toUri().getPath();
        fsPath = new Path(sFsPath);

        File f = new File(sFsPath);

        // COMMENTED OUT: until a few backup cycles run
        // and the mtime gets in fact set on all copied
        // files.
        //
        // ignore it if the file exists and has the same mtime
        // if (f.exists() && f.isFile() && f.lastModified() == stat.getModificationTime()) {
        //   System.out.println("no need to backup " + f.toString() + ", mtime matches hdfs");
        //   continue;
        // }

        if (false == m_bDryRun) {
          // check if we need to back up the local file
          // (not directory), if it already exists.
          if (f.exists() && f.isFile()) {
            // ignore files with substrings in the
            // no-preserve file
            if (true == doPreserveFile(sFsPath)) {
              // move it to the backup path
              String sNewPath = preservePath + hdfsPath.toUri().getPath();
              File newFile = new File(sNewPath);

              // create directory structure for new file?
              if (false == newFile.getParentFile().exists()) {
                if (false == newFile.getParentFile().mkdirs()) {
                  System.err.println("Failed to mkdirs " + newFile.getParentFile().toString());
                  System.exit(1);
                }
              }

              // rename existing file to new location
              if (false == f.renameTo(newFile)) {
                System.err.println(
                    "Failed to renameTo " + f.toString() + " to " + newFile.toString());
                System.exit(1);
              }

              System.out.println("preserved " + f.toString() + " into " + newFile.toString());
            } else {
              System.out.println("skipped preservation of " + f.toString());
            }
          }

          // copy from hdfs to local filesystem
          fs.copyToLocalFile(hdfsPath, fsPath);

          // set the mtime to match hdfs file
          f.setLastModified(stat.getModificationTime());

          // compare checksums on both files
          compareChecksums(fs, hdfsPath, sFsPath);
        }

        // don't print the progress after every file -- go
        // by at least 1% increments
        long nPercentDone = (long) (100 * tmpSize / m_nTotalBytes);
        if (nPercentDone > m_nLastPercentBytesDone) {
          System.out.println(
              "progress: copied "
                  + prettyPrintBytes(tmpSize)
                  + ", "
                  + nPercentDone
                  + "% done"
                  + ", tstamp="
                  + tmpDate);

          m_nLastPercentBytesDone = nPercentDone;
        }

        if (m_nSleepSeconds > 0) {
          try {
            Thread.sleep(1000 * m_nSleepSeconds);
          } catch (Exception e2) {
            // ignore
          }
        }
      } else {
        return tmpDate;
      }
    } catch (IOException e) {
      System.err.println("FATAL ERROR: Something wrong with the file");
      System.err.println(e);
      System.out.println(tmpDate);
      System.exit(1);
      return 0;
    }
  }

  return 0;
}