public static void main(String[] args) throws IOException { Path baseDir = null; String localPath = null; String preservePath = null; String sIgnoreTablesFilename = null; String sNoPreserveFilename = null; String sDateString = null; long size = 0; // UNIX dates for right now long now = new java.util.Date().getTime() / 1000; long maxDate = now; for (int i = 0; i < args.length; i++) { if (args[i].equals("--hdfs-path")) { baseDir = new Path(args[++i]); continue; } if (args[i].equals("--local-path")) { localPath = args[++i]; continue; } if (args[i].equals("--preserve-path")) { preservePath = args[++i]; continue; } if (args[i].equals("--no-preserve")) { sNoPreserveFilename = args[++i]; continue; } if (args[i].equals("--ignore-tables")) { sIgnoreTablesFilename = args[++i]; continue; } if (args[i].equals("--sleep")) { try { m_nSleepSeconds = Integer.parseInt(args[++i]); } catch (Exception e) { System.err.println("ERROR: " + e.toString() + "\n"); usage(); } continue; } if (args[i].equals("--dry-run")) { m_bDryRun = true; continue; } if (args[i].equals("--date")) { sDateString = args[++i]; continue; } if (args[i].equals("--max-date")) { maxDate = Long.parseLong(args[++i]); continue; } if (args[i].equals("--max-bytes")) { size = Long.parseLong(args[++i]); continue; } System.err.println("ERROR: unknown arg " + args[i]); usage(); } if (baseDir == null || localPath == null || preservePath == null || sDateString == null) { usage(); } long minDate; if ("yesterday".equals(sDateString)) { // figure out yesterday's dates Calendar cal = Calendar.getInstance(); cal.roll(Calendar.DAY_OF_YEAR, -1); // yesterday midnight cal.set(Calendar.HOUR_OF_DAY, 0); cal.set(Calendar.MINUTE, 0); cal.set(Calendar.SECOND, 0); cal.set(Calendar.MILLISECOND, 0); minDate = cal.getTimeInMillis() / 1000; // yesterday end of day cal.set(Calendar.HOUR_OF_DAY, 23); cal.set(Calendar.MINUTE, 59); cal.set(Calendar.SECOND, 59); cal.set(Calendar.MILLISECOND, 999); maxDate = cal.getTimeInMillis() / 1000; } else if ("last-week".equals(sDateString)) { minDate = maxDate - (7 * 24 * 60 * 60); } else if ("last-day".equals(sDateString)) { minDate = maxDate - (24 * 60 * 60); } else { // UNIX date since epoch of last backup minDate = Long.parseLong(sDateString); } long tmpDate = 0; BackupHdfs bak = new BackupHdfs(); // initialize the list of tables to ignore if (sIgnoreTablesFilename != null) { bak.initializeTablesToIgnore(sIgnoreTablesFilename); } // initialize list of files to not preserve if (sNoPreserveFilename != null) { bak.initializeNoPreserve(sNoPreserveFilename); } ArrayList<Path> pathList = new ArrayList<Path>(2000); HashMap<Path, Long> hmTimestamps = new HashMap<Path, Long>(); Configuration conf = new Configuration(); FileSystem fs = FileSystem.get(conf); // If the HDFS path is a dir continue if (fs.getFileStatus(baseDir).isDir()) { Calendar cal = Calendar.getInstance(); System.err.println(""); cal.setTimeInMillis(minDate * 1000); System.err.println("min date = " + cal.getTime().toString()); cal.setTimeInMillis(maxDate * 1000); System.err.println("max date = " + cal.getTime().toString()); System.err.println(""); System.err.println("Searching filesystem: " + baseDir.toUri().getPath()); bak.checkDir(fs, minDate, maxDate, baseDir, pathList, hmTimestamps); System.err.println(""); System.err.println("Skipped " + m_nIgnoredTables + " files due to ignored tables"); System.err.println(""); System.err.println("Number of files to backup = " + pathList.size()); System.err.println("Total bytes to backup = " + prettyPrintBytes(m_nTotalBytes)); System.err.println(""); System.err.println("sorting list of files..."); Collections.sort(pathList, new DateComparator(hmTimestamps)); System.err.println("done"); System.err.println(""); System.err.println("starting backup..."); tmpDate = bak.backupFiles(localPath, preservePath, fs, pathList, size); bak.closeFiles(); System.err.println(""); System.err.println("backup completed..."); } if (tmpDate == 0) { // If not size limit reached print out date for right now System.out.println(maxDate); } else { // Print out date for last file backed up System.err.println("Size limit reached."); System.out.println(tmpDate); } System.exit(0); }
/** Convert a Json map to a HdfsFileStatus object. */ public static HdfsFileStatus toFileStatus(final Map<?, ?> json, boolean includesType) { if (json == null) { return null; } final Map<?, ?> m = includesType ? (Map<?, ?>) json.get(FileStatus.class.getSimpleName()) : json; final String localName = (String) m.get("pathSuffix"); final PathType type = PathType.valueOf((String) m.get("type")); final byte[] symlink = type != PathType.SYMLINK ? null : DFSUtil.string2Bytes((String) m.get("symlink")); final long len = (Long) m.get("length"); final String owner = (String) m.get("owner"); final String group = (String) m.get("group"); final FsPermission permission = toFsPermission( (String) m.get("permission"), (Boolean) m.get("aclBit"), (Boolean) m.get("encBit")); final long aTime = (Long) m.get("accessTime"); final long mTime = (Long) m.get("modificationTime"); final long blockSize = (Long) m.get("blockSize"); final short replication = (short) (long) (Long) m.get("replication"); final long fileId = m.containsKey("fileId") ? (Long) m.get("fileId") : INodeId.GRANDFATHER_INODE_ID; Long childrenNumLong = (Long) m.get("childrenNum"); final int childrenNum = (childrenNumLong == null) ? -1 : childrenNumLong.intValue(); final byte storagePolicy = m.containsKey("storagePolicy") ? (byte) (long) (Long) m.get("storagePolicy") : BlockStoragePolicySuite.ID_UNSPECIFIED; return new HdfsFileStatus( len, type == PathType.DIRECTORY, replication, blockSize, mTime, aTime, permission, owner, group, symlink, DFSUtil.string2Bytes(localName), fileId, childrenNum, null, storagePolicy); }
/** * Parse a list of strings into longs. * * @param strs the list of strings to parse * @return a list of longs that were parsed. same length as strs. */ private static long[] parseTimestamps(String[] strs) { if (strs == null) { return null; } long[] result = new long[strs.length]; for (int i = 0; i < strs.length; ++i) { result[i] = Long.parseLong(strs[i]); } return result; }
/** Sets up configuration based on params */ private static boolean setup(Hashtable<String, String> curConf, Configuration argConf) { if (argConf.get("file") == null) { logger.fatal("Missing file parameter"); System.exit(1); } if (argConf.get("hdfs_base_path") == null) { logger.fatal("Missing HDFS base path, check gestore-conf.xml"); System.exit(1); } if (argConf.get("hdfs_temp_path") == null) { logger.fatal("Missing HDFS temp path, check gestore-conf.xml"); System.exit(1); } if (argConf.get("local_temp_path") == null) { logger.fatal("Missing local temp path, check gestore-conf.xml"); System.exit(1); } // Input paramaters curConf.put("run_id", argConf.get("run", "")); curConf.put("task_id", argConf.get("task", "")); curConf.put("file_id", argConf.get("file")); curConf.put("local_path", argConf.get("path", "")); curConf.put("type", argConf.get("type", "l2r")); curConf.put("timestamp_start", argConf.get("timestamp_start", "1")); curConf.put( "timestamp_stop", argConf.get("timestamp_stop", Integer.toString(Integer.MAX_VALUE))); curConf.put("delimiter", argConf.get("regex", "ID=.*")); curConf.put("taxon", argConf.get("taxon", "all")); curConf.put("intermediate", argConf.get("full_run", "false")); curConf.put("quick_add", argConf.get("quick_add", "false")); Boolean full_run = curConf.get("intermediate").matches("(?i).*true.*"); curConf.put("format", argConf.get("format", "unknown")); curConf.put("split", argConf.get("split", "1")); curConf.put("copy", argConf.get("copy", "true")); // Constants curConf.put("base_path", argConf.get("hdfs_base_path")); curConf.put("temp_path", argConf.get("hdfs_temp_path")); curConf.put("local_temp_path", argConf.get("local_temp_path")); curConf.put("db_name_files", argConf.get("hbase_file_table")); curConf.put("db_name_runs", argConf.get("hbase_run_table")); curConf.put("db_name_updates", argConf.get("hbase_db_update_table")); // Timestamps Date currentTime = new Date(); Date endDate = new Date(new Long(curConf.get("timestamp_stop"))); curConf.put("timestamp_real", Long.toString(currentTime.getTime())); return true; }
/** Find the corresponding meta data file from a given block file */ private static long parseGenerationStamp(File blockFile, File metaFile) throws IOException { String metaname = metaFile.getName(); String gs = metaname.substring( blockFile.getName().length() + 1, metaname.length() - METADATA_EXTENSION.length()); try { return Long.parseLong(gs); } catch (NumberFormatException nfe) { throw (IOException) new IOException("blockFile=" + blockFile + ", metaFile=" + metaFile).initCause(nfe); } }
/** * Find the metadata file for the specified block file. Return the generation stamp from the * name of the metafile. */ long getGenerationStampFromFile(File[] listdir, File blockFile) { String blockName = blockFile.getName(); for (int j = 0; j < listdir.length; j++) { String path = listdir[j].getName(); if (!path.startsWith(blockName)) { continue; } String[] vals = path.split("_"); if (vals.length != 3) { // blk, blkid, genstamp.meta continue; } String[] str = vals[2].split("\\."); if (str.length != 2) { continue; } return Long.parseLong(str[0]); } DataNode.LOG.warn("Block " + blockFile + " does not have a metafile!"); return Block.GRANDFATHER_GENERATION_STAMP; }
/** * Update the maps baseDirSize and baseDirNumberSubDir when adding cache. * * @param cacheStatus cache status of the cache is added */ private static void addCacheInfoUpdate(CacheStatus cacheStatus) { long cacheSize = cacheStatus.size; synchronized (baseDirSize) { Long dirSize = baseDirSize.get(cacheStatus.getBaseDir()); if (dirSize == null) { dirSize = Long.valueOf(cacheSize); } else { dirSize += cacheSize; } baseDirSize.put(cacheStatus.getBaseDir(), dirSize); } synchronized (baseDirNumberSubDir) { Integer dirSubDir = baseDirNumberSubDir.get(cacheStatus.getBaseDir()); if (dirSubDir == null) { dirSubDir = 1; } else { dirSubDir += 1; } baseDirNumberSubDir.put(cacheStatus.getBaseDir(), dirSubDir); } }
/** * Get the locally cached file or archive; it could either be previously cached (and valid) or * copy it from the {@link FileSystem} now. * * @param cache the cache to be localized, this should be specified as new * URI(hdfs://hostname:port/absolute_path_to_file#LINKNAME). If no schema or hostname:port is * provided the file is assumed to be in the filesystem being used in the Configuration * @param conf The Confguration file which contains the filesystem * @param subDir The sub cache Dir where you want to localize the files/archives * @param fileStatus The file status on the dfs. * @param isArchive if the cache is an archive or a file. In case it is an archive with a .zip or * .jar or .tar or .tgz or .tar.gz extension it will be unzipped/unjarred/untarred * automatically and the directory where the archive is unzipped/unjarred/untarred is returned * as the Path. In case of a file, the path to the file is returned * @param confFileStamp this is the hdfs file modification timestamp to verify that the file to be * cached hasn't changed since the job started * @param fileLength this is the length of the cache file * @param currentWorkDir this is the directory where you would want to create symlinks for the * locally cached files/archives * @param honorSymLinkConf if this is false, then the symlinks are not created even if conf says * so (this is required for an optimization in task launches * @param lDirAllocator LocalDirAllocator of the tracker * @return the path to directory where the archives are unjarred in case of archives, the path to * the file where the file is copied locally * @throws IOException */ private static Path getLocalCache( URI cache, Configuration conf, Path subDir, FileStatus fileStatus, boolean isArchive, long confFileStamp, long fileLength, Path currentWorkDir, boolean honorSymLinkConf, MRAsyncDiskService asyncDiskService, LocalDirAllocator lDirAllocator) throws IOException { String key = getKey(cache, conf, confFileStamp); CacheStatus lcacheStatus; Path localizedPath; synchronized (cachedArchives) { lcacheStatus = cachedArchives.get(key); if (lcacheStatus == null) { // was never localized Path uniqueParentDir = new Path(subDir, String.valueOf(random.nextLong())); String cachePath = new Path(uniqueParentDir, makeRelative(cache, conf)).toString(); Path localPath = lDirAllocator.getLocalPathForWrite(cachePath, fileLength, conf); lcacheStatus = new CacheStatus( new Path(localPath.toString().replace(cachePath, "")), localPath, uniqueParentDir); cachedArchives.put(key, lcacheStatus); } lcacheStatus.refcount++; } boolean initSuccessful = false; try { synchronized (lcacheStatus) { if (!lcacheStatus.isInited()) { localizedPath = localizeCache(conf, cache, confFileStamp, lcacheStatus, isArchive); lcacheStatus.initComplete(); } else { if (fileStatus != null) { localizedPath = checkCacheStatusValidity( conf, cache, confFileStamp, lcacheStatus, fileStatus, isArchive); } else { // if fileStatus is null, then the md5 must be correct // so there is no need to check for cache validity localizedPath = lcacheStatus.localizedLoadPath; } } createSymlink(conf, cache, lcacheStatus, isArchive, currentWorkDir, honorSymLinkConf); } // try deleting stuff if you can long size = 0; int numberSubDir = 0; synchronized (lcacheStatus) { synchronized (baseDirSize) { Long get = baseDirSize.get(lcacheStatus.getBaseDir()); if (get != null) { size = get.longValue(); } else { LOG.warn("Cannot find size of baseDir: " + lcacheStatus.getBaseDir()); } } synchronized (baseDirNumberSubDir) { Integer get = baseDirNumberSubDir.get(lcacheStatus.getBaseDir()); if (get != null) { numberSubDir = get.intValue(); } else { LOG.warn("Cannot find subdirectories limit of baseDir: " + lcacheStatus.getBaseDir()); } } } // setting the cache size to a default of 10GB long allowedSize = conf.getLong("local.cache.size", DEFAULT_CACHE_SIZE); long allowedNumberSubDir = conf.getLong("local.cache.numbersubdir", DEFAULT_CACHE_SUBDIR_LIMIT); if (allowedSize < size || allowedNumberSubDir < numberSubDir) { // try some cache deletions LOG.debug( "Start deleting released cache because" + " [size, allowedSize, numberSubDir, allowedNumberSubDir] =" + " [" + size + ", " + allowedSize + ", " + numberSubDir + ", " + allowedNumberSubDir + "]"); deleteCache(conf, asyncDiskService); } initSuccessful = true; return localizedPath; } finally { if (!initSuccessful) { synchronized (cachedArchives) { lcacheStatus.refcount--; } } } }
/** * Method to go though the HDFS filesystem in a DFS to find all files * * <p>fs:FileSystem object from HDFS minDate: Oldest date for files to be backed up maxDate:Newest * date for files to be backed up p:Path in HDFS to look for files pathList:Will be filled with * all files in p hmTimestamps: hashmap of timestamps for later sorting */ public void checkDir( FileSystem fs, long minDate, long maxDate, Path p, ArrayList<Path> pathList, HashMap<Path, Long> hmTimestamps) { long tmpDate; FileStatus[] fStat; try { String sPath = p.toUri().getPath(); // If this is a directory if (fs.getFileStatus(p).isDir()) { // ignore certain directories if ("dfstmp".equals(p.getName()) || "tmp".equals(p.getName()) || "jobtracker".equals(p.getName()) || sPath.startsWith("/mapred") || "ops".equals(p.getName()) || p.getName().startsWith("_distcp_logs")) { return; } // dump the mkdir and chmod commands for this // directory -- skip root directory only { FileStatus stat = fs.getFileStatus(p); if (!sPath.equals("/")) { m_wrMkdirs.println("hadoop fs -mkdir " + sPath); } m_wrChmods.println( "hadoop fs -chown " + stat.getOwner() + ":" + stat.getGroup() + " " + sPath); Short sh = new Short(stat.getPermission().toShort()); m_wrChmods.println( "hadoop fs -chmod " + Long.toOctalString(sh.longValue()) + " " + sPath); } fStat = fs.listStatus(p); // Do a recursive call to all elements for (int i = 0; i < fStat.length; i++) { checkDir(fs, minDate, maxDate, fStat[i].getPath(), pathList, hmTimestamps); } } else { // If not a directory then we've found a file // ignore crc files if (p.getName().endsWith(".crc")) { return; } // ignore other files if (sPath.startsWith("/user/oozie/etl/workflows/")) { return; } // try to get the table name from the path. There are // various types of tables, from those replicated from // another database to regular hive tables to // partitioned hive tables. We use table names to // both exclude some from the backup, and for the rest // to dump out the schema and partition name. if (m_ignoreTables != null && m_ignoreTables.doIgnoreFile(sPath)) { m_nIgnoredTables++; if (m_nIgnoredTables < 5) { System.out.println("Skipping ignore-table file: " + sPath); } else if (m_nIgnoredTables == 5) { System.out.println("(...not showing other skipped tables...)"); } return; } FileStatus stat = fs.getFileStatus(p); tmpDate = stat.getModificationTime() / 1000; // store the chmods/chowns for all files m_wrChmods.println( "hadoop fs -chown " + stat.getOwner() + ":" + stat.getGroup() + " " + sPath); m_wrChmods.println("hadoop fs -chmod " + stat.getPermission().toShort() + " " + sPath); // check dates. is it too young? if (tmpDate < minDate) { return; } // is the file too recent? if (tmpDate > maxDate) { // System.out.println("file too recent: " + sPath); return; } // file timestamp is ok pathList.add(p); hmTimestamps.put(p, new Long(tmpDate)); // store info about total bytes neeed to backup m_nTotalBytes += fs.getContentSummary(p).getLength(); } } catch (IOException e) { System.err.println("ERROR: could not open " + p + ": " + e); // System.exit(1) ; } }
public int run(String[] args) throws Exception { // printUsage(); /* * SETUP */ Configuration argConf = getConf(); Hashtable<String, String> confArg = new Hashtable<String, String>(); setup(confArg, argConf); Date currentTime = new Date(); Date endDate = new Date(new Long(confArg.get("timestamp_stop"))); Boolean full_run = confArg.get("intermediate").matches("(?i).*true.*"); Boolean quick_add = confArg.get("quick_add").matches("(?i).*true.*"); logger.info("Running GeStore"); // ZooKeeper setup Configuration config = HBaseConfiguration.create(); zkWatcher = new ZooKeeperWatcher(config, "Testing", new HBaseAdmin(config)); zkInstance = new ZooKeeper( ZKConfig.getZKQuorumServersString(config), config.getInt("zookeeper.session.timeout", -1), zkWatcher); if (!confArg.get("task_id").isEmpty()) { confArg.put("temp_path", confArg.get("temp_path") + confArg.get("task_id")); } String lockRequest = confArg.get("file_id"); if (!confArg.get("run_id").isEmpty()) lockRequest = lockRequest + "_" + confArg.get("run_id") + "_"; if (!confArg.get("task_id").isEmpty()) lockRequest = lockRequest + "_" + confArg.get("task_id") + "_"; // Get type of movement toFrom type_move = checkArgs(confArg); if (type_move == toFrom.LOCAL2REMOTE && !confArg.get("format").equals("unknown")) { List<String> arguments = new ArrayList<String>(); arguments.add("-Dinput=" + confArg.get("local_path")); arguments.add("-Dtable=" + confArg.get("file_id")); arguments.add("-Dtimestamp=" + confArg.get("timestamp_stop")); arguments.add("-Dtype=" + confArg.get("format")); arguments.add("-Dtarget_dir=" + confArg.get("base_path") + "_" + confArg.get("file_id")); arguments.add("-Dtemp_hdfs_path=" + confArg.get("temp_path")); arguments.add("-Drun_id=" + confArg.get("run_id")); if (!confArg.get("run_id").isEmpty()) arguments.add("-Drun_id=" + confArg.get("run_id")); if (!confArg.get("task_id").isEmpty()) arguments.add("-Dtask_id=" + confArg.get("task_id")); if (quick_add) arguments.add("-Dquick_add=" + confArg.get("quick_add")); String lockName = lock(lockRequest); String[] argumentString = arguments.toArray(new String[arguments.size()]); adddb.main(argumentString); unlock(lockName); System.exit(0); } // Database registration dbutil db_util = new dbutil(config); db_util.register_database(confArg.get("db_name_files"), true); db_util.register_database(confArg.get("db_name_runs"), true); db_util.register_database(confArg.get("db_name_updates"), true); FileSystem hdfs = FileSystem.get(config); FileSystem localFS = FileSystem.getLocal(config); // Get source type confArg.put("source", getSource(db_util, confArg.get("db_name_files"), confArg.get("file_id"))); confArg.put( "database", isDatabase(db_util, confArg.get("db_name_files"), confArg.get("file_id"))); if (!confArg.get("source").equals("local") && type_move == toFrom.REMOTE2LOCAL && !confArg.get("timestamp_stop").equals(Integer.toString(Integer.MAX_VALUE))) { confArg.put("timestamp_stop", Long.toString(latestVersion(confArg, db_util))); } /* * Get previous timestamp */ Get run_id_get = new Get(confArg.get("run_id").getBytes()); Result run_get = db_util.doGet(confArg.get("db_name_runs"), run_id_get); KeyValue run_file_prev = run_get.getColumnLatest( "d".getBytes(), (confArg.get("file_id") + "_db_timestamp").getBytes()); String last_timestamp = new String("0"); if (null != run_file_prev && !confArg.get("source").equals("local")) { long last_timestamp_real = run_file_prev.getTimestamp(); Long current_timestamp = new Long(confArg.get("timestamp_real")); if ((current_timestamp - last_timestamp_real) > 36000) { last_timestamp = new String(run_file_prev.getValue()); Integer lastTimestamp = new Integer(last_timestamp); lastTimestamp += 1; last_timestamp = lastTimestamp.toString(); logger.info("Last timestamp: " + last_timestamp + " End data: " + endDate); Date last_run = new Date(run_file_prev.getTimestamp()); if (last_run.before(endDate) && !full_run) { confArg.put("timestamp_start", last_timestamp); } } } Integer tse = new Integer(confArg.get("timestamp_stop")); Integer tss = new Integer(confArg.get("timestamp_start")); if (tss > tse) { logger.info("No new version of requested file."); return 0; } /* * Generate file */ String lockName = lock(lockRequest); Get file_id_get = new Get(confArg.get("file_id").getBytes()); Result file_get = db_util.doGet(confArg.get("db_name_files"), file_id_get); if (!file_get.isEmpty()) { boolean found = hasFile( db_util, hdfs, confArg.get("db_name_files"), confArg.get("file_id"), getFullPath(confArg)); if (confArg.get("source").equals("fullfile")) { found = false; } String filenames_put = getFileNames( db_util, confArg.get("db_name_files"), confArg.get("file_id"), getFullPath(confArg)); // Filename not found in file database if (!found && type_move == toFrom.REMOTE2LOCAL) { if (!confArg.get("source").equals("local")) { // Generate intermediate file if (getFile(hdfs, confArg, db_util) == null) { unlock(lockName); return 1; } // Put generated file into file database if (!confArg.get("format").equals("fullfile")) { putFileEntry( db_util, hdfs, confArg.get("db_name_files"), confArg.get("file_id"), confArg.get("full_file_name"), confArg.get("source")); } } else { logger.warn("Remote file not found, and cannot be generated! File: " + confArg); unlock(lockName); return 1; } } } else { if (type_move == toFrom.REMOTE2LOCAL) { logger.warn("Remote file not found, and cannot be generated."); unlock(lockName); return 1; } } /* * Copy file * Update tables */ if (type_move == toFrom.LOCAL2REMOTE) { if (!confArg.get("format").equals("fullfile")) { putFileEntry( db_util, hdfs, confArg.get("db_name_files"), confArg.get("file_id"), getFullPath(confArg), confArg.get("source")); } putRunEntry( db_util, confArg.get("db_name_runs"), confArg.get("run_id"), confArg.get("file_id"), confArg.get("type"), confArg.get("timestamp_real"), confArg.get("timestamp_stop"), getFullPath(confArg), confArg.get("delimiter")); hdfs.copyFromLocalFile(new Path(confArg.get("local_path")), new Path(getFullPath(confArg))); } else if (type_move == toFrom.REMOTE2LOCAL) { FileStatus[] files = hdfs.globStatus(new Path(getFullPath(confArg) + "*")); putRunEntry( db_util, confArg.get("db_name_runs"), confArg.get("run_id"), confArg.get("file_id"), confArg.get("type"), confArg.get("timestamp_real"), confArg.get("timestamp_stop"), getFullPath(confArg), confArg.get("delimiter")); unlock(lockName); for (FileStatus file : files) { Path cur_file = file.getPath(); Path cur_local_path = new Path(new String(confArg.get("local_path") + confArg.get("file_id"))); String suffix = getSuffix(getFileName(confArg), cur_file.getName()); if (suffix.length() > 0) { cur_local_path = cur_local_path.suffix(new String("." + suffix)); } if (confArg.get("copy").equals("true")) { String crc = hdfs.getFileChecksum(cur_file).toString(); if (checksumLocalTest(cur_local_path, crc)) { continue; } else { hdfs.copyToLocalFile(cur_file, cur_local_path); writeChecksum(cur_local_path, crc); } } else { System.out.println(cur_local_path + "\t" + cur_file); } } } unlock(lockName); return 0; }
// Information needed to get a single file: // BASE_PATH, FILE_ID, TIMESTAMP_START, TIMESTAMP_STOP, SOURCE, FILESYSTEM private static Vector<Path> getFile( FileSystem fs, Hashtable<String, String> config, dbutil db_util) throws Exception { Long latestVersion = latestVersion(config, db_util); try { config.put("timestamp_start", config.get("timestamp_start")); config.put("timestamp_real", latestVersion.toString()); config.put("timestamp_stop", latestVersion.toString()); } catch (Exception E) { logger.error("Tryign to get file that is impossible to generate: " + getFullPath(config)); return null; } if (Integer.parseInt(config.get("timestamp_start")) > Integer.parseInt(config.get("timestamp_stop"))) { return null; } logger.debug( "Getting DB for timestamp " + config.get("timestamp_start") + " to " + config.get("timestamp_stop")); String final_result = getFullPath(config); String temp_path_base = config.get("local_temp_path") + "_" + config.get("task_id") + "_" + config.get("run_id") + "/"; Path newPath = new Path(final_result + "*"); Vector<Path> ret_path = new Vector<Path>(); String lockName = lock(final_result.replaceAll("/", "_")); if (fs.globStatus(newPath).length != 0) { ret_path.add(newPath); unlock(lockName); config.put("full_file_name", final_result); return ret_path; } else { if (!config.get("source").equals("local")) { config.put("temp_path_base", temp_path_base); config.put("timestamp_start", config.get("timestamp_start")); config.put("timestamp_real", latestVersion.toString()); config.put("timestamp_stop", latestVersion.toString()); Class<?> sourceClass = Class.forName("org.gestore.plugin.source." + config.get("source") + "Source"); Method process_data = sourceClass.getMethod("process", Hashtable.class, FileSystem.class); Object processor = sourceClass.newInstance(); Object retVal; try { retVal = process_data.invoke(processor, config, fs); } catch (InvocationTargetException E) { Throwable exception = E.getTargetException(); logger.error("Unable to call method in child class: " + exception.toString()); exception.printStackTrace(System.out); unlock(lockName); return null; } FileStatus[] files = (FileStatus[]) retVal; if (files == null) { logger.error("Error getting files, no files returned"); return null; } for (FileStatus file : files) { Path cur_file = file.getPath(); Path cur_local_path = new Path(temp_path_base + config.get("file_id")); String suffix = getSuffix(config.get("file_id"), cur_file.getName()); cur_local_path = cur_local_path.suffix(suffix); Path res_path = new Path(new String(final_result + suffix)); logger.debug("Moving file" + cur_file.toString() + " to " + res_path.toString()); if (config.get("copy").equals("true")) { fs.moveFromLocalFile(cur_file, res_path); } else { fs.rename(cur_file, res_path); } } config.put("full_file_name", final_result); } } unlock(lockName); return ret_path; }
@Override public final void run() { try { // before preparing the job localize // all the archives TaskAttemptID taskid = t.getTaskID(); LocalDirAllocator lDirAlloc = new LocalDirAllocator("mapred.local.dir"); File jobCacheDir = null; if (conf.getJar() != null) { jobCacheDir = new File(new Path(conf.getJar()).getParent().toString()); } File workDir = new File( lDirAlloc .getLocalPathToRead( TaskTracker.getJobCacheSubdir() + Path.SEPARATOR + t.getJobID() + Path.SEPARATOR + t.getTaskID() + Path.SEPARATOR + MRConstants.WORKDIR, conf) .toString()); URI[] archives = DistributedCache.getCacheArchives(conf); URI[] files = DistributedCache.getCacheFiles(conf); FileStatus fileStatus; FileSystem fileSystem; Path localPath; String baseDir; if ((archives != null) || (files != null)) { if (archives != null) { String[] archivesTimestamps = DistributedCache.getArchiveTimestamps(conf); Path[] p = new Path[archives.length]; for (int i = 0; i < archives.length; i++) { fileSystem = FileSystem.get(archives[i], conf); fileStatus = fileSystem.getFileStatus(new Path(archives[i].getPath())); String cacheId = DistributedCache.makeRelative(archives[i], conf); String cachePath = TaskTracker.getCacheSubdir() + Path.SEPARATOR + cacheId; if (lDirAlloc.ifExists(cachePath, conf)) { localPath = lDirAlloc.getLocalPathToRead(cachePath, conf); } else { localPath = lDirAlloc.getLocalPathForWrite(cachePath, fileStatus.getLen(), conf); } baseDir = localPath.toString().replace(cacheId, ""); p[i] = DistributedCache.getLocalCache( archives[i], conf, new Path(baseDir), fileStatus, true, Long.parseLong(archivesTimestamps[i]), new Path(workDir.getAbsolutePath()), false); } DistributedCache.setLocalArchives(conf, stringifyPathArray(p)); } if ((files != null)) { String[] fileTimestamps = DistributedCache.getFileTimestamps(conf); Path[] p = new Path[files.length]; for (int i = 0; i < files.length; i++) { fileSystem = FileSystem.get(files[i], conf); fileStatus = fileSystem.getFileStatus(new Path(files[i].getPath())); String cacheId = DistributedCache.makeRelative(files[i], conf); String cachePath = TaskTracker.getCacheSubdir() + Path.SEPARATOR + cacheId; if (lDirAlloc.ifExists(cachePath, conf)) { localPath = lDirAlloc.getLocalPathToRead(cachePath, conf); } else { localPath = lDirAlloc.getLocalPathForWrite(cachePath, fileStatus.getLen(), conf); } baseDir = localPath.toString().replace(cacheId, ""); p[i] = DistributedCache.getLocalCache( files[i], conf, new Path(baseDir), fileStatus, false, Long.parseLong(fileTimestamps[i]), new Path(workDir.getAbsolutePath()), false); } DistributedCache.setLocalFiles(conf, stringifyPathArray(p)); } Path localTaskFile = new Path(t.getJobFile()); FileSystem localFs = FileSystem.getLocal(conf); localFs.delete(localTaskFile, true); OutputStream out = localFs.create(localTaskFile); try { conf.writeXml(out); } finally { out.close(); } } if (!prepare()) { return; } String sep = System.getProperty("path.separator"); StringBuffer classPath = new StringBuffer(); // start with same classpath as parent process classPath.append(System.getProperty("java.class.path")); classPath.append(sep); if (!workDir.mkdirs()) { if (!workDir.isDirectory()) { LOG.fatal("Mkdirs failed to create " + workDir.toString()); } } String jar = conf.getJar(); if (jar != null) { // if jar exists, it into workDir File[] libs = new File(jobCacheDir, "lib").listFiles(); if (libs != null) { for (int i = 0; i < libs.length; i++) { classPath.append(sep); // add libs from jar to classpath classPath.append(libs[i]); } } classPath.append(sep); classPath.append(new File(jobCacheDir, "classes")); classPath.append(sep); classPath.append(jobCacheDir); } // include the user specified classpath // archive paths Path[] archiveClasspaths = DistributedCache.getArchiveClassPaths(conf); if (archiveClasspaths != null && archives != null) { Path[] localArchives = DistributedCache.getLocalCacheArchives(conf); if (localArchives != null) { for (int i = 0; i < archives.length; i++) { for (int j = 0; j < archiveClasspaths.length; j++) { if (archives[i].getPath().equals(archiveClasspaths[j].toString())) { classPath.append(sep); classPath.append(localArchives[i].toString()); } } } } } // file paths Path[] fileClasspaths = DistributedCache.getFileClassPaths(conf); if (fileClasspaths != null && files != null) { Path[] localFiles = DistributedCache.getLocalCacheFiles(conf); if (localFiles != null) { for (int i = 0; i < files.length; i++) { for (int j = 0; j < fileClasspaths.length; j++) { if (files[i].getPath().equals(fileClasspaths[j].toString())) { classPath.append(sep); classPath.append(localFiles[i].toString()); } } } } } classPath.append(sep); classPath.append(workDir); // Build exec child jmv args. Vector<String> vargs = new Vector<String>(8); File jvm = // use same jvm as parent new File(new File(System.getProperty("java.home"), "bin"), "java"); vargs.add(jvm.toString()); // Add child (task) java-vm options. // // The following symbols if present in mapred.child.java.opts value are // replaced: // + @taskid@ is interpolated with value of TaskID. // Other occurrences of @ will not be altered. // // Example with multiple arguments and substitutions, showing // jvm GC logging, and start of a passwordless JVM JMX agent so can // connect with jconsole and the likes to watch child memory, threads // and get thread dumps. // // <property> // <name>mapred.child.java.opts</name> // <value>-verbose:gc -Xloggc:/tmp/@[email protected] \ // -Dcom.sun.management.jmxremote.authenticate=false \ // -Dcom.sun.management.jmxremote.ssl=false \ // </value> // </property> // String javaOpts = conf.get("mapred.child.java.opts", "-Xmx200m"); javaOpts = javaOpts.replace("@taskid@", taskid.toString()); String[] javaOptsSplit = javaOpts.split(" "); // Add java.library.path; necessary for loading native libraries. // // 1. To support native-hadoop library i.e. libhadoop.so, we add the // parent processes' java.library.path to the child. // 2. We also add the 'cwd' of the task to it's java.library.path to help // users distribute native libraries via the DistributedCache. // 3. The user can also specify extra paths to be added to the // java.library.path via mapred.child.java.opts. // String libraryPath = System.getProperty("java.library.path"); if (libraryPath == null) { libraryPath = workDir.getAbsolutePath(); } else { libraryPath += sep + workDir; } boolean hasUserLDPath = false; for (int i = 0; i < javaOptsSplit.length; i++) { if (javaOptsSplit[i].startsWith("-Djava.library.path=")) { javaOptsSplit[i] += sep + libraryPath; hasUserLDPath = true; break; } } if (!hasUserLDPath) { vargs.add("-Djava.library.path=" + libraryPath); } for (int i = 0; i < javaOptsSplit.length; i++) { vargs.add(javaOptsSplit[i]); } // add java.io.tmpdir given by mapred.child.tmp String tmp = conf.get("mapred.child.tmp", "./tmp"); Path tmpDir = new Path(tmp); // if temp directory path is not absolute // prepend it with workDir. if (!tmpDir.isAbsolute()) { tmpDir = new Path(workDir.toString(), tmp); } FileSystem localFs = FileSystem.getLocal(conf); if (!localFs.mkdirs(tmpDir) && !localFs.getFileStatus(tmpDir).isDir()) { throw new IOException("Mkdirs failed to create " + tmpDir.toString()); } vargs.add("-Djava.io.tmpdir=" + tmpDir.toString()); // Add classpath. vargs.add("-classpath"); vargs.add(classPath.toString()); // Setup the log4j prop long logSize = TaskLog.getTaskLogLength(conf); vargs.add( "-Dhadoop.log.dir=" + new File(System.getProperty("hadoop.log.dir")).getAbsolutePath()); vargs.add("-Dhadoop.root.logger=INFO,TLA"); vargs.add("-Dhadoop.tasklog.taskid=" + taskid); vargs.add("-Dhadoop.tasklog.totalLogFileSize=" + logSize); if (conf.getProfileEnabled()) { if (conf.getProfileTaskRange(t.isMapTask()).isIncluded(t.getPartition())) { File prof = TaskLog.getTaskLogFile(taskid, TaskLog.LogName.PROFILE); vargs.add(String.format(conf.getProfileParams(), prof.toString())); } } // Add main class and its arguments vargs.add(Child.class.getName()); // main of Child // pass umbilical address InetSocketAddress address = tracker.getTaskTrackerReportAddress(); vargs.add(address.getAddress().getHostAddress()); vargs.add(Integer.toString(address.getPort())); vargs.add(taskid.toString()); // pass task identifier String pidFile = null; if (tracker.isTaskMemoryManagerEnabled()) { pidFile = lDirAlloc .getLocalPathForWrite( (TaskTracker.getPidFilesSubdir() + Path.SEPARATOR + taskid), this.conf) .toString(); } // set memory limit using ulimit if feasible and necessary ... String[] ulimitCmd = Shell.getUlimitMemoryCommand(conf); List<String> setup = null; if (ulimitCmd != null) { setup = new ArrayList<String>(); for (String arg : ulimitCmd) { setup.add(arg); } } // Set up the redirection of the task's stdout and stderr streams File stdout = TaskLog.getTaskLogFile(taskid, TaskLog.LogName.STDOUT); File stderr = TaskLog.getTaskLogFile(taskid, TaskLog.LogName.STDERR); stdout.getParentFile().mkdirs(); tracker.getTaskTrackerInstrumentation().reportTaskLaunch(taskid, stdout, stderr); Map<String, String> env = new HashMap<String, String>(); StringBuffer ldLibraryPath = new StringBuffer(); ldLibraryPath.append(workDir.toString()); String oldLdLibraryPath = null; oldLdLibraryPath = System.getenv("LD_LIBRARY_PATH"); if (oldLdLibraryPath != null) { ldLibraryPath.append(sep); ldLibraryPath.append(oldLdLibraryPath); } env.put("LD_LIBRARY_PATH", ldLibraryPath.toString()); jvmManager.launchJvm( this, jvmManager.constructJvmEnv( setup, vargs, stdout, stderr, logSize, workDir, env, pidFile, conf)); synchronized (lock) { while (!done) { lock.wait(); } } tracker.getTaskTrackerInstrumentation().reportTaskEnd(t.getTaskID()); if (exitCodeSet) { if (!killed && exitCode != 0) { if (exitCode == 65) { tracker.getTaskTrackerInstrumentation().taskFailedPing(t.getTaskID()); } throw new IOException("Task process exit with nonzero status of " + exitCode + "."); } } } catch (FSError e) { LOG.fatal("FSError", e); try { tracker.fsError(t.getTaskID(), e.getMessage()); } catch (IOException ie) { LOG.fatal(t.getTaskID() + " reporting FSError", ie); } } catch (Throwable throwable) { LOG.warn(t.getTaskID() + " Child Error", throwable); ByteArrayOutputStream baos = new ByteArrayOutputStream(); throwable.printStackTrace(new PrintStream(baos)); try { tracker.reportDiagnosticInfo(t.getTaskID(), baos.toString()); } catch (IOException e) { LOG.warn(t.getTaskID() + " Reporting Diagnostics", e); } } finally { try { URI[] archives = DistributedCache.getCacheArchives(conf); URI[] files = DistributedCache.getCacheFiles(conf); if (archives != null) { for (int i = 0; i < archives.length; i++) { DistributedCache.releaseCache(archives[i], conf); } } if (files != null) { for (int i = 0; i < files.length; i++) { DistributedCache.releaseCache(files[i], conf); } } } catch (IOException ie) { LOG.warn("Error releasing caches : Cache files might not have been cleaned up"); } tracker.reportTaskFinished(t.getTaskID(), false); if (t.isMapTask()) { tracker.addFreeMapSlot(); } else { tracker.addFreeReduceSlot(); } } }
public String toString() { return Long.toString(value); }