public void testFormat() throws Exception { localFs = FileSystem.getLocal(defaultConf); localFs.delete(workDir, true); Job job = new Job(new Configuration(defaultConf)); Path file = new Path(workDir, "test.txt"); int seed = new Random().nextInt(); Random random = new Random(seed); // for a variety of lengths for (int length = 0; length < MAX_LENGTH; length += random.nextInt(MAX_LENGTH / 10) + 1) { // create a file with length entries Writer writer = new OutputStreamWriter(localFs.create(file)); try { MyClass mc = new MyClass(); for (int i = 0; i < length; i++) { mc.s = Integer.toString(i); mc.v = i; byte[] raw = MessagePack.pack(mc); byte[] b64e = base64_.encodeBase64(raw); byte[] b64d = base64_.decode(b64e); MyClass mc2 = MessagePack.unpack(b64d, mc.getClass()); assertEquals(mc.s, mc2.s); assertEquals(mc.v, mc2.v); writer.write(base64_.encodeToString(raw)); } } finally { writer.close(); } checkFormat(job); } }
public void inject(Path crawlDb, Path urlDir) throws IOException { SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); long start = System.currentTimeMillis(); if (LOG.isInfoEnabled()) { LOG.info("Injector: starting at " + sdf.format(start)); LOG.info("Injector: crawlDb: " + crawlDb); LOG.info("Injector: urlDir: " + urlDir); } Path tempDir = new Path( getConf().get("mapred.temp.dir", ".") + "/inject-temp-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); // map text input file to a <url,CrawlDatum> file if (LOG.isInfoEnabled()) { LOG.info("Injector: Converting injected urls to crawl db entries."); } JobConf sortJob = new NutchJob(getConf()); sortJob.setJobName("inject " + urlDir); FileInputFormat.addInputPath(sortJob, urlDir); sortJob.setMapperClass(InjectMapper.class); FileOutputFormat.setOutputPath(sortJob, tempDir); sortJob.setOutputFormat(SequenceFileOutputFormat.class); sortJob.setOutputKeyClass(Text.class); sortJob.setOutputValueClass(CrawlDatum.class); sortJob.setLong("injector.current.time", System.currentTimeMillis()); RunningJob mapJob = JobClient.runJob(sortJob); long urlsInjected = mapJob.getCounters().findCounter("injector", "urls_injected").getValue(); long urlsFiltered = mapJob.getCounters().findCounter("injector", "urls_filtered").getValue(); LOG.info("Injector: total number of urls rejected by filters: " + urlsFiltered); LOG.info( "Injector: total number of urls injected after normalization and filtering: " + urlsInjected); // merge with existing crawl db if (LOG.isInfoEnabled()) { LOG.info("Injector: Merging injected urls into crawl db."); } JobConf mergeJob = CrawlDb.createJob(getConf(), crawlDb); FileInputFormat.addInputPath(mergeJob, tempDir); mergeJob.setReducerClass(InjectReducer.class); JobClient.runJob(mergeJob); CrawlDb.install(mergeJob, crawlDb); // clean up FileSystem fs = FileSystem.get(getConf()); fs.delete(tempDir, true); long end = System.currentTimeMillis(); LOG.info( "Injector: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end)); }
// the method which actually copies the caches locally and unjars/unzips them // and does chmod for the files private static Path localizeCache( Configuration conf, URI cache, long confFileStamp, CacheStatus cacheStatus, boolean isArchive) throws IOException { FileSystem fs = getFileSystem(cache, conf); FileSystem localFs = FileSystem.getLocal(conf); Path parchive = null; if (isArchive) { parchive = new Path( cacheStatus.localizedLoadPath, new Path(cacheStatus.localizedLoadPath.getName())); } else { parchive = cacheStatus.localizedLoadPath; } if (!localFs.mkdirs(parchive.getParent())) { throw new IOException( "Mkdirs failed to create directory " + cacheStatus.localizedLoadPath.toString()); } String cacheId = cache.getPath(); fs.copyToLocalFile(new Path(cacheId), parchive); if (isArchive) { String tmpArchive = parchive.toString().toLowerCase(); File srcFile = new File(parchive.toString()); File destDir = new File(parchive.getParent().toString()); if (tmpArchive.endsWith(".jar")) { RunJar.unJar(srcFile, destDir); } else if (tmpArchive.endsWith(".zip")) { FileUtil.unZip(srcFile, destDir); } else if (isTarFile(tmpArchive)) { FileUtil.unTar(srcFile, destDir); } // else will not do anyhting // and copy the file into the dir as it is } long cacheSize = FileUtil.getDU(new File(parchive.getParent().toString())); cacheStatus.size = cacheSize; addCacheInfoUpdate(cacheStatus); // do chmod here try { // Setting recursive permission to grant everyone read and execute Path localDir = new Path(cacheStatus.localizedBaseDir, cacheStatus.uniqueParentDir); LOG.info("Doing chmod on localdir :" + localDir); FileUtil.chmod(localDir.toString(), "ugo+rx", true); } catch (InterruptedException e) { LOG.warn("Exception in chmod" + e.toString()); } // update cacheStatus to reflect the newly cached file cacheStatus.mtime = getTimestamp(conf, cache); return cacheStatus.localizedLoadPath; }
@Override public void close() throws IOException { super.close(); if (fs != null) { try { fs.close(); } catch (IOException ie) { // this might already be closed // ignore } } }
private void dirCleanup() { Configuration conf = new Configuration(); try { FileSystem fs = FileSystem.get(conf); Path dirPath = new Path(testPath); if (fs.exists(dirPath)) { fs.delete(dirPath, true); } } catch (IOException ex) { LOG.warn("IO Error in test cleanup", ex); } }
protected void setup(Context context) throws IOException, InterruptedException { super.setup(context); Path pt = new Path("/user/yao/query/query"); FileSystem fs = FileSystem.get(new Configuration()); BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(pt))); String line = br.readLine(); String[] keywords = line.split(","); k0 = keywords[0]; k1 = keywords[1]; k2 = keywords[2]; br.close(); }
/** * Combine the status stored in the index and the underlying status. * * @param h status stored in the index * @param cache caching the underlying file statuses * @return the combined file status * @throws IOException */ private FileStatus toFileStatus(HarStatus h, Map<String, FileStatus> cache) throws IOException { FileStatus underlying = null; if (cache != null) { underlying = cache.get(h.partName); } if (underlying == null) { final Path p = h.isDir ? archivePath : new Path(archivePath, h.partName); underlying = fs.getFileStatus(p); if (cache != null) { cache.put(h.partName, underlying); } } long modTime = 0; int version = metadata.getVersion(); if (version < 3) { modTime = underlying.getModificationTime(); } else if (version == 3) { modTime = h.getModificationTime(); } return new FileStatus( h.isDir() ? 0L : h.getLength(), h.isDir(), underlying.getReplication(), underlying.getBlockSize(), modTime, underlying.getAccessTime(), underlying.getPermission(), underlying.getOwner(), underlying.getGroup(), makeRelative(this.uri.getPath(), new Path(h.name))); }
private String copyFromLocal(FileSystem fs, Path basePath, String[] files) throws IOException { StringBuilder csv = new StringBuilder(files.length * (basePath.toString().length() + 16)); for (String localFile : files) { Path src = new Path(localFile); String filename = src.getName(); Path dst = new Path(basePath, filename); URI localFileURI = null; try { localFileURI = new URI(localFile); } catch (URISyntaxException e) { throw new IOException(e); } if (localFileURI.getScheme() == null || localFileURI.getScheme().startsWith("file")) { LOG.info("Copy {} from local filesystem to {}", localFile, dst); fs.copyFromLocalFile(false, true, src, dst); } else { LOG.info("Copy {} from DFS to {}", localFile, dst); FileUtil.copy(fs, src, fs, dst, false, true, conf); } if (csv.length() > 0) { csv.append(LIB_JARS_SEP); } csv.append(dst.toString()); } return csv.toString(); }
public void configure(JobConf conf) { numberOfCenters = Integer.valueOf(conf.get("numberOfCenters")); centersDirectory = conf.get("centersReadDirectory"); try { Configuration c = new Configuration(); FileSystem fs = FileSystem.get(c); for (int index = 0; index < numberOfCenters; ++index) { SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path(centersDirectory + "/centers/" + index), c); LongWritable key = new LongWritable(); Point value = new Point(); reader.next(key, value); Point center = (Point) value; centers.add(center); reader.close(); } } catch (IOException e) { // do nothing // I hope this doesn't happen System.out.println("well, damn."); e.printStackTrace(); } }
private static boolean hasFile( dbutil db_util, FileSystem fs, String db_name, String file_id, String file_path) throws Exception { Get file_id_get = new Get(file_id.getBytes()); Result file_result = db_util.doGet(db_name, file_id_get); KeyValue file_names = file_result.getColumnLatest("d".getBytes(), "filenames".getBytes()); if (file_names == null) { return false; } String all_files = new String(file_names.getValue()); String[] files = all_files.split("\n"); for (String line : files) { if (line.equals(file_path)) { if (fs.globStatus(new Path(line + "*")).length == 0) { Put new_put = new Put(file_id.getBytes()); new_put.add( "d".getBytes(), "filenames".getBytes(), all_files.replace(file_path + "\n", "").getBytes()); db_util.doPut(db_name, new_put); return false; } return true; } } return false; }
private void verifyOutputTextFiles( FileSystem fs, Configuration conf, String dir, String prefix, List<String> bodies) throws IOException { int found = 0; int expected = bodies.size(); for (String outputFile : getAllFiles(dir)) { String name = (new File(outputFile)).getName(); if (name.startsWith(prefix)) { FSDataInputStream input = fs.open(new Path(outputFile)); BufferedReader reader = new BufferedReader(new InputStreamReader(input)); String body = null; while ((body = reader.readLine()) != null) { bodies.remove(body); found++; } reader.close(); } } Assert.assertTrue( "Found = " + found + ", Expected = " + expected + ", Left = " + bodies.size() + " " + bodies, bodies.size() == 0); }
/** * Initialize a Har filesystem per har archive. The archive home directory is the top level * directory in the filesystem that contains the HAR archive. Be careful with this method, you do * not want to go on creating new Filesystem instances per call to path.getFileSystem(). the uri * of Har is har://underlyingfsscheme-host:port/archivepath. or har:///archivepath. This assumes * the underlying filesystem to be used in case not specified. */ @Override public void initialize(URI name, Configuration conf) throws IOException { // initialize the metadata cache, if needed initializeMetadataCache(conf); // decode the name URI underLyingURI = decodeHarURI(name, conf); // we got the right har Path- now check if this is // truly a har filesystem Path harPath = archivePath(new Path(name.getScheme(), name.getAuthority(), name.getPath())); if (harPath == null) { throw new IOException("Invalid path for the Har Filesystem. " + name.toString()); } if (fs == null) { fs = FileSystem.get(underLyingURI, conf); } uri = harPath.toUri(); archivePath = new Path(uri.getPath()); harAuth = getHarAuth(underLyingURI); // check for the underlying fs containing // the index file Path masterIndexPath = new Path(archivePath, "_masterindex"); Path archiveIndexPath = new Path(archivePath, "_index"); if (!fs.exists(masterIndexPath) || !fs.exists(archiveIndexPath)) { throw new IOException( "Invalid path for the Har Filesystem. " + "No index file in " + harPath); } metadata = harMetaCache.get(uri); if (metadata != null) { FileStatus mStat = fs.getFileStatus(masterIndexPath); FileStatus aStat = fs.getFileStatus(archiveIndexPath); if (mStat.getModificationTime() != metadata.getMasterIndexTimestamp() || aStat.getModificationTime() != metadata.getArchiveIndexTimestamp()) { // the archive has been overwritten since we last read it // remove the entry from the meta data cache metadata = null; harMetaCache.remove(uri); } } if (metadata == null) { metadata = new HarMetaData(fs, masterIndexPath, archiveIndexPath); metadata.parseMetaData(); harMetaCache.put(uri, metadata); } }
private void insertFile( Path apkPath, Map<String, String> zip_properties, File insert, String method, Path location) throws AndrolibException, IOException { // ZipFileSystem only writes at .close() // http://mail.openjdk.java.net/pipermail/nio-dev/2012-July/001764.html try (FileSystem fs = FileSystems.newFileSystem(apkPath, null)) { Path root = fs.getPath("/"); // in order to get the path relative to the zip, we strip off the absolute path, minus what we // already have in the zip. thus /var/files/apktool/apk/unknown/folder/file => /folder/file Path dest = fs.getPath(root.toString(), insert.getAbsolutePath().replace(location.toString(), "")); Path newFile = Paths.get(insert.getAbsolutePath()); Files.copy(newFile, dest, StandardCopyOption.REPLACE_EXISTING); fs.close(); } }
/** Start the JobTracker process, listen on the indicated port */ JobTracker(Configuration conf) throws IOException { // // Grab some static constants // maxCurrentTasks = conf.getInt("mapred.tasktracker.tasks.maximum", 2); RETIRE_JOB_INTERVAL = conf.getLong("mapred.jobtracker.retirejob.interval", 24 * 60 * 60 * 1000); RETIRE_JOB_CHECK_INTERVAL = conf.getLong("mapred.jobtracker.retirejob.check", 60 * 1000); TASK_ALLOC_EPSILON = conf.getFloat("mapred.jobtracker.taskalloc.loadbalance.epsilon", 0.2f); PAD_FRACTION = conf.getFloat("mapred.jobtracker.taskalloc.capacitypad", 0.1f); MIN_SLOTS_FOR_PADDING = 3 * maxCurrentTasks; // This is a directory of temporary submission files. We delete it // on startup, and can delete any files that we're done with this.conf = conf; JobConf jobConf = new JobConf(conf); this.systemDir = jobConf.getSystemDir(); this.fs = FileSystem.get(conf); FileUtil.fullyDelete(fs, systemDir); fs.mkdirs(systemDir); // Same with 'localDir' except it's always on the local disk. jobConf.deleteLocalFiles(SUBDIR); // Set ports, start RPC servers, etc. InetSocketAddress addr = getAddress(conf); this.localMachine = addr.getHostName(); this.port = addr.getPort(); this.interTrackerServer = RPC.getServer(this, addr.getPort(), 10, false, conf); this.interTrackerServer.start(); Properties p = System.getProperties(); for (Iterator it = p.keySet().iterator(); it.hasNext(); ) { String key = (String) it.next(); String val = (String) p.getProperty(key); LOG.info("Property '" + key + "' is " + val); } this.infoPort = conf.getInt("mapred.job.tracker.info.port", 50030); this.infoServer = new JobTrackerInfoServer(this, infoPort); this.infoServer.start(); this.startTime = System.currentTimeMillis(); new Thread(this.expireTrackers).start(); new Thread(this.retireJobs).start(); new Thread(this.initJobs).start(); }
public void copyInitialState(Path origAppDir) throws IOException { // locate previous snapshot String newAppDir = this.dag.assertAppPath(); FSRecoveryHandler recoveryHandler = new FSRecoveryHandler(origAppDir.toString(), conf); // read snapshot against new dependencies Object snapshot = recoveryHandler.restore(); if (snapshot == null) { throw new IllegalArgumentException("No previous application state found in " + origAppDir); } InputStream logIs = recoveryHandler.getLog(); // modify snapshot state to switch app id ((StreamingContainerManager.CheckpointState) snapshot).setApplicationId(this.dag, conf); Path checkpointPath = new Path(newAppDir, LogicalPlan.SUBDIR_CHECKPOINTS); FileSystem fs = FileSystem.newInstance(origAppDir.toUri(), conf); // remove the path that was created by the storage agent during deserialization and replacement fs.delete(checkpointPath, true); // write snapshot to new location recoveryHandler = new FSRecoveryHandler(newAppDir, conf); recoveryHandler.save(snapshot); OutputStream logOs = recoveryHandler.rotateLog(); IOUtils.copy(logIs, logOs); logOs.flush(); logOs.close(); logIs.close(); // copy sub directories that are not present in target FileStatus[] lFiles = fs.listStatus(origAppDir); for (FileStatus f : lFiles) { if (f.isDirectory()) { String targetPath = f.getPath().toString().replace(origAppDir.toString(), newAppDir); if (!fs.exists(new Path(targetPath))) { LOG.debug("Copying {} to {}", f.getPath(), targetPath); FileUtil.copy(fs, f.getPath(), fs, new Path(targetPath), false, conf); // FSUtil.copy(fs, f, fs, new Path(targetPath), false, false, conf); } else { LOG.debug("Ignoring {} as it already exists under {}", f.getPath(), targetPath); // FSUtil.setPermission(fs, new Path(targetPath), new FsPermission((short)0777)); } } } }
public FileStatus getPartFileStatus(Path partPath) throws IOException { FileStatus status; status = partFileStatuses.get(partPath); if (status == null) { status = fs.getFileStatus(partPath); partFileStatuses.put(partPath, status); } return status; }
private void insertFolder( Path apkPath, Map<String, String> zip_properties, File insert, String method, Path location) throws AndrolibException, IOException { try (FileSystem fs = FileSystems.newFileSystem(apkPath, null)) { Path root = fs.getPath("/"); Path dest = fs.getPath(root.toString(), insert.getAbsolutePath().replace(location.toString(), "")); Path parent = dest.normalize(); // check for folder existing in apkFileSystem if (parent != null && Files.notExists(parent)) { if (!Files.isDirectory(parent, LinkOption.NOFOLLOW_LINKS)) { Files.createDirectories(parent); } } fs.close(); } }
/** * Returns a List of FsContent objects from TSK based on sql query. * * @param image is a Image object that denotes which image to get the files from * @param query is a sql string query that is to be run * @return FFSqlitedb is a List of FsContent objects */ @SuppressWarnings("deprecation") public List<FsContent> extractFiles(Image image, String query) { Collection<FileSystem> imageFS = tskCase.getFileSystems(image); List<String> fsIds = new LinkedList<String>(); for (FileSystem img : imageFS) { Long tempID = img.getId(); fsIds.add(tempID.toString()); } String allFS = new String(); for (int i = 0; i < fsIds.size(); i++) { if (i == 0) { allFS += " AND (0"; } allFS += " OR fs_obj_id = '" + fsIds.get(i) + "'"; if (i == fsIds.size() - 1) { allFS += ")"; } } List<FsContent> FFSqlitedb = null; ResultSet rs = null; try { rs = tskCase.runQuery(query + allFS); FFSqlitedb = tskCase.resultSetToFsContents(rs); } catch (SQLException ex) { logger.log( Level.SEVERE, "Error while trying to extract files for:" + this.getClass().getName(), ex); this.addErrorMessage(this.getName() + ": Error while trying to extract files to analyze."); } finally { if (rs != null) { try { tskCase.closeRunQuery(rs); } catch (SQLException ex) { logger.log( Level.SEVERE, "Error while trying to close result set after extract files for:" + this.getClass().getName(), ex); } } } return FFSqlitedb; }
/** * Clear the entire contents of the cache and delete the backing files. This should only be used * when the server is reinitializing, because the users are going to lose their files. */ public static void purgeCache(Configuration conf, MRAsyncDiskService service) throws IOException { synchronized (cachedArchives) { LocalFileSystem localFs = FileSystem.getLocal(conf); for (Map.Entry<String, CacheStatus> f : cachedArchives.entrySet()) { try { deleteLocalPath(service, localFs, f.getValue().localizedLoadPath); } catch (IOException ie) { LOG.debug("Error cleaning up cache", ie); } } cachedArchives.clear(); } }
/** * Get block locations from the underlying fs and fix their offsets and lengths. * * @param file the input file status to get block locations * @param start the start of the desired range in the contained file * @param len the length of the desired range * @return block locations for this segment of file * @throws IOException */ @Override public BlockLocation[] getFileBlockLocations(FileStatus file, long start, long len) throws IOException { HarStatus hstatus = getFileHarStatus(file.getPath()); Path partPath = new Path(archivePath, hstatus.getPartName()); FileStatus partStatus = metadata.getPartFileStatus(partPath); // get all part blocks that overlap with the desired file blocks BlockLocation[] locations = fs.getFileBlockLocations(partStatus, hstatus.getStartIndex() + start, len); return fixBlockLocations(locations, start, len, hstatus.getStartIndex()); }
/** * decode the raw URI to get the underlying URI * * @param rawURI raw Har URI * @return filtered URI of the underlying fileSystem */ private URI decodeHarURI(URI rawURI, Configuration conf) throws IOException { String tmpAuth = rawURI.getAuthority(); // we are using the default file // system in the config // so create a underlying uri and // return it if (tmpAuth == null) { // create a path return FileSystem.getDefaultUri(conf); } String authority = rawURI.getAuthority(); if (authority == null) { throw new IOException( "URI: " + rawURI + " is an invalid Har URI since authority==null." + " Expecting har://<scheme>-<host>/<path>."); } int i = authority.indexOf('-'); if (i < 0) { throw new IOException( "URI: " + rawURI + " is an invalid Har URI since '-' not found." + " Expecting har://<scheme>-<host>/<path>."); } if (rawURI.getQuery() != null) { // query component not allowed throw new IOException("query component in Path not supported " + rawURI); } URI tmp; try { // convert <scheme>-<host> to <scheme>://<host> URI baseUri = new URI(authority.replaceFirst("-", "://")); tmp = new URI( baseUri.getScheme(), baseUri.getAuthority(), rawURI.getPath(), rawURI.getQuery(), rawURI.getFragment()); } catch (URISyntaxException e) { throw new IOException( "URI: " + rawURI + " is an invalid Har URI. Expecting har://<scheme>-<host>/<path>."); } return tmp; }
public static void recursePath(Configuration conf, Path path, Job job) { try { FileSystem fs = path.getFileSystem(conf); FileStatus[] fstats = fs.listStatus(path); if (fstats != null) { for (FileStatus f : fstats) { Path p = f.getPath(); ; if (fs.isFile(p)) { // connection times out otherwise System.err.println("file:" + p.toString()); FileInputFormat.addInputPath(job, p); } else { System.err.println("dir:" + p.toString()); recursePath(conf, p, job); } } } } catch (IOException e) { // shouldn't be here throw new RuntimeException(e); } }
HarFsInputStream(FileSystem fs, Path path, long start, long length, int bufferSize) throws IOException { if (length < 0) { throw new IllegalArgumentException("Negative length [" + length + "]"); } underLyingStream = fs.open(path, bufferSize); underLyingStream.seek(start); // the start of this file in the part file this.start = start; // the position pointer in the part file this.position = start; // the end pointer in the part file this.end = start + length; }
public static void extractQueryFeatures2HDFS(String filename, Job job) throws IOException { // Read the local image.jpg as a Mat Mat query_mat_float = Highgui.imread(LOCAL_USER_DIR + ID + INPUT + "/" + filename, CvType.CV_32FC3); // Convert RGB to GRAY Mat query_gray = new Mat(); Imgproc.cvtColor(query_mat_float, query_gray, Imgproc.COLOR_RGB2GRAY); // Convert the float type to unsigned integer(required by SIFT) Mat query_mat_byte = new Mat(); query_gray.convertTo(query_mat_byte, CvType.CV_8UC3); // // Resize the image to 1/FACTOR both width and height // Mat query_mat_byte = FeatureExtraction.resize(query_mat_byte); // Extract the feature from the (Mat)image Mat query_features = FeatureExtraction.extractFeature(query_mat_byte); System.out.println(PREFIX + "Extracting the query image feature..."); System.out.println("query_mat(float,color):" + query_mat_float); System.out.println("query_mat(float,gray):" + query_gray); System.out.println("query_mat(byte,gray):" + query_mat_byte); System.out.println("query_mat_features:" + query_features); System.out.println(); // Store the feature to the hdfs in order to use it later in different map tasks System.out.println(PREFIX + "Generating the feature file for the query image in HDFS..."); FileSystem fs = FileSystem.get(job.getConfiguration()); String featureFileName = filename.substring(0, filename.lastIndexOf(".")) + ".json"; FSDataOutputStream fsDataOutputStream = fs.create(new Path(HDFS_HOME + USER + ID + INPUT + "/" + featureFileName)); BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(fsDataOutputStream, StandardCharsets.UTF_8)); bw.write(FeatureExtraction.mat2json(query_features)); bw.close(); System.out.println(PREFIX + "Query feature extraction finished..."); System.out.println(); }
/* * Returns the relative path of the dir this cache will be localized in * relative path that this cache will be localized in. For * hdfs://hostname:port/absolute_path -- the relative path is * hostname/absolute path -- if it is just /absolute_path -- then the * relative path is hostname of DFS this mapred cluster is running * on/absolute_path */ public static String makeRelative(URI cache, Configuration conf) throws IOException { String host = cache.getHost(); if (host == null) { host = cache.getScheme(); } if (host == null) { URI defaultUri = FileSystem.get(conf).getUri(); host = defaultUri.getHost(); if (host == null) { host = defaultUri.getScheme(); } } String path = host + cache.getPath(); path = path.replace(":/", "/"); // remove windows device colon return path; }
private static Path checkCacheStatusValidity( Configuration conf, URI cache, long confFileStamp, CacheStatus cacheStatus, FileStatus fileStatus, boolean isArchive) throws IOException { FileSystem fs = FileSystem.get(cache, conf); // Has to be if (!ifExistsAndFresh(conf, fs, cache, confFileStamp, cacheStatus, fileStatus)) { throw new IOException( "Stale cache file: " + cacheStatus.localizedLoadPath + " for cache-file: " + cache); } LOG.info( String.format( "Using existing cache of %s->%s", cache.toString(), cacheStatus.localizedLoadPath)); return cacheStatus.localizedLoadPath; }
private static void deleteCache(Configuration conf, MRAsyncDiskService asyncDiskService) throws IOException { List<CacheStatus> deleteSet = new LinkedList<CacheStatus>(); // try deleting cache Status with refcount of zero synchronized (cachedArchives) { for (Iterator<String> it = cachedArchives.keySet().iterator(); it.hasNext(); ) { String cacheId = (String) it.next(); CacheStatus lcacheStatus = cachedArchives.get(cacheId); if (lcacheStatus.refcount == 0) { // delete this cache entry from the global list // and mark the localized file for deletion deleteSet.add(lcacheStatus); it.remove(); } } } // do the deletion asynchronously, after releasing the global lock Thread cacheFileCleaner = new Thread(new CacheFileCleanTask(asyncDiskService, FileSystem.getLocal(conf), deleteSet)); cacheFileCleaner.start(); }
public void testFormat() throws Exception { JobConf job = new JobConf(conf); FileSystem fs = FileSystem.getLocal(conf); Path dir = new Path(System.getProperty("test.build.data", ".") + "/mapred"); Path file = new Path(dir, "test.seq"); Reporter reporter = Reporter.NULL; int seed = new Random().nextInt(); // LOG.info("seed = "+seed); Random random = new Random(seed); fs.delete(dir, true); FileInputFormat.setInputPaths(job, dir); // for a variety of lengths for (int length = 0; length < MAX_LENGTH; length += random.nextInt(MAX_LENGTH / 10) + 1) { // LOG.info("creating; entries = " + length); // create a file with length entries SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, file, IntWritable.class, BytesWritable.class); try { for (int i = 0; i < length; i++) { IntWritable key = new IntWritable(i); byte[] data = new byte[random.nextInt(10)]; random.nextBytes(data); BytesWritable value = new BytesWritable(data); writer.append(key, value); } } finally { writer.close(); } // try splitting the file in a variety of sizes InputFormat<IntWritable, BytesWritable> format = new SequenceFileInputFormat<IntWritable, BytesWritable>(); IntWritable key = new IntWritable(); BytesWritable value = new BytesWritable(); for (int i = 0; i < 3; i++) { int numSplits = random.nextInt(MAX_LENGTH / (SequenceFile.SYNC_INTERVAL / 20)) + 1; // LOG.info("splitting: requesting = " + numSplits); InputSplit[] splits = format.getSplits(job, numSplits); // LOG.info("splitting: got = " + splits.length); // check each split BitSet bits = new BitSet(length); for (int j = 0; j < splits.length; j++) { RecordReader<IntWritable, BytesWritable> reader = format.getRecordReader(splits[j], job, reporter); try { int count = 0; while (reader.next(key, value)) { // if (bits.get(key.get())) { // LOG.info("splits["+j+"]="+splits[j]+" : " + // key.get()); // LOG.info("@"+reader.getPos()); // } assertFalse("Key in multiple partitions.", bits.get(key.get())); bits.set(key.get()); count++; } // LOG.info("splits["+j+"]="+splits[j]+" count=" + // count); } finally { reader.close(); } } assertEquals("Some keys in no partition.", length, bits.cardinality()); } } }
/** * Launch application for the dag represented by this client. * * @throws YarnException * @throws IOException */ public void startApplication() throws YarnException, IOException { Class<?>[] defaultClasses; if (applicationType.equals(YARN_APPLICATION_TYPE)) { // TODO restrict the security check to only check if security is enabled for webservices. if (UserGroupInformation.isSecurityEnabled()) { defaultClasses = DATATORRENT_SECURITY_CLASSES; } else { defaultClasses = DATATORRENT_CLASSES; } } else { throw new IllegalStateException(applicationType + " is not a valid application type."); } LinkedHashSet<String> localJarFiles = findJars(dag, defaultClasses); if (resources != null) { localJarFiles.addAll(resources); } YarnClusterMetrics clusterMetrics = yarnClient.getYarnClusterMetrics(); LOG.info( "Got Cluster metric info from ASM" + ", numNodeManagers=" + clusterMetrics.getNumNodeManagers()); // GetClusterNodesRequest clusterNodesReq = Records.newRecord(GetClusterNodesRequest.class); // GetClusterNodesResponse clusterNodesResp = // rmClient.clientRM.getClusterNodes(clusterNodesReq); // LOG.info("Got Cluster node info from ASM"); // for (NodeReport node : clusterNodesResp.getNodeReports()) { // LOG.info("Got node report from ASM for" // + ", nodeId=" + node.getNodeId() // + ", nodeAddress" + node.getHttpAddress() // + ", nodeRackName" + node.getRackName() // + ", nodeNumContainers" + node.getNumContainers() // + ", nodeHealthStatus" + node.getHealthReport()); // } List<QueueUserACLInfo> listAclInfo = yarnClient.getQueueAclsInfo(); for (QueueUserACLInfo aclInfo : listAclInfo) { for (QueueACL userAcl : aclInfo.getUserAcls()) { LOG.info( "User ACL Info for Queue" + ", queueName=" + aclInfo.getQueueName() + ", userAcl=" + userAcl.name()); } } // Get a new application id YarnClientApplication newApp = yarnClient.createApplication(); appId = newApp.getNewApplicationResponse().getApplicationId(); // Dump out information about cluster capability as seen by the resource manager int maxMem = newApp.getNewApplicationResponse().getMaximumResourceCapability().getMemory(); LOG.info("Max mem capabililty of resources in this cluster " + maxMem); int amMemory = dag.getMasterMemoryMB(); if (amMemory > maxMem) { LOG.info( "AM memory specified above max threshold of cluster. Using max value." + ", specified=" + amMemory + ", max=" + maxMem); amMemory = maxMem; } if (dag.getAttributes().get(LogicalPlan.APPLICATION_ID) == null) { dag.setAttribute(LogicalPlan.APPLICATION_ID, appId.toString()); } // Create launch context for app master LOG.info("Setting up application submission context for ASM"); ApplicationSubmissionContext appContext = Records.newRecord(ApplicationSubmissionContext.class); // set the application id appContext.setApplicationId(appId); // set the application name appContext.setApplicationName(dag.getValue(LogicalPlan.APPLICATION_NAME)); appContext.setApplicationType(this.applicationType); if (YARN_APPLICATION_TYPE.equals(this.applicationType)) { // appContext.setMaxAppAttempts(1); // no retries until Stram is HA } // Set up the container launch context for the application master ContainerLaunchContext amContainer = Records.newRecord(ContainerLaunchContext.class); // Setup security tokens // If security is enabled get ResourceManager and NameNode delegation tokens. // Set these tokens on the container so that they are sent as part of application submission. // This also sets them up for renewal by ResourceManager. The NameNode delegation rmToken // is also used by ResourceManager to fetch the jars from HDFS and set them up for the // application master launch. if (UserGroupInformation.isSecurityEnabled()) { Credentials credentials = new Credentials(); String tokenRenewer = conf.get(YarnConfiguration.RM_PRINCIPAL); if (tokenRenewer == null || tokenRenewer.length() == 0) { throw new IOException("Can't get Master Kerberos principal for the RM to use as renewer"); } // For now, only getting tokens for the default file-system. FileSystem fs = StramClientUtils.newFileSystemInstance(conf); try { final Token<?> tokens[] = fs.addDelegationTokens(tokenRenewer, credentials); if (tokens != null) { for (Token<?> token : tokens) { LOG.info("Got dt for " + fs.getUri() + "; " + token); } } } finally { fs.close(); } addRMDelegationToken(tokenRenewer, credentials); DataOutputBuffer dob = new DataOutputBuffer(); credentials.writeTokenStorageToStream(dob); ByteBuffer fsTokens = ByteBuffer.wrap(dob.getData(), 0, dob.getLength()); amContainer.setTokens(fsTokens); } // set local resources for the application master // local files or archives as needed // In this scenario, the jar file for the application master is part of the local resources Map<String, LocalResource> localResources = new HashMap<String, LocalResource>(); // copy required jar files to dfs, to be localized for containers FileSystem fs = StramClientUtils.newFileSystemInstance(conf); try { Path appsBasePath = new Path(StramClientUtils.getDTDFSRootDir(fs, conf), StramClientUtils.SUBDIR_APPS); Path appPath = new Path(appsBasePath, appId.toString()); String libJarsCsv = copyFromLocal(fs, appPath, localJarFiles.toArray(new String[] {})); LOG.info("libjars: {}", libJarsCsv); dag.getAttributes().put(LogicalPlan.LIBRARY_JARS, libJarsCsv); LaunchContainerRunnable.addFilesToLocalResources( LocalResourceType.FILE, libJarsCsv, localResources, fs); if (archives != null) { String[] localFiles = archives.split(","); String archivesCsv = copyFromLocal(fs, appPath, localFiles); LOG.info("archives: {}", archivesCsv); dag.getAttributes().put(LogicalPlan.ARCHIVES, archivesCsv); LaunchContainerRunnable.addFilesToLocalResources( LocalResourceType.ARCHIVE, archivesCsv, localResources, fs); } if (files != null) { String[] localFiles = files.split(","); String filesCsv = copyFromLocal(fs, appPath, localFiles); LOG.info("files: {}", filesCsv); dag.getAttributes().put(LogicalPlan.FILES, filesCsv); LaunchContainerRunnable.addFilesToLocalResources( LocalResourceType.FILE, filesCsv, localResources, fs); } dag.getAttributes().put(LogicalPlan.APPLICATION_PATH, appPath.toString()); if (dag.getAttributes().get(OperatorContext.STORAGE_AGENT) == null) { /* which would be the most likely case */ Path checkpointPath = new Path(appPath, LogicalPlan.SUBDIR_CHECKPOINTS); // use conf client side to pickup any proxy settings from dt-site.xml dag.setAttribute( OperatorContext.STORAGE_AGENT, new FSStorageAgent(checkpointPath.toString(), conf)); } if (dag.getAttributes().get(LogicalPlan.CONTAINER_OPTS_CONFIGURATOR) == null) { dag.setAttribute( LogicalPlan.CONTAINER_OPTS_CONFIGURATOR, new BasicContainerOptConfigurator()); } // Set the log4j properties if needed if (!log4jPropFile.isEmpty()) { Path log4jSrc = new Path(log4jPropFile); Path log4jDst = new Path(appPath, "log4j.props"); fs.copyFromLocalFile(false, true, log4jSrc, log4jDst); FileStatus log4jFileStatus = fs.getFileStatus(log4jDst); LocalResource log4jRsrc = Records.newRecord(LocalResource.class); log4jRsrc.setType(LocalResourceType.FILE); log4jRsrc.setVisibility(LocalResourceVisibility.APPLICATION); log4jRsrc.setResource(ConverterUtils.getYarnUrlFromURI(log4jDst.toUri())); log4jRsrc.setTimestamp(log4jFileStatus.getModificationTime()); log4jRsrc.setSize(log4jFileStatus.getLen()); localResources.put("log4j.properties", log4jRsrc); } if (originalAppId != null) { Path origAppPath = new Path(appsBasePath, this.originalAppId); LOG.info("Restart from {}", origAppPath); copyInitialState(origAppPath); } // push logical plan to DFS location Path cfgDst = new Path(appPath, LogicalPlan.SER_FILE_NAME); FSDataOutputStream outStream = fs.create(cfgDst, true); LogicalPlan.write(this.dag, outStream); outStream.close(); Path launchConfigDst = new Path(appPath, LogicalPlan.LAUNCH_CONFIG_FILE_NAME); outStream = fs.create(launchConfigDst, true); conf.writeXml(outStream); outStream.close(); FileStatus topologyFileStatus = fs.getFileStatus(cfgDst); LocalResource topologyRsrc = Records.newRecord(LocalResource.class); topologyRsrc.setType(LocalResourceType.FILE); topologyRsrc.setVisibility(LocalResourceVisibility.APPLICATION); topologyRsrc.setResource(ConverterUtils.getYarnUrlFromURI(cfgDst.toUri())); topologyRsrc.setTimestamp(topologyFileStatus.getModificationTime()); topologyRsrc.setSize(topologyFileStatus.getLen()); localResources.put(LogicalPlan.SER_FILE_NAME, topologyRsrc); // Set local resource info into app master container launch context amContainer.setLocalResources(localResources); // Set the necessary security tokens as needed // amContainer.setContainerTokens(containerToken); // Set the env variables to be setup in the env where the application master will be run LOG.info("Set the environment for the application master"); Map<String, String> env = new HashMap<String, String>(); // Add application jar(s) location to classpath // At some point we should not be required to add // the hadoop specific classpaths to the env. // It should be provided out of the box. // For now setting all required classpaths including // the classpath to "." for the application jar(s) // including ${CLASSPATH} will duplicate the class path in app master, removing it for now // StringBuilder classPathEnv = new StringBuilder("${CLASSPATH}:./*"); StringBuilder classPathEnv = new StringBuilder("./*"); String classpath = conf.get(YarnConfiguration.YARN_APPLICATION_CLASSPATH); for (String c : StringUtils.isBlank(classpath) ? YarnConfiguration.DEFAULT_YARN_APPLICATION_CLASSPATH : classpath.split(",")) { if (c.equals("$HADOOP_CLIENT_CONF_DIR")) { // SPOI-2501 continue; } classPathEnv.append(':'); classPathEnv.append(c.trim()); } env.put("CLASSPATH", classPathEnv.toString()); // propagate to replace node managers user name (effective in non-secure mode) env.put("HADOOP_USER_NAME", UserGroupInformation.getLoginUser().getUserName()); amContainer.setEnvironment(env); // Set the necessary command to execute the application master ArrayList<CharSequence> vargs = new ArrayList<CharSequence>(30); // Set java executable command LOG.info("Setting up app master command"); vargs.add(javaCmd); if (dag.isDebug()) { vargs.add("-agentlib:jdwp=transport=dt_socket,server=y,suspend=n"); } // Set Xmx based on am memory size // default heap size 75% of total memory vargs.add("-Xmx" + (amMemory * 3 / 4) + "m"); vargs.add("-XX:+HeapDumpOnOutOfMemoryError"); vargs.add("-XX:HeapDumpPath=/tmp/dt-heap-" + appId.getId() + ".bin"); vargs.add("-Dhadoop.root.logger=" + (dag.isDebug() ? "DEBUG" : "INFO") + ",RFA"); vargs.add("-Dhadoop.log.dir=" + ApplicationConstants.LOG_DIR_EXPANSION_VAR); vargs.add(String.format("-D%s=%s", StreamingContainer.PROP_APP_PATH, dag.assertAppPath())); if (dag.isDebug()) { vargs.add("-Dlog4j.debug=true"); } String loggersLevel = conf.get(DTLoggerFactory.DT_LOGGERS_LEVEL); if (loggersLevel != null) { vargs.add(String.format("-D%s=%s", DTLoggerFactory.DT_LOGGERS_LEVEL, loggersLevel)); } vargs.add(StreamingAppMaster.class.getName()); vargs.add("1>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/AppMaster.stdout"); vargs.add("2>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/AppMaster.stderr"); // Get final command StringBuilder command = new StringBuilder(9 * vargs.size()); for (CharSequence str : vargs) { command.append(str).append(" "); } LOG.info("Completed setting up app master command " + command.toString()); List<String> commands = new ArrayList<String>(); commands.add(command.toString()); amContainer.setCommands(commands); // Set up resource type requirements // For now, only memory is supported so we set memory requirements Resource capability = Records.newRecord(Resource.class); capability.setMemory(amMemory); appContext.setResource(capability); // Service data is a binary blob that can be passed to the application // Not needed in this scenario // amContainer.setServiceData(serviceData); appContext.setAMContainerSpec(amContainer); // Set the priority for the application master Priority pri = Records.newRecord(Priority.class); pri.setPriority(amPriority); appContext.setPriority(pri); // Set the queue to which this application is to be submitted in the RM appContext.setQueue(queueName); // Submit the application to the applications manager // SubmitApplicationResponse submitResp = rmClient.submitApplication(appRequest); // Ignore the response as either a valid response object is returned on success // or an exception thrown to denote some form of a failure String specStr = Objects.toStringHelper("Submitting application: ") .add("name", appContext.getApplicationName()) .add("queue", appContext.getQueue()) .add("user", UserGroupInformation.getLoginUser()) .add("resource", appContext.getResource()) .toString(); LOG.info(specStr); if (dag.isDebug()) { // LOG.info("Full submission context: " + appContext); } yarnClient.submitApplication(appContext); } finally { fs.close(); } }
public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException { if (args.length != 2) { System.out.println("Usage: FeatureMatching ID <inputName.jpeg/inputName.jpg>"); System.exit(1); } SimpleDateFormat sdf = new SimpleDateFormat("", Locale.US); sdf.applyPattern("yyyy-MM-dd_HH-mm-ss"); String time = sdf.format(new Date()); Job job = Job.getInstance(); ID = "/" + args[0]; String filename = args[1]; filename = filename.toLowerCase(); System.out.println("current filename:" + filename); // Detect illegal username (if the username dir doesn't exist) File userPath = new File(LOCAL_USER_DIR + ID); if (!userPath.exists()) { System.out.println("Unauthorized username!!!\nExiting......"); System.exit(1); } // Preprocess the input image.jpg from local dir: /local.../user/ID/input/image.jpg // Save the features to local dir: hdfs://.../user/ID/input/image.jpg extractQueryFeatures2HDFS(filename, job); // Add the feature file to the hdfs cache String featureFileName = filename.substring(0, filename.lastIndexOf(".")) + ".json"; // job.addCacheFile(new Path(HDFS_HOME + USER + ID + INPUT + "/" + // featureFileName).toUri()); job.getConfiguration() .set("featureFilePath", HDFS_HOME + USER + ID + INPUT + "/" + featureFileName); // Check the file type. Only support jpeg/jpg type images String type = filename.substring(args[1].lastIndexOf(".")); if (!(type.equals(".jpg") || type.equals(".jpeg"))) { System.out.println("Image type not supported!!!\nExiting"); System.exit(1); } // Input: hdfs://.../features/ // The feature dir is a location of all features extracted from the database String inputPathStr = HDFS_HOME + FEATURES; // Output: hdfs://.../user/ID/output/ String outputPathStr = HDFS_HOME + USER + ID + OUTPUT + "/" + time; job.setInputFormatClass(KeyValueTextInputFormat.class); // job.setOutputFormatClass(TextOutputFormat.class); // Get the lists of all feature files: /.../features/data/part-* FileSystem fs = FileSystem.get(job.getConfiguration()); FileStatus[] statuses = fs.listStatus(new Path(inputPathStr)); StringBuffer sb = new StringBuffer(); for (FileStatus fileStatus : statuses) { sb.append(fileStatus.getPath() + ","); } sb.deleteCharAt(sb.lastIndexOf(",")); job.setJarByClass(FeatureMatching.class); job.setMapperClass(FeatureMatchMapper.class); job.setReducerClass(FeatureMatchReducer.class); // only need one reducer to collect the result job.setNumReduceTasks(1); job.setMapOutputKeyClass(IntWritable.class); job.setMapOutputValueClass(Text.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(DoubleWritable.class); // Input a directory, so need the recursive input FileInputFormat.setInputDirRecursive(job, true); // Set the PathFilter, to omit _SUCCESS files // (This is not working correctly, as the PathFilter class is an interface rather than a class. // But the 2nd arg asks me to extend the PathFilter) // FileInputFormat.setInputPathFilter(job, MyPathFilter.class); // // FileInputFormat.setInputPaths(job, new Path(inputPathStr)); FileInputFormat.setInputPaths(job, sb.toString()); FileOutputFormat.setOutputPath(job, new Path(outputPathStr)); boolean success = job.waitForCompletion(true); return success ? 0 : 1; }