@Override
public List<HdfsEntity> list(String path) throws IOException {
  FileStatus[] fileStatuses = fileSystem.listStatus(new Path(path));
  List<HdfsEntity> entities = new ArrayList<HdfsEntity>();
  for (FileStatus fileStatus : fileStatuses) {
    HdfsEntity entity = new HdfsEntity();
    entity.setDirectory(fileStatus.isDirectory());
    entity.setPath(fileStatus.getPath().toUri().getPath());
    entity.setModifiedAt(new Date(fileStatus.getModificationTime()));
    entity.setSize(fileStatus.getLen());
    if (fileStatus.isDirectory()) {
      Integer length = 0;
      try {
        FileStatus[] contents = fileSystem.listStatus(fileStatus.getPath());
        length = contents.length;
      } catch (org.apache.hadoop.security.AccessControlException exception) {
        // Leave the content count at 0 for directories we are not allowed to list.
      }
      entity.setContentCount(length);
    }
    entities.add(entity);
  }
  return entities;
}
/**
 * Test that {@link HFileOutputFormat2} creates an HFile with TIMERANGE
 * metadata used by time-restricted scans.
 */
@Test
public void test_TIMERANGE() throws Exception {
  Configuration conf = new Configuration(this.util.getConfiguration());
  RecordWriter<ImmutableBytesWritable, Cell> writer = null;
  TaskAttemptContext context = null;
  Path dir = util.getDataTestDir("test_TIMERANGE_present");
  LOG.info("Timerange dir writing to dir: " + dir);
  try {
    // build a record writer using HFileOutputFormat2
    Job job = new Job(conf);
    FileOutputFormat.setOutputPath(job, dir);
    context = createTestTaskAttemptContext(job);
    HFileOutputFormat2 hof = new HFileOutputFormat2();
    writer = hof.getRecordWriter(context);

    // Pass two key values with explicit timestamps
    final byte[] b = Bytes.toBytes("b");

    // value 1 with timestamp 2000
    KeyValue kv = new KeyValue(b, b, b, 2000, b);
    KeyValue original = kv.clone();
    writer.write(new ImmutableBytesWritable(), kv);
    assertEquals(original, kv);

    // value 2 with timestamp 1000
    kv = new KeyValue(b, b, b, 1000, b);
    original = kv.clone();
    writer.write(new ImmutableBytesWritable(), kv);
    assertEquals(original, kv);

    // verify that the file has the proper FileInfo.
    writer.close(context);

    // the generated file lives 1 directory down from the attempt directory
    // and is the only file, e.g.
    // _attempt__0000_r_000000_0/b/1979617994050536795
    FileSystem fs = FileSystem.get(conf);
    Path attemptDirectory = hof.getDefaultWorkFile(context, "").getParent();
    FileStatus[] sub1 = fs.listStatus(attemptDirectory);
    FileStatus[] file = fs.listStatus(sub1[0].getPath());

    // open as HFile Reader and pull out TIMERANGE FileInfo.
    HFile.Reader rd =
        HFile.createReader(fs, file[0].getPath(), new CacheConfig(conf), conf);
    Map<byte[], byte[]> finfo = rd.loadFileInfo();
    byte[] range = finfo.get("TIMERANGE".getBytes());
    assertNotNull(range);

    // unmarshall and check values.
    TimeRangeTracker timeRangeTracker = new TimeRangeTracker();
    Writables.copyWritable(range, timeRangeTracker);
    LOG.info(
        timeRangeTracker.getMinimumTimestamp() + "...." + timeRangeTracker.getMaximumTimestamp());
    assertEquals(1000, timeRangeTracker.getMinimumTimestamp());
    assertEquals(2000, timeRangeTracker.getMaximumTimestamp());
    rd.close();
  } finally {
    if (writer != null && context != null) writer.close(context);
    dir.getFileSystem(conf).delete(dir, true);
  }
}
/**
 * Runs through the hbase rootdir and checks all stores have only one file in them -- that is,
 * they've been major compacted. Looks at root and meta tables too.
 *
 * @param fs filesystem to use
 * @param hbaseRootDir hbase root directory
 * @return True if this hbase install is major compacted.
 * @throws IOException
 */
public static boolean isMajorCompacted(final FileSystem fs, final Path hbaseRootDir)
    throws IOException {
  // Presumes any directory under hbase.rootdir is a table.
  FileStatus[] tableDirs = fs.listStatus(hbaseRootDir, new DirFilter(fs));
  for (int i = 0; i < tableDirs.length; i++) {
    // Skip the .log directory. All others should be tables. Inside a table,
    // there are compaction.dir directories to skip. Otherwise, all else
    // should be regions. Then in each region, should only be family
    // directories. Under each of these, should be one file only.
    Path d = tableDirs[i].getPath();
    if (d.getName().equals(HConstants.HREGION_LOGDIR_NAME)) {
      continue;
    }
    FileStatus[] regionDirs = fs.listStatus(d, new DirFilter(fs));
    for (int j = 0; j < regionDirs.length; j++) {
      Path dd = regionDirs[j].getPath();
      if (dd.getName().equals(HConstants.HREGION_COMPACTIONDIR_NAME)) {
        continue;
      }
      // Else it's a region name. Now look in region for families.
      FileStatus[] familyDirs = fs.listStatus(dd, new DirFilter(fs));
      for (int k = 0; k < familyDirs.length; k++) {
        Path family = familyDirs[k].getPath();
        // Now in family make sure only one file.
        FileStatus[] familyStatus = fs.listStatus(family);
        if (familyStatus.length > 1) {
          LOG.debug(family.toString() + " has " + familyStatus.length + " files.");
          return false;
        }
      }
    }
  }
  return true;
}
/**
 * Runs through the hbase rootdir and checks all stores have only one file in them -- that is,
 * they've been major compacted. Looks at root and meta tables too. This version differs from
 * {@link #isMajorCompacted(FileSystem, Path)} in that it expects a pre-0.20.0 hbase layout on the
 * filesystem. Used when migrating.
 *
 * @param fs filesystem to use
 * @param hbaseRootDir hbase root directory
 * @return True if this hbase install is major compacted.
 * @throws IOException
 */
public static boolean isMajorCompactedPre020(final FileSystem fs, final Path hbaseRootDir)
    throws IOException {
  // Presumes any directory under hbase.rootdir is a table.
  FileStatus[] tableDirs = fs.listStatus(hbaseRootDir, new DirFilter(fs));
  for (int i = 0; i < tableDirs.length; i++) {
    // Inside a table, there are compaction.dir directories to skip.
    // Otherwise, all else should be regions. Then in each region, should
    // only be family directories. Under each of these, should be a mapfile
    // and info directory and in these only one file.
    Path d = tableDirs[i].getPath();
    if (d.getName().equals(HConstants.HREGION_LOGDIR_NAME)) {
      continue;
    }
    FileStatus[] regionDirs = fs.listStatus(d, new DirFilter(fs));
    for (int j = 0; j < regionDirs.length; j++) {
      Path dd = regionDirs[j].getPath();
      if (dd.getName().equals(HConstants.HREGION_COMPACTIONDIR_NAME)) {
        continue;
      }
      // Else it's a region name. Now look in region for families.
      FileStatus[] familyDirs = fs.listStatus(dd, new DirFilter(fs));
      for (int k = 0; k < familyDirs.length; k++) {
        Path family = familyDirs[k].getPath();
        FileStatus[] infoAndMapfile = fs.listStatus(family);
        // Assert that only info and mapfile in family dir.
        if (infoAndMapfile.length != 0 && infoAndMapfile.length != 2) {
          LOG.debug(
              family.toString()
                  + " has more than just info and mapfile: "
                  + infoAndMapfile.length);
          return false;
        }
        // Make sure directory named info or mapfile.
        for (int ll = 0; ll < 2; ll++) {
          if (infoAndMapfile[ll].getPath().getName().equals("info")
              || infoAndMapfile[ll].getPath().getName().equals("mapfiles")) continue;
          LOG.debug("Unexpected directory name: " + infoAndMapfile[ll].getPath());
          return false;
        }
        // Now in family, there are 'mapfile' and 'info' subdirs. Just
        // look in the 'mapfile' subdir.
        FileStatus[] familyStatus = fs.listStatus(new Path(family, "mapfiles"));
        if (familyStatus.length > 1) {
          LOG.debug(family.toString() + " has " + familyStatus.length + " files.");
          return false;
        }
      }
    }
  }
  return true;
}
/**
 * Returns all files belonging to the given region directory. Could return an empty list.
 *
 * @param fs The file system reference.
 * @param regionDir The region directory to scan.
 * @return The list of files found.
 * @throws IOException When scanning the files fails.
 */
static List<Path> getStoreFiles(FileSystem fs, Path regionDir) throws IOException {
  List<Path> res = new ArrayList<Path>();
  PathFilter dirFilter = new FSUtils.DirFilter(fs);
  FileStatus[] familyDirs = fs.listStatus(regionDir, dirFilter);
  for (FileStatus dir : familyDirs) {
    FileStatus[] files = fs.listStatus(dir.getPath());
    for (FileStatus file : files) {
      if (!file.isDir()) {
        res.add(file.getPath());
      }
    }
  }
  return res;
}
@Test
public void testListEmptyRootDirectory() throws IOException {
  // extra sanity checks here to avoid support calls about complete loss of data
  skipIfUnsupported(TEST_ROOT_TESTS_ENABLED);
  FileSystem fs = getFileSystem();
  Path root = new Path("/");
  FileStatus[] statuses = fs.listStatus(root);
  for (FileStatus status : statuses) {
    ContractTestUtils.assertDeleted(fs, status.getPath(), true);
  }
  assertEquals(
      "listStatus on empty root-directory returned a non-empty list",
      0,
      fs.listStatus(root).length);
}
private static void addFolder2(
    FileSystem fs, Path p, ArrayList<String> keys, ArrayList<String> failed) {
  try {
    if (fs == null) return;
    Futures futures = new Futures();
    for (FileStatus file : fs.listStatus(p)) {
      Path pfs = file.getPath();
      if (file.isDir()) {
        addFolder2(fs, pfs, keys, failed);
      } else {
        long size = file.getLen();
        Key res;
        if (pfs.getName().endsWith(Extensions.JSON)) {
          throw H2O.unimpl();
        } else if (pfs.getName().endsWith(Extensions.HEX)) { // Hex file?
          throw H2O.unimpl();
        } else {
          Key k = null;
          keys.add((k = HdfsFileVec.make(file, futures)).toString());
          Log.info("PersistHdfs: DKV.put(" + k + ")");
        }
      }
    }
  } catch (Exception e) {
    Log.err(e);
    failed.add(p.toString());
  }
}
private int readFile() throws IllegalArgumentException, IOException {
  int count = 0;
  final FileSystem fs = FileSystem.get(MapReduceTestUtils.getConfiguration());
  final FileStatus[] fss =
      fs.listStatus(
          new Path(
              TestUtils.TEMP_DIR
                  + File.separator
                  + MapReduceTestEnvironment.HDFS_BASE_DIRECTORY
                  + "/t1/pairs"));
  for (final FileStatus ifs : fss) {
    if (ifs.isFile() && ifs.getPath().toString().matches(".*part-r-0000[0-9]")) {
      try (SequenceFile.Reader reader =
          new SequenceFile.Reader(
              MapReduceTestUtils.getConfiguration(), Reader.file(ifs.getPath()))) {
        final Text key = new Text();
        final Text val = new Text();
        while (reader.next(key, val)) {
          count++;
          System.err.println(key + "\t" + val);
        }
      }
    }
  }
  return count;
}
/** Returns the next partition number to use when appending data files to targetDir, i.e. one past the greatest partition number already present. */
private int getNextPartition(FileSystem fs, Path targetDir) throws IOException {
  int nextPartition = 0;
  FileStatus[] existingFiles = fs.listStatus(targetDir);
  if (existingFiles != null && existingFiles.length > 0) {
    Pattern patt = Pattern.compile("part.*-([0-9][0-9][0-9][0-9][0-9]).*");
    for (FileStatus fileStat : existingFiles) {
      if (!fileStat.isDir()) {
        String filename = fileStat.getPath().getName();
        Matcher mat = patt.matcher(filename);
        if (mat.matches()) {
          int thisPart = Integer.parseInt(mat.group(1));
          if (thisPart >= nextPartition) {
            nextPartition = thisPart;
            nextPartition++;
          }
        }
      }
    }
  }
  if (nextPartition > 0) {
    LOG.info("Using found partition " + nextPartition);
  }
  return nextPartition;
}
public static void run(Configuration conf, Path input, String outputFile)
    throws IOException, InstantiationException, IllegalAccessException {
  Writer writer;
  if (outputFile == null) {
    writer = new OutputStreamWriter(System.out);
  } else {
    writer =
        new OutputStreamWriter(
            new FileOutputStream(new File(outputFile)), Charset.forName("UTF-8"));
  }
  try {
    FileSystem fs = input.getFileSystem(conf);
    for (FileStatus fst : fs.listStatus(input, new DataPathFilter())) {
      Path dataPath = fst.getPath();
      SequenceFile.Reader reader = new SequenceFile.Reader(fs, dataPath, conf);
      try {
        Text key = reader.getKeyClass().asSubclass(Text.class).newInstance();
        DocumentMapping value = new DocumentMapping();
        while (reader.next(key, value)) {
          String docId = value.getDocId();
          writer.write(docId + "\t" + key + "\n");
        }
      } finally {
        reader.close();
      }
    }
  } finally {
    writer.close();
  }
}
/**
 * Override this so that we don't set the targetTestRoot to any path under the root of the FS, and
 * so that we don't try to delete the test dir, but rather only its contents.
 */
@Override
void initializeTargetTestRoot() throws IOException {
  targetTestRoot = fHdfs.makeQualified(new Path("/"));
  for (FileStatus status : fHdfs.listStatus(targetTestRoot)) {
    fHdfs.delete(status.getPath(), true);
  }
}
public static void main(String[] args)
    throws IOException, InterruptedException, ClassNotFoundException {
  JobConf conf = new JobConf();
  conf.setNumMapTasks(1);
  conf.setNumReduceTasks(5);
  FileSystem fs = FileSystem.get(conf);
  Path dir = new Path(args[0]);
  FileStatus[] stats = fs.listStatus(dir);
  numFiles = stats.length;
  Job job = new Job(conf);
  job.setJarByClass(FileCombiner.class);
  job.setJobName("File Combiner");
  job.setMapperClass(FileCombinerMapper.class);
  job.setReducerClass(FileCombinerReducer.class);
  job.setInputFormatClass(TextInputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);
  // LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);
  job.setMapOutputKeyClass(IntWritable.class);
  job.setMapOutputValueClass(Text.class);
  job.setOutputKeyClass(IntWritable.class);
  job.setOutputValueClass(Text.class);
  FileInputFormat.setInputPaths(job, new Path(args[0]));
  FileOutputFormat.setOutputPath(job, new Path(args[1]));
  job.waitForCompletion(true);
}
/**
 * Called after the MapReduce job has completed, to verify that the outputs generated by the
 * MapReduce job align with the expected outputs that were set with calls to {@link
 * #addExpectedOutput(String)} and {@link #addExpectedOutput(String...)}.
 *
 * @return a reference to this object
 * @throws IOException if something goes wrong
 */
public TextIOJobBuilder verifyResults() throws IOException {
  FileStatus[] outputFiles =
      fs.listStatus(
          outputPath,
          new PathFilter() {
            @Override
            public boolean accept(final Path path) {
              return path.getName().startsWith("part");
            }
          });
  System.out.println("Output files: " + StringUtils.join(outputFiles));
  int i = 0;
  for (FileStatus file : outputFiles) {
    List<String> actualLines = FileUtils.readLines(fs, file.getPath());
    for (String actualLine : actualLines) {
      String expectedLine = expectedOutputs.get(i++);
      assertEquals(expectedLine, actualLine);
    }
  }
  assertEquals(expectedOutputs.size(), i);
  return this;
}
public static Path[] getInputPaths(String rootPath) {
  try {
    Configuration conf = HBaseConfiguration.create();
    Path root = new Path(rootPath);
    ArrayList<Path> paths = new ArrayList<Path>();
    FileSystem fs = root.getFileSystem(conf);
    LinkedList<Path> list = new LinkedList<Path>();
    list.push(root);
    if (!fs.exists(root)) {
      System.out.println("path does not exist: " + root.toString());
      return new Path[0];
    }
    while (!list.isEmpty()) {
      Path path = list.pop();
      if (fs.isFile(path)) {
        if (path.getName().matches("^.*part-r-\\d{5}.*$")) {
          paths.add(path);
          System.out.println("something is wrong with path" + path.toString());
        }
      } else {
        FileStatus[] statuses = fs.listStatus(path);
        for (FileStatus status : statuses) {
          if (status.isDir()) {
            list.add(status.getPath());
          } else if (status.getPath().getName().matches("^.*part-r-\\d{5}.*$")) {
            paths.add(status.getPath());
          }
        }
      }
    }
    return paths.toArray(new Path[paths.size()]);
  } catch (IOException ignored) {
    return new Path[0];
  }
}
private void loadRMDTSecretManagerState(RMState rmState) throws Exception {
  FileStatus[] childNodes = fs.listStatus(rmDTSecretManagerRoot);
  for (FileStatus childNodeStatus : childNodes) {
    assert childNodeStatus.isFile();
    String childNodeName = childNodeStatus.getPath().getName();
    if (childNodeName.startsWith(DELEGATION_TOKEN_SEQUENCE_NUMBER_PREFIX)) {
      rmState.rmSecretManagerState.dtSequenceNumber =
          Integer.parseInt(childNodeName.split("_")[1]);
      continue;
    }
    Path childNodePath = getNodePath(rmDTSecretManagerRoot, childNodeName);
    byte[] childData = readFile(childNodePath, childNodeStatus.getLen());
    ByteArrayInputStream is = new ByteArrayInputStream(childData);
    DataInputStream fsIn = new DataInputStream(is);
    if (childNodeName.startsWith(DELEGATION_KEY_PREFIX)) {
      DelegationKey key = new DelegationKey();
      key.readFields(fsIn);
      rmState.rmSecretManagerState.masterKeyState.add(key);
    } else if (childNodeName.startsWith(DELEGATION_TOKEN_PREFIX)) {
      RMDelegationTokenIdentifier identifier = new RMDelegationTokenIdentifier();
      identifier.readFields(fsIn);
      long renewDate = fsIn.readLong();
      rmState.rmSecretManagerState.delegationTokenState.put(identifier, renewDate);
    } else {
      LOG.warn("Unknown file for recovering RMDelegationTokenSecretManager");
    }
    fsIn.close();
  }
}
public static void runJob(String... args) throws Exception {
  Path smallFilePath = new Path(args[0]);
  Configuration conf = new Configuration();
  FileSystem fs = smallFilePath.getFileSystem(conf);
  FileStatus smallFilePathStatus = fs.getFileStatus(smallFilePath);
  if (smallFilePathStatus.isDir()) {
    for (FileStatus f : fs.listStatus(smallFilePath)) {
      if (f.getPath().getName().contains(args[0])) {
        DistributedCache.addCacheFile(f.getPath().toUri(), conf);
      }
    }
  } else {
    DistributedCache.addCacheFile(smallFilePath.toUri(), conf);
  }
  Path inputPath = new Path(args[1]);
  Path outputPath = new Path(args[2]);
  Job job = new Job(conf);
  job.setJarByClass(Main.class);
  job.setMapperClass(GenericReplicatedJoin.class);
  job.setInputFormatClass(KeyValueTextInputFormat.class);
  job.setNumReduceTasks(0);
  outputPath.getFileSystem(conf).delete(outputPath, true);
  FileInputFormat.setInputPaths(job, inputPath);
  FileOutputFormat.setOutputPath(job, outputPath);
  job.waitForCompletion(true);
}
@Test
public void testAvroOut() {
  String type = "one";
  AvroOutputFormat<String> avroOut = new AvroOutputFormat<String>(String.class);
  org.apache.hadoop.fs.Path result = new org.apache.hadoop.fs.Path(hdfsURI + "/avroTest");
  avroOut.setOutputFilePath(new Path(result.toString()));
  avroOut.setWriteMode(FileSystem.WriteMode.NO_OVERWRITE);
  avroOut.setOutputDirectoryMode(FileOutputFormat.OutputDirectoryMode.ALWAYS);
  try {
    avroOut.open(0, 2);
    avroOut.writeRecord(type);
    avroOut.close();
    avroOut.open(1, 2);
    avroOut.writeRecord(type);
    avroOut.close();
    Assert.assertTrue("No result file present", hdfs.exists(result));
    FileStatus[] files = hdfs.listStatus(result);
    Assert.assertEquals(2, files.length);
    for (FileStatus file : files) {
      Assert.assertTrue(
          "1.avro".equals(file.getPath().getName()) || "2.avro".equals(file.getPath().getName()));
    }
  } catch (IOException e) {
    e.printStackTrace();
    Assert.fail(e.getMessage());
  }
}
/**
 * Return a list of all urls matching this input. If autocomplete is false, the list contains only
 * 1 element (same as getUrl()). Otherwise, it will try to return all the files beginning with
 * what is returned by getUrl().
 *
 * @param jobConf A Configuration object
 * @return the set of input urls
 */
public HashSet<URI> getAllUrls(Configuration jobConf) {
  HashSet<URI> urls = new HashSet<URI>();
  if (!isAutoComplete()) {
    urls.add(url);
  } else {
    Path basePath = new Path(url);
    String filePrefix = basePath.getName();
    try {
      FileSystem fs = basePath.getFileSystem(jobConf);
      if (!fs.exists(basePath.getParent())) {
        throw new IOException("Input directory not found: " + url);
      }
      FileStatus[] stats = fs.listStatus(basePath.getParent());
      for (int i = 0; i < stats.length; i++) {
        Path path = stats[i].getPath();
        if (fs.isFile(path) && path.getName().startsWith(filePrefix)) urls.add(path.toUri());
      }
    } catch (IOException e) {
      System.err.println("Unable to autocomplete input file");
      e.printStackTrace();
      System.exit(1);
    }
  }
  return urls;
}
protected void cleanup() throws Exception {
  try {
    int rotateInterval =
        conf.getInt("chukwaCollector.rotateInterval", 1000 * 60 * 5); // defaults to 5 minutes
    Path pLocalOutputDir = new Path(localOutputDir);
    FileStatus[] files = localFs.listStatus(pLocalOutputDir);
    String fileName = null;
    for (FileStatus file : files) {
      fileName = file.getPath().getName();
      if (fileName.endsWith(".done")) {
        moveFile(localOutputDir + fileName);
      } else if (fileName.endsWith(".chukwa")) {
        long lastPeriod = System.currentTimeMillis() - rotateInterval - (2 * 60 * 1000);
        if (file.getModificationTime() < lastPeriod) {
          log.info("Moving .chukwa file over, " + localOutputDir + fileName);
          moveFile(localOutputDir + fileName);
        }
      }
    }
  } catch (Exception e) {
    log.warn("Cannot copy to the remote HDFS", e);
    throw e;
  }
}
public static FileStatus[] listStatus(FileSystem fs, Path path, PathFilter filter)
    throws IOException {
  try {
    return fs.listStatus(path, filter);
  } catch (FileNotFoundException e) {
    // Treat a missing directory as empty rather than propagating the exception.
    return new FileStatus[0];
  }
}
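// A minimal usage sketch for the defensive listStatus wrapper above. The method name
// countPartFiles and the assumption that it sits in the same utility class as the wrapper are
// illustrative only; the Hadoop FileSystem, Path, PathFilter and FileStatus APIs are real.
public static int countPartFiles(FileSystem fs, Path outputDir) throws IOException {
  // Because the wrapper maps FileNotFoundException to an empty array, callers can
  // iterate the result without first checking that the directory exists.
  FileStatus[] parts =
      listStatus(
          fs,
          outputDir,
          new PathFilter() {
            @Override
            public boolean accept(Path path) {
              return path.getName().startsWith("part-");
            }
          });
  return parts.length;
}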
private void loadDocumentIndex(String documentIndexPath) throws IOException {
  if (documentIndex == null) {
    documentIndex = new HashMap<String, Integer>();
    Path p = new Path(documentIndexPath);
    FileSystem fs = FileSystem.get(p.toUri(), new Configuration());
    int index = 0;
    for (FileStatus status : fs.listStatus(p)) {
      Path currPath = status.getPath();
      if (!status.isDir() && !currPath.getName().startsWith("_")) {
        BufferedReader reader = null;
        try {
          reader = new BufferedReader(new InputStreamReader(fs.open(currPath)));
          String line = null;
          while ((line = reader.readLine()) != null) {
            documentIndex.put(line.trim(), index++);
          }
        } finally {
          if (reader != null) {
            reader.close();
          }
        }
      }
    }
    log.info("Loaded document index with size: " + documentIndex.size());
  }
}
/**
 * Given a filesystem and path to a node, gets all the files which belong to a partition, replica
 * type and chunk id
 *
 * <p>Works only for {@link ReadOnlyStorageFormat#READONLY_V2}
 *
 * @param fs Underlying filesystem
 * @param path The node directory path
 * @param partitionId The partition id for which we get the files
 * @param replicaType The replica type
 * @param chunkId The chunk id
 * @return Returns list of files of this partition, replicaType, chunkId
 * @throws IOException
 */
public static FileStatus[] getDataChunkFiles(
    FileSystem fs, Path path, final int partitionId, final int replicaType, final int chunkId)
    throws IOException {
  return fs.listStatus(
      path,
      new PathFilter() {
        public boolean accept(Path input) {
          return input
              .getName()
              .matches(
                  "^"
                      + Integer.toString(partitionId)
                      + "_"
                      + Integer.toString(replicaType)
                      + "_"
                      + Integer.toString(chunkId)
                      + "\\.data");
        }
      });
}
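// A minimal usage sketch for getDataChunkFiles above, assuming it is visible from the caller.
// The method name printChunkFiles and the partition/replica/chunk ids chosen here are
// hypothetical; only the Hadoop FileSystem, Path and FileStatus types are real.
private static void printChunkFiles(FileSystem fs, Path nodeDir) throws IOException {
  // Lists files named "0_0_0.data" under nodeDir, i.e. partition 0, replica type 0, chunk 0.
  FileStatus[] chunkFiles = getDataChunkFiles(fs, nodeDir, 0, 0, 0);
  for (FileStatus chunk : chunkFiles) {
    System.out.println("Found data chunk file: " + chunk.getPath());
  }
}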
/**
 * Check duplicated tweet IDs in <b>tweetIdDir</b>, and output the duplicates to stdout.
 *
 * @param tweetIdDir directory containing the tweet ID files
 * @throws Exception
 */
public static void checkTidDuplicates(String tweetIdDir) throws Exception {
  // First change path strings to URI strings starting with 'file:' or 'hdfs:'
  tweetIdDir = MultiFileFolderWriter.getUriStrForPath(tweetIdDir);
  Set<String> tidSet = new HashSet<String>();
  Configuration conf = HBaseConfiguration.create();
  FileSystem fs = FileSystem.get(new URI(tweetIdDir), conf);
  int dupCount = 0;
  for (FileStatus srcFileStatus : fs.listStatus(new Path(tweetIdDir))) {
    String srcFileName = srcFileStatus.getPath().getName();
    if (srcFileName.endsWith(".txt") && srcFileName.contains("tweetIds")) {
      BufferedReader brTid =
          new BufferedReader(new InputStreamReader(fs.open(srcFileStatus.getPath())));
      String tid = brTid.readLine();
      while (tid != null) {
        if (tidSet.contains(tid)) {
          System.out.println("Duplicated tweet ID: " + tid);
          dupCount++;
        } else {
          tidSet.add(tid);
        }
        tid = brTid.readLine();
      }
      brTid.close();
    }
  }
  System.out.println(
      "Number of unique tweet IDs: " + tidSet.size() + ", number of duplicates: " + dupCount);
}
public int run(String[] args) throws Exception {
  if (args.length < 1) {
    System.out.println("missing input");
    System.exit(1);
  }
  Configuration conf = new Configuration();
  Job job = new Job(new JobConf(conf));
  job.setJarByClass(InstallApp.class);
  job.setMapperClass(InstallMapper.class);
  job.setReducerClass(InstalleReduce.class);
  job.setJobName("installtaotaosou");
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(IntWritable.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  FileSystem fs = FileSystem.get(URI.create(args[0]), conf);
  FileStatus[] fileList = fs.listStatus(new Path(args[0]));
  int length = fileList.length;
  for (int i = 0; i < length; i++) {
    FileInputFormat.addInputPath(job, fileList[i].getPath());
  }
  FileOutputFormat.setOutputPath(job, new Path(args[1]));
  job.setNumReduceTasks(1);
  job.waitForCompletion(true);
  return 0;
}
public static void readHiveResult(String path, OutputStreamWriter outStream, Configuration conf)
    throws IOException {
  FileSystem fs = FileSystem.get(conf);
  Path dir = new Path(path);
  if (!fs.exists(dir)) {
    throw new IOException("cannot find path: " + path);
  }
  FileStatus[] filelist = fs.listStatus(dir);
  long bytesRead = 0L;
  long maxsize = 1024L * 1024 * 1024 * 10;
  for (FileStatus f : filelist) {
    if (!f.isDir() && !f.getPath().getName().startsWith("_")) {
      FSDataInputStream in = fs.open(f.getPath());
      BufferedReader bf = new BufferedReader(new InputStreamReader(in));
      String line;
      while ((line = bf.readLine()) != null) {
        bytesRead += line.getBytes().length;
        outStream.write(line.replaceAll("\001", ",").replaceAll("\t", ","));
        outStream.write("\r\n");
        if (bytesRead >= maxsize) {
          bf.close();
          in.close();
          return;
        }
      }
      bf.close();
      in.close();
    }
  }
}
/** determines which files have failed for a given job */
private Set<String> getFailedFiles(Job job) throws IOException {
  Set<String> failedFiles = new HashSet<String>();
  Path outDir = SequenceFileOutputFormat.getOutputPath(job);
  FileSystem fs = outDir.getFileSystem(getConf());
  if (!fs.getFileStatus(outDir).isDir()) {
    throw new IOException(outDir.toString() + " is not a directory");
  }
  FileStatus[] files = fs.listStatus(outDir);
  for (FileStatus f : files) {
    Path fPath = f.getPath();
    if ((!f.isDir()) && (fPath.getName().startsWith(PART_PREFIX))) {
      LOG.info("opening " + fPath.toString());
      SequenceFile.Reader reader = new SequenceFile.Reader(fs, fPath, getConf());
      Text key = new Text();
      Text value = new Text();
      while (reader.next(key, value)) {
        failedFiles.add(key.toString());
      }
      reader.close();
    }
  }
  return failedFiles;
}
private double[] getSparkModelInfoFromHDFS(Path location, Configuration conf) throws Exception {
  FileSystem fileSystem = FileSystem.get(location.toUri(), conf);
  FileStatus[] files = fileSystem.listStatus(location);
  if (files == null) throw new Exception("Couldn't find Spark Truck ML weights at: " + location);
  ArrayList<Double> modelInfo = new ArrayList<Double>();
  for (FileStatus file : files) {
    if (file.getPath().getName().startsWith("_")) {
      continue;
    }
    InputStream stream = fileSystem.open(file.getPath());
    StringWriter writer = new StringWriter();
    IOUtils.copy(stream, writer, "UTF-8");
    String raw = writer.toString();
    for (String str : raw.split("\n")) {
      modelInfo.add(Double.valueOf(str));
    }
  }
  return Doubles.toArray(modelInfo);
}
/** debugging TODO remove */
private void readOutputFiles(String jobName, Path outDir) throws IOException {
  FileSystem fs = outDir.getFileSystem(getConf());
  if (!fs.getFileStatus(outDir).isDir()) {
    throw new IOException(outDir.toString() + " is not a directory");
  }
  FileStatus[] files = fs.listStatus(outDir);
  for (FileStatus f : files) {
    Path fPath = f.getPath();
    if ((!f.isDir()) && (fPath.getName().startsWith(PART_PREFIX))) {
      LOG.info("opening " + fPath.toString());
      SequenceFile.Reader reader = new SequenceFile.Reader(fs, fPath, getConf());
      Text key = new Text();
      Text value = new Text();
      while (reader.next(key, value)) {
        LOG.info("read " + f.getPath().toString());
        LOG.info("read: k=" + key.toString() + " v=" + value.toString());
      }
      LOG.info("done reading " + fPath.toString());
      reader.close();
    }
  }
}
public LinkedHashSet<Path> scan(FileSystem fs, Path filePath, Set<String> consumedFiles) {
  LinkedHashSet<Path> pathSet = Sets.newLinkedHashSet();
  try {
    LOG.debug("Scanning {} with pattern {}", filePath, this.filePatternRegexp);
    FileStatus[] files = fs.listStatus(filePath);
    for (FileStatus status : files) {
      Path path = status.getPath();
      String filePathStr = path.toString();
      if (consumedFiles.contains(filePathStr)) {
        continue;
      }
      if (ignoredFiles.contains(filePathStr)) {
        continue;
      }
      if (acceptFile(filePathStr)) {
        LOG.debug("Found {}", filePathStr);
        pathSet.add(path);
      } else {
        // don't look at it again
        ignoredFiles.add(filePathStr);
      }
    }
  } catch (FileNotFoundException e) {
    LOG.warn("Failed to list directory {}", filePath, e);
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
  return pathSet;
}
/**
 * Copy a directory to a new FS -- both paths must be qualified
 *
 * @param conf conf file
 * @param srcDirPath src dir
 * @param destDirPath dest dir
 * @return # of files copied
 */
public static int copyDirectory(Configuration conf, Path srcDirPath, Path destDirPath)
    throws IOException {
  FileSystem srcFS = FileSystem.get(srcDirPath.toUri(), conf);
  FileSystem destFS = FileSystem.get(destDirPath.toUri(), conf);
  // list all paths in the src.
  if (!srcFS.exists(srcDirPath)) {
    throw new FileNotFoundException("Source dir not found " + srcDirPath);
  }
  if (!srcFS.isDirectory(srcDirPath)) {
    throw new FileNotFoundException("Source dir not a directory " + srcDirPath);
  }
  FileStatus[] entries = srcFS.listStatus(srcDirPath);
  int srcFileCount = entries.length;
  if (srcFileCount == 0) {
    return 0;
  }
  if (!destFS.exists(destDirPath)) {
    destFS.mkdirs(destDirPath);
  }
  Path[] sourcePaths = new Path[srcFileCount];
  for (int i = 0; i < srcFileCount; i++) {
    FileStatus e = entries[i];
    Path srcFile = e.getPath();
    if (srcFS.isDirectory(srcFile)) {
      throw new IOException(
          "Configuration dir " + srcDirPath + " contains a directory " + srcFile);
    }
    log.debug("copying src conf file {}", srcFile);
    sourcePaths[i] = srcFile;
  }
  log.debug("Copying {} files from {} to dest {}", srcFileCount, srcDirPath, destDirPath);
  FileUtil.copy(srcFS, sourcePaths, destFS, destDirPath, false, true, conf);
  return srcFileCount;
}