private File getHfdsFileToTmpFile(String hdfsPath, HdfsConfiguration configuration) { try { String fname = hdfsPath.substring(hdfsPath.lastIndexOf('/')); File outputDest = File.createTempFile(fname, ".hdfs"); if (outputDest.exists()) { outputDest.delete(); } HdfsInfo hdfsInfo = HdfsInfoFactory.newHdfsInfo(hdfsPath); FileSystem fileSystem = hdfsInfo.getFileSystem(); FileUtil.copy(fileSystem, new Path(hdfsPath), outputDest, false, fileSystem.getConf()); try { FileUtil.copyMerge( fileSystem, // src new Path(hdfsPath), FileSystem.getLocal(new Configuration()), // dest new Path(outputDest.toURI()), false, fileSystem.getConf(), null); } catch (IOException e) { return outputDest; } return new File(outputDest, fname); } catch (IOException ex) { throw new RuntimeCamelException(ex); } }
@Test public void testWordCountMR() throws Exception { // INIT Path inputPath = new Path(box.getInputPath(), "wordCount"); Path resultPath = new Path("testMapReduceResult"); String[] content = new String[] {"1 2 3\n", "2 3 4\n", "4 5 6 6\n"}; DFSUtil.writeToFile(box.getFS(), inputPath, true, content); Job job = MRUtil.setUpJob( box.getConf(), WordCount.class, WordCount.Map.class, WordCount.Reduce.class, Text.class, IntWritable.class, box.getInputPath(), box.getOutputPath()); // ACT & ASSERT boolean jobResult = job.waitForCompletion(true); assertTrue(jobResult); FileUtil.copyMerge( box.getFS(), box.getOutputPath(), box.getFS(), resultPath, false, box.getConf(), "\n"); String result = DFSUtil.getFileContent(box.getFS(), resultPath); Pattern p = Pattern.compile("[\\s]+"); for (String ln : result.split("\n")) { String[] splitty = p.split(ln.trim()); if (splitty.length == 2) { int term = new Integer(splitty[0]); int freq = new Integer(splitty[1]); switch (term) { case 1: assertEquals(freq, 1); break; case 2: assertEquals(freq, 2); break; case 3: assertEquals(freq, 2); break; case 4: assertEquals(freq, 2); break; case 5: assertEquals(freq, 1); break; case 6: assertEquals(freq, 2); break; default: throw new Exception("Unknown term " + term); } } } }
/**
 * Invokes FileUtil.copyMerge for the given source and destination paths, both
 * resolved against TEST_ROOT_DIR on the local file system. The source is kept
 * on completion and no separator string is inserted between merged files.
 *
 * @param src String non-null source path, relative to TEST_ROOT_DIR.
 * @param dst String non-null destination path, relative to TEST_ROOT_DIR.
 * @return boolean true if the call to FileUtil.copyMerge was successful.
 * @throws IOException if an I/O error occurs.
 */
private boolean copyMerge(String src, String dst) throws IOException {
    Configuration conf = new Configuration();
    FileSystem localFs = FileSystem.getLocal(conf);
    try {
        return FileUtil.copyMerge(
            localFs,
            new Path(TEST_ROOT_DIR, src),
            localFs,
            new Path(TEST_ROOT_DIR, dst),
            /* deleteSource= */ false,
            conf,
            /* addString= */ null);
    } finally {
        // Always release the local FileSystem handle, even on failure.
        localFs.close();
    }
}
/** * 해당 경로에 있는 파일을 MERGE한다. * * @param conf Hadoop Configuration * @param path HDFS Path * @throws java.io.IOException Get Merge할 수 없는 경우 */ public static void merge(Configuration conf, String path) throws IOException { // 입력 경로의 모든 파일을 Get Merge하여 임시 파일에 기록한다. FileSystem fileSystem = FileSystem.get(conf); Path source = new Path(path); if (!fileSystem.getFileStatus(source).isDir()) { // 이미 파일이라면 더이상 Get Merge할 필요없다. return; } Path target = new Path(path + "_temporary"); FileUtil.copyMerge(fileSystem, source, fileSystem, target, true, conf, null); // 원 소스 파일을 삭제한다. fileSystem.delete(source, true); // 임시 파일을 원 소스 파일명으로 대체한다. Path in = new Path(path + "_temporary"); Path out = new Path(path); fileSystem.rename(in, out); // 임시 디렉토리를 삭제한다. fileSystem.delete(new Path(path + "_temporary"), true); }
/**
 * Registers every distributed-cache entry with the Spark context. Entries that
 * are not plain files are first merged into a single sibling file named
 * "sparkreadable-&lt;name&gt;" so they can be shipped via addFile, and the
 * cache-file list in the configuration is rewritten with the merged URIs.
 */
private void distributeFiles() {
    try {
        URI[] cacheUris = DistributedCache.getCacheFiles(conf);
        if (cacheUris == null) {
            return; // nothing registered in the distributed cache
        }
        URI[] rewritten = new URI[cacheUris.length];
        for (int idx = 0; idx < cacheUris.length; idx++) {
            Path cachePath = new Path(cacheUris[idx]);
            FileSystem fs = cachePath.getFileSystem(conf);
            if (fs.isFile(cachePath)) {
                rewritten[idx] = cacheUris[idx];
            } else {
                // Directory of part files: merge into a single readable file.
                Path merged = new Path(cachePath.getParent(), "sparkreadable-" + cachePath.getName());
                FileUtil.copyMerge(fs, cachePath, fs, merged, false, conf, "");
                rewritten[idx] = merged.toUri();
            }
            sparkContext.addFile(rewritten[idx].toString());
        }
        DistributedCache.setCacheFiles(rewritten, conf);
    } catch (IOException e) {
        throw new RuntimeException("Error retrieving cache files", e);
    }
}