private void executePostProcessing(DistCpOptions options) throws IOException {
  Path targetPath = options.getTargetPath();
  FileSystem fs = targetPath.getFileSystem(getConf());
  List<Path> inPaths = options.getSourcePaths();
  assert inPaths.size() == 1 : "Source paths more than 1 can't be handled";

  Path sourcePath = inPaths.get(0);
  Path includePath = new Path(getConf().get("falcon.include.path"));
  assert includePath
          .toString()
          .substring(0, sourcePath.toString().length())
          .equals(sourcePath.toString())
      : "Source path is not a subset of include path";

  String relativePath = includePath.toString().substring(sourcePath.toString().length());
  String fixedPath = getFixedPath(relativePath);

  FileStatus[] files = fs.globStatus(new Path(targetPath.toString() + "/" + fixedPath));
  if (files != null) {
    for (FileStatus file : files) {
      fs.create(new Path(file.getPath(), EntityUtil.SUCCEEDED_FILE_NAME)).close();
      LOG.info("Created " + new Path(file.getPath(), EntityUtil.SUCCEEDED_FILE_NAME));
    }
  } else {
    LOG.info(
        "No files present in path: "
            + new Path(targetPath.toString() + "/" + fixedPath).toString());
  }
}
@Override
protected int setJobInputData(Configuration config, Job job) throws InferenciaException {
  try {
    // Retrieve the files we are going to process and add them as input data.
    final FileSystem fs = FileSystem.get(new URI(InferenciaCte.hdfsUri), config);
    // Collect the matches from the source path (data/*.bz2).
    FileStatus[] glob = fs.globStatus(new Path(getRutaFicheros()));
    // If we have data...
    if (null != glob) {
      if (glob.length > 0) {
        for (FileStatus fileStatus : glob) {
          Path pFich = fileStatus.getPath();
          MultipleInputs.addInputPath(job, pFich, SequenceFileInputFormat.class, LoadMap.class);
        }
      } else {
        return noDataFound();
      }
    }
  } catch (IOException e) {
    throw new InferenciaException(e, e.getMessage());
  } catch (URISyntaxException e) {
    throw new InferenciaException(e, e.getMessage());
  }
  return InferenciaCte.SUCCESS;
}
/**
 * Moves the given source path to the specified target path.
 *
 * @param source path to move
 * @param target destination to move to
 * @param fs Hadoop FileSystem
 */
public static void move(String source, String target, FileSystem fs) throws Exception {
  Path srcPath = new Path(source);
  Path[] srcs = FileUtil.stat2Paths(fs.globStatus(srcPath), srcPath);
  Path dst = new Path(target);
  if (srcs.length > 1 && !fs.getFileStatus(dst).isDir()) {
    throw new FileSystemException(
        "When moving multiple files, destination should be a directory.");
  }
  for (int i = 0; i < srcs.length; i++) {
    if (!fs.rename(srcs[i], dst)) {
      FileStatus srcFstatus = null;
      FileStatus dstFstatus = null;
      try {
        srcFstatus = fs.getFileStatus(srcs[i]);
      } catch (FileNotFoundException e) {
        throw new FileNotFoundException(srcs[i] + ": No such file or directory");
      }
      try {
        dstFstatus = fs.getFileStatus(dst);
      } catch (IOException e) {
        // Ignored: the destination may not exist yet.
      }
      if ((srcFstatus != null) && (dstFstatus != null)) {
        if (srcFstatus.isDir() && !dstFstatus.isDir()) {
          throw new FileSystemException(
              "cannot overwrite non directory " + dst + " with directory " + srcs[i]);
        }
      }
      throw new FileSystemException("Failed to rename " + srcs[i] + " to " + dst);
    }
  }
}
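A minimal usage sketch for the move helper above, assuming it lives in a utility class (here called HdfsUtils purely for illustration); the glob and the target directory are made up:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;

public class MoveExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    // Move every part file matched by the glob into an archive directory.
    // HdfsUtils and both paths are hypothetical; only the move() signature comes from above.
    HdfsUtils.move("/data/in/part-*", "/data/archive", fs);
  }
}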
@Override
public List<String> getContent(String path, int lineCount) throws IOException {
  FileStatus[] files = fileSystem.globStatus(new Path(path));
  ArrayList<String> lines = new ArrayList<String>();
  if (files != null) {
    for (FileStatus file : files) {
      if (lines.size() >= lineCount) {
        break;
      }
      if (!file.isDirectory()) {
        DataInputStream in = fileSystem.open(file.getPath());
        BufferedReader dataReader = new BufferedReader(new InputStreamReader(in));
        String line = dataReader.readLine();
        while (line != null && lines.size() < lineCount) {
          lines.add(line);
          line = dataReader.readLine();
        }
        dataReader.close();
        in.close();
      }
    }
  }
  return lines;
}
private static boolean hasFile(
    dbutil db_util, FileSystem fs, String db_name, String file_id, String file_path)
    throws Exception {
  Get file_id_get = new Get(file_id.getBytes());
  Result file_result = db_util.doGet(db_name, file_id_get);
  KeyValue file_names = file_result.getColumnLatest("d".getBytes(), "filenames".getBytes());
  if (file_names == null) {
    return false;
  }
  String all_files = new String(file_names.getValue());
  String[] files = all_files.split("\n");
  for (String line : files) {
    if (line.equals(file_path)) {
      if (fs.globStatus(new Path(line + "*")).length == 0) {
        Put new_put = new Put(file_id.getBytes());
        new_put.add(
            "d".getBytes(),
            "filenames".getBytes(),
            all_files.replace(file_path + "\n", "").getBytes());
        db_util.doPut(db_name, new_put);
        return false;
      }
      return true;
    }
  }
  return false;
}
public static boolean isHdfsDirEmpty(String hdfsFilePath) {
  try {
    Configuration conf = new Configuration();
    Path path = new Path(hdfsFilePath);
    FileSystem fs = FileSystem.get(URI.create(hdfsFilePath), conf);
    FileStatus status[] = fs.globStatus(path);
    if (status == null || status.length == 0) {
      throw new FileNotFoundException(
          "Cannot access " + hdfsFilePath + ": No such file or directory.");
    }
    // Note: only the first match of the glob is examined; the loop returns on its
    // first iteration.
    for (int i = 0; i < status.length; i++) {
      long totalSize = fs.getContentSummary(status[i].getPath()).getLength();
      @SuppressWarnings("unused")
      String pathStr = status[i].getPath().toString();
      return totalSize == 0;
    }
  } catch (IOException e) {
    LOG.error("[isHdfsDirEmpty]", e);
    return false;
  }
  return false;
}
private void writeIndexDescriptors(ETwinIndexDescriptor ETwinIndexDescriptor) throws IOException {
  Configuration conf = getConf();

  FileSystem fs = (new Path(IndexConfig.index.get()).getFileSystem(conf));

  FileStatus[] fileStats = fs.globStatus(new Path(IndexConfig.index.get(), "*"));

  // We write one indexDescriptor per generated index segment.
  // Something to consider: right now it's a straight-up serialized Thrift object.
  // Would it be better to do the LzoBase64Line thing, so that we can apply our tools?
  // or extend the tools?
  for (int i = 0; i < fileStats.length; i++) {
    ETwinIndexDescriptor.setIndexPart(i);
    FileStatus stat = fileStats[i];
    Path idxPath =
        new Path(stat.getPath().getParent(), "_" + stat.getPath().getName() + ".indexmeta");
    FSDataOutputStream os = fs.create(idxPath, true);
    @SuppressWarnings("unchecked")
    ThriftWritable<ETwinIndexDescriptor> writable =
        (ThriftWritable<ETwinIndexDescriptor>)
            ThriftWritable.newInstance(ETwinIndexDescriptor.getClass());
    writable.set(ETwinIndexDescriptor);
    writable.write(os);
    os.close();
  }
}
/*
 * Fetch a file that is in a Hadoop file system. Return a local File.
 * Interruptible.
 */
private File hdfsFetch(Path fromPath, Reporter reporter)
    throws IOException, InterruptedException {
  UUID uniqueId = UUID.randomUUID();
  File toFile = new File(tempDir, uniqueId.toString() + "/" + fromPath.getName());
  File toDir = new File(toFile.getParent());
  if (toDir.exists()) {
    FileUtils.deleteDirectory(toDir);
  }
  toDir.mkdirs();
  Path toPath = new Path(toFile.getCanonicalPath());

  FileSystem fS = fromPath.getFileSystem(hadoopConf);
  FileSystem tofS = FileSystem.getLocal(hadoopConf);
  Throttler throttler = new Throttler((double) bytesPerSecThrottle);

  try {
    for (FileStatus fStatus : fS.globStatus(fromPath)) {
      log.info("Copying " + fStatus.getPath() + " to " + toPath);
      long bytesSoFar = 0;

      FSDataInputStream iS = fS.open(fStatus.getPath());
      FSDataOutputStream oS = tofS.create(toPath);

      byte[] buffer = new byte[downloadBufferSize];

      int nRead;
      while ((nRead = iS.read(buffer, 0, buffer.length)) != -1) {
        // Needed to be able to be interrupted at any moment.
        if (Thread.interrupted()) {
          iS.close();
          oS.close();
          cleanDirNoExceptions(toDir);
          throw new InterruptedException();
        }
        bytesSoFar += nRead;
        oS.write(buffer, 0, nRead);
        throttler.incrementAndThrottle(nRead);
        if (bytesSoFar >= bytesToReportProgress) {
          reporter.progress(bytesSoFar);
          bytesSoFar = 0l;
        }
      }

      if (reporter != null) {
        reporter.progress(bytesSoFar);
      }

      oS.close();
      iS.close();
    }

    return toDir;
  } catch (ClosedByInterruptException e) {
    // This can be thrown by the method read.
    cleanDirNoExceptions(toDir);
    throw new InterruptedIOException();
  }
}
private void handleSampling(DriverContext context, MapWork mWork, JobConf job, HiveConf conf)
    throws Exception {
  assert mWork.getAliasToWork().keySet().size() == 1;

  String alias = mWork.getAliases().get(0);
  Operator<?> topOp = mWork.getAliasToWork().get(alias);
  PartitionDesc partDesc = mWork.getAliasToPartnInfo().get(alias);

  ArrayList<String> paths = mWork.getPaths();
  ArrayList<PartitionDesc> parts = mWork.getPartitionDescs();

  List<Path> inputPaths = new ArrayList<Path>(paths.size());
  for (String path : paths) {
    inputPaths.add(new Path(path));
  }

  Path tmpPath = context.getCtx().getExternalTmpPath(inputPaths.get(0));
  Path partitionFile = new Path(tmpPath, ".partitions");
  ShimLoader.getHadoopShims().setTotalOrderPartitionFile(job, partitionFile);
  PartitionKeySampler sampler = new PartitionKeySampler();

  if (mWork.getSamplingType() == MapWork.SAMPLING_ON_PREV_MR) {
    console.printInfo("Use sampling data created in previous MR");
    // merges sampling data from previous MR and make partition keys for total sort
    for (Path path : inputPaths) {
      FileSystem fs = path.getFileSystem(job);
      for (FileStatus status : fs.globStatus(new Path(path, ".sampling*"))) {
        sampler.addSampleFile(status.getPath(), job);
      }
    }
  } else if (mWork.getSamplingType() == MapWork.SAMPLING_ON_START) {
    console.printInfo("Creating sampling data..");
    assert topOp instanceof TableScanOperator;
    TableScanOperator ts = (TableScanOperator) topOp;

    FetchWork fetchWork;
    if (!partDesc.isPartitioned()) {
      assert paths.size() == 1;
      fetchWork = new FetchWork(inputPaths.get(0), partDesc.getTableDesc());
    } else {
      fetchWork = new FetchWork(inputPaths, parts, partDesc.getTableDesc());
    }
    fetchWork.setSource(ts);

    // random sampling
    FetchOperator fetcher = PartitionKeySampler.createSampler(fetchWork, conf, job, ts);
    try {
      ts.initialize(conf, new ObjectInspector[] {fetcher.getOutputObjectInspector()});
      OperatorUtils.setChildrenCollector(ts.getChildOperators(), sampler);
      while (fetcher.pushRow()) {}
    } finally {
      fetcher.clearFetchContext();
    }
  } else {
    throw new IllegalArgumentException("Invalid sampling type " + mWork.getSamplingType());
  }
  sampler.writePartitionKeys(partitionFile, conf, job);
}
public static FileStatus[] matchFilesOrDir(FileSystem fs, Path path) throws IOException {
  FileStatus[] srcs = fs.globStatus(path);
  if ((srcs != null) && srcs.length == 1) {
    if (srcs[0].isDir()) {
      srcs = fs.listStatus(srcs[0].getPath());
    }
  }
  return (srcs);
}
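A small, hypothetical caller for matchFilesOrDir; the glob is illustrative, and the helper is assumed to be in scope (same class or statically imported):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class MatchFilesOrDirExample {
  public static void main(String[] args) throws Exception {
    FileSystem fs = FileSystem.get(new Configuration());
    // If the pattern resolves to exactly one directory, its children are returned instead.
    // The path is made up; matchFilesOrDir is assumed to be accessible from here.
    FileStatus[] matches = matchFilesOrDir(fs, new Path("/warehouse/sales/2024-*"));
    if (matches != null) {
      for (FileStatus status : matches) {
        System.out.println(status.getPath());
      }
    }
  }
}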
public static boolean globDelete(final FileSystem fs, final String path, final boolean recursive)
    throws IOException {
  boolean deleted = false;
  final FileStatus[] statuses = fs.globStatus(new Path(path));
  // globStatus returns null for a non-wildcard path that does not exist; guard against that
  // so the method simply reports that nothing was deleted.
  if (statuses == null) {
    return false;
  }
  for (final Path p : FileUtil.stat2Paths(statuses)) {
    fs.delete(p, recursive);
    deleted = true;
  }
  return deleted;
}
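A short sketch of how globDelete might be invoked, assuming the method is in scope; the pattern is invented for illustration:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;

public class GlobDeleteExample {
  public static void main(String[] args) throws Exception {
    FileSystem fs = FileSystem.get(new Configuration());
    // Recursively remove every temporary attempt directory matching the pattern.
    // The pattern is made up; globDelete is assumed to be accessible from here.
    boolean removed = globDelete(fs, "/tmp/job-output/_temporary*", true);
    System.out.println(removed ? "Deleted matching paths" : "Nothing matched");
  }
}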
private List<TestDataModel> collect(CacheStorage storage, Path contents) throws IOException {
  List<TestDataModel> results = new ArrayList<TestDataModel>();
  FileSystem fs = storage.getFileSystem();
  for (FileStatus status : fs.globStatus(contents)) {
    results.addAll(collectContent(fs, status));
  }
  Collections.sort(results);
  return results;
}
public void testMapCount() throws Exception {
  String namenode = null;
  MiniDFSCluster dfs = null;
  MiniDFSCluster mr = null;
  try {
    Configuration conf = new Configuration();

    dfs = new MiniDFSCluster.Builder(conf).numDataNodes(3).format(true).build();

    FileSystem fs = dfs.getFileSystem();
    final FsShell shell = new FsShell(conf);
    namenode = fs.getUri().toString();
    MyFile[] files = createFiles(fs.getUri(), "/srcdat");
    long totsize = 0;
    for (MyFile f : files) {
      totsize += f.getSize();
    }

    Configuration job = new JobConf(conf);
    job.setLong("distcp.bytes.per.map", totsize / 3);
    ToolRunner.run(
        new DistCpV1(job),
        new String[] {
          "-m", "100", "-log", namenode + "/logs", namenode + "/srcdat", namenode + "/destdat"
        });
    assertTrue(
        "Source and destination directories do not match.", checkFiles(fs, "/destdat", files));

    String logdir = namenode + "/logs";
    System.out.println(execCmd(shell, "-lsr", logdir));
    FileStatus[] logs = fs.listStatus(new Path(logdir));
    // rare case where splits are exact, logs.length can be 4
    assertTrue(logs.length == 2);

    deldir(fs, "/destdat");
    deldir(fs, "/logs");

    ToolRunner.run(
        new DistCpV1(job),
        new String[] {
          "-m", "1", "-log", namenode + "/logs", namenode + "/srcdat", namenode + "/destdat"
        });

    System.out.println(execCmd(shell, "-lsr", logdir));
    logs = fs.globStatus(new Path(namenode + "/logs/part*"));
    assertTrue("Unexpected map count, logs.length=" + logs.length, logs.length == 1);
  } finally {
    if (dfs != null) {
      dfs.shutdown();
    }
    if (mr != null) {
      mr.shutdown();
    }
  }
}
public List<String> getInputPaths(PathInfo pi) throws IOException {
  List<String> list = new ArrayList<String>();
  FileStatus[] fileStatuses;
  for (String branch : branches) {
    fileStatuses = fs.globStatus(new Path(pi.getFullPath() + branch), pathFilter);
    if (fileStatuses == null) {
      continue;
    }
    for (FileStatus f : fileStatuses) {
      list.add(f.getPath().toUri().getPath());
    }
  }
  return list;
}
private void verifyResult() throws IOException {
  FileStatus[] globStatus = fs.globStatus(new Path(OUTPUT + "/part-*"));
  int itemsRead = 0;
  for (FileStatus fts : globStatus) {
    BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(fts.getPath())));
    String line = null;
    while ((line = reader.readLine()) != null) {
      String[] split = line.split("\t");
      System.out.println(split[0] + " | " + split[1]);
      assertEquals(resultList[Integer.parseInt(split[0])], split[1]);
      itemsRead++;
    }
    // Close the current part file before moving on to the next one.
    reader.close();
  }
  assertEquals(resultList.length, itemsRead);
}
public static List<Path> getAllFilePaths(final FileSystem fs, Path path, final PathFilter filter)
    throws IOException {
  if (null == path) path = fs.getHomeDirectory();
  if (path.toString().equals(FOWARD_SLASH)) path = new Path("");
  final List<Path> paths = new ArrayList<Path>();
  if (fs.isFile(path)) paths.add(path);
  else {
    for (final FileStatus status : fs.globStatus(new Path(path + FOWARD_ASTERISK), filter)) {
      final Path next = status.getPath();
      paths.addAll(getAllFilePaths(fs, next, filter));
    }
  }
  return paths;
}
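A hypothetical call to getAllFilePaths with a filter that skips hidden and marker entries; the starting directory and the filter are assumptions, and the helper is assumed to be in scope:

import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class GetAllFilePathsExample {
  public static void main(String[] args) throws Exception {
    FileSystem fs = FileSystem.get(new Configuration());
    // Ignore hidden files and MapReduce markers such as _SUCCESS while walking the tree.
    PathFilter visible =
        new PathFilter() {
          public boolean accept(Path p) {
            return !p.getName().startsWith(".") && !p.getName().startsWith("_");
          }
        };
    // The input directory is made up; getAllFilePaths is assumed to be accessible from here.
    List<Path> all = getAllFilePaths(fs, new Path("/user/demo/input"), visible);
    for (Path p : all) {
      System.out.println(p);
    }
  }
}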
/**
 * Creates a new instance.
 *
 * @param conf current configuration
 * @param definition data type
 * @param pathExpression the source path (can include wildcard)
 * @throws IOException if failed to create instance
 * @throws IllegalArgumentException if some parameters were {@code null}
 */
@SuppressWarnings("unchecked")
public TemporaryDataModelSource(
    Configuration conf, DataModelDefinition<?> definition, String pathExpression)
    throws IOException {
  this.conf = conf;
  this.definition = (DataModelDefinition<Object>) definition;
  this.object = definition.toObject(definition.newReflection().build());
  Path path = new Path(pathExpression);
  this.fs = path.getFileSystem(conf);
  FileStatus[] list = fs.globStatus(path);
  List<Path> paths = new ArrayList<>();
  for (int i = 0; i < list.length; i++) {
    paths.add(list[i].getPath());
  }
  this.rest = paths.iterator();
}
/**
 * Processes the input file/folder argument. If the input is a file, then it is directly
 * considered for further processing by TraceBuilder. If the input is a folder, then all the
 * history logs in the input folder are considered for further processing.
 *
 * <p>If isRecursive is true, then the input path is recursively scanned for job history logs
 * for further processing by TraceBuilder.
 *
 * <p>NOTE: If the input represents a globbed path, then it is first flattened and then the
 * individual paths represented by the globbed input path are considered for further processing.
 *
 * @param input input path, possibly globbed
 * @param conf configuration
 * @param isRecursive whether to recursively traverse the input paths to find history logs
 * @return the input history log files' paths
 * @throws FileNotFoundException
 * @throws IOException
 */
static List<Path> processInputArgument(String input, Configuration conf, boolean isRecursive)
    throws FileNotFoundException, IOException {
  Path inPath = new Path(input);
  FileSystem fs = inPath.getFileSystem(conf);
  FileStatus[] inStatuses = fs.globStatus(inPath);

  List<Path> inputPaths = new LinkedList<Path>();
  if (inStatuses == null || inStatuses.length == 0) {
    return inputPaths;
  }

  for (FileStatus inStatus : inStatuses) {
    Path thisPath = inStatus.getPath();
    if (inStatus.isDirectory()) {
      // Find the list of files in this path (recursively if the -recursive option
      // is specified).
      List<FileStatus> historyLogs = new ArrayList<FileStatus>();

      RemoteIterator<LocatedFileStatus> iter = fs.listFiles(thisPath, isRecursive);
      while (iter.hasNext()) {
        LocatedFileStatus child = iter.next();
        String fileName = child.getPath().getName();
        if (!(fileName.endsWith(".crc") || fileName.startsWith("."))) {
          historyLogs.add(child);
        }
      }

      if (historyLogs.size() > 0) {
        // Add the sorted history log file names in this path to the
        // inputPaths list
        FileStatus[] sortableNames = historyLogs.toArray(new FileStatus[historyLogs.size()]);
        Arrays.sort(sortableNames, new HistoryLogsComparator());

        for (FileStatus historyLog : sortableNames) {
          inputPaths.add(historyLog.getPath());
        }
      }
    } else {
      inputPaths.add(thisPath);
    }
  }
  return inputPaths;
}
@Override
public TupleEntryIterator openForRead(
    FlowProcess<? extends Configuration> flowProcess, RecordReader input) throws IOException {
  // always read via Hadoop FileSystem if in standalone/local mode, or if a RecordReader is
  // provided
  if (HadoopUtil.isLocal(flowProcess.getConfig()) || input != null) {
    LOG.info("delegating to parent");
    return super.openForRead(flowProcess, input);
  }

  Path[] cachedFiles = getLocalCacheFiles(flowProcess);
  if (cachedFiles == null || cachedFiles.length == 0) return super.openForRead(flowProcess, null);

  List<Path> paths = new ArrayList<>();
  List<Tap> taps = new ArrayList<>();

  if (isSimpleGlob()) {
    FileSystem fs = FileSystem.get(flowProcess.getConfig());
    FileStatus[] statuses = fs.globStatus(getHfs().getPath());
    for (FileStatus status : statuses) paths.add(status.getPath());
  } else {
    paths.add(getHfs().getPath());
  }

  for (Path pathToFind : paths) {
    for (Path path : cachedFiles) {
      if (path.toString().endsWith(pathToFind.getName())) {
        LOG.info("found {} in distributed cache", path);
        taps.add(new Lfs(getScheme(), path.toString()));
      }
    }
  }

  if (paths.isEmpty()) // not in cache, read from HDFS
  {
    LOG.info(
        "could not find files in local resource path. delegating to parent: {}",
        super.getIdentifier());
    return super.openForRead(flowProcess, input);
  }

  return new MultiSourceTap(taps.toArray(new Tap[taps.size()])).openForRead(flowProcess, input);
}
/*
 * Fetch a file that is in a Hadoop file system. Return a local File.
 */
private File hdfsFetch(Path fromPath, Reporter reporter) throws IOException {
  File toFile = new File(tempDir, fromPath.toUri().getPath());
  File toDir = new File(toFile.getParent());
  if (toDir.exists()) {
    FileUtils.deleteDirectory(toDir);
  }
  toDir.mkdirs();
  Path toPath = new Path(toFile.getCanonicalPath());

  FileSystem fS = fromPath.getFileSystem(hadoopConf);
  FileSystem tofS = FileSystem.getLocal(hadoopConf);
  Throttler throttler = new Throttler((double) bytesPerSecThrottle);

  for (FileStatus fStatus : fS.globStatus(fromPath)) {
    log.info("Copying " + fStatus.getPath() + " to " + toPath);
    long bytesSoFar = 0;

    FSDataInputStream iS = fS.open(fStatus.getPath());
    FSDataOutputStream oS = tofS.create(toPath);

    byte[] buffer = new byte[downloadBufferSize];

    int nRead;
    while ((nRead = iS.read(buffer, 0, buffer.length)) != -1) {
      bytesSoFar += nRead;
      oS.write(buffer, 0, nRead);
      throttler.incrementAndThrottle(nRead);
      if (bytesSoFar >= bytesToReportProgress) {
        reporter.progress(bytesSoFar);
        bytesSoFar = 0l;
      }
    }

    if (reporter != null) {
      reporter.progress(bytesSoFar);
    }

    oS.close();
    iS.close();
  }

  return toDir;
}
private void registerHfs(
    FlowProcess<? extends Configuration> process, Configuration conf, Hfs hfs) throws IOException {
  if (isSimpleGlob()) {
    FileSystem fs = FileSystem.get(conf);
    FileStatus[] statuses = fs.globStatus(getHfs().getPath());

    if (statuses == null || statuses.length == 0)
      throw new TapException(
          String.format(
              "glob expression %s does not match any files on the filesystem",
              getHfs().getPath()));

    for (FileStatus fileStatus : statuses) registerURI(conf, fileStatus.getPath());
  } else {
    registerURI(conf, hfs.getPath());
  }

  hfs.sourceConfInitComplete(process, conf);
}
public List<PathInfo> getPathInfo(
    String dcNumber, String service, String component, long startTime, long endTime)
    throws Exception {
  List<String> hours = getHoursForTimeRange(startTime, endTime);
  List<PathInfo> paths = new ArrayList<PathInfo>();

  String glob;
  FileStatus[] fileStatuses;
  for (String hour : hours) {
    glob = "/service/" + dcNumber + "/" + service + "/" + logdir + hour + "/" + component;
    fileStatuses = fs.globStatus(new Path(glob), pathFilter);
    if (fileStatuses == null) {
      continue;
    }
    for (FileStatus f : fileStatuses) {
      if (f.isDir()) {
        paths.add(new PathInfo(logdir, f.getPath().toUri().getPath()));
      }
    }
  }
  return paths;
}
public DataInput getStream() {
  try {
    if (!initialized) {
      initialized = true;
      if ((resFile == null) && (resDir == null)) {
        return null;
      }

      if (resFile != null) {
        return resFile.getFileSystem(conf).open(resFile);
      }

      resFs = resDir.getFileSystem(conf);
      FileStatus status = resFs.getFileStatus(resDir);
      assert status.isDir();
      FileStatus[] resDirFS = resFs.globStatus(new Path(resDir + "/*"));
      resDirPaths = new Path[resDirFS.length];
      int pos = 0;
      for (FileStatus resFS : resDirFS) {
        if (!resFS.isDir()) {
          resDirPaths[pos++] = resFS.getPath();
        }
      }
      if (pos == 0) {
        return null;
      }

      return resFs.open(resDirPaths[resDirFilesNum++]);
    } else {
      return getNextStream();
    }
  } catch (FileNotFoundException e) {
    LOG.info("getStream error: " + StringUtils.stringifyException(e));
    return null;
  } catch (IOException e) {
    LOG.info("getStream error: " + StringUtils.stringifyException(e));
    return null;
  }
}
private List<FileStatus> singleThreadedListStatus(
    JobContext job, Path[] dirs, PathFilter inputFilter, boolean recursive) throws IOException {
  List<FileStatus> result = new ArrayList<FileStatus>();
  List<IOException> errors = new ArrayList<IOException>();
  for (int i = 0; i < dirs.length; ++i) {
    Path p = dirs[i];
    FileSystem fs = p.getFileSystem(job.getConfiguration());
    FileStatus[] matches = fs.globStatus(p, inputFilter);
    if (matches == null) {
      errors.add(new IOException("Input path does not exist: " + p));
    } else if (matches.length == 0) {
      errors.add(new IOException("Input Pattern " + p + " matches 0 files"));
    } else {
      for (FileStatus globStat : matches) {
        if (globStat.isDirectory()) {
          RemoteIterator<LocatedFileStatus> iter = fs.listLocatedStatus(globStat.getPath());
          while (iter.hasNext()) {
            LocatedFileStatus stat = iter.next();
            if (inputFilter.accept(stat.getPath())) {
              if (recursive && stat.isDirectory()) {
                addInputPathRecursively(result, fs, stat.getPath(), inputFilter);
              } else {
                result.add(stat);
              }
            }
          }
        } else {
          result.add(globStat);
        }
      }
    }
  }

  if (!errors.isEmpty()) {
    throw new InvalidInputException(errors);
  }
  return result;
}
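The null check versus the empty-array check above reflects how FileSystem.globStatus reports its results: a plain path that does not exist yields null, while a wildcard that matches nothing typically yields an empty array. A minimal sketch of that distinction (the paths are illustrative):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GlobSemanticsExample {
  public static void main(String[] args) throws Exception {
    FileSystem fs = FileSystem.get(new Configuration());

    // A plain path that does not exist: globStatus returns null.
    FileStatus[] missing = fs.globStatus(new Path("/no/such/dir"));
    System.out.println(missing == null ? "path does not exist" : "found " + missing.length);

    // A wildcard that matches nothing: globStatus returns an empty array.
    FileStatus[] none = fs.globStatus(new Path("/tmp/*.does-not-match"));
    if (none != null && none.length == 0) {
      System.out.println("pattern matched 0 files");
    }
  }
}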
public static void decompressPath(
    final FileSystem fs,
    final String in,
    final String out,
    final String compressedFileSuffix,
    final boolean deletePrevious)
    throws IOException {
  final Path inPath = new Path(in);

  if (fs.isFile(inPath)) HDFSTools.decompressFile(fs, in, out, deletePrevious);
  else {
    final Path outPath = new Path(out);
    if (!fs.exists(outPath)) fs.mkdirs(outPath);
    for (final Path path : FileUtil.stat2Paths(fs.globStatus(new Path(in + FOWARD_ASTERISK)))) {
      if (path.getName().endsWith(compressedFileSuffix))
        HDFSTools.decompressFile(
            fs,
            path.toString(),
            outPath.toString() + FOWARD_SLASH + path.getName().split("\\.")[0],
            deletePrevious);
    }
  }
}
public static ArrayList<String[]> parseRemainderFiles(String outputPath, FileSystem fs)
    throws IOException {
  ArrayList<String[]> remainderKeys = new ArrayList<String[]>();
  String[] keys;
  System.out.println("Handle remainder starts");

  FileStatus[] fss = fs.globStatus(new Path(outputPath + "/remainder-r-*"));
  String last = null;
  for (FileStatus fst : fss) {
    FSDataInputStream in = fs.open(fst.getPath());
    String line = in.readLine();
    while (line != null) {
      if (last == null) last = line;
      else {
        remainderKeys.add(new String[] {line, last});
        last = null;
      }
      line = in.readLine();
    }
    in.close();
  }
  if (last != null) remainderKeys.add(new String[] {last});

  System.out.println("Handle remainder ends");
  return remainderKeys;
}
public static ConfusionMatrix readResult(
    FileSystem fs, Path pathPattern, Configuration conf, Parameters params) throws IOException {
  StringTuple key = new StringTuple();
  DoubleWritable value = new DoubleWritable();
  String defaultLabel = params.get("defaultCat");
  FileStatus[] outputFiles = fs.globStatus(pathPattern);
  Map<String, Map<String, Integer>> confusionMatrix = new HashMap<String, Map<String, Integer>>();
  for (FileStatus fileStatus : outputFiles) {
    Path path = fileStatus.getPath();
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
    while (reader.next(key, value)) {
      String correctLabel = key.stringAt(1);
      String classifiedLabel = key.stringAt(2);
      Map<String, Integer> rowMatrix = confusionMatrix.get(correctLabel);
      if (rowMatrix == null) {
        rowMatrix = new HashMap<String, Integer>();
      }
      Integer count = Double.valueOf(value.get()).intValue();
      rowMatrix.put(classifiedLabel, count);
      confusionMatrix.put(correctLabel, rowMatrix);
    }
  }

  ConfusionMatrix matrix = new ConfusionMatrix(confusionMatrix.keySet(), defaultLabel);
  for (Map.Entry<String, Map<String, Integer>> correctLabelSet : confusionMatrix.entrySet()) {
    Map<String, Integer> rowMatrix = correctLabelSet.getValue();
    for (Map.Entry<String, Integer> classifiedLabelSet : rowMatrix.entrySet()) {
      matrix.addInstance(correctLabelSet.getKey(), classifiedLabelSet.getKey());
      matrix.putCount(
          correctLabelSet.getKey(), classifiedLabelSet.getKey(), classifiedLabelSet.getValue());
    }
  }
  return matrix;
}
private void standAlone() throws IOException {
  String absSrcDir = fs.makeQualified(srcDir).toUri().getPath();
  String absOutDir = fs.makeQualified(outDir).toUri().getPath();

  Text bucket = new Text(absSrcDir + "-0");

  List<Text> files = new ArrayList<Text>();

  FileStatus[] contents = fs.listStatus(new Path(absSrcDir));

  for (FileStatus content : contents) {
    if (!content.isDir()) {
      if (ignoredFiles != null) {
        // Check for files to skip
        ignoredFiles.reset(content.getPath().toUri().getPath());
        if (ignoredFiles.matches()) {
          LOG.trace("Ignoring " + content.getPath().toString());
          continue;
        }
      }
      files.add(new Text(content.getPath().toUri().getPath()));
    }
  }

  /*
   * Is the directory empty?
   */
  if (files.isEmpty()) {
    return;
  }

  /*
   * We trick the reducer into doing some work for us by setting these configuration properties.
   */
  job.set("mapred.tip.id", "task_000000000000_00000_r_000000");
  job.set("mapred.task.id", "attempt_000000000000_0000_r_000000_0");

  job.set("mapred.output.dir", absOutDir);

  /*
   * File output committer needs this.
   */
  fs.mkdirs(new Path(absOutDir, "_temporary"));

  CrushReducer reducer = new CrushReducer();

  reducer.configure(job);
  reducer.reduce(bucket, files.iterator(), new NullOutputCollector<Text, Text>(), Reporter.NULL);
  reducer.close();

  /*
   * Use a glob here because the temporary and task attempt work dirs have funny names.
   * Include a * at the end to cover wildcards for compressed files.
   */
  Path crushOutput = new Path(absOutDir + "/*/*/crush" + absSrcDir + "/" + dest.getName() + "*");

  FileStatus[] statuses = fs.globStatus(crushOutput);

  if (statuses == null || 1 != statuses.length) {
    throw new AssertionError("Did not find the expected output in " + crushOutput.toString());
  }

  rename(statuses[0].getPath(), dest.getParent(), dest.getName());
}
public int run(String[] args) throws Exception {
  // printUsage();
  /*
   * SETUP
   */
  Configuration argConf = getConf();
  Hashtable<String, String> confArg = new Hashtable<String, String>();
  setup(confArg, argConf);
  Date currentTime = new Date();
  Date endDate = new Date(new Long(confArg.get("timestamp_stop")));
  Boolean full_run = confArg.get("intermediate").matches("(?i).*true.*");
  Boolean quick_add = confArg.get("quick_add").matches("(?i).*true.*");
  logger.info("Running GeStore");

  // ZooKeeper setup
  Configuration config = HBaseConfiguration.create();
  zkWatcher = new ZooKeeperWatcher(config, "Testing", new HBaseAdmin(config));
  zkInstance =
      new ZooKeeper(
          ZKConfig.getZKQuorumServersString(config),
          config.getInt("zookeeper.session.timeout", -1),
          zkWatcher);

  if (!confArg.get("task_id").isEmpty()) {
    confArg.put("temp_path", confArg.get("temp_path") + confArg.get("task_id"));
  }

  String lockRequest = confArg.get("file_id");
  if (!confArg.get("run_id").isEmpty())
    lockRequest = lockRequest + "_" + confArg.get("run_id") + "_";
  if (!confArg.get("task_id").isEmpty())
    lockRequest = lockRequest + "_" + confArg.get("task_id") + "_";

  // Get type of movement
  toFrom type_move = checkArgs(confArg);
  if (type_move == toFrom.LOCAL2REMOTE && !confArg.get("format").equals("unknown")) {
    List<String> arguments = new ArrayList<String>();
    arguments.add("-Dinput=" + confArg.get("local_path"));
    arguments.add("-Dtable=" + confArg.get("file_id"));
    arguments.add("-Dtimestamp=" + confArg.get("timestamp_stop"));
    arguments.add("-Dtype=" + confArg.get("format"));
    arguments.add("-Dtarget_dir=" + confArg.get("base_path") + "_" + confArg.get("file_id"));
    arguments.add("-Dtemp_hdfs_path=" + confArg.get("temp_path"));
    arguments.add("-Drun_id=" + confArg.get("run_id"));
    if (!confArg.get("run_id").isEmpty()) arguments.add("-Drun_id=" + confArg.get("run_id"));
    if (!confArg.get("task_id").isEmpty()) arguments.add("-Dtask_id=" + confArg.get("task_id"));
    if (quick_add) arguments.add("-Dquick_add=" + confArg.get("quick_add"));
    String lockName = lock(lockRequest);
    String[] argumentString = arguments.toArray(new String[arguments.size()]);
    adddb.main(argumentString);
    unlock(lockName);
    System.exit(0);
  }

  // Database registration
  dbutil db_util = new dbutil(config);
  db_util.register_database(confArg.get("db_name_files"), true);
  db_util.register_database(confArg.get("db_name_runs"), true);
  db_util.register_database(confArg.get("db_name_updates"), true);
  FileSystem hdfs = FileSystem.get(config);
  FileSystem localFS = FileSystem.getLocal(config);

  // Get source type
  confArg.put("source", getSource(db_util, confArg.get("db_name_files"), confArg.get("file_id")));
  confArg.put(
      "database", isDatabase(db_util, confArg.get("db_name_files"), confArg.get("file_id")));
  if (!confArg.get("source").equals("local")
      && type_move == toFrom.REMOTE2LOCAL
      && !confArg.get("timestamp_stop").equals(Integer.toString(Integer.MAX_VALUE))) {
    confArg.put("timestamp_stop", Long.toString(latestVersion(confArg, db_util)));
  }

  /*
   * Get previous timestamp
   */
  Get run_id_get = new Get(confArg.get("run_id").getBytes());
  Result run_get = db_util.doGet(confArg.get("db_name_runs"), run_id_get);
  KeyValue run_file_prev =
      run_get.getColumnLatest(
          "d".getBytes(), (confArg.get("file_id") + "_db_timestamp").getBytes());
  String last_timestamp = new String("0");
  if (null != run_file_prev && !confArg.get("source").equals("local")) {
    long last_timestamp_real = run_file_prev.getTimestamp();
    Long current_timestamp = new Long(confArg.get("timestamp_real"));
    if ((current_timestamp - last_timestamp_real) > 36000) {
      last_timestamp = new String(run_file_prev.getValue());
      Integer lastTimestamp = new Integer(last_timestamp);
      lastTimestamp += 1;
      last_timestamp = lastTimestamp.toString();
      logger.info("Last timestamp: " + last_timestamp + " End data: " + endDate);
      Date last_run = new Date(run_file_prev.getTimestamp());
      if (last_run.before(endDate) && !full_run) {
        confArg.put("timestamp_start", last_timestamp);
      }
    }
  }

  Integer tse = new Integer(confArg.get("timestamp_stop"));
  Integer tss = new Integer(confArg.get("timestamp_start"));
  if (tss > tse) {
    logger.info("No new version of requested file.");
    return 0;
  }

  /*
   * Generate file
   */
  String lockName = lock(lockRequest);

  Get file_id_get = new Get(confArg.get("file_id").getBytes());
  Result file_get = db_util.doGet(confArg.get("db_name_files"), file_id_get);
  if (!file_get.isEmpty()) {
    boolean found =
        hasFile(
            db_util,
            hdfs,
            confArg.get("db_name_files"),
            confArg.get("file_id"),
            getFullPath(confArg));
    if (confArg.get("source").equals("fullfile")) {
      found = false;
    }
    String filenames_put =
        getFileNames(
            db_util, confArg.get("db_name_files"), confArg.get("file_id"), getFullPath(confArg));
    // Filename not found in file database
    if (!found && type_move == toFrom.REMOTE2LOCAL) {
      if (!confArg.get("source").equals("local")) {
        // Generate intermediate file
        if (getFile(hdfs, confArg, db_util) == null) {
          unlock(lockName);
          return 1;
        }
        // Put generated file into file database
        if (!confArg.get("format").equals("fullfile")) {
          putFileEntry(
              db_util,
              hdfs,
              confArg.get("db_name_files"),
              confArg.get("file_id"),
              confArg.get("full_file_name"),
              confArg.get("source"));
        }
      } else {
        logger.warn("Remote file not found, and cannot be generated! File: " + confArg);
        unlock(lockName);
        return 1;
      }
    }
  } else {
    if (type_move == toFrom.REMOTE2LOCAL) {
      logger.warn("Remote file not found, and cannot be generated.");
      unlock(lockName);
      return 1;
    }
  }

  /*
   * Copy file
   * Update tables
   */
  if (type_move == toFrom.LOCAL2REMOTE) {
    if (!confArg.get("format").equals("fullfile")) {
      putFileEntry(
          db_util,
          hdfs,
          confArg.get("db_name_files"),
          confArg.get("file_id"),
          getFullPath(confArg),
          confArg.get("source"));
    }
    putRunEntry(
        db_util,
        confArg.get("db_name_runs"),
        confArg.get("run_id"),
        confArg.get("file_id"),
        confArg.get("type"),
        confArg.get("timestamp_real"),
        confArg.get("timestamp_stop"),
        getFullPath(confArg),
        confArg.get("delimiter"));
    hdfs.copyFromLocalFile(new Path(confArg.get("local_path")), new Path(getFullPath(confArg)));
  } else if (type_move == toFrom.REMOTE2LOCAL) {
    FileStatus[] files = hdfs.globStatus(new Path(getFullPath(confArg) + "*"));
    putRunEntry(
        db_util,
        confArg.get("db_name_runs"),
        confArg.get("run_id"),
        confArg.get("file_id"),
        confArg.get("type"),
        confArg.get("timestamp_real"),
        confArg.get("timestamp_stop"),
        getFullPath(confArg),
        confArg.get("delimiter"));
    unlock(lockName);
    for (FileStatus file : files) {
      Path cur_file = file.getPath();
      Path cur_local_path =
          new Path(new String(confArg.get("local_path") + confArg.get("file_id")));
      String suffix = getSuffix(getFileName(confArg), cur_file.getName());
      if (suffix.length() > 0) {
        cur_local_path = cur_local_path.suffix(new String("." + suffix));
      }
      if (confArg.get("copy").equals("true")) {
        String crc = hdfs.getFileChecksum(cur_file).toString();
        if (checksumLocalTest(cur_local_path, crc)) {
          continue;
        } else {
          hdfs.copyToLocalFile(cur_file, cur_local_path);
          writeChecksum(cur_local_path, crc);
        }
      } else {
        System.out.println(cur_local_path + "\t" + cur_file);
      }
    }
  }
  unlock(lockName);
  return 0;
}
public void printClusters(String[] dictionary)
    throws IOException, InstantiationException, IllegalAccessException {
  Configuration conf = new Configuration();

  if (this.termDictionary != null) {
    if ("text".equals(dictionaryFormat)) {
      dictionary = VectorHelper.loadTermDictionary(new File(this.termDictionary));
    } else if ("sequencefile".equals(dictionaryFormat)) {
      FileSystem fs = FileSystem.get(new Path(this.termDictionary).toUri(), conf);
      dictionary = VectorHelper.loadTermDictionary(conf, fs, this.termDictionary);
    } else {
      throw new IllegalArgumentException("Invalid dictionary format");
    }
  }

  Writer writer =
      this.outputFile == null
          ? new OutputStreamWriter(System.out)
          : new FileWriter(this.outputFile);

  try {
    FileSystem fs = seqFileDir.getFileSystem(conf);
    for (FileStatus seqFile : fs.globStatus(new Path(seqFileDir, "part-*"))) {
      Path path = seqFile.getPath();
      // System.out.println("Input Path: " + path); doesn't this interfere with output?
      SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
      try {
        Writable key = reader.getKeyClass().asSubclass(Writable.class).newInstance();
        Writable value = reader.getValueClass().asSubclass(Writable.class).newInstance();
        while (reader.next(key, value)) {
          Cluster cluster = (Cluster) value;
          String fmtStr = useJSON ? cluster.asJsonString() : cluster.asFormatString(dictionary);
          if (subString > 0 && fmtStr.length() > subString) {
            writer.write(':');
            writer.write(fmtStr, 0, Math.min(subString, fmtStr.length()));
          } else {
            writer.write(fmtStr);
          }

          writer.write('\n');

          if (dictionary != null) {
            String topTerms = getTopFeatures(cluster.getCenter(), dictionary, numTopFeatures);
            writer.write("\tTop Terms: ");
            writer.write(topTerms);
            writer.write('\n');
          }

          List<WeightedVectorWritable> points = clusterIdToPoints.get(cluster.getId());
          if (points != null) {
            writer.write("\tWeight: Point:\n\t");
            for (Iterator<WeightedVectorWritable> iterator = points.iterator();
                iterator.hasNext(); ) {
              WeightedVectorWritable point = iterator.next();
              writer.write(String.valueOf(point.getWeight()));
              writer.write(": ");
              writer.write(AbstractCluster.formatVector(point.getVector(), dictionary));
              if (iterator.hasNext()) {
                writer.write("\n\t");
              }
            }
            writer.write('\n');
          }
        }
      } finally {
        reader.close();
      }
    }
  } finally {
    writer.close();
  }
}