public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException { FileSplit split = (FileSplit) genericSplit; Configuration job = context.getConfiguration(); m_Sb.setLength(0); m_Start = split.getStart(); m_End = m_Start + split.getLength(); final Path file = split.getPath(); compressionCodecs = new CompressionCodecFactory(job); final CompressionCodec codec = compressionCodecs.getCodec(file); // open the file and seek to the m_Start of the split FileSystem fs = file.getFileSystem(job); // getFileStatus fileStatus = fs.getFileStatus(split.getPath()); //noinspection deprecation @SuppressWarnings(value = "deprecated") long length = fs.getLength(file); FSDataInputStream fileIn = fs.open(split.getPath()); if (m_Start > 0) fileIn.seek(m_Start); if (codec != null) { CompressionInputStream inputStream = codec.createInputStream(fileIn); m_Input = new BufferedReader(new InputStreamReader(inputStream)); m_End = length; } else { m_Input = new BufferedReader(new InputStreamReader(fileIn)); } m_Current = m_Start; m_Key = split.getPath().getName(); }
/** * Generate the list of files and make them into FileSplits. This needs to be copied to insert a * filter on acceptable data */ @Override public List<InputSplit> getSplits(JobContext job) throws IOException { long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job)); long maxSize = getMaxSplitSize(job); long desiredMappers = job.getConfiguration().getLong("org.systemsbiology.jxtandem.DesiredXMLInputMappers", 0); // generate splits List<InputSplit> splits = new ArrayList<InputSplit>(); List<FileStatus> fileStatuses = listStatus(job); boolean forceNumberMappers = fileStatuses.size() == 1; for (FileStatus file : fileStatuses) { Path path = file.getPath(); if (!isPathAcceptable(path)) // filter acceptable data continue; FileSystem fs = path.getFileSystem(job.getConfiguration()); long length = file.getLen(); BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length); if ((length != 0) && isSplitable(job, path)) { long blockSize = file.getBlockSize(); // use desired mappers to force more splits if (forceNumberMappers && desiredMappers > 0) maxSize = Math.min(maxSize, (length / desiredMappers)); long splitSize = computeSplitSize(blockSize, minSize, maxSize); long bytesRemaining = length; while (withinSlop(splitSize, bytesRemaining)) { int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining); splits.add( new FileSplit( path, length - bytesRemaining, splitSize, blkLocations[blkIndex].getHosts())); bytesRemaining -= splitSize; } if (bytesRemaining != 0) { splits.add( new FileSplit( path, length - bytesRemaining, bytesRemaining, blkLocations[blkLocations.length - 1].getHosts())); } } else if (length != 0) { splits.add(new FileSplit(path, 0, length, blkLocations[0].getHosts())); } else { // Create empty hosts array for zero length files splits.add(new FileSplit(path, 0, length, new String[0])); } } System.out.println("Total # of splits: " + splits.size()); // LOG.debug("Total # of splits: " + splits.size()); return splits; }
/**
 * Creates a map-only job that runs {@code ObfuscateMapper} over text input.
 *
 * <p>The secrets and salt file locations are passed to the mapper through the job configuration.
 * Any existing content at {@code outputPath} is deleted recursively before the job is returned.
 *
 * @param secretsPath location stored under {@code ObfuscateMapper.SECRET_WORDS_FILE_KEY}
 * @param saltFilePath location stored under {@code ObfuscateMapper.SALT_FILE_KEY}
 * @param inputPath job input path
 * @param outputPath job output path; removed if it already exists
 * @return the configured, not yet submitted job
 * @throws Exception if the job cannot be created or the output path cannot be deleted
 */
private Job configureJob(Path secretsPath, Path saltFilePath, Path inputPath, Path outputPath)
    throws Exception {
    final Job obfuscation = Job.getInstance(getConf());

    // Input and output locations.
    FileInputFormat.setInputPaths(obfuscation, inputPath);
    FileOutputFormat.setOutputPath(obfuscation, outputPath);

    // Side files the mapper reads.
    final String secretsLocation = secretsPath.toString();
    final String saltLocation = saltFilePath.toString();
    obfuscation.getConfiguration().set(ObfuscateMapper.SECRET_WORDS_FILE_KEY, secretsLocation);
    obfuscation.getConfiguration().set(ObfuscateMapper.SALT_FILE_KEY, saltLocation);

    // Map-only pipeline over text lines.
    obfuscation.setInputFormatClass(TextInputFormat.class);
    obfuscation.setMapperClass(ObfuscateMapper.class);
    obfuscation.setNumReduceTasks(0);
    obfuscation.setJarByClass(getClass());

    // Clear any previous output.
    final FileSystem outputFs = FileSystem.get(outputPath.toUri(), getConf());
    outputFs.delete(outputPath, true);

    return obfuscation;
}
@Override protected boolean isSplitable(JobContext context, Path file) { String fname = file.getName().toLowerCase(); //noinspection SimplifiableIfStatementf,RedundantIfStatement if (fname.endsWith(".gz")) return false; return true; }
protected boolean isPathAcceptable(final Path pPath1) { String path = pPath1.toString().toLowerCase(); if (path.startsWith("part-r-")) return true; String extension = getExtension(); if (extension != null && path.endsWith(extension.toLowerCase())) return true; if (extension != null && path.endsWith(extension.toLowerCase() + ".gz")) return true; //noinspection SimplifiableIfStatement,RedundantIfStatement if (extension == null) return true; return false; }
/**
 * Counts the non-empty lines in the secrets file.
 *
 * @param secretsPath location of the secrets file
 * @return the number of lines whose length is greater than zero
 * @throws Exception if the file cannot be opened or read
 */
private int getNumSecrets(Path secretsPath) throws Exception {
    FileSystem fileSystem = FileSystem.get(secretsPath.toUri(), getConf());
    FSDataInputStream inputStream = fileSystem.open(secretsPath);
    BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream));
    try {
        int numLines = 0;
        String currentLine;
        while ((currentLine = reader.readLine()) != null) {
            if (!currentLine.isEmpty()) {
                numLines++;
            }
        }
        return numLines;
    } finally {
        // Closing the reader also closes the underlying stream, even when reading fails.
        reader.close();
    }
}
/**
 * Writes a salt file with one BCrypt salt per secret, unless the file already exists.
 *
 * <p>The number of salts equals the value returned by {@code getNumSecrets(secretsPath)}; each
 * salt is written on its own line.
 *
 * @param saltFilePath where the salt file lives or should be created
 * @param secretsPath location of the secrets file used to size the salt file
 * @throws Exception if the secrets file cannot be read or the salt file cannot be written
 */
private void generateSaltIfNeeded(Path saltFilePath, Path secretsPath) throws Exception {
    FileSystem fileSystem = FileSystem.get(saltFilePath.toUri(), getConf());
    if (fileSystem.exists(saltFilePath)) {
        return;
    }

    // Count the secrets BEFORE creating the salt file: if counting fails after the file has
    // been created, an empty salt file is left behind and later runs skip generation.
    int numSaltsToGenerate = getNumSecrets(secretsPath);
    System.out.printf("Generating %d salts\n", numSaltsToGenerate);

    FSDataOutputStream outputStream = fileSystem.create(saltFilePath);
    BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(outputStream));
    try {
        for (int i = 0; i < numSaltsToGenerate; i++) {
            writer.write(BCrypt.gensalt());
            writer.newLine();
        }
    } finally {
        // Closing the writer flushes it and closes the underlying output stream.
        writer.close();
    }
}