@Override public InputSplit[] getSplits(JobConf conf, int numSplits) throws IOException { FileSplit[] splits = (FileSplit[]) super.getSplits(conf, numSplits); // Find new starts/ends of the filesplit that align with the LZO blocks. List<FileSplit> result = new ArrayList<FileSplit>(); for (FileSplit fileSplit : splits) { Path file = fileSplit.getPath(); FileSystem fs = file.getFileSystem(conf); LzoIndex index = indexes.get(file); if (index == null) { throw new IOException("Index not found for " + file); } if (index.isEmpty()) { // Empty index, keep it as is. result.add(fileSplit); continue; } long start = fileSplit.getStart(); long end = start + fileSplit.getLength(); long lzoStart = index.alignSliceStartToIndex(start, end); long lzoEnd = index.alignSliceEndToIndex(end, fs.getFileStatus(file).getLen()); if (lzoStart != LzoIndex.NOT_FOUND && lzoEnd != LzoIndex.NOT_FOUND) { result.add(new FileSplit(file, lzoStart, lzoEnd - lzoStart, fileSplit.getLocations())); } } return result.toArray(new FileSplit[result.size()]); }
/** * Generate random data, compress it, index and md5 hash the data. Then read it all back and md5 * that too, to verify that it all went ok. * * @param testWithIndex Should we index or not? * @param charsToOutput How many characters of random data should we output. * @throws IOException * @throws NoSuchAlgorithmException * @throws InterruptedException */ private void runTest(boolean testWithIndex, int charsToOutput) throws IOException, NoSuchAlgorithmException, InterruptedException { Configuration conf = new Configuration(); conf.setLong("fs.local.block.size", charsToOutput / 2); // reducing block size to force a split of the tiny file conf.set("io.compression.codecs", LzopCodec.class.getName()); Assume.assumeTrue(CoreTestUtil.okToRunLzoTests(conf)); FileSystem.getLocal(conf).close(); // remove cached filesystem (if any) FileSystem localFs = FileSystem.getLocal(conf); localFs.delete(outputDir_, true); localFs.mkdirs(outputDir_); Job job = new Job(conf); TextOutputFormat.setCompressOutput(job, true); TextOutputFormat.setOutputCompressorClass(job, LzopCodec.class); TextOutputFormat.setOutputPath(job, outputDir_); TaskAttemptContext attemptContext = new TaskAttemptContext(job.getConfiguration(), new TaskAttemptID("123", 0, false, 1, 2)); // create some input data byte[] expectedMd5 = createTestInput(outputDir_, localFs, attemptContext, charsToOutput); if (testWithIndex) { Path lzoFile = new Path(outputDir_, lzoFileName_); LzoIndex.createIndex(localFs, lzoFile); } LzoTextInputFormat inputFormat = new LzoTextInputFormat(); TextInputFormat.setInputPaths(job, outputDir_); List<InputSplit> is = inputFormat.getSplits(job); // verify we have the right number of lzo chunks if (testWithIndex && OUTPUT_BIG == charsToOutput) { assertEquals(3, is.size()); } else { assertEquals(1, is.size()); } // let's read it all and calculate the md5 hash for (InputSplit inputSplit : is) { RecordReader<LongWritable, Text> rr = inputFormat.createRecordReader(inputSplit, attemptContext); 
rr.initialize(inputSplit, attemptContext); while (rr.nextKeyValue()) { Text value = rr.getCurrentValue(); md5_.update(value.getBytes(), 0, value.getLength()); } rr.close(); } localFs.close(); assertTrue(Arrays.equals(expectedMd5, md5_.digest())); }
@Override protected FileStatus[] listStatus(JobConf conf) throws IOException { List<FileStatus> files = new ArrayList<FileStatus>(Arrays.asList(super.listStatus(conf))); String fileExtension = new LzopCodec().getDefaultExtension(); Iterator<FileStatus> it = files.iterator(); while (it.hasNext()) { FileStatus fileStatus = it.next(); Path file = fileStatus.getPath(); if (!file.toString().endsWith(fileExtension)) { // Get rid of non-LZO files. it.remove(); } else { FileSystem fs = file.getFileSystem(conf); LzoIndex index = LzoIndex.readIndex(fs, file); indexes.put(file, index); } } return files.toArray(new FileStatus[] {}); }
/**
 * Checks {@link LzoIndex} against a small hand-built index: emptiness, next-position lookup and
 * the alignment of slice starts and ends.
 */
@Test
public void testLzoIndex() {
  // An index created with the no-argument constructor reports itself as empty.
  LzoIndex emptyIndex = new LzoIndex();
  assertTrue(emptyIndex.isEmpty());

  // Populate a four-entry index with the positions 0, 5, 10 and 15.
  int[] positions = {0, 5, 10, 15};
  LzoIndex index = new LzoIndex(positions.length);
  for (int slot = 0; slot < positions.length; slot++) {
    index.set(slot, positions[slot]);
  }
  assertFalse(index.isEmpty());

  // findNextPosition: values expected for offsets before, between, on and past the entries.
  assertEquals(0, index.findNextPosition(-1));
  assertEquals(5, index.findNextPosition(1));
  assertEquals(5, index.findNextPosition(5));
  assertEquals(15, index.findNextPosition(11));
  assertEquals(15, index.findNextPosition(15));
  assertEquals(-1, index.findNextPosition(16));

  // alignSliceStartToIndex: expected aligned starts for a slice ending at 20.
  assertEquals(5, index.alignSliceStartToIndex(3, 20));
  assertEquals(15, index.alignSliceStartToIndex(15, 20));

  // alignSliceEndToIndex: expected aligned ends, including the fallback to the second argument.
  assertEquals(10, index.alignSliceEndToIndex(8, 30));
  assertEquals(10, index.alignSliceEndToIndex(10, 30));
  assertEquals(30, index.alignSliceEndToIndex(17, 30));

  // A start past the last entry cannot be aligned.
  assertEquals(LzoIndex.NOT_FOUND, index.alignSliceStartToIndex(16, 20));
}
/**
 * Reports whether the given file may be divided into several input splits.
 *
 * <p>A file is splittable only when a non-empty {@link LzoIndex} is registered for it in
 * {@code indexes}. A file with no registered index is reported as not splittable instead of
 * failing with a {@link NullPointerException}.
 *
 * @param fs file system of the file (not used by this implementation)
 * @param filename path of the file to check
 * @return {@code true} if a non-empty index is registered for {@code filename}, otherwise
 *     {@code false}
 */
@Override
protected boolean isSplitable(FileSystem fs, Path filename) {
  LzoIndex index = indexes.get(filename);
  // A missing entry is treated like an empty index: without block offsets the file cannot be
  // split safely, and this method has no checked exception to report the problem with.
  return index != null && !index.isEmpty();
}