@Override
public Closeable createOutputStream(String hdfsPath, HdfsConfiguration configuration) {
    try {
        Closeable rout;
        HdfsInfo hdfsInfo = HdfsInfoFactory.newHdfsInfo(hdfsPath);
        Class<?> keyWritableClass = configuration.getKeyType().getWritableClass();
        Class<?> valueWritableClass = configuration.getValueType().getWritableClass();
        rout = SequenceFile.createWriter(hdfsInfo.getConf(),
                Writer.file(hdfsInfo.getPath()),
                Writer.keyClass(keyWritableClass),
                Writer.valueClass(valueWritableClass),
                Writer.bufferSize(configuration.getBufferSize()),
                Writer.replication(configuration.getReplication()),
                Writer.blockSize(configuration.getBlockSize()),
                Writer.compression(configuration.getCompressionType(),
                        configuration.getCompressionCodec().getCodec()),
                Writer.progressable(new Progressable() {
                    @Override
                    public void progress() {
                    }
                }),
                Writer.metadata(new SequenceFile.Metadata()));
        return rout;
    } catch (IOException ex) {
        throw new RuntimeCamelException(ex);
    }
}
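The factory above hands the writer back as a plain Closeable; a caller that wants to append records casts it back to SequenceFile.Writer, as the append(...) method later in this section does. A minimal usage sketch, assuming the path and configuration values are placeholders and that the configured key/value types are IntWritable and Text:

// Illustrative only: the path and configuration here are placeholders, and the
// key/value classes must match what the HdfsConfiguration declared.
Closeable out = createOutputStream("hdfs://namenode/tmp/example.seq", configuration);
SequenceFile.Writer writer = (SequenceFile.Writer) out;
writer.append(new IntWritable(1), new Text("value-1"));
writer.close();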
private void populateFile() throws IOException {
    IntWritable key = new IntWritable();
    Text value = new Text();
    try (Writer writer = SequenceFile.createWriter(new Configuration(),
            Writer.keyClass(IntWritable.class),
            Writer.valueClass(Text.class),
            Writer.file(new Path(this.inputFile.toURI())))) {
        for (int i = 0; i < 5; i++) {
            key.set(i);
            value.set("value-" + i);
            writer.append(key, value);
        }
    }
}
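To verify what populateFile() wrote, a test can read the file back with the matching reader options. A minimal sketch, assuming the same inputFile field; the assertion style is illustrative:

// Illustrative read-back of the file written by populateFile().
IntWritable key = new IntWritable();
Text value = new Text();
try (SequenceFile.Reader reader = new SequenceFile.Reader(new Configuration(),
        SequenceFile.Reader.file(new Path(this.inputFile.toURI())))) {
    int i = 0;
    while (reader.next(key, value)) {
        // Expect the pairs written above: (0, "value-0") ... (4, "value-4").
        assert key.get() == i && value.toString().equals("value-" + i);
        i++;
    }
}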
@Override
public long append(HdfsOutputStream hdfsostr, Object key, Object value, TypeConverter typeConverter) {
    try {
        Holder<Integer> keySize = new Holder<Integer>();
        Writable keyWritable = getWritable(key, typeConverter, keySize);
        Holder<Integer> valueSize = new Holder<Integer>();
        Writable valueWritable = getWritable(value, typeConverter, valueSize);
        Writer writer = (SequenceFile.Writer) hdfsostr.getOut();
        writer.append(keyWritable, valueWritable);
        writer.sync();
        return keySize.value + valueSize.value;
    } catch (Exception ex) {
        throw new RuntimeCamelException(ex);
    }
}
private void generateData(String mrIncWorkingPathStr, String rowId, String recordId, String value) throws IOException {
    Path path = new Path(new Path(mrIncWorkingPathStr), "new");
    Writer writer = new SequenceFile.Writer(miniCluster.getFileSystem(), conf,
            new Path(path, UUID.randomUUID().toString()), Text.class, BlurRecord.class);
    BlurRecord blurRecord = new BlurRecord();
    blurRecord.setRowId(rowId);
    blurRecord.setRecordId(recordId);
    blurRecord.setFamily("fam0");
    blurRecord.addColumn("col0", value);
    writer.append(new Text(rowId), blurRecord);
    writer.close();
}
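The SequenceFile.Writer constructor used above is deprecated in current Hadoop releases. A hedged sketch of the equivalent write with the option-based factory, assuming conf resolves to the mini cluster's file system and that BlurRecord is a Writable (it must be, since it is appended as a value above):

// Sketch only: same write via the non-deprecated option-based factory.
Path out = new Path(path, UUID.randomUUID().toString());
try (SequenceFile.Writer writer = SequenceFile.createWriter(conf,
        SequenceFile.Writer.file(out),
        SequenceFile.Writer.keyClass(Text.class),
        SequenceFile.Writer.valueClass(BlurRecord.class))) {
    writer.append(new Text(rowId), blurRecord);
}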
@Test
public void testProcessOutput() throws Exception {
    Configuration conf = getConfiguration();
    conf.setInt("mapred.map.tasks", NUM_MAPS);

    Random rng = RandomUtils.getRandom();

    // prepare the output
    TreeID[] keys = new TreeID[NUM_TREES];
    MapredOutput[] values = new MapredOutput[NUM_TREES];
    int[] firstIds = new int[NUM_MAPS];
    randomKeyValues(rng, keys, values, firstIds);

    // store the output in a sequence file
    Path base = getTestTempDirPath("testdata");
    FileSystem fs = base.getFileSystem(conf);
    Path outputFile = new Path(base, "PartialBuilderTest.seq");

    Writer writer = SequenceFile.createWriter(fs, conf, outputFile, TreeID.class, MapredOutput.class);
    try {
        for (int index = 0; index < NUM_TREES; index++) {
            writer.append(keys[index], values[index]);
        }
    } finally {
        Closeables.close(writer, false);
    }

    // load the output and make sure it's valid
    TreeID[] newKeys = new TreeID[NUM_TREES];
    Node[] newTrees = new Node[NUM_TREES];
    PartialBuilder.processOutput(new Job(conf), base, newKeys, newTrees);

    // check the forest
    for (int tree = 0; tree < NUM_TREES; tree++) {
        assertEquals(values[tree].getTree(), newTrees[tree]);
    }
    assertTrue("keys not equal", Arrays.deepEquals(keys, newKeys));
}
void writeDirs() throws IOException {

    print(Verbosity.INFO, "\n\nUsing temporary directory " + tmpDir.toUri().getPath());

    FileStatus status = fs.getFileStatus(srcDir);

    Path tmpIn = new Path(tmpDir, "in");

    bucketFiles = new Path(tmpIn, "dirs");
    partitionMap = new Path(tmpIn, "partition-map");
    counters = new Path(tmpIn, "counters");

    skippedFiles = new HashSet<String>();

    /*
     * Prefer the path returned by the status because it is always fully qualified.
     */
    List<Path> dirs = asList(status.getPath());

    Text key = new Text();
    Text value = new Text();

    Writer writer = SequenceFile.createWriter(fs, job, bucketFiles, Text.class, Text.class, CompressionType.BLOCK);

    int numPartitions = Integer.parseInt(job.get("mapred.reduce.tasks"));

    Bucketer partitionBucketer = new Bucketer(numPartitions, 0, false);
    partitionBucketer.reset("partition-map");

    jobCounters = new Counters();

    try {
        while (!dirs.isEmpty()) {
            List<Path> nextLevel = new LinkedList<Path>();

            for (Path dir : dirs) {
                jobCounters.incrCounter(MapperCounter.DIRS_FOUND, 1);

                print(Verbosity.INFO, "\n\n" + dir.toUri().getPath());

                FileStatus[] contents = fs.listStatus(dir, new PathFilter() {
                    @Override
                    public boolean accept(Path testPath) {
                        if (ignoredFiles == null) {
                            return true;
                        }
                        ignoredFiles.reset(testPath.toUri().getPath());
                        return !ignoredFiles.matches();
                    }
                });

                if (contents == null || contents.length == 0) {
                    print(Verbosity.INFO, " is empty");

                    jobCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);
                } else {
                    List<FileStatus> crushables = new ArrayList<FileStatus>(contents.length);
                    Set<String> uncrushedFiles = new HashSet<String>(contents.length);

                    long crushableBytes = 0;

                    /*
                     * Queue sub directories for subsequent inspection and examine the files in this directory.
                     */
                    for (FileStatus content : contents) {
                        Path path = content.getPath();

                        if (content.isDir()) {
                            nextLevel.add(path);
                        } else {
                            boolean changed = uncrushedFiles.add(path.toUri().getPath());
                            assert changed : path.toUri().getPath();

                            long fileLength = content.getLen();

                            if (fileLength <= maxEligibleSize) {
                                crushables.add(content);
                                crushableBytes += fileLength;
                            }
                        }
                    }

                    /*
                     * We found a directory with data in it. Make sure we know how to name the crush output file and then
                     * increment the number of files we found.
                     */
                    if (!uncrushedFiles.isEmpty()) {
                        if (-1 == findMatcher(dir)) {
                            throw new IllegalArgumentException("Could not find matching regex for directory: " + dir);
                        }

                        jobCounters.incrCounter(MapperCounter.FILES_FOUND, uncrushedFiles.size());
                    }

                    if (0 == crushableBytes) {
                        print(Verbosity.INFO, " has no crushable files");

                        jobCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);
                    } else {
                        /*
                         * We found files to consider for crushing.
                         */
                        long nBlocks = crushableBytes / dfsBlockSize;

                        if (nBlocks * dfsBlockSize != crushableBytes) {
                            nBlocks++;
                        }

                        /*
                         * maxFileBlocks will be huge in v1 mode, which will lead to one bucket per directory.
                         */
                        long dirBuckets = nBlocks / maxFileBlocks;

                        if (dirBuckets * maxFileBlocks != nBlocks) {
                            dirBuckets++;
                        }

                        if (dirBuckets > Integer.MAX_VALUE) {
                            throw new AssertionError("Too many buckets: " + dirBuckets);
                        }

                        Bucketer directoryBucketer = new Bucketer((int) dirBuckets, excludeSingleFileDirs);
                        directoryBucketer.reset(getPathPart(dir));

                        for (FileStatus file : crushables) {
                            directoryBucketer.add(new FileStatusHasSize(file));
                        }

                        List<Bucket> crushFiles = directoryBucketer.createBuckets();

                        if (crushFiles.isEmpty()) {
                            jobCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);
                        } else {
                            nBuckets += crushFiles.size();

                            jobCounters.incrCounter(MapperCounter.DIRS_ELIGIBLE, 1);

                            print(Verbosity.INFO, " => " + crushFiles.size() + " output files");

                            /*
                             * Write out the mapping between a bucket and a file.
                             */
                            for (Bucket crushFile : crushFiles) {
                                String bucketId = crushFile.name();

                                List<String> bucketFiles = crushFile.contents();

                                print(Verbosity.INFO,
                                        format("\n Output %s will include %,d input bytes from %,d files",
                                                bucketId, crushFile.size(), bucketFiles.size()));

                                key.set(bucketId);

                                for (String f : bucketFiles) {
                                    boolean changed = uncrushedFiles.remove(f);
                                    assert changed : f;

                                    pathMatcher.reset(f);
                                    pathMatcher.matches();

                                    value.set(pathMatcher.group(5));

                                    writer.append(key, value);

                                    /*
                                     * Print the input file with four leading spaces.
                                     */
                                    print(Verbosity.VERBOSE, "\n    " + f);
                                }

                                jobCounters.incrCounter(MapperCounter.FILES_ELIGIBLE, bucketFiles.size());

                                partitionBucketer.add(crushFile);
                            }
                        }
                    }

                    if (!uncrushedFiles.isEmpty()) {
                        print(Verbosity.INFO, "\n\n Skipped " + uncrushedFiles.size() + " files");

                        for (String uncrushed : uncrushedFiles) {
                            print(Verbosity.VERBOSE, "\n " + uncrushed);
                        }

                        jobCounters.incrCounter(MapperCounter.FILES_SKIPPED, uncrushedFiles.size());
                    }

                    skippedFiles.addAll(uncrushedFiles);
                }
            }

            dirs = nextLevel;
        }
    } finally {
        try {
            writer.close();
        } catch (Exception e) {
            LOG.error("Trapped exception during close: " + bucketFiles, e);
        }
    }

    /*
     * Now that we have processed all the directories, write the partition map.
     */
    List<Bucket> partitions = partitionBucketer.createBuckets();
    assert partitions.size() <= numPartitions;

    writer = SequenceFile.createWriter(fs, job, partitionMap, Text.class, IntWritable.class);
    IntWritable partNum = new IntWritable();

    try {
        for (Bucket partition : partitions) {
            String partitionName = partition.name();
            partNum.set(Integer.parseInt(partitionName.substring(partitionName.lastIndexOf('-') + 1)));

            for (String bucketId : partition.contents()) {
                key.set(bucketId);
                writer.append(key, partNum);
            }
        }
    } finally {
        try {
            writer.close();
        } catch (Exception e) {
            LOG.error("Trapped exception during close: " + partitionMap, e);
        }
    }

    DataOutputStream countersStream = fs.create(this.counters);

    try {
        jobCounters.write(countersStream);
    } finally {
        try {
            countersStream.close();
        } catch (Exception e) {
            LOG.error("Trapped exception during close: " + counters, e);
        }
    }
}
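The partition map written at the end of writeDirs() pairs each bucket id (Text) with a reducer number (IntWritable). A hedged sketch of how such a map could be loaded back into memory, for example by a partitioner; the collection and variable names are assumptions, not the project's actual API:

// Illustrative loader for the bucket-id -> partition-number map written above.
Map<String, Integer> bucketToPartition = new HashMap<String, Integer>();
Text bucket = new Text();
IntWritable part = new IntWritable();

SequenceFile.Reader reader = new SequenceFile.Reader(fs, partitionMap, job);
try {
    while (reader.next(bucket, part)) {
        bucketToPartition.put(bucket.toString(), part.get());
    }
} finally {
    reader.close();
}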