private void generateData(String mrIncWorkingPathStr, String rowId, String recordId, String value) throws IOException {
  Path path = new Path(new Path(mrIncWorkingPathStr), "new");
  Writer writer = new SequenceFile.Writer(miniCluster.getFileSystem(), conf,
      new Path(path, UUID.randomUUID().toString()), Text.class, BlurRecord.class);

  BlurRecord blurRecord = new BlurRecord();
  blurRecord.setRowId(rowId);
  blurRecord.setRecordId(recordId);
  blurRecord.setFamily("fam0");
  blurRecord.addColumn("col0", value);

  writer.append(new Text(rowId), blurRecord);
  writer.close();
}
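
/**
 * Walks the tree rooted at srcDir breadth-first, bucketing the files in each directory that are small enough to
 * crush. The bucket-to-input-file mapping and the bucket-to-reduce-partition mapping are written to SequenceFiles,
 * and the counters gathered during the scan are serialized to a third file, all under the temporary "in" directory.
 */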
void writeDirs() throws IOException {

  print(Verbosity.INFO, "\n\nUsing temporary directory " + tmpDir.toUri().getPath());

  FileStatus status = fs.getFileStatus(srcDir);

  Path tmpIn = new Path(tmpDir, "in");

  bucketFiles = new Path(tmpIn, "dirs");
  partitionMap = new Path(tmpIn, "partition-map");
  counters = new Path(tmpIn, "counters");

  skippedFiles = new HashSet<String>();

  /*
   * Prefer the path returned by the status because it is always fully qualified.
   */
  List<Path> dirs = asList(status.getPath());

  Text key = new Text();
  Text value = new Text();

  Writer writer = SequenceFile.createWriter(fs, job, bucketFiles, Text.class, Text.class, CompressionType.BLOCK);

  int numPartitions = Integer.parseInt(job.get("mapred.reduce.tasks"));

  Bucketer partitionBucketer = new Bucketer(numPartitions, 0, false);
  partitionBucketer.reset("partition-map");

  jobCounters = new Counters();

  try {
    while (!dirs.isEmpty()) {
      List<Path> nextLevel = new LinkedList<Path>();

      for (Path dir : dirs) {
        jobCounters.incrCounter(MapperCounter.DIRS_FOUND, 1);

        print(Verbosity.INFO, "\n\n" + dir.toUri().getPath());

        FileStatus[] contents = fs.listStatus(dir, new PathFilter() {
          @Override
          public boolean accept(Path testPath) {
            if (ignoredFiles == null) return true;

            ignoredFiles.reset(testPath.toUri().getPath());

            return !ignoredFiles.matches();
          }
        });

        if (contents == null || contents.length == 0) {
          print(Verbosity.INFO, " is empty");

          jobCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);
        } else {
          List<FileStatus> crushables = new ArrayList<FileStatus>(contents.length);
          Set<String> uncrushedFiles = new HashSet<String>(contents.length);

          long crushableBytes = 0;

          /*
           * Queue sub directories for subsequent inspection and examine the files in this directory.
           */
          for (FileStatus content : contents) {
            Path path = content.getPath();

            if (content.isDir()) {
              nextLevel.add(path);
            } else {
              boolean changed = uncrushedFiles.add(path.toUri().getPath());
              assert changed : path.toUri().getPath();

              long fileLength = content.getLen();

              if (fileLength <= maxEligibleSize) {
                crushables.add(content);
                crushableBytes += fileLength;
              }
            }
          }

          /*
           * We found a directory with data in it. Make sure we know how to name the crush output file and then
           * increment the number of files we found.
           */
          if (!uncrushedFiles.isEmpty()) {
            if (-1 == findMatcher(dir)) {
              throw new IllegalArgumentException("Could not find matching regex for directory: " + dir);
            }

            jobCounters.incrCounter(MapperCounter.FILES_FOUND, uncrushedFiles.size());
          }

          if (0 == crushableBytes) {
            print(Verbosity.INFO, " has no crushable files");

            jobCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);
          } else {
            /*
             * We found files to consider for crushing.
             */
            long nBlocks = crushableBytes / dfsBlockSize;

            if (nBlocks * dfsBlockSize != crushableBytes) {
              nBlocks++;
            }

            /*
             * maxFileBlocks will be huge in v1 mode, which will lead to one bucket per directory.
             */
            long dirBuckets = nBlocks / maxFileBlocks;

            if (dirBuckets * maxFileBlocks != nBlocks) {
              dirBuckets++;
            }

            if (dirBuckets > Integer.MAX_VALUE) {
              throw new AssertionError("Too many buckets: " + dirBuckets);
            }

            Bucketer directoryBucketer = new Bucketer((int) dirBuckets, excludeSingleFileDirs);
            directoryBucketer.reset(getPathPart(dir));

            for (FileStatus file : crushables) {
              directoryBucketer.add(new FileStatusHasSize(file));
            }

            List<Bucket> crushFiles = directoryBucketer.createBuckets();

            if (crushFiles.isEmpty()) {
              jobCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);
            } else {
              nBuckets += crushFiles.size();

              jobCounters.incrCounter(MapperCounter.DIRS_ELIGIBLE, 1);

              print(Verbosity.INFO, " => " + crushFiles.size() + " output files");

              /*
               * Write out the mapping between a bucket and a file.
               */
              for (Bucket crushFile : crushFiles) {
                String bucketId = crushFile.name();

                List<String> bucketFiles = crushFile.contents();

                print(Verbosity.INFO, format("\n Output %s will include %,d input bytes from %,d files",
                    bucketId, crushFile.size(), bucketFiles.size()));

                key.set(bucketId);

                for (String f : bucketFiles) {
                  boolean changed = uncrushedFiles.remove(f);
                  assert changed : f;

                  pathMatcher.reset(f);
                  pathMatcher.matches();

                  value.set(pathMatcher.group(5));

                  writer.append(key, value);

                  /*
                   * Print the input file with four leading spaces.
                   */
                  print(Verbosity.VERBOSE, "\n    " + f);
                }

                jobCounters.incrCounter(MapperCounter.FILES_ELIGIBLE, bucketFiles.size());

                partitionBucketer.add(crushFile);
              }
            }
          }

          if (!uncrushedFiles.isEmpty()) {
            print(Verbosity.INFO, "\n\n Skipped " + uncrushedFiles.size() + " files");

            for (String uncrushed : uncrushedFiles) {
              print(Verbosity.VERBOSE, "\n " + uncrushed);
            }

            jobCounters.incrCounter(MapperCounter.FILES_SKIPPED, uncrushedFiles.size());
          }

          skippedFiles.addAll(uncrushedFiles);
        }
      }

      dirs = nextLevel;
    }
  } finally {
    try {
      writer.close();
    } catch (Exception e) {
      LOG.error("Trapped exception during close: " + bucketFiles, e);
    }
  }

  /*
   * Now that we have processed all the directories, write the partition map.
   */
  List<Bucket> partitions = partitionBucketer.createBuckets();
  assert partitions.size() <= numPartitions;

  writer = SequenceFile.createWriter(fs, job, partitionMap, Text.class, IntWritable.class);
  IntWritable partNum = new IntWritable();

  try {
    for (Bucket partition : partitions) {
      String partitionName = partition.name();

      partNum.set(Integer.parseInt(partitionName.substring(partitionName.lastIndexOf('-') + 1)));

      for (String bucketId : partition.contents()) {
        key.set(bucketId);

        writer.append(key, partNum);
      }
    }
  } finally {
    try {
      writer.close();
    } catch (Exception e) {
      LOG.error("Trapped exception during close: " + partitionMap, e);
    }
  }

  DataOutputStream countersStream = fs.create(this.counters);

  try {
    jobCounters.write(countersStream);
  } finally {
    try {
      countersStream.close();
    } catch (Exception e) {
      LOG.error("Trapped exception during close: " + counters, e);
    }
  }
}