@Override
 public Closeable createOutputStream(String hdfsPath, HdfsConfiguration configuration) {
   try {
      HdfsInfo hdfsInfo = HdfsInfoFactory.newHdfsInfo(hdfsPath);
      Class<?> keyWritableClass = configuration.getKeyType().getWritableClass();
      Class<?> valueWritableClass = configuration.getValueType().getWritableClass();
      // Build the SequenceFile writer from the endpoint configuration.
      Closeable rout =
         SequenceFile.createWriter(
             hdfsInfo.getConf(),
             Writer.file(hdfsInfo.getPath()),
             Writer.keyClass(keyWritableClass),
             Writer.valueClass(valueWritableClass),
             Writer.bufferSize(configuration.getBufferSize()),
             Writer.replication(configuration.getReplication()),
             Writer.blockSize(configuration.getBlockSize()),
             Writer.compression(
                 configuration.getCompressionType(),
                 configuration.getCompressionCodec().getCodec()),
              // No-op Progressable; this writer has nothing to report.
              Writer.progressable(
                  new Progressable() {
                    @Override
                    public void progress() {}
                  }),
             Writer.metadata(new SequenceFile.Metadata()));
     return rout;
   } catch (IOException ex) {
     throw new RuntimeCamelException(ex);
   }
 }

  // Writes five sample (IntWritable, Text) pairs into a test SequenceFile.
  private void populateFile() throws IOException {
   IntWritable key = new IntWritable();
   Text value = new Text();
   try (Writer writer =
       SequenceFile.createWriter(
           new Configuration(),
           Writer.keyClass(IntWritable.class),
           Writer.valueClass(Text.class),
           Writer.file(new Path(this.inputFile.toURI())))) {
     for (int i = 0; i < 5; i++) {
       key.set(i);
       value.set("value-" + i);
       writer.append(key, value);
     }
   }
 }
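
A companion sketch (not part of the original example): reading the pairs written by populateFile() back with SequenceFile.Reader, assuming the same IntWritable/Text types and the same inputFile field.

  private void readFile() throws IOException {
    IntWritable key = new IntWritable();
    Text value = new Text();
    try (SequenceFile.Reader reader =
        new SequenceFile.Reader(
            new Configuration(), SequenceFile.Reader.file(new Path(this.inputFile.toURI())))) {
      // next() fills key and value, returning false at end of file.
      while (reader.next(key, value)) {
        System.out.println(key.get() + " => " + value);
      }
    }
  }
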
 @Override
 public long append(
     HdfsOutputStream hdfsostr, Object key, Object value, TypeConverter typeConverter) {
    try {
      // Convert the key and value to Writables; the holders capture the
      // serialized size of each.
      Holder<Integer> keySize = new Holder<>();
      Writable keyWritable = getWritable(key, typeConverter, keySize);
      Holder<Integer> valueSize = new Holder<>();
      Writable valueWritable = getWritable(value, typeConverter, valueSize);
      Writer writer = (SequenceFile.Writer) hdfsostr.getOut();
      writer.append(keyWritable, valueWritable);
      // Add a sync marker so the file stays splittable and recoverable.
      writer.sync();
     return keySize.value + valueSize.value;
   } catch (Exception ex) {
     throw new RuntimeCamelException(ex);
   }
 }
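
For reference, a minimal sketch of what the Holder type used above could look like (an assumption for illustration; the actual class comes from the surrounding project): a mutable wrapper that lets getWritable report the serialized size of each Writable as an out-parameter.

  public static final class Holder<T> {
    // Set by the callee; read above as keySize.value and valueSize.value.
    public T value;
  }
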
Example #4
  private void generateData(String mrIncWorkingPathStr, String rowId, String recordId, String value)
      throws IOException {
    Path path = new Path(new Path(mrIncWorkingPathStr), "new");
    // Write a single BlurRecord keyed by its row id; try-with-resources ensures
    // the writer is closed even if the append fails.
    try (Writer writer =
        new SequenceFile.Writer(
            miniCluster.getFileSystem(),
            conf,
            new Path(path, UUID.randomUUID().toString()),
            Text.class,
            BlurRecord.class)) {
      BlurRecord blurRecord = new BlurRecord();
      blurRecord.setRowId(rowId);
      blurRecord.setRecordId(recordId);
      blurRecord.setFamily("fam0");
      blurRecord.addColumn("col0", value);
      writer.append(new Text(rowId), blurRecord);
    }
  }
Example #5
  @Test
  public void testProcessOutput() throws Exception {
    Configuration conf = getConfiguration();
    conf.setInt("mapred.map.tasks", NUM_MAPS);

    Random rng = RandomUtils.getRandom();

    // prepare the output
    TreeID[] keys = new TreeID[NUM_TREES];
    MapredOutput[] values = new MapredOutput[NUM_TREES];
    int[] firstIds = new int[NUM_MAPS];
    randomKeyValues(rng, keys, values, firstIds);

    // store the output in a sequence file
    Path base = getTestTempDirPath("testdata");
    FileSystem fs = base.getFileSystem(conf);

    Path outputFile = new Path(base, "PartialBuilderTest.seq");
    Writer writer =
        SequenceFile.createWriter(fs, conf, outputFile, TreeID.class, MapredOutput.class);

    try {
      for (int index = 0; index < NUM_TREES; index++) {
        writer.append(keys[index], values[index]);
      }
    } finally {
      Closeables.close(writer, false);
    }

    // load the output and make sure it's valid
    TreeID[] newKeys = new TreeID[NUM_TREES];
    Node[] newTrees = new Node[NUM_TREES];

    PartialBuilder.processOutput(new Job(conf), base, newKeys, newTrees);

    // check the forest
    for (int tree = 0; tree < NUM_TREES; tree++) {
      assertEquals(values[tree].getTree(), newTrees[tree]);
    }

    assertTrue("keys not equal", Arrays.deepEquals(keys, newKeys));
  }
Example #6
  void writeDirs() throws IOException {

    print(Verbosity.INFO, "\n\nUsing temporary directory " + tmpDir.toUri().getPath());

    FileStatus status = fs.getFileStatus(srcDir);

    Path tmpIn = new Path(tmpDir, "in");

    bucketFiles = new Path(tmpIn, "dirs");
    partitionMap = new Path(tmpIn, "partition-map");
    counters = new Path(tmpIn, "counters");

    skippedFiles = new HashSet<String>();

    /*
     * Prefer the path returned by the status because it is always fully qualified.
     */
    List<Path> dirs = asList(status.getPath());

    Text key = new Text();
    Text value = new Text();

    Writer writer =
        SequenceFile.createWriter(
            fs, job, bucketFiles, Text.class, Text.class, CompressionType.BLOCK);

    int numPartitions = Integer.parseInt(job.get("mapred.reduce.tasks"));

    Bucketer partitionBucketer = new Bucketer(numPartitions, 0, false);
    partitionBucketer.reset("partition-map");

    jobCounters = new Counters();

    try {
      while (!dirs.isEmpty()) {
        List<Path> nextLevel = new LinkedList<Path>();

        for (Path dir : dirs) {
          jobCounters.incrCounter(MapperCounter.DIRS_FOUND, 1);

          print(Verbosity.INFO, "\n\n" + dir.toUri().getPath());

          FileStatus[] contents =
              fs.listStatus(
                  dir,
                  new PathFilter() {
                    @Override
                    public boolean accept(Path testPath) {
                      if (ignoredFiles == null) return true;
                      ignoredFiles.reset(testPath.toUri().getPath());
                      return !ignoredFiles.matches();
                    }
                  });

          if (contents == null || contents.length == 0) {
            print(Verbosity.INFO, " is empty");

            jobCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);
          } else {
            List<FileStatus> crushables = new ArrayList<FileStatus>(contents.length);
            Set<String> uncrushedFiles = new HashSet<String>(contents.length);

            long crushableBytes = 0;

            /*
             * Queue sub directories for subsequent inspection and examine the files in this directory.
             */
            for (FileStatus content : contents) {
              Path path = content.getPath();

              if (content.isDir()) {
                nextLevel.add(path);
              } else {
                boolean changed = uncrushedFiles.add(path.toUri().getPath());

                assert changed : path.toUri().getPath();

                long fileLength = content.getLen();

                if (fileLength <= maxEligibleSize) {
                  crushables.add(content);
                  crushableBytes += fileLength;
                }
              }
            }

            /*
             * We found a directory with data in it. Make sure we know how to name the crush output file and then increment the
             * number of files we found.
             */
            if (!uncrushedFiles.isEmpty()) {
              if (-1 == findMatcher(dir)) {
                throw new IllegalArgumentException(
                    "Could not find matching regex for directory: " + dir);
              }

              jobCounters.incrCounter(MapperCounter.FILES_FOUND, uncrushedFiles.size());
            }

            if (0 == crushableBytes) {
              print(Verbosity.INFO, " has no crushable files");

              jobCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);
            } else {
              /*
               * We found files to consider for crushing.
               */
              long nBlocks = crushableBytes / dfsBlockSize;

              if (nBlocks * dfsBlockSize != crushableBytes) {
                nBlocks++;
              }

              /*
               * maxFileBlocks will be huge in v1 mode, which will lead to one bucket per directory.
               */
              long dirBuckets = nBlocks / maxFileBlocks;

              if (dirBuckets * maxFileBlocks != nBlocks) {
                dirBuckets++;
              }

              if (dirBuckets > Integer.MAX_VALUE) {
                throw new AssertionError("Too many buckets: " + dirBuckets);
              }

              Bucketer directoryBucketer = new Bucketer((int) dirBuckets, excludeSingleFileDirs);

              directoryBucketer.reset(getPathPart(dir));

              for (FileStatus file : crushables) {
                directoryBucketer.add(new FileStatusHasSize(file));
              }

              List<Bucket> crushFiles = directoryBucketer.createBuckets();

              if (crushFiles.isEmpty()) {
                jobCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);
              } else {
                nBuckets += crushFiles.size();

                jobCounters.incrCounter(MapperCounter.DIRS_ELIGIBLE, 1);

                print(Verbosity.INFO, " => " + crushFiles.size() + " output files");

                /*
                 * Write out the mapping between a bucket and a file.
                 */
                for (Bucket crushFile : crushFiles) {
                  String bucketId = crushFile.name();

                  List<String> bucketFiles = crushFile.contents();

                  print(
                      Verbosity.INFO,
                      format(
                          "\n  Output %s will include %,d input bytes from %,d files",
                          bucketId, crushFile.size(), bucketFiles.size()));

                  key.set(bucketId);

                  for (String f : bucketFiles) {
                    boolean changed = uncrushedFiles.remove(f);

                    assert changed : f;

                    pathMatcher.reset(f);

                    pathMatcher.matches();

                    value.set(pathMatcher.group(5));

                    writer.append(key, value);

                    /*
                     * Print the input file with four leading spaces.
                     */
                    print(Verbosity.VERBOSE, "\n    " + f);
                  }

                  jobCounters.incrCounter(MapperCounter.FILES_ELIGIBLE, bucketFiles.size());

                  partitionBucketer.add(crushFile);
                }
              }
            }

            if (!uncrushedFiles.isEmpty()) {
              print(Verbosity.INFO, "\n\n  Skipped " + uncrushedFiles.size() + " files");

              for (String uncrushed : uncrushedFiles) {
                print(Verbosity.VERBOSE, "\n    " + uncrushed);
              }

              jobCounters.incrCounter(MapperCounter.FILES_SKIPPED, uncrushedFiles.size());
            }

            skippedFiles.addAll(uncrushedFiles);
          }
        }

        dirs = nextLevel;
      }
    } finally {
      try {
        writer.close();
      } catch (Exception e) {
        LOG.error("Trapped exception during close: " + bucketFiles, e);
      }
    }

    /*
     * Now that we have processed all the directories, write the partition map.
     */
    List<Bucket> partitions = partitionBucketer.createBuckets();

    assert partitions.size() <= numPartitions;

    writer = SequenceFile.createWriter(fs, job, partitionMap, Text.class, IntWritable.class);

    IntWritable partNum = new IntWritable();

    try {
      for (Bucket partition : partitions) {
        String partitionName = partition.name();

        partNum.set(Integer.parseInt(partitionName.substring(partitionName.lastIndexOf('-') + 1)));

        for (String bucketId : partition.contents()) {
          key.set(bucketId);

          writer.append(key, partNum);
        }
      }
    } finally {
      try {
        writer.close();
      } catch (Exception e) {
        LOG.error("Trapped exception during close: " + partitionMap, e);
      }
    }

    DataOutputStream countersStream = fs.create(this.counters);

    try {
      jobCounters.write(countersStream);
    } finally {
      try {
        countersStream.close();
      } catch (Exception e) {
        LOG.error("Trapped exception during close: " + partitionMap, e);
      }
    }
  }