/**
 * Writes out a SequenceFile that can be read by TotalOrderPartitioner, containing the split
 * points in startKeys.
 *
 * <p>This method was copied from HFileOutputFormat in hbase-0.90.1-cdh3u0. I had to copy it
 * because it's private.
 *
 * @param conf The job configuration.
 * @param partitionsPath Output path for the SequenceFile.
 * @param startKeys The region start keys to use as the partitions.
 * @throws IOException If there is an error.
 */
private static void writePartitionFile(
    Configuration conf, Path partitionsPath, List<HFileKeyValue> startKeys)
    throws IOException {
  if (startKeys.isEmpty()) {
    throw new IllegalArgumentException("No regions passed");
  }

  // We're generating a list of split points, and we don't ever have keys < the first region
  // (which has an empty start key), so we need to remove it. Otherwise we would end up with
  // an empty reducer with index 0.
  TreeSet<HFileKeyValue> sorted = new TreeSet<HFileKeyValue>();
  sorted.addAll(startKeys);

  HFileKeyValue first = sorted.first();
  if (0 != first.getRowKey().length) {
    throw new IllegalArgumentException(
        "First region of table should have empty start row key. Instead has: "
        + Bytes.toStringBinary(first.getRowKey()));
  }
  sorted.remove(first);

  // Write the actual file.
  final SequenceFile.Writer writer = KijiMRPlatformBridge.get()
      .newSeqFileWriter(conf, partitionsPath, HFileKeyValue.class, NullWritable.class);
  try {
    for (HFileKeyValue startKey : sorted) {
      writer.append(startKey, NullWritable.get());
    }
  } finally {
    writer.close();
  }
}
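// A minimal sketch (not part of the original class) of how the partition file written above is
// typically consumed: TotalOrderPartitioner is pointed at the SequenceFile so that reducer i
// receives the key range [split[i-1], split[i]). The method name and job/path wiring here are
// illustrative assumptions.
private static void configureTotalOrderPartitioner(
    Job job, Path partitionsPath, List<HFileKeyValue> startKeys) throws IOException {
  writePartitionFile(job.getConfiguration(), partitionsPath, startKeys);
  // Hadoop's TotalOrderPartitioner reads the split points from this file at task setup.
  TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionsPath);
  job.setPartitionerClass(TotalOrderPartitioner.class);
}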
/**
 * Generates a list of start keys (one per region). Since we know that the row keys in Kiji are
 * byte strings of length 16, we can reliably split them evenly.
 *
 * @param numRegions The number of regions to generate start keys for.
 * @return A list of start keys with size equal to <code>numRegions</code>.
 */
private static List<HFileKeyValue> generateEvenStartKeys(int numRegions) {
  List<HFileKeyValue> startKeys = new ArrayList<HFileKeyValue>(numRegions);

  // The first key is a special case: it must be empty.
  startKeys.add(HFileKeyValue.createFromRowKey(HConstants.EMPTY_BYTE_ARRAY));

  if (numRegions > 1) {
    byte[][] splitKeys = KijiRowKeySplitter.get().getSplitKeys(numRegions);
    for (byte[] hbaseRowKey : splitKeys) {
      startKeys.add(HFileKeyValue.createFromRowKey(hbaseRowKey));
    }
  }
  return startKeys;
}
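// Hypothetical sketch of how even split points can be computed over a uniformly hashed
// keyspace, analogous to what KijiRowKeySplitter.getSplitKeys(numRegions) provides above.
// Assumption: row keys are uniformly distributed hashes, so dividing the range of the two
// most significant bytes into equal intervals yields balanced regions.
private static byte[][] computeEvenSplits(int numRegions) {
  final byte[][] splits = new byte[numRegions - 1][];
  for (int i = 1; i < numRegions; i++) {
    // Place split i at fraction i/numRegions of the 16-bit prefix space.
    final int boundary = (int) ((0x10000L * i) / numRegions);
    splits[i - 1] = new byte[] { (byte) (boundary >>> 8), (byte) boundary };
  }
  return splits;
}
// For example, computeEvenSplits(4) yields {0x40,0x00}, {0x80,0x00}, {0xC0,0x00}.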
/** {@inheritDoc} */
@Override
public void write(HFileKeyValue entry, NullWritable unused) throws IOException {
  final KeyValue kv = entry.getKeyValue();
  kv.updateLatestStamp(mLatestTimestampBytes);

  final long recordLength = kv.getLength();
  if (mCurrentHFileSize + recordLength >= mMaxFileSizeBytes) {
    // We can't fit this record in the current HFile without exceeding the max file size.
    if (Arrays.equals(mCurrentRow, kv.getRow())) {
      // But we're still adding data for a single row, so we can't close this HFile yet.
      LOG.debug("Reached max HFile size, but waiting to finish this row before closing.");
    } else {
      // Close it and open a new one.
      closeWriter(mWriter);
      mWriter = openNewWriter();
    }
  }

  mWriter.append(kv);
  mTimeRangeTracker.includeTimestamp(kv);
  mCurrentHFileSize += recordLength;

  // Remember the row so we know when we are transitioning.
  mCurrentRow = kv.getRow();
}
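// Minimal sketch (illustrative, not from the original class) of the timestamp fix-up applied
// by kv.updateLatestStamp(mLatestTimestampBytes) above: only cells written with
// HConstants.LATEST_TIMESTAMP are rewritten to the job's timestamp; explicitly timestamped
// cells pass through unchanged. Variable names here are assumptions.
final byte[] now = Bytes.toBytes(System.currentTimeMillis());
final KeyValue implicitKv = new KeyValue(
    Bytes.toBytes("row"), Bytes.toBytes("family"), Bytes.toBytes("qualifier"),
    HConstants.LATEST_TIMESTAMP, Bytes.toBytes("value"));
implicitKv.updateLatestStamp(now);  // timestamp becomes 'now'
final KeyValue explicitKv = new KeyValue(
    Bytes.toBytes("row"), Bytes.toBytes("family"), Bytes.toBytes("qualifier"),
    1234567890L, Bytes.toBytes("value"));
explicitKv.updateLatestStamp(now);  // timestamp remains 1234567890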
/**
 * Generates a split for a given table.
 *
 * @param tableURI URI of the Kiji table to split.
 * @param nsplits Number of splits.
 * @param conf Base Hadoop configuration used to open the Kiji instance.
 * @return a list of split start keys, as HFileKeyValue (with no value, just the keys).
 * @throws IOException on I/O error.
 */
private static List<HFileKeyValue> makeTableKeySplit(
    KijiURI tableURI, int nsplits, Configuration conf) throws IOException {
  final Kiji kiji = Kiji.Factory.open(tableURI, conf);
  try {
    final KijiTable table = kiji.openTable(tableURI.getTable());
    try {
      if (NUM_SPLITS_AUTO == nsplits) {
        final List<HFileKeyValue> startKeys = Lists.newArrayList();
        for (KijiRegion region : table.getRegions()) {
          startKeys.add(HFileKeyValue.createFromRowKey(region.getStartKey()));
        }
        return startKeys;

      } else {
        switch (KijiTableLayout.getEncoding(table.getLayout().getDesc().getKeysFormat())) {
          case RAW: {
            // The user has explicitly specified how many HFiles to create, but this is not
            // possible when row key hashing is disabled.
            throw new JobConfigurationException(String.format(
                "Table '%s' has row key hashing disabled, so the number of HFile splits must be "
                + "determined by the number of HRegions in the HTable. "
                + "Use an HFileMapReduceJobOutput constructor that enables auto splitting.",
                table.getName()));
          }
          case FORMATTED:
          case HASH:
          case HASH_PREFIX: {
            // These encodings are supported:
            break;
          }
          default:
            throw new RuntimeException("Unhandled row key encoding: "
                + KijiTableLayout.getEncoding(table.getLayout().getDesc().getKeysFormat()));
        }
        return generateEvenStartKeys(nsplits);
      }
    } finally {
      ResourceUtils.releaseOrLog(table);
    }
  } finally {
    ResourceUtils.releaseOrLog(kiji);
  }
}
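// Illustrative sketch (the URI and path are assumptions) tying the pieces together: compute
// one split per existing region of the table, then persist the splits for
// TotalOrderPartitioner via writePartitionFile().
final KijiURI tableURI = KijiURI.newBuilder("kiji://.env/default/mytable").build();
final List<HFileKeyValue> splits = makeTableKeySplit(tableURI, NUM_SPLITS_AUTO, conf);
writePartitionFile(conf, new Path("/tmp/partitions.seq"), splits);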
/** {@inheritDoc} */
@Override
public void write(HFileKeyValue entry, NullWritable unused) throws IOException {
  final ColumnId lgId = ColumnId.fromByteArray(entry.getFamily());
  getWriter(lgId).write(entry, unused);
}
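// Hypothetical sketch of the per-locality-group dispatch assumed by getWriter(lgId) above:
// one underlying record writer per locality group, created lazily and cached by ColumnId.
// The field name and the openWriterForLocalityGroup() factory are assumptions.
private final Map<ColumnId, RecordWriter<HFileKeyValue, NullWritable>> mWriters =
    new HashMap<ColumnId, RecordWriter<HFileKeyValue, NullWritable>>();

private RecordWriter<HFileKeyValue, NullWritable> getWriter(ColumnId lgId) throws IOException {
  RecordWriter<HFileKeyValue, NullWritable> writer = mWriters.get(lgId);
  if (writer == null) {
    writer = openWriterForLocalityGroup(lgId);  // hypothetical factory method
    mWriters.put(lgId, writer);
  }
  return writer;
}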