/** Creates the {@link Scanner} to use for this query. */ Scanner getScanner() throws HBaseException { final short metric_width = tsdb.metrics.width(); final byte[] start_row = new byte[metric_width + Const.TIMESTAMP_BYTES]; final byte[] end_row = new byte[metric_width + Const.TIMESTAMP_BYTES]; // We search at least one row before and one row after the start & end // time we've been given as it's quite likely that the exact timestamp // we're looking for is in the middle of a row. Plus, a number of things // rely on having a few extra data points before & after the exact start // & end dates in order to do proper rate calculation or downsampling near // the "edges" of the graph. Bytes.setInt(start_row, (int) getScanStartTime(), metric_width); Bytes.setInt( end_row, (end_time == UNSET ? -1 // Will scan until the end (0xFFF...). : (int) getScanEndTime()), metric_width); System.arraycopy(metric, 0, start_row, 0, metric_width); System.arraycopy(metric, 0, end_row, 0, metric_width); final Scanner scanner = tsdb.client.newScanner(tsdb.table); scanner.setStartKey(start_row); scanner.setStopKey(end_row); if (tags.size() > 0 || group_bys != null) { createAndSetFilter(scanner); } scanner.setFamily(TSDB.FAMILY); return scanner; }
/**
 * Reads the 2-byte qualifier of a data point cell.
 *
 * @param kv The cell to read from.
 * @return The qualifier as a {@code short}.
 * @throws AssertionError if the cell is not in {@code TSDB.FAMILY} or if its
 * qualifier is not exactly 2 bytes long.
 */
private short extractQualifier(final KeyValue kv) {
  final boolean expected_family = Bytes.equals(TSDB.FAMILY, kv.family());
  if (!expected_family) {
    throw new AssertionError("unexpected KeyValue family: "
                             + Bytes.pretty(kv.family()));
  }
  final byte[] qualifier_bytes = kv.qualifier();
  if (qualifier_bytes.length == 2) {
    return Bytes.getShort(qualifier_bytes);
  }
  throw new AssertionError("Invalid qualifier length: "
                           + Bytes.pretty(qualifier_bytes));
}
/**
 * Orders two cells by row key, then family, then qualifier, then timestamp
 * (larger timestamps sort first), and finally by value.
 *
 * @param o1 The first cell.
 * @param o2 The second cell.
 * @return A negative, zero or positive number as {@code o1} sorts before,
 * equal to, or after {@code o2}.
 */
@Override
public int compare(KeyValue o1, KeyValue o2) {
  int d = Bytes.memcmp(o1.key(), o2.key());
  if (d != 0) {
    return d;
  }
  d = Bytes.memcmp(o1.family(), o2.family());
  if (d != 0) {
    return d;
  }
  d = Bytes.memcmp(o1.qualifier(), o2.qualifier());
  if (d != 0) {
    return d;
  }
  // Compare the timestamps directly instead of taking the sign of their
  // difference: the subtraction can overflow a long (e.g. when the two
  // timestamps have opposite signs) and silently invert the ordering.
  final long ts1 = o1.timestamp();
  final long ts2 = o2.timestamp();
  if (ts1 != ts2) {
    return ts2 < ts1 ? -1 : 1;
  }
  return Bytes.memcmp(o1.value(), o2.value());
}
/** * Sets the row this instance holds in RAM using a row from a scanner. * * @param row The HBase row to set. * @throws IllegalStateException if this method was already called. */ void setRow(final ArrayList<KeyValue> row) { final byte[] key = row.get(0).key(); final long base_time = Bytes.getUnsignedInt(key, tsdb.metrics.width()); if (this.key == null) { this.key = key; final int npoints = row.size(); values = new long[npoints]; qualifiers = new short[npoints]; } else { throw new IllegalStateException("setRow was already called on " + this); } int index = 0; // position in `values'. for (final KeyValue kv : row) { final short qualifier = extractQualifier(kv); qualifiers[index] = qualifier; values[index] = extractLValue(qualifier, kv); if (index > 0 && timestamp(index - 1) >= timestamp(index)) { throw new AssertionError( "new timestamp = " + timestamp(index) + " is < previous=" + timestamp(index - 1) + " in setRow with kv=" + kv); } index++; } }
int indexOf(final byte[] family, final byte[] qualifier) { KeyValue searchTerm = new KeyValue(getRowKey(), family, qualifier, HBaseClient.EMPTY_ARRAY); int pos = Collections.binarySearch(kvList, searchTerm, KV_COMPARATOR); // never will exact match if (pos < 0) { pos = (pos + 1) * -1; // pos is now insertion point } if (pos == kvList.size()) { return -1; // doesn't exist } KeyValue kv = kvList.get(pos); return (Bytes.equals(family, kv.family()) && Bytes.equals(qualifier, kv.qualifier())) ? pos : -1; }
/** A negative value that needs 8 bytes is stored under qualifier {0, 7}. */
@Test
public void addPointLong8BytesNegative() throws Exception {
  setupAddPointStorage();
  final long expected = -4294967296L;
  final HashMap<String, String> tags = new HashMap<String, String>(1);
  tags.put("host", "web01");

  tsdb.addPoint("sys.cpu.user", 1356998400, expected, tags)
    .joinUninterruptibly();

  final byte[] row_key =
    new byte[] {0, 0, 1, 0x50, (byte) 0xE2, 0x27, 0, 0, 0, 1, 0, 0, 1};
  final byte[] qualifier = new byte[] {0, 7};
  final byte[] stored = storage.getColumn(row_key, qualifier);
  assertNotNull(stored);
  assertEquals(expected, Bytes.getLong(stored));
}
/**
 * Extracts the value of a cell containing a data point.
 *
 * <p>Floating point cells (those with {@code Const.FLAG_FLOAT} set in the
 * qualifier) must have their two low qualifier bits set, be stored on 8
 * bytes and have their 4 leading bytes zeroed; the {@code int} read at
 * offset 4 is returned, widened to a {@code long}. Integer cells must have
 * their three low qualifier bits set and be stored on 8 bytes.
 *
 * @param qualifier The qualifier of that cell, as returned by
 * {@link #extractQualifier}.
 * @param kv The cell.
 * @return The value of the cell, as a {@code long}, since it's expected to
 * be on 8 bytes at most. If the cell contains a floating point value, the
 * bits of the {@code long} represent some kind of a floating point value.
 * @throws AssertionError if the qualifier flags or the length of the value
 * don't match the expectations described above.
 */
private static long extractLValue(final short qualifier, final KeyValue kv) {
  final byte[] value = kv.value();
  if ((qualifier & Const.FLAG_FLOAT) != 0) {
    if ((qualifier & 0x3) != 0x3) {
      throw new AssertionError("Float value qualifier size != 4: " + kv);
    } else if (value.length != 8) {
      throw new AssertionError("Float value not on 8 bytes: " + kv);
    } else if (value[0] != 0 || value[1] != 0
               || value[2] != 0 || value[3] != 0) {
      throw new AssertionError("Float value with nonzero byte MSBs: " + kv);
    }
    // Only the 4 low bytes carry the value.
    return Bytes.getInt(value, 4);
  }
  if ((qualifier & 0x7) != 0x7) {
    // This check is about 8-byte integers (0x7), so the message says 8;
    // it used to say 4, which made the error misleading.
    throw new AssertionError("Integer value qualifier size != 8: " + kv);
  } else if (value.length != 8) {
    throw new AssertionError("Integer value not on 8 bytes: " + kv);
  }
  return Bytes.getLong(value);
}
@Test public void addPointFloatPrecision() throws Exception { setupAddPointStorage(); HashMap<String, String> tags = new HashMap<String, String>(1); tags.put("host", "web01"); tsdb.addPoint("sys.cpu.user", 1356998400, 42.5123459999F, tags).joinUninterruptibly(); final byte[] row = new byte[] {0, 0, 1, 0x50, (byte) 0xE2, 0x27, 0, 0, 0, 1, 0, 0, 1}; final byte[] value = storage.getColumn(row, new byte[] {0, 11}); assertNotNull(value); // should have 7 digits of precision assertEquals(42.512345F, Float.intBitsToFloat(Bytes.getInt(value)), 0.0000001); }
/** * Merges another HBase row into this one. When two continuous rows in HBase have data points that * are close enough together that they could be stored into the same row, it makes sense to merge * them into the same {@link RowSeq} instance in memory in order to save RAM. * * @param row The HBase row to merge into this instance. * @throws IllegalStateException if {@link #setRow} wasn't called first. * @throws IllegalArgumentException if the data points in the argument aren't close enough to * those in this instance time-wise to be all merged together. */ void addRow(final ArrayList<KeyValue> row) { final byte[] key = row.get(0).key(); final long base_time = Bytes.getUnsignedInt(key, tsdb.metrics.width()); // Save the old arrays in case we need to revert what we've done. final short old_qualifiers[] = qualifiers; final long old_values[] = values; int index = values.length; // position in `values'. if (this.key != null) { final int new_length = values.length + row.size(); values = Arrays.copyOf(values, new_length); qualifiers = Arrays.copyOf(qualifiers, new_length); } else { throw new IllegalStateException("setRow was never called on " + this); } final int time_adj = (int) (base_time - baseTime()); if (time_adj <= 0) { throw new AssertionError( "attempt to add a row with a base_time=" + base_time + " <= baseTime()=" + baseTime()); } for (final KeyValue kv : row) { short qualifier = extractQualifier(kv); final int time_delta = (qualifier & 0xFFFF) >>> Const.FLAG_BITS; if (!canTimeDeltaFit(time_delta)) { throw new IllegalArgumentException( "time_delta too large " + time_delta + " to be added to " + this); } qualifier = (short) (((time_delta + time_adj) << Const.FLAG_BITS) | (qualifier & Const.FLAGS_MASK)); qualifiers[index] = qualifier; values[index] = extractLValue(qualifier, kv); if (index > 0 && timestamp(index - 1) >= timestamp(index)) { LOG.error( "new timestamp = " + timestamp(index) + " (index=" + index + ") is < previous=" + timestamp(index - 1) + " in 
addRow with kv=" + kv + " in row=" + row); // Undo what we've done so far. qualifiers = old_qualifiers; values = old_values; return; // Ignore this row, it came out of order. } index++; } }
/** * Finds all the {@link Span}s that match this query. This is what actually scans the HBase table * and loads the data into {@link Span}s. * * @return A map from HBase row key to the {@link Span} for that row key. Since a {@link Span} * actually contains multiple HBase rows, the row key stored in the map has its timestamp * zero'ed out. * @throws HBaseException if there was a problem communicating with HBase to perform the search. * @throws IllegalArgumentException if bad data was retreived from HBase. */ private TreeMap<byte[], Span> findSpans() throws HBaseException { final short metric_width = tsdb.metrics.width(); final TreeMap<byte[], Span> spans = // The key is a row key from HBase. new TreeMap<byte[], Span>(new SpanCmp(metric_width)); int nrows = 0; int hbase_time = 0; // milliseconds. long starttime = System.nanoTime(); final Scanner scanner = getScanner(); try { ArrayList<ArrayList<KeyValue>> rows; while ((rows = scanner.nextRows().joinUninterruptibly()) != null) { hbase_time += (System.nanoTime() - starttime) / 1000000; for (final ArrayList<KeyValue> row : rows) { final byte[] key = row.get(0).key(); if (Bytes.memcmp(metric, key, 0, metric_width) != 0) { throw new IllegalDataException( "HBase returned a row that doesn't match" + " our scanner (" + scanner + ")! " + row + " does not start" + " with " + Arrays.toString(metric)); } Span datapoints = spans.get(key); if (datapoints == null) { datapoints = new Span(tsdb); spans.put(key, datapoints); } datapoints.addRow(tsdb.compact(row)); nrows++; starttime = System.nanoTime(); } } } catch (RuntimeException e) { throw e; } catch (Exception e) { throw new RuntimeException("Should never be here", e); } finally { hbase_time += (System.nanoTime() - starttime) / 1000000; scanlatency.add(hbase_time); } LOG.info(this + " matched " + nrows + " rows in " + spans.size() + " spans"); if (nrows == 0) { return null; } return spans; }
/**
 * Stores a counter initialized to 0 for the timeseries when it doesn't
 * exist yet and creation was requested.
 *
 * @param exists Whether the entry was found.
 * @return The {@code exists} argument, unchanged.
 */
@Override
public Boolean call(Boolean exists) throws Exception {
  if (exists || !create) {
    return exists;  // Already there, or we weren't asked to create it.
  }
  final PutRequest put =
    new PutRequest(tsdb.metaTable(),
                   UniqueId.stringToUid(tsuid),
                   TSMeta.FAMILY(),
                   TSMeta.COUNTER_QUALIFIER(),
                   Bytes.fromLong(0));
  // The result of the put isn't waited on.
  tsdb.getClient().put(put);
  return exists;
}
/**
 * Tells whether a tag should be used before a group-by, based on the order
 * of their tag name IDs.
 *
 * @param name_width Number of bytes used by a tag name ID.
 * @param tag A tag (array containing a tag name ID and a tag value ID), or
 * {@code null}.
 * @param group_by A tag name ID, or {@code null}.
 * @return {@code false} if {@code tag} is {@code null}; otherwise
 * {@code true} if {@code group_by} is {@code null} or if {@code tag} starts
 * with the smaller ID; {@code false} in the remaining case.
 * @throws AssertionError if both start with the same tag name ID.
 */
private boolean isTagNext(final short name_width,
                          final byte[] tag,
                          final byte[] group_by) {
  if (tag == null) {
    return false;
  }
  if (group_by == null) {
    return true;
  }
  final int order = Bytes.memcmp(tag, group_by, 0, name_width);
  if (order != 0) {
    return order < 0;
  }
  throw new AssertionError("invariant violation: tag ID "
      + Arrays.toString(group_by) + " is both in 'tags' and"
      + " 'group_bys' in " + this);
}
@Test public void addPointBothSameTimeIntAndFloatMs() throws Exception { // this is an odd situation that can occur if the user puts an int and then // a float (or vice-versa) with the same timestamp. What happens in the // aggregators when this occurs? setupAddPointStorage(); HashMap<String, String> tags = new HashMap<String, String>(1); tags.put("host", "web01"); tsdb.addPoint("sys.cpu.user", 1356998400500L, 42, tags).joinUninterruptibly(); tsdb.addPoint("sys.cpu.user", 1356998400500L, 42.5F, tags).joinUninterruptibly(); final byte[] row = new byte[] {0, 0, 1, 0x50, (byte) 0xE2, 0x27, 0, 0, 0, 1, 0, 0, 1}; byte[] value = storage.getColumn(row, new byte[] {(byte) 0xF0, 0, 0x7D, 0}); assertEquals(2, storage.numColumns(row)); assertNotNull(value); assertEquals(42, value[0]); value = storage.getColumn(row, new byte[] {(byte) 0xF0, 0, 0x7D, 11}); assertNotNull(value); // should have 7 digits of precision assertEquals(42.5F, Float.intBitsToFloat(Bytes.getInt(value)), 0.0000001); }
/**
 * Deletes from the tree row every column whose qualifier starts with
 * {@code RULE_PREFIX} and is longer than that prefix.
 *
 * @param row The columns fetched for the tree; may be {@code null} or
 * empty.
 * @return The deferred result of the delete, or an already completed
 * deferred carrying {@code null} when there is nothing to delete.
 */
@Override
public Deferred<Object> call(final ArrayList<KeyValue> row) throws Exception {
  if (row == null || row.isEmpty()) {
    return Deferred.fromResult(null);
  }

  final ArrayList<byte[]> qualifiers = new ArrayList<byte[]>(row.size());
  for (final KeyValue column : row) {
    final byte[] qualifier = column.qualifier();
    if (qualifier.length > RULE_PREFIX.length
        && Bytes.memcmp(RULE_PREFIX, qualifier, 0, RULE_PREFIX.length) == 0) {
      qualifiers.add(qualifier);
    }
  }

  if (qualifiers.isEmpty()) {
    // No matching column: treat it like an empty row instead of sending a
    // delete with an empty qualifier list. NOTE(review): what such a
    // request would delete can't be determined from this file.
    return Deferred.fromResult(null);
  }

  final DeleteRequest delete =
    new DeleteRequest(tsdb.treeTable(),
                      Tree.idToBytes(tree_id),
                      Tree.TREE_FAMILY(),
                      qualifiers.toArray(new byte[qualifiers.size()][]));
  return tsdb.getClient().delete(delete);
}
/**
 * Reads the base timestamp stored in the row key, right after the metric
 * ID.
 *
 * @return The unsigned 32-bit value found at that offset of {@code key}.
 */
long baseTime() {
  final short timestamp_offset = tsdb.metrics.width();
  return Bytes.getUnsignedInt(key, timestamp_offset);
}
/**
 * Builds a {@link TSDB} on top of a mocked configuration and HBase client,
 * then loads the mock storage with one metric, one tag name, one tag value
 * and one timeseries, each with its meta data.
 */
@Before
public void before() throws Exception {
  // Mocked configuration.
  config = mock(Config.class);
  when(config.getString("tsd.storage.hbase.data_table")).thenReturn("tsdb");
  when(config.getString("tsd.storage.hbase.uid_table")).thenReturn("tsdb-uid");
  when(config.getString("tsd.storage.hbase.meta_table")).thenReturn("tsdb-meta");
  when(config.getString("tsd.storage.hbase.tree_table")).thenReturn("tsdb-tree");
  when(config.enable_tsuid_incrementing()).thenReturn(true);
  when(config.enable_realtime_ts()).thenReturn(true);

  // Any HBaseClient the TSDB creates is replaced by our client.
  PowerMockito.whenNew(HBaseClient.class)
    .withArguments(anyString(), anyString())
    .thenReturn(client);
  tsdb = new TSDB(config);
  storage = new MockBase(tsdb, client, true, true, true, true, true);

  // Metric 000001.
  storage.addColumn(
    new byte[] {0, 0, 1},
    NAME_FAMILY,
    "metrics".getBytes(MockBase.ASCII()),
    "sys.cpu.0".getBytes(MockBase.ASCII()));

  final String metric_meta_json =
    "{\"uid\":\"000001\",\"type\":\"METRIC\",\"name\":\"sys.cpu.0\","
    + "\"description\":\"Description\",\"notes\":\"MyNotes\",\"created\":"
    + "1328140801,\"displayName\":\"System CPU\"}";
  storage.addColumn(
    new byte[] {0, 0, 1},
    NAME_FAMILY,
    "metric_meta".getBytes(MockBase.ASCII()),
    metric_meta_json.getBytes(MockBase.ASCII()));

  // Tag name 000001.
  storage.addColumn(
    new byte[] {0, 0, 1},
    NAME_FAMILY,
    "tagk".getBytes(MockBase.ASCII()),
    "host".getBytes(MockBase.ASCII()));

  final String tagk_meta_json =
    "{\"uid\":\"000001\",\"type\":\"TAGK\",\"name\":\"host\","
    + "\"description\":\"Description\",\"notes\":\"MyNotes\",\"created\":"
    + "1328140801,\"displayName\":\"Host server name\"}";
  storage.addColumn(
    new byte[] {0, 0, 1},
    NAME_FAMILY,
    "tagk_meta".getBytes(MockBase.ASCII()),
    tagk_meta_json.getBytes(MockBase.ASCII()));

  // Tag value 000001.
  storage.addColumn(
    new byte[] {0, 0, 1},
    NAME_FAMILY,
    "tagv".getBytes(MockBase.ASCII()),
    "web01".getBytes(MockBase.ASCII()));

  final String tagv_meta_json =
    "{\"uid\":\"000001\",\"type\":\"TAGV\",\"name\":\"web01\","
    + "\"description\":\"Description\",\"notes\":\"MyNotes\",\"created\":"
    + "1328140801,\"displayName\":\"Web server 1\"}";
  storage.addColumn(
    new byte[] {0, 0, 1},
    NAME_FAMILY,
    "tagv_meta".getBytes(MockBase.ASCII()),
    tagv_meta_json.getBytes(MockBase.ASCII()));

  // Timeseries 000001000001000001: meta data, then counter.
  final String ts_meta_json =
    "{\"tsuid\":\"000001000001000001\",\""
    + "description\":\"Description\",\"notes\":\"Notes\",\"created\":1328140800,"
    + "\"custom\":null,\"units\":\"\",\"retention\":42,\"max\":1.0,\"min\":"
    + "\"NaN\",\"displayName\":\"Display\",\"dataType\":\"Data\"}";
  storage.addColumn(
    new byte[] {0, 0, 1, 0, 0, 1, 0, 0, 1},
    NAME_FAMILY,
    "ts_meta".getBytes(MockBase.ASCII()),
    ts_meta_json.getBytes(MockBase.ASCII()));

  storage.addColumn(
    new byte[] {0, 0, 1, 0, 0, 1, 0, 0, 1},
    NAME_FAMILY,
    "ts_ctr".getBytes(MockBase.ASCII()),
    Bytes.fromLong(1L));
}
private static int fsck( final TSDB tsdb, final HBaseClient client, final byte[] table, final boolean fix, final String[] args) throws Exception { /** Callback to asynchronously delete a specific {@link KeyValue}. */ final class DeleteOutOfOrder implements Callback<Deferred<Object>, Object> { private final KeyValue kv; public DeleteOutOfOrder(final KeyValue kv) { this.kv = kv; } public Deferred<Object> call(final Object arg) { return client.delete(new DeleteRequest(table, kv.key(), kv.family(), kv.qualifier())); } public String toString() { return "delete out-of-order data"; } } int errors = 0; int correctable = 0; final short metric_width = width(tsdb, "metrics"); final short name_width = width(tsdb, "tag_names"); final short value_width = width(tsdb, "tag_values"); final ArrayList<Query> queries = new ArrayList<Query>(); CliQuery.parseCommandLineQuery(args, tsdb, queries, null, null); final StringBuilder buf = new StringBuilder(); for (final Query query : queries) { final long start_time = System.nanoTime(); long ping_start_time = start_time; LOG.info("Starting to fsck data covered by " + query); long kvcount = 0; long rowcount = 0; final Bytes.ByteMap<Seen> seen = new Bytes.ByteMap<Seen>(); final Scanner scanner = Core.getScanner(query); ArrayList<ArrayList<KeyValue>> rows; while ((rows = scanner.nextRows().joinUninterruptibly()) != null) { for (final ArrayList<KeyValue> row : rows) { rowcount++; // Take a copy of the row-key because we're going to zero-out the // timestamp and use that as a key in our `seen' map. 
final byte[] key = row.get(0).key().clone(); final long base_time = Bytes.getUnsignedInt(key, metric_width); for (int i = metric_width; i < metric_width + Const.TIMESTAMP_BYTES; i++) { key[i] = 0; } Seen prev = seen.get(key); if (prev == null) { prev = new Seen(base_time - 1, row.get(0)); seen.put(key, prev); } for (final KeyValue kv : row) { kvcount++; if (kvcount % 100000 == 0) { final long now = System.nanoTime(); ping_start_time = (now - ping_start_time) / 1000000; LOG.info( "... " + kvcount + " KV analyzed in " + ping_start_time + "ms (" + (100000 * 1000 / ping_start_time) + " KVs/s)"); ping_start_time = now; } if (kv.qualifier().length != 2) { LOG.warn( "Ignoring unsupported KV with a qualifier of " + kv.qualifier().length + " bytes:" + kv); continue; } final short qualifier = Bytes.getShort(kv.qualifier()); final short delta = (short) ((qualifier & 0xFFFF) >>> FLAG_BITS); final long timestamp = base_time + delta; byte[] value = kv.value(); if (value.length > 8) { errors++; LOG.error("Value more than 8 byte long with a 2-byte" + " qualifier.\n\t" + kv); } // TODO(tsuna): Don't hardcode 0x8 / 0x3 here. if ((qualifier & (0x8 | 0x3)) == (0x8 | 0x3)) { // float | 4 bytes // The qualifier says the value is on 4 bytes, and the value is // on 8 bytes, then the 4 MSBs must be 0s. Old versions of the // code were doing this. It's kinda sad. Some versions had a // bug whereby the value would be sign-extended, so we can // detect these values and fix them here. if (value.length == 8) { if (value[0] == -1 && value[1] == -1 && value[2] == -1 && value[3] == -1) { errors++; correctable++; if (fix) { value = value.clone(); // We're going to change it. 
value[0] = value[1] = value[2] = value[3] = 0; client.put(new PutRequest(table, kv.key(), kv.family(), kv.qualifier(), value)); } else { LOG.error( "Floating point value with 0xFF most significant" + " bytes, probably caused by sign extension bug" + " present in revisions [96908436..607256fc].\n" + "\t" + kv); } } else if (value[0] != 0 || value[1] != 0 || value[2] != 0 || value[3] != 0) { errors++; } } else if (value.length != 4) { errors++; LOG.error( "This floating point value must be encoded either on" + " 4 or 8 bytes, but it's on " + value.length + " bytes.\n\t" + kv); } } if (timestamp <= prev.timestamp()) { errors++; correctable++; if (fix) { final byte[] newkey = kv.key().clone(); // Fix the timestamp in the row key. final long new_base_time = (timestamp - (timestamp % Const.MAX_TIMESPAN)); Bytes.setInt(newkey, (int) new_base_time, metric_width); final short newqual = (short) ((timestamp - new_base_time) << FLAG_BITS | (qualifier & FLAGS_MASK)); final DeleteOutOfOrder delooo = new DeleteOutOfOrder(kv); if (timestamp < prev.timestamp()) { client .put( new PutRequest( table, newkey, kv.family(), Bytes.fromShort(newqual), value)) // Only delete the offending KV once we're sure that the new // KV has been persisted in HBase. .addCallbackDeferring(delooo); } else { // We have two data points at exactly the same timestamp. // This can happen when only the flags differ. This is // typically caused by one data point being an integer and // the other being a floating point value. In this case // we just delete the duplicate data point and keep the // first one we saw. delooo.call(null); } } else { buf.setLength(0); buf.append( timestamp < prev.timestamp() ? 
"Out of order data.\n\t" : "Duplicate data point with different flags.\n\t") .append(timestamp) .append(" (") .append(DumpSeries.date(timestamp)) .append(") @ ") .append(kv) .append("\n\t"); DumpSeries.formatKeyValue(buf, tsdb, kv, base_time); buf.append("\n\t was found after\n\t") .append(prev.timestamp) .append(" (") .append(DumpSeries.date(prev.timestamp)) .append(") @ ") .append(prev.kv) .append("\n\t"); DumpSeries.formatKeyValue( buf, tsdb, prev.kv, Bytes.getUnsignedInt(prev.kv.key(), metric_width)); LOG.error(buf.toString()); } } else { prev.setTimestamp(timestamp); prev.kv = kv; } } } } final long timing = (System.nanoTime() - start_time) / 1000000; System.out.println( kvcount + " KVs (in " + rowcount + " rows) analyzed in " + timing + "ms (~" + (kvcount * 1000 / timing) + " KV/s)"); } System.out.println(errors != 0 ? "Found " + errors + " errors." : "No error found."); if (!fix && correctable > 0) { System.out.println( correctable + " of these errors are automatically" + " correctable, re-run with --fix.\n" + "Make sure you understand the errors above and you" + " know what you're doing before using --fix."); } return errors; }