  /** Creates the {@link Scanner} to use for this query. */
  Scanner getScanner() throws HBaseException {
    final short metric_width = tsdb.metrics.width();
    final byte[] start_row = new byte[metric_width + Const.TIMESTAMP_BYTES];
    final byte[] end_row = new byte[metric_width + Const.TIMESTAMP_BYTES];
    // We search at least one row before and one row after the start & end
    // time we've been given as it's quite likely that the exact timestamp
    // we're looking for is in the middle of a row.  Plus, a number of things
    // rely on having a few extra data points before & after the exact start
    // & end dates in order to do proper rate calculation or downsampling near
    // the "edges" of the graph.
    Bytes.setInt(start_row, (int) getScanStartTime(), metric_width);
    Bytes.setInt(end_row,
                 (end_time == UNSET
                  ? -1  // Will scan until the end (0xFFF...).
                  : (int) getScanEndTime()),
                 metric_width);
    System.arraycopy(metric, 0, start_row, 0, metric_width);
    System.arraycopy(metric, 0, end_row, 0, metric_width);

    final Scanner scanner = tsdb.client.newScanner(tsdb.table);
    scanner.setStartKey(start_row);
    scanner.setStopKey(end_row);
    if (tags.size() > 0 || group_bys != null) {
      createAndSetFilter(scanner);
    }
    scanner.setFamily(TSDB.FAMILY);
    return scanner;
  }
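  // Illustration (not part of the original class): a minimal sketch of the
  // row key layout getScanner() relies on -- [metric ID][base timestamp].
  // The 3-byte metric width and the helper name are assumptions for the
  // example; the real code uses tsdb.metrics.width() and Bytes.setInt(),
  // which writes the int in big-endian order just like the loop below.
  static byte[] exampleStartKey(final byte[] metric_id, final int start_time) {
    final int metric_width = 3;     // Hypothetical UID width.
    final int timestamp_bytes = 4;  // Matches Const.TIMESTAMP_BYTES.
    final byte[] key = new byte[metric_width + timestamp_bytes];
    System.arraycopy(metric_id, 0, key, 0, metric_width);  // Metric ID first.
    // Big-endian base timestamp right after the metric ID.
    key[metric_width]     = (byte) (start_time >>> 24);
    key[metric_width + 1] = (byte) (start_time >>> 16);
    key[metric_width + 2] = (byte) (start_time >>> 8);
    key[metric_width + 3] = (byte) start_time;
    return key;
  }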
  /**
   * Finds all the {@link Span}s that match this query.
   * This is what actually scans the HBase table and loads the data into
   * {@link Span}s.
   * @return A map from HBase row key to the {@link Span} for that row key.
   * Since a {@link Span} actually contains multiple HBase rows, the row key
   * stored in the map has its timestamp zeroed out.
   * @throws HBaseException if there was a problem communicating with HBase to
   * perform the search.
   * @throws IllegalArgumentException if bad data was retrieved from HBase.
   */
  private TreeMap<byte[], Span> findSpans() throws HBaseException {
    final short metric_width = tsdb.metrics.width();
    final TreeMap<byte[], Span> spans =  // The key is a row key from HBase.
      new TreeMap<byte[], Span>(new SpanCmp(metric_width));
    int nrows = 0;
    int hbase_time = 0;  // milliseconds.
    long starttime = System.nanoTime();
    final Scanner scanner = getScanner();
    try {
      ArrayList<ArrayList<KeyValue>> rows;
      while ((rows = scanner.nextRows().joinUninterruptibly()) != null) {
        hbase_time += (System.nanoTime() - starttime) / 1000000;
        for (final ArrayList<KeyValue> row : rows) {
          final byte[] key = row.get(0).key();
          if (Bytes.memcmp(metric, key, 0, metric_width) != 0) {
            throw new IllegalDataException("HBase returned a row that doesn't match"
                + " our scanner (" + scanner + ")! " + row + " does not start"
                + " with " + Arrays.toString(metric));
          }
          Span datapoints = spans.get(key);
          if (datapoints == null) {
            datapoints = new Span(tsdb);
            spans.put(key, datapoints);
          }
          datapoints.addRow(tsdb.compact(row));
          nrows++;
          starttime = System.nanoTime();
        }
      }
    } catch (RuntimeException e) {
      throw e;
    } catch (Exception e) {
      throw new RuntimeException("Should never be here", e);
    } finally {
      hbase_time += (System.nanoTime() - starttime) / 1000000;
      scanlatency.add(hbase_time);
    }
    LOG.info(this + " matched " + nrows + " rows in " + spans.size() + " spans");
    if (nrows == 0) {
      return null;
    }
    return spans;
  }
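  // Illustration (not part of the original class): findSpans() groups rows
  // into Spans via a comparator (SpanCmp) that treats row keys of the same
  // time series as equal by ignoring the timestamp bytes.  A minimal sketch
  // of that idea, assuming a 4-byte timestamp right after the metric ID;
  // the method name and layout constants are assumptions for the example.
  static int compareIgnoringTimestamp(final byte[] a, final byte[] b,
                                      final short metric_width) {
    final int timestamp_bytes = 4;  // Matches Const.TIMESTAMP_BYTES.
    for (int i = 0; i < a.length && i < b.length; i++) {
      if (i >= metric_width && i < metric_width + timestamp_bytes) {
        continue;  // Skip the timestamp: same series, different base times.
      }
      final int diff = (a[i] & 0xFF) - (b[i] & 0xFF);  // Unsigned byte compare.
      if (diff != 0) {
        return diff;
      }
    }
    return a.length - b.length;
  }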
  /**
   * Sets the server-side regexp filter on the scanner.
   * In order to find the rows with the relevant tags, we use a server-side
   * filter that matches a regular expression on the row key.
   * @param scanner The scanner on which to add the filter.
   */
  void createAndSetFilter(final Scanner scanner) {
    if (group_bys != null) {
      Collections.sort(group_bys, Bytes.MEMCMP);
    }
    final short name_width = tsdb.tag_names.width();
    final short value_width = tsdb.tag_values.width();
    final short tagsize = (short) (name_width + value_width);
    // Generate a regexp for our tags.  Say we have 2 tags: { 0 0 1 0 0 2 }
    // and { 4 5 6 9 8 7 }, the regexp will be:
    // "^.{7}(?:.{6})*\\Q\000\000\001\000\000\002\\E(?:.{6})*\\Q\004\005\006\011\010\007\\E(?:.{6})*$"
    final StringBuilder buf = new StringBuilder(
        15  // "^.{N}" + "(?:.{M})*" + "$"
        + ((13 + tagsize)  // "(?:.{M})*\\Q" + tagsize bytes + "\\E"
           * (tags.size() + (group_bys == null ? 0 : group_bys.size() * 3))));
    // In order to avoid re-allocations, reserve a bit more w/ groups ^^^

    // Alright, let's build this regexp.  From the beginning...
    buf.append("(?s)"  // Ensure we use the DOTALL flag.
               + "^.{")
       // ... start by skipping the metric ID and timestamp.
       .append(tsdb.metrics.width() + Const.TIMESTAMP_BYTES)
       .append("}");
    final Iterator<byte[]> tags = this.tags.iterator();
    final Iterator<byte[]> group_bys = (this.group_bys == null
                                        ? new ArrayList<byte[]>(0).iterator()
                                        : this.group_bys.iterator());
    byte[] tag = tags.hasNext() ? tags.next() : null;
    byte[] group_by = group_bys.hasNext() ? group_bys.next() : null;
    // Tags and group_bys are already sorted.  We need to put them in the
    // regexp in order by ID, which means we just merge two sorted lists.
    do {
      // Skip any number of tags.
      buf.append("(?:.{").append(tagsize).append("})*\\Q");
      if (isTagNext(name_width, tag, group_by)) {
        addId(buf, tag);
        tag = tags.hasNext() ? tags.next() : null;
      } else {  // Add a group_by.
        addId(buf, group_by);
        final byte[][] value_ids = (group_by_values == null
                                    ? null
                                    : group_by_values.get(group_by));
        if (value_ids == null) {  // We don't want any specific ID...
          buf.append(".{").append(value_width).append('}');  // Any value ID.
        } else {  // We want specific IDs.  List them: /(AAA|BBB|CCC|..)/
          buf.append("(?:");
          for (final byte[] value_id : value_ids) {
            buf.append("\\Q");
            addId(buf, value_id);
            buf.append('|');
          }
          // Replace the pipe of the last iteration.
          buf.setCharAt(buf.length() - 1, ')');
        }
        group_by = group_bys.hasNext() ? group_bys.next() : null;
      }
    } while (tag != group_by);  // Stop when they both become null.
    // Skip any number of tags before the end.
    buf.append("(?:.{").append(tagsize).append("})*$");
    scanner.setKeyRegexp(buf.toString(), CHARSET);
  }
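  // Illustration (not part of the original class): the kind of pattern built
  // above can be checked against a row key decoded with ISO-8859-1, which
  // maps each byte to exactly one char (this sketch assumes CHARSET is
  // ISO-8859-1).  The key below is hypothetical: a 3-byte metric ID, a
  // 4-byte timestamp, and one 6-byte tag pair mirroring the sample in the
  // comment above ({ 0 0 1 } name ID, { 0 0 2 } value ID).
  static boolean exampleKeyMatch() {
    final byte[] key = {1, 2, 3,  0, 0, 0, 0,  0, 0, 1, 0, 0, 2};
    final String regexp =
        "(?s)^.{7}(?:.{6})*\\Q\000\000\001\000\000\002\\E(?:.{6})*$";
    final String row =
        new String(key, java.nio.charset.StandardCharsets.ISO_8859_1);
    return row.matches(regexp);  // true: the tag pair is present in the key.
  }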
  private static int fsck(final TSDB tsdb,
                          final HBaseClient client,
                          final byte[] table,
                          final boolean fix,
                          final String[] args) throws Exception {

    /** Callback to asynchronously delete a specific {@link KeyValue}. */
    final class DeleteOutOfOrder implements Callback<Deferred<Object>, Object> {
      private final KeyValue kv;

      public DeleteOutOfOrder(final KeyValue kv) {
        this.kv = kv;
      }

      public Deferred<Object> call(final Object arg) {
        return client.delete(new DeleteRequest(table, kv.key(),
                                               kv.family(), kv.qualifier()));
      }

      public String toString() {
        return "delete out-of-order data";
      }
    }

    int errors = 0;
    int correctable = 0;

    final short metric_width = width(tsdb, "metrics");
    final short name_width = width(tsdb, "tag_names");
    final short value_width = width(tsdb, "tag_values");

    final ArrayList<Query> queries = new ArrayList<Query>();
    CliQuery.parseCommandLineQuery(args, tsdb, queries, null, null);

    final StringBuilder buf = new StringBuilder();
    for (final Query query : queries) {
      final long start_time = System.nanoTime();
      long ping_start_time = start_time;
      LOG.info("Starting to fsck data covered by " + query);
      long kvcount = 0;
      long rowcount = 0;
      final Bytes.ByteMap<Seen> seen = new Bytes.ByteMap<Seen>();
      final Scanner scanner = Core.getScanner(query);
      ArrayList<ArrayList<KeyValue>> rows;
      while ((rows = scanner.nextRows().joinUninterruptibly()) != null) {
        for (final ArrayList<KeyValue> row : rows) {
          rowcount++;
          // Take a copy of the row-key because we're going to zero-out the
          // timestamp and use that as a key in our `seen' map.
          final byte[] key = row.get(0).key().clone();
          final long base_time = Bytes.getUnsignedInt(key, metric_width);
          for (int i = metric_width; i < metric_width + Const.TIMESTAMP_BYTES; i++) {
            key[i] = 0;
          }
          Seen prev = seen.get(key);
          if (prev == null) {
            prev = new Seen(base_time - 1, row.get(0));
            seen.put(key, prev);
          }
          for (final KeyValue kv : row) {
            kvcount++;
            if (kvcount % 100000 == 0) {
              final long now = System.nanoTime();
              ping_start_time = (now - ping_start_time) / 1000000;
              LOG.info("... " + kvcount + " KV analyzed in " + ping_start_time
                       + "ms (" + (100000 * 1000 / ping_start_time) + " KVs/s)");
              ping_start_time = now;
            }
            if (kv.qualifier().length != 2) {
              LOG.warn("Ignoring unsupported KV with a qualifier of "
                       + kv.qualifier().length + " bytes:" + kv);
              continue;
            }
            final short qualifier = Bytes.getShort(kv.qualifier());
            final short delta = (short) ((qualifier & 0xFFFF) >>> FLAG_BITS);
            final long timestamp = base_time + delta;
            byte[] value = kv.value();
            if (value.length > 8) {
              errors++;
              LOG.error("Value more than 8 bytes long with a 2-byte"
                        + " qualifier.\n\t" + kv);
            }
            // TODO(tsuna): Don't hardcode 0x8 / 0x3 here.
            if ((qualifier & (0x8 | 0x3)) == (0x8 | 0x3)) {  // float | 4 bytes
              // If the qualifier says the value is on 4 bytes but the value
              // is on 8 bytes, then the 4 MSBs must be 0s.  Old versions of
              // the code were doing this.  It's kinda sad.  Some versions had
              // a bug whereby the value would be sign-extended, so we can
              // detect these values and fix them here.
              if (value.length == 8) {
                if (value[0] == -1 && value[1] == -1
                    && value[2] == -1 && value[3] == -1) {
                  errors++;
                  correctable++;
                  if (fix) {
                    value = value.clone();  // We're going to change it.
                    value[0] = value[1] = value[2] = value[3] = 0;
                    client.put(new PutRequest(table, kv.key(), kv.family(),
                                              kv.qualifier(), value));
                  } else {
                    LOG.error("Floating point value with 0xFF most significant"
                              + " bytes, probably caused by sign extension bug"
                              + " present in revisions [96908436..607256fc].\n"
                              + "\t" + kv);
                  }
                } else if (value[0] != 0 || value[1] != 0
                           || value[2] != 0 || value[3] != 0) {
                  errors++;
                }
              } else if (value.length != 4) {
                errors++;
                LOG.error("This floating point value must be encoded either on"
                          + " 4 or 8 bytes, but it's on " + value.length
                          + " bytes.\n\t" + kv);
              }
            }
            if (timestamp <= prev.timestamp()) {
              errors++;
              correctable++;
              if (fix) {
                final byte[] newkey = kv.key().clone();
                // Fix the timestamp in the row key.
                final long new_base_time =
                    (timestamp - (timestamp % Const.MAX_TIMESPAN));
                Bytes.setInt(newkey, (int) new_base_time, metric_width);
                final short newqual =
                    (short) ((timestamp - new_base_time) << FLAG_BITS
                             | (qualifier & FLAGS_MASK));
                final DeleteOutOfOrder delooo = new DeleteOutOfOrder(kv);
                if (timestamp < prev.timestamp()) {
                  client.put(new PutRequest(table, newkey, kv.family(),
                                            Bytes.fromShort(newqual), value))
                    // Only delete the offending KV once we're sure that the
                    // new KV has been persisted in HBase.
                    .addCallbackDeferring(delooo);
                } else {
                  // We have two data points at exactly the same timestamp.
                  // This can happen when only the flags differ.  This is
                  // typically caused by one data point being an integer and
                  // the other being a floating point value.  In this case
                  // we just delete the duplicate data point and keep the
                  // first one we saw.
                  delooo.call(null);
                }
              } else {
                buf.setLength(0);
                buf.append(timestamp < prev.timestamp()
                           ? "Out of order data.\n\t"
                           : "Duplicate data point with different flags.\n\t")
                  .append(timestamp)
                  .append(" (").append(DumpSeries.date(timestamp))
                  .append(") @ ").append(kv).append("\n\t");
                DumpSeries.formatKeyValue(buf, tsdb, kv, base_time);
                buf.append("\n\t was found after\n\t").append(prev.timestamp)
                  .append(" (").append(DumpSeries.date(prev.timestamp))
                  .append(") @ ").append(prev.kv).append("\n\t");
                DumpSeries.formatKeyValue(buf, tsdb, prev.kv,
                                          Bytes.getUnsignedInt(prev.kv.key(),
                                                               metric_width));
                LOG.error(buf.toString());
              }
            } else {
              prev.setTimestamp(timestamp);
              prev.kv = kv;
            }
          }
        }
      }
      final long timing = (System.nanoTime() - start_time) / 1000000;
      System.out.println(kvcount + " KVs (in " + rowcount
                         + " rows) analyzed in " + timing
                         + "ms (~" + (kvcount * 1000 / timing) + " KV/s)");
    }
    System.out.println(errors != 0 ? "Found " + errors + " errors."
                       : "No errors found.");
    if (!fix && correctable > 0) {
      System.out.println(correctable + " of these errors are automatically"
                         + " correctable, re-run with --fix.\n"
                         + "Make sure you understand the errors above and you"
                         + " know what you're doing before using --fix.");
    }
    return errors;
  }
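  // Illustration (not part of the original class): a worked example of the
  // qualifier arithmetic fsck uses above.  The constants are assumptions
  // hardcoded for the example -- FLAG_BITS == 4 (low 4 bits are flags),
  // FLAGS_MASK == 0xF, and Const.MAX_TIMESPAN == 3600 (one row per hour).
  static void exampleRebase() {
    final int FLAG_BITS = 4;        // Assumed: low 4 bits hold the flags.
    final short FLAGS_MASK = 0xF;   // Assumed flags mask.
    final int MAX_TIMESPAN = 3600;  // Assumed row span in seconds.
    final long base_time = 3600000000L;  // Row's base timestamp (hour-aligned).
    // A qualifier whose delta (4000s) overflows the row's one-hour span:
    final short qualifier = (short) ((4000 << FLAG_BITS) | 0x3);
    final int delta = (qualifier & 0xFFFF) >>> FLAG_BITS;  // 4000
    final long timestamp = base_time + delta;              // 3600004000
    // Re-base exactly as the fix path does above:
    final long new_base_time = timestamp - (timestamp % MAX_TIMESPAN);
    final short newqual = (short) ((timestamp - new_base_time) << FLAG_BITS
                                   | (qualifier & FLAGS_MASK));
    // The point belongs in the next hourly row, with a delta of 400s:
    assert new_base_time == base_time + 3600;
    assert (newqual >>> FLAG_BITS) == 400;
  }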