/** * Delete all CrawlURIs matching the given expression. * * @param match * @param queue * @param headKey * @return count of deleted items * @throws DatabaseException * @throws DatabaseException */ public long deleteMatchingFromQueue(String match, String queue, DatabaseEntry headKey) throws DatabaseException { long deletedCount = 0; Pattern pattern = Pattern.compile(match); DatabaseEntry key = headKey; DatabaseEntry value = new DatabaseEntry(); Cursor cursor = null; try { cursor = pendingUrisDB.openCursor(null, null); OperationStatus result = cursor.getSearchKeyRange(headKey, value, null); while (result == OperationStatus.SUCCESS) { if (value.getData().length > 0) { CrawlURI curi = (CrawlURI) crawlUriBinding.entryToObject(value); if (!curi.getClassKey().equals(queue)) { // rolled into next queue; finished with this queue break; } if (pattern.matcher(curi.toString()).matches()) { cursor.delete(); deletedCount++; } } result = cursor.getNext(key, value, null); } } finally { if (cursor != null) { cursor.close(); } } return deletedCount; }
/**
 * Builds the database key that determines where a CrawlURI sorts among the pending URIs.
 *
 * <p>Key layout, in order:
 *
 * <ol>
 *   <li>the UTF-8 bytes of the CrawlURI's class key (usually host based), so that all URIs of one
 *       queue are grouped together;
 *   <li>a single zero byte terminating the class key;
 *   <li>eight bytes produced from one {@code long} whose top byte is the scheduling directive
 *       (letting 'immediate' and 'soon' items sort above regular ones), whose next byte is the
 *       precedence (capped at 127), and whose low 48 bits are the ordinal serial number, so that
 *       earlier-discovered URIs sort before later ones.
 * </ol>
 *
 * <p>NOTE: Dangers here are: (1) priorities or precedences over 2^7 (signed byte comparison) and
 * (2) ordinals over 2^48, which are truncated to their low 48 bits.
 *
 * <p>Package access and static for testing purposes.
 *
 * @param curi the CrawlURI to compute a key for
 * @return a DatabaseEntry key for the CrawlURI
 */
static DatabaseEntry calculateInsertKey(CrawlURI curi) {
  final byte[] queuePrefix = curi.getClassKey().getBytes(Charsets.UTF_8);
  final int prefixLength = queuePrefix.length;

  // Room for the prefix, its zero terminator, and the 8-byte ordering field.
  final byte[] keyBytes = new byte[prefixLength + 9];
  System.arraycopy(queuePrefix, 0, keyBytes, 0, prefixLength);
  keyBytes[prefixLength] = 0;

  // Getters are invoked in the order ordinal, directive, precedence.
  final long ordinalBits = curi.getOrdinal() & 0x0000FFFFFFFFFFFFL;
  final long directiveBits = (long) curi.getSchedulingDirective() << 56;
  final long cappedPrecedence = Math.min(curi.getPrecedence(), 127);
  final long precedenceBits = (cappedPrecedence & 0xFFL) << 48;

  final long orderingField = precedenceBits | directiveBits | ordinalBits;
  ArchiveUtils.longIntoByteArray(orderingField, keyBytes, prefixLength + 1);
  return new DatabaseEntry(keyBytes);
}
/** * @param m marker or null to start with first entry * @param maxMatches * @return list of matches starting from marker position * @throws DatabaseException */ public CompositeData getFrom(String m, int maxMatches, Pattern pattern, boolean verbose) throws DatabaseException { int matches = 0; int tries = 0; ArrayList<String> results = new ArrayList<String>(maxMatches); DatabaseEntry key; if (m == null) { key = getFirstKey(); } else { byte[] marker = m.getBytes(); // = FrontierJMXTypes.fromString(m); key = new DatabaseEntry(marker); } DatabaseEntry value = new DatabaseEntry(); Cursor cursor = null; OperationStatus result = null; try { cursor = pendingUrisDB.openCursor(null, null); result = cursor.getSearchKey(key, value, null); while (matches < maxMatches && result == OperationStatus.SUCCESS) { if (value.getData().length > 0) { CrawlURI curi = (CrawlURI) crawlUriBinding.entryToObject(value); if (pattern.matcher(curi.toString()).matches()) { if (verbose) { results.add("[" + curi.getClassKey() + "] " + curi.shortReportLine()); } else { results.add(curi.toString()); } matches++; } tries++; } result = cursor.getNext(key, value, null); } } finally { if (cursor != null) { cursor.close(); } } if (result != OperationStatus.SUCCESS) { // end of scan m = null; } else { m = new String(key.getData()); // = FrontierJMXTypes.toString(key.getData()); } String[] arr = results.toArray(new String[results.size()]); CompositeData cd; try { cd = new CompositeDataSupport( /*FrontierJMXTypes.URI_LIST_DATA*/ null, new String[] {"list", "marker"}, new Object[] {arr, m}); } catch (OpenDataException e) { throw new IllegalStateException(e); } return cd; }