@Override public int removeAllByProfileHandle(final String profileHandle, final long timeout) throws IOException, SpaceExceededException { // first find a list of url hashes that shall be deleted final long terminate = timeout == Long.MAX_VALUE ? Long.MAX_VALUE : (timeout > 0) ? System.currentTimeMillis() + timeout : Long.MAX_VALUE; int count = 0; synchronized (this) { for (Index depthStack : this.depthStacks.values()) { final HandleSet urlHashes = new RowHandleSet(Word.commonHashLength, Base64Order.enhancedCoder, 100); final Iterator<Row.Entry> i = depthStack.rows(); Row.Entry rowEntry; Request crawlEntry; while (i.hasNext() && (System.currentTimeMillis() < terminate)) { rowEntry = i.next(); crawlEntry = new Request(rowEntry); if (crawlEntry.profileHandle().equals(profileHandle)) { urlHashes.put(crawlEntry.url().hash()); } if (System.currentTimeMillis() > terminate) break; } for (final byte[] urlhash : urlHashes) { depthStack.remove(urlhash); count++; } } } return count; }
/** * count number of references for a given term this method may cause strong IO load if called too * frequently. */ @Override public int count(final byte[] termHash) { final Integer cachedCount = this.countCache.get(termHash); if (cachedCount != null) return cachedCount.intValue(); int countFile = 0; // read fresh values from file try { countFile = this.array.count(termHash); } catch (final Throwable e) { ConcurrentLog.logException(e); } assert countFile >= 0; // count from container in ram final ReferenceContainer<ReferenceType> countRam = this.ram.get(termHash, null); assert countRam == null || countRam.size() >= 0; int c = countRam == null ? countFile : countFile + countRam.size(); // exclude entries from delayed remove synchronized (this.removeDelayedURLs) { final HandleSet s = this.removeDelayedURLs.get(termHash); if (s != null) c -= s.size(); if (c < 0) c = 0; } // put count result into cache if (MemoryControl.shortStatus()) this.countCache.clear(); this.countCache.insert(termHash, c); return c; }
/**
 * Special accessor for BufferedObjectIndex that returns only the keys of objects in the buffer.
 *
 * <p>The intended use case is iterating over some elements and then deleting all iterated
 * elements from the index. To keep the IO load low, the buffer is first topped up from the
 * backend using {@code removeOne()}, which is meant to minimize read/write-head operations. The
 * given load value does not denote the number of {@code removeOne()} operations but the number
 * of records the buffer shall hold afterwards; only the missing records are moved. The load must
 * not exceed the maximal number of entries in the buffer. To leave room for put()-inserts while
 * the returned keys are processed, it is recommended to use at most half of the maximal buffer
 * size as load value.
 *
 * @param load number of records that shall be in the buffer when the key set is created
 * @return the keys of all elements in the buffer
 * @throws IOException if {@code load} exceeds the buffer size or the backend/buffer access fails
 */
public HandleSet keysFromBuffer(final int load) throws IOException {
  if (load > this.buffersize) {
    throw new IOException("buffer load size exceeded");
  }

  synchronized (this.backend) {
    // move as many records from the backend into the buffer as are missing (and available)
    final int refill = Math.min(this.backend.size(), load - this.buffer.size());
    for (int moved = 0; moved < refill; moved++) {
      try {
        this.buffer.put(this.backend.removeOne());
      } catch (final SpaceExceededException e) {
        ConcurrentLog.logException(e);
        break;
      }
    }

    // copy the keys of everything that is in the buffer now
    final HandleSet handles =
        new RowHandleSet(
            this.buffer.row().primaryKeyLength,
            this.buffer.row().objectOrder,
            this.buffer.size());
    for (final Iterator<byte[]> keys = this.buffer.keys(); keys.hasNext(); ) {
      try {
        handles.put(keys.next());
      } catch (final SpaceExceededException e) {
        ConcurrentLog.logException(e);
        break;
      }
    }
    handles.optimize();
    return handles;
  }
}
/**
 * Queues the removal of one url reference of a term; the removal itself is carried out later by
 * {@link #removeDelayed()}.
 *
 * <p>Looking up (or creating) the pending set of the term, adding the url hash and publishing
 * the set in the map happen in a single critical section on {@code removeDelayedURLs}.
 * Otherwise two threads queuing the first url for the same term would each create their own set
 * and one of them would be overwritten, and the set could be modified while other methods read
 * it under the lock.
 *
 * <p>If the url hash cannot be queued because the pending set cannot grow, the reference is
 * removed immediately instead.
 *
 * @param termHash hash of the term the reference belongs to
 * @param urlHashBytes hash of the url whose reference shall be removed
 */
@Override
public void removeDelayed(final byte[] termHash, final byte[] urlHashBytes) {
  boolean queued = true;
  synchronized (this.removeDelayedURLs) {
    HandleSet r = this.removeDelayedURLs.get(termHash);
    if (r == null) {
      r = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 0);
    }
    try {
      r.put(urlHashBytes);
      this.removeDelayedURLs.put(termHash, r);
    } catch (final SpaceExceededException e) {
      // no room to remember the url; fall back to an immediate removal below
      queued = false;
    }
  }
  if (queued) {
    return;
  }

  // the fallback does IO, so it runs outside of the lock
  try {
    remove(termHash, urlHashBytes);
  } catch (final IOException ignored) {
    // Best effort: this method cannot propagate the failure through its signature.
    // NOTE(review): the reference stays in the index in this case; consider logging it.
  }
}
@Override public void removeDelayed() throws IOException { final HandleSet words = new RowHandleSet( Word.commonHashLength, Word.commonHashOrder, 0); // a set of url hashes where a worker thread tried to work on, but failed. synchronized (this.removeDelayedURLs) { for (final byte[] b : this.removeDelayedURLs.keySet()) try { words.put(b); } catch (final SpaceExceededException e) { } } synchronized (this.removeDelayedURLs) { for (final byte[] b : words) { final HandleSet urls = this.removeDelayedURLs.remove(b); if (urls != null) remove(b, urls); } } this.countCache.clear(); }