private void addCandidateFromHeader(CrawlURL url, String urlInHeader) throws MalformedURLException { CrawlURL candidate = new CrawlURL(urlInHeader, url.getURL()); // 从种子重定向来的,也认为是种子 candidate.setSeed(url.isSeed()); url.addCandidate(candidate); }
/**
 * Prepares a URL before it is scheduled into the frontier.
 *
 * <pre>
 * 1. computes the work-queue key and stores it on the URL;
 * 2. computes the priority and stores it on the URL;
 * 3. computes the canonical string form and stores it on the URL
 *    (per the original author's note, this string is used by the UriUniqFilter).
 * </pre>
 *
 * @param url the URL to prepare; it is modified in place
 */
public void prepare(CrawlURL url) {
    // NOTE(review): each helper receives the URL itself, so a later helper may read
    // a value stored by an earlier call — keep this order unless confirmed independent.
    url.setWorkQueueKey(getWorkQueueKeyFor(url));
    url.setPriority(getPriorityFor(url));
    url.setCanonicalStr(getCanonicalStrFor(url));
}
@Override public void process(HandlerContext ctx, CrawlURL url) { // location String loc = url.getResponse().getHeader(HttpHeaders.Names.LOCATION); // no location, proceed the pipeline if (loc == null) { ctx.proceed(); return; } try { // location found, 跳过ExtracxtorHttp等后续,直接进入candidate handler addCandidateFromHeader(url, loc); ctx.jumpTo(candidateHandlerName); } catch (MalformedURLException e) { // location found but broken,视该URL为broken的,停止继续处理 url.setNeedRetry(false); ctx.finish(); } }
@Override public void run() { try { // 如果http请求没有发送完毕,我们还需要监听OP_WRITE状态 Boolean requestSendFinished = (Boolean) uri.getHandlerAttr(_REQUEST_SEND_FINISHED); if (Boolean.TRUE.equals(requestSendFinished)) { channel.register(selector, SelectionKey.OP_READ, uri); } else { channel.register(selector, SelectionKey.OP_READ | SelectionKey.OP_WRITE, uri); } } catch (ClosedChannelException e) { // channel由于某些原因关闭了,比如发送http request失败等。忽略之 } }
private void processReadableKey(SelectionKey key) { /** * * * <pre> * 这里暂时使用一连串的Heap-based ByteBuffer来保存每次读取到的网页数据, * * TODO: * 1. 使用directByteBuffer,bytebuffer池化/一次性分配大的然后slice? * 2. 读大小预测器 -- 保存在CrawURI中,每服务器一个 * 3. resume时传递的message不用bytebuffer,拷贝到一个相同大小的HeapChannelBuffer中; byteBuffer只在执行真正的网络IO时使用 * 4. CrawURI中使用一个composite channel buffer,每次将第三步中的buffer合并进去,减少拷贝次数 * </pre> */ ByteBuffer buffer = this.receiveBuffer; buffer.clear(); SocketChannel channel = (SocketChannel) key.channel(); CrawlURL uri = (CrawlURL) key.attachment(); int ret = 0; int readBytes = 0; try { while ((ret = channel.read(buffer)) > 0) { // 在低速网络情况下会抛出:java.io.IOException: // 远程主机强迫关闭了一个现有的连接。 readBytes += ret; if (!buffer.hasRemaining()) { break; } } // 读取完毕了?设置URI的状态 uri.setFetchStatus(ret < 0 ? FETCH_SUCCESSED : FETCH_ING); // 若本次读到了数据,无论是否读取完毕均resume pipeline执行,并将读取到的数据传递出去 if (readBytes > 0) { // 从DirectBuffer拷贝数据到一个compact的Heap ByteBuffer,传递出去 ByteBuffer msg = ByteBuffer.allocate(buffer.position()); buffer.flip(); msg.put(buffer); uri.getPipeline().resume(msg); } } catch (IOException e) { Object lastSendTime = uri.getHandlerAttr(_LAST_SEND_REQUEST_MILLIS); Long conTime = (Long) uri.getHandlerAttr(_CONNECT_SUCCESS_MILLIS); Integer sendReqTimes = (Integer) uri.getHandlerAttr(_REQUEST_SEND_TIMES); Integer sendBytes = (Integer) uri.getHandlerAttr(_REQUEST_ALREADY_SEND_SIZE); Integer requestSize = (Integer) uri.getHandlerAttr(_REQUEST_SIZE); long now = System.currentTimeMillis(); String debug = "\n"; if (lastSendTime != null) { debug += "距上次发送request时间(s):" + ((now - (Long) lastSendTime) / 1000); debug += "\n一共发送request次数:" + sendReqTimes; debug += "\n一共发送字节:" + sendBytes; debug += "\n请求共有字节:" + requestSize; } else { debug += "未发送过request,距连接成功时间(s):" + ((now - conTime) / 1000); } logger.error("error read http response ! URL: " + uri + debug, e); cancelAndClose(key); // TODO 读取响应失败,重试? 
uri.setFetchStatus(FETCH_FAILED); uri.getPipeline().resume(DefaultPipeline.EMPTY_MSG); } // 如果数据读取完毕,取消注册,关闭连接 if (ret < 0) { cancelAndClose(key); } }
private void processWritableKey(SelectionKey key) { CrawlURL url = (CrawlURL) key.attachment(); SocketChannel channel = (SocketChannel) key.channel(); ByteBuffer buffer = (ByteBuffer) url.getHandlerAttr(_REQUEST_BUFFER); try { // 发送http请求,若发送完成,取消OP_WRITE。 int writtenBytes = 0; for (int i = WRITE_SPIN_COUNT; i > 0; i--) { writtenBytes = channel.write(buffer); // write success if (writtenBytes != 0) { url.setHandlerAttr(_LAST_SEND_REQUEST_MILLIS, System.currentTimeMillis()); url.setHandlerAttr( _REQUEST_ALREADY_SEND_SIZE, (Integer) url.getHandlerAttr(_REQUEST_ALREADY_SEND_SIZE) + writtenBytes); url.setHandlerAttr( _REQUEST_SEND_TIMES, (Integer) url.getHandlerAttr(_REQUEST_SEND_TIMES) + 1); break; } } boolean reqSendFinished = !buffer.hasRemaining(); url.setHandlerAttr(_REQUEST_SEND_FINISHED, reqSendFinished); url.setHandlerAttr(_REQUEST_SEND_FINISHED_MILLIS, reqSendFinished); if (reqSendFinished) { url.removeHandlerAttr(_REQUEST_BUFFER); key.interestOps(key.interestOps() & ~SelectionKey.OP_WRITE); } } catch (IOException e) { logger.error("error send http request ! URL: " + url); cancelAndClose(key); url.setFetchStatus(FETCH_FAILED); url.getPipeline().resume(DefaultPipeline.EMPTY_MSG); } }