Esempio n. 1
0
 /**
  * Creates a candidate {@code CrawlURL} from a URL string found in a response header and attaches
  * it to the URL currently being processed.
  *
  * <p>The candidate is constructed from {@code urlInHeader} together with {@code url.getURL()}
  * (presumably the base it is resolved against -- confirm in the {@code CrawlURL} constructor).
  *
  * @param url the URL whose response carried the header; receives the new candidate
  * @param urlInHeader the URL string taken from the header
  * @throws MalformedURLException propagated when the candidate cannot be constructed
  */
 private void addCandidateFromHeader(CrawlURL url, String urlInHeader)
     throws MalformedURLException {
   final CrawlURL redirectTarget = new CrawlURL(urlInHeader, url.getURL());

   // A URL reached by redirect from a seed is itself treated as a seed.
   final boolean inheritedSeedFlag = url.isSeed();
   redirectTarget.setSeed(inheritedSeedFlag);

   url.addCandidate(redirectTarget);
 }
Esempio n. 2
0
  /**
   * Prepares a URL before it is scheduled into the frontier.
   *
   * <p>Three properties are computed and stored on the URL, in this order:
   *
   * <ol>
   *   <li>the work-queue key;
   *   <li>the priority;
   *   <li>the canonical string form, which will be used by the UriUniqFilter.
   * </ol>
   *
   * @param url the URL to prepare; it is updated in place
   */
  public void prepare(CrawlURL url) {
    // Step 1: work-queue key.
    url.setWorkQueueKey(getWorkQueueKeyFor(url));
    // Step 2: scheduling priority.
    url.setPriority(getPriorityFor(url));
    // Step 3: canonical string form.
    url.setCanonicalStr(getCanonicalStrFor(url));
  }
Esempio n. 3
0
  /**
   * Handles the {@code Location} header of a fetched URL's response.
   *
   * <ul>
   *   <li>No {@code Location} header: the pipeline simply proceeds.
   *   <li>Header present and usable: the target is added as a candidate and processing jumps
   *       straight to the handler named by {@code candidateHandlerName}.
   *   <li>Header present but malformed: the URL is marked as not to be retried and processing of
   *       it is finished.
   * </ul>
   *
   * @param ctx the handler context used to steer the pipeline
   * @param url the URL whose response is inspected
   */
  @Override
  public void process(HandlerContext ctx, CrawlURL url) {
    final String location = url.getResponse().getHeader(HttpHeaders.Names.LOCATION);

    if (location != null) {
      try {
        // Redirect target found: skip the HTTP extractor and the other handlers that follow, and
        // go directly to the candidate handler.
        addCandidateFromHeader(url, location);
        ctx.jumpTo(candidateHandlerName);
      } catch (MalformedURLException malformed) {
        // The header is there but unusable: treat this URL as broken and stop processing it.
        url.setNeedRetry(false);
        ctx.finish();
      }
      return;
    }

    // No redirect: continue with the next handler in the pipeline.
    ctx.proceed();
  }
Esempio n. 4
0
 /**
  * Registers {@code channel} with {@code selector}, attaching {@code uri} to the resulting key.
  *
  * <p>The channel is always registered for {@code OP_READ}. {@code OP_WRITE} is added unless the
  * {@code _REQUEST_SEND_FINISHED} attribute of the URL is {@code Boolean.TRUE}, because an HTTP
  * request that has not been completely sent still needs write-readiness notifications.
  */
 @Override
 public void run() {
   try {
     final Boolean sendFinished = (Boolean) uri.getHandlerAttr(_REQUEST_SEND_FINISHED);

     int interestOps = SelectionKey.OP_READ;
     if (!Boolean.TRUE.equals(sendFinished)) {
       // Request not fully sent yet: keep listening for writability as well.
       interestOps |= SelectionKey.OP_WRITE;
     }

     channel.register(selector, interestOps, uri);
   } catch (ClosedChannelException ignored) {
     // The channel was closed for some reason (e.g. sending the HTTP request failed); ignore it.
   }
 }
Esempio n. 5
0
  /**
   * Reads the response bytes currently available on a readable key and passes them to the pipeline
   * of the {@code CrawlURL} attached to that key.
   *
   * <p>The shared {@code receiveBuffer} is cleared and then filled until the channel returns no
   * more data, reports end-of-stream, or the buffer is full. The fetch status is set to
   * {@code FETCH_SUCCESSED} on end-of-stream and to {@code FETCH_ING} otherwise. If any bytes were
   * read, they are copied into a freshly allocated heap buffer and the pipeline is resumed with it.
   * On end-of-stream the key is cancelled and the connection closed.
   *
   * <p>On an {@code IOException} the failure is logged with send/connect timing details, the key is
   * cancelled and closed, the status is set to {@code FETCH_FAILED}, and the pipeline is resumed
   * with {@code DefaultPipeline.EMPTY_MSG}.
   *
   * @param key the selection key that became readable; its attachment must be a {@code CrawlURL}
   */
  private void processReadableKey(SelectionKey key) {
    /*
     * For now a series of heap-based ByteBuffers holds the page data from each read.
     *
     * TODO:
     * 1. Use direct ByteBuffers; pool them, or allocate one large buffer up front and slice it?
     * 2. Read-size predictor -- stored in the CrawlURL, one per server.
     * 3. Do not pass a ByteBuffer as the resume message; copy into a HeapChannelBuffer of the same
     *    size, and use ByteBuffers only for the actual network I/O.
     * 4. Keep a composite channel buffer in the CrawlURL and merge the buffer from step 3 into it
     *    each time, to reduce the number of copies.
     */
    ByteBuffer buffer = this.receiveBuffer;
    buffer.clear();
    SocketChannel channel = (SocketChannel) key.channel();
    CrawlURL uri = (CrawlURL) key.attachment();

    int ret = 0;
    int readBytes = 0;
    try {
      // On slow networks this read can throw java.io.IOException ("An existing connection was
      // forcibly closed by the remote host").
      while ((ret = channel.read(buffer)) > 0) {
        readBytes += ret;
        if (!buffer.hasRemaining()) {
          break;
        }
      }
      // End-of-stream (ret < 0) means the response is complete; otherwise the fetch is ongoing.
      uri.setFetchStatus(ret < 0 ? FETCH_SUCCESSED : FETCH_ING);
      // If this pass read any data, resume the pipeline with it whether or not reading is done.
      // NOTE(review): when end-of-stream arrives with zero bytes read, the status is updated but
      // the pipeline is not resumed here -- confirm that completion is detected elsewhere.
      if (readBytes > 0) {
        // Copy out of the shared receive buffer into a compact heap ByteBuffer to hand over.
        ByteBuffer msg = ByteBuffer.allocate(buffer.position());
        buffer.flip();
        msg.put(buffer);
        // NOTE(review): msg is passed with its position at the limit (it is not flipped);
        // presumably the consumer rewinds it -- verify.
        uri.getPipeline().resume(msg);
      }

    } catch (IOException e) {
      Object lastSendTime = uri.getHandlerAttr(_LAST_SEND_REQUEST_MILLIS);
      Long conTime = (Long) uri.getHandlerAttr(_CONNECT_SUCCESS_MILLIS);
      Integer sendReqTimes = (Integer) uri.getHandlerAttr(_REQUEST_SEND_TIMES);
      Integer sendBytes = (Integer) uri.getHandlerAttr(_REQUEST_ALREADY_SEND_SIZE);
      Integer requestSize = (Integer) uri.getHandlerAttr(_REQUEST_SIZE);
      long now = System.currentTimeMillis();
      StringBuilder debug = new StringBuilder("\n");
      if (lastSendTime != null) {
        debug.append("距上次发送request时间(s):").append((now - (Long) lastSendTime) / 1000);
        debug.append("\n一共发送request次数:").append(sendReqTimes);
        debug.append("\n一共发送字节:").append(sendBytes);
        debug.append("\n请求共有字节:").append(requestSize);
      } else {
        debug.append("未发送过request,距连接成功时间(s):");
        // Guard against a missing connect timestamp: unboxing null here would throw from inside
        // the error handler and skip the cleanup below.
        if (conTime != null) {
          debug.append((now - conTime) / 1000);
        } else {
          debug.append("unknown");
        }
      }
      logger.error("error read http response ! URL: " + uri + debug, e);
      cancelAndClose(key);
      // TODO: reading the response failed -- retry?
      uri.setFetchStatus(FETCH_FAILED);
      uri.getPipeline().resume(DefaultPipeline.EMPTY_MSG);
    }

    // Once all data has been read, deregister the key and close the connection.
    if (ret < 0) {
      cancelAndClose(key);
    }
  }
Esempio n. 6
0
 /**
  * Writes as much of the pending HTTP request as the channel accepts for a writable key.
  *
  * <p>The request bytes come from the {@code _REQUEST_BUFFER} attribute of the attached
  * {@code CrawlURL}. The write is attempted up to {@code WRITE_SPIN_COUNT} times and stops at the
  * first attempt that writes any bytes; the last-send timestamp, the sent-byte counter and the
  * send-attempt counter are then updated. When the buffer has been fully written, the finish flag
  * and finish timestamp are recorded, the request buffer attribute is removed, and
  * {@code OP_WRITE} is cleared from the key's interest set.
  *
  * <p>On an {@code IOException} the failure is logged, the key is cancelled and closed, the status
  * is set to {@code FETCH_FAILED}, and the pipeline is resumed with
  * {@code DefaultPipeline.EMPTY_MSG}.
  *
  * @param key the selection key that became writable; its attachment must be a {@code CrawlURL}
  */
 private void processWritableKey(SelectionKey key) {
   CrawlURL url = (CrawlURL) key.attachment();
   SocketChannel channel = (SocketChannel) key.channel();
   ByteBuffer buffer = (ByteBuffer) url.getHandlerAttr(_REQUEST_BUFFER);
   try {
     // Send the HTTP request; once it is completely sent, stop listening for OP_WRITE.
     int writtenBytes = 0;
     for (int i = WRITE_SPIN_COUNT; i > 0; i--) {
       writtenBytes = channel.write(buffer);
       // Some bytes went out: record the progress and stop spinning.
       if (writtenBytes != 0) {
         url.setHandlerAttr(_LAST_SEND_REQUEST_MILLIS, System.currentTimeMillis());
         // Treat a missing counter as zero so bookkeeping cannot abort the write path.
         Integer sentSoFar = (Integer) url.getHandlerAttr(_REQUEST_ALREADY_SEND_SIZE);
         url.setHandlerAttr(
             _REQUEST_ALREADY_SEND_SIZE, (sentSoFar == null ? 0 : sentSoFar) + writtenBytes);
         Integer sendTimes = (Integer) url.getHandlerAttr(_REQUEST_SEND_TIMES);
         url.setHandlerAttr(_REQUEST_SEND_TIMES, (sendTimes == null ? 0 : sendTimes) + 1);
         break;
       }
     }
     boolean reqSendFinished = !buffer.hasRemaining();
     url.setHandlerAttr(_REQUEST_SEND_FINISHED, reqSendFinished);
     if (reqSendFinished) {
       // Record when the request finished sending. This attribute previously received the
       // boolean flag instead of a timestamp.
       url.setHandlerAttr(_REQUEST_SEND_FINISHED_MILLIS, System.currentTimeMillis());
       url.removeHandlerAttr(_REQUEST_BUFFER);
       key.interestOps(key.interestOps() & ~SelectionKey.OP_WRITE);
     }
   } catch (IOException e) {
     // Pass the exception along so the cause and stack trace are not lost.
     logger.error("error send http request ! URL: " + url, e);
     cancelAndClose(key);
     url.setFetchStatus(FETCH_FAILED);
     url.getPipeline().resume(DefaultPipeline.EMPTY_MSG);
   }
 }