Java Protocol примеры использования

Язык программирования: Java

Пространство имен/Пакет: org.apache.nutch.protocol

Класс/Тип: Protocol

Примеров на hotexamples.com: 3

Java Protocol - 3 примера найдено. Это лучшие примеры Java кода для org.apache.nutch.protocol.Protocol, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

getProtocolOutput(3)

getRobotRules(1)

Пример #1

Показать файл

Файл: TestOOParser.java Проект: Earne/HiBench

  public void testIt() throws ProtocolException, ParseException {
    String urlString;
    Content content;
    Parse parse;
    Configuration conf = NutchConfiguration.create();
    Protocol protocol;
    ProtocolFactory factory = new ProtocolFactory(conf);
    OOParser parser = new OOParser();
    parser.setConf(conf);

    for (int i = 0; i < sampleFiles.length; i++) {
      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

      protocol = factory.getProtocol(urlString);
      content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();

      parse = parser.getParse(content).get(content.getUrl());

      String text = parse.getText().replaceAll("[ \t\r\n]+", " ");
      assertTrue(expectedText.equals(text));
    }
  }

Пример #2

Показать файл

Файл: FetcherThread.java Проект: Ccccz/nutch

  @SuppressWarnings("fallthrough")
  public void run() {
    activeThreads.incrementAndGet(); // count threads

    FetchItem fit = null;
    try {
      // checking for the server to be running and fetcher.parse to be true
      if (parsing && NutchServer.getInstance().isRunning()) reportToNutchServer = true;

      while (true) {
        // creating FetchNode for storing in FetchNodeDb
        if (reportToNutchServer) this.fetchNode = new FetchNode();
        else this.fetchNode = null;

        // check whether must be stopped
        if (isHalted()) {
          LOG.debug(getName() + " set to halted");
          fit = null;
          return;
        }

        fit = ((FetchItemQueues) fetchQueues).getFetchItem();
        if (fit == null) {
          if (feeder.isAlive() || ((FetchItemQueues) fetchQueues).getTotalSize() > 0) {
            LOG.debug(getName() + " spin-waiting ...");
            // spin-wait.
            ((AtomicInteger) spinWaiting).incrementAndGet();
            try {
              Thread.sleep(500);
            } catch (Exception e) {
            }
            ((AtomicInteger) spinWaiting).decrementAndGet();
            continue;
          } else {
            // all done, finish this thread
            LOG.info("Thread " + getName() + " has no more work available");
            return;
          }
        }
        lastRequestStart.set(System.currentTimeMillis());
        Text reprUrlWritable = (Text) fit.datum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY);
        if (reprUrlWritable == null) {
          setReprUrl(fit.url.toString());
        } else {
          setReprUrl(reprUrlWritable.toString());
        }
        try {
          // fetch the page
          redirecting = false;
          redirectCount = 0;
          do {
            if (LOG.isInfoEnabled()) {
              LOG.info(
                  "fetching "
                      + fit.url
                      + " (queue crawl delay="
                      + ((FetchItemQueues) fetchQueues).getFetchItemQueue(fit.queueID).crawlDelay
                      + "ms)");
            }
            if (LOG.isDebugEnabled()) {
              LOG.debug("redirectCount=" + redirectCount);
            }
            redirecting = false;
            Protocol protocol = this.protocolFactory.getProtocol(fit.url.toString());
            BaseRobotRules rules = protocol.getRobotRules(fit.url, fit.datum);
            if (!rules.isAllowed(fit.u.toString())) {
              // unblock
              ((FetchItemQueues) fetchQueues).finishFetchItem(fit, true);
              if (LOG.isDebugEnabled()) {
                LOG.debug("Denied by robots.txt: " + fit.url);
              }
              output(
                  fit.url,
                  fit.datum,
                  null,
                  ProtocolStatus.STATUS_ROBOTS_DENIED,
                  CrawlDatum.STATUS_FETCH_GONE);
              reporter.incrCounter("FetcherStatus", "robots_denied", 1);
              continue;
            }
            if (rules.getCrawlDelay() > 0) {
              if (rules.getCrawlDelay() > maxCrawlDelay && maxCrawlDelay >= 0) {
                // unblock
                ((FetchItemQueues) fetchQueues).finishFetchItem(fit, true);
                LOG.debug(
                    "Crawl-Delay for "
                        + fit.url
                        + " too long ("
                        + rules.getCrawlDelay()
                        + "), skipping");
                output(
                    fit.url,
                    fit.datum,
                    null,
                    ProtocolStatus.STATUS_ROBOTS_DENIED,
                    CrawlDatum.STATUS_FETCH_GONE);
                reporter.incrCounter("FetcherStatus", "robots_denied_maxcrawldelay", 1);
                continue;
              } else {
                FetchItemQueue fiq = ((FetchItemQueues) fetchQueues).getFetchItemQueue(fit.queueID);
                fiq.crawlDelay = rules.getCrawlDelay();
                if (LOG.isDebugEnabled()) {
                  LOG.debug(
                      "Crawl delay for queue: "
                          + fit.queueID
                          + " is set to "
                          + fiq.crawlDelay
                          + " as per robots.txt. url: "
                          + fit.url);
                }
              }
            }
            ProtocolOutput output = protocol.getProtocolOutput(fit.url, fit.datum);
            ProtocolStatus status = output.getStatus();
            Content content = output.getContent();
            ParseStatus pstatus = null;
            // unblock queue
            ((FetchItemQueues) fetchQueues).finishFetchItem(fit);

            String urlString = fit.url.toString();

            // used for FetchNode
            if (fetchNode != null) {
              fetchNode.setStatus(status.getCode());
              fetchNode.setFetchTime(System.currentTimeMillis());
              fetchNode.setUrl(fit.url);
            }

            reporter.incrCounter("FetcherStatus", status.getName(), 1);

            switch (status.getCode()) {
              case ProtocolStatus.WOULDBLOCK:
                // retry ?
                ((FetchItemQueues) fetchQueues).addFetchItem(fit);
                break;

              case ProtocolStatus.SUCCESS: // got a page
                pstatus =
                    output(
                        fit.url,
                        fit.datum,
                        content,
                        status,
                        CrawlDatum.STATUS_FETCH_SUCCESS,
                        fit.outlinkDepth);
                updateStatus(content.getContent().length);
                if (pstatus != null
                    && pstatus.isSuccess()
                    && pstatus.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) {
                  String newUrl = pstatus.getMessage();
                  int refreshTime = Integer.valueOf(pstatus.getArgs()[1]);
                  Text redirUrl =
                      handleRedirect(
                          fit.url,
                          fit.datum,
                          urlString,
                          newUrl,
                          refreshTime < Fetcher.PERM_REFRESH_TIME,
                          Fetcher.CONTENT_REDIR);
                  if (redirUrl != null) {
                    fit = queueRedirect(redirUrl, fit);
                  }
                }
                break;

              case ProtocolStatus.MOVED: // redirect
              case ProtocolStatus.TEMP_MOVED:
                int code;
                boolean temp;
                if (status.getCode() == ProtocolStatus.MOVED) {
                  code = CrawlDatum.STATUS_FETCH_REDIR_PERM;
                  temp = false;
                } else {
                  code = CrawlDatum.STATUS_FETCH_REDIR_TEMP;
                  temp = true;
                }
                output(fit.url, fit.datum, content, status, code);
                String newUrl = status.getMessage();
                Text redirUrl =
                    handleRedirect(
                        fit.url, fit.datum, urlString, newUrl, temp, Fetcher.PROTOCOL_REDIR);
                if (redirUrl != null) {
                  fit = queueRedirect(redirUrl, fit);
                } else {
                  // stop redirecting
                  redirecting = false;
                }
                break;

              case ProtocolStatus.EXCEPTION:
                logError(fit.url, status.getMessage());
                int killedURLs =
                    ((FetchItemQueues) fetchQueues).checkExceptionThreshold(fit.getQueueID());
                if (killedURLs != 0)
                  reporter.incrCounter(
                      "FetcherStatus", "AboveExceptionThresholdInQueue", killedURLs);
                /* FALLTHROUGH */
              case ProtocolStatus.RETRY: // retry
              case ProtocolStatus.BLOCKED:
                output(fit.url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_RETRY);
                break;

              case ProtocolStatus.GONE: // gone
              case ProtocolStatus.NOTFOUND:
              case ProtocolStatus.ACCESS_DENIED:
              case ProtocolStatus.ROBOTS_DENIED:
                output(fit.url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_GONE);
                break;

              case ProtocolStatus.NOTMODIFIED:
                output(fit.url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_NOTMODIFIED);
                break;

              default:
                if (LOG.isWarnEnabled()) {
                  LOG.warn("Unknown ProtocolStatus: " + status.getCode());
                }
                output(fit.url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_RETRY);
            }

            if (redirecting && redirectCount > maxRedirect) {
              ((FetchItemQueues) fetchQueues).finishFetchItem(fit);
              if (LOG.isInfoEnabled()) {
                LOG.info(" - redirect count exceeded " + fit.url);
              }
              output(
                  fit.url,
                  fit.datum,
                  null,
                  ProtocolStatus.STATUS_REDIR_EXCEEDED,
                  CrawlDatum.STATUS_FETCH_GONE);
            }

          } while (redirecting && (redirectCount <= maxRedirect));

        } catch (Throwable t) { // unexpected exception
          // unblock
          ((FetchItemQueues) fetchQueues).finishFetchItem(fit);
          logError(fit.url, StringUtils.stringifyException(t));
          output(
              fit.url,
              fit.datum,
              null,
              ProtocolStatus.STATUS_FAILED,
              CrawlDatum.STATUS_FETCH_RETRY);
        }
      }

    } catch (Throwable e) {
      if (LOG.isErrorEnabled()) {
        LOG.error("fetcher caught:" + e.toString());
      }
    } finally {
      if (fit != null) ((FetchItemQueues) fetchQueues).finishFetchItem(fit);
      activeThreads.decrementAndGet(); // count threads
      LOG.info("-finishing thread " + getName() + ", activeThreads=" + activeThreads);
    }
  }

Пример #3

Показать файл

Файл: ParserChecker.java Проект: harjiwiga/nutch-2.3.1

  public int run(String[] args) throws Exception {
    boolean dumpText = false;
    boolean force = false;
    String contentType = null;
    String url = null;

    String usage = "Usage: ParserChecker [-dumpText] [-forceAs mimeType] url";

    if (args.length == 0) {
      LOG.error(usage);
      return (-1);
    }

    for (int i = 0; i < args.length; i++) {
      if (args[i].equals("-forceAs")) {
        force = true;
        contentType = args[++i];
      } else if (args[i].equals("-dumpText")) {
        dumpText = true;
      } else if (i != args.length - 1) {
        LOG.error(usage);
        System.exit(-1);
      } else {
        url = URLUtil.toASCII(args[i]);
      }
    }

    if (LOG.isInfoEnabled()) {
      LOG.info("fetching: " + url);
    }

    ProtocolFactory factory = new ProtocolFactory(conf);
    Protocol protocol = factory.getProtocol(url);
    WebPage page = WebPage.newBuilder().build();

    ProtocolOutput protocolOutput = protocol.getProtocolOutput(url, page);

    if (!protocolOutput.getStatus().isSuccess()) {
      LOG.error(
          "Fetch failed with protocol status: "
              + ProtocolStatusUtils.getName(protocolOutput.getStatus().getCode())
              + ": "
              + ProtocolStatusUtils.getMessage(protocolOutput.getStatus()));
      return (-1);
    }
    Content content = protocolOutput.getContent();

    if (content == null) {
      LOG.error("No content for " + url);
      return (-1);
    }
    page.setBaseUrl(new org.apache.avro.util.Utf8(url));
    page.setContent(ByteBuffer.wrap(content.getContent()));

    if (force) {
      content.setContentType(contentType);
    } else {
      contentType = content.getContentType();
    }

    if (contentType == null) {
      LOG.error("Failed to determine content type!");
      return (-1);
    }

    page.setContentType(new Utf8(contentType));

    if (ParserJob.isTruncated(url, page)) {
      LOG.warn("Content is truncated, parse may fail!");
    }

    Parse parse = new ParseUtil(conf).parse(url, page);

    if (parse == null) {
      LOG.error("Problem with parse - check log");
      return (-1);
    }

    // Calculate the signature
    byte[] signature = SignatureFactory.getSignature(getConf()).calculate(page);

    if (LOG.isInfoEnabled()) {
      LOG.info("parsing: " + url);
      LOG.info("contentType: " + contentType);
      LOG.info("signature: " + StringUtil.toHexString(signature));
    }

    LOG.info("---------\nUrl\n---------------\n");
    System.out.print(url + "\n");
    LOG.info("---------\nMetadata\n---------\n");
    Map<CharSequence, ByteBuffer> metadata = page.getMetadata();
    StringBuffer sb = new StringBuffer();
    if (metadata != null) {
      Iterator<Entry<CharSequence, ByteBuffer>> iterator = metadata.entrySet().iterator();
      while (iterator.hasNext()) {
        Entry<CharSequence, ByteBuffer> entry = iterator.next();
        sb.append(entry.getKey().toString())
            .append(" : \t")
            .append(Bytes.toString(entry.getValue()))
            .append("\n");
      }
      System.out.print(sb.toString());
    }
    LOG.info("---------\nOutlinks\n---------\n");
    sb = new StringBuffer();
    for (Outlink l : parse.getOutlinks()) {
      sb.append("  outlink: ").append(l).append('\n');
    }
    System.out.print(sb.toString());
    if (page.getHeaders() != null) {
      LOG.info("---------\nHeaders\n---------\n");
      Map<CharSequence, CharSequence> headers = page.getHeaders();
      StringBuffer headersb = new StringBuffer();
      if (metadata != null) {
        Iterator<Entry<CharSequence, CharSequence>> iterator = headers.entrySet().iterator();
        while (iterator.hasNext()) {
          Entry<CharSequence, CharSequence> entry = iterator.next();
          headersb
              .append(entry.getKey().toString())
              .append(" : \t")
              .append(entry.getValue())
              .append("\n");
        }
        System.out.print(headersb.toString());
      }
    }
    if (dumpText) {
      LOG.info("---------\nParseText\n---------\n");
      System.out.print(parse.getText());
    }

    return 0;
  }