/**
   * Crawls the frontier in successive rounds using a fresh {@link BreadthFirstQueue}.
   *
   * <p>A {@code Redirects} instance is created reflectively from {@code _redirsClass}; if that
   * fails, a {@link DummyRedirects} is used instead. The queue is built with
   * {@code Integer.MAX_VALUE} for both size arguments, so no limits are applied there
   * (presumably max URIs / max PLDs, judging by the other call sites - confirm).
   *
   * <p>Each round starts {@code _threads} lookup threads plus a {@code Monitor}, waits for all
   * lookup threads to end, and then schedules the frontier into the queue again. Rounds continue
   * while the queue is non-empty and the round counter does not exceed
   * {@code CrawlerConstants.MAX_REDIRECTS}.
   *
   * <p>If the calling thread is interrupted while waiting, the remaining work is still carried out
   * as before, and the interrupt status is restored just before this method returns.
   *
   * @param frontier source of URIs that is (re-)scheduled into the queue before every round
   * @param seen set of already visited URIs handed to the queue
   */
  public void evaluateSequential(Frontier frontier, Seen seen) {
    Redirects r;
    try {
      r = _redirsClass.newInstance();
    } catch (InstantiationException e) {
      _log.info("InstantiationException. Using dummy.");
      r = new DummyRedirects();
    } catch (IllegalAccessException e) {
      _log.info("IllegalAccessException. Using dummy.");
      r = new DummyRedirects();
    }

    _queue = new BreadthFirstQueue(_tldm, r, seen, Integer.MAX_VALUE, Integer.MAX_VALUE, -1, false);
    _queue.schedule(frontier);

    _log.info(_queue.toString());

    int round = 0;
    // Remembered and re-asserted at the end: re-interrupting right away would make every
    // following join() return immediately.
    boolean interrupted = false;

    while (_queue.size() > 0 && round <= CrawlerConstants.MAX_REDIRECTS) {
      List<Thread> ts = new ArrayList<Thread>();

      for (int j = 0; j < _threads; j++) {
        LookupThread lt =
            new LookupThread(
                _cm, _queue, _contentHandler, _output, _links, _robots, _eh, _ff, _blacklist, j);
        ts.add(lt);
      }

      _log.info("Starting threads round " + round + " with " + _queue.size() + " uris");

      for (Thread t : ts) {
        t.start();
      }

      // Monitor is given the thread list, System.err and a 10000 (ms, presumably) interval.
      Monitor m = new Monitor(ts, System.err, 1000 * 10);
      m.start();

      for (Thread t : ts) {
        try {
          t.join();
        } catch (InterruptedException e1) {
          interrupted = true;
          _log.info("Interrupted while waiting for " + t.getName() + ": " + e1.getMessage());
        }
      }

      m.shutdown();

      _queue.schedule(frontier);

      // Report the same round number that was announced when the round started.
      _log.info("ROUND " + round + " DONE with " + _queue.size() + " uris remaining in queue");

      _log.info("new queue: \n" + _queue.toString());

      round++;
    }

    _log.info("DONE with " + _queue.size() + " uris remaining in queue");

    if (interrupted) {
      Thread.currentThread().interrupt();
    }
  }
  /**
   * Crawls the frontier breadth-first for a fixed number of rounds.
   *
   * <p>Queue set-up: if a queue already exists, its redirects are reused in place of
   * {@code redirects}. When there is no queue, or the current one is neither a
   * {@link BreadthFirstQueue} nor a {@link DiskBreadthFirstQueue}, a new queue is created (on disk
   * if {@code CrawlerConstants.BREADTHFIRSTQUEUE_ONDISK} is set). Otherwise a new
   * {@link BreadthFirstQueue} is created and the previous queue's seen set is installed on it.
   *
   * <p>Rounds run from 0 to {@code depth} inclusive, plus one more when
   * {@code crawlingMode.doExtraRound()} is true, and stop early once the URI limit (if enabled)
   * is reached. Each round starts a {@code Monitor} and {@code _threads} lookup threads, waits for
   * the lookup threads, notifies hop-aware components, re-schedules the frontier and tells the
   * error handler that the next round begins.
   *
   * <p>If the calling thread is interrupted while waiting, processing continues as before and the
   * interrupt status is restored just before this method returns.
   *
   * @param frontier source of URIs scheduled into the queue before every round
   * @param seen seen set passed to a newly created queue
   * @param redirects redirects store, used only when no queue exists yet
   * @param depth index of the last regular round
   * @param maxuris passed through to the {@link BreadthFirstQueue} constructor
   * @param maxplds passed through to the {@link BreadthFirstQueue} constructor
   * @param minActPlds passed through to the queue constructor
   * @param minActPldsAlready4Seedlist passed through to the queue constructor
   * @param crawlingMode decides which links are followed and whether an extra round is run
   */
  public void evaluateBreadthFirst(
      Frontier frontier,
      Seen seen,
      Redirects redirects,
      int depth,
      int maxuris,
      int maxplds,
      int minActPlds,
      boolean minActPldsAlready4Seedlist,
      Mode crawlingMode) {
    // Redirects of an existing queue take precedence over the ones passed in.
    Redirects r = redirects;
    if (_queue != null) {
      r = _queue.getRedirects();
    }

    if (_queue == null
        || !(_queue instanceof BreadthFirstQueue || _queue instanceof DiskBreadthFirstQueue)) {
      if (CrawlerConstants.BREADTHFIRSTQUEUE_ONDISK) {
        _queue = new DiskBreadthFirstQueue(_tldm, r, seen, minActPlds, minActPldsAlready4Seedlist);
      } else {
        _queue =
            new BreadthFirstQueue(
                _tldm, r, seen, maxuris, maxplds, minActPlds, minActPldsAlready4Seedlist);
      }
    } else {
      // NOTE(review): this branch always builds an in-memory BreadthFirstQueue, even when
      // BREADTHFIRSTQUEUE_ONDISK is set - confirm that this is intended.
      Seen tempseen = _queue.getSeen();
      _queue =
          new BreadthFirstQueue(
              _tldm, r, seen, maxuris, maxplds, minActPlds, minActPldsAlready4Seedlist);
      _queue.setRedirects(r);
      _queue.setSeen(tempseen);
    }

    if (_links == null) {
      _links = new LinkFilterDefault(frontier);
    }

    _queue.schedule(frontier);

    _links.setFollowABox(crawlingMode.followABox());
    _links.setFollowTBox(crawlingMode.followTBox());

    _log.info(_queue.toString());

    int rounds = crawlingMode.doExtraRound() ? depth + 1 : depth;

    // Remembered and re-asserted at the end: re-interrupting right away would make every
    // following join() return immediately.
    boolean interrupted = false;

    for (int curRound = 0;
        curRound <= rounds
            && (!CrawlerConstants.URI_LIMIT_ENABLED
                || LookupThread.getOverall200FetchesWithNonEmptyRDF()
                    < CrawlerConstants.URI_LIMIT_WITH_NON_EMPTY_RDF);
        curRound++) {
      List<Thread> ts = new ArrayList<Thread>();

      // Extra round to get TBox: from round 'depth' on, the link filter is switched to
      // TBox-only. The setting is not switched back afterwards.
      if (curRound == depth) {
        _links.setFollowABox(false);
        _links.setFollowTBox(true);
      }

      for (int j = 0; j < _threads; j++) {
        LookupThread lt =
            new LookupThread(
                _cm, _queue, _contentHandler, _output, _links, _robots, _eh, _ff, _blacklist, j);
        ts.add(lt);
      }

      _log.info("Starting threads round " + curRound + " with " + _queue.size() + " uris");

      // Monitor is given the thread list, System.err and a 10000 (ms, presumably) interval.
      Monitor m = new Monitor(ts, System.err, 1000 * 10);
      m.start();

      for (Thread t : ts) {
        t.start();
      }

      for (Thread t : ts) {
        try {
          t.join();
        } catch (InterruptedException e1) {
          interrupted = true;
          _log.info("Interrupted while waiting for " + t.getName() + ": " + e1.getMessage());
        }
      }

      m.shutdown();

      _log.info("ROUND " + curRound + " DONE with " + _queue.size() + " uris remaining in queue");
      _log.fine("old queue: \n" + _queue.toString());

      if (_output instanceof LastReporter) {
        _log.info(
            "Last non-empty context of this hop (# "
                + curRound
                + " ): "
                + ((LastReporter) _output).whoWasLast());
      }

      if (CrawlerConstants.SPLIT_HOPWISE) {
        for (TakingHopsIntoAccount thia : CrawlerConstants.THOSE_WHO_TAKE_HOPS_INTO_ACCOUNT) {
          try {
            thia.nextHop(curRound + 1);
          } catch (Exception e) {
            // Best effort: one failing component must not stop the others or the crawl.
            // Routed through the logger (with stack trace) instead of printStackTrace().
            _log.log(
                java.util.logging.Level.WARNING,
                "nextHop(" + (curRound + 1) + ") failed for " + thia,
                e);
          }
        }
      }

      _queue.schedule(frontier);

      _eh.handleNextRound();

      _log.fine("new queue: \n" + _queue.toString());
    }

    if (interrupted) {
      Thread.currentThread().interrupt();
    }
  }
  /**
   * Crawls the frontier in rounds using a {@link LoadBalancingQueue} until roughly
   * {@code maxuris} URIs have been taken from the queue or the queue runs empty.
   *
   * <p>Queue set-up: if the current queue is missing or of another type, a new
   * {@link LoadBalancingQueue} is created with the old queue's redirects (or a
   * {@link DummyRedirects} if there are none) and the given {@code seen}. If the current queue
   * already is a {@link LoadBalancingQueue}, it is replaced by a fresh one that reuses the old
   * queue's redirects and seen set; {@code seen} is ignored in that case.
   *
   * <p>Each round starts {@code _threads} lookup threads plus a {@code Monitor}, waits for the
   * lookup threads, adds the number of URIs by which the queue shrank to the running total, and
   * re-schedules the frontier.
   *
   * <p>If the calling thread is interrupted while waiting, processing continues as before and the
   * interrupt status is restored just before this method returns.
   *
   * @param frontier source of URIs scheduled into the queue before every round
   * @param seen seen set for a newly created queue (unused when a load-balancing queue exists)
   * @param maxuris no further round is started once this many URIs were consumed
   */
  public void evaluateLoadBalanced(Frontier frontier, Seen seen, int maxuris) {
    if (_queue == null || !(_queue instanceof LoadBalancingQueue)) {
      Redirects r = null;
      if (_queue != null) {
        r = _queue.getRedirects();
      }
      if (r == null) {
        // Reflective creation from _redirsClass is not used here; fall back to the dummy.
        r = new DummyRedirects();
      }
      _queue = new LoadBalancingQueue(_tldm, r, seen);
    } else {
      Redirects r = _queue.getRedirects();
      Seen previousSeen = _queue.getSeen();
      _queue = new LoadBalancingQueue(_tldm, r, previousSeen);
      _queue.setSeen(previousSeen);
    }

    if (_links == null) {
      _links = new LinkFilterDefault(frontier);
    }

    _queue.schedule(frontier);

    _log.fine(_queue.toString());

    int round = 0;
    int uris = 0;
    // Remembered and re-asserted at the end: re-interrupting right away would make every
    // following join() return immediately.
    boolean interrupted = false;

    while (uris < maxuris && _queue.size() > 0) {
      int size = _queue.size();

      List<Thread> ts = new ArrayList<Thread>();

      for (int j = 0; j < _threads; j++) {
        LookupThread lt =
            new LookupThread(
                _cm, _queue, _contentHandler, _output, _links, _robots, _eh, _ff, _blacklist, j);
        ts.add(lt);
      }

      _log.info("Starting threads round " + round + " with " + _queue.size() + " uris");

      for (Thread t : ts) {
        t.start();
      }

      // Monitor is given the thread list, System.err and a 10000 (ms, presumably) interval.
      Monitor m = new Monitor(ts, System.err, 1000 * 10);
      m.start();

      for (Thread t : ts) {
        try {
          t.join();
        } catch (InterruptedException e1) {
          interrupted = true;
          _log.info("Interrupted while waiting for " + t.getName() + ": " + e1.getMessage());
        }
      }

      m.shutdown();

      // URIs consumed this round = queue size before the round minus queue size after it.
      uris += size - _queue.size();

      // Report the same round number that was announced when the round started.
      _log.info("ROUND " + round + " DONE with " + _queue.size() + " uris remaining in queue");
      _log.fine("old queue: \n" + _queue.toString());

      _log.fine("frontier" + frontier);

      _queue.schedule(frontier);

      _log.info("new queue: \n" + _queue.toString());

      round++;
    }

    if (interrupted) {
      Thread.currentThread().interrupt();
    }
  }
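  // --- Example no. 4 (listing separator; originally followed by a bare "0") ---
  // NOTE(review): run() below uses different fields (_q, _hclient, _content) than the evaluate*
  // methods above, so it presumably belongs to another class - confirm.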
  /**
   * Polls URIs from the queue and looks each one up until the queue returns {@code null} or the
   * URI limit is reached.
   *
   * <p>For every URI: it is marked as seen, checked against the blacklist and robots.txt, and, if
   * allowed, fetched with an HTTP GET.
   *
   * <ul>
   *   <li>Status 200 with an entity that passes the fetch filter and has a type the content
   *       handler accepts: the body is passed to the content handler, fetch counters are updated,
   *       and a {@code Content-Location} header (if present) is recorded as a redirect.
   *   <li>Status 200 rejected by the fetch filter or content handler: reported as
   *       {@code SKIP_MIMETYPE}; no status is reported afterwards.
   *   <li>Status 301/302/303/307: the {@code Location} target is resolved against the URI and
   *       recorded as a redirect.
   * </ul>
   *
   * <p>Any {@code Throwable} raised during the lookup is reported to the error handler, and the
   * loop continues with the next URI. The request is always aborted at the end of a lookup.
   */
  public void run() {
    _log.info("starting thread ...");

    if (isUriLimitReached()) {
      _log.info("URI limit reached. Stopping...");
      return;
    }

    int i = 0;

    URI lu = _q.poll();

    _log.fine("got " + lu);

    while (lu != null) {

      if (isUriLimitReached()) {
        _log.info("URI limit reached. Stopping...");
        break;
      }

      setName("LT-" + _no + ":" + lu.toString());

      _q.addSeen(lu);

      i++;

      // 'time' and 'time1' are taken back to back, so the "before lookup" figure logged below is
      // close to zero; both are kept so the log format stays unchanged.
      long time = System.currentTimeMillis();
      long time1 = System.currentTimeMillis();
      long time2 = time1;
      long time3 = time1;
      long bytes = -1;
      int status = 0;
      String type = null;

      Header[] headers = null;

      if (!_blacklist.fetchOk(lu, 0, null)) {
        _log.info("access denied per blacklist for " + lu);
        _eh.handleStatus(lu, CrawlerConstants.SKIP_SUFFIX, null, 0, -1);
      } else if (!_robots.accessOk(lu)) {
        _log.info("access denied per robots.txt for " + lu);
        _eh.handleStatus(lu, CrawlerConstants.SKIP_ROBOTS, null, 0, -1);
      } else {
        time2 = System.currentTimeMillis();

        HttpGet hget = new HttpGet(lu);
        hget.setHeaders(CrawlerConstants.HEADERS);

        try {
          HttpResponse hres = _hclient.connect(hget);

          HttpEntity hen = hres.getEntity();

          status = hres.getStatusLine().getStatusCode();

          Header ct = hres.getFirstHeader("Content-Type");
          if (ct != null) {
            type = ct.getValue();
          }

          _log.info("lookup on " + lu + " status " + status + " " + getName());

          if (status == HttpStatus.SC_OK) {
            if (hen != null) {
              if (_ff.fetchOk(lu, status, hen) && _contentHandler.canHandle(type)) {
                InputStream is = hen.getContent();
                try {
                  Callback contentCb =
                      _content.newDataset(new Provenance(lu, hres.getAllHeaders(), status));
                  Callbacks cbs =
                      new Callbacks(
                          new Callback[] {contentCb, _links, _stmtCountingCallback.reset()});
                  _contentHandler.handle(lu, type, is, cbs);
                } finally {
                  // Close the stream even when the content handler fails.
                  is.close();
                }

                _overall200Fetches.incrementAndGet();

                if (_stmtCountingCallback.getStmtCount() > 0) {
                  _overall200FetchesWithRDF.incrementAndGet();
                }

                headers = hres.getAllHeaders();

                Header hloc = hres.getFirstHeader("Content-Location");
                if (hloc != null) {
                  URI to = new URI(hloc.getValue());

                  // handle local redirects
                  if (!to.isAbsolute()) {
                    to = lu.resolve(hloc.getValue());
                  }

                  _q.setRedirect(lu, to, status);
                  _eh.handleRedirect(lu, to, status);
                  _q.addSeen(to);
                }
              } else {
                _log.info("disallowed via fetch filter " + lu + " type " + type);
                _eh.handleStatus(lu, CrawlerConstants.SKIP_MIMETYPE, null, 0, -1);
                hget.abort();
                // Clearing both suppresses the byte count and the status report further down.
                hen = null;
                status = 0;
              }
            } else {
              _log.info("HttpEntity for " + lu + " is null");
            }
          } else if (status == HttpStatus.SC_MOVED_PERMANENTLY
              || status == HttpStatus.SC_MOVED_TEMPORARILY
              || status == HttpStatus.SC_SEE_OTHER
              || status == HttpStatus.SC_TEMPORARY_REDIRECT) {
            // treating all redirects the same but shouldn't: 301 -> rename context URI, 302,307 ->
            // keep original context URI, 303 -> spec inconclusive
            Header loc = hres.getFirstHeader("location");
            if (loc == null) {
              // Handled by the catch below exactly like any other lookup failure, but with a
              // meaningful cause instead of an ArrayIndexOutOfBoundsException.
              throw new IllegalStateException(
                  "redirect status " + status + " without Location header");
            }
            String path = loc.getValue();
            _log.info("redirecting (" + status + ") to " + path);
            URI to = new URI(path);

            // handle local redirects
            if (!to.isAbsolute()) {
              to = lu.resolve(path);
            }

            // set redirect from original uri to new uri
            _q.setRedirect(lu, to, status);
            _eh.handleRedirect(lu, to, status);

            headers = hres.getAllHeaders();
          }

          if (hen != null) {
            bytes = hen.getContentLength();
          }
          hget.abort();
        } catch (Throwable e) {
          // Deliberately broad: a failure on one URI must not end the worker thread.
          hget.abort();
          _log.warning("Exception " + e.getClass().getName() + " " + lu);
          _eh.handleError(lu, e);
        }

        time3 = System.currentTimeMillis();

        if (status != 0) {
          _eh.handleStatus(lu, status, headers, (time3 - time2), bytes);
        }

        _log.fine(
            lu
                + " "
                + (time1 - time)
                + " ms before lookup, "
                + (time2 - time1)
                + " ms to check if lookup is ok, "
                + (time3 - time2)
                + " ms for lookup");
      }

      lu = _q.poll();
    }

    _log.info(
        "finished thread after fetching "
            + i
            + " uris; "
            + getOverall200Fetches()
            + " in all threads overall until now ("
            + getOverall200FetchesWithNonEmptyRDF()
            + " with non-empty RDF).");
  }

  /**
   * Tells whether the URI limit is enabled and the counter of 200 fetches with RDF has reached
   * {@code CrawlerConstants.URI_LIMIT_WITH_NON_EMPTY_RDF}.
   */
  private boolean isUriLimitReached() {
    return CrawlerConstants.URI_LIMIT_ENABLED
        && _overall200FetchesWithRDF.get() >= CrawlerConstants.URI_LIMIT_WITH_NON_EMPTY_RDF;
  }