예제 #1
0
  /**
   * Checks whether a document with the given URL is already stored in the "listings" collection.
   *
   * @param url the URL to look up; its string form is matched against the "url" field
   * @return {@code true} if at least one matching document exists, {@code false} otherwise
   */
  private boolean listingExists(URL url) {
    DB db = LecturaCrawlerSuite.getDB();
    DBCollection collection = db.getCollection("listings");

    DBCursor cursor = collection.find(new BasicDBObject("url", url.toString()));
    try {
      return cursor.count() > 0;
    } finally {
      // release the cursor even when count() throws
      cursor.close();
    }
  }
예제 #2
0
  /**
   * Configures the logger of this instance.
   *
   * <p>Does nothing when {@code name} is not set. Otherwise the logger level is set from the given
   * level string, forwarding to parent handlers is switched off, and a {@link MongoLogHandler}
   * created with {@code "log." + name} and {@code ttl} is attached. In debug mode a console
   * handler accepting all levels is attached as well.
   *
   * @param level textual log level, converted by {@code LecturaCrawlerSuite.convertLogLevel}
   * @param ttl value handed to the {@link MongoLogHandler} constructor
   * @param debug when {@code true}, log records are also written to the console
   */
  public void configureLogger(String level, int ttl, boolean debug) {
    if (name != null) {
      logger.setLevel(LecturaCrawlerSuite.convertLogLevel(level));

      // records must not travel up to the parent handlers
      logger.setUseParentHandlers(false);

      // debug mode additionally mirrors everything to the console
      if (debug) {
        ConsoleHandler console = new ConsoleHandler();
        console.setLevel(Level.ALL);
        logger.addHandler(console);
      }

      MongoLogHandler mongoHandler = new MongoLogHandler("log." + name, ttl);
      logger.addHandler(mongoHandler);
    }
  }
예제 #3
0
  /**
   * Downloads the given URL and returns the response body as a string.
   *
   * <p>The request is attempted up to {@code retry} times (at least once), sleeping {@code
   * retryAfter} milliseconds between attempts, until a response with status 200 is received. The
   * response of a failed attempt is closed before the next attempt is made. If the thread is
   * interrupted while waiting, the interrupt flag is restored and no further attempt is made.
   *
   * <p>If the last response has a status other than 200, the status line, headers and body are
   * logged as warnings. The body is read exactly once and is still returned in that case.
   * NOTE(review): the original read the entity twice on a non-200 status, so the second read most
   * likely failed; confirm that callers are fine with receiving the error body.
   *
   * @param url the page to download
   * @return the response body, or {@code null} if no response could be obtained, the response has
   *     no entity, or reading the body failed
   */
  protected String loadCustomPage(URL url) {
    logger.info("Loading custom page: " + url);
    CloseableHttpResponse response = null;
    // always try at least once, even if retry is configured as 0 (or less)
    int maxAttempts = Math.max(1, retry);
    int attempts = 0;
    boolean responseOk = false;
    while (attempts < maxAttempts && !responseOk) {

      if (attempts > 0) {
        // release the response of the failed attempt so its connection is not leaked
        if (response != null) {
          try {
            response.close();
          } catch (IOException e) {
            logger.severe("Failed to close HTTP response: " + e.getMessage());
          }
          response = null;
        }

        try {
          logger.finest("Going to sleep for " + retryAfter + "ms " + url);
          Thread.sleep(retryAfter);
        } catch (InterruptedException e) {
          logger.severe("Failed to sleep for " + retryAfter + "ms " + url);
          // keep the interrupt visible to the caller and stop retrying
          Thread.currentThread().interrupt();
          break;
        }
        logger.info(
            "Retrying to get page after "
                + retryAfter
                + "ms : Attempt "
                + (attempts + 1)
                + "/"
                + maxAttempts
                + " "
                + url);
      }

      try {
        response = LecturaCrawlerSuite.getResponse(url, useProxy);
      } catch (Exception e) {
        logger.severe("Failed to retrieve response from custom page: " + e.getMessage());
      }

      responseOk = (response != null && response.getStatusLine().getStatusCode() == 200);
      attempts++;
    }

    // every attempt failed with an exception (or the wait was interrupted)
    if (response == null) {
      logger.severe("Failed to load custom page: [" + url + "] no response received");
      return null;
    }

    try {
      boolean statusOk = response.getStatusLine().getStatusCode() == 200;

      // if the status code is not OK, report a problem
      if (!statusOk) {
        StringBuilder headersStr = new StringBuilder(response.getStatusLine().toString());
        for (Header header : response.getAllHeaders()) {
          headersStr.append(" | ").append(header);
        }
        logger.warning("Status code not OK: [" + url + "] " + headersStr);
      }

      // the entity content can be consumed only once, so read it a single time
      HttpEntity entity = response.getEntity();
      String body = (entity != null) ? EntityUtils.toString(entity) : null;

      if (!statusOk && body != null) {
        logger.warning(body);
      }
      return body;

    } catch (Exception e) {
      logger.severe("Failed to load custom page: [" + url + "] " + e.getMessage());
      return null;
    } finally {
      try {
        response.close();
      } catch (IOException e) {
        logger.severe("Failed to close HTTP response: " + e.getMessage());
      }
    }
  }
예제 #4
0
  /**
   * Downloads the given URL and passes the decoded response body to the given parser.
   *
   * <p>Unless the caller is a method named {@code testListing}, a request made with the listing
   * parser is skipped when {@code listingExists(url)} reports that the URL is already stored; in
   * that case {@code currentListing} is replaced by a new listing flagged as duplicate.
   *
   * <p>The request is attempted up to {@code retry} times (at least once), sleeping {@code
   * retryAfter} milliseconds between attempts, until a response with status 200 is received. The
   * response of a failed attempt is consumed and closed before the next attempt. If the thread is
   * interrupted while waiting, the interrupt flag is restored and no further attempt is made.
   *
   * <p>The body is decoded with the charset named in the Content-Type header, falling back to
   * UTF-8. If the last response has a status other than 200, the status line, headers and body
   * are logged as warnings; the body is read once and is still passed to the parser.
   * NOTE(review): the original read the entity twice on a non-200 status, so the second read most
   * likely failed; confirm that parsing error pages is intended.
   *
   * @param url the page to download
   * @param parser the parser that receives the page content
   */
  protected void loadPage(URL url, Parser parser) {
    // Ugly hack: the call stack is inspected to find out which method called us.
    StackTraceElement[] stackTraceElements = Thread.currentThread().getStackTrace();
    // if testListing called this, perform no check
    boolean calledFromTestListing =
        stackTraceElements.length > 2
            && stackTraceElements[2].getMethodName().equals("testListing");
    if (!calledFromTestListing && parser.equals(listingParser) && listingExists(url)) {
      currentListing = new Listing();
      currentListing.setDuplicate(true);
      // avoid the request
      return;
    }

    CloseableHttpResponse response = null;
    // always try at least once, even if retry is configured as 0 (or less)
    int maxAttempts = Math.max(1, retry);
    int attempts = 0;
    boolean responseOk = false;
    while (attempts < maxAttempts && !responseOk) {

      if (attempts > 0) {

        // consume the entity, close the previous response -> release the resources (connection,
        // ...)
        if (response != null) {
          try {
            // this will consume the entity and release the resources automatically, e.g. connection
            EntityUtils.consume(response.getEntity());
          } catch (IOException e) {
            logger.severe("Failed to consume the entity: " + e.getMessage());
          }

          try {
            response.close();
          } catch (IOException e) {
            logger.severe("Failed to close HTTP response: " + e.getMessage());
          }
          // never reuse a closed response if the next attempt fails with an exception
          response = null;
        }

        try {
          logger.finest("Going to sleep for " + retryAfter + "ms " + url);
          Thread.sleep(retryAfter);
        } catch (InterruptedException e) {
          logger.severe("Failed to sleep for " + retryAfter + "ms " + url);
          // keep the interrupt visible to the caller and stop retrying
          Thread.currentThread().interrupt();
          break;
        }
        logger.info(
            "Retrying to get page after "
                + retryAfter
                + "ms : Attempt "
                + (attempts + 1)
                + "/"
                + maxAttempts
                + " "
                + url);
      }

      try {
        response = LecturaCrawlerSuite.getResponse(url, useProxy);
      } catch (Exception e) {
        logger.severe("Failed to retrieve response: " + e.getMessage());
      }

      responseOk =
          (response != null && response.getStatusLine().getStatusCode() == HttpStatus.SC_OK);
      attempts++;
    }

    // every attempt failed with an exception (or the wait was interrupted)
    if (response == null) {
      logger.severe("Failed to load page: [" + url + "] no response received");
      return;
    }

    try {
      boolean statusOk = response.getStatusLine().getStatusCode() == HttpStatus.SC_OK;

      // if the status code is not OK, report a problem
      if (!statusOk) {
        StringBuilder headersStr = new StringBuilder(response.getStatusLine().toString());
        for (Header header : response.getAllHeaders()) {
          headersStr.append(" | ").append(header);
        }
        logger.warning("Status code not OK: [" + url + "] " + headersStr);
      }

      HttpEntity entity = response.getEntity();
      if (entity != null) {
        Charset charset = Charset.forName("UTF-8");
        String regexCharset = ".*?charset=(.*)$";
        for (Header header : response.getHeaders("Content-Type")) {
          String val = header.getValue();
          if (val.matches(regexCharset)) {
            try {
              charset = Charset.forName(val.replaceAll(regexCharset, "$1"));
              break;
            } catch (IllegalArgumentException e) {
              // illegal or unsupported charset name: ignore, it is already set to utf-8
            }
          }
        }

        // the entity content can be consumed only once, so read it a single time
        String body = EntityUtils.toString(entity, charset);
        if (!statusOk && body != null) {
          logger.warning(body);
        }
        parser.parse(body);
      }

    } catch (Exception e) {
      logger.severe("Failed to load page: [" + url + "] " + e.getMessage());
    } finally {
      try {
        // this will consume the entity and release the resources automatically, e.g. connection
        EntityUtils.consume(response.getEntity());
      } catch (IOException e) {
        logger.severe("Failed to consume the entity: " + e.getMessage());
      }

      try {
        response.close();
      } catch (IOException e) {
        logger.severe("Failed to close HTTP response: " + e.getMessage());
      }
    }
  }