/**
 * Checks whether a listing with the given URL is already stored in the "listings" collection.
 *
 * @param url listing URL; its string form is matched against the "url" field of stored documents
 * @return {@code true} if at least one stored document has this URL, {@code false} otherwise
 */
private boolean listingExists(URL url) {
  DB db = LecturaCrawlerSuite.getDB();
  DBCollection collection = db.getCollection("listings");
  DBCursor cursor = collection.find(new BasicDBObject("url", url.toString()));
  try {
    return cursor.count() > 0;
  } finally {
    // release the cursor even when count() throws
    cursor.close();
  }
}
public void configureLogger(String level, int ttl, boolean debug) { if (name == null) return; Level logLevel = LecturaCrawlerSuite.convertLogLevel(level); logger.setLevel(logLevel); // disable passing the logs up to the parent handler logger.setUseParentHandlers(false); // if debug mode is on, enable console handler if (debug) { ConsoleHandler consoleHandler = new ConsoleHandler(); consoleHandler.setLevel(Level.ALL); logger.addHandler(consoleHandler); } logger.addHandler(new MongoLogHandler("log." + name, ttl)); }
protected String loadCustomPage(URL url) { logger.info("Loading custom page: " + url); CloseableHttpResponse response = null; int attempts = 0; int retry_ = (retry == 0) ? 1 : retry; boolean responseOk = false; while (attempts < retry_ && !responseOk) { if (attempts > 0) { try { logger.finest("Going to sleep for " + retryAfter + "ms " + url); Thread.sleep(retryAfter); } catch (InterruptedException e) { // ignore logger.severe("Failed to sleep for " + retryAfter + "ms " + url); } logger.info( "Retrying to get page after " + retryAfter + "ms : Attempt " + new Integer(attempts + 1) + "/" + retry + " " + url); } try { response = LecturaCrawlerSuite.getResponse(url, useProxy); } catch (Exception e) { logger.severe("Failed to retrieve response from custom page: " + e.getMessage()); } responseOk = (response != null && response.getStatusLine().getStatusCode() == 200); attempts++; } try { HttpEntity entity = response.getEntity(); // if the status code is not OK, report a problem if (response.getStatusLine().getStatusCode() != 200) { Header[] headers = response.getAllHeaders(); String headersStr = response.getStatusLine().toString(); for (int i = 0; i < headers.length; i++) { headersStr += " | " + headers[i]; } logger.warning("Status code not OK: [" + url.toString() + "] " + headersStr); logger.warning(EntityUtils.toString(entity)); } if (entity != null) { return EntityUtils.toString(entity); } else { return null; } } catch (Exception e) { logger.severe("Failed to load custom page: [" + url.toString() + "] " + e.getMessage()); return null; } finally { try { response.close(); } catch (IOException e) { logger.severe("Failed to close HTTP response: " + e.getMessage()); } } }
protected void loadPage(URL url, Parser parser) { // toto je prasarna jak kreten StackTraceElement[] stackTraceElements = Thread.currentThread().getStackTrace(); // if testListing called this, perform no check if (!stackTraceElements[2].getMethodName().equals("testListing")) { if (parser.equals(listingParser)) { if (listingExists(url)) { currentListing = new Listing(); currentListing.setDuplicate(true); // avoid the request return; } } } CloseableHttpResponse response = null; int attempts = 0; int retry_ = (retry == 0) ? 1 : retry; boolean responseOk = false; while (attempts < retry_ && !responseOk) { if (attempts > 0) { // consume the entity, close the previous response -> release the resources (connection, // ...) if (response != null) { try { // this will consume the entity and release the resources automatically, e.g. connection EntityUtils.consume(response.getEntity()); } catch (IOException e) { logger.severe("Failed to consume the entity: " + e.getMessage()); } try { response.close(); } catch (IOException e) { logger.severe("Failed to close HTTP response: " + e.getMessage()); } } try { logger.finest("Going to sleep for " + retryAfter + "ms " + url); Thread.sleep(retryAfter); } catch (InterruptedException e) { // ignore logger.severe("Failed to sleep for " + retryAfter + "ms " + url); } logger.info( "Retrying to get page after " + retryAfter + "ms : Attempt " + new Integer(attempts + 1) + "/" + retry + " " + url); } try { response = LecturaCrawlerSuite.getResponse(url, useProxy); } catch (Exception e) { logger.severe("Failed to retrieve response: " + e.getMessage()); } responseOk = (response != null && response.getStatusLine().getStatusCode() == HttpStatus.SC_OK); attempts++; } try { HttpEntity entity = response.getEntity(); // if the status code is not OK, report a problem if (response.getStatusLine().getStatusCode() != HttpStatus.SC_OK) { Header[] headers = response.getAllHeaders(); String headersStr = response.getStatusLine().toString(); for (int i = 0; i < 
headers.length; i++) { headersStr += " | " + headers[i]; } logger.warning("Status code not OK: [" + url.toString() + "] " + headersStr); logger.warning(EntityUtils.toString(entity)); } if (entity != null) { Header[] headers = response.getHeaders("Content-Type"); Charset charset = Charset.forName("UTF-8"); String regexCharset = ".*?charset=(.*)$"; for (Header header : headers) { String val = header.getValue(); if (val.matches(regexCharset)) { try { charset = Charset.forName(val.replaceAll(regexCharset, "$1")); break; } catch (IllegalCharsetNameException e) { // ignore, it is already set to utf-8 } } } parser.parse(EntityUtils.toString(entity, charset)); } } catch (Exception e) { logger.severe("Failed to load page: [" + url + "] " + e.getMessage()); } finally { try { // this will consume the entity and release the resources automatically, e.g. connection EntityUtils.consume(response.getEntity()); } catch (IOException e) { logger.severe("Failed to consume the entity: " + e.getMessage()); } try { response.close(); } catch (IOException e) { logger.severe("Failed to close HTTP response: " + e.getMessage()); } } }