/**
 * Resolves and validates the main and secure base URIs from the configuration properties.
 *
 * <p>Runs once after construction. Any problem with {@code MainUrl} is reported as a
 * {@link RuntimeException}; the assumption is that such a failure at application start-up
 * keeps the container from starting at all.
 *
 * <p>{@code SecureUrl} is optional. When it is absent, it is derived from the main URI by
 * switching a leading {@code http:} scheme to {@code https:}. A main URI that already uses
 * {@code https} is reused unchanged.
 *
 * @throws RuntimeException if {@code MainUrl} cannot be parsed, is not absolute or has no host,
 *     or if the secure URI cannot be parsed
 */
@PostConstruct
public void init() {
  try {
    mainURI = new URI(properties.getProperty("MainUrl"), true, "UTF-8");
  } catch (Exception e) {
    // Keep the original exception as the cause so the stack trace is not lost.
    throw new RuntimeException(ERR_MSG + e.getMessage(), e);
  }

  if (!mainURI.isAbsoluteURI()) {
    throw new RuntimeException(ERR_MSG + "URI not absolute path");
  }

  String mainHost;
  try {
    mainHost = mainURI.getHost();
  } catch (URIException e) {
    throw new RuntimeException(ERR_MSG + e.getMessage(), e);
  }
  if (mainHost == null) {
    throw new RuntimeException(ERR_MSG + "bad URI host");
  }

  // Only rewrite the scheme itself: anchoring on "^http:" leaves an https main URL intact
  // (a plain replaceFirst("http", "https") would produce "httpss://...") and never touches
  // an "http" substring that appears later in the host or path.
  String defaultSecureUrl = mainURI.toString().replaceFirst("(?i)^http:", "https:");
  try {
    secureURI = new URI(properties.getProperty("SecureUrl", defaultSecureUrl), true, "UTF-8");
  } catch (Exception e) {
    throw new RuntimeException(ERR_MSG + e.getMessage(), e);
  }
}
private boolean isValidMessageToScan(HttpMessage msg) { if (getScannerOptions().isScanHeadersAllRequests()) { return true; } // First we check if it's a dynamic or static page // I'd to do this because scanning starts to be veeeeery slow // -- // this is a trivial implementation, should be good to have // a page dynamic check at the parent plugin level which should // use or not Variants according to the behavior of the request // (e.g. different content or status error/redirect) String query = null; try { query = msg.getRequestHeader().getURI().getQuery(); } catch (URIException e) { log.error(e.getMessage(), e); } // If there's almost one GET parameter go ahead if (query == null || query.isEmpty()) { // If also the Request body is null maybe it's a static page oer a null parameter page if (msg.getRequestBody().length() == 0) { return false; } } return true; }
/**
 * Builds a human-readable dump of a request: method type, URI, query string and, for POST
 * requests, the body parameters and the string body.
 *
 * <p>The string body is included only when the request entity actually is a
 * {@code StringRequestEntity}. Other entity types (for example the one HttpClient generates
 * from form parameters) and a missing entity are skipped instead of failing the dump.
 *
 * @param method the HTTP method to describe
 * @return the multi-line dump; if the URI cannot be read, a shortened dump ending with the
 *     URI error message
 */
private String getHttpInfoDumy(HttpMethod method) {
  NameValuePair[] params = null;
  String methodType = "GET";
  String reqBody = null;

  if (method instanceof PostMethod) {
    PostMethod post = (PostMethod) method;
    params = post.getParameters();
    methodType = "POST";
    // Typed as Object on purpose: only the StringRequestEntity case is of interest here, and
    // a blind cast would throw for any other (or no) entity.
    Object entity = post.getRequestEntity();
    if (entity instanceof StringRequestEntity) {
      reqBody = ((StringRequestEntity) entity).getContent();
    }
  }

  StringBuilder sb = new StringBuilder();
  try {
    sb.append("#### getHttpInfoDumy ####");
    sb.append("\n## " + methodType + " [" + method.getURI() + "], hscd[" + this.hashCode() + "]");
  } catch (URIException e) {
    sb.append("\n## getParamsQueryStr- URIException " + e.getMessage() + "]");
    return sb.toString();
  }

  String queryString = method.getQueryString();
  if (queryString != null && queryString.length() > 0) {
    sb.append("\n" + "## queryString[" + queryString + "]");
  }

  if (params != null) {
    for (int i = 0; i < params.length; i++) {
      NameValuePair param = params[i];
      sb.append(
          "\n"
              + "## POST body param["
              + i
              + "], name["
              + param.getName()
              + "], value["
              + param.getValue()
              + "]");
    }
  }

  if (reqBody != null) {
    sb.append("\n" + "## POST body String [" + reqBody + "]");
  }

  sb.append("\n##########");
  return sb.toString();
}
public boolean populate(CrawlURI curi, HttpClient http, HttpMethod method, String payload) { // http is not used. // payload is not used. boolean result = false; Map formItems = null; try { formItems = getFormItems(curi); } catch (AttributeNotFoundException e1) { logger.severe("Failed get of form items for " + curi); } if (formItems == null || formItems.size() <= 0) { try { logger.severe("No form items for " + method.getURI()); } catch (URIException e) { logger.severe("No form items and exception getting uri: " + e.getMessage()); } return result; } NameValuePair[] data = new NameValuePair[formItems.size()]; int index = 0; String key = null; for (Iterator i = formItems.keySet().iterator(); i.hasNext(); ) { key = (String) i.next(); data[index++] = new NameValuePair(key, (String) formItems.get(key)); } if (method instanceof PostMethod) { ((PostMethod) method).setRequestBody(data); result = true; } else if (method instanceof GetMethod) { // Append these values to the query string. // Get current query string, then add data, then get it again // only this time its our data only... then append. HttpMethodBase hmb = (HttpMethodBase) method; String currentQuery = hmb.getQueryString(); hmb.setQueryString(data); String newQuery = hmb.getQueryString(); hmb.setQueryString(((currentQuery != null) ? currentQuery : "") + "&" + newQuery); result = true; } else { logger.severe("Unknown method type: " + method); } return result; }
/**
 * Tests whether {@code curi} is the prerequisite (login) URI configured for it.
 *
 * <p>The configured prerequisite is resolved against the URI of {@code curi} and compared with
 * it as a string. On a match, {@code curi} is flagged as a prerequisite if it is not already.
 *
 * @param curi the URI to test
 * @return {@code true} if {@code curi} equals its resolved prerequisite URI
 */
public boolean isPrerequisite(final CrawlURI curi) {
  String ownUri = curi.getUURI().toString();
  String prerequisite = getPrerequisite(curi);
  if (prerequisite == null) {
    return false;
  }

  boolean matches = false;
  try {
    UURI resolved = UURIFactory.getInstance(curi.getUURI(), prerequisite);
    boolean comparable = resolved != null && ownUri != null;
    if (comparable && resolved.toString().equals(ownUri)) {
      matches = true;
      if (!curi.isPrerequisite()) {
        curi.setPrerequisite(true);
        logger.fine(curi + " is prereq.");
      }
    }
  } catch (URIException e) {
    logger.severe("Failed to uuri: " + curi + ", " + e.getMessage());
  }
  return matches;
}
protected void addHeaderLink(CrawlURI curi, Header loc) { if (loc == null) { // If null, return without adding anything. return; } // TODO: consider possibility of multiple headers try { /** * 302重定向使用自定义的方法存储link * * @modify: wuliufu * @since : 2012-05-11 */ curi.createAndAddLocationLink( curi.getVia(), loc.getValue(), loc.getName() + ":", Link.REFER_HOP); if (curi.getObject(URLInfo.ATTACH) != null) { UURI outUURI = UURIFactory.getInstance(curi.getUURI(), loc.getValue()); logger.debug( "ParseHTTP: curi = " + curi.getUURI().toString() + "&& " + loc.getName() + "=" + outUURI.toString()); curi.putObject(outUURI.toString(), curi.getObject(URLInfo.ATTACH)); } numberOfLinksExtracted++; } catch (URIException e) { // There may not be a controller (e.g. If we're being run // by the extractor tool). if (getController() != null) { getController().logUriError(e, curi.getUURI(), loc.getValue()); } else { logger.info(curi + ", " + loc.getValue() + ": " + e.getMessage()); } } }
/** Run method of the thread */ public void run() { queue = manager.workQueue; while (manager.hasWorkLeft()) { working = false; // code to make the worker pause, if the pause button has been presed // if the stop signal has been given stop the thread if (stop) { return; } // this pasuses the thread synchronized (this) { while (pleaseWait) { try { wait(); } catch (InterruptedException e) { return; } catch (Exception e) { e.printStackTrace(); } } } GetMethod httpget = null; HeadMethod httphead = null; try { work = (WorkUnit) queue.take(); working = true; url = work.getWork(); int code = 0; String responce = ""; String rawResponce = ""; // if the work is a head request if (work.getMethod().equalsIgnoreCase("HEAD")) { if (Config.debug) { System.out.println("DEBUG Worker[" + threadId + "]: HEAD " + url.toString()); } httphead = new HeadMethod(url.toString()); // set the custom HTTP headers Vector HTTPheaders = manager.getHTTPHeaders(); for (int a = 0; a < HTTPheaders.size(); a++) { HTTPHeader httpHeader = (HTTPHeader) HTTPheaders.elementAt(a); /* * Host header has to be set in a different way! 
*/ if (httpHeader.getHeader().startsWith("Host")) { httphead.getParams().setVirtualHost(httpHeader.getValue()); } else { httphead.setRequestHeader(httpHeader.getHeader(), httpHeader.getValue()); } } httphead.setFollowRedirects(Config.followRedirects); /* * this code is used to limit the number of request/sec */ if (manager.isLimitRequests()) { while (manager.getTotalDone() / ((System.currentTimeMillis() - manager.getTimestarted()) / 1000.0) > manager.getLimitRequestsTo()) { Thread.sleep(100); } } /* * Send the head request */ code = httpclient.executeMethod(httphead); if (Config.debug) { System.out.println("DEBUG Worker[" + threadId + "]: " + code + " " + url.toString()); } httphead.releaseConnection(); } // if we are doing a get request else if (work.getMethod().equalsIgnoreCase("GET")) { // make the request; if (Config.debug) { System.out.println("DEBUG Worker[" + threadId + "]: GET " + url.toString()); } httpget = new GetMethod(url.toString()); // set the custom HTTP headers Vector HTTPheaders = manager.getHTTPHeaders(); for (int a = 0; a < HTTPheaders.size(); a++) { HTTPHeader httpHeader = (HTTPHeader) HTTPheaders.elementAt(a); /* * Host header has to be set in a different way! 
*/ if (httpHeader.getHeader().startsWith("Host")) { httpget.getParams().setVirtualHost(httpHeader.getValue()); } else { httpget.setRequestHeader(httpHeader.getHeader(), httpHeader.getValue()); } } httpget.setFollowRedirects(Config.followRedirects); /* * this code is used to limit the number of request/sec */ if (manager.isLimitRequests()) { while (manager.getTotalDone() / ((System.currentTimeMillis() - manager.getTimestarted()) / 1000.0) > manager.getLimitRequestsTo()) { Thread.sleep(100); } } code = httpclient.executeMethod(httpget); if (Config.debug) { System.out.println("DEBUG Worker[" + threadId + "]: " + code + " " + url.toString()); } // set up the input stream BufferedReader input = new BufferedReader(new InputStreamReader(httpget.getResponseBodyAsStream())); // save the headers into a string, used in viewing raw responce String rawHeader; rawHeader = httpget.getStatusLine() + "\r\n"; Header headers[] = httpget.getResponseHeaders(); StringBuffer buf = new StringBuffer(); for (int a = 0; a < headers.length; a++) { buf.append(headers[a].getName() + ": " + headers[a].getValue() + "\r\n"); } rawHeader = rawHeader + buf.toString(); buf = new StringBuffer(); // read in the responce body String line; while ((line = input.readLine()) != null) { buf.append("\r\n" + line); } responce = buf.toString(); input.close(); rawResponce = rawHeader + responce; // clean the responce // parse the html of what we have found if (Config.parseHTML && !work.getBaseCaseObj().isUseRegexInstead()) { Header contentType = httpget.getResponseHeader("Content-Type"); if (contentType != null) { if (contentType.getValue().startsWith("text")) { manager.addHTMLToParseQueue(new HTMLparseWorkUnit(responce, work)); } } } responce = FilterResponce.CleanResponce(responce, work); Thread.sleep(10); httpget.releaseConnection(); } else { // There is no need to deal with requests other than HEAD or GET } // if we need to check the against the base case if (work.getMethod().equalsIgnoreCase("GET") && 
work.getBaseCaseObj().useContentAnalysisMode()) { if (code == 200) { if (Config.debug) { System.out.println( "DEBUG Worker[" + threadId + "]: Base Case Check " + url.toString()); } // TODO move this option to the Adv options // if the responce does not match the base case Pattern regexFindFile = Pattern.compile(".*file not found.*", Pattern.CASE_INSENSITIVE); Matcher m = regexFindFile.matcher(responce); // need to clean the base case of the item we are looking for String basecase = FilterResponce.removeItemCheckedFor( work.getBaseCaseObj().getBaseCase(), work.getItemToCheck()); if (m.find()) { // do nothing as we have a 404 } else if (!responce.equalsIgnoreCase(basecase)) { if (work.isDir()) { if (Config.debug) { System.out.println( "DEBUG Worker[" + threadId + "]: Found Dir (base case)" + url.toString()); } // we found a dir manager.foundDir(url, code, responce, basecase, rawResponce, work.getBaseCaseObj()); } else { // found a file if (Config.debug) { System.out.println( "DEBUG Worker[" + threadId + "]: Found File (base case)" + url.toString()); } manager.foundFile( url, code, responce, work.getBaseCaseObj().getBaseCase(), rawResponce, work.getBaseCaseObj()); } } } else if (code == 404 || code == 400) { // again do nothing as it is not there } else { if (work.isDir()) { if (Config.debug) { System.out.println( "DEBUG Worker[" + threadId + "]: Found Dir (base case)" + url.toString()); } // we found a dir manager.foundDir( url, code, responce, work.getBaseCaseObj().getBaseCase(), rawResponce, work.getBaseCaseObj()); } else { // found a file if (Config.debug) { System.out.println( "DEBUG Worker[" + threadId + "]: Found File (base case)" + url.toString()); } manager.foundFile( url, code, responce, work.getBaseCaseObj().getBaseCase(), rawResponce, work.getBaseCaseObj()); } // manager.foundError(url, "Base Case Mode Error - Responce code came back as " + code + // " it should have been 200"); // manager.workDone(); } } /* * use the custom regex check instead */ else if 
(work.getBaseCaseObj().isUseRegexInstead()) { Pattern regexFindFile = Pattern.compile(work.getBaseCaseObj().getRegex()); Matcher m = regexFindFile.matcher(rawResponce); /* System.out.println("======Trying to find======"); System.out.println(work.getBaseCaseObj().getRegex()); System.out.println("======In======"); System.out.println(responce); System.out.println("======/In======"); */ if (m.find()) { // do nothing as we have a 404 if (Config.debug) { System.out.println( "DEBUG Worker[" + threadId + "]: Regex matched so it's a 404, " + url.toString()); } } else { if (Config.parseHTML) { Header contentType = httpget.getResponseHeader("Content-Type"); if (contentType != null) { if (contentType.getValue().startsWith("text")) { manager.addHTMLToParseQueue(new HTMLparseWorkUnit(rawResponce, work)); } } } if (work.isDir()) { if (Config.debug) { System.out.println( "DEBUG Worker[" + threadId + "]: Found Dir (regex) " + url.toString()); } // we found a dir manager.foundDir( url, code, responce, work.getBaseCaseObj().getBaseCase(), rawResponce, work.getBaseCaseObj()); } else { // found a file if (Config.debug) { System.out.println( "DEBUG Worker[" + threadId + "]: Found File (regex) " + url.toString()); } manager.foundFile( url, code, responce, work.getBaseCaseObj().getBaseCase(), rawResponce, work.getBaseCaseObj()); } // manager.foundError(url, "Base Case Mode Error - Responce code came back as " + code + // " it should have been 200"); // manager.workDone(); } } // just check the responce code else { // if is not the fail code, a 404 or a 400 then we have a possible if (code != work.getBaseCaseObj().getFailCode() && code != 404 && code != 0 && code != 400) { if (work.getMethod().equalsIgnoreCase("HEAD")) { if (Config.debug) { System.out.println( "DEBUG Worker[" + threadId + "]: Getting responce via GET " + url.toString()); } rawResponce = ""; httpget = new GetMethod(url.toString()); Vector HTTPheaders = manager.getHTTPHeaders(); for (int a = 0; a < HTTPheaders.size(); a++) { 
HTTPHeader httpHeader = (HTTPHeader) HTTPheaders.elementAt(a); httpget.setRequestHeader(httpHeader.getHeader(), httpHeader.getValue()); } httpget.setFollowRedirects(Config.followRedirects); /* * this code is used to limit the number of request/sec */ if (manager.isLimitRequests()) { while (manager.getTotalDone() / ((System.currentTimeMillis() - manager.getTimestarted()) / 1000.0) > manager.getLimitRequestsTo()) { Thread.sleep(100); } } int newCode = httpclient.executeMethod(httpget); // in some cases the second get can return a different result, than the first head // request! if (newCode != code) { manager.foundError( url, "Return code for first HEAD, is different to the second GET: " + code + " - " + newCode); } rawResponce = ""; // build a string version of the headers rawResponce = httpget.getStatusLine() + "\r\n"; Header headers[] = httpget.getResponseHeaders(); StringBuffer buf = new StringBuffer(); for (int a = 0; a < headers.length; a++) { buf.append(headers[a].getName() + ": " + headers[a].getValue() + "\r\n"); } buf.append("\r\n"); rawResponce = rawResponce + buf.toString(); if (httpget.getResponseContentLength() > 0) { // get the http body BufferedReader input = new BufferedReader(new InputStreamReader(httpget.getResponseBodyAsStream())); String line; String tempResponce = ""; buf = new StringBuffer(); while ((line = input.readLine()) != null) { buf.append("\r\n" + line); } tempResponce = buf.toString(); input.close(); rawResponce = rawResponce + tempResponce; Header contentType = httpget.getResponseHeader("Content-Type"); if (Config.parseHTML) { contentType = httpget.getResponseHeader("Content-Type"); if (contentType != null) { if (contentType.getValue().startsWith("text")) { manager.addHTMLToParseQueue(new HTMLparseWorkUnit(tempResponce, work)); } } } } httpget.releaseConnection(); } if (work.isDir()) { manager.foundDir(url, code, rawResponce, work.getBaseCaseObj()); } else { manager.foundFile(url, code, rawResponce, work.getBaseCaseObj()); } } } 
manager.workDone(); Thread.sleep(20); } catch (NoHttpResponseException e) { manager.foundError(url, "NoHttpResponseException " + e.getMessage()); manager.workDone(); } catch (ConnectTimeoutException e) { manager.foundError(url, "ConnectTimeoutException " + e.getMessage()); manager.workDone(); } catch (URIException e) { manager.foundError(url, "URIException " + e.getMessage()); manager.workDone(); } catch (IOException e) { manager.foundError(url, "IOException " + e.getMessage()); manager.workDone(); } catch (InterruptedException e) { // manager.foundError(url, "InterruptedException " + e.getMessage()); manager.workDone(); return; } catch (IllegalArgumentException e) { e.printStackTrace(); manager.foundError(url, "IllegalArgumentException " + e.getMessage()); manager.workDone(); } finally { if (httpget != null) { httpget.releaseConnection(); } if (httphead != null) { httphead.releaseConnection(); } } } }